1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #include <os/log.h>
127
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif /* MAC */
131
132 #if MULTIPATH
133 #include <netinet/mp_pcb.h>
134 #include <netinet/mptcp_var.h>
135 #endif /* MULTIPATH */
136
137 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
138
139 #if DEBUG || DEVELOPMENT
140 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
141 #else
142 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
143 #endif
144
145 /* TODO: this should be in a header file somewhere */
146 extern char *proc_name_address(void *p);
147
/*
 * State for the cached-socket layer (fast-path allocator used for
 * PF_INET/SOCK_STREAM sockets; see cached_sock_alloc/cached_sock_free).
 */
static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0; /* sockets currently in the cache */
STAILQ_HEAD(, socket) so_cache_head;    /* list of cached sockets */
int max_cached_sock_count = MAX_CACHED_SOCKETS; /* cache capacity */
static uint64_t so_cache_time;          /* last sweep time, from net_uptime() */
static int socketinit_done;             /* guards repeated socketinit() calls */
static struct zone *so_cache_zone;      /* backing zone for cached sockets */

/* Mutex protecting all of the socket-cache state above */
static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262 /*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270 /*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
274 * to 1, because sending down packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285 /*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
289 * clusters larger that 2 KB might lead to system panics or data corruption.
290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291 * on the outgoing interface
292 * Set this to 1 for testing/debugging purposes only.
293 */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
321 &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
327 &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329
330 extern struct inpcbinfo tcbinfo;
331
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335
336 vm_size_t so_cache_zone_element_size;
337
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
341 static void cached_sock_free(struct socket *);
342
343 /*
344 * Maximum of extended background idle sockets per process
345 * Set to zero to disable further setting of the option
346 */
347
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349 #define SO_IDLE_BK_IDLE_TIME 600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352 struct soextbkidlestat soextbkidlestat;
353
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum of extended background idle sockets per process");
357
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369 int so_set_extended_bk_idle(struct socket *, int);
370
371
372 /*
373 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374 * setting the DSCP code on the packet based on the service class; see
375 * <rdar://problem/11277343> for details.
376 */
377 __private_extern__ u_int32_t sotcdb = 0;
378 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
379 &sotcdb, 0, "");
380
/*
 * One-time initialization of the socket layer: verifies layout
 * assumptions, sets up the cached-socket zone, initializes the
 * extended-background-idle defaults, then initializes the inpcb layer.
 * Subsequent calls are no-ops (with a console warning).
 */
void
socketinit(void)
{
	/* so_gencnt is bumped with 64-bit atomics; verify size/alignment */
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	/*
	 * The kernel's sa_endpoints must match the user64 layout exactly
	 * so endpoint structures can be copied in/out without translation.
	 */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	/* Same layout check against the user32 structure on ILP32 kernels */
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	/* Allow socket_debug to be overridden from the boot arguments */
	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	STAILQ_INIT(&so_cache_head);

	/*
	 * A cached socket carries its pcb storage in the same allocation;
	 * the two +4 pads leave room to align each piece (see
	 * cached_sock_alloc, which ALIGNs each carved-out offset).
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
	    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);

	/* Defaults for extended background idle sockets */
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
}
427
/*
 * Allocate a socket from the cached-socket layer (used by soalloc() for
 * PF_INET/SOCK_STREAM sockets).  A cache hit reuses a previously freed
 * element, preserving its carved-out pcb storage; on a miss a fresh
 * element is taken from so_cache_zone and the pcb areas are laid out
 * inside the single allocation.
 *
 * Parameters:
 *	so	Out parameter for the allocated socket
 *	how	Z_WAITOK or Z_NOWAIT
 */
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(&so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(&so_cache_mtx);

		/*
		 * Zero the socket proper but preserve the pointer to the
		 * pcb storage that lives in the same allocation.
		 */
		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(&so_cache_mtx);

		/*
		 * NOTE(review): with Z_NOWAIT, zalloc_flags() may return
		 * NULL; the dereferences below would then fault before
		 * soalloc()'s NULL check — confirm this path is only
		 * reached with Z_WAITOK.
		 */
		*so = zalloc_flags(so_cache_zone, how | Z_ZERO);

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		/* inpcb storage immediately follows the socket */
		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		/* tcpcb storage follows the inpcb storage */
		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	/* Mark it so sodealloc() routes it back through cached_sock_free() */
	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
477
478 static void
cached_sock_free(struct socket * so)479 cached_sock_free(struct socket *so)
480 {
481 lck_mtx_lock(&so_cache_mtx);
482
483 so_cache_time = net_uptime();
484 if (++cached_sock_count > max_cached_sock_count) {
485 --cached_sock_count;
486 lck_mtx_unlock(&so_cache_mtx);
487 zfree(so_cache_zone, so);
488 } else {
489 if (so_cache_hw < cached_sock_count) {
490 so_cache_hw = cached_sock_count;
491 }
492
493 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
494
495 so->cache_timestamp = so_cache_time;
496 lck_mtx_unlock(&so_cache_mtx);
497 }
498 }
499
500 void
so_update_last_owner_locked(struct socket * so,proc_t self)501 so_update_last_owner_locked(struct socket *so, proc_t self)
502 {
503 if (so->last_pid != 0) {
504 /*
505 * last_pid and last_upid should remain zero for sockets
506 * created using sock_socket. The check above achieves that
507 */
508 if (self == PROC_NULL) {
509 self = current_proc();
510 }
511
512 if (so->last_upid != proc_uniqueid(self) ||
513 so->last_pid != proc_pid(self)) {
514 so->last_upid = proc_uniqueid(self);
515 so->last_pid = proc_pid(self);
516 proc_getexecutableuuid(self, so->last_uuid,
517 sizeof(so->last_uuid));
518 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
519 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
520 }
521 }
522 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
523 }
524 }
525
526 void
so_update_policy(struct socket * so)527 so_update_policy(struct socket *so)
528 {
529 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
530 (void) inp_update_policy(sotoinpcb(so));
531 }
532 }
533
#if NECP
/*
 * Re-evaluate NECP policy for an inet/inet6 socket, optionally using
 * override addresses instead of the ones currently in the inpcb.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
		return;
	}
	inp_update_necp_policy(sotoinpcb(so), override_local_addr,
	    override_remote_addr, 0);
}
#endif /* NECP */
545
546 boolean_t
so_cache_timer(void)547 so_cache_timer(void)
548 {
549 struct socket *p;
550 int n_freed = 0;
551 boolean_t rc = FALSE;
552
553 lck_mtx_lock(&so_cache_mtx);
554 so_cache_timeouts++;
555 so_cache_time = net_uptime();
556
557 while (!STAILQ_EMPTY(&so_cache_head)) {
558 VERIFY(cached_sock_count > 0);
559 p = STAILQ_FIRST(&so_cache_head);
560 if ((so_cache_time - p->cache_timestamp) <
561 SO_CACHE_TIME_LIMIT) {
562 break;
563 }
564
565 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
566 --cached_sock_count;
567
568 zfree(so_cache_zone, p);
569
570 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
571 so_cache_max_freed++;
572 break;
573 }
574 }
575
576 /* Schedule again if there is more to cleanup */
577 if (!STAILQ_EMPTY(&so_cache_head)) {
578 rc = TRUE;
579 }
580
581 lck_mtx_unlock(&so_cache_mtx);
582 return rc;
583 }
584
585 /*
586 * Get a socket structure from our zone, and initialize it.
587 * We don't implement `waitok' yet (see comments in uipc_domain.c).
588 * Note that it would probably be better to allocate socket
589 * and PCB at the same time, but I'm not convinced that all
590 * the protocols can be easily modified to do this.
591 */
592 struct socket *
soalloc(int waitok,int dom,int type)593 soalloc(int waitok, int dom, int type)
594 {
595 zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
596 struct socket *so;
597
598 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
599 cached_sock_alloc(&so, how);
600 } else {
601 so = zalloc_flags(socket_zone, how | Z_ZERO);
602 }
603 if (so != NULL) {
604 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
605
606 /*
607 * Increment the socket allocation statistics
608 */
609 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
610 }
611
612 return so;
613 }
614
/*
 * Common socket creation path shared by socreate() and
 * socreate_delegate(): look up the protocol switch entry, allocate and
 * initialize the socket, attach the protocol, and apply creation-time
 * policy (delegation, background traffic class, defunct eligibility).
 *
 * Parameters:
 *	dom	Protocol domain (PF_INET, PF_LOCAL, ...)
 *	aso	Out parameter for the newly created socket
 *	type	Socket type (SOCK_STREAM, SOCK_DGRAM, ...)
 *	proto	Protocol within the domain; 0 selects by type
 *	p	Creating process
 *	flags	SOCF_* creation flags (e.g. SOCF_MPTCP)
 *	ep	Delegated ("effective") process, or PROC_NULL
 *
 * Returns:	0		Success
 *		EAFNOSUPPORT	Domain not supported
 *		EPROTOTYPE	Protocol exists but not with this type
 *		EPROTONOSUPPORT	Protocol not supported
 *		ENOBUFS		Socket allocation failed
 *		<pru_attach>:???	Protocol attachment errors
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;
#if defined(XNU_TARGET_OS_OSX)
	pid_t rpid = -1;	/* "responsible" pid, when distinct from owner */
#endif

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	/* An explicit protocol wins over a lookup by type alone */
	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		/* Distinguish the three possible lookup failures */
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	/* Per-domain socket creation statistics */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		/* MPTCP subflow sockets are created non-blocking */
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = (short)type;

	/* Record the creating process as the socket's last owner */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		/* Socket is created on behalf of another process */
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			rpid = ep->p_responsible_pid;
		}
#endif
	}

#if defined(XNU_TARGET_OS_OSX)
	if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
		rpid = p->p_responsible_pid;
	}

	/* Record the responsible process's pid/uuid, when one exists */
	so->so_rpid = -1;
	uuid_clear(so->so_ruuid);
	if (rpid >= 0) {
		proc_t rp = proc_find(rpid);
		if (rp != PROC_NULL) {
			proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
			so->so_rpid = rpid;
			proc_rele(rp);
		}
	}
#endif

	/* Take a credential reference; released in sodealloc() */
	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefuly
		 */
		if (so->so_pcb != NULL) {
			os_log_error(OS_LOG_DEFAULT,
			    "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
			    error, dom, proto, type);
		}
		/*
		 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_PCBCLEARING;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}
836
837 /*
838 * Returns: 0 Success
839 * EAFNOSUPPORT
840 * EPROTOTYPE
841 * EPROTONOSUPPORT
842 * ENOBUFS
843 * <pru_attach>:ENOBUFS[AF_UNIX]
844 * <pru_attach>:ENOBUFS[TCP]
845 * <pru_attach>:ENOMEM[TCP]
846 * <pru_attach>:??? [other protocol families, IPSEC]
847 */
848 int
socreate(int dom,struct socket ** aso,int type,int proto)849 socreate(int dom, struct socket **aso, int type, int proto)
850 {
851 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
852 PROC_NULL);
853 }
854
855 int
socreate_delegate(int dom,struct socket ** aso,int type,int proto,pid_t epid)856 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
857 {
858 int error = 0;
859 struct proc *ep = PROC_NULL;
860
861 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
862 error = ESRCH;
863 goto done;
864 }
865
866 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
867
868 /*
869 * It might not be wise to hold the proc reference when calling
870 * socreate_internal since it calls soalloc with M_WAITOK
871 */
872 done:
873 if (ep != PROC_NULL) {
874 proc_rele(ep);
875 }
876
877 return error;
878 }
879
880 /*
881 * Returns: 0 Success
882 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
883 * <pru_bind>:EAFNOSUPPORT Address family not supported
884 * <pru_bind>:EADDRNOTAVAIL Address not available.
885 * <pru_bind>:EINVAL Invalid argument
886 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
887 * <pru_bind>:EACCES Permission denied
888 * <pru_bind>:EADDRINUSE Address in use
889 * <pru_bind>:EAGAIN Resource unavailable, try again
890 * <pru_bind>:EPERM Operation not permitted
891 * <pru_bind>:???
892 * <sf_bind>:???
893 *
894 * Notes: It's not possible to fully enumerate the return codes above,
895 * since socket filter authors and protocol family authors may
896 * not choose to limit their error returns to those listed, even
897 * though this may result in some software operating incorrectly.
898 *
899 * The error codes which are enumerated above are those known to
900 * be returned by the tcp_usr_bind function supplied.
901 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	/* Take the per-socket lock unless the caller already holds it. */
	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership and policy bookkeeping while the lock is held. */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive (defunct), reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Give socket filters a chance to intercept or veto the bind. */
	error = sflt_bind(so, nam);

	if (error == 0) {
		/* Hand the request down to the protocol (e.g. tcp_usr_bind). */
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	/* EJUSTRETURN from a filter means "handled; report success". */
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
949
950 void
sodealloc(struct socket * so)951 sodealloc(struct socket *so)
952 {
953 kauth_cred_unref(&so->so_cred);
954
955 /* Remove any filters */
956 sflt_termsock(so);
957
958 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
959
960 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
961 cached_sock_free(so);
962 } else {
963 zfree(socket_zone, so);
964 }
965 }
966
967 /*
968 * Returns: 0 Success
969 * EINVAL
970 * EOPNOTSUPP
971 * <pru_listen>:EINVAL[AF_UNIX]
972 * <pru_listen>:EINVAL[TCP]
973 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
974 * <pru_listen>:EINVAL[TCP] Invalid argument
975 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
976 * <pru_listen>:EACCES[TCP] Permission denied
977 * <pru_listen>:EADDRINUSE[TCP] Address in use
978 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
979 * <pru_listen>:EPERM[TCP] Operation not permitted
980 * <sf_listen>:???
981 *
982 * Notes: Other <pru_listen> returns depend on the protocol family; all
983 * <sf_listen> returns depend on what the filter author causes
984 * their filter to return.
985 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	/* Refresh ownership and policy bookkeeping while the lock is held. */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	/* Only connection-oriented protocols can listen. */
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	/* Inbound traffic is administratively denied for this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	/* Socket filters may veto or absorb the listen request. */
	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		/* EJUSTRETURN from a filter means "handled; report success". */
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	/*
	 * Mark the socket as accepting connections, unless completed
	 * connections are already queued on it.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = (short)backlog;
out:
	socket_unlock(so, 1);
	return error;
}
1072
1073 /*
1074 * The "accept list lock" protects the fields related to the listener queues
1075 * because we can unlock a socket to respect the lock ordering between
1076 * the listener socket and its clients sockets. The lock ordering is first to
1077 * acquire the client socket before the listener socket.
1078 *
1079 * The accept list lock serializes access to the following fields:
1080 * - of the listener socket:
1081 * - so_comp
1082 * - so_incomp
1083 * - so_qlen
1084 * - so_inqlen
1085 * - of client sockets that are in so_comp or so_incomp:
1086 * - so_head
1087 * - so_list
1088 *
 * As one can see the accept list lock protects the consistency of the
 * linkage of the client sockets.
1091 *
1092 * Note that those fields may be read without holding the accept list lock
1093 * for a preflight provided the accept list lock is taken when committing
1094 * to take an action based on the result of the preflight. The preflight
1095 * saves the cost of doing the unlock/lock dance.
1096 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* With a shared (domain-wide) lock there is nothing extra to serialize. */
	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Fast path: nobody holds the accept list, claim it immediately. */
	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	/*
	 * Another thread holds the list; drop the client socket lock before
	 * sleeping (lock ordering is client before listener, see the block
	 * comment above this function).
	 */
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	/* Reacquire in the correct order: client socket first, then listener. */
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
1127
1128 void
so_release_accept_list(struct socket * head)1129 so_release_accept_list(struct socket *head)
1130 {
1131 if (head->so_proto->pr_getlock != NULL) {
1132 lck_mtx_t *mutex_held;
1133
1134 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1135 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1136
1137 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1138 wakeup((caddr_t)&head->so_incomp);
1139 }
1140 }
1141
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

#if CONTENT_FILTER
	if (dealloc && ((so->so_flags & SOF_CONTENT_FILTER) != 0)) {
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		soflow_detach(so);
	}

	/*
	 * Not yet ready for final teardown: clear select/upcall state and
	 * leave the socket alone until both PCBCLEARING and NOFDREF hold.
	 */
	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			/* Unlink from the listener's incomplete queue. */
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
1230
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	/* Prevent new upcalls from being scheduled while we wait. */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	/* Sleep until the last outstanding upcall drains. */
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
1259
1260 /*
1261 * Close a socket on last file table reference removal.
1262 * Initiate disconnect if connected.
1263 * Free socket when disconnect complete.
1264 */
1265 int
soclose_locked(struct socket * so)1266 soclose_locked(struct socket *so)
1267 {
1268 int error = 0;
1269 struct timespec ts;
1270
1271 if (so->so_usecount == 0) {
1272 panic("soclose: so=%p refcount=0", so);
1273 /* NOTREACHED */
1274 }
1275
1276 sflt_notify(so, sock_evt_closing, NULL);
1277
1278 if (so->so_upcallusecount) {
1279 soclose_wait_locked(so);
1280 }
1281
1282 #if CONTENT_FILTER
1283 /*
1284 * We have to wait until the content filters are done
1285 */
1286 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1287 cfil_sock_close_wait(so);
1288 cfil_sock_is_closed(so);
1289 cfil_sock_detach(so);
1290 }
1291 #endif /* CONTENT_FILTER */
1292
1293 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1294 soflow_detach(so);
1295 }
1296
1297 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1298 soresume(current_proc(), so, 1);
1299 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1300 }
1301
1302 if ((so->so_options & SO_ACCEPTCONN)) {
1303 struct socket *sp, *sonext;
1304 int persocklock = 0;
1305 int incomp_overflow_only;
1306
1307 /*
1308 * We do not want new connection to be added
1309 * to the connection queues
1310 */
1311 so->so_options &= ~SO_ACCEPTCONN;
1312
1313 /*
1314 * We can drop the lock on the listener once
1315 * we've acquired the incoming list
1316 */
1317 if (so->so_proto->pr_getlock != NULL) {
1318 persocklock = 1;
1319 so_acquire_accept_list(so, NULL);
1320 socket_unlock(so, 0);
1321 }
1322 again:
1323 incomp_overflow_only = 1;
1324
1325 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1326 /*
1327 * Radar 5350314
1328 * skip sockets thrown away by tcpdropdropblreq
1329 * they will get cleanup by the garbage collection.
1330 * otherwise, remove the incomp socket from the queue
1331 * and let soabort trigger the appropriate cleanup.
1332 */
1333 if (sp->so_flags & SOF_OVERFLOW) {
1334 continue;
1335 }
1336
1337 if (persocklock != 0) {
1338 socket_lock(sp, 1);
1339 }
1340
1341 /*
1342 * Radar 27945981
1343 * The extra reference for the list insure the
1344 * validity of the socket pointer when we perform the
1345 * unlock of the head above
1346 */
1347 if (sp->so_state & SS_INCOMP) {
1348 sp->so_state &= ~SS_INCOMP;
1349 sp->so_head = NULL;
1350 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1351 so->so_incqlen--;
1352 so->so_qlen--;
1353
1354 (void) soabort(sp);
1355 } else {
1356 panic("%s sp %p in so_incomp but !SS_INCOMP",
1357 __func__, sp);
1358 }
1359
1360 if (persocklock != 0) {
1361 socket_unlock(sp, 1);
1362 }
1363 }
1364
1365 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1366 /* Dequeue from so_comp since sofree() won't do it */
1367 if (persocklock != 0) {
1368 socket_lock(sp, 1);
1369 }
1370
1371 if (sp->so_state & SS_COMP) {
1372 sp->so_state &= ~SS_COMP;
1373 sp->so_head = NULL;
1374 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1375 so->so_qlen--;
1376
1377 (void) soabort(sp);
1378 } else {
1379 panic("%s sp %p in so_comp but !SS_COMP",
1380 __func__, sp);
1381 }
1382
1383 if (persocklock) {
1384 socket_unlock(sp, 1);
1385 }
1386 }
1387
1388 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1389 #if (DEBUG | DEVELOPMENT)
1390 panic("%s head %p so_comp not empty", __func__, so);
1391 #endif /* (DEVELOPMENT || DEBUG) */
1392
1393 goto again;
1394 }
1395
1396 if (!TAILQ_EMPTY(&so->so_comp)) {
1397 #if (DEBUG | DEVELOPMENT)
1398 panic("%s head %p so_comp not empty", __func__, so);
1399 #endif /* (DEVELOPMENT || DEBUG) */
1400
1401 goto again;
1402 }
1403
1404 if (persocklock) {
1405 socket_lock(so, 0);
1406 so_release_accept_list(so);
1407 }
1408 }
1409 if (so->so_pcb == NULL) {
1410 /* 3915887: mark the socket as ready for dealloc */
1411 so->so_flags |= SOF_PCBCLEARING;
1412 goto discard;
1413 }
1414 if (so->so_state & SS_ISCONNECTED) {
1415 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1416 error = sodisconnectlocked(so);
1417 if (error) {
1418 goto drop;
1419 }
1420 }
1421 if (so->so_options & SO_LINGER) {
1422 lck_mtx_t *mutex_held;
1423
1424 if ((so->so_state & SS_ISDISCONNECTING) &&
1425 (so->so_state & SS_NBIO)) {
1426 goto drop;
1427 }
1428 if (so->so_proto->pr_getlock != NULL) {
1429 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1430 } else {
1431 mutex_held = so->so_proto->pr_domain->dom_mtx;
1432 }
1433 while (so->so_state & SS_ISCONNECTED) {
1434 ts.tv_sec = (so->so_linger / 100);
1435 ts.tv_nsec = (so->so_linger % 100) *
1436 NSEC_PER_USEC * 1000 * 10;
1437 error = msleep((caddr_t)&so->so_timeo,
1438 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1439 if (error) {
1440 /*
1441 * It's OK when the time fires,
1442 * don't report an error
1443 */
1444 if (error == EWOULDBLOCK) {
1445 error = 0;
1446 }
1447 break;
1448 }
1449 }
1450 }
1451 }
1452 drop:
1453 if (so->so_usecount == 0) {
1454 panic("soclose: usecount is zero so=%p", so);
1455 /* NOTREACHED */
1456 }
1457 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1458 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1459 if (error == 0) {
1460 error = error2;
1461 }
1462 }
1463 if (so->so_usecount <= 0) {
1464 panic("soclose: usecount is zero so=%p", so);
1465 /* NOTREACHED */
1466 }
1467 discard:
1468 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1469 (so->so_state & SS_NOFDREF)) {
1470 panic("soclose: NOFDREF");
1471 /* NOTREACHED */
1472 }
1473 so->so_state |= SS_NOFDREF;
1474
1475 if ((so->so_flags & SOF_KNOTE) != 0) {
1476 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1477 }
1478
1479 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1480
1481 VERIFY(so->so_usecount > 0);
1482 so->so_usecount--;
1483 sofree(so);
1484 return error;
1485 }
1486
1487 int
soclose(struct socket * so)1488 soclose(struct socket *so)
1489 {
1490 int error = 0;
1491 socket_lock(so, 1);
1492
1493 if (so->so_retaincnt == 0) {
1494 error = soclose_locked(so);
1495 } else {
1496 /*
1497 * if the FD is going away, but socket is
1498 * retained in kernel remove its reference
1499 */
1500 so->so_usecount--;
1501 if (so->so_usecount < 2) {
1502 panic("soclose: retaincnt non null and so=%p "
1503 "usecount=%d\n", so, so->so_usecount);
1504 }
1505 }
1506 socket_unlock(so, 1);
1507 return error;
1508 }
1509
1510 /*
1511 * Must be called at splnet...
1512 */
1513 /* Should already be locked */
1514 int
soabort(struct socket * so)1515 soabort(struct socket *so)
1516 {
1517 int error;
1518
1519 #ifdef MORE_LOCKING_DEBUG
1520 lck_mtx_t *mutex_held;
1521
1522 if (so->so_proto->pr_getlock != NULL) {
1523 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1524 } else {
1525 mutex_held = so->so_proto->pr_domain->dom_mtx;
1526 }
1527 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1528 #endif
1529
1530 if ((so->so_flags & SOF_ABORTED) == 0) {
1531 so->so_flags |= SOF_ABORTED;
1532 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1533 if (error) {
1534 sofree(so);
1535 return error;
1536 }
1537 }
1538 return 0;
1539 }
1540
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	/* Take the per-socket lock unless the caller already holds it. */
	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership and policy bookkeeping while the lock is held. */
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/* An accepted socket must still be flagged as having no FD ref. */
	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	/* Protocol's accept handler; presumably fills in *nam — see pru_accept. */
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1567
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	/* Locking variant: take and drop the socket lock internally. */
	const int dolock = 1;

	return soacceptlock(so, nam, dolock);
}
1573
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		/* Clear NOFDREF so soclose() can tear the socket down. */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	/* Let the filters see the newly-accepted socket. */
	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}
1629
1630 /*
1631 * Returns: 0 Success
1632 * EOPNOTSUPP Operation not supported on socket
1633 * EISCONN Socket is connected
1634 * <pru_connect>:EADDRNOTAVAIL Address not available.
1635 * <pru_connect>:EINVAL Invalid argument
1636 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1637 * <pru_connect>:EACCES Permission denied
1638 * <pru_connect>:EADDRINUSE Address in use
1639 * <pru_connect>:EAGAIN Resource unavailable, try again
1640 * <pru_connect>:EPERM Operation not permitted
1641 * <sf_connect_out>:??? [anything a filter writer might set]
1642 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();
	tracker_metadata_t metadata = { };

	/* Take the per-socket lock unless the caller already holds it. */
	if (dolock) {
		socket_lock(so, 1);
	}

	/* Refresh ownership and policy bookkeeping while the lock is held. */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	/* Outbound traffic is administratively denied for this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
		 * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker.
		 */
		if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
			/* Use the effective UUID when the socket is delegated. */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				/* Best-effort: failure only logs, it does not abort the connect. */
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connect() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			/* EJUSTRETURN: a filter has taken over the connect. */
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
			}
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
1738
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	/* Locking variant: take and drop the socket lock internally. */
	const int dolock = 1;

	return soconnectlock(so, nam, dolock);
}
1744
1745 /*
1746 * Returns: 0 Success
1747 * <pru_connect2>:EINVAL[AF_UNIX]
1748 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1749 * <pru_connect2>:??? [other protocol families]
1750 *
1751 * Notes: <pru_connect2> is not supported by [TCP].
1752 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	/*
	 * Lock both endpoints; so2 needs its own lock only when the
	 * protocol uses per-socket locks.
	 */
	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	/* Splice the two sockets together (see notes above: AF_UNIX etc.). */
	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
1771
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;
	tracker_metadata_t metadata = { };

	/* Refresh ownership and policy bookkeeping (caller holds the lock). */
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	/* Outbound traffic is administratively denied for this socket. */
	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set. Otherwise, if connected,
	 * try to disconnect first. This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * For TCP, check if destination address is a tracker and mark the socket accordingly
		 * (only if it hasn't been marked yet).
		 */
		if (so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP &&
		    !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
			/* Use the effective UUID when the socket is delegated. */
			if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) {
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
					so->so_flags1 |= SOF1_KNOWN_TRACKER;
				}
				if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
					so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
				}
				/* Best-effort: failure only logs, it does not abort the connect. */
				if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) {
					printf("connectx() - failed necp_set_socket_domain_attributes");
				}
			}
		}

		if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
		    (flags & CONNECT_DATA_IDEMPOTENT)) {
			so->so_flags1 |= SOF1_DATA_IDEMPOTENT;

			if (flags & CONNECT_DATA_AUTHENTICATED) {
				so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
			}
		}

		/*
		 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
		 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
		 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
		 * Case 3 allows user to combine write with connect even if they have
		 * no use for TFO (such as regular TCP, and UDP).
		 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
		 */
		if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
		    ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
			so->so_flags1 |= SOF1_PRECONNECT_DATA;
		}

		/*
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
		 * SOF1_DATA_IDEMPOTENT.
		 */
		if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
		    (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
			/* We should return EINVAL instead perhaps. */
			so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
		}

		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
			if (error != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				/* EINPROGRESS keeps preconnect data pending. */
				if (error != EINPROGRESS) {
					so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
				}
			}
		}
	}

	return error;
}
1894
1895 int
sodisconnectlocked(struct socket * so)1896 sodisconnectlocked(struct socket *so)
1897 {
1898 int error;
1899
1900 if ((so->so_state & SS_ISCONNECTED) == 0) {
1901 error = ENOTCONN;
1902 goto bad;
1903 }
1904 if (so->so_state & SS_ISDISCONNECTING) {
1905 error = EALREADY;
1906 goto bad;
1907 }
1908
1909 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1910 if (error == 0) {
1911 sflt_notify(so, sock_evt_disconnected, NULL);
1912 }
1913
1914 bad:
1915 return error;
1916 }
1917
1918 /* Locking version */
1919 int
sodisconnect(struct socket * so)1920 sodisconnect(struct socket *so)
1921 {
1922 int error;
1923
1924 socket_lock(so, 1);
1925 error = sodisconnectlocked(so);
1926 socket_unlock(so, 1);
1927 return error;
1928 }
1929
1930 int
sodisconnectxlocked(struct socket * so,sae_associd_t aid,sae_connid_t cid)1931 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1932 {
1933 int error;
1934
1935 /*
1936 * Call the protocol disconnectx handler; let it handle all
1937 * matters related to the connection state of this session.
1938 */
1939 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1940 if (error == 0) {
1941 /*
1942 * The event applies only for the session, not for
1943 * the disconnection of individual subflows.
1944 */
1945 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1946 sflt_notify(so, sock_evt_disconnected, NULL);
1947 }
1948 }
1949 return error;
1950 }
1951
1952 int
sodisconnectx(struct socket * so,sae_associd_t aid,sae_connid_t cid)1953 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1954 {
1955 int error;
1956
1957 socket_lock(so, 1);
1958 error = sodisconnectxlocked(so, aid, cid);
1959 socket_unlock(so, 1);
1960 return error;
1961 }
1962
/* Map MSG_DONTWAIT onto the sblock() wait flag: 0 means don't block for the lock. */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1964
1965 /*
1966 * sosendcheck will lock the socket buffer if it isn't locked and
1967 * verify that there is space for the data being inserted.
1968 *
1969 * Returns: 0 Success
1970 * EPIPE
1971 * sblock:EWOULDBLOCK
1972 * sblock:EINTR
1973 * sbwait:EBADF
1974 * sbwait:EINTR
1975 * [so_error]:???
1976 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int error = 0;
	int32_t space;
	int assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		return EPIPE;
	}
	/* Report and clear any pending asynchronous error. */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			/* Allow preconnect (TFO-style) data on an unconnected socket. */
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0) {
			/* Connectionless send without a destination address. */
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			       ENOTCONN : EDESTADDRREQ;
		}
	}

	space = sbspace(&so->so_snd);

	/* OOB data gets a little extra headroom in the send buffer. */
	if (flags & MSG_OOB) {
		space += 1024;
	}
	/* Message can never fit: fail rather than block forever. */
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return 0;
			}
		}
		/* Non-blocking (or recursive filter call): don't sleep. */
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		sbunlock(&so->so_snd, TRUE); /* keep socket locked */
		*sblocked = 0;
		/* Sleep until space frees up, then re-evaluate from the top. */
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		goto restart;
	}
	return 0;
}
2100
2101 /*
2102 * Send on a socket.
2103 * If send must go all at once and message is larger than
2104 * send buffering, then hard error.
2105 * Lock against other senders.
2106 * If must go all at once and not enough room now, then
2107 * inform user that this would block and do nothing.
2108 * Otherwise, if nonblocking, send as much as possible.
2109 * The data to be sent is described by "uio" if nonzero,
2110 * otherwise by the mbuf chain "top" (which must be null
2111 * if uio is not). Data provided in mbuf chain must be small
2112 * enough to send all at once.
2113 *
2114 * Returns nonzero on error, timeout or signal; callers
2115 * must check for short counts if EINTR/ERESTART are returned.
2116 * Data and control buffers are freed on return.
2117 *
2118 * Returns: 0 Success
2119 * EOPNOTSUPP
2120 * EINVAL
2121 * ENOBUFS
2122 * uiomove:EFAULT
2123 * sosendcheck:EPIPE
2124 * sosendcheck:EWOULDBLOCK
2125 * sosendcheck:EINTR
2126 * sosendcheck:EBADF
2127 * sosendcheck:EINTR
2128 * sosendcheck:??? [value from so_error]
2129 * <pru_send>:ECONNRESET[TCP]
2130 * <pru_send>:EINVAL[TCP]
2131 * <pru_send>:ENOBUFS[TCP]
2132 * <pru_send>:EADDRINUSE[TCP]
2133 * <pru_send>:EADDRNOTAVAIL[TCP]
2134 * <pru_send>:EAFNOSUPPORT[TCP]
2135 * <pru_send>:EACCES[TCP]
2136 * <pru_send>:EAGAIN[TCP]
2137 * <pru_send>:EPERM[TCP]
2138 * <pru_send>:EMSGSIZE[TCP]
2139 * <pru_send>:EHOSTUNREACH[TCP]
2140 * <pru_send>:ENETUNREACH[TCP]
2141 * <pru_send>:ENETDOWN[TCP]
2142 * <pru_send>:ENOMEM[TCP]
2143 * <pru_send>:ENOBUFS[TCP]
2144 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2145 * <pru_send>:EINVAL[AF_UNIX]
2146 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2147 * <pru_send>:EPIPE[AF_UNIX]
2148 * <pru_send>:ENOTCONN[AF_UNIX]
2149 * <pru_send>:EISCONN[AF_UNIX]
2150 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2151 * <sf_data_out>:??? [whatever a filter author chooses]
2152 *
2153 * Notes: Other <pru_send> returns depend on the protocol family; all
2154 * <sf_data_out> returns depend on what the filter author causes
2155 * their filter to return.
2156 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t en_tracing = FALSE;

	/* Bytes left to send: from uio, or from the prepackaged chain "top" */
	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
	}

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
			/* orig_resid is only initialized/used when en_tracing is set */
			orig_resid = resid;
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful for stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	/* Reserve link/protocol header space in leading mbufs when enabled */
	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/*
	 * Outer loop: one pass per protocol hand-off; repeats while data
	 * remains.  Inner loop: fill a chain of mbufs from the uio until
	 * send-buffer space is exhausted or the chain is long enough.
	 */
	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin((int)resid, (int)space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				/* Drop the socket lock while allocating/copying */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalcluate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
							if (top == NULL) {
								MGETHDR(freelist,
								    M_WAIT, MT_DATA);
							} else {
								MGET(freelist,
								    M_WAIT, MT_DATA);
							}
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy > 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Take the next mbuf off the local cache */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					/* Usable bytes in this mbuf/cluster */
					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen = MHLEN - M_LEADINGSPACE(m);
						m_add_crumb(m, PKT_CRUMB_SOSEND);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin((int)mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					/*
					 * Link the mbuf and update lengths before
					 * checking the error so the chain stays
					 * consistent and gets freed via "top".
					 */
					m->m_len = (int32_t)len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = imin((int)resid, (int)space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					/* EJUSTRETURN: a filter swallowed the packet */
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, sendflags, dgram_flow_entry);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						goto packet_consumed;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			/* Hand the chain to the protocol; it now owns top/control */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

packet_consumed:
			if (dontroute) {
				so->so_options &= ~SO_DONTROUTE;
			}

			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				goto out_locked;
			}
		} while (resid && space > 0);
	} while (resid);

out_locked:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	/* Free anything the protocol did not consume */
	if (top != NULL) {
		m_freem(top);
	}
	if (control != NULL) {
		m_freem(control);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return error;
}
2612
2613 int
sosend_reinject(struct socket * so,struct sockaddr * addr,struct mbuf * top,struct mbuf * control,uint32_t sendflags)2614 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2615 {
2616 struct mbuf *m0 = NULL, *control_end = NULL;
2617
2618 socket_lock_assert_owned(so);
2619
2620 /*
2621 * top must points to mbuf chain to be sent.
2622 * If control is not NULL, top must be packet header
2623 */
2624 VERIFY(top != NULL &&
2625 (control == NULL || top->m_flags & M_PKTHDR));
2626
2627 /*
2628 * If control is not passed in, see if we can get it
2629 * from top.
2630 */
2631 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2632 // Locate start of control if present and start of data
2633 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2634 if (m0->m_flags & M_PKTHDR) {
2635 top = m0;
2636 break;
2637 } else if (m0->m_type == MT_CONTROL) {
2638 if (control == NULL) {
2639 // Found start of control
2640 control = m0;
2641 }
2642 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2643 // Found end of control
2644 control_end = m0;
2645 }
2646 }
2647 }
2648 if (control_end != NULL) {
2649 control_end->m_next = NULL;
2650 }
2651 }
2652
2653 int error = (*so->so_proto->pr_usrreqs->pru_send)
2654 (so, sendflags, top, addr, control, current_proc());
2655
2656 return error;
2657 }
2658
2659 /*
2660 * Supported only connected sockets (no address) without ancillary data
2661 * (control mbuf) for atomic protocols
2662 */
int
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
{
	struct mbuf *m, *freelist = NULL;
	struct soflow_hash_entry *dgram_flow_entry = NULL;
	user_ssize_t len, resid;
	int error, dontroute;
	int atomic = sosendallatonce(so);
	int sblocked = 0;
	struct proc *p = current_proc();
	u_int uiofirst = 0;
	u_int uiolast = 0;
	struct mbuf *top = NULL;
	uint16_t headroom = 0;
	ssize_t mlen;
	boolean_t bigcl;

	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	/* Only atomic (all-at-once) datagram sockets are supported */
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (atomic == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
		error = EINVAL;
		goto out;
	}
	/* Total bytes across all uios */
	resid = uio_array_resid(uioarray, uiocnt);

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (NEED_DGRAM_FLOW_TRACKING(so)) {
		dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, resid, true, 0);
	}

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
	if (error) {
		goto release;
	}

	/*
	 * Use big 4 KB clusters when the outgoing interface does not prefer
	 * 2 KB clusters
	 */
	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	/*
	 * Each pass batches a window of uios [uiofirst, uiolast) into one
	 * packet list, runs filters and hands it to pru_send_list.
	 */
	do {
		int i;
		int num_needed = 0;
		int chainlength;
		size_t maxpktlen = 0;
		int bytes_to_alloc;

		if (sosendminchain > 0) {
			chainlength = 0;
		} else {
			chainlength = sosendmaxchain;
		}

		/* Drop the socket lock for allocation and uio copying */
		socket_unlock(so, 0);

		/*
		 * Find a set of uio that fit in a reasonable number
		 * of mbuf packets
		 */
		for (i = uiofirst; i < uiocnt; i++) {
			struct uio *auio = uioarray[i];

			len = uio_resid(auio);

			/* Do nothing for empty messages */
			if (len == 0) {
				continue;
			}

			num_needed += 1;
			uiolast += 1;

			if (len > maxpktlen) {
				maxpktlen = len;
			}

			chainlength += len;
			if (chainlength > sosendmaxchain) {
				break;
			}
		}
		/*
		 * Nothing left to send
		 */
		if (num_needed == 0) {
			socket_lock(so, 0);
			break;
		}
		/*
		 * Allocate buffer large enough to include headroom space for
		 * network and link header
		 *
		 */
		bytes_to_alloc = (int) maxpktlen + headroom;

		/*
		 * Allocate a single contiguous buffer of the smallest available
		 * size when possible
		 */
		if (bytes_to_alloc > MCLBYTES &&
		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MBIGCLBYTES);
		} else if (bytes_to_alloc > _MHLEN &&
		    bytes_to_alloc <= MCLBYTES) {
			freelist = m_getpackets_internal(
				(unsigned int *)&num_needed,
				num_needed, M_WAIT, 1,
				MCLBYTES);
		} else {
			freelist = m_allocpacket_internal(
				(unsigned int *)&num_needed,
				bytes_to_alloc, NULL, M_WAIT, 1, 0);
		}

		if (freelist == NULL) {
			socket_lock(so, 0);
			error = ENOMEM;
			goto release;
		}
		/*
		 * Copy each uio of the set into its own mbuf packet
		 */
		for (i = uiofirst, m = freelist;
		    i < uiolast && m != NULL;
		    i++) {
			int bytes_to_copy;
			struct mbuf *n;
			struct uio *auio = uioarray[i];

			bytes_to_copy = (int)uio_resid(auio);

			/* Do nothing for empty messages */
			if (bytes_to_copy == 0) {
				continue;
			}
			/*
			 * Leave headroom for protocol headers
			 * in the first mbuf of the chain
			 */
			m->m_data += headroom;

			for (n = m; n != NULL; n = n->m_next) {
				/*
				 * NOTE(review): mlen is computed from `m' (the
				 * head of the packet) rather than `n' (the mbuf
				 * being filled).  This is harmless when each
				 * packet is a single contiguous buffer (the
				 * common case given the allocation above), but
				 * looks suspicious for multi-mbuf chains --
				 * confirm intent before changing.
				 */
				if ((m->m_flags & M_EXT)) {
					mlen = m->m_ext.ext_size -
					    M_LEADINGSPACE(m);
				} else if ((m->m_flags & M_PKTHDR)) {
					mlen =
					    MHLEN - M_LEADINGSPACE(m);
				} else {
					mlen = MLEN - M_LEADINGSPACE(m);
				}
				len = imin((int)mlen, bytes_to_copy);

				/*
				 * Note: uiomove() decrements the iovec
				 * length
				 */
				error = uiomove(mtod(n, caddr_t),
				    (int)len, auio);
				if (error != 0) {
					break;
				}
				n->m_len = (int32_t)len;
				m->m_pkthdr.len += len;

				VERIFY(m->m_pkthdr.len <= maxpktlen);

				bytes_to_copy -= len;
				resid -= len;
			}
			if (m->m_pkthdr.len == 0) {
				printf(
					"%s:%d so %llx pkt %llx type %u len null\n",
					__func__, __LINE__,
					(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
					(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
					m->m_type);
			}
			if (error != 0) {
				break;
			}
			m = m->m_nextpkt;
		}

		socket_lock(so, 0);

		if (error) {
			goto release;
		}
		/* Ownership moves from freelist to top for the hand-off */
		top = freelist;
		freelist = NULL;

		if (dontroute) {
			so->so_options |= SO_DONTROUTE;
		}

		if ((flags & MSG_SKIPCFIL) == 0) {
			struct mbuf **prevnextp = NULL;

			for (i = uiofirst, m = top;
			    i < uiolast && m != NULL;
			    i++) {
				struct mbuf *nextpkt = m->m_nextpkt;

				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, NULL, &m,
				    NULL, 0);
				if (error != 0 && error != EJUSTRETURN) {
					goto release;
				}

#if CONTENT_FILTER
				if (error == 0) {
					/*
					 * Content filter processing
					 */
					error = cfil_sock_data_out(so, NULL, m,
					    NULL, 0, dgram_flow_entry);
					if (error != 0 && error != EJUSTRETURN) {
						goto release;
					}
				}
#endif /* CONTENT_FILTER */
				/*
				 * Remove packet from the list when
				 * swallowed by a filter
				 */
				if (error == EJUSTRETURN) {
					error = 0;
					if (prevnextp != NULL) {
						*prevnextp = nextpkt;
					} else {
						top = nextpkt;
					}
				}

				m = nextpkt;
				if (m != NULL) {
					prevnextp = &m->m_nextpkt;
				}
			}
		}
		if (top != NULL) {
			error = (*so->so_proto->pr_usrreqs->pru_send_list)
			    (so, 0, top, NULL, NULL, p);
		}

		if (dontroute) {
			so->so_options &= ~SO_DONTROUTE;
		}

		top = NULL;
		uiofirst = uiolast;
	} while (resid > 0 && error == 0);
release:
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
out:
	/* Free anything not consumed by the protocol */
	if (top != NULL) {
		m_freem(top);
	}
	if (freelist != NULL) {
		m_freem_list(freelist);
	}

	if (dgram_flow_entry != NULL) {
		soflow_free_flow(dgram_flow_entry);
	}

	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, 0, error);

	return error;
}
2990
2991 /*
2992 * May return ERESTART when packet is dropped by MAC policy check
2993 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	/* The first mbuf of the record must carry the sender's address */
	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		error = mac_socket_check_received(kauth_cred_get(), so,
		    mtod(m, struct sockaddr *));

		if (error != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE);	/* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next) {
			sballoc(&so->so_rcv, m);
		}
		/* Account for the last mbuf of the record as well */
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	if (psa != NULL) {
		/* Hand the caller a private copy of the sender's address */
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf on the queue */
		m = m->m_next;
	} else {
		/* Consuming: free the address mbuf and advance the queue */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3112
3113 /*
3114 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
3115 * so clear the data portion in order not to leak the file pointers
3116 */
3117 static void
sopeek_scm_rights(struct mbuf * rights)3118 sopeek_scm_rights(struct mbuf *rights)
3119 {
3120 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3121
3122 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3123 VERIFY(cm->cmsg_len <= rights->m_len);
3124 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3125 }
3126 }
3127
3128 /*
3129 * Process one or more MT_CONTROL mbufs present before any data mbufs
3130 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3131 * just copy the data; if !MSG_PEEK, we call into the protocol to
3132 * perform externalization.
3133 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			/* Peeking: copy the control mbuf, leave original queued */
			if (controlp != NULL) {
				if (*controlp == NULL) {
					/* Remember the head of what we allocate */
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data.  Return
				 * an error.  Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}

				/* Scrub SCM_RIGHTS payload on peek (no fds exist yet) */
				if (pr->pr_domain->dom_externalize != NULL) {
					sopeek_scm_rights(*controlp);
				}

				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* Consuming: unlink the control mbuf onto the cm chain */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	if (!(flags & MSG_PEEK)) {
		/* Re-stitch the record list after unlinking control mbufs */
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL) {
			sb_rcv->sb_lastrecord = m;
		}
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	while (cm != NULL) {
		int cmsg_level;
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_level == SOL_SOCKET &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			/* Hand the (possibly externalized) message to the caller */
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL) {
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	} else {
		nextrecord = sb_rcv->sb_mb;
	}

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return error;
}
3268
3269 /*
3270 * If we have less data than requested, block awaiting more
3271 * (subject to any timeout) if:
3272 * 1. the current count is less than the low water mark, or
3273 * 2. MSG_WAITALL is set, and it is possible to do the entire
3274 * receive operation at once if we block (resid <= hiwat).
3275 * 3. MSG_DONTWAIT is not set
3276 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3277 * we have to do the receive in sections, and thus risk returning
3278 * a short count if a timeout or signal occurs after we start.
3279 */
3280 static boolean_t
so_should_wait(struct socket * so,struct uio * uio,struct mbuf * m,int flags)3281 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3282 {
3283 struct protosw *pr = so->so_proto;
3284
3285 /* No mbufs in the receive-queue? Wait! */
3286 if (m == NULL) {
3287 return true;
3288 }
3289
3290 /* Not enough data in the receive socket-buffer - we may have to wait */
3291 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3292 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3293 /*
3294 * Application did set the lowater-mark, so we should wait for
3295 * this data to be present.
3296 */
3297 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3298 return true;
3299 }
3300
3301 /*
3302 * Application wants all the data - so let's try to do the
3303 * receive-operation at once by waiting for everything to
3304 * be there.
3305 */
3306 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3307 return true;
3308 }
3309 }
3310
3311 return false;
3312 }
3313
3314 /*
3315 * Implement receive operations on a socket.
3316 * We depend on the way that records are added to the sockbuf
3317 * by sbappend*. In particular, each record (mbufs linked through m_next)
3318 * must begin with an address if the protocol so specifies,
3319 * followed by an optional mbuf or mbufs containing ancillary data,
3320 * and then zero or more mbufs of data.
3321 * In order to avoid blocking network interrupts for the entire time here,
3322 * we splx() while doing the actual copy to user space.
3323 * Although the sockbuf is locked, new data may still be appended,
3324 * and thus we must maintain consistency of the sockbuf during that time.
3325 *
3326 * The caller may receive the data as a single mbuf chain by supplying
3327 * an mbuf **mp0 for use in returning the chain. The uio is then used
3328 * only for the count in uio_resid.
3329 *
3330 * Returns: 0 Success
3331 * ENOBUFS
3332 * ENOTCONN
3333 * EWOULDBLOCK
3334 * uiomove:EFAULT
3335 * sblock:EWOULDBLOCK
3336 * sblock:EINTR
3337 * sbwait:EBADF
3338 * sbwait:EINTR
3339 * sodelayed_copy:EFAULT
3340 * <pru_rcvoob>:EINVAL[TCP]
3341 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3342 * <pru_rcvoob>:???
3343 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3344 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3345 * <pr_domain->dom_externalize>:???
3346 *
3347 * Notes: Additional return values from calls through <pru_rcvoob> and
3348 * <pr_domain->dom_externalize> depend on protocols other than
3349 * TCP or AF_UNIX, which are documented above.
3350 */
3351 int
soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)3352 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3353 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3354 {
3355 struct mbuf *m, **mp, *ml = NULL;
3356 struct mbuf *nextrecord, *free_list;
3357 int flags, error, offset;
3358 user_ssize_t len;
3359 struct protosw *pr = so->so_proto;
3360 int moff, type = 0;
3361 user_ssize_t orig_resid = uio_resid(uio);
3362 user_ssize_t delayed_copy_len;
3363 int can_delay;
3364 struct proc *p = current_proc();
3365 boolean_t en_tracing = FALSE;
3366
3367 /*
3368 * Sanity check on the length passed by caller as we are making 'int'
3369 * comparisons
3370 */
3371 if (orig_resid < 0 || orig_resid > INT_MAX) {
3372 return EINVAL;
3373 }
3374
3375 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3376 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3377 so->so_rcv.sb_hiwat);
3378
3379 socket_lock(so, 1);
3380 so_update_last_owner_locked(so, p);
3381 so_update_policy(so);
3382
3383 #ifdef MORE_LOCKING_DEBUG
3384 if (so->so_usecount == 1) {
3385 panic("%s: so=%x no other reference on socket", __func__, so);
3386 /* NOTREACHED */
3387 }
3388 #endif
3389 mp = mp0;
3390 if (psa != NULL) {
3391 *psa = NULL;
3392 }
3393 if (controlp != NULL) {
3394 *controlp = NULL;
3395 }
3396 if (flagsp != NULL) {
3397 flags = *flagsp & ~MSG_EOR;
3398 } else {
3399 flags = 0;
3400 }
3401
3402 /*
3403 * If a recv attempt is made on a previously-accepted socket
3404 * that has been marked as inactive (disconnected), reject
3405 * the request.
3406 */
3407 if (so->so_flags & SOF_DEFUNCT) {
3408 struct sockbuf *sb = &so->so_rcv;
3409
3410 error = ENOTCONN;
3411 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3412 __func__, proc_pid(p), proc_best_name(p),
3413 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3414 SOCK_DOM(so), SOCK_TYPE(so), error);
3415 /*
3416 * This socket should have been disconnected and flushed
3417 * prior to being returned from sodefunct(); there should
3418 * be no data on its receive list, so panic otherwise.
3419 */
3420 if (so->so_state & SS_DEFUNCT) {
3421 sb_empty_assert(sb, __func__);
3422 }
3423 socket_unlock(so, 1);
3424 return error;
3425 }
3426
3427 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3428 pr->pr_usrreqs->pru_preconnect) {
3429 /*
3430 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3431 * calling write() right after this. *If* the app calls a read
3432 * we do not want to block this read indefinetely. Thus,
3433 * we trigger a connect so that the session gets initiated.
3434 */
3435 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3436
3437 if (error) {
3438 socket_unlock(so, 1);
3439 return error;
3440 }
3441 }
3442
3443 if (ENTR_SHOULDTRACE &&
3444 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3445 /*
3446 * enable energy tracing for inet sockets that go over
3447 * non-loopback interfaces only.
3448 */
3449 struct inpcb *inp = sotoinpcb(so);
3450 if (inp->inp_last_outifp != NULL &&
3451 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3452 en_tracing = TRUE;
3453 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3454 VM_KERNEL_ADDRPERM(so),
3455 ((so->so_state & SS_NBIO) ?
3456 kEnTrFlagNonBlocking : 0),
3457 (int64_t)orig_resid);
3458 }
3459 }
3460
3461 /*
3462 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3463 * regardless of the flags argument. Here is the case were
3464 * out-of-band data is not inline.
3465 */
3466 if ((flags & MSG_OOB) ||
3467 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3468 (so->so_options & SO_OOBINLINE) == 0 &&
3469 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3470 m = m_get(M_WAIT, MT_DATA);
3471 if (m == NULL) {
3472 socket_unlock(so, 1);
3473 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3474 ENOBUFS, 0, 0, 0, 0);
3475 return ENOBUFS;
3476 }
3477 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3478 if (error) {
3479 goto bad;
3480 }
3481 socket_unlock(so, 0);
3482 do {
3483 error = uiomove(mtod(m, caddr_t),
3484 imin((int)uio_resid(uio), m->m_len), uio);
3485 m = m_free(m);
3486 } while (uio_resid(uio) && error == 0 && m != NULL);
3487 socket_lock(so, 0);
3488 bad:
3489 if (m != NULL) {
3490 m_freem(m);
3491 }
3492
3493 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3494 if (error == EWOULDBLOCK || error == EINVAL) {
3495 /*
3496 * Let's try to get normal data:
3497 * EWOULDBLOCK: out-of-band data not
3498 * receive yet. EINVAL: out-of-band data
3499 * already read.
3500 */
3501 error = 0;
3502 goto nooob;
3503 } else if (error == 0 && flagsp != NULL) {
3504 *flagsp |= MSG_OOB;
3505 }
3506 }
3507 socket_unlock(so, 1);
3508 if (en_tracing) {
3509 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3510 VM_KERNEL_ADDRPERM(so), 0,
3511 (int64_t)(orig_resid - uio_resid(uio)));
3512 }
3513 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3514 0, 0, 0, 0);
3515
3516 return error;
3517 }
3518 nooob:
3519 if (mp != NULL) {
3520 *mp = NULL;
3521 }
3522
3523 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3524 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3525 }
3526
3527 free_list = NULL;
3528 delayed_copy_len = 0;
3529 restart:
3530 #ifdef MORE_LOCKING_DEBUG
3531 if (so->so_usecount <= 1) {
3532 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3533 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3534 }
3535 #endif
3536 /*
3537 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3538 * and if so just return to the caller. This could happen when
3539 * soreceive() is called by a socket upcall function during the
3540 * time the socket is freed. The socket buffer would have been
3541 * locked across the upcall, therefore we cannot put this thread
3542 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3543 * we may livelock), because the lock on the socket buffer will
3544 * only be released when the upcall routine returns to its caller.
3545 * Because the socket has been officially closed, there can be
3546 * no further read on it.
3547 *
3548 * A multipath subflow socket would have its SS_NOFDREF set by
3549 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3550 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3551 */
3552 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3553 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3554 socket_unlock(so, 1);
3555 return 0;
3556 }
3557
3558 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3559 if (error) {
3560 socket_unlock(so, 1);
3561 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3562 0, 0, 0, 0);
3563 if (en_tracing) {
3564 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3565 VM_KERNEL_ADDRPERM(so), 0,
3566 (int64_t)(orig_resid - uio_resid(uio)));
3567 }
3568 return error;
3569 }
3570
3571 m = so->so_rcv.sb_mb;
3572 if (so_should_wait(so, uio, m, flags)) {
3573 /*
3574 * Panic if we notice inconsistencies in the socket's
3575 * receive list; both sb_mb and sb_cc should correctly
3576 * reflect the contents of the list, otherwise we may
3577 * end up with false positives during select() or poll()
3578 * which could put the application in a bad state.
3579 */
3580 SB_MB_CHECK(&so->so_rcv);
3581
3582 if (so->so_error) {
3583 if (m != NULL) {
3584 goto dontblock;
3585 }
3586 error = so->so_error;
3587 if ((flags & MSG_PEEK) == 0) {
3588 so->so_error = 0;
3589 }
3590 goto release;
3591 }
3592 if (so->so_state & SS_CANTRCVMORE) {
3593 #if CONTENT_FILTER
3594 /*
3595 * Deal with half closed connections
3596 */
3597 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3598 cfil_sock_data_pending(&so->so_rcv) != 0) {
3599 CFIL_LOG(LOG_INFO,
3600 "so %llx ignore SS_CANTRCVMORE",
3601 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3602 } else
3603 #endif /* CONTENT_FILTER */
3604 if (m != NULL) {
3605 goto dontblock;
3606 } else {
3607 goto release;
3608 }
3609 }
3610 for (; m != NULL; m = m->m_next) {
3611 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3612 m = so->so_rcv.sb_mb;
3613 goto dontblock;
3614 }
3615 }
3616 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3617 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3618 error = ENOTCONN;
3619 goto release;
3620 }
3621 if (uio_resid(uio) == 0) {
3622 goto release;
3623 }
3624
3625 if ((so->so_state & SS_NBIO) ||
3626 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3627 error = EWOULDBLOCK;
3628 goto release;
3629 }
3630 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3631 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3632 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3633 #if EVEN_MORE_LOCKING_DEBUG
3634 if (socket_debug) {
3635 printf("Waiting for socket data\n");
3636 }
3637 #endif
3638
3639 /*
3640 * Depending on the protocol (e.g. TCP), the following
3641 * might cause the socket lock to be dropped and later
3642 * be reacquired, and more data could have arrived and
3643 * have been appended to the receive socket buffer by
3644 * the time it returns. Therefore, we only sleep in
3645 * sbwait() below if and only if the wait-condition is still
3646 * true.
3647 */
3648 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3649 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3650 }
3651
3652 error = 0;
3653 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3654 error = sbwait(&so->so_rcv);
3655 }
3656
3657 #if EVEN_MORE_LOCKING_DEBUG
3658 if (socket_debug) {
3659 printf("SORECEIVE - sbwait returned %d\n", error);
3660 }
3661 #endif
3662 if (so->so_usecount < 1) {
3663 panic("%s: after 2nd sblock so=%p ref=%d on socket",
3664 __func__, so, so->so_usecount);
3665 /* NOTREACHED */
3666 }
3667 if (error) {
3668 socket_unlock(so, 1);
3669 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3670 0, 0, 0, 0);
3671 if (en_tracing) {
3672 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3673 VM_KERNEL_ADDRPERM(so), 0,
3674 (int64_t)(orig_resid - uio_resid(uio)));
3675 }
3676 return error;
3677 }
3678 goto restart;
3679 }
3680 dontblock:
3681 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3682 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3683 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3684 nextrecord = m->m_nextpkt;
3685
3686 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3687 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3688 mp0 == NULL);
3689 if (error == ERESTART) {
3690 goto restart;
3691 } else if (error != 0) {
3692 goto release;
3693 }
3694 orig_resid = 0;
3695 }
3696
3697 /*
3698 * Process one or more MT_CONTROL mbufs present before any data mbufs
3699 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3700 * just copy the data; if !MSG_PEEK, we call into the protocol to
3701 * perform externalization.
3702 */
3703 if (m != NULL && m->m_type == MT_CONTROL) {
3704 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3705 if (error != 0) {
3706 goto release;
3707 }
3708 orig_resid = 0;
3709 }
3710
3711 if (m != NULL) {
3712 if (!(flags & MSG_PEEK)) {
3713 /*
3714 * We get here because m points to an mbuf following
3715 * any MT_SONAME or MT_CONTROL mbufs which have been
3716 * processed above. In any case, m should be pointing
3717 * to the head of the mbuf chain, and the nextrecord
3718 * should be either NULL or equal to m->m_nextpkt.
3719 * See comments above about SB_LOCK.
3720 */
3721 if (m != so->so_rcv.sb_mb ||
3722 m->m_nextpkt != nextrecord) {
3723 panic("%s: post-control !sync so=%p m=%p "
3724 "nextrecord=%p\n", __func__, so, m,
3725 nextrecord);
3726 /* NOTREACHED */
3727 }
3728 if (nextrecord == NULL) {
3729 so->so_rcv.sb_lastrecord = m;
3730 }
3731 }
3732 type = m->m_type;
3733 if (type == MT_OOBDATA) {
3734 flags |= MSG_OOB;
3735 }
3736 } else {
3737 if (!(flags & MSG_PEEK)) {
3738 SB_EMPTY_FIXUP(&so->so_rcv);
3739 }
3740 }
3741 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3742 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3743
3744 moff = 0;
3745 offset = 0;
3746
3747 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3748 can_delay = 1;
3749 } else {
3750 can_delay = 0;
3751 }
3752
3753 while (m != NULL &&
3754 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3755 if (m->m_type == MT_OOBDATA) {
3756 if (type != MT_OOBDATA) {
3757 break;
3758 }
3759 } else if (type == MT_OOBDATA) {
3760 break;
3761 }
3762
3763 if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
3764 m->m_type != MT_HEADER) {
3765 break;
3766 }
3767 /*
3768 * Make sure to allways set MSG_OOB event when getting
3769 * out of band data inline.
3770 */
3771 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3772 (so->so_options & SO_OOBINLINE) != 0 &&
3773 (so->so_state & SS_RCVATMARK) != 0) {
3774 flags |= MSG_OOB;
3775 }
3776 so->so_state &= ~SS_RCVATMARK;
3777 len = uio_resid(uio) - delayed_copy_len;
3778 if (so->so_oobmark && len > so->so_oobmark - offset) {
3779 len = so->so_oobmark - offset;
3780 }
3781 if (len > m->m_len - moff) {
3782 len = m->m_len - moff;
3783 }
3784 /*
3785 * If mp is set, just pass back the mbufs.
3786 * Otherwise copy them out via the uio, then free.
3787 * Sockbuf must be consistent here (points to current mbuf,
3788 * it points to next record) when we drop priority;
3789 * we must note any additions to the sockbuf when we
3790 * block interrupts again.
3791 */
3792 if (mp == NULL) {
3793 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3794 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3795 if (can_delay && len == m->m_len) {
3796 /*
3797 * only delay the copy if we're consuming the
3798 * mbuf and we're NOT in MSG_PEEK mode
3799 * and we have enough data to make it worthwile
3800 * to drop and retake the lock... can_delay
3801 * reflects the state of the 2 latter
3802 * constraints moff should always be zero
3803 * in these cases
3804 */
3805 delayed_copy_len += len;
3806 } else {
3807 if (delayed_copy_len) {
3808 error = sodelayed_copy(so, uio,
3809 &free_list, &delayed_copy_len);
3810
3811 if (error) {
3812 goto release;
3813 }
3814 /*
3815 * can only get here if MSG_PEEK is not
3816 * set therefore, m should point at the
3817 * head of the rcv queue; if it doesn't,
3818 * it means something drastically
3819 * changed while we were out from behind
3820 * the lock in sodelayed_copy. perhaps
3821 * a RST on the stream. in any event,
3822 * the stream has been interrupted. it's
3823 * probably best just to return whatever
3824 * data we've moved and let the caller
3825 * sort it out...
3826 */
3827 if (m != so->so_rcv.sb_mb) {
3828 break;
3829 }
3830 }
3831 socket_unlock(so, 0);
3832 error = uiomove(mtod(m, caddr_t) + moff,
3833 (int)len, uio);
3834 socket_lock(so, 0);
3835
3836 if (error) {
3837 goto release;
3838 }
3839 }
3840 } else {
3841 uio_setresid(uio, (uio_resid(uio) - len));
3842 }
3843 if (len == m->m_len - moff) {
3844 if (m->m_flags & M_EOR) {
3845 flags |= MSG_EOR;
3846 }
3847 if (flags & MSG_PEEK) {
3848 m = m->m_next;
3849 moff = 0;
3850 } else {
3851 nextrecord = m->m_nextpkt;
3852 sbfree(&so->so_rcv, m);
3853 m->m_nextpkt = NULL;
3854
3855 if (mp != NULL) {
3856 *mp = m;
3857 mp = &m->m_next;
3858 so->so_rcv.sb_mb = m = m->m_next;
3859 *mp = NULL;
3860 } else {
3861 if (free_list == NULL) {
3862 free_list = m;
3863 } else {
3864 ml->m_next = m;
3865 }
3866 ml = m;
3867 so->so_rcv.sb_mb = m = m->m_next;
3868 ml->m_next = NULL;
3869 }
3870 if (m != NULL) {
3871 m->m_nextpkt = nextrecord;
3872 if (nextrecord == NULL) {
3873 so->so_rcv.sb_lastrecord = m;
3874 }
3875 } else {
3876 so->so_rcv.sb_mb = nextrecord;
3877 SB_EMPTY_FIXUP(&so->so_rcv);
3878 }
3879 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3880 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3881 }
3882 } else {
3883 if (flags & MSG_PEEK) {
3884 moff += len;
3885 } else {
3886 if (mp != NULL) {
3887 int copy_flag;
3888
3889 if (flags & MSG_DONTWAIT) {
3890 copy_flag = M_DONTWAIT;
3891 } else {
3892 copy_flag = M_WAIT;
3893 }
3894 *mp = m_copym(m, 0, (int)len, copy_flag);
3895 /*
3896 * Failed to allocate an mbuf?
3897 * Adjust uio_resid back, it was
3898 * adjusted down by len bytes which
3899 * we didn't copy over.
3900 */
3901 if (*mp == NULL) {
3902 uio_setresid(uio,
3903 (uio_resid(uio) + len));
3904 break;
3905 }
3906 }
3907 m->m_data += len;
3908 m->m_len -= len;
3909 so->so_rcv.sb_cc -= len;
3910 }
3911 }
3912 if (so->so_oobmark) {
3913 if ((flags & MSG_PEEK) == 0) {
3914 so->so_oobmark -= len;
3915 if (so->so_oobmark == 0) {
3916 so->so_state |= SS_RCVATMARK;
3917 break;
3918 }
3919 } else {
3920 offset += len;
3921 if (offset == so->so_oobmark) {
3922 break;
3923 }
3924 }
3925 }
3926 if (flags & MSG_EOR) {
3927 break;
3928 }
3929 /*
3930 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3931 * (for non-atomic socket), we must not quit until
3932 * "uio->uio_resid == 0" or an error termination.
3933 * If a signal/timeout occurs, return with a short
3934 * count but without error. Keep sockbuf locked
3935 * against other readers.
3936 */
3937 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3938 (uio_resid(uio) - delayed_copy_len) > 0 &&
3939 !sosendallatonce(so) && !nextrecord) {
3940 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3941 #if CONTENT_FILTER
3942 && cfil_sock_data_pending(&so->so_rcv) == 0
3943 #endif /* CONTENT_FILTER */
3944 )) {
3945 goto release;
3946 }
3947
3948 /*
3949 * Depending on the protocol (e.g. TCP), the following
3950 * might cause the socket lock to be dropped and later
3951 * be reacquired, and more data could have arrived and
3952 * have been appended to the receive socket buffer by
3953 * the time it returns. Therefore, we only sleep in
3954 * sbwait() below if and only if the socket buffer is
3955 * empty, in order to avoid a false sleep.
3956 */
3957 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3958 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3959 }
3960
3961 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3962 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3963
3964 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3965 error = 0;
3966 goto release;
3967 }
3968 /*
3969 * have to wait until after we get back from the sbwait
3970 * to do the copy because we will drop the lock if we
3971 * have enough data that has been delayed... by dropping
3972 * the lock we open up a window allowing the netisr
3973 * thread to process the incoming packets and to change
3974 * the state of this socket... we're issuing the sbwait
3975 * because the socket is empty and we're expecting the
3976 * netisr thread to wake us up when more packets arrive;
3977 * if we allow that processing to happen and then sbwait
3978 * we could stall forever with packets sitting in the
3979 * socket if no further packets arrive from the remote
3980 * side.
3981 *
3982 * we want to copy before we've collected all the data
3983 * to satisfy this request to allow the copy to overlap
3984 * the incoming packet processing on an MP system
3985 */
3986 if (delayed_copy_len > sorecvmincopy &&
3987 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3988 error = sodelayed_copy(so, uio,
3989 &free_list, &delayed_copy_len);
3990
3991 if (error) {
3992 goto release;
3993 }
3994 }
3995 m = so->so_rcv.sb_mb;
3996 if (m != NULL) {
3997 nextrecord = m->m_nextpkt;
3998 }
3999 SB_MB_CHECK(&so->so_rcv);
4000 }
4001 }
4002 #ifdef MORE_LOCKING_DEBUG
4003 if (so->so_usecount <= 1) {
4004 panic("%s: after big while so=%p ref=%d on socket",
4005 __func__, so, so->so_usecount);
4006 /* NOTREACHED */
4007 }
4008 #endif
4009
4010 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4011 if (so->so_options & SO_DONTTRUNC) {
4012 flags |= MSG_RCVMORE;
4013 } else {
4014 flags |= MSG_TRUNC;
4015 if ((flags & MSG_PEEK) == 0) {
4016 (void) sbdroprecord(&so->so_rcv);
4017 }
4018 }
4019 }
4020
4021 /*
4022 * pru_rcvd below (for TCP) may cause more data to be received
4023 * if the socket lock is dropped prior to sending the ACK; some
4024 * legacy OpenTransport applications don't handle this well
4025 * (if it receives less data than requested while MSG_HAVEMORE
4026 * is set), and so we set the flag now based on what we know
4027 * prior to calling pru_rcvd.
4028 */
4029 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4030 flags |= MSG_HAVEMORE;
4031 }
4032
4033 if ((flags & MSG_PEEK) == 0) {
4034 if (m == NULL) {
4035 so->so_rcv.sb_mb = nextrecord;
4036 /*
4037 * First part is an inline SB_EMPTY_FIXUP(). Second
4038 * part makes sure sb_lastrecord is up-to-date if
4039 * there is still data in the socket buffer.
4040 */
4041 if (so->so_rcv.sb_mb == NULL) {
4042 so->so_rcv.sb_mbtail = NULL;
4043 so->so_rcv.sb_lastrecord = NULL;
4044 } else if (nextrecord->m_nextpkt == NULL) {
4045 so->so_rcv.sb_lastrecord = nextrecord;
4046 }
4047 SB_MB_CHECK(&so->so_rcv);
4048 }
4049 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4050 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4051 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4052 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4053 }
4054 }
4055
4056 if (delayed_copy_len) {
4057 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4058 if (error) {
4059 goto release;
4060 }
4061 }
4062 if (free_list != NULL) {
4063 m_freem_list(free_list);
4064 free_list = NULL;
4065 }
4066
4067 if (orig_resid == uio_resid(uio) && orig_resid &&
4068 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4069 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4070 goto restart;
4071 }
4072
4073 if (flagsp != NULL) {
4074 *flagsp |= flags;
4075 }
4076 release:
4077 #ifdef MORE_LOCKING_DEBUG
4078 if (so->so_usecount <= 1) {
4079 panic("%s: release so=%p ref=%d on socket", __func__,
4080 so, so->so_usecount);
4081 /* NOTREACHED */
4082 }
4083 #endif
4084 if (delayed_copy_len) {
4085 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4086 }
4087
4088 if (free_list != NULL) {
4089 m_freem_list(free_list);
4090 }
4091
4092 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4093
4094 if (en_tracing) {
4095 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4096 VM_KERNEL_ADDRPERM(so),
4097 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4098 (int64_t)(orig_resid - uio_resid(uio)));
4099 }
4100 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4101 so->so_rcv.sb_cc, 0, error);
4102
4103 return error;
4104 }
4105
4106 /*
4107 * Returns: 0 Success
4108 * uiomove:EFAULT
4109 */
4110 static int
sodelayed_copy(struct socket * so,struct uio * uio,struct mbuf ** free_list,user_ssize_t * resid)4111 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4112 user_ssize_t *resid)
4113 {
4114 int error = 0;
4115 struct mbuf *m;
4116
4117 m = *free_list;
4118
4119 socket_unlock(so, 0);
4120
4121 while (m != NULL && error == 0) {
4122 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4123 m = m->m_next;
4124 }
4125 m_freem_list(*free_list);
4126
4127 *free_list = NULL;
4128 *resid = 0;
4129
4130 socket_lock(so, 0);
4131
4132 return error;
4133 }
4134
4135 static int
sodelayed_copy_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,struct mbuf ** free_list,user_ssize_t * resid)4136 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4137 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4138 {
4139 #pragma unused(so)
4140 int error = 0;
4141 struct mbuf *ml, *m;
4142 int i = 0;
4143 struct uio *auio;
4144
4145 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4146 ml = ml->m_nextpkt, i++) {
4147 auio = msgarray[i].uio;
4148 for (m = ml; m != NULL; m = m->m_next) {
4149 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4150 if (error != 0) {
4151 goto out;
4152 }
4153 }
4154 }
4155 out:
4156 m_freem_list(*free_list);
4157
4158 *free_list = NULL;
4159 *resid = 0;
4160
4161 return error;
4162 }
4163
4164 int
soreceive_list(struct socket * so,struct recv_msg_elem * msgarray,u_int uiocnt,int * flagsp)4165 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4166 int *flagsp)
4167 {
4168 struct mbuf *m;
4169 struct mbuf *nextrecord;
4170 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4171 int error;
4172 user_ssize_t len, pktlen, delayed_copy_len = 0;
4173 struct protosw *pr = so->so_proto;
4174 user_ssize_t resid;
4175 struct proc *p = current_proc();
4176 struct uio *auio = NULL;
4177 int npkts = 0;
4178 int sblocked = 0;
4179 struct sockaddr **psa = NULL;
4180 struct mbuf **controlp = NULL;
4181 int can_delay;
4182 int flags;
4183 struct mbuf *free_others = NULL;
4184
4185 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4186 so, uiocnt,
4187 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4188
4189 /*
4190 * Sanity checks:
4191 * - Only supports don't wait flags
4192 * - Only support datagram sockets (could be extended to raw)
4193 * - Must be atomic
4194 * - Protocol must support packet chains
4195 * - The uio array is NULL (should we panic?)
4196 */
4197 if (flagsp != NULL) {
4198 flags = *flagsp;
4199 } else {
4200 flags = 0;
4201 }
4202 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4203 MSG_NBIO)) {
4204 printf("%s invalid flags 0x%x\n", __func__, flags);
4205 error = EINVAL;
4206 goto out;
4207 }
4208 if (so->so_type != SOCK_DGRAM) {
4209 error = EINVAL;
4210 goto out;
4211 }
4212 if (sosendallatonce(so) == 0) {
4213 error = EINVAL;
4214 goto out;
4215 }
4216 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4217 error = EPROTONOSUPPORT;
4218 goto out;
4219 }
4220 if (msgarray == NULL) {
4221 printf("%s uioarray is NULL\n", __func__);
4222 error = EINVAL;
4223 goto out;
4224 }
4225 if (uiocnt == 0) {
4226 printf("%s uiocnt is 0\n", __func__);
4227 error = EINVAL;
4228 goto out;
4229 }
4230 /*
4231 * Sanity check on the length passed by caller as we are making 'int'
4232 * comparisons
4233 */
4234 resid = recv_msg_array_resid(msgarray, uiocnt);
4235 if (resid < 0 || resid > INT_MAX) {
4236 error = EINVAL;
4237 goto out;
4238 }
4239
4240 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4241 can_delay = 1;
4242 } else {
4243 can_delay = 0;
4244 }
4245
4246 socket_lock(so, 1);
4247 so_update_last_owner_locked(so, p);
4248 so_update_policy(so);
4249
4250 #if NECP
4251 so_update_necp_policy(so, NULL, NULL);
4252 #endif /* NECP */
4253
4254 /*
4255 * If a recv attempt is made on a previously-accepted socket
4256 * that has been marked as inactive (disconnected), reject
4257 * the request.
4258 */
4259 if (so->so_flags & SOF_DEFUNCT) {
4260 struct sockbuf *sb = &so->so_rcv;
4261
4262 error = ENOTCONN;
4263 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4264 __func__, proc_pid(p), proc_best_name(p),
4265 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4266 SOCK_DOM(so), SOCK_TYPE(so), error);
4267 /*
4268 * This socket should have been disconnected and flushed
4269 * prior to being returned from sodefunct(); there should
4270 * be no data on its receive list, so panic otherwise.
4271 */
4272 if (so->so_state & SS_DEFUNCT) {
4273 sb_empty_assert(sb, __func__);
4274 }
4275 goto release;
4276 }
4277
4278 next:
4279 /*
4280 * The uio may be empty
4281 */
4282 if (npkts >= uiocnt) {
4283 error = 0;
4284 goto release;
4285 }
4286 restart:
4287 /*
4288 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4289 * and if so just return to the caller. This could happen when
4290 * soreceive() is called by a socket upcall function during the
4291 * time the socket is freed. The socket buffer would have been
4292 * locked across the upcall, therefore we cannot put this thread
4293 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4294 * we may livelock), because the lock on the socket buffer will
4295 * only be released when the upcall routine returns to its caller.
4296 * Because the socket has been officially closed, there can be
4297 * no further read on it.
4298 */
4299 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4300 (SS_NOFDREF | SS_CANTRCVMORE)) {
4301 error = 0;
4302 goto release;
4303 }
4304
4305 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4306 if (error) {
4307 goto release;
4308 }
4309 sblocked = 1;
4310
4311 m = so->so_rcv.sb_mb;
4312 /*
4313 * Block awaiting more datagram if needed
4314 */
4315 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4316 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4317 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4318 /*
4319 * Panic if we notice inconsistencies in the socket's
4320 * receive list; both sb_mb and sb_cc should correctly
4321 * reflect the contents of the list, otherwise we may
4322 * end up with false positives during select() or poll()
4323 * which could put the application in a bad state.
4324 */
4325 SB_MB_CHECK(&so->so_rcv);
4326
4327 if (so->so_error) {
4328 error = so->so_error;
4329 if ((flags & MSG_PEEK) == 0) {
4330 so->so_error = 0;
4331 }
4332 goto release;
4333 }
4334 if (so->so_state & SS_CANTRCVMORE) {
4335 goto release;
4336 }
4337 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4338 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4339 error = ENOTCONN;
4340 goto release;
4341 }
4342 if ((so->so_state & SS_NBIO) ||
4343 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4344 error = EWOULDBLOCK;
4345 goto release;
4346 }
4347 /*
4348 * Do not block if we got some data
4349 */
4350 if (free_list != NULL) {
4351 error = 0;
4352 goto release;
4353 }
4354
4355 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4356 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4357
4358 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4359 sblocked = 0;
4360
4361 error = sbwait(&so->so_rcv);
4362 if (error) {
4363 goto release;
4364 }
4365 goto restart;
4366 }
4367
4368 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4369 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4370 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4371
4372 /*
4373 * Consume the current uio index as we have a datagram
4374 */
4375 auio = msgarray[npkts].uio;
4376 resid = uio_resid(auio);
4377 msgarray[npkts].which |= SOCK_MSG_DATA;
4378 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4379 &msgarray[npkts].psa : NULL;
4380 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4381 &msgarray[npkts].controlp : NULL;
4382 npkts += 1;
4383 nextrecord = m->m_nextpkt;
4384
4385 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4386 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4387 if (error == ERESTART) {
4388 goto restart;
4389 } else if (error != 0) {
4390 goto release;
4391 }
4392 }
4393
4394 if (m != NULL && m->m_type == MT_CONTROL) {
4395 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4396 if (error != 0) {
4397 goto release;
4398 }
4399 }
4400
4401 if (m->m_pkthdr.len == 0) {
4402 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4403 __func__, __LINE__,
4404 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4405 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4406 m->m_type);
4407 }
4408
4409 /*
4410 * Loop to copy the mbufs of the current record
4411 * Support zero length packets
4412 */
4413 ml = NULL;
4414 pktlen = 0;
4415 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4416 if (m->m_len == 0) {
4417 panic("%p m_len zero", m);
4418 }
4419 if (m->m_type == 0) {
4420 panic("%p m_type zero", m);
4421 }
4422 /*
4423 * Clip to the residual length
4424 */
4425 if (len > m->m_len) {
4426 len = m->m_len;
4427 }
4428 pktlen += len;
4429 /*
4430 * Copy the mbufs via the uio or delay the copy
4431 * Sockbuf must be consistent here (points to current mbuf,
4432 * it points to next record) when we drop priority;
4433 * we must note any additions to the sockbuf when we
4434 * block interrupts again.
4435 */
4436 if (len > 0 && can_delay == 0) {
4437 socket_unlock(so, 0);
4438 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4439 socket_lock(so, 0);
4440 if (error) {
4441 goto release;
4442 }
4443 } else {
4444 delayed_copy_len += len;
4445 }
4446
4447 if (len == m->m_len) {
4448 /*
4449 * m was entirely copied
4450 */
4451 sbfree(&so->so_rcv, m);
4452 nextrecord = m->m_nextpkt;
4453 m->m_nextpkt = NULL;
4454
4455 /*
4456 * Set the first packet to the head of the free list
4457 */
4458 if (free_list == NULL) {
4459 free_list = m;
4460 }
4461 /*
4462 * Link current packet to tail of free list
4463 */
4464 if (ml == NULL) {
4465 if (free_tail != NULL) {
4466 free_tail->m_nextpkt = m;
4467 }
4468 free_tail = m;
4469 }
4470 /*
4471 * Link current mbuf to last mbuf of current packet
4472 */
4473 if (ml != NULL) {
4474 ml->m_next = m;
4475 }
4476 ml = m;
4477
4478 /*
4479 * Move next buf to head of socket buffer
4480 */
4481 so->so_rcv.sb_mb = m = ml->m_next;
4482 ml->m_next = NULL;
4483
4484 if (m != NULL) {
4485 m->m_nextpkt = nextrecord;
4486 if (nextrecord == NULL) {
4487 so->so_rcv.sb_lastrecord = m;
4488 }
4489 } else {
4490 so->so_rcv.sb_mb = nextrecord;
4491 SB_EMPTY_FIXUP(&so->so_rcv);
4492 }
4493 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4494 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4495 } else {
4496 /*
4497 * Stop the loop on partial copy
4498 */
4499 break;
4500 }
4501 }
4502 #ifdef MORE_LOCKING_DEBUG
4503 if (so->so_usecount <= 1) {
4504 panic("%s: after big while so=%llx ref=%d on socket",
4505 __func__,
4506 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4507 /* NOTREACHED */
4508 }
4509 #endif
4510 /*
4511 * Tell the caller we made a partial copy
4512 */
4513 if (m != NULL) {
4514 if (so->so_options & SO_DONTTRUNC) {
4515 /*
4516 * Copyout first the freelist then the partial mbuf
4517 */
4518 socket_unlock(so, 0);
4519 if (delayed_copy_len) {
4520 error = sodelayed_copy_list(so, msgarray,
4521 uiocnt, &free_list, &delayed_copy_len);
4522 }
4523
4524 if (error == 0) {
4525 error = uiomove(mtod(m, caddr_t), (int)len,
4526 auio);
4527 }
4528 socket_lock(so, 0);
4529 if (error) {
4530 goto release;
4531 }
4532
4533 m->m_data += len;
4534 m->m_len -= len;
4535 so->so_rcv.sb_cc -= len;
4536 flags |= MSG_RCVMORE;
4537 } else {
4538 (void) sbdroprecord(&so->so_rcv);
4539 nextrecord = so->so_rcv.sb_mb;
4540 m = NULL;
4541 flags |= MSG_TRUNC;
4542 }
4543 }
4544
4545 if (m == NULL) {
4546 so->so_rcv.sb_mb = nextrecord;
4547 /*
4548 * First part is an inline SB_EMPTY_FIXUP(). Second
4549 * part makes sure sb_lastrecord is up-to-date if
4550 * there is still data in the socket buffer.
4551 */
4552 if (so->so_rcv.sb_mb == NULL) {
4553 so->so_rcv.sb_mbtail = NULL;
4554 so->so_rcv.sb_lastrecord = NULL;
4555 } else if (nextrecord->m_nextpkt == NULL) {
4556 so->so_rcv.sb_lastrecord = nextrecord;
4557 }
4558 SB_MB_CHECK(&so->so_rcv);
4559 }
4560 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4561 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4562
4563 /*
4564 * We can continue to the next packet as long as:
4565 * - We haven't exhausted the uio array
4566 * - There was no error
4567 * - A packet was not truncated
4568 * - We can still receive more data
4569 */
4570 if (npkts < uiocnt && error == 0 &&
4571 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4572 (so->so_state & SS_CANTRCVMORE) == 0) {
4573 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4574 sblocked = 0;
4575
4576 goto next;
4577 }
4578 if (flagsp != NULL) {
4579 *flagsp |= flags;
4580 }
4581
4582 release:
4583 /*
4584 * pru_rcvd may cause more data to be received if the socket lock
4585 * is dropped so we set MSG_HAVEMORE now based on what we know.
4586 * That way the caller won't be surprised if it receives less data
4587 * than requested.
4588 */
4589 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4590 flags |= MSG_HAVEMORE;
4591 }
4592
4593 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4594 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4595 }
4596
4597 if (sblocked) {
4598 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4599 } else {
4600 socket_unlock(so, 1);
4601 }
4602
4603 if (delayed_copy_len) {
4604 error = sodelayed_copy_list(so, msgarray, uiocnt,
4605 &free_list, &delayed_copy_len);
4606 }
4607 out:
4608 /*
4609 * Amortize the cost of freeing the mbufs
4610 */
4611 if (free_list != NULL) {
4612 m_freem_list(free_list);
4613 }
4614 if (free_others != NULL) {
4615 m_freem_list(free_others);
4616 }
4617
4618 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4619 0, 0, 0, 0);
4620 return error;
4621 }
4622
4623 static int
so_statistics_event_to_nstat_event(int64_t * input_options,uint64_t * nstat_event)4624 so_statistics_event_to_nstat_event(int64_t *input_options,
4625 uint64_t *nstat_event)
4626 {
4627 int error = 0;
4628 switch (*input_options) {
4629 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4630 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4631 break;
4632 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4633 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4634 break;
4635 #if (DEBUG || DEVELOPMENT)
4636 case SO_STATISTICS_EVENT_RESERVED_1:
4637 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4638 break;
4639 case SO_STATISTICS_EVENT_RESERVED_2:
4640 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4641 break;
4642 #endif /* (DEBUG || DEVELOPMENT) */
4643 default:
4644 error = EINVAL;
4645 break;
4646 }
4647 return error;
4648 }
4649
4650 /*
4651 * Returns: 0 Success
4652 * EINVAL
4653 * ENOTCONN
4654 * <pru_shutdown>:EINVAL
4655 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4656 * <pru_shutdown>:ENOBUFS[TCP]
4657 * <pru_shutdown>:EMSGSIZE[TCP]
4658 * <pru_shutdown>:EHOSTUNREACH[TCP]
4659 * <pru_shutdown>:ENETUNREACH[TCP]
4660 * <pru_shutdown>:ENETDOWN[TCP]
4661 * <pru_shutdown>:ENOMEM[TCP]
4662 * <pru_shutdown>:EACCES[TCP]
4663 * <pru_shutdown>:EMSGSIZE[TCP]
4664 * <pru_shutdown>:ENOBUFS[TCP]
4665 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4666 * <pru_shutdown>:??? [other protocol families]
4667 */
4668 int
soshutdown(struct socket * so,int how)4669 soshutdown(struct socket *so, int how)
4670 {
4671 int error;
4672
4673 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4674
4675 switch (how) {
4676 case SHUT_RD:
4677 case SHUT_WR:
4678 case SHUT_RDWR:
4679 socket_lock(so, 1);
4680 if ((so->so_state &
4681 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4682 error = ENOTCONN;
4683 } else {
4684 error = soshutdownlock(so, how);
4685 }
4686 socket_unlock(so, 1);
4687 break;
4688 default:
4689 error = EINVAL;
4690 break;
4691 }
4692
4693 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4694
4695 return error;
4696 }
4697
4698 int
soshutdownlock_final(struct socket * so,int how)4699 soshutdownlock_final(struct socket *so, int how)
4700 {
4701 struct protosw *pr = so->so_proto;
4702 int error = 0;
4703
4704 sflt_notify(so, sock_evt_shutdown, &how);
4705
4706 if (how != SHUT_WR) {
4707 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4708 /* read already shut down */
4709 error = ENOTCONN;
4710 goto done;
4711 }
4712 sorflush(so);
4713 }
4714 if (how != SHUT_RD) {
4715 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4716 /* write already shut down */
4717 error = ENOTCONN;
4718 goto done;
4719 }
4720 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4721 }
4722 done:
4723 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4724 return error;
4725 }
4726
/*
 * Shutdown with the socket lock held; lets a content filter defer or
 * veto the operation before handing off to soshutdownlock_final().
 */
int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			/* Filter took over; report success to the caller */
			error = 0;
			goto done;
		}
		if (error != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

done:
	return error;
}
4753
/*
 * Flush the send side of a socket: silence select/upcall notification,
 * mark the send buffer to drop further appends, and release its mbufs.
 * Called with the socket lock held.
 */
void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Clear SB_SEL/SB_UPCALL and the upcall pointers so no further
	 * notifications fire; SB_DROP acts as a barrier against appends.
	 */
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/* Wake/clear any select()ing threads, then free the buffered data */
	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}
4779
/*
 * Flush the receive side of a socket: mark it unable to receive more,
 * snapshot and reset the receive buffer, dispose of any in-flight
 * access rights (SCM_RIGHTS) via dom_dispose, and free the queued
 * mbufs.  Called with the socket lock held.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;	/* local snapshot passed to sbrelease() */
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	/* Let attached socket filters observe the read-side flush */
	sflt_notify(so, sock_evt_flush_read, NULL);

	/* Mark the socket unable to receive more and wake up readers */
	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function. In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented. Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	/* Dispose of in-flight file descriptors (e.g. unix-domain rights) */
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	/* Free the snapshot's mbuf chain and reclaim its space accounting */
	sbrelease(&asb);
}
4880
4881 /*
4882 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4883 * an additional variant to handle the case where the option value needs
4884 * to be some kind of integer, but not a specific size.
4885 * In addition to their use here, these functions are also called by the
4886 * protocol-level pr_ctloutput() routines.
4887 *
4888 * Returns: 0 Success
4889 * EINVAL
4890 * copyin:EFAULT
4891 */
4892 int
sooptcopyin(struct sockopt * sopt,void * buf,size_t len,size_t minlen)4893 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4894 {
4895 size_t valsize;
4896
4897 /*
4898 * If the user gives us more than we wanted, we ignore it,
4899 * but if we don't get the minimum length the caller
4900 * wants, we return EINVAL. On success, sopt->sopt_valsize
4901 * is set to however much we actually retrieved.
4902 */
4903 if ((valsize = sopt->sopt_valsize) < minlen) {
4904 return EINVAL;
4905 }
4906 if (valsize > len) {
4907 sopt->sopt_valsize = valsize = len;
4908 }
4909
4910 if (sopt->sopt_p != kernproc) {
4911 return copyin(sopt->sopt_val, buf, valsize);
4912 }
4913
4914 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4915 return 0;
4916 }
4917
4918 /*
4919 * sooptcopyin_timeval
4920 * Copy in a timeval value into tv_p, and take into account whether the
4921 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4922 * code here so that we can verify the 64-bit tv_sec value before we lose
4923 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4924 */
4925 static int
sooptcopyin_timeval(struct sockopt * sopt,struct timeval * tv_p)4926 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4927 {
4928 int error;
4929
4930 if (proc_is64bit(sopt->sopt_p)) {
4931 struct user64_timeval tv64;
4932
4933 if (sopt->sopt_valsize < sizeof(tv64)) {
4934 return EINVAL;
4935 }
4936
4937 sopt->sopt_valsize = sizeof(tv64);
4938 if (sopt->sopt_p != kernproc) {
4939 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4940 if (error != 0) {
4941 return error;
4942 }
4943 } else {
4944 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4945 sizeof(tv64));
4946 }
4947 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4948 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4949 return EDOM;
4950 }
4951
4952 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4953 tv_p->tv_usec = tv64.tv_usec;
4954 } else {
4955 struct user32_timeval tv32;
4956
4957 if (sopt->sopt_valsize < sizeof(tv32)) {
4958 return EINVAL;
4959 }
4960
4961 sopt->sopt_valsize = sizeof(tv32);
4962 if (sopt->sopt_p != kernproc) {
4963 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4964 if (error != 0) {
4965 return error;
4966 }
4967 } else {
4968 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4969 sizeof(tv32));
4970 }
4971 #ifndef __LP64__
4972 /*
4973 * K64todo "comparison is always false due to
4974 * limited range of data type"
4975 */
4976 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4977 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4978 return EDOM;
4979 }
4980 #endif
4981 tv_p->tv_sec = tv32.tv_sec;
4982 tv_p->tv_usec = tv32.tv_usec;
4983 }
4984 return 0;
4985 }
4986
4987 int
soopt_cred_check(struct socket * so,int priv,boolean_t allow_root,boolean_t ignore_delegate)4988 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4989 boolean_t ignore_delegate)
4990 {
4991 kauth_cred_t cred = NULL;
4992 proc_t ep = PROC_NULL;
4993 uid_t uid;
4994 int error = 0;
4995
4996 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4997 ep = proc_find(so->e_pid);
4998 if (ep) {
4999 cred = kauth_cred_proc_ref(ep);
5000 }
5001 }
5002
5003 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
5004
5005 /* uid is 0 for root */
5006 if (uid != 0 || !allow_root) {
5007 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5008 }
5009 if (cred) {
5010 kauth_cred_unref(&cred);
5011 }
5012 if (ep != PROC_NULL) {
5013 proc_rele(ep);
5014 }
5015
5016 return error;
5017 }
5018
5019 /*
5020 * Returns: 0 Success
5021 * EINVAL
5022 * ENOPROTOOPT
5023 * ENOBUFS
5024 * EDOM
5025 * sooptcopyin:EINVAL
5026 * sooptcopyin:EFAULT
5027 * sooptcopyin_timeval:EINVAL
5028 * sooptcopyin_timeval:EFAULT
5029 * sooptcopyin_timeval:EDOM
5030 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5031 * <pr_ctloutput>:???w
5032 * sflt_attach_private:??? [whatever a filter author chooses]
5033 * <sf_setoption>:??? [whatever a filter author chooses]
5034 *
5035 * Notes: Other <pru_listen> returns depend on the protocol family; all
5036 * <sf_listen> returns depend on what the filter author causes
5037 * their filter to return.
5038 */
5039 int
sosetoptlock(struct socket * so,struct sockopt * sopt,int dolock)5040 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5041 {
5042 int error, optval;
5043 int64_t long_optval;
5044 struct linger l;
5045 struct timeval tv;
5046
5047 if (sopt->sopt_dir != SOPT_SET) {
5048 sopt->sopt_dir = SOPT_SET;
5049 }
5050
5051 if (dolock) {
5052 socket_lock(so, 1);
5053 }
5054
5055 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5056 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5057 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5058 /* the socket has been shutdown, no more sockopt's */
5059 error = EINVAL;
5060 goto out;
5061 }
5062
5063 error = sflt_setsockopt(so, sopt);
5064 if (error != 0) {
5065 if (error == EJUSTRETURN) {
5066 error = 0;
5067 }
5068 goto out;
5069 }
5070
5071 if (sopt->sopt_level != SOL_SOCKET) {
5072 if (so->so_proto != NULL &&
5073 so->so_proto->pr_ctloutput != NULL) {
5074 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5075 goto out;
5076 }
5077 error = ENOPROTOOPT;
5078 } else {
5079 /*
5080 * Allow socket-level (SOL_SOCKET) options to be filtered by
5081 * the protocol layer, if needed. A zero value returned from
5082 * the handler means use default socket-level processing as
5083 * done by the rest of this routine. Otherwise, any other
5084 * return value indicates that the option is unsupported.
5085 */
5086 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5087 pru_socheckopt(so, sopt)) != 0) {
5088 goto out;
5089 }
5090
5091 error = 0;
5092 switch (sopt->sopt_name) {
5093 case SO_LINGER:
5094 case SO_LINGER_SEC:
5095 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5096 if (error != 0) {
5097 goto out;
5098 }
5099
5100 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5101 (short)l.l_linger : (short)(l.l_linger * hz);
5102 if (l.l_onoff != 0) {
5103 so->so_options |= SO_LINGER;
5104 } else {
5105 so->so_options &= ~SO_LINGER;
5106 }
5107 break;
5108
5109 case SO_DEBUG:
5110 case SO_KEEPALIVE:
5111 case SO_DONTROUTE:
5112 case SO_USELOOPBACK:
5113 case SO_BROADCAST:
5114 case SO_REUSEADDR:
5115 case SO_REUSEPORT:
5116 case SO_OOBINLINE:
5117 case SO_TIMESTAMP:
5118 case SO_TIMESTAMP_MONOTONIC:
5119 case SO_TIMESTAMP_CONTINUOUS:
5120 case SO_DONTTRUNC:
5121 case SO_WANTMORE:
5122 case SO_WANTOOBFLAG:
5123 case SO_NOWAKEFROMSLEEP:
5124 case SO_NOAPNFALLBK:
5125 error = sooptcopyin(sopt, &optval, sizeof(optval),
5126 sizeof(optval));
5127 if (error != 0) {
5128 goto out;
5129 }
5130 if (optval) {
5131 so->so_options |= sopt->sopt_name;
5132 } else {
5133 so->so_options &= ~sopt->sopt_name;
5134 }
5135 #if SKYWALK
5136 inp_update_netns_flags(so);
5137 #endif /* SKYWALK */
5138 break;
5139
5140 case SO_SNDBUF:
5141 case SO_RCVBUF:
5142 case SO_SNDLOWAT:
5143 case SO_RCVLOWAT:
5144 error = sooptcopyin(sopt, &optval, sizeof(optval),
5145 sizeof(optval));
5146 if (error != 0) {
5147 goto out;
5148 }
5149
5150 /*
5151 * Values < 1 make no sense for any of these
5152 * options, so disallow them.
5153 */
5154 if (optval < 1) {
5155 error = EINVAL;
5156 goto out;
5157 }
5158
5159 switch (sopt->sopt_name) {
5160 case SO_SNDBUF:
5161 case SO_RCVBUF: {
5162 struct sockbuf *sb =
5163 (sopt->sopt_name == SO_SNDBUF) ?
5164 &so->so_snd : &so->so_rcv;
5165 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5166 error = ENOBUFS;
5167 goto out;
5168 }
5169 sb->sb_flags |= SB_USRSIZE;
5170 sb->sb_flags &= ~SB_AUTOSIZE;
5171 sb->sb_idealsize = (u_int32_t)optval;
5172 break;
5173 }
5174 /*
5175 * Make sure the low-water is never greater than
5176 * the high-water.
5177 */
5178 case SO_SNDLOWAT: {
5179 int space = sbspace(&so->so_snd);
5180 u_int32_t hiwat = so->so_snd.sb_hiwat;
5181
5182 if (so->so_snd.sb_flags & SB_UNIX) {
5183 struct unpcb *unp =
5184 (struct unpcb *)(so->so_pcb);
5185 if (unp != NULL &&
5186 unp->unp_conn != NULL) {
5187 hiwat += unp->unp_conn->unp_cc;
5188 }
5189 }
5190
5191 so->so_snd.sb_lowat =
5192 (optval > hiwat) ?
5193 hiwat : optval;
5194
5195 if (space >= so->so_snd.sb_lowat) {
5196 sowwakeup(so);
5197 }
5198 break;
5199 }
5200 case SO_RCVLOWAT: {
5201 int64_t data_len;
5202 so->so_rcv.sb_lowat =
5203 (optval > so->so_rcv.sb_hiwat) ?
5204 so->so_rcv.sb_hiwat : optval;
5205 data_len = so->so_rcv.sb_cc
5206 - so->so_rcv.sb_ctl;
5207 if (data_len >= so->so_rcv.sb_lowat) {
5208 sorwakeup(so);
5209 }
5210 break;
5211 }
5212 }
5213 break;
5214
5215 case SO_SNDTIMEO:
5216 case SO_RCVTIMEO:
5217 error = sooptcopyin_timeval(sopt, &tv);
5218 if (error != 0) {
5219 goto out;
5220 }
5221
5222 switch (sopt->sopt_name) {
5223 case SO_SNDTIMEO:
5224 so->so_snd.sb_timeo = tv;
5225 break;
5226 case SO_RCVTIMEO:
5227 so->so_rcv.sb_timeo = tv;
5228 break;
5229 }
5230 break;
5231
5232 case SO_NKE: {
5233 struct so_nke nke;
5234
5235 error = sooptcopyin(sopt, &nke, sizeof(nke),
5236 sizeof(nke));
5237 if (error != 0) {
5238 goto out;
5239 }
5240
5241 error = sflt_attach_internal(so, nke.nke_handle);
5242 break;
5243 }
5244
5245 case SO_NOSIGPIPE:
5246 error = sooptcopyin(sopt, &optval, sizeof(optval),
5247 sizeof(optval));
5248 if (error != 0) {
5249 goto out;
5250 }
5251 if (optval != 0) {
5252 so->so_flags |= SOF_NOSIGPIPE;
5253 } else {
5254 so->so_flags &= ~SOF_NOSIGPIPE;
5255 }
5256 break;
5257
5258 case SO_NOADDRERR:
5259 error = sooptcopyin(sopt, &optval, sizeof(optval),
5260 sizeof(optval));
5261 if (error != 0) {
5262 goto out;
5263 }
5264 if (optval != 0) {
5265 so->so_flags |= SOF_NOADDRAVAIL;
5266 } else {
5267 so->so_flags &= ~SOF_NOADDRAVAIL;
5268 }
5269 break;
5270
5271 case SO_REUSESHAREUID:
5272 error = sooptcopyin(sopt, &optval, sizeof(optval),
5273 sizeof(optval));
5274 if (error != 0) {
5275 goto out;
5276 }
5277 if (optval != 0) {
5278 so->so_flags |= SOF_REUSESHAREUID;
5279 } else {
5280 so->so_flags &= ~SOF_REUSESHAREUID;
5281 }
5282 break;
5283
5284 case SO_NOTIFYCONFLICT:
5285 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5286 error = EPERM;
5287 goto out;
5288 }
5289 error = sooptcopyin(sopt, &optval, sizeof(optval),
5290 sizeof(optval));
5291 if (error != 0) {
5292 goto out;
5293 }
5294 if (optval != 0) {
5295 so->so_flags |= SOF_NOTIFYCONFLICT;
5296 } else {
5297 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5298 }
5299 break;
5300
5301 case SO_RESTRICTIONS:
5302 error = sooptcopyin(sopt, &optval, sizeof(optval),
5303 sizeof(optval));
5304 if (error != 0) {
5305 goto out;
5306 }
5307
5308 error = so_set_restrictions(so, optval);
5309 break;
5310
5311 case SO_AWDL_UNRESTRICTED:
5312 if (SOCK_DOM(so) != PF_INET &&
5313 SOCK_DOM(so) != PF_INET6) {
5314 error = EOPNOTSUPP;
5315 goto out;
5316 }
5317 error = sooptcopyin(sopt, &optval, sizeof(optval),
5318 sizeof(optval));
5319 if (error != 0) {
5320 goto out;
5321 }
5322 if (optval != 0) {
5323 error = soopt_cred_check(so,
5324 PRIV_NET_RESTRICTED_AWDL, false, false);
5325 if (error == 0) {
5326 inp_set_awdl_unrestricted(
5327 sotoinpcb(so));
5328 }
5329 } else {
5330 inp_clear_awdl_unrestricted(sotoinpcb(so));
5331 }
5332 break;
5333 case SO_INTCOPROC_ALLOW:
5334 if (SOCK_DOM(so) != PF_INET6) {
5335 error = EOPNOTSUPP;
5336 goto out;
5337 }
5338 error = sooptcopyin(sopt, &optval, sizeof(optval),
5339 sizeof(optval));
5340 if (error != 0) {
5341 goto out;
5342 }
5343 if (optval != 0 &&
5344 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5345 error = soopt_cred_check(so,
5346 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5347 if (error == 0) {
5348 inp_set_intcoproc_allowed(
5349 sotoinpcb(so));
5350 }
5351 } else if (optval == 0) {
5352 inp_clear_intcoproc_allowed(sotoinpcb(so));
5353 }
5354 break;
5355
5356 case SO_LABEL:
5357 error = EOPNOTSUPP;
5358 break;
5359
5360 case SO_UPCALLCLOSEWAIT:
5361 error = sooptcopyin(sopt, &optval, sizeof(optval),
5362 sizeof(optval));
5363 if (error != 0) {
5364 goto out;
5365 }
5366 if (optval != 0) {
5367 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5368 } else {
5369 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5370 }
5371 break;
5372
5373 case SO_RANDOMPORT:
5374 error = sooptcopyin(sopt, &optval, sizeof(optval),
5375 sizeof(optval));
5376 if (error != 0) {
5377 goto out;
5378 }
5379 if (optval != 0) {
5380 so->so_flags |= SOF_BINDRANDOMPORT;
5381 } else {
5382 so->so_flags &= ~SOF_BINDRANDOMPORT;
5383 }
5384 break;
5385
5386 case SO_NP_EXTENSIONS: {
5387 struct so_np_extensions sonpx;
5388
5389 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5390 sizeof(sonpx));
5391 if (error != 0) {
5392 goto out;
5393 }
5394 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5395 error = EINVAL;
5396 goto out;
5397 }
5398 /*
5399 * Only one bit defined for now
5400 */
5401 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5402 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5403 so->so_flags |= SOF_NPX_SETOPTSHUT;
5404 } else {
5405 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5406 }
5407 }
5408 break;
5409 }
5410
5411 case SO_TRAFFIC_CLASS: {
5412 error = sooptcopyin(sopt, &optval, sizeof(optval),
5413 sizeof(optval));
5414 if (error != 0) {
5415 goto out;
5416 }
5417 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5418 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5419 error = so_set_net_service_type(so, netsvc);
5420 goto out;
5421 }
5422 error = so_set_traffic_class(so, optval);
5423 if (error != 0) {
5424 goto out;
5425 }
5426 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5427 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5428 break;
5429 }
5430
5431 case SO_RECV_TRAFFIC_CLASS: {
5432 error = sooptcopyin(sopt, &optval, sizeof(optval),
5433 sizeof(optval));
5434 if (error != 0) {
5435 goto out;
5436 }
5437 if (optval == 0) {
5438 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5439 } else {
5440 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5441 }
5442 break;
5443 }
5444
5445 #if (DEVELOPMENT || DEBUG)
5446 case SO_TRAFFIC_CLASS_DBG: {
5447 struct so_tcdbg so_tcdbg;
5448
5449 error = sooptcopyin(sopt, &so_tcdbg,
5450 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5451 if (error != 0) {
5452 goto out;
5453 }
5454 error = so_set_tcdbg(so, &so_tcdbg);
5455 if (error != 0) {
5456 goto out;
5457 }
5458 break;
5459 }
5460 #endif /* (DEVELOPMENT || DEBUG) */
5461
5462 case SO_PRIVILEGED_TRAFFIC_CLASS:
5463 error = priv_check_cred(kauth_cred_get(),
5464 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5465 if (error != 0) {
5466 goto out;
5467 }
5468 error = sooptcopyin(sopt, &optval, sizeof(optval),
5469 sizeof(optval));
5470 if (error != 0) {
5471 goto out;
5472 }
5473 if (optval == 0) {
5474 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5475 } else {
5476 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5477 }
5478 break;
5479
5480 #if (DEVELOPMENT || DEBUG)
5481 case SO_DEFUNCTIT:
5482 error = sosetdefunct(current_proc(), so, 0, FALSE);
5483 if (error == 0) {
5484 error = sodefunct(current_proc(), so, 0);
5485 }
5486
5487 break;
5488 #endif /* (DEVELOPMENT || DEBUG) */
5489
5490 case SO_DEFUNCTOK:
5491 error = sooptcopyin(sopt, &optval, sizeof(optval),
5492 sizeof(optval));
5493 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5494 if (error == 0) {
5495 error = EBADF;
5496 }
5497 goto out;
5498 }
5499 /*
5500 * Any process can set SO_DEFUNCTOK (clear
5501 * SOF_NODEFUNCT), but only root can clear
5502 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5503 */
5504 if (optval == 0 &&
5505 kauth_cred_issuser(kauth_cred_get()) == 0) {
5506 error = EPERM;
5507 goto out;
5508 }
5509 if (optval) {
5510 so->so_flags &= ~SOF_NODEFUNCT;
5511 } else {
5512 so->so_flags |= SOF_NODEFUNCT;
5513 }
5514
5515 if (SOCK_DOM(so) == PF_INET ||
5516 SOCK_DOM(so) == PF_INET6) {
5517 char s[MAX_IPv6_STR_LEN];
5518 char d[MAX_IPv6_STR_LEN];
5519 struct inpcb *inp = sotoinpcb(so);
5520
5521 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5522 "[%s %s:%d -> %s:%d] is now marked "
5523 "as %seligible for "
5524 "defunct\n", __func__, proc_selfpid(),
5525 proc_best_name(current_proc()),
5526 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5527 (SOCK_TYPE(so) == SOCK_STREAM) ?
5528 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5529 ((SOCK_DOM(so) == PF_INET) ?
5530 (void *)&inp->inp_laddr.s_addr :
5531 (void *)&inp->in6p_laddr), s, sizeof(s)),
5532 ntohs(inp->in6p_lport),
5533 inet_ntop(SOCK_DOM(so),
5534 (SOCK_DOM(so) == PF_INET) ?
5535 (void *)&inp->inp_faddr.s_addr :
5536 (void *)&inp->in6p_faddr, d, sizeof(d)),
5537 ntohs(inp->in6p_fport),
5538 (so->so_flags & SOF_NODEFUNCT) ?
5539 "not " : "");
5540 } else {
5541 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5542 "is now marked as %seligible for "
5543 "defunct\n",
5544 __func__, proc_selfpid(),
5545 proc_best_name(current_proc()),
5546 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5547 SOCK_DOM(so), SOCK_TYPE(so),
5548 (so->so_flags & SOF_NODEFUNCT) ?
5549 "not " : "");
5550 }
5551 break;
5552
5553 case SO_ISDEFUNCT:
5554 /* This option is not settable */
5555 error = EINVAL;
5556 break;
5557
5558 case SO_OPPORTUNISTIC:
5559 error = sooptcopyin(sopt, &optval, sizeof(optval),
5560 sizeof(optval));
5561 if (error == 0) {
5562 error = so_set_opportunistic(so, optval);
5563 }
5564 break;
5565
5566 case SO_FLUSH:
5567 /* This option is handled by lower layer(s) */
5568 error = 0;
5569 break;
5570
5571 case SO_RECV_ANYIF:
5572 error = sooptcopyin(sopt, &optval, sizeof(optval),
5573 sizeof(optval));
5574 if (error == 0) {
5575 error = so_set_recv_anyif(so, optval);
5576 }
5577 break;
5578
5579 case SO_TRAFFIC_MGT_BACKGROUND: {
5580 /* This option is handled by lower layer(s) */
5581 error = 0;
5582 break;
5583 }
5584
5585 #if FLOW_DIVERT
5586 case SO_FLOW_DIVERT_TOKEN:
5587 error = flow_divert_token_set(so, sopt);
5588 break;
5589 #endif /* FLOW_DIVERT */
5590
5591
5592 case SO_DELEGATED:
5593 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5594 sizeof(optval))) != 0) {
5595 break;
5596 }
5597
5598 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5599 break;
5600
5601 case SO_DELEGATED_UUID: {
5602 uuid_t euuid;
5603
5604 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5605 sizeof(euuid))) != 0) {
5606 break;
5607 }
5608
5609 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5610 break;
5611 }
5612
5613 #if NECP
5614 case SO_NECP_ATTRIBUTES:
5615 if (SOCK_DOM(so) == PF_MULTIPATH) {
5616 /* Handled by MPTCP itself */
5617 break;
5618 }
5619
5620 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5621 error = EINVAL;
5622 goto out;
5623 }
5624
5625 error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
5626 break;
5627
5628 case SO_NECP_CLIENTUUID: {
5629 if (SOCK_DOM(so) == PF_MULTIPATH) {
5630 /* Handled by MPTCP itself */
5631 break;
5632 }
5633
5634 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5635 error = EINVAL;
5636 goto out;
5637 }
5638
5639 struct inpcb *inp = sotoinpcb(so);
5640 if (!uuid_is_null(inp->necp_client_uuid)) {
5641 // Clear out the old client UUID if present
5642 necp_inpcb_remove_cb(inp);
5643 }
5644
5645 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5646 sizeof(uuid_t), sizeof(uuid_t));
5647 if (error != 0) {
5648 goto out;
5649 }
5650
5651 if (uuid_is_null(inp->necp_client_uuid)) {
5652 error = EINVAL;
5653 goto out;
5654 }
5655
5656 pid_t current_pid = proc_pid(current_proc());
5657 error = necp_client_register_socket_flow(current_pid,
5658 inp->necp_client_uuid, inp);
5659 if (error != 0) {
5660 uuid_clear(inp->necp_client_uuid);
5661 goto out;
5662 }
5663
5664 if (inp->inp_lport != 0) {
5665 // There is a bound local port, so this is not
5666 // a fresh socket. Assign to the client.
5667 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5668 }
5669
5670 break;
5671 }
5672 case SO_NECP_LISTENUUID: {
5673 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5674 error = EINVAL;
5675 goto out;
5676 }
5677
5678 struct inpcb *inp = sotoinpcb(so);
5679 if (!uuid_is_null(inp->necp_client_uuid)) {
5680 error = EINVAL;
5681 goto out;
5682 }
5683
5684 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5685 sizeof(uuid_t), sizeof(uuid_t));
5686 if (error != 0) {
5687 goto out;
5688 }
5689
5690 if (uuid_is_null(inp->necp_client_uuid)) {
5691 error = EINVAL;
5692 goto out;
5693 }
5694
5695 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5696 inp->necp_client_uuid, inp);
5697 if (error != 0) {
5698 uuid_clear(inp->necp_client_uuid);
5699 goto out;
5700 }
5701
5702 // Mark that the port registration is held by NECP
5703 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5704
5705 break;
5706 }
5707 #endif /* NECP */
5708
5709 case SO_EXTENDED_BK_IDLE:
5710 error = sooptcopyin(sopt, &optval, sizeof(optval),
5711 sizeof(optval));
5712 if (error == 0) {
5713 error = so_set_extended_bk_idle(so, optval);
5714 }
5715 break;
5716
5717 case SO_MARK_CELLFALLBACK:
5718 error = sooptcopyin(sopt, &optval, sizeof(optval),
5719 sizeof(optval));
5720 if (error != 0) {
5721 goto out;
5722 }
5723 if (optval < 0) {
5724 error = EINVAL;
5725 goto out;
5726 }
5727 if (optval == 0) {
5728 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5729 } else {
5730 so->so_flags1 |= SOF1_CELLFALLBACK;
5731 }
5732 break;
5733
5734 case SO_FALLBACK_MODE:
5735 error = sooptcopyin(sopt, &optval, sizeof(optval),
5736 sizeof(optval));
5737 if (error != 0) {
5738 goto out;
5739 }
5740 if (optval < SO_FALLBACK_MODE_NONE ||
5741 optval > SO_FALLBACK_MODE_PREFER) {
5742 error = EINVAL;
5743 goto out;
5744 }
5745 so->so_fallback_mode = (u_int8_t)optval;
5746 break;
5747
5748 case SO_MARK_KNOWN_TRACKER: {
5749 error = sooptcopyin(sopt, &optval, sizeof(optval),
5750 sizeof(optval));
5751 if (error != 0) {
5752 goto out;
5753 }
5754 if (optval < 0) {
5755 error = EINVAL;
5756 goto out;
5757 }
5758 if (optval == 0) {
5759 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5760 } else {
5761 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5762 }
5763 break;
5764 }
5765
5766 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5767 error = sooptcopyin(sopt, &optval, sizeof(optval),
5768 sizeof(optval));
5769 if (error != 0) {
5770 goto out;
5771 }
5772 if (optval < 0) {
5773 error = EINVAL;
5774 goto out;
5775 }
5776 if (optval == 0) {
5777 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5778 } else {
5779 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5780 }
5781 break;
5782 }
5783
5784 case SO_MARK_APPROVED_APP_DOMAIN: {
5785 error = sooptcopyin(sopt, &optval, sizeof(optval),
5786 sizeof(optval));
5787 if (error != 0) {
5788 goto out;
5789 }
5790 if (optval < 0) {
5791 error = EINVAL;
5792 goto out;
5793 }
5794 if (optval == 0) {
5795 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5796 } else {
5797 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5798 }
5799 break;
5800 }
5801
5802 case SO_STATISTICS_EVENT:
5803 error = sooptcopyin(sopt, &long_optval,
5804 sizeof(long_optval), sizeof(long_optval));
5805 if (error != 0) {
5806 goto out;
5807 }
5808 u_int64_t nstat_event = 0;
5809 error = so_statistics_event_to_nstat_event(
5810 &long_optval, &nstat_event);
5811 if (error != 0) {
5812 goto out;
5813 }
5814 nstat_pcb_event(sotoinpcb(so), nstat_event);
5815 break;
5816
5817 case SO_NET_SERVICE_TYPE: {
5818 error = sooptcopyin(sopt, &optval, sizeof(optval),
5819 sizeof(optval));
5820 if (error != 0) {
5821 goto out;
5822 }
5823 error = so_set_net_service_type(so, optval);
5824 break;
5825 }
5826
5827 case SO_QOSMARKING_POLICY_OVERRIDE:
5828 error = priv_check_cred(kauth_cred_get(),
5829 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5830 if (error != 0) {
5831 goto out;
5832 }
5833 error = sooptcopyin(sopt, &optval, sizeof(optval),
5834 sizeof(optval));
5835 if (error != 0) {
5836 goto out;
5837 }
5838 if (optval == 0) {
5839 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5840 } else {
5841 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5842 }
5843 break;
5844
5845 case SO_MPKL_SEND_INFO: {
5846 struct so_mpkl_send_info so_mpkl_send_info;
5847
5848 error = sooptcopyin(sopt, &so_mpkl_send_info,
5849 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5850 if (error != 0) {
5851 goto out;
5852 }
5853 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5854 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5855
5856 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5857 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5858 } else {
5859 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5860 }
5861 break;
5862 }
5863 case SO_WANT_KEV_SOCKET_CLOSED: {
5864 error = sooptcopyin(sopt, &optval, sizeof(optval),
5865 sizeof(optval));
5866 if (error != 0) {
5867 goto out;
5868 }
5869 if (optval == 0) {
5870 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5871 } else {
5872 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5873 }
5874 break;
5875 }
5876 case SO_MARK_WAKE_PKT: {
5877 error = sooptcopyin(sopt, &optval, sizeof(optval),
5878 sizeof(optval));
5879 if (error != 0) {
5880 goto out;
5881 }
5882 if (optval == 0) {
5883 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5884 } else {
5885 so->so_flags |= SOF_MARK_WAKE_PKT;
5886 }
5887 break;
5888 }
5889 case SO_RECV_WAKE_PKT: {
5890 error = sooptcopyin(sopt, &optval, sizeof(optval),
5891 sizeof(optval));
5892 if (error != 0) {
5893 goto out;
5894 }
5895 if (optval == 0) {
5896 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5897 } else {
5898 so->so_flags |= SOF_RECV_WAKE_PKT;
5899 }
5900 break;
5901 }
5902 default:
5903 error = ENOPROTOOPT;
5904 break;
5905 }
5906 if (error == 0 && so->so_proto != NULL &&
5907 so->so_proto->pr_ctloutput != NULL) {
5908 (void) so->so_proto->pr_ctloutput(so, sopt);
5909 }
5910 }
5911 out:
5912 if (dolock) {
5913 socket_unlock(so, 1);
5914 }
5915 return error;
5916 }
5917
5918 /* Helper routines for getsockopt */
5919 int
sooptcopyout(struct sockopt * sopt,void * buf,size_t len)5920 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5921 {
5922 int error;
5923 size_t valsize;
5924
5925 error = 0;
5926
5927 /*
5928 * Documented get behavior is that we always return a value,
5929 * possibly truncated to fit in the user's buffer.
5930 * Traditional behavior is that we always tell the user
5931 * precisely how much we copied, rather than something useful
5932 * like the total amount we had available for her.
5933 * Note that this interface is not idempotent; the entire answer must
5934 * generated ahead of time.
5935 */
5936 valsize = MIN(len, sopt->sopt_valsize);
5937 sopt->sopt_valsize = valsize;
5938 if (sopt->sopt_val != USER_ADDR_NULL) {
5939 if (sopt->sopt_p != kernproc) {
5940 error = copyout(buf, sopt->sopt_val, valsize);
5941 } else {
5942 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5943 }
5944 }
5945 return error;
5946 }
5947
5948 static int
sooptcopyout_timeval(struct sockopt * sopt,const struct timeval * tv_p)5949 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5950 {
5951 int error;
5952 size_t len;
5953 struct user64_timeval tv64 = {};
5954 struct user32_timeval tv32 = {};
5955 const void * val;
5956 size_t valsize;
5957
5958 error = 0;
5959 if (proc_is64bit(sopt->sopt_p)) {
5960 len = sizeof(tv64);
5961 tv64.tv_sec = tv_p->tv_sec;
5962 tv64.tv_usec = tv_p->tv_usec;
5963 val = &tv64;
5964 } else {
5965 len = sizeof(tv32);
5966 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5967 tv32.tv_usec = tv_p->tv_usec;
5968 val = &tv32;
5969 }
5970 valsize = MIN(len, sopt->sopt_valsize);
5971 sopt->sopt_valsize = valsize;
5972 if (sopt->sopt_val != USER_ADDR_NULL) {
5973 if (sopt->sopt_p != kernproc) {
5974 error = copyout(val, sopt->sopt_val, valsize);
5975 } else {
5976 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5977 }
5978 }
5979 return error;
5980 }
5981
5982 /*
5983 * Return: 0 Success
5984 * ENOPROTOOPT
5985 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5986 * <pr_ctloutput>:???
5987 * <sf_getoption>:???
5988 */
/*
 * Get a socket option.  When `dolock' is nonzero the socket lock is
 * taken here and dropped before return; otherwise the caller must
 * already hold it.  Non-SOL_SOCKET levels are forwarded to the
 * protocol's pr_ctloutput handler.
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	/*
	 * Give attached socket filters first crack at the option;
	 * EJUSTRETURN means the filter fully handled it.
	 */
	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed. A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine. Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			/* SO_LINGER reports ticks, SO_LINGER_SEC seconds. */
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof(l));
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			/*
			 * Boolean options stored directly in so_options;
			 * the option name doubles as the flag bit.
			 */
			optval = so->so_options & sopt->sopt_name;
/* Common exit: copy a single int result back to the caller. */
integer:
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			/*
			 * Bytes available to read.  For atomic (datagram)
			 * protocols sum the data mbufs; otherwise use the
			 * receive buffer count minus control bytes.
			 */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA) {
						pkt_total += m1->m_len;
					}
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			/* Packet count; only meaningful for atomic protocols. */
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				error = ENOPROTOOPT;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			/*
			 * For connected unix-domain sockets include the
			 * peer's buffered byte count in the reported size.
			 */
			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;


		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			/* Only meaningful for inet sockets. */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_INTCOPROC_ALLOW:
			/* Only meaningful for IPv6 sockets. */
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_LABEL:
			error = EOPNOTSUPP;
			break;

		case SO_PEERLABEL:
			error = EOPNOTSUPP;
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx = {};

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof(struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			/* Inverted sense: flag set means NOT defunct-able. */
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif  /* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			uuid_t *ncu;

			/* Pick the UUID location based on the socket domain. */
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}

		case SO_NECP_LISTENUUID: {
			uuid_t *nlu;

			/*
			 * The listener UUID is only valid when the port
			 * registration is held by NECP (INP2_EXTERNAL_PORT).
			 */
			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
					nlu = &sotoinpcb(so)->necp_client_uuid;
				} else {
					error = ENOENT;
					goto out;
				}
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
			break;
		}
#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif  /* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;
		case SO_MARK_CELLFALLBACK:
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;
		case SO_FALLBACK_MODE:
			optval = so->so_fallback_mode;
			goto integer;
		case SO_MARK_KNOWN_TRACKER: {
			optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
			optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_MARK_APPROVED_APP_DOMAIN: {
			optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
			    ? 1 : 0;
			goto integer;
		}
		case SO_NET_SERVICE_TYPE: {
			/* Default to best-effort if no type was ever set. */
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
				optval = so->so_netsvctype;
			} else {
				optval = NET_SERVICE_TYPE_BE;
			}
			goto integer;
		}
		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
			error = sooptcopyout(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info));
			break;
		}
		case SO_MARK_WAKE_PKT:
			optval = (so->so_flags & SOF_MARK_WAKE_PKT);
			goto integer;
		case SO_RECV_WAKE_PKT:
			optval = (so->so_flags & SOF_RECV_WAKE_PKT);
			goto integer;
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
6398
/*
 * The size limits on our soopt_getm is different from that on FreeBSD.
 * We limit the size of options to MCLBYTES. This will have to change
 * if we need to define options that need more space than MCLBYTES.
 *
 * Allocates an mbuf chain large enough to hold sopt->sopt_valsize
 * bytes and stores the head in *mp.  Returns 0, EMSGSIZE, or ENOBUFS;
 * on failure no chain is left allocated.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = (int)sopt->sopt_valsize;
	int how;

	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
		return EMSGSIZE;
	}

	/* Only block for memory on behalf of a user process. */
	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL) {
		return ENOBUFS;
	}
	if (sopt_size > MLEN) {
		/* Too big for a plain mbuf: upgrade to a cluster. */
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Append additional mbufs/clusters until the size is covered. */
	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			/* Free the partially built chain. */
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				/* m is not yet linked into *mp: free both. */
				m_freem(*mp);
				m_freem(m);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}
6457
6458 /* copyin sopt data into mbuf chain */
6459 int
soopt_mcopyin(struct sockopt * sopt,struct mbuf * m)6460 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6461 {
6462 struct mbuf *m0 = m;
6463
6464 if (sopt->sopt_val == USER_ADDR_NULL) {
6465 return 0;
6466 }
6467 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6468 if (sopt->sopt_p != kernproc) {
6469 int error;
6470
6471 error = copyin(sopt->sopt_val, mtod(m, char *),
6472 m->m_len);
6473 if (error != 0) {
6474 m_freem(m0);
6475 return error;
6476 }
6477 } else {
6478 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6479 mtod(m, char *), m->m_len);
6480 }
6481 sopt->sopt_valsize -= m->m_len;
6482 sopt->sopt_val += m->m_len;
6483 m = m->m_next;
6484 }
6485 /* should be allocated enoughly at ip6_sooptmcopyin() */
6486 if (m != NULL) {
6487 panic("soopt_mcopyin");
6488 /* NOTREACHED */
6489 }
6490 return 0;
6491 }
6492
6493 /* copyout mbuf chain data into soopt */
6494 int
soopt_mcopyout(struct sockopt * sopt,struct mbuf * m)6495 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6496 {
6497 struct mbuf *m0 = m;
6498 size_t valsize = 0;
6499
6500 if (sopt->sopt_val == USER_ADDR_NULL) {
6501 return 0;
6502 }
6503 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6504 if (sopt->sopt_p != kernproc) {
6505 int error;
6506
6507 error = copyout(mtod(m, char *), sopt->sopt_val,
6508 m->m_len);
6509 if (error != 0) {
6510 m_freem(m0);
6511 return error;
6512 }
6513 } else {
6514 bcopy(mtod(m, char *),
6515 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6516 }
6517 sopt->sopt_valsize -= m->m_len;
6518 sopt->sopt_val += m->m_len;
6519 valsize += m->m_len;
6520 m = m->m_next;
6521 }
6522 if (m != NULL) {
6523 /* enough soopt buffer should be given from user-land */
6524 m_freem(m0);
6525 return EINVAL;
6526 }
6527 sopt->sopt_valsize = valsize;
6528 return 0;
6529 }
6530
6531 void
sohasoutofband(struct socket * so)6532 sohasoutofband(struct socket *so)
6533 {
6534 if (so->so_pgid < 0) {
6535 gsignal(-so->so_pgid, SIGURG);
6536 } else if (so->so_pgid > 0) {
6537 proc_signal(so->so_pgid, SIGURG);
6538 }
6539 selwakeup(&so->so_rcv.sb_sel);
6540 if (so->so_rcv.sb_flags & SB_KNOTE) {
6541 KNOTE(&so->so_rcv.sb_sel.si_note,
6542 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6543 }
6544 }
6545
6546 int
sopoll(struct socket * so,int events,kauth_cred_t cred,void * wql)6547 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6548 {
6549 #pragma unused(cred)
6550 struct proc *p = current_proc();
6551 int revents = 0;
6552
6553 socket_lock(so, 1);
6554 so_update_last_owner_locked(so, PROC_NULL);
6555 so_update_policy(so);
6556
6557 if (events & (POLLIN | POLLRDNORM)) {
6558 if (soreadable(so)) {
6559 revents |= events & (POLLIN | POLLRDNORM);
6560 }
6561 }
6562
6563 if (events & (POLLOUT | POLLWRNORM)) {
6564 if (sowriteable(so)) {
6565 revents |= events & (POLLOUT | POLLWRNORM);
6566 }
6567 }
6568
6569 if (events & (POLLPRI | POLLRDBAND)) {
6570 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6571 revents |= events & (POLLPRI | POLLRDBAND);
6572 }
6573 }
6574
6575 if (revents == 0) {
6576 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6577 /*
6578 * Darwin sets the flag first,
6579 * BSD calls selrecord first
6580 */
6581 so->so_rcv.sb_flags |= SB_SEL;
6582 selrecord(p, &so->so_rcv.sb_sel, wql);
6583 }
6584
6585 if (events & (POLLOUT | POLLWRNORM)) {
6586 /*
6587 * Darwin sets the flag first,
6588 * BSD calls selrecord first
6589 */
6590 so->so_snd.sb_flags |= SB_SEL;
6591 selrecord(p, &so->so_snd.sb_sel, wql);
6592 }
6593 }
6594
6595 socket_unlock(so, 1);
6596 return revents;
6597 }
6598
6599 int
soo_kqfilter(struct fileproc * fp,struct knote * kn,struct kevent_qos_s * kev)6600 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6601 {
6602 struct socket *so = (struct socket *)fp_get_data(fp);
6603 int result;
6604
6605 socket_lock(so, 1);
6606 so_update_last_owner_locked(so, PROC_NULL);
6607 so_update_policy(so);
6608
6609 switch (kn->kn_filter) {
6610 case EVFILT_READ:
6611 kn->kn_filtid = EVFILTID_SOREAD;
6612 break;
6613 case EVFILT_WRITE:
6614 kn->kn_filtid = EVFILTID_SOWRITE;
6615 break;
6616 case EVFILT_SOCK:
6617 kn->kn_filtid = EVFILTID_SCK;
6618 break;
6619 case EVFILT_EXCEPT:
6620 kn->kn_filtid = EVFILTID_SOEXCEPT;
6621 break;
6622 default:
6623 socket_unlock(so, 1);
6624 knote_set_error(kn, EINVAL);
6625 return 0;
6626 }
6627
6628 /*
6629 * call the appropriate sub-filter attach
6630 * with the socket still locked
6631 */
6632 result = knote_fops(kn)->f_attach(kn, kev);
6633
6634 socket_unlock(so, 1);
6635
6636 return result;
6637 }
6638
/*
 * Common EVFILT_READ evaluation, called with the socket locked.
 * Returns nonzero when the knote should fire; if `kev' is non-NULL the
 * kevent payload is filled in with `data' (the readable byte count or
 * listen backlog length).
 */
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		/* Fire when there is a completed connection to accept. */
		retval = !TAILQ_EMPTY(&so->so_comp);
		data = so->so_qlen;
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			/* OOB data pending: report bytes up to the mark. */
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		/* Peer shut down (and no filtered data remains): EOF. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
6717
6718 static int
filt_sorattach(struct knote * kn,__unused struct kevent_qos_s * kev)6719 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6720 {
6721 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6722
6723 /* socket locked */
6724
6725 /*
6726 * If the caller explicitly asked for OOB results (e.g. poll())
6727 * from EVFILT_READ, then save that off in the hookid field
6728 * and reserve the kn_flags EV_OOBAND bit for output only.
6729 */
6730 if (kn->kn_filter == EVFILT_READ &&
6731 kn->kn_flags & EV_OOBAND) {
6732 kn->kn_flags &= ~EV_OOBAND;
6733 kn->kn_hook32 = EV_OOBAND;
6734 } else {
6735 kn->kn_hook32 = 0;
6736 }
6737 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6738 so->so_rcv.sb_flags |= SB_KNOTE;
6739 }
6740
6741 /* indicate if event is already fired */
6742 return filt_soread_common(kn, NULL, so);
6743 }
6744
6745 static void
filt_sordetach(struct knote * kn)6746 filt_sordetach(struct knote *kn)
6747 {
6748 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6749
6750 socket_lock(so, 1);
6751 if (so->so_rcv.sb_flags & SB_KNOTE) {
6752 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6753 so->so_rcv.sb_flags &= ~SB_KNOTE;
6754 }
6755 }
6756 socket_unlock(so, 1);
6757 }
6758
6759 /*ARGSUSED*/
6760 static int
filt_soread(struct knote * kn,long hint)6761 filt_soread(struct knote *kn, long hint)
6762 {
6763 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6764 int retval;
6765
6766 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6767 socket_lock(so, 1);
6768 }
6769
6770 retval = filt_soread_common(kn, NULL, so);
6771
6772 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6773 socket_unlock(so, 1);
6774 }
6775
6776 return retval;
6777 }
6778
6779 static int
filt_sortouch(struct knote * kn,struct kevent_qos_s * kev)6780 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6781 {
6782 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6783 int retval;
6784
6785 socket_lock(so, 1);
6786
6787 /* save off the new input fflags and data */
6788 kn->kn_sfflags = kev->fflags;
6789 kn->kn_sdata = kev->data;
6790
6791 /* determine if changes result in fired events */
6792 retval = filt_soread_common(kn, NULL, so);
6793
6794 socket_unlock(so, 1);
6795
6796 return retval;
6797 }
6798
6799 static int
filt_sorprocess(struct knote * kn,struct kevent_qos_s * kev)6800 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6801 {
6802 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6803 int retval;
6804
6805 socket_lock(so, 1);
6806 retval = filt_soread_common(kn, kev, so);
6807 socket_unlock(so, 1);
6808
6809 return retval;
6810 }
6811
6812 int
so_wait_for_if_feedback(struct socket * so)6813 so_wait_for_if_feedback(struct socket *so)
6814 {
6815 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6816 (so->so_state & SS_ISCONNECTED)) {
6817 struct inpcb *inp = sotoinpcb(so);
6818 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6819 return 1;
6820 }
6821 }
6822 return 0;
6823 }
6824
/*
 * Common EVFILT_WRITE evaluation: decide whether this knote's write
 * event fires for the socket.  Returns nonzero when it does; if kev is
 * non-NULL the kevent is then filled with the current send-buffer
 * space.  Caller must hold the socket lock.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	/* Free space in the send buffer is the event's data payload. */
	int64_t data = sbspace(&so->so_snd);

	if (so->so_state & SS_CANTSENDMORE) {
		/* Write side shut down: report EOF plus any pending error. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		/* Pre-connect data allowed: writable regardless of state. */
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;

	/* NOTE_LOWAT may raise the threshold, clamped to the high-water mark. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat) {
			lowwat = so->so_snd.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/*
			 * With the not-sent low-water option, defer to the
			 * transport's own unsent-data check (TCP / MPTCP).
			 */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				/* Other transports: space alone fires it. */
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	/* Suppress the event while waiting for interface feedback. */
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
6898
6899 static int
filt_sowattach(struct knote * kn,__unused struct kevent_qos_s * kev)6900 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6901 {
6902 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6903
6904 /* socket locked */
6905 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6906 so->so_snd.sb_flags |= SB_KNOTE;
6907 }
6908
6909 /* determine if its already fired */
6910 return filt_sowrite_common(kn, NULL, so);
6911 }
6912
6913 static void
filt_sowdetach(struct knote * kn)6914 filt_sowdetach(struct knote *kn)
6915 {
6916 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6917 socket_lock(so, 1);
6918
6919 if (so->so_snd.sb_flags & SB_KNOTE) {
6920 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6921 so->so_snd.sb_flags &= ~SB_KNOTE;
6922 }
6923 }
6924 socket_unlock(so, 1);
6925 }
6926
6927 /*ARGSUSED*/
6928 static int
filt_sowrite(struct knote * kn,long hint)6929 filt_sowrite(struct knote *kn, long hint)
6930 {
6931 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6932 int ret;
6933
6934 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6935 socket_lock(so, 1);
6936 }
6937
6938 ret = filt_sowrite_common(kn, NULL, so);
6939
6940 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6941 socket_unlock(so, 1);
6942 }
6943
6944 return ret;
6945 }
6946
6947 static int
filt_sowtouch(struct knote * kn,struct kevent_qos_s * kev)6948 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6949 {
6950 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6951 int ret;
6952
6953 socket_lock(so, 1);
6954
6955 /*save off the new input fflags and data */
6956 kn->kn_sfflags = kev->fflags;
6957 kn->kn_sdata = kev->data;
6958
6959 /* determine if these changes result in a triggered event */
6960 ret = filt_sowrite_common(kn, NULL, so);
6961
6962 socket_unlock(so, 1);
6963
6964 return ret;
6965 }
6966
6967 static int
filt_sowprocess(struct knote * kn,struct kevent_qos_s * kev)6968 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6969 {
6970 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
6971 int ret;
6972
6973 socket_lock(so, 1);
6974 ret = filt_sowrite_common(kn, kev, so);
6975 socket_unlock(so, 1);
6976
6977 return ret;
6978 }
6979
/*
 * Common EVFILT_SOCK evaluation: translate event hints and current
 * socket state into kn_fflags, honoring the knote's interest mask and
 * suppressing re-delivery of level-triggered events already recorded
 * in kn_hook32.  Returns nonzero when an event fires; if kev is
 * non-NULL the kevent is then filled in.  Caller must hold the socket
 * lock.
 */
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	/* Edge-triggered hints map directly onto NOTE_* flags. */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/* The remaining events are level-triggered off socket state. */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}

	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* SUSPEND and RESUME are mutually exclusive by design. */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	/* A pending socket error always fires; data carries the errno. */
	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggered events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
		 * only one of them, and remember which one was delivered
		 * last.
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
7118
7119 static int
filt_sockattach(struct knote * kn,__unused struct kevent_qos_s * kev)7120 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7121 {
7122 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7123
7124 /* socket locked */
7125 kn->kn_hook32 = 0;
7126 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7127 so->so_flags |= SOF_KNOTE;
7128 }
7129
7130 /* determine if event already fired */
7131 return filt_sockev_common(kn, NULL, so, 0);
7132 }
7133
7134 static void
filt_sockdetach(struct knote * kn)7135 filt_sockdetach(struct knote *kn)
7136 {
7137 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7138 socket_lock(so, 1);
7139
7140 if ((so->so_flags & SOF_KNOTE) != 0) {
7141 if (KNOTE_DETACH(&so->so_klist, kn)) {
7142 so->so_flags &= ~SOF_KNOTE;
7143 }
7144 }
7145 socket_unlock(so, 1);
7146 }
7147
7148 static int
filt_sockev(struct knote * kn,long hint)7149 filt_sockev(struct knote *kn, long hint)
7150 {
7151 int ret = 0, locked = 0;
7152 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7153 long ev_hint = (hint & SO_FILT_HINT_EV);
7154
7155 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7156 socket_lock(so, 1);
7157 locked = 1;
7158 }
7159
7160 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7161
7162 if (locked) {
7163 socket_unlock(so, 1);
7164 }
7165
7166 return ret;
7167 }
7168
7169
7170
7171 /*
7172 * filt_socktouch - update event state
7173 */
7174 static int
filt_socktouch(struct knote * kn,struct kevent_qos_s * kev)7175 filt_socktouch(
7176 struct knote *kn,
7177 struct kevent_qos_s *kev)
7178 {
7179 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7180 uint32_t changed_flags;
7181 int ret;
7182
7183 socket_lock(so, 1);
7184
7185 /* save off the [result] data and fflags */
7186 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7187
7188 /* save off the new input fflags and data */
7189 kn->kn_sfflags = kev->fflags;
7190 kn->kn_sdata = kev->data;
7191
7192 /* restrict the current results to the (smaller?) set of new interest */
7193 /*
7194 * For compatibility with previous implementations, we leave kn_fflags
7195 * as they were before.
7196 */
7197 //kn->kn_fflags &= kev->fflags;
7198
7199 /*
7200 * Since we keep track of events that are already
7201 * delivered, if any of those events are not requested
7202 * anymore the state related to them can be reset
7203 */
7204 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7205
7206 /* determine if we have events to deliver */
7207 ret = filt_sockev_common(kn, NULL, so, 0);
7208
7209 socket_unlock(so, 1);
7210
7211 return ret;
7212 }
7213
7214 /*
7215 * filt_sockprocess - query event fired state and return data
7216 */
7217 static int
filt_sockprocess(struct knote * kn,struct kevent_qos_s * kev)7218 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7219 {
7220 struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7221 int ret = 0;
7222
7223 socket_lock(so, 1);
7224
7225 ret = filt_sockev_common(kn, kev, so, 0);
7226
7227 socket_unlock(so, 1);
7228
7229 return ret;
7230 }
7231
7232 void
get_sockev_state(struct socket * so,u_int32_t * statep)7233 get_sockev_state(struct socket *so, u_int32_t *statep)
7234 {
7235 u_int32_t state = *(statep);
7236
7237 /*
7238 * If the state variable is already used by a previous event,
7239 * reset it.
7240 */
7241 if (state != 0) {
7242 return;
7243 }
7244
7245 if (so->so_state & SS_ISCONNECTED) {
7246 state |= SOCKEV_CONNECTED;
7247 } else {
7248 state &= ~(SOCKEV_CONNECTED);
7249 }
7250 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7251 *(statep) = state;
7252 }
7253
7254 #define SO_LOCK_HISTORY_STR_LEN \
7255 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7256
/*
 * Format the socket's lock/unlock return-address history as
 * "lock:unlock " pairs (most recent first) for panic diagnostics.
 * Not thread-safe: the result lives in a single static buffer that is
 * overwritten on each call.
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	/* Walk the ring buffers backwards from the most recent entry. */
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return lock_history_str;
}
7273
7274 lck_mtx_t *
socket_getlock(struct socket * so,int flags)7275 socket_getlock(struct socket *so, int flags)
7276 {
7277 if (so->so_proto->pr_getlock != NULL) {
7278 return (*so->so_proto->pr_getlock)(so, flags);
7279 } else {
7280 return so->so_proto->pr_domain->dom_mtx;
7281 }
7282 }
7283
/*
 * Lock the socket, using the protocol's lock routine when it provides
 * one.  When refcount is nonzero, additionally take a use-count
 * reference on the socket (dropped by socket_unlock(so, 1)).
 */
void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	/* Record the caller's return address for lock-history diagnostics. */
	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
7306
7307 void
socket_lock_assert_owned(struct socket * so)7308 socket_lock_assert_owned(struct socket *so)
7309 {
7310 lck_mtx_t *mutex_held;
7311
7312 if (so->so_proto->pr_getlock != NULL) {
7313 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7314 } else {
7315 mutex_held = so->so_proto->pr_domain->dom_mtx;
7316 }
7317
7318 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7319 }
7320
/*
 * Attempt a non-blocking acquire of the socket's governing mutex.
 * Returns nonzero on success.  The mutex is resolved exactly as
 * socket_getlock() does.
 */
int
socket_try_lock(struct socket *so)
{
	return lck_mtx_try_lock(socket_getlock(so, 0));
}
7334
/*
 * Unlock the socket.  When refcount is nonzero, also drop a use-count
 * reference; the last reference triggers sofreelastref().  Panics on a
 * NULL protocol or an already-zero use count (with lock history for
 * diagnosis).
 */
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	/* Record the caller's return address for lock-history diagnostics. */
	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				/* Last reference: release socket resources. */
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
7375
/* Called with socket locked, will unlock socket */
/*
 * NOTE(review): the unlock implied by the comment above presumably
 * happens inside sofreelastref() — confirm against its definition.
 */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	/* Verify the caller really holds the governing mutex. */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}
7391
/* Take one use-count reference on the socket. */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}
7398
/* Drop one use-count reference; the last one frees the socket. */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);     /* lock only */
	socket_unlock(so, 1);   /* unlock & drop one reference */
}
7405
7406 /*
7407 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7408 * possibility of using jumbo clusters. Caller must ensure to hold
7409 * the socket lock.
7410 */
7411 void
somultipages(struct socket * so,boolean_t set)7412 somultipages(struct socket *so, boolean_t set)
7413 {
7414 if (set) {
7415 so->so_flags |= SOF_MULTIPAGES;
7416 } else {
7417 so->so_flags &= ~SOF_MULTIPAGES;
7418 }
7419 }
7420
7421 void
soif2kcl(struct socket * so,boolean_t set)7422 soif2kcl(struct socket *so, boolean_t set)
7423 {
7424 if (set) {
7425 so->so_flags1 |= SOF1_IF_2KCL;
7426 } else {
7427 so->so_flags1 &= ~SOF1_IF_2KCL;
7428 }
7429 }
7430
7431 int
so_isdstlocal(struct socket * so)7432 so_isdstlocal(struct socket *so)
7433 {
7434 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7435
7436 if (SOCK_DOM(so) == PF_INET) {
7437 return inaddr_local(inp->inp_faddr);
7438 } else if (SOCK_DOM(so) == PF_INET6) {
7439 return in6addr_local(&inp->in6p_faddr);
7440 }
7441
7442 return 0;
7443 }
7444
/*
 * Mark a socket eligible for defunct: set SOF_DEFUNCT, stop further
 * buffering (SB_DROP) and flush existing buffered data.  Honors
 * SOF_NODEFUNCT and the extended-background-idle opt-in unless
 * noforce is FALSE (force).  Returns 0 on success or when already
 * defunct; EOPNOTSUPP when the socket is exempt.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* Already defunct: both sockbufs must carry SB_DROP by invariant. */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llx [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		/* Forced: strip the exemption and proceed. */
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/*
		 * Socket opted into extended background idle: decide
		 * whether to grant the extension instead of defuncting.
		 */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			/* Grant the extension and reschedule the lazy timer. */
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
7554
/*
 * Carry out the defunct of a socket previously marked by
 * sosetdefunct(): wake blocked threads, shut down both directions,
 * disconnect, flush both sockbufs, and set SS_DEFUNCT.  Idempotent.
 * Caller must hold the socket lock.  Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}
	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	/* Release whatever data is still buffered in either direction. */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
7662
/*
 * Resume a socket from extended-background-idle: clear the in-progress
 * state, reset its start time, update the per-process flag and global
 * stats.  locked indicates whether the caller already holds the socket
 * lock.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
7692
7693 /*
7694 * Does not attempt to account for sockets that are delegated from
7695 * the current process
7696 */
7697 int
so_set_extended_bk_idle(struct socket * so,int optval)7698 so_set_extended_bk_idle(struct socket *so, int optval)
7699 {
7700 int error = 0;
7701
7702 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7703 SOCK_PROTO(so) != IPPROTO_TCP) {
7704 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7705 error = EOPNOTSUPP;
7706 } else if (optval == 0) {
7707 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7708
7709 soresume(current_proc(), so, 1);
7710 } else {
7711 struct proc *p = current_proc();
7712 struct fileproc *fp;
7713 int count = 0;
7714
7715 /*
7716 * Unlock socket to avoid lock ordering issue with
7717 * the proc fd table lock
7718 */
7719 socket_unlock(so, 0);
7720
7721 proc_fdlock(p);
7722 fdt_foreach(fp, p) {
7723 struct socket *so2;
7724
7725 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7726 continue;
7727 }
7728
7729 so2 = (struct socket *)fp_get_data(fp);
7730 if (so != so2 &&
7731 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7732 count++;
7733 }
7734 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7735 break;
7736 }
7737 }
7738 proc_fdunlock(p);
7739
7740 socket_lock(so, 0);
7741
7742 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7743 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7744 error = EBUSY;
7745 } else if (so->so_flags & SOF_DELEGATED) {
7746 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7747 error = EBUSY;
7748 } else {
7749 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7750 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7751 }
7752 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7753 "%s marked for extended bk idle\n",
7754 __func__, proc_selfpid(), proc_best_name(current_proc()),
7755 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7756 SOCK_DOM(so), SOCK_TYPE(so),
7757 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7758 "is" : "not");
7759 }
7760
7761 return error;
7762 }
7763
/*
 * Terminate a socket's extended-background-idle grace period and
 * force-defunct it.  Caller must hold the socket lock.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7782
7783 void
so_drain_extended_bk_idle(struct socket * so)7784 so_drain_extended_bk_idle(struct socket *so)
7785 {
7786 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7787 /*
7788 * Only penalize sockets that have outstanding data
7789 */
7790 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7791 so_stop_extended_bk_idle(so);
7792
7793 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7794 }
7795 }
7796 }
7797
7798 /*
7799 * Return values tells if socket is still in extended background idle
7800 */
7801 int
so_check_extended_bk_idle_time(struct socket * so)7802 so_check_extended_bk_idle_time(struct socket *so)
7803 {
7804 int ret = 1;
7805
7806 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7807 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7808 __func__, proc_selfpid(), proc_best_name(current_proc()),
7809 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7810 SOCK_DOM(so), SOCK_TYPE(so));
7811 if (net_uptime() - so->so_extended_bk_start >
7812 soextbkidlestat.so_xbkidle_time) {
7813 so_stop_extended_bk_idle(so);
7814
7815 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7816
7817 ret = 0;
7818 } else {
7819 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7820
7821 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7822 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7823 }
7824 }
7825
7826 return ret;
7827 }
7828
7829 void
resume_proc_sockets(proc_t p)7830 resume_proc_sockets(proc_t p)
7831 {
7832 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7833 struct fileproc *fp;
7834 struct socket *so;
7835
7836 proc_fdlock(p);
7837 fdt_foreach(fp, p) {
7838 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7839 continue;
7840 }
7841
7842 so = (struct socket *)fp_get_data(fp);
7843 (void) soresume(p, so, 0);
7844 }
7845 proc_fdunlock(p);
7846
7847 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7848 }
7849 }
7850
7851 __private_extern__ int
so_set_recv_anyif(struct socket * so,int optval)7852 so_set_recv_anyif(struct socket *so, int optval)
7853 {
7854 int ret = 0;
7855
7856 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7857 if (optval) {
7858 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7859 } else {
7860 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7861 }
7862 #if SKYWALK
7863 inp_update_netns_flags(so);
7864 #endif /* SKYWALK */
7865 }
7866
7867
7868 return ret;
7869 }
7870
7871 __private_extern__ int
so_get_recv_anyif(struct socket * so)7872 so_get_recv_anyif(struct socket *so)
7873 {
7874 int ret = 0;
7875
7876 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7877 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7878 }
7879
7880 return ret;
7881 }
7882
/*
 * Apply deny-type socket restrictions and propagate newly-set ones to
 * the PCB / MPTCP layers.  Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	/* Let MPTCP apply the restrictions to its subflows as well. */
	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
7941
/*
 * Return the deny-type restriction bits currently set on the socket.
 *
 * NOTE(review): SO_RESTRICT_DENY_CONSTRAINED can be set via
 * so_set_restrictions() but is not included in the mask reported here —
 * confirm whether that omission is intentional before relying on it.
 */
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}
7949
/*
 * Delegate the socket to the process identified by `epid`, recording
 * that process's upid/pid/executable-UUID as the socket's "effective"
 * owner.  `p` is the process issuing the socket option (possibly
 * kernproc); when `check_cred` is set, a privilege check may be
 * required before the delegation is allowed.
 *
 * Returns 0 on success; EINVAL for pid 0, EACCES on permission
 * failure, ESRCH if `epid` names no live process.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with `||`, the privilege check is skipped only
	 * when epid matches BOTH last_pid and proc_pid(p); the comment
	 * above suggests "either" — confirm the intended condition.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		/* Self-delegation clears any existing delegate association */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		/* Record the delegate's identity on the socket */
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		/*
		 * On macOS, also track the "responsible" process of the
		 * delegate when it differs from the delegate itself.
		 */
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				/* Responsible process is gone; mark as unknown */
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	/* Let the protocol update its notion of the last/effective owner */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Negating the gencnt presumably forces a policy refresh
		 * in so_update_policy() — TODO confirm */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	/* Drop the reference taken by proc_find() above */
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
8065
/*
 * Delegate the socket to the (unknown-pid) process identified by
 * executable UUID `euuid`.  Unlike so_set_effective_pid(), only the
 * UUID is available, so the socket's real {pid,upid} are inherited as
 * the effective ones.  `p` is the process issuing the socket option;
 * when `check_cred` is set, a privilege check may be required.
 *
 * Returns 0 on success; EINVAL for a null UUID, EACCES on permission
 * failure.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with `||`, the privilege check is skipped only
	 * when euuid matches BOTH last_uuid and the issuer's uuid; the
	 * comment above suggests "either" — confirm intended condition.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		/* Self-delegation clears any existing delegate association */
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		/* Negating the gencnt presumably forces a policy refresh
		 * in so_update_policy() — TODO confirm */
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
8169
8170 void
netpolicy_post_msg(uint32_t ev_code,struct netpolicy_event_data * ev_data,uint32_t ev_datalen)8171 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8172 uint32_t ev_datalen)
8173 {
8174 struct kev_msg ev_msg;
8175
8176 /*
8177 * A netpolicy event always starts with a netpolicy_event_data
8178 * structure, but the caller can provide for a longer event
8179 * structure to post, depending on the event code.
8180 */
8181 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8182
8183 bzero(&ev_msg, sizeof(ev_msg));
8184 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8185 ev_msg.kev_class = KEV_NETWORK_CLASS;
8186 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8187 ev_msg.event_code = ev_code;
8188
8189 ev_msg.dv[0].data_ptr = ev_data;
8190 ev_msg.dv[0].data_length = ev_datalen;
8191
8192 kev_post_msg(&ev_msg);
8193 }
8194
8195 void
socket_post_kev_msg(uint32_t ev_code,struct kev_socket_event_data * ev_data,uint32_t ev_datalen)8196 socket_post_kev_msg(uint32_t ev_code,
8197 struct kev_socket_event_data *ev_data,
8198 uint32_t ev_datalen)
8199 {
8200 struct kev_msg ev_msg;
8201
8202 bzero(&ev_msg, sizeof(ev_msg));
8203 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8204 ev_msg.kev_class = KEV_NETWORK_CLASS;
8205 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8206 ev_msg.event_code = ev_code;
8207
8208 ev_msg.dv[0].data_ptr = ev_data;
8209 ev_msg.dv[0].data_length = ev_datalen;
8210
8211 kev_post_msg(&ev_msg);
8212 }
8213
/*
 * Post a KEV_SOCKET_CLOSED event for a socket that opted in via
 * SOF1_WANT_KEV_SOCK_CLOSED, including its local and peer addresses
 * (truncated to the event structure's fixed-size fields if needed).
 * If either address cannot be obtained, no event is posted.
 */
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	/* Only sockets that asked for the closed event */
	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	/* Fetch the local address from the protocol */
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		/* Then the peer address; both must succeed to post */
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			/* Copy at most the field size; may truncate sa */
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	/* Called unconditionally — presumably NULL-safe; verify */
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}
8242