1 /*
2 * Copyright (c) 2003-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 /*
31 * todo:
32 * 1) ramesh is looking into how to replace taking a reference on
33 * the user's map (vm_map_reference()) since it is believed that
34 * would not hold the process for us.
35 * 2) david is looking into a way for us to set the priority of the
36 * worker threads to match that of the user's thread when the
37 * async IO was queued.
38 */
39
40
41 /*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/kauth.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61
62 #include <machine/limits.h>
63
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/waitq.h>
67 #include <kern/zalloc.h>
68 #include <kern/task.h>
69 #include <kern/sched_prim.h>
70
71 #include <vm/vm_map_xnu.h>
72
73 #include <os/refcnt.h>
74
75 #include <sys/kdebug.h>
76 #define AIO_work_queued 1
77 #define AIO_worker_wake 2
78 #define AIO_completion_sig 3
79 #define AIO_completion_cleanup_wait 4
80 #define AIO_completion_cleanup_wake 5
81 #define AIO_completion_suspend_wake 6
82 #define AIO_fsync_delay 7
83 #define AIO_cancel 10
84 #define AIO_cancel_async_workq 11
85 #define AIO_cancel_sync_workq 12
86 #define AIO_cancel_activeq 13
87 #define AIO_cancel_doneq 14
88 #define AIO_fsync 20
89 #define AIO_read 30
90 #define AIO_write 40
91 #define AIO_listio 50
92 #define AIO_error 60
93 #define AIO_error_val 61
94 #define AIO_error_activeq 62
95 #define AIO_error_workq 63
96 #define AIO_return 70
97 #define AIO_return_val 71
98 #define AIO_return_activeq 72
99 #define AIO_return_workq 73
100 #define AIO_exec 80
101 #define AIO_exit 90
102 #define AIO_exit_sleep 91
103 #define AIO_close 100
104 #define AIO_close_sleep 101
105 #define AIO_suspend 110
106 #define AIO_suspend_sleep 111
107 #define AIO_worker_thread 120
108
/*
 * Flags describing what kind of I/O an aio_workq_entry represents and
 * what, if anything, is blocked waiting on it.
 */
__options_decl(aio_entry_flags_t, uint32_t, {
	AIO_READ        = 0x00000001, /* a read */
	AIO_WRITE       = 0x00000002, /* a write */
	AIO_FSYNC       = 0x00000004, /* aio_fsync with op = O_SYNC */
	AIO_DSYNC       = 0x00000008, /* aio_fsync with op = O_DSYNC (not supported yet) */
	AIO_LIO         = 0x00000010, /* lio_listio generated IO */
	AIO_LIO_WAIT    = 0x00000020, /* lio_listio is waiting on the leader */

	/*
	 * These flags mean that this entry is blocking either:
	 * - close (AIO_CLOSE_WAIT)
	 * - exit or exec (AIO_EXIT_WAIT)
	 *
	 * These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
	 * will also neuter notifications in do_aio_completion_and_unlock().
	 */
	AIO_CLOSE_WAIT  = 0x00004000,
	AIO_EXIT_WAIT   = 0x00008000,
});
128
/*! @struct aio_workq_entry
 *
 * @discussion
 * This represents a piece of aio/lio work.
 *
 * The ownership rules go as follows:
 *
 * - the "proc" owns one refcount on the entry (from creation), while it is
 *   enqueued on the aio_activeq and then the aio_doneq.
 *
 *   either aio_return() (user read the status) or _aio_exit() (the process
 *   died) will dequeue the entry and consume this ref.
 *
 * - the async workqueue owns one refcount once the work is submitted,
 *   which is consumed in do_aio_completion_and_unlock().
 *
 *   This ref protects the entry for the end of
 *   do_aio_completion_and_unlock() (when signal delivery happens).
 *
 * - lio_listio() for batches picks one of the entries to be the "leader"
 *   of the batch. Each work item will have a refcount on its leader
 *   so that the accounting of the batch completion can be done on the leader
 *   (to be able to decrement lio_pending).
 *
 *   This ref is consumed in do_aio_completion_and_unlock() as well.
 *
 * - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
 *   an extra ref is taken in this syscall as it needs to keep accessing
 *   the leader "lio_pending" field until it hits 0.
 */
struct aio_workq_entry {
	/* queue lock */
	TAILQ_ENTRY(aio_workq_entry)    aio_workq_link;

	/* Proc lock */
	TAILQ_ENTRY(aio_workq_entry)    aio_proc_link;  /* p_aio_activeq or p_aio_doneq */
	user_ssize_t                    returnval;      /* return value from read / write request */
	errno_t                         errorval;       /* error value from read / write request */
	os_refcnt_t                     aio_refcount;   /* see ownership rules above */
	aio_entry_flags_t               flags;

	int                             lio_pending;    /* pending I/Os in lio group, only on leader */
	struct aio_workq_entry         *lio_leader;     /* pointer to the lio leader, can be self */

	/* Initialized and never changed, safe to access */
	struct proc                    *procp;          /* user proc that queued this request */
	user_addr_t                     uaiocbp;        /* pointer passed in from user land */
	struct user_aiocb               aiocb;          /* copy of aiocb from user land */
	struct vfs_context              context;        /* context which enqueued the request */

	/* Initialized, and possibly freed by aio_work_thread() or at free if cancelled */
	vm_map_t                        aio_map;        /* user land map we have a reference to */
};
182
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads start the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes. The request remains on aio_doneq until
 * user process calls aio_return or the process exits, either way that is our
 * trigger to release aio resources.
 */
typedef struct aio_workq {
	TAILQ_HEAD(, aio_workq_entry)   aioq_entries;   /* pending work items, FIFO */
	lck_spin_t                      aioq_lock;      /* protects aioq_entries */
	struct waitq                    aioq_waitq;     /* waitq associated with this queue */
} *aio_workq_t;
197
#define AIO_NUM_WORK_QUEUES 1
/*
 * Global anchor for AIO state: the system-wide count of extant entries
 * and the array of async work queues.
 */
struct aio_anchor_cb {
	os_atomic(int)   aio_total_count;       /* total extant entries */

	/* Hash table of queues here */
	int              aio_num_workqs;
	struct aio_workq aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;
207
208 /*
209 * Notes on aio sleep / wake channels.
210 * We currently pick a couple fields within the proc structure that will allow
211 * us sleep channels that currently do not collide with any other kernel routines.
212 * At this time, for binary compatibility reasons, we cannot create new proc fields.
213 */
214 #define AIO_SUSPEND_SLEEP_CHAN p_aio_activeq
215 #define AIO_CLEANUP_SLEEP_CHAN p_aio_total_count
216
/*
 * Assert that an aio entry belongs to the given proc.
 * Wrapped in do { } while (0) so the macro expands to a single statement;
 * the bare-if form is an `if`/`else` pairing hazard at unbraced call sites
 * (dangling-else), and trips up `if (x) MACRO(); else ...` constructs.
 */
#define ASSERT_AIO_FROM_PROC(aiop, theproc) \
	do { \
	        if ((aiop)->procp != (theproc)) { \
	                panic("AIO on a proc list that does not belong to that proc."); \
	        } \
	} while (0)
221
222 /*
223 * LOCAL PROTOTYPES
224 */
225 static void aio_proc_lock(proc_t procp);
226 static void aio_proc_lock_spin(proc_t procp);
227 static void aio_proc_unlock(proc_t procp);
228 static lck_mtx_t *aio_proc_mutex(proc_t procp);
229 static bool aio_has_active_requests_for_process(proc_t procp);
230 static bool aio_proc_has_active_requests_for_file(proc_t procp, int fd);
231 static boolean_t is_already_queued(proc_t procp, user_addr_t aiocbp);
232
233 static aio_workq_t aio_entry_workq(aio_workq_entry *entryp);
234 static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
235 static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
236 static void aio_entry_ref(aio_workq_entry *entryp);
237 static void aio_entry_unref(aio_workq_entry *entryp);
238 static bool aio_entry_try_workq_remove(aio_workq_entry *entryp);
239 static boolean_t aio_delay_fsync_request(aio_workq_entry *entryp);
240 static void aio_free_request(aio_workq_entry *entryp);
241
242 static void aio_workq_init(aio_workq_t wq);
243 static void aio_workq_lock_spin(aio_workq_t wq);
244 static void aio_workq_unlock(aio_workq_t wq);
245 static lck_spin_t *aio_workq_lock(aio_workq_t wq);
246
247 static void aio_work_thread(void *arg, wait_result_t wr);
248 static aio_workq_entry *aio_get_some_work(void);
249
250 static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
251 static int aio_validate(proc_t, aio_workq_entry *entryp);
252
253 static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t);
254 static void do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp);
255 static int do_aio_fsync(aio_workq_entry *entryp);
256 static int do_aio_read(aio_workq_entry *entryp);
257 static int do_aio_write(aio_workq_entry *entryp);
258 static void do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
259 static void do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
260 static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
261 static int aio_copy_in_list(proc_t, user_addr_t, user_addr_t *, int);
262
263 #define ASSERT_AIO_PROC_LOCK_OWNED(p) LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
264 #define ASSERT_AIO_WORKQ_LOCK_OWNED(q) LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)
265
266 /*
267 * EXTERNAL PROTOTYPES
268 */
269
270 /* in ...bsd/kern/sys_generic.c */
271 extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
272 user_addr_t bufp, user_size_t nbyte,
273 off_t offset, int flags, user_ssize_t *retval);
274 extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
275 user_addr_t bufp, user_size_t nbyte, off_t offset,
276 int flags, user_ssize_t *retval);
277
278 /*
279 * aio external global variables.
280 */
281 extern int aio_max_requests; /* AIO_MAX - configurable */
282 extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */
283 extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */
284
285
286 /*
287 * aio static variables.
288 */
289 static aio_anchor_cb aio_anchor = {
290 .aio_num_workqs = AIO_NUM_WORK_QUEUES,
291 };
292 os_refgrp_decl(static, aio_refgrp, "aio", NULL);
293 static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc");
294 static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue");
295 static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp);
296
297 static KALLOC_TYPE_DEFINE(aio_workq_zonep, aio_workq_entry, KT_DEFAULT);
298
299 /* Hash */
300 static aio_workq_t
aio_entry_workq(__unused aio_workq_entry * entryp)301 aio_entry_workq(__unused aio_workq_entry *entryp)
302 {
303 return &aio_anchor.aio_async_workqs[0];
304 }
305
306 static void
aio_workq_init(aio_workq_t wq)307 aio_workq_init(aio_workq_t wq)
308 {
309 TAILQ_INIT(&wq->aioq_entries);
310 lck_spin_init(&wq->aioq_lock, &aio_queue_lock_grp, LCK_ATTR_NULL);
311 waitq_init(&wq->aioq_waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
312 }
313
314
315 /*
316 * Can be passed a queue which is locked spin.
317 */
318 static void
aio_workq_remove_entry_locked(aio_workq_t queue,aio_workq_entry * entryp)319 aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
320 {
321 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
322
323 if (entryp->aio_workq_link.tqe_prev == NULL) {
324 panic("Trying to remove an entry from a work queue, but it is not on a queue");
325 }
326
327 TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
328 entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
329 }
330
331 static void
aio_workq_add_entry_locked(aio_workq_t queue,aio_workq_entry * entryp)332 aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
333 {
334 ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
335
336 TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
337 }
338
339 static void
aio_proc_lock(proc_t procp)340 aio_proc_lock(proc_t procp)
341 {
342 lck_mtx_lock(aio_proc_mutex(procp));
343 }
344
345 static void
aio_proc_lock_spin(proc_t procp)346 aio_proc_lock_spin(proc_t procp)
347 {
348 lck_mtx_lock_spin(aio_proc_mutex(procp));
349 }
350
351 static bool
aio_has_any_work(void)352 aio_has_any_work(void)
353 {
354 return os_atomic_load(&aio_anchor.aio_total_count, relaxed) != 0;
355 }
356
/*
 * Try to account for and enqueue a new request on the proc's active queue.
 * Returns false when the per-process or system-wide limits are reached,
 * or when the same user aiocb is already queued (POSIX forbids queueing
 * the same aiocb twice).  Called with the proc AIO lock held.
 */
static bool
aio_try_proc_insert_active_locked(proc_t procp, aio_workq_entry *entryp)
{
	int old, new;

	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	/* per-process limit (AIO_PROCESS_MAX) */
	if (procp->p_aio_total_count >= aio_max_requests_per_process) {
		return false;
	}

	if (is_already_queued(procp, entryp->uaiocbp)) {
		return false;
	}

	/*
	 * Reserve a slot in the global count; give up without any side
	 * effects if the system-wide cap (AIO_MAX) has been reached.
	 */
	os_atomic_rmw_loop(&aio_anchor.aio_total_count, old, new, relaxed, {
		if (old >= aio_max_requests) {
		        os_atomic_rmw_loop_give_up(return false);
		}
		new = old + 1;
	});

	/* both counters reserved: safe to publish the entry on the active queue */
	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
	procp->p_aio_total_count++;
	return true;
}
383
384 static void
aio_proc_move_done_locked(proc_t procp,aio_workq_entry * entryp)385 aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
386 {
387 TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
388 TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
389 }
390
/*
 * Unlink a completed entry from the proc's done queue and release its
 * slot in both the global and per-process accounting.
 * Called with the proc AIO lock held.
 */
static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
	entryp->aio_proc_link.tqe_prev = NULL; /* mark "not on a proc queue" */
	/* os_atomic_dec_orig returns the value before the decrement: must be > 0 */
	if (os_atomic_dec_orig(&aio_anchor.aio_total_count, relaxed) <= 0) {
		panic("Negative total AIO count!");
	}
	/* post-decrement: the pre-decrement per-proc count must be > 0 too */
	if (procp->p_aio_total_count-- <= 0) {
		panic("proc %p: p_aio_total_count accounting mismatch", procp);
	}
}
403
404 static void
aio_proc_unlock(proc_t procp)405 aio_proc_unlock(proc_t procp)
406 {
407 lck_mtx_unlock(aio_proc_mutex(procp));
408 }
409
410 static lck_mtx_t*
aio_proc_mutex(proc_t procp)411 aio_proc_mutex(proc_t procp)
412 {
413 return &procp->p_mlock;
414 }
415
416 static void
aio_entry_ref(aio_workq_entry * entryp)417 aio_entry_ref(aio_workq_entry *entryp)
418 {
419 os_ref_retain(&entryp->aio_refcount);
420 }
421
422 static void
aio_entry_unref(aio_workq_entry * entryp)423 aio_entry_unref(aio_workq_entry *entryp)
424 {
425 if (os_ref_release(&entryp->aio_refcount) == 0) {
426 aio_free_request(entryp);
427 }
428 }
429
430 static bool
aio_entry_try_workq_remove(aio_workq_entry * entryp)431 aio_entry_try_workq_remove(aio_workq_entry *entryp)
432 {
433 /* Can only be cancelled if it's still on a work queue */
434 if (entryp->aio_workq_link.tqe_prev != NULL) {
435 aio_workq_t queue;
436
437 /* Will have to check again under the lock */
438 queue = aio_entry_workq(entryp);
439 aio_workq_lock_spin(queue);
440 if (entryp->aio_workq_link.tqe_prev != NULL) {
441 aio_workq_remove_entry_locked(queue, entryp);
442 aio_workq_unlock(queue);
443 return true;
444 } else {
445 aio_workq_unlock(queue);
446 }
447 }
448
449 return false;
450 }
451
452 static void
aio_workq_lock_spin(aio_workq_t wq)453 aio_workq_lock_spin(aio_workq_t wq)
454 {
455 lck_spin_lock(aio_workq_lock(wq));
456 }
457
458 static void
aio_workq_unlock(aio_workq_t wq)459 aio_workq_unlock(aio_workq_t wq)
460 {
461 lck_spin_unlock(aio_workq_lock(wq));
462 }
463
464 static lck_spin_t*
aio_workq_lock(aio_workq_t wq)465 aio_workq_lock(aio_workq_t wq)
466 {
467 return &wq->aioq_lock;
468 }
469
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
 * is NULL then all outstanding async IO request for the given file
 * descriptor are cancelled (if possible).
 *
 * Returns 0 with *retval set to AIO_CANCELED / AIO_NOTCANCELED /
 * AIO_ALLDONE, or an errno with *retval = -1.
 */
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
{
	struct user_aiocb my_aiocb;
	int result;

	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_START,
	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

	/* quick check to see if there are any async IO requests queued up */
	if (!aio_has_any_work()) {
		result = 0;
		*retval = AIO_ALLDONE;
		goto ExitRoutine;
	}

	*retval = -1;
	if (uap->aiocbp != USER_ADDR_NULL) {
		/* copy in the user aiocb in the caller's ABI (64- or 32-bit) */
		if (proc_is64bit(p)) {
			struct user64_aiocb aiocb64;

			result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
			if (result == 0) {
				do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
			}
		} else {
			struct user32_aiocb aiocb32;

			result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
			if (result == 0) {
				do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
			}
		}

		if (result != 0) {
			/* copyin failed */
			result = EAGAIN;
			goto ExitRoutine;
		}

		/* NOTE - POSIX standard says a mismatch between the file */
		/* descriptor passed in and the file descriptor embedded in */
		/* the aiocb causes unspecified results. We return EBADF in */
		/* that situation. */
		if (uap->fd != my_aiocb.aio_fildes) {
			result = EBADF;
			goto ExitRoutine;
		}
	}

	aio_proc_lock(p);
	result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0);
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	aio_proc_unlock(p);

	if (result != -1) {
		/* matched something: hand the AIO_* status back to the user */
		*retval = result;
		result = 0;
		goto ExitRoutine;
	}

	/* -1 from do_aio_cancel_locked() means no matching request was found */
	result = EBADF;

ExitRoutine:
	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_END,
	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, 0, 0);

	return result;
}
545
546
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 * THIS MAY BLOCK.
 */
__private_extern__ void
_aio_close(proc_t p, int fd)
{
	int error;

	/* quick check to see if there are any async IO requests queued up */
	if (!aio_has_any_work()) {
		return;
	}

	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_START,
	    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);

	/* cancel all async IO requests on our todo queues for this file descriptor */
	aio_proc_lock(p);
	error = do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if (error == AIO_NOTCANCELED) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * and file descriptor on the active async IO queue. Active requests cannot
		 * be cancelled so we must wait for them to complete. We will get a special
		 * wake up call on our channel used to sleep for ALL active requests to
		 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
		 * when we must wait for all active aio requests.
		 */

		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep) | DBG_FUNC_NONE,
		    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);

		/* re-check the condition after every wakeup */
		while (aio_proc_has_active_requests_for_file(p, fd)) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
		}
	}

	aio_proc_unlock(p);

	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_END,
	    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
}
592
593
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp. The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval)
{
	aio_workq_entry *entryp;
	int error;

	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_START,
	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

	/* see if there are any aios to check */
	if (!aio_has_any_work()) {
		return EINVAL;
	}

	aio_proc_lock(p);

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
		if (entryp->uaiocbp == uap->aiocbp) {
			ASSERT_AIO_FROM_PROC(entryp, p);

			/* completed: report the request's final errno value */
			*retval = entryp->errorval;
			error = 0;

			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val) | DBG_FUNC_NONE,
			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
		if (entryp->uaiocbp == uap->aiocbp) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			/* request still in flight */
			*retval = EINPROGRESS;
			error = 0;
			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq) | DBG_FUNC_NONE,
			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
			goto ExitRoutine;
		}
	}

	/* no request matching this aiocb */
	error = EINVAL;

ExitRoutine:
	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_END,
	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
	aio_proc_unlock(p);

	return error;
}
651
652
653 /*
654 * aio_fsync - asynchronously force all IO operations associated
655 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
656 * queued at the time of the call to the synchronized completion state.
657 * NOTE - we do not support op O_DSYNC at this point since we do not support the
658 * fdatasync() call.
659 */
660 int
aio_fsync(proc_t p,struct aio_fsync_args * uap,int * retval)661 aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
662 {
663 aio_entry_flags_t fsync_kind;
664 int error;
665
666 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_START,
667 VM_KERNEL_ADDRPERM(p), uap->aiocbp, uap->op, 0, 0);
668
669 *retval = 0;
670 /* 0 := O_SYNC for binary backward compatibility with Panther */
671 if (uap->op == O_SYNC || uap->op == 0) {
672 fsync_kind = AIO_FSYNC;
673 } else if (uap->op == O_DSYNC) {
674 fsync_kind = AIO_DSYNC;
675 } else {
676 *retval = -1;
677 error = EINVAL;
678 goto ExitRoutine;
679 }
680
681 error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
682 if (error != 0) {
683 *retval = -1;
684 }
685
686 ExitRoutine:
687 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_END,
688 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
689
690 return error;
691 }
692
693
694 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
695 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
696 * (uap->aiocbp->aio_buf).
697 */
698 int
aio_read(proc_t p,struct aio_read_args * uap,int * retval)699 aio_read(proc_t p, struct aio_read_args *uap, int *retval)
700 {
701 int error;
702
703 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_START,
704 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
705
706 *retval = 0;
707
708 error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
709 if (error != 0) {
710 *retval = -1;
711 }
712
713 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_END,
714 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
715
716 return error;
717 }
718
719
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp. The return status is the value
 * that would be returned by corresponding IO request (read, write,
 * fdatasync, or sync). This is where we release kernel resources
 * held for async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
{
	aio_workq_entry *entryp;
	int error = EINVAL;

	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_START,
	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);

	/* See if there are any entries to check */
	if (!aio_has_any_work()) {
		goto ExitRoutine;
	}

	aio_proc_lock(p);
	*retval = 0;

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if (entryp->uaiocbp == uap->aiocbp) {
			/* Done and valid for aio_return(), pull it off the list */
			aio_proc_remove_done_locked(p, entryp);

			*retval = entryp->returnval;
			error = 0;
			aio_proc_unlock(p);

			/* drop the proc's refcount; this may free the entry */
			aio_entry_unref(entryp);

			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val) | DBG_FUNC_NONE,
			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if (entryp->uaiocbp == uap->aiocbp) {
			/* still in flight: resources are not released yet */
			error = EINPROGRESS;
			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq) | DBG_FUNC_NONE,
			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
			break;
		}
	}

	aio_proc_unlock(p);

ExitRoutine:
	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_END,
	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);

	return error;
}
782
783
784 /*
785 * _aio_exec - internal function used to clean up async IO requests for
786 * a process that is going away due to exec(). We cancel any async IOs
787 * we can and wait for those already active. We also disable signaling
788 * for cancelled or active aio requests that complete.
789 * This routine MAY block!
790 */
791 __private_extern__ void
_aio_exec(proc_t p)792 _aio_exec(proc_t p)
793 {
794 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_START,
795 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
796
797 _aio_exit(p);
798
799 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_END,
800 VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
801 }
802
803
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete. This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p)
{
	TAILQ_HEAD(, aio_workq_entry) tofree = TAILQ_HEAD_INITIALIZER(tofree);
	aio_workq_entry *entryp, *tmp;
	int error;

	/* quick check to see if there are any async IO requests queued up */
	if (!aio_has_any_work()) {
		return;
	}

	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_START,
	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);

	aio_proc_lock(p);

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.
	 */
	error = do_aio_cancel_locked(p, -1, USER_ADDR_NULL, AIO_EXIT_WAIT);
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if (error == AIO_NOTCANCELED) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue. Active requests cannot be cancelled so we
		 * must wait for them to complete. We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete. This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep) | DBG_FUNC_NONE,
		    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);

		/* re-check the condition after every wakeup */
		while (aio_has_active_requests_for_process(p)) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
		}
	}

	assert(!aio_has_active_requests_for_process(p));

	/* release all aio resources used by this process */
	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_doneq, aio_proc_link, tmp) {
		ASSERT_AIO_FROM_PROC(entryp, p);

		/* collect done entries; the unref happens after dropping the lock */
		aio_proc_remove_done_locked(p, entryp);
		TAILQ_INSERT_TAIL(&tofree, entryp, aio_proc_link);
	}

	aio_proc_unlock(p);

	/* free all the entries outside of the aio_proc_lock() */
	TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) {
		entryp->aio_proc_link.tqe_prev = NULL;
		aio_entry_unref(entryp);
	}

	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_END,
	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
}
872
873
874 static bool
should_cancel(aio_workq_entry * entryp,int fd,user_addr_t aiocbp,aio_entry_flags_t reason)875 should_cancel(aio_workq_entry *entryp, int fd, user_addr_t aiocbp,
876 aio_entry_flags_t reason)
877 {
878 if (reason & AIO_EXIT_WAIT) {
879 /* caller is _aio_exit() */
880 return true;
881 }
882 if (fd != entryp->aiocb.aio_fildes) {
883 /* not the file we're looking for */
884 return false;
885 }
886 /*
887 * aio_cancel() or _aio_close() cancel
888 * everything for a given fd when aiocbp is NULL
889 */
890 return aiocbp == USER_ADDR_NULL || entryp->uaiocbp == aiocbp;
891 }
892
/*
 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not deference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 *
 * Called with proc locked, and returns the same way.
 * NOTE: the proc AIO lock is dropped and re-taken around
 * do_aio_completion_and_unlock(), which is why the scan restarts
 * from the head when multiple entries can match.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
    aio_entry_flags_t reason)
{
	/* a NULL aiocbp means "cancel everything matching fd (or all, at exit)" */
	bool multiple_matches = (aiocbp == USER_ADDR_NULL);
	aio_workq_entry *entryp, *tmp;
	int result;

	ASSERT_AIO_PROC_LOCK_OWNED(p);

	/* look for a match on our queue of async todo work. */
again:
	result = -1;
	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_activeq, aio_proc_link, tmp) {
		ASSERT_AIO_FROM_PROC(entryp, p);

		if (!should_cancel(entryp, fd, aiocbp, reason)) {
			continue;
		}

		if (reason) {
			/* mark the entry as blocking close or exit/exec */
			entryp->flags |= reason;
			if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
				panic("Close and exit flags set at the same time");
			}
		}

		/* Can only be cancelled if it's still on a work queue */
		if (aio_entry_try_workq_remove(entryp)) {
			/* not picked up by a worker yet: cancel it outright */
			entryp->errorval = ECANCELED;
			entryp->returnval = -1;

			/* Now it's officially cancelled.  Do the completion */
			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE,
			    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
			    fd, 0, 0);
			/* this drops the proc AIO lock */
			do_aio_completion_and_unlock(p, entryp);

			aio_proc_lock(p);

			if (multiple_matches) {
				/*
				 * Restart from the head of the proc active queue since it
				 * may have been changed while we were away doing completion
				 * processing.
				 *
				 * Note that if we found an uncancellable AIO before, we will
				 * either find it again or discover that it's been completed,
				 * so resetting the result will not cause us to return success
				 * despite outstanding AIOs.
				 */
				goto again;
			}

			return AIO_CANCELED;
		}

		/*
		 * It's been taken off the active queue already, i.e. is in flight.
		 * All we can do is ask for notification.
		 */
		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq) | DBG_FUNC_NONE,
		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
		    fd, 0, 0);

		result = AIO_NOTCANCELED;
		if (!multiple_matches) {
			return result;
		}
	}

	/*
	 * if we didn't find any matches on the todo or active queues then look for a
	 * match on our queue of async IO requests that have completed and if found
	 * return AIO_ALLDONE result.
	 *
	 * Proc AIO lock is still held.
	 */
	if (result == -1) {
		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if (should_cancel(entryp, fd, aiocbp, reason)) {
				KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq) | DBG_FUNC_NONE,
				    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
				    fd, 0, 0);

				result = AIO_ALLDONE;
				if (!multiple_matches) {
					return result;
				}
			}
		}
	}

	return result;
}
1006
1007
1008 /*
1009 * aio_suspend - suspend the calling thread until at least one of the async
1010 * IO operations referenced by uap->aiocblist has completed, until a signal
1011 * interrupts the function, or uap->timeoutp time interval (optional) has
1012 * passed.
1013 * Returns 0 if one or more async IOs have completed else -1 and errno is
1014 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1015 * woke us up.
1016 */
1017 int
aio_suspend(proc_t p,struct aio_suspend_args * uap,int * retval)1018 aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
1019 {
1020 __pthread_testcancel(1);
1021 return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
1022 }
1023
1024
1025 int
aio_suspend_nocancel(proc_t p,struct aio_suspend_nocancel_args * uap,int * retval)1026 aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
1027 {
1028 int error;
1029 int i;
1030 uint64_t abstime;
1031 struct user_timespec ts;
1032 aio_workq_entry *entryp;
1033 user_addr_t *aiocbpp;
1034 size_t aiocbpp_size;
1035
1036 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_START,
1037 VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1038
1039 *retval = -1;
1040 abstime = 0;
1041 aiocbpp = NULL;
1042
1043 if (!aio_has_any_work()) {
1044 error = EINVAL;
1045 goto ExitThisRoutine;
1046 }
1047
1048 if (uap->nent < 1 || uap->nent > aio_max_requests_per_process ||
1049 os_mul_overflow(sizeof(user_addr_t), uap->nent, &aiocbpp_size)) {
1050 error = EINVAL;
1051 goto ExitThisRoutine;
1052 }
1053
1054 if (uap->timeoutp != USER_ADDR_NULL) {
1055 if (proc_is64bit(p)) {
1056 struct user64_timespec temp;
1057 error = copyin(uap->timeoutp, &temp, sizeof(temp));
1058 if (error == 0) {
1059 ts.tv_sec = (user_time_t)temp.tv_sec;
1060 ts.tv_nsec = (user_long_t)temp.tv_nsec;
1061 }
1062 } else {
1063 struct user32_timespec temp;
1064 error = copyin(uap->timeoutp, &temp, sizeof(temp));
1065 if (error == 0) {
1066 ts.tv_sec = temp.tv_sec;
1067 ts.tv_nsec = temp.tv_nsec;
1068 }
1069 }
1070 if (error != 0) {
1071 error = EAGAIN;
1072 goto ExitThisRoutine;
1073 }
1074
1075 if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
1076 error = EINVAL;
1077 goto ExitThisRoutine;
1078 }
1079
1080 nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1081 &abstime);
1082 clock_absolutetime_interval_to_deadline(abstime, &abstime);
1083 }
1084
1085 aiocbpp = (user_addr_t *)kalloc_data(aiocbpp_size, Z_WAITOK);
1086 if (aiocbpp == NULL || aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1087 error = EAGAIN;
1088 goto ExitThisRoutine;
1089 }
1090
1091 /* check list of aio requests to see if any have completed */
1092 check_for_our_aiocbp:
1093 aio_proc_lock_spin(p);
1094 for (i = 0; i < uap->nent; i++) {
1095 user_addr_t aiocbp;
1096
1097 /* NULL elements are legal so check for 'em */
1098 aiocbp = *(aiocbpp + i);
1099 if (aiocbp == USER_ADDR_NULL) {
1100 continue;
1101 }
1102
1103 /* return immediately if any aio request in the list is done */
1104 TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1105 ASSERT_AIO_FROM_PROC(entryp, p);
1106 if (entryp->uaiocbp == aiocbp) {
1107 aio_proc_unlock(p);
1108 *retval = 0;
1109 error = 0;
1110 goto ExitThisRoutine;
1111 }
1112 }
1113 }
1114
1115 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep) | DBG_FUNC_NONE,
1116 VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1117
1118 /*
1119 * wait for an async IO to complete or a signal fires or timeout expires.
1120 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1121 * interrupts us. If an async IO completes before a signal fires or our
1122 * timeout expires, we get a wakeup call from aio_work_thread().
1123 */
1124
1125 error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
1126 PCATCH | PWAIT | PDROP, "aio_suspend", abstime);
1127 if (error == 0) {
1128 /*
1129 * got our wakeup call from aio_work_thread().
1130 * Since we can get a wakeup on this channel from another thread in the
1131 * same process we head back up to make sure this is for the correct aiocbp.
1132 * If it is the correct aiocbp we will return from where we do the check
1133 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1134 * else we will fall out and just sleep again.
1135 */
1136 goto check_for_our_aiocbp;
1137 } else if (error == EWOULDBLOCK) {
1138 /* our timeout expired */
1139 error = EAGAIN;
1140 } else {
1141 /* we were interrupted */
1142 error = EINTR;
1143 }
1144
1145 ExitThisRoutine:
1146 if (aiocbpp != NULL) {
1147 kfree_data(aiocbpp, aiocbpp_size);
1148 }
1149
1150 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_END,
1151 VM_KERNEL_ADDRPERM(p), uap->nent, error, 0, 0);
1152
1153 return error;
1154 }
1155
1156
1157 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1158 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1159 * (uap->aiocbp->aio_buf).
1160 */
1161
1162 int
aio_write(proc_t p,struct aio_write_args * uap,int * retval __unused)1163 aio_write(proc_t p, struct aio_write_args *uap, int *retval __unused)
1164 {
1165 int error;
1166
1167 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_START,
1168 VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
1169
1170 error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);
1171
1172 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_END,
1173 VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
1174
1175 return error;
1176 }
1177
1178
1179 static int
aio_copy_in_list(proc_t procp,user_addr_t aiocblist,user_addr_t * aiocbpp,int nent)1180 aio_copy_in_list(proc_t procp, user_addr_t aiocblist, user_addr_t *aiocbpp,
1181 int nent)
1182 {
1183 int result;
1184
1185 /* copyin our aiocb pointers from list */
1186 result = copyin(aiocblist, aiocbpp,
1187 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1188 : (nent * sizeof(user32_addr_t)));
1189 if (result) {
1190 return result;
1191 }
1192
1193 /*
1194 * We depend on a list of user_addr_t's so we need to
1195 * munge and expand when these pointers came from a
1196 * 32-bit process
1197 */
1198 if (!proc_is64bit(procp)) {
1199 /* copy from last to first to deal with overlap */
1200 user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1201 user_addr_t *my_addrp = aiocbpp + (nent - 1);
1202
1203 for (int i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1204 *my_addrp = (user_addr_t) (*my_ptrp);
1205 }
1206 }
1207
1208 return 0;
1209 }
1210
1211
1212 static int
aio_copy_in_sigev(proc_t procp,user_addr_t sigp,struct user_sigevent * sigev)1213 aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1214 {
1215 int result = 0;
1216
1217 if (sigp == USER_ADDR_NULL) {
1218 goto out;
1219 }
1220
1221 /*
1222 * We need to munge aio_sigevent since it contains pointers.
1223 * Since we do not know if sigev_value is an int or a ptr we do
1224 * NOT cast the ptr to a user_addr_t. This means if we send
1225 * this info back to user space we need to remember sigev_value
1226 * was not expanded for the 32-bit case.
1227 *
1228 * Notes: This does NOT affect us since we don't support
1229 * sigev_value yet in the aio context.
1230 */
1231 if (proc_is64bit(procp)) {
1232 #if __LP64__
1233 struct user64_sigevent sigevent64;
1234
1235 result = copyin(sigp, &sigevent64, sizeof(sigevent64));
1236 if (result == 0) {
1237 sigev->sigev_notify = sigevent64.sigev_notify;
1238 sigev->sigev_signo = sigevent64.sigev_signo;
1239 sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1240 sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1241 sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1242 }
1243 #else
1244 panic("64bit process on 32bit kernel is not supported");
1245 #endif
1246 } else {
1247 struct user32_sigevent sigevent32;
1248
1249 result = copyin(sigp, &sigevent32, sizeof(sigevent32));
1250 if (result == 0) {
1251 sigev->sigev_notify = sigevent32.sigev_notify;
1252 sigev->sigev_signo = sigevent32.sigev_signo;
1253 sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1254 sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1255 sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1256 }
1257 }
1258
1259 if (result != 0) {
1260 result = EAGAIN;
1261 }
1262
1263 out:
1264 return result;
1265 }
1266
1267 /*
1268 * validate user_sigevent. at this point we only support
1269 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE. this means
1270 * sigev_value, sigev_notify_function, and sigev_notify_attributes
1271 * are ignored, since SIGEV_THREAD is unsupported. This is consistent
 * with no [RTS] (RealTime Signal) option group support.
1273 */
1274 static int
aio_sigev_validate(const struct user_sigevent * sigev)1275 aio_sigev_validate(const struct user_sigevent *sigev)
1276 {
1277 switch (sigev->sigev_notify) {
1278 case SIGEV_SIGNAL:
1279 {
1280 int signum;
1281
1282 /* make sure we have a valid signal number */
1283 signum = sigev->sigev_signo;
1284 if (signum <= 0 || signum >= NSIG ||
1285 signum == SIGKILL || signum == SIGSTOP) {
1286 return EINVAL;
1287 }
1288 }
1289 break;
1290
1291 case SIGEV_NONE:
1292 break;
1293
1294 case SIGEV_THREAD:
1295 /* Unsupported [RTS] */
1296
1297 default:
1298 return EINVAL;
1299 }
1300
1301 return 0;
1302 }
1303
1304
1305 /*
1306 * aio_try_enqueue_work_locked
1307 *
1308 * Queue up the entry on the aio asynchronous work queue in priority order
1309 * based on the relative priority of the request. We calculate the relative
1310 * priority using the nice value of the caller and the value
1311 *
1312 * Parameters: procp Process queueing the I/O
1313 * entryp The work queue entry being queued
1314 * leader The work leader if any
1315 *
 * Returns:	Whether the enqueue was successful
1317 *
1318 * Notes: This function is used for both lio_listio and aio
1319 *
1320 * XXX: At some point, we may have to consider thread priority
1321 * rather than process priority, but we don't maintain the
1322 * adjusted priority for threads the POSIX way.
1323 *
1324 * Called with proc locked.
1325 */
1326 static bool
aio_try_enqueue_work_locked(proc_t procp,aio_workq_entry * entryp,aio_workq_entry * leader)1327 aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp,
1328 aio_workq_entry *leader)
1329 {
1330 aio_workq_t queue = aio_entry_workq(entryp);
1331
1332 ASSERT_AIO_PROC_LOCK_OWNED(procp);
1333
1334 /* Onto proc queue */
1335 if (!aio_try_proc_insert_active_locked(procp, entryp)) {
1336 return false;
1337 }
1338
1339 if (leader) {
1340 aio_entry_ref(leader); /* consumed in do_aio_completion_and_unlock */
1341 leader->lio_pending++;
1342 entryp->lio_leader = leader;
1343 }
1344
1345 /* And work queue */
1346 aio_entry_ref(entryp); /* consumed in do_aio_completion_and_unlock */
1347 aio_workq_lock_spin(queue);
1348 aio_workq_add_entry_locked(queue, entryp);
1349 waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1350 THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
1351 aio_workq_unlock(queue);
1352
1353 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_START,
1354 VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1355 entryp->flags, entryp->aiocb.aio_fildes, 0);
1356 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_END,
1357 entryp->aiocb.aio_offset, 0, entryp->aiocb.aio_nbytes, 0, 0);
1358 return true;
1359 }
1360
1361
1362 /*
1363 * lio_listio - initiate a list of IO requests. We process the list of
1364 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1365 * (mode == LIO_NOWAIT).
1366 *
1367 * The caller gets error and return status for each aiocb in the list
1368 * via aio_error and aio_return. We must keep completed requests until
1369 * released by the aio_return call.
1370 */
1371 int
lio_listio(proc_t p,struct lio_listio_args * uap,int * retval __unused)1372 lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused)
1373 {
1374 aio_workq_entry *entries[AIO_LISTIO_MAX] = { };
1375 user_addr_t aiocbpp[AIO_LISTIO_MAX];
1376 struct user_sigevent aiosigev = { };
1377 int result = 0;
1378 int lio_count = 0;
1379
1380 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_START,
1381 VM_KERNEL_ADDRPERM(p), uap->nent, uap->mode, 0, 0);
1382
1383 if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
1384 result = EINVAL;
1385 goto ExitRoutine;
1386 }
1387
1388 if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
1389 result = EINVAL;
1390 goto ExitRoutine;
1391 }
1392
1393 /*
1394 * Use sigevent passed in to lio_listio for each of our calls, but
1395 * only do completion notification after the last request completes.
1396 */
1397 if (uap->sigp != USER_ADDR_NULL) {
1398 result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1399 if (result) {
1400 goto ExitRoutine;
1401 }
1402 result = aio_sigev_validate(&aiosigev);
1403 if (result) {
1404 goto ExitRoutine;
1405 }
1406 }
1407
1408 if (aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1409 result = EAGAIN;
1410 goto ExitRoutine;
1411 }
1412
1413 /*
1414 * allocate/parse all entries
1415 */
1416 for (int i = 0; i < uap->nent; i++) {
1417 aio_workq_entry *entryp;
1418
1419 /* NULL elements are legal so check for 'em */
1420 if (aiocbpp[i] == USER_ADDR_NULL) {
1421 continue;
1422 }
1423
1424 entryp = aio_create_queue_entry(p, aiocbpp[i], AIO_LIO);
1425 if (entryp == NULL) {
1426 result = EAGAIN;
1427 goto ExitRoutine;
1428 }
1429
1430 /*
1431 * This refcount is cleaned up on exit if the entry
1432 * isn't submitted
1433 */
1434 entries[lio_count++] = entryp;
1435 if (uap->mode == LIO_NOWAIT) {
1436 /* Set signal hander, if any */
1437 entryp->aiocb.aio_sigevent = aiosigev;
1438 }
1439 }
1440
1441 if (lio_count == 0) {
1442 /* There's nothing to submit */
1443 goto ExitRoutine;
1444 }
1445
1446 /*
1447 * Past this point we're commited and will not bail out
1448 *
1449 * - keep a reference on the leader for LIO_WAIT
1450 * - perform the submissions and optionally wait
1451 */
1452
1453 aio_workq_entry *leader = entries[0];
1454 if (uap->mode == LIO_WAIT) {
1455 aio_entry_ref(leader); /* consumed below */
1456 }
1457
1458 aio_proc_lock_spin(p);
1459
1460 for (int i = 0; i < lio_count; i++) {
1461 if (aio_try_enqueue_work_locked(p, entries[i], leader)) {
1462 entries[i] = NULL; /* the entry was submitted */
1463 } else {
1464 result = EAGAIN;
1465 }
1466 }
1467
1468 if (uap->mode == LIO_WAIT && result == 0) {
1469 leader->flags |= AIO_LIO_WAIT;
1470
1471 while (leader->lio_pending) {
1472 /* If we were interrupted, fail out (even if all finished) */
1473 if (msleep(leader, aio_proc_mutex(p),
1474 PCATCH | PRIBIO | PSPIN, "lio_listio", 0) != 0) {
1475 result = EINTR;
1476 break;
1477 }
1478 }
1479
1480 leader->flags &= ~AIO_LIO_WAIT;
1481 }
1482
1483 aio_proc_unlock(p);
1484
1485 if (uap->mode == LIO_WAIT) {
1486 aio_entry_unref(leader);
1487 }
1488
1489 ExitRoutine:
1490 /* Consume unsubmitted entries */
1491 for (int i = 0; i < lio_count; i++) {
1492 if (entries[i]) {
1493 aio_entry_unref(entries[i]);
1494 }
1495 }
1496
1497 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_END,
1498 VM_KERNEL_ADDRPERM(p), result, 0, 0, 0);
1499
1500 return result;
1501 }
1502
1503
1504 /*
1505 * aio worker thread. this is where all the real work gets done.
1506 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1507 * after new work is queued up.
1508 */
1509 __attribute__((noreturn))
1510 static void
aio_work_thread(void * arg __unused,wait_result_t wr __unused)1511 aio_work_thread(void *arg __unused, wait_result_t wr __unused)
1512 {
1513 aio_workq_entry *entryp;
1514 int error;
1515 vm_map_t currentmap;
1516 vm_map_t oldmap = VM_MAP_NULL;
1517 task_t oldaiotask = TASK_NULL;
1518 struct uthread *uthreadp = NULL;
1519 proc_t p = NULL;
1520
1521 for (;;) {
1522 /*
1523 * returns with the entry ref'ed.
1524 * sleeps until work is available.
1525 */
1526 entryp = aio_get_some_work();
1527 p = entryp->procp;
1528
1529 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_START,
1530 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1531 entryp->flags, 0, 0);
1532
1533 /*
1534 * Assume the target's address space identity for the duration
1535 * of the IO. Note: don't need to have the entryp locked,
1536 * because the proc and map don't change until it's freed.
1537 */
1538 currentmap = get_task_map(proc_task(current_proc()));
1539 if (currentmap != entryp->aio_map) {
1540 uthreadp = (struct uthread *) current_uthread();
1541 oldaiotask = uthreadp->uu_aio_task;
1542 /*
1543 * workq entries at this stage cause _aio_exec() and _aio_exit() to
1544 * block until we hit `do_aio_completion_and_unlock()` below,
1545 * which means that it is safe to dereference p->task without
1546 * holding a lock or taking references.
1547 */
1548 uthreadp->uu_aio_task = proc_task(p);
1549 oldmap = vm_map_switch(entryp->aio_map);
1550 }
1551
1552 if ((entryp->flags & AIO_READ) != 0) {
1553 error = do_aio_read(entryp);
1554 } else if ((entryp->flags & AIO_WRITE) != 0) {
1555 error = do_aio_write(entryp);
1556 } else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
1557 error = do_aio_fsync(entryp);
1558 } else {
1559 error = EINVAL;
1560 }
1561
1562 /* Restore old map */
1563 if (currentmap != entryp->aio_map) {
1564 vm_map_switch(oldmap);
1565 uthreadp->uu_aio_task = oldaiotask;
1566 }
1567
1568 /* liberate unused map */
1569 vm_map_deallocate(entryp->aio_map);
1570 entryp->aio_map = VM_MAP_NULL;
1571
1572 KERNEL_DEBUG(SDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END,
1573 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1574 entryp->errorval, entryp->returnval, 0);
1575
1576 /* we're done with the IO request so pop it off the active queue and */
1577 /* push it on the done queue */
1578 aio_proc_lock(p);
1579 entryp->errorval = error;
1580 do_aio_completion_and_unlock(p, entryp);
1581 }
1582 }
1583
1584
1585 /*
1586 * aio_get_some_work - get the next async IO request that is ready to be executed.
1587 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1588 * IO requests at the time the aio_fsync call came in have completed.
1589 * NOTE - AIO_LOCK must be held by caller
1590 */
1591 static aio_workq_entry *
aio_get_some_work(void)1592 aio_get_some_work(void)
1593 {
1594 aio_workq_entry *entryp = NULL;
1595 aio_workq_t queue = NULL;
1596
1597 /* Just one queue for the moment. In the future there will be many. */
1598 queue = &aio_anchor.aio_async_workqs[0];
1599 aio_workq_lock_spin(queue);
1600
1601 /*
1602 * Hold the queue lock.
1603 *
1604 * pop some work off the work queue and add to our active queue
1605 * Always start with the queue lock held.
1606 */
1607 while ((entryp = TAILQ_FIRST(&queue->aioq_entries))) {
1608 /*
1609 * Pull of of work queue. Once it's off, it can't be cancelled,
1610 * so we can take our ref once we drop the queue lock.
1611 */
1612
1613 aio_workq_remove_entry_locked(queue, entryp);
1614
1615 aio_workq_unlock(queue);
1616
1617 /*
1618 * Check if it's an fsync that must be delayed. No need to lock the entry;
1619 * that flag would have been set at initialization.
1620 */
1621 if ((entryp->flags & AIO_FSYNC) != 0) {
1622 /*
1623 * Check for unfinished operations on the same file
1624 * in this proc's queue.
1625 */
1626 aio_proc_lock_spin(entryp->procp);
1627 if (aio_delay_fsync_request(entryp)) {
1628 /* It needs to be delayed. Put it back on the end of the work queue */
1629 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE,
1630 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1631 0, 0, 0);
1632
1633 aio_proc_unlock(entryp->procp);
1634
1635 aio_workq_lock_spin(queue);
1636 aio_workq_add_entry_locked(queue, entryp);
1637 continue;
1638 }
1639 aio_proc_unlock(entryp->procp);
1640 }
1641
1642 return entryp;
1643 }
1644
1645 /* We will wake up when someone enqueues something */
1646 waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
1647 aio_workq_unlock(queue);
1648 thread_block(aio_work_thread);
1649
1650 __builtin_unreachable();
1651 }
1652
1653 /*
1654 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1655 * A big, simple hammer: only send it off if it's the most recently filed IO which has
1656 * not been completed.
1657 */
1658 static boolean_t
aio_delay_fsync_request(aio_workq_entry * entryp)1659 aio_delay_fsync_request(aio_workq_entry *entryp)
1660 {
1661 if (proc_in_teardown(entryp->procp)) {
1662 /*
1663 * we can't delay FSYNCS when in teardown as it will confuse _aio_exit,
1664 * if it was dequeued, then we must now commit to it
1665 */
1666 return FALSE;
1667 }
1668
1669 if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1670 return FALSE;
1671 }
1672
1673 return TRUE;
1674 }
1675
1676 static aio_workq_entry *
aio_create_queue_entry(proc_t procp,user_addr_t aiocbp,aio_entry_flags_t flags)1677 aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags)
1678 {
1679 aio_workq_entry *entryp;
1680
1681 entryp = zalloc_flags(aio_workq_zonep, Z_WAITOK | Z_ZERO);
1682 entryp->procp = procp;
1683 entryp->uaiocbp = aiocbp;
1684 entryp->flags = flags;
1685 /* consumed in aio_return or _aio_exit */
1686 os_ref_init(&entryp->aio_refcount, &aio_refgrp);
1687
1688 if (proc_is64bit(procp)) {
1689 struct user64_aiocb aiocb64;
1690
1691 if (copyin(aiocbp, &aiocb64, sizeof(aiocb64)) != 0) {
1692 goto error_exit;
1693 }
1694 do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1695 } else {
1696 struct user32_aiocb aiocb32;
1697
1698 if (copyin(aiocbp, &aiocb32, sizeof(aiocb32)) != 0) {
1699 goto error_exit;
1700 }
1701 do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
1702 }
1703
1704 /* do some more validation on the aiocb and embedded file descriptor */
1705 if (aio_validate(procp, entryp) != 0) {
1706 goto error_exit;
1707 }
1708
1709 /* get a reference to the user land map in order to keep it around */
1710 entryp->aio_map = get_task_map(proc_task(procp));
1711 vm_map_reference(entryp->aio_map);
1712
1713 /* get a reference on the current_thread, which is passed in vfs_context. */
1714 entryp->context = *vfs_context_current();
1715 thread_reference(entryp->context.vc_thread);
1716 kauth_cred_ref(entryp->context.vc_ucred);
1717 return entryp;
1718
1719 error_exit:
1720 zfree(aio_workq_zonep, entryp);
1721 return NULL;
1722 }
1723
1724
1725 /*
1726 * aio_queue_async_request - queue up an async IO request on our work queue then
1727 * wake up one of our worker threads to do the actual work. We get a reference
1728 * to our caller's user land map in order to keep it around while we are
1729 * processing the request.
1730 */
1731 static int
aio_queue_async_request(proc_t procp,user_addr_t aiocbp,aio_entry_flags_t flags)1732 aio_queue_async_request(proc_t procp, user_addr_t aiocbp,
1733 aio_entry_flags_t flags)
1734 {
1735 aio_workq_entry *entryp;
1736 int result;
1737
1738 entryp = aio_create_queue_entry(procp, aiocbp, flags);
1739 if (entryp == NULL) {
1740 result = EAGAIN;
1741 goto error_noalloc;
1742 }
1743
1744 aio_proc_lock_spin(procp);
1745 if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) {
1746 result = EAGAIN;
1747 goto error_exit;
1748 }
1749 aio_proc_unlock(procp);
1750 return 0;
1751
1752 error_exit:
1753 /*
1754 * This entry has not been queued up so no worries about
1755 * unlocked state and aio_map
1756 */
1757 aio_proc_unlock(procp);
1758 aio_free_request(entryp);
1759 error_noalloc:
1760 return result;
1761 }
1762
1763
1764 /*
1765 * aio_free_request - remove our reference on the user land map and
1766 * free the work queue entry resources. The entry is off all lists
1767 * and has zero refcount, so no one can have a pointer to it.
1768 */
1769 static void
aio_free_request(aio_workq_entry * entryp)1770 aio_free_request(aio_workq_entry *entryp)
1771 {
1772 if (entryp->aio_proc_link.tqe_prev || entryp->aio_workq_link.tqe_prev) {
1773 panic("aio_workq_entry %p being freed while still enqueued", entryp);
1774 }
1775
1776 /* remove our reference to the user land map. */
1777 if (VM_MAP_NULL != entryp->aio_map) {
1778 vm_map_deallocate(entryp->aio_map);
1779 }
1780
1781 /* remove our reference to thread which enqueued the request */
1782 if (entryp->context.vc_thread) {
1783 thread_deallocate(entryp->context.vc_thread);
1784 }
1785 kauth_cred_unref(&entryp->context.vc_ucred);
1786
1787 zfree(aio_workq_zonep, entryp);
1788 }
1789
1790
1791 /*
1792 * aio_validate
1793 *
1794 * validate the aiocb passed in by one of the aio syscalls.
1795 */
1796 static int
aio_validate(proc_t p,aio_workq_entry * entryp)1797 aio_validate(proc_t p, aio_workq_entry *entryp)
1798 {
1799 struct fileproc *fp;
1800 int flag;
1801 int result;
1802
1803 result = 0;
1804
1805 if ((entryp->flags & AIO_LIO) != 0) {
1806 if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
1807 entryp->flags |= AIO_READ;
1808 } else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
1809 entryp->flags |= AIO_WRITE;
1810 } else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
1811 return 0;
1812 } else {
1813 return EINVAL;
1814 }
1815 }
1816
1817 flag = FREAD;
1818 if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
1819 flag = FWRITE;
1820 }
1821
1822 if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
1823 if (entryp->aiocb.aio_nbytes > INT_MAX ||
1824 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1825 entryp->aiocb.aio_offset < 0) {
1826 return EINVAL;
1827 }
1828 }
1829
1830 result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
1831 if (result) {
1832 return result;
1833 }
1834
1835 /* validate the file descriptor and that the file was opened
1836 * for the appropriate read / write access.
1837 */
1838 proc_fdlock(p);
1839
1840 fp = fp_get_noref_locked(p, entryp->aiocb.aio_fildes);
1841 if (fp == NULL) {
1842 result = EBADF;
1843 } else if ((fp->fp_glob->fg_flag & flag) == 0) {
1844 /* we don't have read or write access */
1845 result = EBADF;
1846 } else if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_VNODE) {
1847 /* this is not a file */
1848 result = ESPIPE;
1849 } else {
1850 fp->fp_flags |= FP_AIOISSUED;
1851 }
1852
1853 proc_fdunlock(p);
1854
1855 return result;
1856 }
1857
1858 /*
1859 * do_aio_completion_and_unlock. Handle async IO completion.
1860 */
1861 static void
do_aio_completion_and_unlock(proc_t p,aio_workq_entry * entryp)1862 do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp)
1863 {
1864 aio_workq_entry *leader = entryp->lio_leader;
1865 int lio_pending = 0;
1866 bool do_signal = false;
1867
1868 ASSERT_AIO_PROC_LOCK_OWNED(p);
1869
1870 aio_proc_move_done_locked(p, entryp);
1871
1872 if (leader) {
1873 lio_pending = --leader->lio_pending;
1874 if (lio_pending < 0) {
1875 panic("lio_pending accounting mistake");
1876 }
1877 if (lio_pending == 0 && (leader->flags & AIO_LIO_WAIT)) {
1878 wakeup(leader);
1879 }
1880 entryp->lio_leader = NULL; /* no dangling pointers please */
1881 }
1882
1883 /*
1884 * need to handle case where a process is trying to exit, exec, or
1885 * close and is currently waiting for active aio requests to complete.
1886 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
1887 * other requests in the active queue for this process. If there are
1888 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1889 * If there are some still active then do nothing - we only want to
1890 * wakeup when all active aio requests for the process are complete.
1891 */
1892 if (__improbable(entryp->flags & AIO_EXIT_WAIT)) {
1893 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1894 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1895 0, 0, 0);
1896
1897 if (!aio_has_active_requests_for_process(p)) {
1898 /*
1899 * no active aio requests for this process, continue exiting. In this
1900 * case, there should be no one else waiting ont he proc in AIO...
1901 */
1902 wakeup_one((caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN);
1903
1904 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1905 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1906 0, 0, 0);
1907 }
1908 } else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
1909 /*
1910 * If this was the last request in the group, or not part of
1911 * a group, and that a signal is desired, send one.
1912 */
1913 do_signal = (lio_pending == 0);
1914 }
1915
1916 if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) {
1917 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1918 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1919 0, 0, 0);
1920
1921 if (!aio_proc_has_active_requests_for_file(p, entryp->aiocb.aio_fildes)) {
1922 /* Can't wakeup_one(); multiple closes might be in progress. */
1923 wakeup(&p->AIO_CLEANUP_SLEEP_CHAN);
1924
1925 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1926 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1927 0, 0, 0);
1928 }
1929 }
1930
1931 aio_proc_unlock(p);
1932
1933 if (do_signal) {
1934 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig) | DBG_FUNC_NONE,
1935 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1936 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0);
1937
1938 psignal(p, entryp->aiocb.aio_sigevent.sigev_signo);
1939 }
1940
1941 /*
1942 * A thread in aio_suspend() wants to known about completed IOs. If it checked
1943 * the done list before we moved our AIO there, then it already asserted its wait,
1944 * and we can wake it up without holding the lock. If it checked the list after
1945 * we did our move, then it already has seen the AIO that we moved. Herego, we
1946 * can do our wakeup without holding the lock.
1947 */
1948 wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);
1949 KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake) | DBG_FUNC_NONE,
1950 VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), 0, 0, 0);
1951
1952 aio_entry_unref(entryp); /* see aio_try_enqueue_work_locked */
1953 if (leader) {
1954 aio_entry_unref(leader); /* see lio_listio */
1955 }
1956 }
1957
1958
1959 /*
1960 * do_aio_read
1961 */
1962 static int
do_aio_read(aio_workq_entry * entryp)1963 do_aio_read(aio_workq_entry *entryp)
1964 {
1965 struct proc *p = entryp->procp;
1966 struct fileproc *fp;
1967 int error;
1968
1969 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
1970 return error;
1971 }
1972
1973 if (fp->fp_glob->fg_flag & FREAD) {
1974 error = dofileread(&entryp->context, fp,
1975 entryp->aiocb.aio_buf,
1976 entryp->aiocb.aio_nbytes,
1977 entryp->aiocb.aio_offset, FOF_OFFSET,
1978 &entryp->returnval);
1979 } else {
1980 error = EBADF;
1981 }
1982
1983 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
1984 return error;
1985 }
1986
1987
1988 /*
1989 * do_aio_write
1990 */
1991 static int
do_aio_write(aio_workq_entry * entryp)1992 do_aio_write(aio_workq_entry *entryp)
1993 {
1994 struct proc *p = entryp->procp;
1995 struct fileproc *fp;
1996 int error;
1997
1998 if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
1999 return error;
2000 }
2001
2002 if (fp->fp_glob->fg_flag & FWRITE) {
2003 int flags = 0;
2004
2005 if ((fp->fp_glob->fg_flag & O_APPEND) == 0) {
2006 flags |= FOF_OFFSET;
2007 }
2008
2009 /* NB: tell dofilewrite the offset, and to use the proc cred */
2010 error = dofilewrite(&entryp->context,
2011 fp,
2012 entryp->aiocb.aio_buf,
2013 entryp->aiocb.aio_nbytes,
2014 entryp->aiocb.aio_offset,
2015 flags,
2016 &entryp->returnval);
2017 } else {
2018 error = EBADF;
2019 }
2020
2021 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2022 return error;
2023 }
2024
2025
2026 /*
2027 * aio_has_active_requests_for_process - return whether the process has active
2028 * requests pending.
2029 */
2030 static bool
aio_has_active_requests_for_process(proc_t procp)2031 aio_has_active_requests_for_process(proc_t procp)
2032 {
2033 return !TAILQ_EMPTY(&procp->p_aio_activeq);
2034 }
2035
2036 /*
2037 * Called with the proc locked.
2038 */
2039 static bool
aio_proc_has_active_requests_for_file(proc_t procp,int fd)2040 aio_proc_has_active_requests_for_file(proc_t procp, int fd)
2041 {
2042 aio_workq_entry *entryp;
2043
2044 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2045 if (entryp->aiocb.aio_fildes == fd) {
2046 return true;
2047 }
2048 }
2049
2050 return false;
2051 }
2052
2053
2054 /*
2055 * do_aio_fsync
2056 */
2057 static int
do_aio_fsync(aio_workq_entry * entryp)2058 do_aio_fsync(aio_workq_entry *entryp)
2059 {
2060 struct proc *p = entryp->procp;
2061 struct vnode *vp;
2062 struct fileproc *fp;
2063 int sync_flag;
2064 int error;
2065
2066 /*
2067 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2068 *
2069 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2070 * to mark for update the metadata not strictly necessary for data
2071 * retrieval, rather than forcing it to disk.
2072 *
2073 * If AIO_FSYNC is set, we have to also wait for metadata not really
2074 * necessary to data retrival are committed to stable storage (e.g.
2075 * atime, mtime, ctime, etc.).
2076 *
2077 * Metadata necessary for data retrieval ust be committed to stable
2078 * storage in either case (file length, etc.).
2079 */
2080 if (entryp->flags & AIO_FSYNC) {
2081 sync_flag = MNT_WAIT;
2082 } else {
2083 sync_flag = MNT_DWAIT;
2084 }
2085
2086 error = fp_get_ftype(p, entryp->aiocb.aio_fildes, DTYPE_VNODE, ENOTSUP, &fp);
2087 if (error != 0) {
2088 entryp->returnval = -1;
2089 return error;
2090 }
2091 vp = fp_get_data(fp);
2092
2093 if ((error = vnode_getwithref(vp)) == 0) {
2094 error = VNOP_FSYNC(vp, sync_flag, &entryp->context);
2095
2096 (void)vnode_put(vp);
2097 } else {
2098 entryp->returnval = -1;
2099 }
2100
2101 fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2102 return error;
2103 }
2104
2105
2106 /*
2107 * is_already_queued - runs through our queues to see if the given
2108 * aiocbp / process is there. Returns TRUE if there is a match
2109 * on any of our aio queues.
2110 *
2111 * Called with proc aio lock held (can be held spin)
2112 */
2113 static boolean_t
is_already_queued(proc_t procp,user_addr_t aiocbp)2114 is_already_queued(proc_t procp, user_addr_t aiocbp)
2115 {
2116 aio_workq_entry *entryp;
2117 boolean_t result;
2118
2119 result = FALSE;
2120
2121 /* look for matches on our queue of async IO requests that have completed */
2122 TAILQ_FOREACH(entryp, &procp->p_aio_doneq, aio_proc_link) {
2123 if (aiocbp == entryp->uaiocbp) {
2124 result = TRUE;
2125 goto ExitThisRoutine;
2126 }
2127 }
2128
2129 /* look for matches on our queue of active async IO requests */
2130 TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2131 if (aiocbp == entryp->uaiocbp) {
2132 result = TRUE;
2133 goto ExitThisRoutine;
2134 }
2135 }
2136
2137 ExitThisRoutine:
2138 return result;
2139 }
2140
2141
2142 /*
2143 * aio initialization
2144 */
2145 __private_extern__ void
aio_init(void)2146 aio_init(void)
2147 {
2148 for (int i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2149 aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2150 }
2151
2152 _aio_create_worker_threads(aio_worker_threads);
2153 }
2154
2155
2156 /*
2157 * aio worker threads created here.
2158 */
2159 __private_extern__ void
_aio_create_worker_threads(int num)2160 _aio_create_worker_threads(int num)
2161 {
2162 int i;
2163
2164 /* create some worker threads to handle the async IO requests */
2165 for (i = 0; i < num; i++) {
2166 thread_t myThread;
2167
2168 if (KERN_SUCCESS != kernel_thread_start(aio_work_thread, NULL, &myThread)) {
2169 printf("%s - failed to create a work thread \n", __FUNCTION__);
2170 } else {
2171 thread_deallocate(myThread);
2172 }
2173 }
2174 }
2175
2176 /*
2177 * Return the current activation utask
2178 */
2179 task_t
get_aiotask(void)2180 get_aiotask(void)
2181 {
2182 return current_uthread()->uu_aio_task;
2183 }
2184
2185
2186 /*
2187 * In the case of an aiocb from a
2188 * 32-bit process we need to expand some longs and pointers to the correct
2189 * sizes in order to let downstream code always work on the same type of
2190 * aiocb (in our case that is a user_aiocb)
2191 */
2192 static void
do_munge_aiocb_user32_to_user(struct user32_aiocb * my_aiocbp,struct user_aiocb * the_user_aiocbp)2193 do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2194 {
2195 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2196 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2197 the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2198 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2199 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2200 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2201
2202 /* special case here. since we do not know if sigev_value is an */
2203 /* int or a ptr we do NOT cast the ptr to a user_addr_t. This */
2204 /* means if we send this info back to user space we need to remember */
2205 /* sigev_value was not expanded for the 32-bit case. */
2206 /* NOTE - this does NOT affect us since we don't support sigev_value */
2207 /* yet in the aio context. */
2208 //LP64
2209 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2210 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2211 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2212 my_aiocbp->aio_sigevent.sigev_value.sival_int;
2213 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2214 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2215 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2216 CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2217 }
2218
2219 /* Similar for 64-bit user process, so that we don't need to satisfy
2220 * the alignment constraints of the original user64_aiocb
2221 */
2222 #if !__LP64__
2223 __dead2
2224 #endif
2225 static void
do_munge_aiocb_user64_to_user(struct user64_aiocb * my_aiocbp,struct user_aiocb * the_user_aiocbp)2226 do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2227 {
2228 #if __LP64__
2229 the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2230 the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2231 the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2232 the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2233 the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2234 the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2235
2236 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2237 the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2238 the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2239 my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2240 the_user_aiocbp->aio_sigevent.sigev_notify_function =
2241 my_aiocbp->aio_sigevent.sigev_notify_function;
2242 the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2243 my_aiocbp->aio_sigevent.sigev_notify_attributes;
2244 #else
2245 #pragma unused(my_aiocbp, the_user_aiocbp)
2246 panic("64bit process on 32bit kernel is not supported");
2247 #endif
2248 }
2249