xref: /xnu-10002.81.5/bsd/kern/kern_aio.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 2003-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 /*
31  * todo:
32  *		1) ramesh is looking into how to replace taking a reference on
33  *		        the user's map (vm_map_reference()) since it is believed
34  *			that it would not hold the process for us.
35  *		2) david is looking into a way for us to set the priority of the
36  *		        worker threads to match that of the user's thread when the
37  *		        async IO was queued.
38  */
39 
40 
41 /*
42  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43  */
44 
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/kauth.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58 
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61 
62 #include <machine/limits.h>
63 
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/waitq.h>
67 #include <kern/zalloc.h>
68 #include <kern/task.h>
69 #include <kern/sched_prim.h>
70 
71 #include <vm/vm_map.h>
72 
73 #include <os/refcnt.h>
74 
75 #include <sys/kdebug.h>
76 #define AIO_work_queued                 1
77 #define AIO_worker_wake                 2
78 #define AIO_completion_sig              3
79 #define AIO_completion_cleanup_wait     4
80 #define AIO_completion_cleanup_wake     5
81 #define AIO_completion_suspend_wake     6
82 #define AIO_fsync_delay                 7
83 #define AIO_cancel                      10
84 #define AIO_cancel_async_workq          11
85 #define AIO_cancel_sync_workq           12
86 #define AIO_cancel_activeq              13
87 #define AIO_cancel_doneq                14
88 #define AIO_fsync                       20
89 #define AIO_read                        30
90 #define AIO_write                       40
91 #define AIO_listio                      50
92 #define AIO_error                       60
93 #define AIO_error_val                   61
94 #define AIO_error_activeq               62
95 #define AIO_error_workq                 63
96 #define AIO_return                      70
97 #define AIO_return_val                  71
98 #define AIO_return_activeq              72
99 #define AIO_return_workq                73
100 #define AIO_exec                        80
101 #define AIO_exit                        90
102 #define AIO_exit_sleep                  91
103 #define AIO_close                       100
104 #define AIO_close_sleep                 101
105 #define AIO_suspend                     110
106 #define AIO_suspend_sleep               111
107 #define AIO_worker_thread               120
108 
109 __options_decl(aio_entry_flags_t, uint32_t, {
110 	AIO_READ        = 0x00000001, /* a read */
111 	AIO_WRITE       = 0x00000002, /* a write */
112 	AIO_FSYNC       = 0x00000004, /* aio_fsync with op = O_SYNC */
113 	AIO_DSYNC       = 0x00000008, /* aio_fsync with op = O_DSYNC (not supported yet) */
114 	AIO_LIO         = 0x00000010, /* lio_listio generated IO */
115 	AIO_LIO_WAIT    = 0x00000020, /* lio_listio is waiting on the leader */
116 
117 	/*
118 	 * These flags mean that this entry is blocking either:
119 	 * - close (AIO_CLOSE_WAIT)
120 	 * - exit or exec (AIO_EXIT_WAIT)
121 	 *
122 	 * These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
123 	 * will also neuter notifications in do_aio_completion_and_unlock().
124 	 */
125 	AIO_CLOSE_WAIT  = 0x00004000,
126 	AIO_EXIT_WAIT   = 0x00008000,
127 });
128 
129 /*! @struct aio_workq_entry
130  *
131  * @discussion
132  * This represents a piece of aio/lio work.
133  *
134  * The ownership rules go as follows:
135  *
136  * - the "proc" owns one refcount on the entry (from creation), while it is
137  *   enqueued on the aio_activeq and then the aio_doneq.
138  *
139  *   either aio_return() (user read the status) or _aio_exit() (the process
140  *   died) will dequeue the entry and consume this ref.
141  *
142  * - the async workqueue owns one refcount once the work is submitted,
143  *   which is consumed in do_aio_completion_and_unlock().
144  *
145  *   This ref protects the entry through the end of
146  *   do_aio_completion_and_unlock() (when signal delivery happens).
147  *
148  * - lio_listio() for batches picks one of the entries to be the "leader"
149  *   of the batch. Each work item will have a refcount on its leader
150  *   so that the accounting of the batch completion can be done on the leader
151  *   (to be able to decrement lio_pending).
152  *
153  *   This ref is consumed in do_aio_completion_and_unlock() as well.
154  *
155  * - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
156  *   an extra ref is taken in this syscall as it needs to keep accessing
157  *   the leader "lio_pending" field until it hits 0.
158  */
159 struct aio_workq_entry {
160 	/* queue lock */
161 	TAILQ_ENTRY(aio_workq_entry)    aio_workq_link;
162 
163 	/* Proc lock */
164 	TAILQ_ENTRY(aio_workq_entry)    aio_proc_link;  /* p_aio_activeq or p_aio_doneq */
165 	user_ssize_t                    returnval;      /* return value from read / write request */
166 	errno_t                         errorval;       /* error value from read / write request */
167 	os_refcnt_t                     aio_refcount;
168 	aio_entry_flags_t               flags;
169 
170 	int                             lio_pending;    /* pending I/Os in lio group, only on leader */
171 	struct aio_workq_entry         *lio_leader;     /* pointer to the lio leader, can be self */
172 
173 	/* Initialized and never changed, safe to access */
174 	struct proc                    *procp;          /* user proc that queued this request */
175 	user_addr_t                     uaiocbp;        /* pointer passed in from user land */
176 	struct user_aiocb               aiocb;          /* copy of aiocb from user land */
177 	struct vfs_context              context;        /* context which enqueued the request */
178 
179 	/* Initialized, and possibly freed by aio_work_thread() or at free if cancelled */
180 	vm_map_t                        aio_map;        /* user land map we have a reference to */
181 };
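
/*
 * A minimal sketch of the refcount lifecycle described above, for a plain
 * aio_read()/aio_write() entry (no lio leader ref, no LIO_WAIT ref); the
 * function names are from this file, the timeline itself is illustrative:
 *
 *	entryp = aio_create_queue_entry(...);   // os_ref_init: 1 ref (proc)
 *	aio_try_enqueue_work_locked(...);       // aio_entry_ref: 2 refs (+workq)
 *	    ... aio_work_thread() performs the IO ...
 *	do_aio_completion_and_unlock(...);      // aio_entry_unref: 1 ref left
 *	aio_return() / _aio_exit();             // aio_entry_unref: 0 refs, freed
 */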
182 
183 /*
184  * aio requests queue up on the aio_async_workq or lio_sync_workq (for
185  * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
186  * (proc.aio_activeq) when one of our worker threads start the IO.
187  * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
188  * when the IO request completes.  The request remains on aio_doneq until
189  * user process calls aio_return or the process exits, either way that is our
190  * trigger to release aio resources.
191  */
192 typedef struct aio_workq   {
193 	TAILQ_HEAD(, aio_workq_entry)   aioq_entries;
194 	lck_spin_t                      aioq_lock;
195 	struct waitq                    aioq_waitq;
196 } *aio_workq_t;
197 
198 #define AIO_NUM_WORK_QUEUES 1
199 struct aio_anchor_cb {
200 	os_atomic(int)          aio_total_count;        /* total extant entries */
201 
202 	/* Hash table of queues here */
203 	int                     aio_num_workqs;
204 	struct aio_workq        aio_async_workqs[AIO_NUM_WORK_QUEUES];
205 };
206 typedef struct aio_anchor_cb aio_anchor_cb;
207 
208 /*
209  * Notes on aio sleep / wake channels.
210  * We currently pick a couple of fields within the proc structure that give us
211  * sleep channels that do not collide with any other kernel routines.
212  * At this time, for binary compatibility reasons, we cannot create new proc fields.
213  */
214 #define AIO_SUSPEND_SLEEP_CHAN  p_aio_activeq
215 #define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count
216 
217 #define ASSERT_AIO_FROM_PROC(aiop, theproc)     \
218 	if ((aiop)->procp != (theproc)) {       \
219 	        panic("AIO on a proc list that does not belong to that proc."); \
220 	}
221 
222 /*
223  *  LOCAL PROTOTYPES
224  */
225 static void             aio_proc_lock(proc_t procp);
226 static void             aio_proc_lock_spin(proc_t procp);
227 static void             aio_proc_unlock(proc_t procp);
228 static lck_mtx_t       *aio_proc_mutex(proc_t procp);
229 static bool             aio_has_active_requests_for_process(proc_t procp);
230 static bool             aio_proc_has_active_requests_for_file(proc_t procp, int fd);
231 static boolean_t        is_already_queued(proc_t procp, user_addr_t aiocbp);
232 
233 static aio_workq_t      aio_entry_workq(aio_workq_entry *entryp);
234 static void             aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
235 static void             aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
236 static void             aio_entry_ref(aio_workq_entry *entryp);
237 static void             aio_entry_unref(aio_workq_entry *entryp);
238 static bool             aio_entry_try_workq_remove(aio_workq_entry *entryp);
239 static boolean_t        aio_delay_fsync_request(aio_workq_entry *entryp);
240 static void             aio_free_request(aio_workq_entry *entryp);
241 
242 static void             aio_workq_init(aio_workq_t wq);
243 static void             aio_workq_lock_spin(aio_workq_t wq);
244 static void             aio_workq_unlock(aio_workq_t wq);
245 static lck_spin_t      *aio_workq_lock(aio_workq_t wq);
246 
247 static void             aio_work_thread(void *arg, wait_result_t wr);
248 static aio_workq_entry *aio_get_some_work(void);
249 
250 static int              aio_queue_async_request(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
251 static int              aio_validate(proc_t, aio_workq_entry *entryp);
252 
253 static int              do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t);
254 static void             do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp);
255 static int              do_aio_fsync(aio_workq_entry *entryp);
256 static int              do_aio_read(aio_workq_entry *entryp);
257 static int              do_aio_write(aio_workq_entry *entryp);
258 static void             do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
259 static void             do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
260 static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
261 static int              aio_copy_in_list(proc_t, user_addr_t, user_addr_t *, int);
262 
263 #define ASSERT_AIO_PROC_LOCK_OWNED(p)   LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
264 #define ASSERT_AIO_WORKQ_LOCK_OWNED(q)  LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)
265 
266 /*
267  *  EXTERNAL PROTOTYPES
268  */
269 
270 /* in ...bsd/kern/sys_generic.c */
271 extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
272     user_addr_t bufp, user_size_t nbyte,
273     off_t offset, int flags, user_ssize_t *retval);
274 extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
275     user_addr_t bufp, user_size_t nbyte, off_t offset,
276     int flags, user_ssize_t *retval);
277 
278 /*
279  * aio external global variables.
280  */
281 extern int aio_max_requests;                    /* AIO_MAX - configurable */
282 extern int aio_max_requests_per_process;        /* AIO_PROCESS_MAX - configurable */
283 extern int aio_worker_threads;                  /* AIO_THREAD_COUNT - configurable */
284 
285 
286 /*
287  * aio static variables.
288  */
289 static aio_anchor_cb aio_anchor = {
290 	.aio_num_workqs = AIO_NUM_WORK_QUEUES,
291 };
292 os_refgrp_decl(static, aio_refgrp, "aio", NULL);
293 static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc");
294 static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue");
295 static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp);
296 
297 static KALLOC_TYPE_DEFINE(aio_workq_zonep, aio_workq_entry, KT_DEFAULT);
298 
299 /* Hash */
300 static aio_workq_t
301 aio_entry_workq(__unused aio_workq_entry *entryp)
302 {
303 	return &aio_anchor.aio_async_workqs[0];
304 }
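
/*
 * With AIO_NUM_WORK_QUEUES == 1 the hash above is trivial.  If more work
 * queues were ever configured, a sketch of the selection might look like
 * the following (hypothetical, not part of the original source):
 *
 *	return &aio_anchor.aio_async_workqs[
 *	    (uintptr_t)entryp->procp % aio_anchor.aio_num_workqs];
 */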
305 
306 static void
307 aio_workq_init(aio_workq_t wq)
308 {
309 	TAILQ_INIT(&wq->aioq_entries);
310 	lck_spin_init(&wq->aioq_lock, &aio_queue_lock_grp, LCK_ATTR_NULL);
311 	waitq_init(&wq->aioq_waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
312 }
313 
314 
315 /*
316  * Can be passed a queue which is locked spin.
317  */
318 static void
319 aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
320 {
321 	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
322 
323 	if (entryp->aio_workq_link.tqe_prev == NULL) {
324 		panic("Trying to remove an entry from a work queue, but it is not on a queue");
325 	}
326 
327 	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
328 	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
329 }
330 
331 static void
332 aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
333 {
334 	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
335 
336 	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
337 }
338 
339 static void
340 aio_proc_lock(proc_t procp)
341 {
342 	lck_mtx_lock(aio_proc_mutex(procp));
343 }
344 
345 static void
346 aio_proc_lock_spin(proc_t procp)
347 {
348 	lck_mtx_lock_spin(aio_proc_mutex(procp));
349 }
350 
351 static bool
352 aio_has_any_work(void)
353 {
354 	return os_atomic_load(&aio_anchor.aio_total_count, relaxed) != 0;
355 }
356 
357 static bool
358 aio_try_proc_insert_active_locked(proc_t procp, aio_workq_entry *entryp)
359 {
360 	int old, new;
361 
362 	ASSERT_AIO_PROC_LOCK_OWNED(procp);
363 
364 	if (procp->p_aio_total_count >= aio_max_requests_per_process) {
365 		return false;
366 	}
367 
368 	if (is_already_queued(procp, entryp->uaiocbp)) {
369 		return false;
370 	}
371 
372 	os_atomic_rmw_loop(&aio_anchor.aio_total_count, old, new, relaxed, {
373 		if (old >= aio_max_requests) {
374 		        os_atomic_rmw_loop_give_up(return false);
375 		}
376 		new = old + 1;
377 	});
378 
379 	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
380 	procp->p_aio_total_count++;
381 	return true;
382 }
383 
384 static void
385 aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
386 {
387 	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
388 	TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
389 }
390 
391 static void
392 aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
393 {
394 	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
395 	entryp->aio_proc_link.tqe_prev = NULL;
396 	if (os_atomic_dec_orig(&aio_anchor.aio_total_count, relaxed) <= 0) {
397 		panic("Negative total AIO count!");
398 	}
399 	if (procp->p_aio_total_count-- <= 0) {
400 		panic("proc %p: p_aio_total_count accounting mismatch", procp);
401 	}
402 }
403 
404 static void
405 aio_proc_unlock(proc_t procp)
406 {
407 	lck_mtx_unlock(aio_proc_mutex(procp));
408 }
409 
410 static lck_mtx_t*
411 aio_proc_mutex(proc_t procp)
412 {
413 	return &procp->p_mlock;
414 }
415 
416 static void
417 aio_entry_ref(aio_workq_entry *entryp)
418 {
419 	os_ref_retain(&entryp->aio_refcount);
420 }
421 
422 static void
423 aio_entry_unref(aio_workq_entry *entryp)
424 {
425 	if (os_ref_release(&entryp->aio_refcount) == 0) {
426 		aio_free_request(entryp);
427 	}
428 }
429 
430 static bool
431 aio_entry_try_workq_remove(aio_workq_entry *entryp)
432 {
433 	/* Can only be cancelled if it's still on a work queue */
434 	if (entryp->aio_workq_link.tqe_prev != NULL) {
435 		aio_workq_t queue;
436 
437 		/* Will have to check again under the lock */
438 		queue = aio_entry_workq(entryp);
439 		aio_workq_lock_spin(queue);
440 		if (entryp->aio_workq_link.tqe_prev != NULL) {
441 			aio_workq_remove_entry_locked(queue, entryp);
442 			aio_workq_unlock(queue);
443 			return true;
444 		} else {
445 			aio_workq_unlock(queue);
446 		}
447 	}
448 
449 	return false;
450 }
451 
452 static void
453 aio_workq_lock_spin(aio_workq_t wq)
454 {
455 	lck_spin_lock(aio_workq_lock(wq));
456 }
457 
458 static void
459 aio_workq_unlock(aio_workq_t wq)
460 {
461 	lck_spin_unlock(aio_workq_lock(wq));
462 }
463 
464 static lck_spin_t*
465 aio_workq_lock(aio_workq_t wq)
466 {
467 	return &wq->aioq_lock;
468 }
469 
470 /*
471  * aio_cancel - attempt to cancel one or more async IO requests currently
472  * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
473  * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
474  * is NULL then all outstanding async IO request for the given file
475  * descriptor are cancelled (if possible).
476  */
477 int
478 aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
479 {
480 	struct user_aiocb my_aiocb;
481 	int               result;
482 
483 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_START,
484 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
485 
486 	/* quick check to see if there are any async IO requests queued up */
487 	if (!aio_has_any_work()) {
488 		result = 0;
489 		*retval = AIO_ALLDONE;
490 		goto ExitRoutine;
491 	}
492 
493 	*retval = -1;
494 	if (uap->aiocbp != USER_ADDR_NULL) {
495 		if (proc_is64bit(p)) {
496 			struct user64_aiocb aiocb64;
497 
498 			result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
499 			if (result == 0) {
500 				do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
501 			}
502 		} else {
503 			struct user32_aiocb aiocb32;
504 
505 			result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
506 			if (result == 0) {
507 				do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
508 			}
509 		}
510 
511 		if (result != 0) {
512 			result = EAGAIN;
513 			goto ExitRoutine;
514 		}
515 
516 		/* NOTE - POSIX standard says a mismatch between the file */
517 		/* descriptor passed in and the file descriptor embedded in */
518 		/* the aiocb causes unspecified results.  We return EBADF in */
519 		/* that situation.  */
520 		if (uap->fd != my_aiocb.aio_fildes) {
521 			result = EBADF;
522 			goto ExitRoutine;
523 		}
524 	}
525 
526 	aio_proc_lock(p);
527 	result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0);
528 	ASSERT_AIO_PROC_LOCK_OWNED(p);
529 	aio_proc_unlock(p);
530 
531 	if (result != -1) {
532 		*retval = result;
533 		result = 0;
534 		goto ExitRoutine;
535 	}
536 
537 	result = EBADF;
538 
539 ExitRoutine:
540 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_END,
541 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, 0, 0);
542 
543 	return result;
544 }
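
/*
 * Hedged userland illustration of the semantics implemented above (POSIX
 * usage assumed, not taken from this file): cancel one specific request;
 * the fd argument must match cb.aio_fildes or EBADF results.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	int
 *	cancel_one(int fd, struct aiocb *cb)
 *	{
 *		switch (aio_cancel(fd, cb)) {
 *		case AIO_CANCELED:      // was still on a work queue
 *			return 0;
 *		case AIO_NOTCANCELED:   // in flight; must wait for completion
 *			return EINPROGRESS;
 *		case AIO_ALLDONE:       // already done; reap with aio_return()
 *			return 0;
 *		default:
 *			return errno;   // e.g. EBADF on an fd mismatch
 *		}
 *	}
 */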
545 
546 
547 /*
548  * _aio_close - internal function used to clean up async IO requests for
549  * a file descriptor that is closing.
550  * THIS MAY BLOCK.
551  */
552 __private_extern__ void
553 _aio_close(proc_t p, int fd)
554 {
555 	int error;
556 
557 	/* quick check to see if there are any async IO requests queued up */
558 	if (!aio_has_any_work()) {
559 		return;
560 	}
561 
562 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_START,
563 	    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
564 
565 	/* cancel all async IO requests on our todo queues for this file descriptor */
566 	aio_proc_lock(p);
567 	error = do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
568 	ASSERT_AIO_PROC_LOCK_OWNED(p);
569 	if (error == AIO_NOTCANCELED) {
570 		/*
571 		 * AIO_NOTCANCELED is returned when we find an aio request for this process
572 		 * and file descriptor on the active async IO queue.  Active requests cannot
573 		 * be cancelled so we must wait for them to complete.  We will get a special
574 		 * wake up call on our channel used to sleep for ALL active requests to
575 		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
576 		 * when we must wait for all active aio requests.
577 		 */
578 
579 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep) | DBG_FUNC_NONE,
580 		    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
581 
582 		while (aio_proc_has_active_requests_for_file(p, fd)) {
583 			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
584 		}
585 	}
586 
587 	aio_proc_unlock(p);
588 
589 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_END,
590 	    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
591 }
592 
593 
594 /*
595  * aio_error - return the error status associated with the async IO
596  * request referred to by uap->aiocbp.  The error status is the errno
597  * value that would be set by the corresponding IO request (read, write,
598  * fdatasync, or sync).
599  */
600 int
601 aio_error(proc_t p, struct aio_error_args *uap, int *retval)
602 {
603 	aio_workq_entry *entryp;
604 	int              error;
605 
606 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_START,
607 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
608 
609 	/* see if there are any aios to check */
610 	if (!aio_has_any_work()) {
611 		return EINVAL;
612 	}
613 
614 	aio_proc_lock(p);
615 
616 	/* look for a match on our queue of async IO requests that have completed */
617 	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
618 		if (entryp->uaiocbp == uap->aiocbp) {
619 			ASSERT_AIO_FROM_PROC(entryp, p);
620 
621 			*retval = entryp->errorval;
622 			error = 0;
623 
624 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val) | DBG_FUNC_NONE,
625 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
626 			goto ExitRoutine;
627 		}
628 	}
629 
630 	/* look for a match on our queue of active async IO requests */
631 	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
632 		if (entryp->uaiocbp == uap->aiocbp) {
633 			ASSERT_AIO_FROM_PROC(entryp, p);
634 			*retval = EINPROGRESS;
635 			error = 0;
636 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq) | DBG_FUNC_NONE,
637 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
638 			goto ExitRoutine;
639 		}
640 	}
641 
642 	error = EINVAL;
643 
644 ExitRoutine:
645 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_END,
646 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
647 	aio_proc_unlock(p);
648 
649 	return error;
650 }
651 
652 
653 /*
654  * aio_fsync - asynchronously force all IO operations associated
655  * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
656  * queued at the time of the call to the synchronized completion state.
657  * NOTE - we do not support op O_DSYNC at this point since we do not support the
658  * fdatasync() call.
659  */
660 int
661 aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
662 {
663 	aio_entry_flags_t fsync_kind;
664 	int error;
665 
666 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_START,
667 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, uap->op, 0, 0);
668 
669 	*retval = 0;
670 	/* 0 := O_SYNC for binary backward compatibility with Panther */
671 	if (uap->op == O_SYNC || uap->op == 0) {
672 		fsync_kind = AIO_FSYNC;
673 	} else if (uap->op == O_DSYNC) {
674 		fsync_kind = AIO_DSYNC;
675 	} else {
676 		*retval = -1;
677 		error = EINVAL;
678 		goto ExitRoutine;
679 	}
680 
681 	error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
682 	if (error != 0) {
683 		*retval = -1;
684 	}
685 
686 ExitRoutine:
687 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_END,
688 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
689 
690 	return error;
691 }
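
/*
 * Illustrative userland call into the path above (POSIX usage assumed,
 * not taken from this file): queue an asynchronous fsync barrier for
 * everything already submitted on cb.aio_fildes.
 *
 *	struct aiocb cb = { .aio_fildes = fd };
 *
 *	if (aio_fsync(O_SYNC, &cb) != 0) {
 *		// EINVAL for an unsupported op, EAGAIN if it cannot be queued
 *	}
 */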
692 
693 
694 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
695  * file descriptor (uap->aiocbp->aio_fildes) into the buffer
696  * (uap->aiocbp->aio_buf).
697  */
698 int
699 aio_read(proc_t p, struct aio_read_args *uap, int *retval)
700 {
701 	int error;
702 
703 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_START,
704 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
705 
706 	*retval = 0;
707 
708 	error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
709 	if (error != 0) {
710 		*retval = -1;
711 	}
712 
713 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_END,
714 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
715 
716 	return error;
717 }
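
/*
 * A minimal userland submission feeding the path above (illustrative POSIX
 * usage, not part of this file).  The aiocb's address is the handle the
 * kernel matches on later, so it must stay valid until the request is reaped:
 *
 *	#include <aio.h>
 *
 *	static char buf[4096];
 *	static struct aiocb cb;
 *
 *	int
 *	submit_read(int fd)
 *	{
 *		cb.aio_fildes = fd;
 *		cb.aio_buf    = buf;
 *		cb.aio_nbytes = sizeof(buf);
 *		cb.aio_offset = 0;
 *		return aio_read(&cb);   // 0 once queued, -1/errno otherwise
 *	}
 */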
718 
719 
720 /*
721  * aio_return - return the return status associated with the async IO
722  * request referred to by uap->aiocbp.  The return status is the value
723  * that would be returned by corresponding IO request (read, write,
724  * fdatasync, or sync).  This is where we release kernel resources
725  * held for async IO call associated with the given aiocb pointer.
726  */
727 int
728 aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
729 {
730 	aio_workq_entry *entryp;
731 	int              error = EINVAL;
732 
733 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_START,
734 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
735 
736 	/* See if there are any entries to check */
737 	if (!aio_has_any_work()) {
738 		goto ExitRoutine;
739 	}
740 
741 	aio_proc_lock(p);
742 	*retval = 0;
743 
744 	/* look for a match on our queue of async IO requests that have completed */
745 	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
746 		ASSERT_AIO_FROM_PROC(entryp, p);
747 		if (entryp->uaiocbp == uap->aiocbp) {
748 			/* Done and valid for aio_return(), pull it off the list */
749 			aio_proc_remove_done_locked(p, entryp);
750 
751 			*retval = entryp->returnval;
752 			error = 0;
753 			aio_proc_unlock(p);
754 
755 			aio_entry_unref(entryp);
756 
757 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val) | DBG_FUNC_NONE,
758 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
759 			goto ExitRoutine;
760 		}
761 	}
762 
763 	/* look for a match on our queue of active async IO requests */
764 	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
765 		ASSERT_AIO_FROM_PROC(entryp, p);
766 		if (entryp->uaiocbp == uap->aiocbp) {
767 			error = EINPROGRESS;
768 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq) | DBG_FUNC_NONE,
769 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
770 			break;
771 		}
772 	}
773 
774 	aio_proc_unlock(p);
775 
776 ExitRoutine:
777 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_END,
778 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
779 
780 	return error;
781 }
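
/*
 * The aio_error()/aio_return() pairing that the two queue scans above
 * implement, as seen from userland (illustrative POSIX usage):
 *
 *	int err;
 *
 *	while ((err = aio_error(&cb)) == EINPROGRESS) {
 *		// request still on p_aio_activeq
 *	}
 *	ssize_t nread = aio_return(&cb);   // dequeues from p_aio_doneq and
 *	                                   // frees the kernel entry, so it is
 *	                                   // valid exactly once per request
 */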
782 
783 
784 /*
785  * _aio_exec - internal function used to clean up async IO requests for
786  * a process that is going away due to exec().  We cancel any async IOs
787  * we can and wait for those already active.  We also disable signaling
788  * for cancelled or active aio requests that complete.
789  * This routine MAY block!
790  */
791 __private_extern__ void
792 _aio_exec(proc_t p)
793 {
794 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_START,
795 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
796 
797 	_aio_exit(p);
798 
799 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_END,
800 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
801 }
802 
803 
804 /*
805  * _aio_exit - internal function used to clean up async IO requests for
806  * a process that is terminating (via exit() or exec()).  We cancel any async IOs
807  * we can and wait for those already active.  We also disable signaling
808  * for cancelled or active aio requests that complete.  This routine MAY block!
809  */
810 __private_extern__ void
811 _aio_exit(proc_t p)
812 {
813 	TAILQ_HEAD(, aio_workq_entry) tofree = TAILQ_HEAD_INITIALIZER(tofree);
814 	aio_workq_entry *entryp, *tmp;
815 	int              error;
816 
817 	/* quick check to see if there are any async IO requests queued up */
818 	if (!aio_has_any_work()) {
819 		return;
820 	}
821 
822 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_START,
823 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
824 
825 	aio_proc_lock(p);
826 
827 	/*
828 	 * cancel async IO requests on the todo work queue and wait for those
829 	 * already active to complete.
830 	 */
831 	error = do_aio_cancel_locked(p, -1, USER_ADDR_NULL, AIO_EXIT_WAIT);
832 	ASSERT_AIO_PROC_LOCK_OWNED(p);
833 	if (error == AIO_NOTCANCELED) {
834 		/*
835 		 * AIO_NOTCANCELED is returned when we find an aio request for this process
836 		 * on the active async IO queue.  Active requests cannot be cancelled so we
837 		 * must wait for them to complete.  We will get a special wake up call on
838 		 * our channel used to sleep for ALL active requests to complete.  This sleep
839 		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
840 		 * active aio requests.
841 		 */
842 
843 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep) | DBG_FUNC_NONE,
844 		    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
845 
846 		while (aio_has_active_requests_for_process(p)) {
847 			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
848 		}
849 	}
850 
851 	assert(!aio_has_active_requests_for_process(p));
852 
853 	/* release all aio resources used by this process */
854 	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_doneq, aio_proc_link, tmp) {
855 		ASSERT_AIO_FROM_PROC(entryp, p);
856 
857 		aio_proc_remove_done_locked(p, entryp);
858 		TAILQ_INSERT_TAIL(&tofree, entryp, aio_proc_link);
859 	}
860 
861 	aio_proc_unlock(p);
862 
863 	/* free all the entries outside of the aio_proc_lock() */
864 	TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) {
865 		entryp->aio_proc_link.tqe_prev = NULL;
866 		aio_entry_unref(entryp);
867 	}
868 
869 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_END,
870 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
871 }
872 
873 
874 static bool
875 should_cancel(aio_workq_entry *entryp, int fd, user_addr_t aiocbp,
876     aio_entry_flags_t reason)
877 {
878 	if (reason & AIO_EXIT_WAIT) {
879 		/* caller is _aio_exit() */
880 		return true;
881 	}
882 	if (fd != entryp->aiocb.aio_fildes) {
883 		/* not the file we're looking for */
884 		return false;
885 	}
886 	/*
887 	 * aio_cancel() or _aio_close() cancel
888 	 * everything for a given fd when aiocbp is NULL
889 	 */
890 	return aiocbp == USER_ADDR_NULL || entryp->uaiocbp == aiocbp;
891 }
892 
893 /*
894  * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
895  * aio_cancel, close, and at exit.
896  * There are three modes of operation: 1) cancel all async IOs for a process -
897  * fd is -1 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
898  * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
899  * aiocbp.
900  * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
901  * target async IO requests, AIO_NOTCANCELED if we could not cancel all
902  * target async IO requests, and AIO_ALLDONE if all target async IO requests
903  * were already complete.
904  * WARNING - do not dereference aiocbp in this routine; it may point to user
905  * land data that has not been copied in (when called from aio_cancel())
906  *
907  * Called with proc locked, and returns the same way.
908  */
909 static int
910 do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
911     aio_entry_flags_t reason)
912 {
913 	bool multiple_matches = (aiocbp == USER_ADDR_NULL);
914 	aio_workq_entry *entryp, *tmp;
915 	int result;
916 
917 	ASSERT_AIO_PROC_LOCK_OWNED(p);
918 
919 	/* look for a match on our queue of async todo work. */
920 again:
921 	result = -1;
922 	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_activeq, aio_proc_link, tmp) {
923 		ASSERT_AIO_FROM_PROC(entryp, p);
924 
925 		if (!should_cancel(entryp, fd, aiocbp, reason)) {
926 			continue;
927 		}
928 
929 		if (reason) {
930 			/* mark the entry as blocking close or exit/exec */
931 			entryp->flags |= reason;
932 			if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
933 				panic("Close and exit flags set at the same time");
934 			}
935 		}
936 
937 		/* Can only be cancelled if it's still on a work queue */
938 		if (aio_entry_try_workq_remove(entryp)) {
939 			entryp->errorval = ECANCELED;
940 			entryp->returnval = -1;
941 
942 			/* Now it's officially cancelled.  Do the completion */
943 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE,
944 			    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
945 			    fd, 0, 0);
946 			do_aio_completion_and_unlock(p, entryp);
947 
948 			aio_proc_lock(p);
949 
950 			if (multiple_matches) {
951 				/*
952 				 * Restart from the head of the proc active queue since it
953 				 * may have been changed while we were away doing completion
954 				 * processing.
955 				 *
956 				 * Note that if we found an uncancellable AIO before, we will
957 				 * either find it again or discover that it's been completed,
958 				 * so resetting the result will not cause us to return success
959 				 * despite outstanding AIOs.
960 				 */
961 				goto again;
962 			}
963 
964 			return AIO_CANCELED;
965 		}
966 
967 		/*
968 		 * It's been taken off the active queue already, i.e. is in flight.
969 		 * All we can do is ask for notification.
970 		 */
971 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq) | DBG_FUNC_NONE,
972 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
973 		    fd, 0, 0);
974 
975 		result = AIO_NOTCANCELED;
976 		if (!multiple_matches) {
977 			return result;
978 		}
979 	}
980 
981 	/*
982 	 * if we didn't find any matches on the todo or active queues then look for a
983 	 * match on our queue of async IO requests that have completed and if found
984 	 * return AIO_ALLDONE result.
985 	 *
986 	 * Proc AIO lock is still held.
987 	 */
988 	if (result == -1) {
989 		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
990 			ASSERT_AIO_FROM_PROC(entryp, p);
991 			if (should_cancel(entryp, fd, aiocbp, reason)) {
992 				KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq) | DBG_FUNC_NONE,
993 				    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
994 				    fd, 0, 0);
995 
996 				result = AIO_ALLDONE;
997 				if (!multiple_matches) {
998 					return result;
999 				}
1000 			}
1001 		}
1002 	}
1003 
1004 	return result;
1005 }
1006 
1007 
1008 /*
1009  * aio_suspend - suspend the calling thread until at least one of the async
1010  * IO operations referenced by uap->aiocblist has completed, until a signal
1011  * interrupts the function, or uap->timeoutp time interval (optional) has
1012  * passed.
1013  * Returns 0 if one or more async IOs have completed else -1 and errno is
1014  * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1015  * woke us up.
1016  */
1017 int
1018 aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
1019 {
1020 	__pthread_testcancel(1);
1021 	return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
1022 }
1023 
1024 
1025 int
1026 aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
1027 {
1028 	int                     error;
1029 	int                     i;
1030 	uint64_t                abstime;
1031 	struct user_timespec    ts;
1032 	aio_workq_entry        *entryp;
1033 	user_addr_t            *aiocbpp;
1034 	size_t                  aiocbpp_size;
1035 
1036 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_START,
1037 	    VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1038 
1039 	*retval = -1;
1040 	abstime = 0;
1041 	aiocbpp = NULL;
1042 
1043 	if (!aio_has_any_work()) {
1044 		error = EINVAL;
1045 		goto ExitThisRoutine;
1046 	}
1047 
1048 	if (uap->nent < 1 || uap->nent > aio_max_requests_per_process ||
1049 	    os_mul_overflow(sizeof(user_addr_t), uap->nent, &aiocbpp_size)) {
1050 		error = EINVAL;
1051 		goto ExitThisRoutine;
1052 	}
1053 
1054 	if (uap->timeoutp != USER_ADDR_NULL) {
1055 		if (proc_is64bit(p)) {
1056 			struct user64_timespec temp;
1057 			error = copyin(uap->timeoutp, &temp, sizeof(temp));
1058 			if (error == 0) {
1059 				ts.tv_sec = (user_time_t)temp.tv_sec;
1060 				ts.tv_nsec = (user_long_t)temp.tv_nsec;
1061 			}
1062 		} else {
1063 			struct user32_timespec temp;
1064 			error = copyin(uap->timeoutp, &temp, sizeof(temp));
1065 			if (error == 0) {
1066 				ts.tv_sec = temp.tv_sec;
1067 				ts.tv_nsec = temp.tv_nsec;
1068 			}
1069 		}
1070 		if (error != 0) {
1071 			error = EAGAIN;
1072 			goto ExitThisRoutine;
1073 		}
1074 
1075 		if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
1076 			error = EINVAL;
1077 			goto ExitThisRoutine;
1078 		}
1079 
1080 		nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1081 		    &abstime);
1082 		clock_absolutetime_interval_to_deadline(abstime, &abstime);
1083 	}
1084 
1085 	aiocbpp = (user_addr_t *)kalloc_data(aiocbpp_size, Z_WAITOK);
1086 	if (aiocbpp == NULL || aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1087 		error = EAGAIN;
1088 		goto ExitThisRoutine;
1089 	}
1090 
1091 	/* check list of aio requests to see if any have completed */
1092 check_for_our_aiocbp:
1093 	aio_proc_lock_spin(p);
1094 	for (i = 0; i < uap->nent; i++) {
1095 		user_addr_t     aiocbp;
1096 
1097 		/* NULL elements are legal so check for 'em */
1098 		aiocbp = *(aiocbpp + i);
1099 		if (aiocbp == USER_ADDR_NULL) {
1100 			continue;
1101 		}
1102 
1103 		/* return immediately if any aio request in the list is done */
1104 		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1105 			ASSERT_AIO_FROM_PROC(entryp, p);
1106 			if (entryp->uaiocbp == aiocbp) {
1107 				aio_proc_unlock(p);
1108 				*retval = 0;
1109 				error = 0;
1110 				goto ExitThisRoutine;
1111 			}
1112 		}
1113 	}
1114 
1115 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep) | DBG_FUNC_NONE,
1116 	    VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1117 
1118 	/*
1119 	 * wait for an async IO to complete or a signal fires or timeout expires.
1120 	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1121 	 * interrupts us.  If an async IO completes before a signal fires or our
1122 	 * timeout expires, we get a wakeup call from aio_work_thread().
1123 	 */
1124 
1125 	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
1126 	    PCATCH | PWAIT | PDROP, "aio_suspend", abstime);
1127 	if (error == 0) {
1128 		/*
1129 		 * got our wakeup call from aio_work_thread().
1130 		 * Since we can get a wakeup on this channel from another thread in the
1131 		 * same process we head back up to make sure this is for the correct aiocbp.
1132 		 * If it is the correct aiocbp we will return from where we do the check
1133 		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1134 		 * else we will fall out and just sleep again.
1135 		 */
1136 		goto check_for_our_aiocbp;
1137 	} else if (error == EWOULDBLOCK) {
1138 		/* our timeout expired */
1139 		error = EAGAIN;
1140 	} else {
1141 		/* we were interrupted */
1142 		error = EINTR;
1143 	}
1144 
1145 ExitThisRoutine:
1146 	if (aiocbpp != NULL) {
1147 		kfree_data(aiocbpp, aiocbpp_size);
1148 	}
1149 
1150 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_END,
1151 	    VM_KERNEL_ADDRPERM(p), uap->nent, error, 0, 0);
1152 
1153 	return error;
1154 }
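
/*
 * Userland shape of the wait implemented above (illustrative POSIX usage,
 * cb0/cb1 being previously submitted aiocbs): block up to one second for
 * any listed request to complete.  EAGAIN maps to the msleep1() timeout
 * and EINTR to a caught signal.
 *
 *	const struct aiocb *list[2] = { &cb0, &cb1 };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	if (aio_suspend(list, 2, &ts) == -1) {
 *		// errno is EAGAIN (timeout) or EINTR (signal)
 *	}
 */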
1155 
1156 
1157 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1158  * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1159  * (uap->aiocbp->aio_buf).
1160  */
1161 
1162 int
1163 aio_write(proc_t p, struct aio_write_args *uap, int *retval __unused)
1164 {
1165 	int error;
1166 
1167 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_START,
1168 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
1169 
1170 	error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);
1171 
1172 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_END,
1173 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
1174 
1175 	return error;
1176 }
1177 
1178 
1179 static int
1180 aio_copy_in_list(proc_t procp, user_addr_t aiocblist, user_addr_t *aiocbpp,
1181     int nent)
1182 {
1183 	int result;
1184 
1185 	/* copyin our aiocb pointers from list */
1186 	result = copyin(aiocblist, aiocbpp,
1187 	    proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1188 	    : (nent * sizeof(user32_addr_t)));
1189 	if (result) {
1190 		return result;
1191 	}
1192 
1193 	/*
1194 	 * We depend on a list of user_addr_t's so we need to
1195 	 * munge and expand when these pointers came from a
1196 	 * 32-bit process
1197 	 */
1198 	if (!proc_is64bit(procp)) {
1199 		/* copy from last to first to deal with overlap */
1200 		user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1201 		user_addr_t *my_addrp = aiocbpp + (nent - 1);
1202 
1203 		for (int i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1204 			*my_addrp = (user_addr_t) (*my_ptrp);
1205 		}
1206 	}
1207 
1208 	return 0;
1209 }
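
/*
 * Worked example of the in-place expansion above, for a 32-bit process
 * with nent == 2.  copyin() packs two 4-byte pointers a, b into the first
 * 8 bytes of the user_addr_t array; expanding from the last slot first
 * means the still-packed values are never overwritten before being read:
 *
 *	after copyin:   bytes [0..3]=a [4..7]=b  [8..15]=garbage
 *	expand slot 1:  bytes [0..3]=a [4..7]=b  [8..15]=(user_addr_t)b
 *	expand slot 0:  bytes [0..7]=(user_addr_t)a [8..15]=(user_addr_t)b
 */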
1210 
1211 
1212 static int
1213 aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1214 {
1215 	int     result = 0;
1216 
1217 	if (sigp == USER_ADDR_NULL) {
1218 		goto out;
1219 	}
1220 
1221 	/*
1222 	 * We need to munge aio_sigevent since it contains pointers.
1223 	 * Since we do not know if sigev_value is an int or a ptr we do
1224 	 * NOT cast the ptr to a user_addr_t.   This means if we send
1225 	 * this info back to user space we need to remember sigev_value
1226 	 * was not expanded for the 32-bit case.
1227 	 *
1228 	 * Notes:	 This does NOT affect us since we don't support
1229 	 *		sigev_value yet in the aio context.
1230 	 */
1231 	if (proc_is64bit(procp)) {
1232 #if __LP64__
1233 		struct user64_sigevent sigevent64;
1234 
1235 		result = copyin(sigp, &sigevent64, sizeof(sigevent64));
1236 		if (result == 0) {
1237 			sigev->sigev_notify = sigevent64.sigev_notify;
1238 			sigev->sigev_signo = sigevent64.sigev_signo;
1239 			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1240 			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1241 			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1242 		}
1243 #else
1244 		panic("64bit process on 32bit kernel is not supported");
1245 #endif
1246 	} else {
1247 		struct user32_sigevent sigevent32;
1248 
1249 		result = copyin(sigp, &sigevent32, sizeof(sigevent32));
1250 		if (result == 0) {
1251 			sigev->sigev_notify = sigevent32.sigev_notify;
1252 			sigev->sigev_signo = sigevent32.sigev_signo;
1253 			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1254 			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1255 			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1256 		}
1257 	}
1258 
1259 	if (result != 0) {
1260 		result = EAGAIN;
1261 	}
1262 
1263 out:
1264 	return result;
1265 }
1266 
1267 /*
1268  * validate user_sigevent.  at this point we only support
1269  * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
1270  * sigev_value, sigev_notify_function, and sigev_notify_attributes
1271  * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
1272  * with no [RTS] (Realtime Signal) option group support.
1273  */
1274 static int
1275 aio_sigev_validate(const struct user_sigevent *sigev)
1276 {
1277 	switch (sigev->sigev_notify) {
1278 	case SIGEV_SIGNAL:
1279 	{
1280 		int signum;
1281 
1282 		/* make sure we have a valid signal number */
1283 		signum = sigev->sigev_signo;
1284 		if (signum <= 0 || signum >= NSIG ||
1285 		    signum == SIGKILL || signum == SIGSTOP) {
1286 			return EINVAL;
1287 		}
1288 	}
1289 	break;
1290 
1291 	case SIGEV_NONE:
1292 		break;
1293 
1294 	case SIGEV_THREAD:
1295 	/* Unsupported [RTS] */
1296 
1297 	default:
1298 		return EINVAL;
1299 	}
1300 
1301 	return 0;
1302 }
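
/*
 * The only notification setups that pass the validation above, shown as
 * the corresponding userland initialization (illustrative POSIX usage):
 *
 *	struct aiocb cb = { .aio_fildes = fd };
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;  // or SIGEV_NONE
 *	cb.aio_sigevent.sigev_signo  = SIGUSR1;       // not SIGKILL/SIGSTOP
 *
 *	// SIGEV_THREAD (and any sigev_value delivery) is rejected: EINVAL
 */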
1303 
1304 
1305 /*
1306  * aio_try_enqueue_work_locked
1307  *
1308  * Queue up the entry on the aio asynchronous work queue in priority order
1309  * based on the relative priority of the request.  We calculate the relative
1310  * priority using the nice value of the caller and the value
1311  *
1312  * Parameters:	procp			Process queueing the I/O
1313  *		entryp			The work queue entry being queued
1314  *		leader			The work leader if any
1315  *
1316  * Returns:	Whether the enqueue was successful
1317  *
1318  * Notes:	This function is used for both lio_listio and aio
1319  *
1320  * XXX:		At some point, we may have to consider thread priority
1321  *		rather than process priority, but we don't maintain the
1322  *		adjusted priority for threads the POSIX way.
1323  *
1324  * Called with proc locked.
1325  */
1326 static bool
1327 aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp,
1328     aio_workq_entry *leader)
1329 {
1330 	aio_workq_t queue = aio_entry_workq(entryp);
1331 
1332 	ASSERT_AIO_PROC_LOCK_OWNED(procp);
1333 
1334 	/* Onto proc queue */
1335 	if (!aio_try_proc_insert_active_locked(procp, entryp)) {
1336 		return false;
1337 	}
1338 
1339 	if (leader) {
1340 		aio_entry_ref(leader); /* consumed in do_aio_completion_and_unlock */
1341 		leader->lio_pending++;
1342 		entryp->lio_leader = leader;
1343 	}
1344 
1345 	/* And work queue */
1346 	aio_entry_ref(entryp); /* consumed in do_aio_completion_and_unlock */
1347 	aio_workq_lock_spin(queue);
1348 	aio_workq_add_entry_locked(queue, entryp);
1349 	waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1350 	    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
1351 	aio_workq_unlock(queue);
1352 
1353 	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_START,
1354 	    VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1355 	    entryp->flags, entryp->aiocb.aio_fildes, 0);
1356 	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_END,
1357 	    entryp->aiocb.aio_offset, 0, entryp->aiocb.aio_nbytes, 0, 0);
1358 	return true;
1359 }
1360 
1361 
1362 /*
1363  * lio_listio - initiate a list of IO requests.  We process the list of
1364  * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1365  * (mode == LIO_NOWAIT).
1366  *
1367  * The caller gets error and return status for each aiocb in the list
1368  * via aio_error and aio_return.  We must keep completed requests until
1369  * released by the aio_return call.
1370  */
1371 int
1372 lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused)
1373 {
1374 	aio_workq_entry         *entries[AIO_LISTIO_MAX] = { };
1375 	user_addr_t              aiocbpp[AIO_LISTIO_MAX];
1376 	struct user_sigevent     aiosigev = { };
1377 	int                      result = 0;
1378 	int                      lio_count = 0;
1379 
1380 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_START,
1381 	    VM_KERNEL_ADDRPERM(p), uap->nent, uap->mode, 0, 0);
1382 
1383 	if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
1384 		result = EINVAL;
1385 		goto ExitRoutine;
1386 	}
1387 
1388 	if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
1389 		result = EINVAL;
1390 		goto ExitRoutine;
1391 	}
1392 
1393 	/*
1394 	 * Use sigevent passed in to lio_listio for each of our calls, but
1395 	 * only do completion notification after the last request completes.
1396 	 */
1397 	if (uap->sigp != USER_ADDR_NULL) {
1398 		result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1399 		if (result) {
1400 			goto ExitRoutine;
1401 		}
1402 		result = aio_sigev_validate(&aiosigev);
1403 		if (result) {
1404 			goto ExitRoutine;
1405 		}
1406 	}
1407 
1408 	if (aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1409 		result = EAGAIN;
1410 		goto ExitRoutine;
1411 	}
1412 
1413 	/*
1414 	 * allocate/parse all entries
1415 	 */
1416 	for (int i = 0; i < uap->nent; i++) {
1417 		aio_workq_entry *entryp;
1418 
1419 		/* NULL elements are legal so check for 'em */
1420 		if (aiocbpp[i] == USER_ADDR_NULL) {
1421 			continue;
1422 		}
1423 
1424 		entryp = aio_create_queue_entry(p, aiocbpp[i], AIO_LIO);
1425 		if (entryp == NULL) {
1426 			result = EAGAIN;
1427 			goto ExitRoutine;
1428 		}
1429 
1430 		/*
1431 		 * This refcount is cleaned up on exit if the entry
1432 		 * isn't submitted
1433 		 */
1434 		entries[lio_count++] = entryp;
1435 		if (uap->mode == LIO_NOWAIT) {
1436 			/* Set signal handler, if any */
1437 			entryp->aiocb.aio_sigevent = aiosigev;
1438 		}
1439 	}
1440 
1441 	if (lio_count == 0) {
1442 		/* There's nothing to submit */
1443 		goto ExitRoutine;
1444 	}
1445 
1446 	/*
1447 	 * Past this point we're committed and will not bail out
1448 	 *
1449 	 * - keep a reference on the leader for LIO_WAIT
1450 	 * - perform the submissions and optionally wait
1451 	 */
1452 
1453 	aio_workq_entry *leader = entries[0];
1454 	if (uap->mode == LIO_WAIT) {
1455 		aio_entry_ref(leader); /* consumed below */
1456 	}
1457 
1458 	aio_proc_lock_spin(p);
1459 
1460 	for (int i = 0; i < lio_count; i++) {
1461 		if (aio_try_enqueue_work_locked(p, entries[i], leader)) {
1462 			entries[i] = NULL; /* the entry was submitted */
1463 		} else {
1464 			result = EAGAIN;
1465 		}
1466 	}
1467 
1468 	if (uap->mode == LIO_WAIT && result == 0) {
1469 		leader->flags |= AIO_LIO_WAIT;
1470 
1471 		while (leader->lio_pending) {
1472 			/* If we were interrupted, fail out (even if all finished) */
1473 			if (msleep(leader, aio_proc_mutex(p),
1474 			    PCATCH | PRIBIO | PSPIN, "lio_listio", 0) != 0) {
1475 				result = EINTR;
1476 				break;
1477 			}
1478 		}
1479 
1480 		leader->flags &= ~AIO_LIO_WAIT;
1481 	}
1482 
1483 	aio_proc_unlock(p);
1484 
1485 	if (uap->mode == LIO_WAIT) {
1486 		aio_entry_unref(leader);
1487 	}
1488 
1489 ExitRoutine:
1490 	/* Consume unsubmitted entries */
1491 	for (int i = 0; i < lio_count; i++) {
1492 		if (entries[i]) {
1493 			aio_entry_unref(entries[i]);
1494 		}
1495 	}
1496 
1497 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_END,
1498 	    VM_KERNEL_ADDRPERM(p), result, 0, 0, 0);
1499 
1500 	return result;
1501 }
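
/*
 * Userland shape of a synchronous batch through the code above
 * (illustrative POSIX usage): LIO_WAIT blocks until the whole batch has
 * completed; per-request status still comes from aio_error()/aio_return().
 *
 *	struct aiocb rd = {
 *		.aio_fildes     = fd,
 *		.aio_buf        = buf,
 *		.aio_nbytes     = sizeof(buf),
 *		.aio_lio_opcode = LIO_READ,
 *	};
 *	struct aiocb *list[1] = { &rd };
 *
 *	if (lio_listio(LIO_WAIT, list, 1, NULL) != 0) {
 *		// EAGAIN: a request could not be queued; EINTR: wait interrupted
 *	}
 */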
1502 
1503 
1504 /*
1505  * aio worker thread.  this is where all the real work gets done.
1506  * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1507  * after new work is queued up.
1508  */
1509 __attribute__((noreturn))
1510 static void
1511 aio_work_thread(void *arg __unused, wait_result_t wr __unused)
1512 {
1513 	aio_workq_entry *entryp;
1514 	int              error;
1515 	vm_map_t         currentmap;
1516 	vm_map_t         oldmap = VM_MAP_NULL;
1517 	task_t           oldaiotask = TASK_NULL;
1518 	struct uthread  *uthreadp = NULL;
1519 	proc_t           p = NULL;
1520 
1521 	for (;;) {
1522 		/*
1523 		 * returns with the entry ref'ed.
1524 		 * sleeps until work is available.
1525 		 */
1526 		entryp = aio_get_some_work();
1527 		p = entryp->procp;
1528 
1529 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_START,
1530 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1531 		    entryp->flags, 0, 0);
1532 
1533 		/*
1534 		 * Assume the target's address space identity for the duration
1535 		 * of the IO.  Note: don't need to have the entryp locked,
1536 		 * because the proc and map don't change until it's freed.
1537 		 */
1538 		currentmap = get_task_map(proc_task(current_proc()));
1539 		if (currentmap != entryp->aio_map) {
1540 			uthreadp = (struct uthread *) current_uthread();
1541 			oldaiotask = uthreadp->uu_aio_task;
1542 			/*
1543 			 * workq entries at this stage cause _aio_exec() and _aio_exit() to
1544 			 * block until we hit `do_aio_completion_and_unlock()` below,
1545 			 * which means that it is safe to dereference p->task without
1546 			 * holding a lock or taking references.
1547 			 */
1548 			uthreadp->uu_aio_task = proc_task(p);
1549 			oldmap = vm_map_switch(entryp->aio_map);
1550 		}
1551 
1552 		if ((entryp->flags & AIO_READ) != 0) {
1553 			error = do_aio_read(entryp);
1554 		} else if ((entryp->flags & AIO_WRITE) != 0) {
1555 			uthreadp = (struct uthread *)current_uthread();
1556 			uthread_t context_uthreadp = get_bsdthread_info(vfs_context_thread(&entryp->context));
1557 
1558 			if ((context_uthreadp && (context_uthreadp->uu_flag & UT_FS_BLKSIZE_NOCACHE_WRITES)) ||
1559 			    os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE) {
1560 				uthreadp->uu_flag |= UT_FS_BLKSIZE_NOCACHE_WRITES;
1561 			}
1562 
1563 			error = do_aio_write(entryp);
1564 
1565 			uthreadp->uu_flag &= ~UT_FS_BLKSIZE_NOCACHE_WRITES;
1566 		} else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
1567 			error = do_aio_fsync(entryp);
1568 		} else {
1569 			error = EINVAL;
1570 		}
1571 
1572 		/* Restore old map */
1573 		if (currentmap != entryp->aio_map) {
1574 			vm_map_switch(oldmap);
1575 			uthreadp->uu_aio_task = oldaiotask;
1576 		}
1577 
1578 		/* liberate unused map */
1579 		vm_map_deallocate(entryp->aio_map);
1580 		entryp->aio_map = VM_MAP_NULL;
1581 
1582 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END,
1583 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1584 		    entryp->errorval, entryp->returnval, 0);
1585 
1586 		/* we're done with the IO request so pop it off the active queue and */
1587 		/* push it on the done queue */
1588 		aio_proc_lock(p);
1589 		entryp->errorval = error;
1590 		do_aio_completion_and_unlock(p, entryp);
1591 	}
1592 }
1593 
1594 
1595 /*
1596  * aio_get_some_work - get the next async IO request that is ready to be executed.
1597  * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1598  * IO requests at the time the aio_fsync call came in have completed.
1599  * NOTE - the work queue lock is taken and dropped internally; no lock is held on entry.
1600  */
1601 static aio_workq_entry *
1602 aio_get_some_work(void)
1603 {
1604 	aio_workq_entry *entryp = NULL;
1605 	aio_workq_t      queue = NULL;
1606 
1607 	/* Just one queue for the moment.  In the future there will be many. */
1608 	queue = &aio_anchor.aio_async_workqs[0];
1609 	aio_workq_lock_spin(queue);
1610 
1611 	/*
1612 	 * Pop some work off the work queue and return it to the caller.
1613 	 *
1614 	 * Loop invariant: the queue lock is always held at the top of each
1615 	 * iteration.
1616 	 */
1617 	while ((entryp = TAILQ_FIRST(&queue->aioq_entries))) {
1618 		/*
1619 		 * Pull it off the work queue.  Once it's off, it can't be cancelled,
1620 		 * so we can take our ref once we drop the queue lock.
1621 		 */
1622 
1623 		aio_workq_remove_entry_locked(queue, entryp);
1624 
1625 		aio_workq_unlock(queue);
1626 
1627 		/*
1628 		 * Check if it's an fsync that must be delayed.  No need to lock the entry;
1629 		 * that flag would have been set at initialization.
1630 		 */
1631 		if ((entryp->flags & AIO_FSYNC) != 0) {
1632 			/*
1633 			 * Check for unfinished operations on the same file
1634 			 * in this proc's queue.
1635 			 */
1636 			aio_proc_lock_spin(entryp->procp);
1637 			if (aio_delay_fsync_request(entryp)) {
1638 				/* It needs to be delayed.  Put it back on the end of the work queue */
1639 				KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE,
1640 				    VM_KERNEL_ADDRPERM(entryp->procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1641 				    0, 0, 0);
1642 
1643 				aio_proc_unlock(entryp->procp);
1644 
1645 				aio_workq_lock_spin(queue);
1646 				aio_workq_add_entry_locked(queue, entryp);
1647 				continue;
1648 			}
1649 			aio_proc_unlock(entryp->procp);
1650 		}
1651 
1652 		return entryp;
1653 	}
1654 
1655 	/* We will wake up when someone enqueues something */
1656 	waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
1657 	aio_workq_unlock(queue);
1658 	thread_block(aio_work_thread);
1659 
1660 	__builtin_unreachable();
1661 }
1662 
1663 /*
1664  * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1665  * A big, simple hammer: only send it off if it's the most recently filed IO which has
1666  * not been completed.
1667  */
1668 static boolean_t
1669 aio_delay_fsync_request(aio_workq_entry *entryp)
1670 {
1671 	if (proc_in_teardown(entryp->procp)) {
1672 		/*
1673 		 * we can't delay fsyncs when in teardown as that would confuse _aio_exit;
1674 		 * once the entry has been dequeued, we must commit to it
1675 		 */
1676 		return FALSE;
1677 	}
1678 
1679 	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1680 		return FALSE;
1681 	}
1682 
1683 	return TRUE;
1684 }
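
/*
 * Illustrative example (not part of the original source): suppose a process
 * queues read R1 and write W1, then an aio_fsync F.  Each time a worker
 * dequeues F while R1 or W1 is still active, aio_delay_fsync_request()
 * returns TRUE and F goes back on the end of the work queue; F only issues
 * its fsync once every request outstanding at the time it was filed has
 * completed (or the process is in teardown).
 */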
1685 
1686 static aio_workq_entry *
1687 aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags)
1688 {
1689 	aio_workq_entry *entryp;
1690 
1691 	entryp = zalloc_flags(aio_workq_zonep, Z_WAITOK | Z_ZERO);
1692 	entryp->procp = procp;
1693 	entryp->uaiocbp = aiocbp;
1694 	entryp->flags = flags;
1695 	/* consumed in aio_return or _aio_exit */
1696 	os_ref_init(&entryp->aio_refcount, &aio_refgrp);
1697 
1698 	if (proc_is64bit(procp)) {
1699 		struct user64_aiocb aiocb64;
1700 
1701 		if (copyin(aiocbp, &aiocb64, sizeof(aiocb64)) != 0) {
1702 			goto error_exit;
1703 		}
1704 		do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1705 	} else {
1706 		struct user32_aiocb aiocb32;
1707 
1708 		if (copyin(aiocbp, &aiocb32, sizeof(aiocb32)) != 0) {
1709 			goto error_exit;
1710 		}
1711 		do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
1712 	}
1713 
1714 	/* do some more validation on the aiocb and embedded file descriptor */
1715 	if (aio_validate(procp, entryp) != 0) {
1716 		goto error_exit;
1717 	}
1718 
1719 	/* get a reference to the user land map in order to keep it around */
1720 	entryp->aio_map = get_task_map(proc_task(procp));
1721 	vm_map_reference(entryp->aio_map);
1722 
1723 	/* get a reference on the current_thread, which is passed in vfs_context. */
1724 	entryp->context = *vfs_context_current();
1725 	thread_reference(entryp->context.vc_thread);
1726 	kauth_cred_ref(entryp->context.vc_ucred);
1727 	return entryp;
1728 
1729 error_exit:
1730 	zfree(aio_workq_zonep, entryp);
1731 	return NULL;
1732 }
1733 
1734 
1735 /*
1736  * aio_queue_async_request - queue up an async IO request on our work queue then
1737  * wake up one of our worker threads to do the actual work.  We get a reference
1738  * to our caller's user land map in order to keep it around while we are
1739  * processing the request.
1740  */
1741 static int
1742 aio_queue_async_request(proc_t procp, user_addr_t aiocbp,
1743     aio_entry_flags_t flags)
1744 {
1745 	aio_workq_entry *entryp;
1746 	int              result;
1747 
1748 	entryp = aio_create_queue_entry(procp, aiocbp, flags);
1749 	if (entryp == NULL) {
1750 		result = EAGAIN;
1751 		goto error_noalloc;
1752 	}
1753 
1754 	aio_proc_lock_spin(procp);
1755 	if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) {
1756 		result = EAGAIN;
1757 		goto error_exit;
1758 	}
1759 	aio_proc_unlock(procp);
1760 	return 0;
1761 
1762 error_exit:
1763 	/*
1764 	 * This entry was never queued, so there are no concerns about
1765 	 * unlocked state or the aio_map.
1766 	 */
1767 	aio_proc_unlock(procp);
1768 	aio_free_request(entryp);
1769 error_noalloc:
1770 	return result;
1771 }
1772 
1773 
1774 /*
1775  * aio_free_request - remove our reference on the user land map and
1776  * free the work queue entry resources.  The entry is off all lists
1777  * and has zero refcount, so no one can have a pointer to it.
1778  */
1779 static void
1780 aio_free_request(aio_workq_entry *entryp)
1781 {
1782 	if (entryp->aio_proc_link.tqe_prev || entryp->aio_workq_link.tqe_prev) {
1783 		panic("aio_workq_entry %p being freed while still enqueued", entryp);
1784 	}
1785 
1786 	/* remove our reference to the user land map. */
1787 	if (VM_MAP_NULL != entryp->aio_map) {
1788 		vm_map_deallocate(entryp->aio_map);
1789 	}
1790 
1791 	/* remove our reference to thread which enqueued the request */
1792 	if (entryp->context.vc_thread) {
1793 		thread_deallocate(entryp->context.vc_thread);
1794 	}
1795 	kauth_cred_unref(&entryp->context.vc_ucred);
1796 
1797 	zfree(aio_workq_zonep, entryp);
1798 }
1799 
1800 
1801 /*
1802  * aio_validate
1803  *
1804  * validate the aiocb passed in by one of the aio syscalls.
1805  */
1806 static int
1807 aio_validate(proc_t p, aio_workq_entry *entryp)
1808 {
1809 	struct fileproc *fp;
1810 	int              flag;
1811 	int              result;
1812 
1813 	result = 0;
1814 
1815 	if ((entryp->flags & AIO_LIO) != 0) {
1816 		if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
1817 			entryp->flags |= AIO_READ;
1818 		} else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
1819 			entryp->flags |= AIO_WRITE;
1820 		} else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
1821 			return 0;
1822 		} else {
1823 			return EINVAL;
1824 		}
1825 	}
1826 
1827 	flag = FREAD;
1828 	if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
1829 		flag = FWRITE;
1830 	}
1831 
1832 	if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
1833 		if (entryp->aiocb.aio_nbytes > INT_MAX ||
1834 		    entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1835 		    entryp->aiocb.aio_offset < 0) {
1836 			return EINVAL;
1837 		}
1838 	}
1839 
1840 	result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
1841 	if (result) {
1842 		return result;
1843 	}
1844 
1845 	/* validate the file descriptor and that the file was opened
1846 	 * for the appropriate read / write access.
1847 	 */
1848 	proc_fdlock(p);
1849 
1850 	fp = fp_get_noref_locked(p, entryp->aiocb.aio_fildes);
1851 	if (fp == NULL) {
1852 		result = EBADF;
1853 	} else if ((fp->fp_glob->fg_flag & flag) == 0) {
1854 		/* we don't have read or write access */
1855 		result = EBADF;
1856 	} else if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_VNODE) {
1857 		/* this is not a file */
1858 		result = ESPIPE;
1859 	} else {
1860 		fp->fp_flags |= FP_AIOISSUED;
1861 	}
1862 
1863 	proc_fdunlock(p);
1864 
1865 	return result;
1866 }
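
/*
 * Illustrative sketch (not part of the original source): hypothetical
 * user-space values that the validation above rejects:
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_offset = -1;                        // EINVAL: negative offset
 *	cb.aio_nbytes = (size_t)INT_MAX + 1;       // EINVAL: request too large
 *	cb.aio_fildes = pipefd[0];                 // ESPIPE: not a vnode
 *	// aio_write() on an O_RDONLY descriptor   // EBADF: no FWRITE access
 */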
1867 
1868 /*
1869  * do_aio_completion_and_unlock.  Handle async IO completion.
1870  */
1871 static void
1872 do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp)
1873 {
1874 	aio_workq_entry *leader = entryp->lio_leader;
1875 	int              lio_pending = 0;
1876 	bool             do_signal = false;
1877 
1878 	ASSERT_AIO_PROC_LOCK_OWNED(p);
1879 
1880 	aio_proc_move_done_locked(p, entryp);
1881 
1882 	if (leader) {
1883 		lio_pending = --leader->lio_pending;
1884 		if (lio_pending < 0) {
1885 			panic("lio_pending accounting mistake");
1886 		}
1887 		if (lio_pending == 0 && (leader->flags & AIO_LIO_WAIT)) {
1888 			wakeup(leader);
1889 		}
1890 		entryp->lio_leader = NULL; /* no dangling pointers please */
1891 	}
1892 
1893 	/*
1894 	 * need to handle case where a process is trying to exit, exec, or
1895 	 * close and is currently waiting for active aio requests to complete.
1896 	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
1897 	 * other requests in the active queue for this process.  If there are
1898 	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1899 	 * If there are some still active then do nothing - we only want to
1900 	 * wakeup when all active aio requests for the process are complete.
1901 	 */
1902 	if (__improbable(entryp->flags & AIO_EXIT_WAIT)) {
1903 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1904 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1905 		    0, 0, 0);
1906 
1907 		if (!aio_has_active_requests_for_process(p)) {
1908 			/*
1909 			 * no active aio requests for this process, continue exiting.  In this
1910 			 * case, there should be no one else waiting on the proc in AIO...
1911 			 */
1912 			wakeup_one((caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN);
1913 
1914 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1915 			    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1916 			    0, 0, 0);
1917 		}
1918 	} else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
1919 		/*
1920 		 * If this was the last request in the group, or not part of
1921 		 * a group, and that a signal is desired, send one.
1922 		 */
1923 		do_signal = (lio_pending == 0);
1924 	}
1925 
1926 	if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) {
1927 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1928 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1929 		    0, 0, 0);
1930 
1931 		if (!aio_proc_has_active_requests_for_file(p, entryp->aiocb.aio_fildes)) {
1932 			/* Can't wakeup_one(); multiple closes might be in progress. */
1933 			wakeup(&p->AIO_CLEANUP_SLEEP_CHAN);
1934 
1935 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1936 			    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1937 			    0, 0, 0);
1938 		}
1939 	}
1940 
1941 	aio_proc_unlock(p);
1942 
1943 	if (do_signal) {
1944 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig) | DBG_FUNC_NONE,
1945 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1946 		    entryp->aiocb.aio_sigevent.sigev_signo, 0, 0);
1947 
1948 		psignal(p, entryp->aiocb.aio_sigevent.sigev_signo);
1949 	}
1950 
1951 	/*
1952 	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
1953 	 * the done list before we moved our AIO there, then it already asserted its wait,
1954 	 * and we can wake it up without holding the lock.  If it checked the list after
1955 	 * we did our move, then it has already seen the AIO that we moved.  Ergo, we
1956 	 * can do our wakeup without holding the lock.
1957 	 */
1958 	wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);
1959 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake) | DBG_FUNC_NONE,
1960 	    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), 0, 0, 0);
1961 
1962 	aio_entry_unref(entryp); /* see aio_try_enqueue_work_locked */
1963 	if (leader) {
1964 		aio_entry_unref(leader); /* see lio_listio */
1965 	}
1966 }
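
/*
 * Illustrative sketch (not part of the original source): the SIGEV_SIGNAL
 * path above is what a hypothetical user-space request like this exercises:
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	cb.aio_buf    = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo  = SIGUSR1;
 *	aio_read(&cb);	// on completion, psignal() delivers SIGUSR1
 */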
1967 
1968 
1969 /*
1970  * do_aio_read
1971  */
1972 static int
1973 do_aio_read(aio_workq_entry *entryp)
1974 {
1975 	struct proc     *p = entryp->procp;
1976 	struct fileproc *fp;
1977 	int error;
1978 
1979 	if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
1980 		return error;
1981 	}
1982 
1983 	if (fp->fp_glob->fg_flag & FREAD) {
1984 		error = dofileread(&entryp->context, fp,
1985 		    entryp->aiocb.aio_buf,
1986 		    entryp->aiocb.aio_nbytes,
1987 		    entryp->aiocb.aio_offset, FOF_OFFSET,
1988 		    &entryp->returnval);
1989 	} else {
1990 		error = EBADF;
1991 	}
1992 
1993 	fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
1994 	return error;
1995 }
1996 
1997 
1998 /*
1999  * do_aio_write
2000  */
2001 static int
2002 do_aio_write(aio_workq_entry *entryp)
2003 {
2004 	struct proc     *p = entryp->procp;
2005 	struct fileproc *fp;
2006 	int error;
2007 
2008 	if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
2009 		return error;
2010 	}
2011 
2012 	if (fp->fp_glob->fg_flag & FWRITE) {
2013 		int flags = 0;
2014 
2015 		if ((fp->fp_glob->fg_flag & O_APPEND) == 0) {
2016 			flags |= FOF_OFFSET;
2017 		}
2018 
2019 		/* NB: tell dofilewrite the offset, and to use the proc cred */
2020 		error = dofilewrite(&entryp->context,
2021 		    fp,
2022 		    entryp->aiocb.aio_buf,
2023 		    entryp->aiocb.aio_nbytes,
2024 		    entryp->aiocb.aio_offset,
2025 		    flags,
2026 		    &entryp->returnval);
2027 	} else {
2028 		error = EBADF;
2029 	}
2030 
2031 	fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2032 	return error;
2033 }
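
/*
 * Note: because FOF_OFFSET is omitted above when the file was opened
 * O_APPEND, an aio_write() on such a descriptor appends at end-of-file
 * and the aiocb's aio_offset is ignored, matching POSIX semantics for
 * writes to O_APPEND descriptors.
 */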
2034 
2035 
2036 /*
2037  * aio_has_active_requests_for_process - return whether the process has active
2038  * requests pending.
2039  */
2040 static bool
2041 aio_has_active_requests_for_process(proc_t procp)
2042 {
2043 	return !TAILQ_EMPTY(&procp->p_aio_activeq);
2044 }
2045 
2046 /*
2047  * Called with the proc locked.
2048  */
2049 static bool
2050 aio_proc_has_active_requests_for_file(proc_t procp, int fd)
2051 {
2052 	aio_workq_entry *entryp;
2053 
2054 	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2055 		if (entryp->aiocb.aio_fildes == fd) {
2056 			return true;
2057 		}
2058 	}
2059 
2060 	return false;
2061 }
2062 
2063 
2064 /*
2065  * do_aio_fsync
2066  */
2067 static int
2068 do_aio_fsync(aio_workq_entry *entryp)
2069 {
2070 	struct proc            *p = entryp->procp;
2071 	struct vnode           *vp;
2072 	struct fileproc        *fp;
2073 	int                     sync_flag;
2074 	int                     error;
2075 
2076 	/*
2077 	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2078 	 *
2079 	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2080 	 * to mark for update the metadata not strictly necessary for data
2081 	 * retrieval, rather than forcing it to disk.
2082 	 *
2083 	 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
2084 	 * necessary for data retrieval (e.g. atime, mtime, ctime) is committed
2085 	 * to stable storage.
2086 	 *
2087 	 * Metadata necessary for data retrieval must be committed to stable
2088 	 * storage in either case (file length, etc.).
2089 	 */
2090 	if (entryp->flags & AIO_FSYNC) {
2091 		sync_flag = MNT_WAIT;
2092 	} else {
2093 		sync_flag = MNT_DWAIT;
2094 	}
2095 
2096 	error = fp_get_ftype(p, entryp->aiocb.aio_fildes, DTYPE_VNODE, ENOTSUP, &fp);
2097 	if (error != 0) {
2098 		entryp->returnval = -1;
2099 		return error;
2100 	}
2101 	vp = fp_get_data(fp);
2102 
2103 	if ((error = vnode_getwithref(vp)) == 0) {
2104 		error = VNOP_FSYNC(vp, sync_flag, &entryp->context);
2105 
2106 		(void)vnode_put(vp);
2107 	} else {
2108 		entryp->returnval = -1;
2109 	}
2110 
2111 	fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2112 	return error;
2113 }
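
/*
 * Illustrative sketch (not part of the original source): the MNT_WAIT /
 * MNT_DWAIT split above corresponds to the op argument of the user-space
 * aio_fsync() call; fd is a placeholder:
 *
 *	struct aiocb cb = { 0 };
 *	cb.aio_fildes = fd;
 *	aio_fsync(O_SYNC,  &cb);	// AIO_FSYNC -> VNOP_FSYNC(vp, MNT_WAIT, ...)
 *	aio_fsync(O_DSYNC, &cb);	// AIO_DSYNC -> VNOP_FSYNC(vp, MNT_DWAIT, ...)
 */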
2114 
2115 
2116 /*
2117  * is_already_queued - runs through our queues to see if the given
2118  * aiocbp / process is there.  Returns TRUE if there is a match
2119  * on any of our aio queues.
2120  *
2121  * Called with proc aio lock held (can be held spin)
2122  */
2123 static boolean_t
2124 is_already_queued(proc_t procp, user_addr_t aiocbp)
2125 {
2126 	aio_workq_entry *entryp;
2127 	boolean_t        result;
2128 
2129 	result = FALSE;
2130 
2131 	/* look for matches on our queue of async IO requests that have completed */
2132 	TAILQ_FOREACH(entryp, &procp->p_aio_doneq, aio_proc_link) {
2133 		if (aiocbp == entryp->uaiocbp) {
2134 			result = TRUE;
2135 			goto ExitThisRoutine;
2136 		}
2137 	}
2138 
2139 	/* look for matches on our queue of active async IO requests */
2140 	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2141 		if (aiocbp == entryp->uaiocbp) {
2142 			result = TRUE;
2143 			goto ExitThisRoutine;
2144 		}
2145 	}
2146 
2147 ExitThisRoutine:
2148 	return result;
2149 }
2150 
2151 
2152 /*
2153  * aio initialization
2154  */
2155 __private_extern__ void
2156 aio_init(void)
2157 {
2158 	for (int i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2159 		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2160 	}
2161 
2162 	_aio_create_worker_threads(aio_worker_threads);
2163 }
2164 
2165 
2166 /*
2167  * aio worker threads created here.
2168  */
2169 __private_extern__ void
2170 _aio_create_worker_threads(int num)
2171 {
2172 	int i;
2173 
2174 	/* create some worker threads to handle the async IO requests */
2175 	for (i = 0; i < num; i++) {
2176 		thread_t                myThread;
2177 
2178 		if (KERN_SUCCESS != kernel_thread_start(aio_work_thread, NULL, &myThread)) {
2179 			printf("%s - failed to create a work thread \n", __FUNCTION__);
2180 		} else {
2181 			thread_deallocate(myThread);
2182 		}
2183 	}
2184 }
2185 
2186 /*
2187  * Return the current thread's aio task (set while an AIO worker services a request)
2188  */
2189 task_t
2190 get_aiotask(void)
2191 {
2192 	return current_uthread()->uu_aio_task;
2193 }
2194 
2195 
2196 /*
2197  * In the case of an aiocb from a
2198  * 32-bit process we need to expand some longs and pointers to the correct
2199  * sizes in order to let downstream code always work on the same type of
2200  * aiocb (in our case that is a user_aiocb)
2201  */
2202 static void
2203 do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2204 {
2205 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2206 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2207 	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2208 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2209 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2210 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2211 
2212 	/* special case here.  since we do not know if sigev_value is an */
2213 	/* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
2214 	/* means if we send this info back to user space we need to remember */
2215 	/* sigev_value was not expanded for the 32-bit case.  */
2216 	/* NOTE - this does NOT affect us since we don't support sigev_value */
2217 	/* yet in the aio context.  */
2218 	//LP64
2219 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2220 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2221 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2222 	    my_aiocbp->aio_sigevent.sigev_value.sival_int;
2223 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2224 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2225 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2226 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2227 }
2228 
2229 /* Similar for 64-bit user process, so that we don't need to satisfy
2230  * the alignment constraints of the original user64_aiocb
2231  */
2232 #if !__LP64__
2233 __dead2
2234 #endif
2235 static void
2236 do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2237 {
2238 #if __LP64__
2239 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2240 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2241 	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2242 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2243 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2244 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2245 
2246 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2247 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2248 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2249 	    my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2250 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2251 	    my_aiocbp->aio_sigevent.sigev_notify_function;
2252 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2253 	    my_aiocbp->aio_sigevent.sigev_notify_attributes;
2254 #else
2255 #pragma unused(my_aiocbp, the_user_aiocbp)
2256 	panic("64bit process on 32bit kernel is not supported");
2257 #endif
2258 }
2259