xref: /xnu-8020.140.41/bsd/kern/kern_aio.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 2003-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 /*
31  * todo:
32  *		1) ramesh is looking into how to replace taking a reference on
33  *		        the user's map (vm_map_reference()) since it is believed that
34  *			would not hold the process for us.
35  *		2) david is looking into a way for us to set the priority of the
36  *		        worker threads to match that of the user's thread when the
37  *		        async IO was queued.
38  */
39 
40 
41 /*
42  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43  */
44 
45 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/file_internal.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/malloc.h>
52 #include <sys/mount_internal.h>
53 #include <sys/param.h>
54 #include <sys/proc_internal.h>
55 #include <sys/sysctl.h>
56 #include <sys/unistd.h>
57 #include <sys/user.h>
58 
59 #include <sys/aio_kern.h>
60 #include <sys/sysproto.h>
61 
62 #include <machine/limits.h>
63 
64 #include <mach/mach_types.h>
65 #include <kern/kern_types.h>
66 #include <kern/waitq.h>
67 #include <kern/zalloc.h>
68 #include <kern/task.h>
69 #include <kern/sched_prim.h>
70 
71 #include <vm/vm_map.h>
72 
73 #include <os/refcnt.h>
74 
75 #include <sys/kdebug.h>
76 #define AIO_work_queued                 1
77 #define AIO_worker_wake                 2
78 #define AIO_completion_sig              3
79 #define AIO_completion_cleanup_wait     4
80 #define AIO_completion_cleanup_wake     5
81 #define AIO_completion_suspend_wake     6
82 #define AIO_fsync_delay                 7
83 #define AIO_cancel                      10
84 #define AIO_cancel_async_workq          11
85 #define AIO_cancel_sync_workq           12
86 #define AIO_cancel_activeq              13
87 #define AIO_cancel_doneq                14
88 #define AIO_fsync                       20
89 #define AIO_read                        30
90 #define AIO_write                       40
91 #define AIO_listio                      50
92 #define AIO_error                       60
93 #define AIO_error_val                   61
94 #define AIO_error_activeq               62
95 #define AIO_error_workq                 63
96 #define AIO_return                      70
97 #define AIO_return_val                  71
98 #define AIO_return_activeq              72
99 #define AIO_return_workq                73
100 #define AIO_exec                        80
101 #define AIO_exit                        90
102 #define AIO_exit_sleep                  91
103 #define AIO_close                       100
104 #define AIO_close_sleep                 101
105 #define AIO_suspend                     110
106 #define AIO_suspend_sleep               111
107 #define AIO_worker_thread               120
108 
109 __options_decl(aio_entry_flags_t, uint32_t, {
110 	AIO_READ        = 0x00000001, /* a read */
111 	AIO_WRITE       = 0x00000002, /* a write */
112 	AIO_FSYNC       = 0x00000004, /* aio_fsync with op = O_SYNC */
113 	AIO_DSYNC       = 0x00000008, /* aio_fsync with op = O_DSYNC (not supported yet) */
114 	AIO_LIO         = 0x00000010, /* lio_listio generated IO */
115 	AIO_LIO_WAIT    = 0x00000020, /* lio_listio is waiting on the leader */
116 
117 	/*
118 	 * These flags mean that this entry is blocking either:
119 	 * - close (AIO_CLOSE_WAIT)
120 	 * - exit or exec (AIO_EXIT_WAIT)
121 	 *
122 	 * These flags are mutually exclusive, and the AIO_EXIT_WAIT variant
123 	 * will also neuter notifications in do_aio_completion_and_unlock().
124 	 */
125 	AIO_CLOSE_WAIT  = 0x00004000,
126 	AIO_EXIT_WAIT   = 0x00008000,
127 });
128 
129 /*! @struct aio_workq_entry
130  *
131  * @discussion
132  * This represents a piece of aio/lio work.
133  *
134  * The ownership rules go as follows:
135  *
136  * - the "proc" owns one refcount on the entry (from creation), while it is
137  *   enqueued on the aio_activeq and then the aio_doneq.
138  *
139  *   either aio_return() (user read the status) or _aio_exit() (the process
140  *   died) will dequeue the entry and consume this ref.
141  *
142  * - the async workqueue owns one refcount once the work is submitted,
143  *   which is consumed in do_aio_completion_and_unlock().
144  *
145  *   This ref protects the entry for the the end of
146  *   do_aio_completion_and_unlock() (when signal delivery happens).
147  *
148  * - lio_listio() for batches picks one of the entries to be the "leader"
149  *   of the batch. Each work item will have a refcount on its leader
150  *   so that the accounting of the batch completion can be done on the leader
151  *   (to be able to decrement lio_pending).
152  *
153  *   This ref is consumed in do_aio_completion_and_unlock() as well.
154  *
155  * - lastly, in lio_listio() when the LIO_WAIT behavior is requested,
156  *   an extra ref is taken in this syscall as it needs to keep accessing
157  *   the leader "lio_pending" field until it hits 0.
158  */
159 struct aio_workq_entry {
160 	/* queue lock */
161 	TAILQ_ENTRY(aio_workq_entry)    aio_workq_link;
162 
163 	/* Proc lock */
164 	TAILQ_ENTRY(aio_workq_entry)    aio_proc_link;  /* p_aio_activeq or p_aio_doneq */
165 	user_ssize_t                    returnval;      /* return value from read / write request */
166 	errno_t                         errorval;       /* error value from read / write request */
167 	os_refcnt_t                     aio_refcount;
168 	aio_entry_flags_t               flags;
169 
170 	int                             lio_pending;    /* pending I/Os in lio group, only on leader */
171 	struct aio_workq_entry         *lio_leader;     /* pointer to the lio leader, can be self */
172 
173 	/* Initialized and never changed, safe to access */
174 	struct proc                    *procp;          /* user proc that queued this request */
175 	user_addr_t                     uaiocbp;        /* pointer passed in from user land */
176 	struct user_aiocb               aiocb;          /* copy of aiocb from user land */
177 	thread_t                        thread;         /* thread that queued this request */
178 
179 	/* Initialized, and possibly freed by aio_work_thread() or at free if cancelled */
180 	vm_map_t                        aio_map;        /* user land map we have a reference to */
181 };
182 
183 /*
184  * aio requests queue up on the aio_async_workq or lio_sync_workq (for
185  * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
186  * (proc.aio_activeq) when one of our worker threads start the IO.
187  * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
188  * when the IO request completes.  The request remains on aio_doneq until
189  * user process calls aio_return or the process exits, either way that is our
190  * trigger to release aio resources.
191  */
192 typedef struct aio_workq   {
193 	TAILQ_HEAD(, aio_workq_entry)   aioq_entries;
194 	lck_spin_t                      aioq_lock;
195 	struct waitq                    aioq_waitq;
196 } *aio_workq_t;
197 
198 #define AIO_NUM_WORK_QUEUES 1
199 struct aio_anchor_cb {
200 	os_atomic(int)          aio_total_count;        /* total extant entries */
201 
202 	/* Hash table of queues here */
203 	int                     aio_num_workqs;
204 	struct aio_workq        aio_async_workqs[AIO_NUM_WORK_QUEUES];
205 };
206 typedef struct aio_anchor_cb aio_anchor_cb;
207 
208 /*
209  * Notes on aio sleep / wake channels.
210  * We currently pick a couple fields within the proc structure that will allow
211  * us sleep channels that currently do not collide with any other kernel routines.
212  * At this time, for binary compatibility reasons, we cannot create new proc fields.
213  */
214 #define AIO_SUSPEND_SLEEP_CHAN  p_aio_activeq
215 #define AIO_CLEANUP_SLEEP_CHAN  p_aio_total_count
216 
217 #define ASSERT_AIO_FROM_PROC(aiop, theproc)     \
218 	if ((aiop)->procp != (theproc)) {       \
219 	        panic("AIO on a proc list that does not belong to that proc."); \
220 	}
221 
222 /*
223  *  LOCAL PROTOTYPES
224  */
225 static void             aio_proc_lock(proc_t procp);
226 static void             aio_proc_lock_spin(proc_t procp);
227 static void             aio_proc_unlock(proc_t procp);
228 static lck_mtx_t       *aio_proc_mutex(proc_t procp);
229 static bool             aio_has_active_requests_for_process(proc_t procp);
230 static bool             aio_proc_has_active_requests_for_file(proc_t procp, int fd);
231 static boolean_t        is_already_queued(proc_t procp, user_addr_t aiocbp);
232 
233 static aio_workq_t      aio_entry_workq(aio_workq_entry *entryp);
234 static void             aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
235 static void             aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
236 static void             aio_entry_ref(aio_workq_entry *entryp);
237 static void             aio_entry_unref(aio_workq_entry *entryp);
238 static bool             aio_entry_try_workq_remove(aio_workq_entry *entryp);
239 static boolean_t        aio_delay_fsync_request(aio_workq_entry *entryp);
240 static void             aio_free_request(aio_workq_entry *entryp);
241 
242 static void             aio_workq_init(aio_workq_t wq);
243 static void             aio_workq_lock_spin(aio_workq_t wq);
244 static void             aio_workq_unlock(aio_workq_t wq);
245 static lck_spin_t      *aio_workq_lock(aio_workq_t wq);
246 
247 static void             aio_work_thread(void *arg, wait_result_t wr);
248 static aio_workq_entry *aio_get_some_work(void);
249 
250 static int              aio_queue_async_request(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
251 static int              aio_validate(proc_t, aio_workq_entry *entryp);
252 
253 static int              do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t);
254 static void             do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp);
255 static int              do_aio_fsync(aio_workq_entry *entryp);
256 static int              do_aio_read(aio_workq_entry *entryp);
257 static int              do_aio_write(aio_workq_entry *entryp);
258 static void             do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
259 static void             do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
260 static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t);
261 static int              aio_copy_in_list(proc_t, user_addr_t, user_addr_t *, int);
262 
263 #define ASSERT_AIO_PROC_LOCK_OWNED(p)   LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED)
264 #define ASSERT_AIO_WORKQ_LOCK_OWNED(q)  LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED)
265 
266 /*
267  *  EXTERNAL PROTOTYPES
268  */
269 
270 /* in ...bsd/kern/sys_generic.c */
271 extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
272     user_addr_t bufp, user_size_t nbyte,
273     off_t offset, int flags, user_ssize_t *retval);
274 extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
275     user_addr_t bufp, user_size_t nbyte, off_t offset,
276     int flags, user_ssize_t *retval);
277 
278 /*
279  * aio external global variables.
280  */
281 extern int aio_max_requests;                    /* AIO_MAX - configurable */
282 extern int aio_max_requests_per_process;        /* AIO_PROCESS_MAX - configurable */
283 extern int aio_worker_threads;                  /* AIO_THREAD_COUNT - configurable */
284 
285 
286 /*
287  * aio static variables.
288  */
289 static aio_anchor_cb aio_anchor = {
290 	.aio_num_workqs = AIO_NUM_WORK_QUEUES,
291 };
292 os_refgrp_decl(static, aio_refgrp, "aio", NULL);
293 static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc");
294 static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue");
295 static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp);
296 
297 static ZONE_DEFINE_TYPE(aio_workq_zonep, "aiowq", aio_workq_entry,
298     ZC_ZFREE_CLEARMEM);
299 
300 /* Hash */
301 static aio_workq_t
aio_entry_workq(__unused aio_workq_entry * entryp)302 aio_entry_workq(__unused aio_workq_entry *entryp)
303 {
304 	return &aio_anchor.aio_async_workqs[0];
305 }
306 
307 static void
aio_workq_init(aio_workq_t wq)308 aio_workq_init(aio_workq_t wq)
309 {
310 	TAILQ_INIT(&wq->aioq_entries);
311 	lck_spin_init(&wq->aioq_lock, &aio_queue_lock_grp, LCK_ATTR_NULL);
312 	waitq_init(&wq->aioq_waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
313 }
314 
315 
316 /*
317  * Can be passed a queue which is locked spin.
318  */
319 static void
aio_workq_remove_entry_locked(aio_workq_t queue,aio_workq_entry * entryp)320 aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
321 {
322 	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
323 
324 	if (entryp->aio_workq_link.tqe_prev == NULL) {
325 		panic("Trying to remove an entry from a work queue, but it is not on a queue");
326 	}
327 
328 	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
329 	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
330 }
331 
332 static void
aio_workq_add_entry_locked(aio_workq_t queue,aio_workq_entry * entryp)333 aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
334 {
335 	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
336 
337 	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
338 }
339 
340 static void
aio_proc_lock(proc_t procp)341 aio_proc_lock(proc_t procp)
342 {
343 	lck_mtx_lock(aio_proc_mutex(procp));
344 }
345 
346 static void
aio_proc_lock_spin(proc_t procp)347 aio_proc_lock_spin(proc_t procp)
348 {
349 	lck_mtx_lock_spin(aio_proc_mutex(procp));
350 }
351 
352 static bool
aio_has_any_work(void)353 aio_has_any_work(void)
354 {
355 	return os_atomic_load(&aio_anchor.aio_total_count, relaxed) != 0;
356 }
357 
358 static bool
aio_try_proc_insert_active_locked(proc_t procp,aio_workq_entry * entryp)359 aio_try_proc_insert_active_locked(proc_t procp, aio_workq_entry *entryp)
360 {
361 	int old, new;
362 
363 	ASSERT_AIO_PROC_LOCK_OWNED(procp);
364 
365 	if (procp->p_aio_total_count >= aio_max_requests_per_process) {
366 		return false;
367 	}
368 
369 	if (is_already_queued(procp, entryp->uaiocbp)) {
370 		return false;
371 	}
372 
373 	os_atomic_rmw_loop(&aio_anchor.aio_total_count, old, new, relaxed, {
374 		if (old >= aio_max_requests) {
375 		        os_atomic_rmw_loop_give_up(return false);
376 		}
377 		new = old + 1;
378 	});
379 
380 	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
381 	procp->p_aio_total_count++;
382 	return true;
383 }
384 
385 static void
aio_proc_move_done_locked(proc_t procp,aio_workq_entry * entryp)386 aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
387 {
388 	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
389 	TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
390 }
391 
392 static void
aio_proc_remove_done_locked(proc_t procp,aio_workq_entry * entryp)393 aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
394 {
395 	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
396 	entryp->aio_proc_link.tqe_prev = NULL;
397 	if (os_atomic_dec_orig(&aio_anchor.aio_total_count, relaxed) <= 0) {
398 		panic("Negative total AIO count!");
399 	}
400 	if (procp->p_aio_total_count-- <= 0) {
401 		panic("proc %p: p_aio_total_count accounting mismatch", procp);
402 	}
403 }
404 
405 static void
aio_proc_unlock(proc_t procp)406 aio_proc_unlock(proc_t procp)
407 {
408 	lck_mtx_unlock(aio_proc_mutex(procp));
409 }
410 
411 static lck_mtx_t*
aio_proc_mutex(proc_t procp)412 aio_proc_mutex(proc_t procp)
413 {
414 	return &procp->p_mlock;
415 }
416 
417 static void
aio_entry_ref(aio_workq_entry * entryp)418 aio_entry_ref(aio_workq_entry *entryp)
419 {
420 	os_ref_retain(&entryp->aio_refcount);
421 }
422 
423 static void
aio_entry_unref(aio_workq_entry * entryp)424 aio_entry_unref(aio_workq_entry *entryp)
425 {
426 	if (os_ref_release(&entryp->aio_refcount) == 0) {
427 		aio_free_request(entryp);
428 	}
429 }
430 
431 static bool
aio_entry_try_workq_remove(aio_workq_entry * entryp)432 aio_entry_try_workq_remove(aio_workq_entry *entryp)
433 {
434 	/* Can only be cancelled if it's still on a work queue */
435 	if (entryp->aio_workq_link.tqe_prev != NULL) {
436 		aio_workq_t queue;
437 
438 		/* Will have to check again under the lock */
439 		queue = aio_entry_workq(entryp);
440 		aio_workq_lock_spin(queue);
441 		if (entryp->aio_workq_link.tqe_prev != NULL) {
442 			aio_workq_remove_entry_locked(queue, entryp);
443 			aio_workq_unlock(queue);
444 			return true;
445 		} else {
446 			aio_workq_unlock(queue);
447 		}
448 	}
449 
450 	return false;
451 }
452 
453 static void
aio_workq_lock_spin(aio_workq_t wq)454 aio_workq_lock_spin(aio_workq_t wq)
455 {
456 	lck_spin_lock(aio_workq_lock(wq));
457 }
458 
459 static void
aio_workq_unlock(aio_workq_t wq)460 aio_workq_unlock(aio_workq_t wq)
461 {
462 	lck_spin_unlock(aio_workq_lock(wq));
463 }
464 
465 static lck_spin_t*
aio_workq_lock(aio_workq_t wq)466 aio_workq_lock(aio_workq_t wq)
467 {
468 	return &wq->aioq_lock;
469 }
470 
471 /*
472  * aio_cancel - attempt to cancel one or more async IO requests currently
473  * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
474  * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
475  * is NULL then all outstanding async IO request for the given file
476  * descriptor are cancelled (if possible).
477  */
478 int
aio_cancel(proc_t p,struct aio_cancel_args * uap,int * retval)479 aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval)
480 {
481 	struct user_aiocb my_aiocb;
482 	int               result;
483 
484 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_START,
485 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
486 
487 	/* quick check to see if there are any async IO requests queued up */
488 	if (!aio_has_any_work()) {
489 		result = 0;
490 		*retval = AIO_ALLDONE;
491 		goto ExitRoutine;
492 	}
493 
494 	*retval = -1;
495 	if (uap->aiocbp != USER_ADDR_NULL) {
496 		if (proc_is64bit(p)) {
497 			struct user64_aiocb aiocb64;
498 
499 			result = copyin(uap->aiocbp, &aiocb64, sizeof(aiocb64));
500 			if (result == 0) {
501 				do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
502 			}
503 		} else {
504 			struct user32_aiocb aiocb32;
505 
506 			result = copyin(uap->aiocbp, &aiocb32, sizeof(aiocb32));
507 			if (result == 0) {
508 				do_munge_aiocb_user32_to_user(&aiocb32, &my_aiocb);
509 			}
510 		}
511 
512 		if (result != 0) {
513 			result = EAGAIN;
514 			goto ExitRoutine;
515 		}
516 
517 		/* NOTE - POSIX standard says a mismatch between the file */
518 		/* descriptor passed in and the file descriptor embedded in */
519 		/* the aiocb causes unspecified results.  We return EBADF in */
520 		/* that situation.  */
521 		if (uap->fd != my_aiocb.aio_fildes) {
522 			result = EBADF;
523 			goto ExitRoutine;
524 		}
525 	}
526 
527 	aio_proc_lock(p);
528 	result = do_aio_cancel_locked(p, uap->fd, uap->aiocbp, 0);
529 	ASSERT_AIO_PROC_LOCK_OWNED(p);
530 	aio_proc_unlock(p);
531 
532 	if (result != -1) {
533 		*retval = result;
534 		result = 0;
535 		goto ExitRoutine;
536 	}
537 
538 	result = EBADF;
539 
540 ExitRoutine:
541 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_END,
542 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, 0, 0);
543 
544 	return result;
545 }
546 
547 
548 /*
549  * _aio_close - internal function used to clean up async IO requests for
550  * a file descriptor that is closing.
551  * THIS MAY BLOCK.
552  */
553 __private_extern__ void
_aio_close(proc_t p,int fd)554 _aio_close(proc_t p, int fd)
555 {
556 	int error;
557 
558 	/* quick check to see if there are any async IO requests queued up */
559 	if (!aio_has_any_work()) {
560 		return;
561 	}
562 
563 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_START,
564 	    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
565 
566 	/* cancel all async IO requests on our todo queues for this file descriptor */
567 	aio_proc_lock(p);
568 	error = do_aio_cancel_locked(p, fd, USER_ADDR_NULL, AIO_CLOSE_WAIT);
569 	ASSERT_AIO_PROC_LOCK_OWNED(p);
570 	if (error == AIO_NOTCANCELED) {
571 		/*
572 		 * AIO_NOTCANCELED is returned when we find an aio request for this process
573 		 * and file descriptor on the active async IO queue.  Active requests cannot
574 		 * be cancelled so we must wait for them to complete.  We will get a special
575 		 * wake up call on our channel used to sleep for ALL active requests to
576 		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
577 		 * when we must wait for all active aio requests.
578 		 */
579 
580 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep) | DBG_FUNC_NONE,
581 		    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
582 
583 		while (aio_proc_has_active_requests_for_file(p, fd)) {
584 			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0);
585 		}
586 	}
587 
588 	aio_proc_unlock(p);
589 
590 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_close) | DBG_FUNC_END,
591 	    VM_KERNEL_ADDRPERM(p), fd, 0, 0, 0);
592 }
593 
594 
595 /*
596  * aio_error - return the error status associated with the async IO
597  * request referred to by uap->aiocbp.  The error status is the errno
598  * value that would be set by the corresponding IO request (read, wrtie,
599  * fdatasync, or sync).
600  */
601 int
aio_error(proc_t p,struct aio_error_args * uap,int * retval)602 aio_error(proc_t p, struct aio_error_args *uap, int *retval)
603 {
604 	aio_workq_entry *entryp;
605 	int              error;
606 
607 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_START,
608 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
609 
610 	/* see if there are any aios to check */
611 	if (!aio_has_any_work()) {
612 		return EINVAL;
613 	}
614 
615 	aio_proc_lock(p);
616 
617 	/* look for a match on our queue of async IO requests that have completed */
618 	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
619 		if (entryp->uaiocbp == uap->aiocbp) {
620 			ASSERT_AIO_FROM_PROC(entryp, p);
621 
622 			*retval = entryp->errorval;
623 			error = 0;
624 
625 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val) | DBG_FUNC_NONE,
626 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
627 			goto ExitRoutine;
628 		}
629 	}
630 
631 	/* look for a match on our queue of active async IO requests */
632 	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
633 		if (entryp->uaiocbp == uap->aiocbp) {
634 			ASSERT_AIO_FROM_PROC(entryp, p);
635 			*retval = EINPROGRESS;
636 			error = 0;
637 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq) | DBG_FUNC_NONE,
638 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
639 			goto ExitRoutine;
640 		}
641 	}
642 
643 	error = EINVAL;
644 
645 ExitRoutine:
646 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_error) | DBG_FUNC_END,
647 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
648 	aio_proc_unlock(p);
649 
650 	return error;
651 }
652 
653 
654 /*
655  * aio_fsync - asynchronously force all IO operations associated
656  * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
657  * queued at the time of the call to the synchronized completion state.
658  * NOTE - we do not support op O_DSYNC at this point since we do not support the
659  * fdatasync() call.
660  */
661 int
aio_fsync(proc_t p,struct aio_fsync_args * uap,int * retval)662 aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval)
663 {
664 	aio_entry_flags_t fsync_kind;
665 	int error;
666 
667 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_START,
668 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, uap->op, 0, 0);
669 
670 	*retval = 0;
671 	/* 0 := O_SYNC for binary backward compatibility with Panther */
672 	if (uap->op == O_SYNC || uap->op == 0) {
673 		fsync_kind = AIO_FSYNC;
674 	} else if (uap->op == O_DSYNC) {
675 		fsync_kind = AIO_DSYNC;
676 	} else {
677 		*retval = -1;
678 		error = EINVAL;
679 		goto ExitRoutine;
680 	}
681 
682 	error = aio_queue_async_request(p, uap->aiocbp, fsync_kind);
683 	if (error != 0) {
684 		*retval = -1;
685 	}
686 
687 ExitRoutine:
688 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync) | DBG_FUNC_END,
689 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
690 
691 	return error;
692 }
693 
694 
695 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
696  * file descriptor (uap->aiocbp->aio_fildes) into the buffer
697  * (uap->aiocbp->aio_buf).
698  */
699 int
aio_read(proc_t p,struct aio_read_args * uap,int * retval)700 aio_read(proc_t p, struct aio_read_args *uap, int *retval)
701 {
702 	int error;
703 
704 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_START,
705 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
706 
707 	*retval = 0;
708 
709 	error = aio_queue_async_request(p, uap->aiocbp, AIO_READ);
710 	if (error != 0) {
711 		*retval = -1;
712 	}
713 
714 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_read) | DBG_FUNC_END,
715 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
716 
717 	return error;
718 }
719 
720 
721 /*
722  * aio_return - return the return status associated with the async IO
723  * request referred to by uap->aiocbp.  The return status is the value
724  * that would be returned by corresponding IO request (read, write,
725  * fdatasync, or sync).  This is where we release kernel resources
726  * held for async IO call associated with the given aiocb pointer.
727  */
728 int
aio_return(proc_t p,struct aio_return_args * uap,user_ssize_t * retval)729 aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval)
730 {
731 	aio_workq_entry *entryp;
732 	int              error = EINVAL;
733 
734 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_START,
735 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
736 
737 	/* See if there are any entries to check */
738 	if (!aio_has_any_work()) {
739 		goto ExitRoutine;
740 	}
741 
742 	aio_proc_lock(p);
743 	*retval = 0;
744 
745 	/* look for a match on our queue of async IO requests that have completed */
746 	TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
747 		ASSERT_AIO_FROM_PROC(entryp, p);
748 		if (entryp->uaiocbp == uap->aiocbp) {
749 			/* Done and valid for aio_return(), pull it off the list */
750 			aio_proc_remove_done_locked(p, entryp);
751 
752 			*retval = entryp->returnval;
753 			error = 0;
754 			aio_proc_unlock(p);
755 
756 			aio_entry_unref(entryp);
757 
758 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val) | DBG_FUNC_NONE,
759 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
760 			goto ExitRoutine;
761 		}
762 	}
763 
764 	/* look for a match on our queue of active async IO requests */
765 	TAILQ_FOREACH(entryp, &p->p_aio_activeq, aio_proc_link) {
766 		ASSERT_AIO_FROM_PROC(entryp, p);
767 		if (entryp->uaiocbp == uap->aiocbp) {
768 			error = EINPROGRESS;
769 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq) | DBG_FUNC_NONE,
770 			    VM_KERNEL_ADDRPERM(p), uap->aiocbp, *retval, 0, 0);
771 			break;
772 		}
773 	}
774 
775 	aio_proc_unlock(p);
776 
777 ExitRoutine:
778 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_return) | DBG_FUNC_END,
779 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
780 
781 	return error;
782 }
783 
784 
785 /*
786  * _aio_exec - internal function used to clean up async IO requests for
787  * a process that is going away due to exec().  We cancel any async IOs
788  * we can and wait for those already active.  We also disable signaling
789  * for cancelled or active aio requests that complete.
790  * This routine MAY block!
791  */
792 __private_extern__ void
_aio_exec(proc_t p)793 _aio_exec(proc_t p)
794 {
795 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_START,
796 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
797 
798 	_aio_exit(p);
799 
800 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exec) | DBG_FUNC_END,
801 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
802 }
803 
804 
805 /*
806  * _aio_exit - internal function used to clean up async IO requests for
807  * a process that is terminating (via exit() or exec()).  We cancel any async IOs
808  * we can and wait for those already active.  We also disable signaling
809  * for cancelled or active aio requests that complete.  This routine MAY block!
810  */
811 __private_extern__ void
_aio_exit(proc_t p)812 _aio_exit(proc_t p)
813 {
814 	TAILQ_HEAD(, aio_workq_entry) tofree = TAILQ_HEAD_INITIALIZER(tofree);
815 	aio_workq_entry *entryp, *tmp;
816 	int              error;
817 
818 	/* quick check to see if there are any async IO requests queued up */
819 	if (!aio_has_any_work()) {
820 		return;
821 	}
822 
823 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_START,
824 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
825 
826 	aio_proc_lock(p);
827 
828 	/*
829 	 * cancel async IO requests on the todo work queue and wait for those
830 	 * already active to complete.
831 	 */
832 	error = do_aio_cancel_locked(p, -1, USER_ADDR_NULL, AIO_EXIT_WAIT);
833 	ASSERT_AIO_PROC_LOCK_OWNED(p);
834 	if (error == AIO_NOTCANCELED) {
835 		/*
836 		 * AIO_NOTCANCELED is returned when we find an aio request for this process
837 		 * on the active async IO queue.  Active requests cannot be cancelled so we
838 		 * must wait for them to complete.  We will get a special wake up call on
839 		 * our channel used to sleep for ALL active requests to complete.  This sleep
840 		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
841 		 * active aio requests.
842 		 */
843 
844 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep) | DBG_FUNC_NONE,
845 		    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
846 
847 		while (aio_has_active_requests_for_process(p)) {
848 			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0);
849 		}
850 	}
851 
852 	assert(!aio_has_active_requests_for_process(p));
853 
854 	/* release all aio resources used by this process */
855 	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_doneq, aio_proc_link, tmp) {
856 		ASSERT_AIO_FROM_PROC(entryp, p);
857 
858 		aio_proc_remove_done_locked(p, entryp);
859 		TAILQ_INSERT_TAIL(&tofree, entryp, aio_proc_link);
860 	}
861 
862 	aio_proc_unlock(p);
863 
864 	/* free all the entries outside of the aio_proc_lock() */
865 	TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) {
866 		entryp->aio_proc_link.tqe_prev = NULL;
867 		aio_entry_unref(entryp);
868 	}
869 
870 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_END,
871 	    VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0);
872 }
873 
874 
875 static bool
should_cancel(aio_workq_entry * entryp,int fd,user_addr_t aiocbp,aio_entry_flags_t reason)876 should_cancel(aio_workq_entry *entryp, int fd, user_addr_t aiocbp,
877     aio_entry_flags_t reason)
878 {
879 	if (reason & AIO_EXIT_WAIT) {
880 		/* caller is _aio_exit() */
881 		return true;
882 	}
883 	if (fd != entryp->aiocb.aio_fildes) {
884 		/* not the file we're looking for */
885 		return false;
886 	}
887 	/*
888 	 * aio_cancel() or _aio_close() cancel
889 	 * everything for a given fd when aiocbp is NULL
890 	 */
891 	return aiocbp == USER_ADDR_NULL || entryp->uaiocbp == aiocbp;
892 }
893 
894 /*
895  * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
896  * aio_cancel, close, and at exit.
897  * There are three modes of operation: 1) cancel all async IOs for a process -
898  * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
899  * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
900  * aiocbp.
901  * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
902  * target async IO requests, AIO_NOTCANCELED if we could not cancel all
903  * target async IO requests, and AIO_ALLDONE if all target async IO requests
904  * were already complete.
905  * WARNING - do not deference aiocbp in this routine, it may point to user
906  * land data that has not been copied in (when called from aio_cancel())
907  *
908  * Called with proc locked, and returns the same way.
909  */
910 static int
do_aio_cancel_locked(proc_t p,int fd,user_addr_t aiocbp,aio_entry_flags_t reason)911 do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
912     aio_entry_flags_t reason)
913 {
914 	bool multiple_matches = (aiocbp == USER_ADDR_NULL);
915 	aio_workq_entry *entryp, *tmp;
916 	int result;
917 
918 	ASSERT_AIO_PROC_LOCK_OWNED(p);
919 
920 	/* look for a match on our queue of async todo work. */
921 again:
922 	result = -1;
923 	TAILQ_FOREACH_SAFE(entryp, &p->p_aio_activeq, aio_proc_link, tmp) {
924 		ASSERT_AIO_FROM_PROC(entryp, p);
925 
926 		if (!should_cancel(entryp, fd, aiocbp, reason)) {
927 			continue;
928 		}
929 
930 		if (reason) {
931 			/* mark the entry as blocking close or exit/exec */
932 			entryp->flags |= reason;
933 			if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
934 				panic("Close and exit flags set at the same time");
935 			}
936 		}
937 
938 		/* Can only be cancelled if it's still on a work queue */
939 		if (aio_entry_try_workq_remove(entryp)) {
940 			entryp->errorval = ECANCELED;
941 			entryp->returnval = -1;
942 
943 			/* Now it's officially cancelled.  Do the completion */
944 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE,
945 			    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
946 			    fd, 0, 0);
947 			do_aio_completion_and_unlock(p, entryp);
948 
949 			aio_proc_lock(p);
950 
951 			if (multiple_matches) {
952 				/*
953 				 * Restart from the head of the proc active queue since it
954 				 * may have been changed while we were away doing completion
955 				 * processing.
956 				 *
957 				 * Note that if we found an uncancellable AIO before, we will
958 				 * either find it again or discover that it's been completed,
959 				 * so resetting the result will not cause us to return success
960 				 * despite outstanding AIOs.
961 				 */
962 				goto again;
963 			}
964 
965 			return AIO_CANCELED;
966 		}
967 
968 		/*
969 		 * It's been taken off the active queue already, i.e. is in flight.
970 		 * All we can do is ask for notification.
971 		 */
972 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq) | DBG_FUNC_NONE,
973 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
974 		    fd, 0, 0);
975 
976 		result = AIO_NOTCANCELED;
977 		if (!multiple_matches) {
978 			return result;
979 		}
980 	}
981 
982 	/*
983 	 * if we didn't find any matches on the todo or active queues then look for a
984 	 * match on our queue of async IO requests that have completed and if found
985 	 * return AIO_ALLDONE result.
986 	 *
987 	 * Proc AIO lock is still held.
988 	 */
989 	if (result == -1) {
990 		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
991 			ASSERT_AIO_FROM_PROC(entryp, p);
992 			if (should_cancel(entryp, fd, aiocbp, reason)) {
993 				KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq) | DBG_FUNC_NONE,
994 				    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
995 				    fd, 0, 0);
996 
997 				result = AIO_ALLDONE;
998 				if (!multiple_matches) {
999 					return result;
1000 				}
1001 			}
1002 		}
1003 	}
1004 
1005 	return result;
1006 }
1007 
1008 
1009 /*
1010  * aio_suspend - suspend the calling thread until at least one of the async
1011  * IO operations referenced by uap->aiocblist has completed, until a signal
1012  * interrupts the function, or uap->timeoutp time interval (optional) has
1013  * passed.
1014  * Returns 0 if one or more async IOs have completed else -1 and errno is
1015  * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1016  * woke us up.
1017  */
1018 int
aio_suspend(proc_t p,struct aio_suspend_args * uap,int * retval)1019 aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval)
1020 {
1021 	__pthread_testcancel(1);
1022 	return aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval);
1023 }
1024 
1025 
1026 int
aio_suspend_nocancel(proc_t p,struct aio_suspend_nocancel_args * uap,int * retval)1027 aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval)
1028 {
1029 	int                     error;
1030 	int                     i;
1031 	uint64_t                abstime;
1032 	struct user_timespec    ts;
1033 	aio_workq_entry        *entryp;
1034 	user_addr_t            *aiocbpp;
1035 	size_t                  aiocbpp_size;
1036 
1037 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_START,
1038 	    VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1039 
1040 	*retval = -1;
1041 	abstime = 0;
1042 	aiocbpp = NULL;
1043 
1044 	if (!aio_has_any_work()) {
1045 		error = EINVAL;
1046 		goto ExitThisRoutine;
1047 	}
1048 
1049 	if (uap->nent < 1 || uap->nent > aio_max_requests_per_process ||
1050 	    os_mul_overflow(sizeof(user_addr_t), uap->nent, &aiocbpp_size)) {
1051 		error = EINVAL;
1052 		goto ExitThisRoutine;
1053 	}
1054 
1055 	if (uap->timeoutp != USER_ADDR_NULL) {
1056 		if (proc_is64bit(p)) {
1057 			struct user64_timespec temp;
1058 			error = copyin(uap->timeoutp, &temp, sizeof(temp));
1059 			if (error == 0) {
1060 				ts.tv_sec = (user_time_t)temp.tv_sec;
1061 				ts.tv_nsec = (user_long_t)temp.tv_nsec;
1062 			}
1063 		} else {
1064 			struct user32_timespec temp;
1065 			error = copyin(uap->timeoutp, &temp, sizeof(temp));
1066 			if (error == 0) {
1067 				ts.tv_sec = temp.tv_sec;
1068 				ts.tv_nsec = temp.tv_nsec;
1069 			}
1070 		}
1071 		if (error != 0) {
1072 			error = EAGAIN;
1073 			goto ExitThisRoutine;
1074 		}
1075 
1076 		if (ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) {
1077 			error = EINVAL;
1078 			goto ExitThisRoutine;
1079 		}
1080 
1081 		nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1082 		    &abstime);
1083 		clock_absolutetime_interval_to_deadline(abstime, &abstime);
1084 	}
1085 
1086 	aiocbpp = (user_addr_t *)kalloc_data(aiocbpp_size, Z_WAITOK);
1087 	if (aiocbpp == NULL || aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1088 		error = EAGAIN;
1089 		goto ExitThisRoutine;
1090 	}
1091 
1092 	/* check list of aio requests to see if any have completed */
1093 check_for_our_aiocbp:
1094 	aio_proc_lock_spin(p);
1095 	for (i = 0; i < uap->nent; i++) {
1096 		user_addr_t     aiocbp;
1097 
1098 		/* NULL elements are legal so check for 'em */
1099 		aiocbp = *(aiocbpp + i);
1100 		if (aiocbp == USER_ADDR_NULL) {
1101 			continue;
1102 		}
1103 
1104 		/* return immediately if any aio request in the list is done */
1105 		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1106 			ASSERT_AIO_FROM_PROC(entryp, p);
1107 			if (entryp->uaiocbp == aiocbp) {
1108 				aio_proc_unlock(p);
1109 				*retval = 0;
1110 				error = 0;
1111 				goto ExitThisRoutine;
1112 			}
1113 		}
1114 	}
1115 
1116 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep) | DBG_FUNC_NONE,
1117 	    VM_KERNEL_ADDRPERM(p), uap->nent, 0, 0, 0);
1118 
1119 	/*
1120 	 * wait for an async IO to complete or a signal fires or timeout expires.
1121 	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1122 	 * interrupts us.  If an async IO completes before a signal fires or our
1123 	 * timeout expires, we get a wakeup call from aio_work_thread().
1124 	 */
1125 
1126 	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p),
1127 	    PCATCH | PWAIT | PDROP, "aio_suspend", abstime);
1128 	if (error == 0) {
1129 		/*
1130 		 * got our wakeup call from aio_work_thread().
1131 		 * Since we can get a wakeup on this channel from another thread in the
1132 		 * same process we head back up to make sure this is for the correct aiocbp.
1133 		 * If it is the correct aiocbp we will return from where we do the check
1134 		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1135 		 * else we will fall out and just sleep again.
1136 		 */
1137 		goto check_for_our_aiocbp;
1138 	} else if (error == EWOULDBLOCK) {
1139 		/* our timeout expired */
1140 		error = EAGAIN;
1141 	} else {
1142 		/* we were interrupted */
1143 		error = EINTR;
1144 	}
1145 
1146 ExitThisRoutine:
1147 	if (aiocbpp != NULL) {
1148 		kfree_data(aiocbpp, aiocbpp_size);
1149 	}
1150 
1151 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend) | DBG_FUNC_END,
1152 	    VM_KERNEL_ADDRPERM(p), uap->nent, error, 0, 0);
1153 
1154 	return error;
1155 }
1156 
1157 
1158 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1159  * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1160  * (uap->aiocbp->aio_buf).
1161  */
1162 
1163 int
aio_write(proc_t p,struct aio_write_args * uap,int * retval __unused)1164 aio_write(proc_t p, struct aio_write_args *uap, int *retval __unused)
1165 {
1166 	int error;
1167 
1168 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_START,
1169 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0);
1170 
1171 	error = aio_queue_async_request(p, uap->aiocbp, AIO_WRITE);
1172 
1173 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_write) | DBG_FUNC_END,
1174 	    VM_KERNEL_ADDRPERM(p), uap->aiocbp, error, 0, 0);
1175 
1176 	return error;
1177 }
1178 
1179 
1180 static int
aio_copy_in_list(proc_t procp,user_addr_t aiocblist,user_addr_t * aiocbpp,int nent)1181 aio_copy_in_list(proc_t procp, user_addr_t aiocblist, user_addr_t *aiocbpp,
1182     int nent)
1183 {
1184 	int result;
1185 
1186 	/* copyin our aiocb pointers from list */
1187 	result = copyin(aiocblist, aiocbpp,
1188 	    proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1189 	    : (nent * sizeof(user32_addr_t)));
1190 	if (result) {
1191 		return result;
1192 	}
1193 
1194 	/*
1195 	 * We depend on a list of user_addr_t's so we need to
1196 	 * munge and expand when these pointers came from a
1197 	 * 32-bit process
1198 	 */
1199 	if (!proc_is64bit(procp)) {
1200 		/* copy from last to first to deal with overlap */
1201 		user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1202 		user_addr_t *my_addrp = aiocbpp + (nent - 1);
1203 
1204 		for (int i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1205 			*my_addrp = (user_addr_t) (*my_ptrp);
1206 		}
1207 	}
1208 
1209 	return 0;
1210 }
1211 
1212 
1213 static int
aio_copy_in_sigev(proc_t procp,user_addr_t sigp,struct user_sigevent * sigev)1214 aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1215 {
1216 	int     result = 0;
1217 
1218 	if (sigp == USER_ADDR_NULL) {
1219 		goto out;
1220 	}
1221 
1222 	/*
1223 	 * We need to munge aio_sigevent since it contains pointers.
1224 	 * Since we do not know if sigev_value is an int or a ptr we do
1225 	 * NOT cast the ptr to a user_addr_t.   This means if we send
1226 	 * this info back to user space we need to remember sigev_value
1227 	 * was not expanded for the 32-bit case.
1228 	 *
1229 	 * Notes:	 This does NOT affect us since we don't support
1230 	 *		sigev_value yet in the aio context.
1231 	 */
1232 	if (proc_is64bit(procp)) {
1233 #if __LP64__
1234 		struct user64_sigevent sigevent64;
1235 
1236 		result = copyin(sigp, &sigevent64, sizeof(sigevent64));
1237 		if (result == 0) {
1238 			sigev->sigev_notify = sigevent64.sigev_notify;
1239 			sigev->sigev_signo = sigevent64.sigev_signo;
1240 			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1241 			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1242 			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1243 		}
1244 #else
1245 		panic("64bit process on 32bit kernel is not supported");
1246 #endif
1247 	} else {
1248 		struct user32_sigevent sigevent32;
1249 
1250 		result = copyin(sigp, &sigevent32, sizeof(sigevent32));
1251 		if (result == 0) {
1252 			sigev->sigev_notify = sigevent32.sigev_notify;
1253 			sigev->sigev_signo = sigevent32.sigev_signo;
1254 			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1255 			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1256 			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1257 		}
1258 	}
1259 
1260 	if (result != 0) {
1261 		result = EAGAIN;
1262 	}
1263 
1264 out:
1265 	return result;
1266 }
1267 
1268 /*
1269  * validate user_sigevent.  at this point we only support
1270  * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
1271  * sigev_value, sigev_notify_function, and sigev_notify_attributes
1272  * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
1273  * with no [RTS] (RalTime Signal) option group support.
1274  */
1275 static int
aio_sigev_validate(const struct user_sigevent * sigev)1276 aio_sigev_validate(const struct user_sigevent *sigev)
1277 {
1278 	switch (sigev->sigev_notify) {
1279 	case SIGEV_SIGNAL:
1280 	{
1281 		int signum;
1282 
1283 		/* make sure we have a valid signal number */
1284 		signum = sigev->sigev_signo;
1285 		if (signum <= 0 || signum >= NSIG ||
1286 		    signum == SIGKILL || signum == SIGSTOP) {
1287 			return EINVAL;
1288 		}
1289 	}
1290 	break;
1291 
1292 	case SIGEV_NONE:
1293 		break;
1294 
1295 	case SIGEV_THREAD:
1296 	/* Unsupported [RTS] */
1297 
1298 	default:
1299 		return EINVAL;
1300 	}
1301 
1302 	return 0;
1303 }
1304 
1305 
1306 /*
1307  * aio_try_enqueue_work_locked
1308  *
1309  * Queue up the entry on the aio asynchronous work queue in priority order
1310  * based on the relative priority of the request.  We calculate the relative
1311  * priority using the nice value of the caller and the value
1312  *
1313  * Parameters:	procp			Process queueing the I/O
1314  *		entryp			The work queue entry being queued
1315  *		leader			The work leader if any
1316  *
1317  * Returns:	Wether the enqueue was successful
1318  *
1319  * Notes:	This function is used for both lio_listio and aio
1320  *
1321  * XXX:		At some point, we may have to consider thread priority
1322  *		rather than process priority, but we don't maintain the
1323  *		adjusted priority for threads the POSIX way.
1324  *
1325  * Called with proc locked.
1326  */
1327 static bool
aio_try_enqueue_work_locked(proc_t procp,aio_workq_entry * entryp,aio_workq_entry * leader)1328 aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp,
1329     aio_workq_entry *leader)
1330 {
1331 	aio_workq_t queue = aio_entry_workq(entryp);
1332 
1333 	ASSERT_AIO_PROC_LOCK_OWNED(procp);
1334 
1335 	/* Onto proc queue */
1336 	if (!aio_try_proc_insert_active_locked(procp, entryp)) {
1337 		return false;
1338 	}
1339 
1340 	if (leader) {
1341 		aio_entry_ref(leader); /* consumed in do_aio_completion_and_unlock */
1342 		leader->lio_pending++;
1343 		entryp->lio_leader = leader;
1344 	}
1345 
1346 	/* And work queue */
1347 	aio_entry_ref(entryp); /* consumed in do_aio_completion_and_unlock */
1348 	aio_workq_lock_spin(queue);
1349 	aio_workq_add_entry_locked(queue, entryp);
1350 	waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue),
1351 	    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
1352 	aio_workq_unlock(queue);
1353 
1354 	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_START,
1355 	    VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1356 	    entryp->flags, entryp->aiocb.aio_fildes, 0);
1357 	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_END,
1358 	    entryp->aiocb.aio_offset, 0, entryp->aiocb.aio_nbytes, 0, 0);
1359 	return true;
1360 }
1361 
1362 
1363 /*
1364  * lio_listio - initiate a list of IO requests.  We process the list of
1365  * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1366  * (mode == LIO_NOWAIT).
1367  *
1368  * The caller gets error and return status for each aiocb in the list
1369  * via aio_error and aio_return.  We must keep completed requests until
1370  * released by the aio_return call.
1371  */
1372 int
lio_listio(proc_t p,struct lio_listio_args * uap,int * retval __unused)1373 lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused)
1374 {
1375 	aio_workq_entry         *entries[AIO_LISTIO_MAX] = { };
1376 	user_addr_t              aiocbpp[AIO_LISTIO_MAX];
1377 	struct user_sigevent     aiosigev = { };
1378 	int                      result = 0;
1379 	int                      lio_count = 0;
1380 
1381 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_START,
1382 	    VM_KERNEL_ADDRPERM(p), uap->nent, uap->mode, 0, 0);
1383 
1384 	if (!(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT)) {
1385 		result = EINVAL;
1386 		goto ExitRoutine;
1387 	}
1388 
1389 	if (uap->nent < 1 || uap->nent > AIO_LISTIO_MAX) {
1390 		result = EINVAL;
1391 		goto ExitRoutine;
1392 	}
1393 
1394 	/*
1395 	 * Use sigevent passed in to lio_listio for each of our calls, but
1396 	 * only do completion notification after the last request completes.
1397 	 */
1398 	if (uap->sigp != USER_ADDR_NULL) {
1399 		result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1400 		if (result) {
1401 			goto ExitRoutine;
1402 		}
1403 		result = aio_sigev_validate(&aiosigev);
1404 		if (result) {
1405 			goto ExitRoutine;
1406 		}
1407 	}
1408 
1409 	if (aio_copy_in_list(p, uap->aiocblist, aiocbpp, uap->nent)) {
1410 		result = EAGAIN;
1411 		goto ExitRoutine;
1412 	}
1413 
1414 	/*
1415 	 * allocate/parse all entries
1416 	 */
1417 	for (int i = 0; i < uap->nent; i++) {
1418 		aio_workq_entry *entryp;
1419 
1420 		/* NULL elements are legal so check for 'em */
1421 		if (aiocbpp[i] == USER_ADDR_NULL) {
1422 			continue;
1423 		}
1424 
1425 		entryp = aio_create_queue_entry(p, aiocbpp[i], AIO_LIO);
1426 		if (entryp == NULL) {
1427 			result = EAGAIN;
1428 			goto ExitRoutine;
1429 		}
1430 
1431 		/*
1432 		 * This refcount is cleaned up on exit if the entry
1433 		 * isn't submitted
1434 		 */
1435 		entries[lio_count++] = entryp;
1436 		if (uap->mode == LIO_NOWAIT) {
1437 			/* Set signal hander, if any */
1438 			entryp->aiocb.aio_sigevent = aiosigev;
1439 		}
1440 	}
1441 
1442 	if (lio_count == 0) {
1443 		/* There's nothing to submit */
1444 		goto ExitRoutine;
1445 	}
1446 
1447 	/*
1448 	 * Past this point we're commited and will not bail out
1449 	 *
1450 	 * - keep a reference on the leader for LIO_WAIT
1451 	 * - perform the submissions and optionally wait
1452 	 */
1453 
1454 	aio_workq_entry *leader = entries[0];
1455 	if (uap->mode == LIO_WAIT) {
1456 		aio_entry_ref(leader); /* consumed below */
1457 	}
1458 
1459 	aio_proc_lock_spin(p);
1460 
1461 	for (int i = 0; i < lio_count; i++) {
1462 		if (aio_try_enqueue_work_locked(p, entries[i], leader)) {
1463 			entries[i] = NULL; /* the entry was submitted */
1464 		} else {
1465 			result = EAGAIN;
1466 		}
1467 	}
1468 
1469 	if (uap->mode == LIO_WAIT && result == 0) {
1470 		leader->flags |= AIO_LIO_WAIT;
1471 
1472 		while (leader->lio_pending) {
1473 			/* If we were interrupted, fail out (even if all finished) */
1474 			if (msleep(leader, aio_proc_mutex(p),
1475 			    PCATCH | PRIBIO | PSPIN, "lio_listio", 0) != 0) {
1476 				result = EINTR;
1477 				break;
1478 			}
1479 		}
1480 
1481 		leader->flags &= ~AIO_LIO_WAIT;
1482 	}
1483 
1484 	aio_proc_unlock(p);
1485 
1486 	if (uap->mode == LIO_WAIT) {
1487 		aio_entry_unref(leader);
1488 	}
1489 
1490 ExitRoutine:
1491 	/* Consume unsubmitted entries */
1492 	for (int i = 0; i < lio_count; i++) {
1493 		if (entries[i]) {
1494 			aio_entry_unref(entries[i]);
1495 		}
1496 	}
1497 
1498 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_listio) | DBG_FUNC_END,
1499 	    VM_KERNEL_ADDRPERM(p), result, 0, 0, 0);
1500 
1501 	return result;
1502 }
1503 
1504 
1505 /*
1506  * aio worker thread.  this is where all the real work gets done.
1507  * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1508  * after new work is queued up.
1509  */
1510 __attribute__((noreturn))
1511 static void
aio_work_thread(void * arg __unused,wait_result_t wr __unused)1512 aio_work_thread(void *arg __unused, wait_result_t wr __unused)
1513 {
1514 	aio_workq_entry *entryp;
1515 	int              error;
1516 	vm_map_t         currentmap;
1517 	vm_map_t         oldmap = VM_MAP_NULL;
1518 	task_t           oldaiotask = TASK_NULL;
1519 	struct uthread  *uthreadp = NULL;
1520 	proc_t           p = NULL;
1521 
1522 	for (;;) {
1523 		/*
1524 		 * returns with the entry ref'ed.
1525 		 * sleeps until work is available.
1526 		 */
1527 		entryp = aio_get_some_work();
1528 		p = entryp->procp;
1529 
1530 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_START,
1531 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1532 		    entryp->flags, 0, 0);
1533 
1534 		/*
1535 		 * Assume the target's address space identity for the duration
1536 		 * of the IO.  Note: don't need to have the entryp locked,
1537 		 * because the proc and map don't change until it's freed.
1538 		 */
1539 		currentmap = get_task_map((current_proc())->task);
1540 		if (currentmap != entryp->aio_map) {
1541 			uthreadp = (struct uthread *) current_uthread();
1542 			oldaiotask = uthreadp->uu_aio_task;
1543 			/*
1544 			 * workq entries at this stage cause _aio_exec() and _aio_exit() to
1545 			 * block until we hit `do_aio_completion_and_unlock()` below,
1546 			 * which means that it is safe to dereference p->task without
1547 			 * holding a lock or taking references.
1548 			 */
1549 			uthreadp->uu_aio_task = p->task;
1550 			oldmap = vm_map_switch(entryp->aio_map);
1551 		}
1552 
1553 		if ((entryp->flags & AIO_READ) != 0) {
1554 			error = do_aio_read(entryp);
1555 		} else if ((entryp->flags & AIO_WRITE) != 0) {
1556 			error = do_aio_write(entryp);
1557 		} else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) {
1558 			error = do_aio_fsync(entryp);
1559 		} else {
1560 			error = EINVAL;
1561 		}
1562 
1563 		/* Restore old map */
1564 		if (currentmap != entryp->aio_map) {
1565 			vm_map_switch(oldmap);
1566 			uthreadp->uu_aio_task = oldaiotask;
1567 		}
1568 
1569 		/* liberate unused map */
1570 		vm_map_deallocate(entryp->aio_map);
1571 		entryp->aio_map = VM_MAP_NULL;
1572 
1573 		KERNEL_DEBUG(SDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END,
1574 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1575 		    entryp->errorval, entryp->returnval, 0);
1576 
1577 		/* we're done with the IO request so pop it off the active queue and */
1578 		/* push it on the done queue */
1579 		aio_proc_lock(p);
1580 		entryp->errorval = error;
1581 		do_aio_completion_and_unlock(p, entryp);
1582 	}
1583 }
1584 
1585 
1586 /*
1587  * aio_get_some_work - get the next async IO request that is ready to be executed.
1588  * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1589  * IO requests at the time the aio_fsync call came in have completed.
1590  * NOTE - AIO_LOCK must be held by caller
1591  */
1592 static aio_workq_entry *
aio_get_some_work(void)1593 aio_get_some_work(void)
1594 {
1595 	aio_workq_entry *entryp = NULL;
1596 	aio_workq_t      queue = NULL;
1597 
1598 	/* Just one queue for the moment.  In the future there will be many. */
1599 	queue = &aio_anchor.aio_async_workqs[0];
1600 	aio_workq_lock_spin(queue);
1601 
1602 	/*
1603 	 * Hold the queue lock.
1604 	 *
1605 	 * pop some work off the work queue and add to our active queue
1606 	 * Always start with the queue lock held.
1607 	 */
1608 	while ((entryp = TAILQ_FIRST(&queue->aioq_entries))) {
1609 		/*
1610 		 * Pull of of work queue.  Once it's off, it can't be cancelled,
1611 		 * so we can take our ref once we drop the queue lock.
1612 		 */
1613 
1614 		aio_workq_remove_entry_locked(queue, entryp);
1615 
1616 		aio_workq_unlock(queue);
1617 
1618 		/*
1619 		 * Check if it's an fsync that must be delayed.  No need to lock the entry;
1620 		 * that flag would have been set at initialization.
1621 		 */
1622 		if ((entryp->flags & AIO_FSYNC) != 0) {
1623 			/*
1624 			 * Check for unfinished operations on the same file
1625 			 * in this proc's queue.
1626 			 */
1627 			aio_proc_lock_spin(entryp->procp);
1628 			if (aio_delay_fsync_request(entryp)) {
1629 				/* It needs to be delayed.  Put it back on the end of the work queue */
1630 				KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE,
1631 				    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1632 				    0, 0, 0);
1633 
1634 				aio_proc_unlock(entryp->procp);
1635 
1636 				aio_workq_lock_spin(queue);
1637 				aio_workq_add_entry_locked(queue, entryp);
1638 				continue;
1639 			}
1640 			aio_proc_unlock(entryp->procp);
1641 		}
1642 
1643 		return entryp;
1644 	}
1645 
1646 	/* We will wake up when someone enqueues something */
1647 	waitq_assert_wait64(&queue->aioq_waitq, CAST_EVENT64_T(queue), THREAD_UNINT, 0);
1648 	aio_workq_unlock(queue);
1649 	thread_block(aio_work_thread);
1650 
1651 	__builtin_unreachable();
1652 }
1653 
1654 /*
1655  * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1656  * A big, simple hammer: only send it off if it's the most recently filed IO which has
1657  * not been completed.
1658  */
1659 static boolean_t
aio_delay_fsync_request(aio_workq_entry * entryp)1660 aio_delay_fsync_request(aio_workq_entry *entryp)
1661 {
1662 	if (proc_in_teardown(entryp->procp)) {
1663 		/*
1664 		 * we can't delay FSYNCS when in teardown as it will confuse _aio_exit,
1665 		 * if it was dequeued, then we must now commit to it
1666 		 */
1667 		return FALSE;
1668 	}
1669 
1670 	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1671 		return FALSE;
1672 	}
1673 
1674 	return TRUE;
1675 }
1676 
1677 static aio_workq_entry *
aio_create_queue_entry(proc_t procp,user_addr_t aiocbp,aio_entry_flags_t flags)1678 aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags)
1679 {
1680 	aio_workq_entry *entryp;
1681 
1682 	entryp = zalloc_flags(aio_workq_zonep, Z_WAITOK | Z_ZERO);
1683 	entryp->procp = procp;
1684 	entryp->uaiocbp = aiocbp;
1685 	entryp->flags = flags;
1686 	/* consumed in aio_return or _aio_exit */
1687 	os_ref_init(&entryp->aio_refcount, &aio_refgrp);
1688 
1689 	if (proc_is64bit(procp)) {
1690 		struct user64_aiocb aiocb64;
1691 
1692 		if (copyin(aiocbp, &aiocb64, sizeof(aiocb64)) != 0) {
1693 			goto error_exit;
1694 		}
1695 		do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1696 	} else {
1697 		struct user32_aiocb aiocb32;
1698 
1699 		if (copyin(aiocbp, &aiocb32, sizeof(aiocb32)) != 0) {
1700 			goto error_exit;
1701 		}
1702 		do_munge_aiocb_user32_to_user(&aiocb32, &entryp->aiocb);
1703 	}
1704 
1705 	/* do some more validation on the aiocb and embedded file descriptor */
1706 	if (aio_validate(procp, entryp) != 0) {
1707 		goto error_exit;
1708 	}
1709 
1710 	/* get a reference to the user land map in order to keep it around */
1711 	entryp->aio_map = get_task_map(procp->task);
1712 	vm_map_reference(entryp->aio_map);
1713 
1714 	/* get a reference on the current_thread, which is passed in vfs_context. */
1715 	entryp->thread = current_thread();
1716 	thread_reference(entryp->thread);
1717 	return entryp;
1718 
1719 error_exit:
1720 	zfree(aio_workq_zonep, entryp);
1721 	return NULL;
1722 }
1723 
1724 
1725 /*
1726  * aio_queue_async_request - queue up an async IO request on our work queue then
1727  * wake up one of our worker threads to do the actual work.  We get a reference
1728  * to our caller's user land map in order to keep it around while we are
1729  * processing the request.
1730  */
1731 static int
aio_queue_async_request(proc_t procp,user_addr_t aiocbp,aio_entry_flags_t flags)1732 aio_queue_async_request(proc_t procp, user_addr_t aiocbp,
1733     aio_entry_flags_t flags)
1734 {
1735 	aio_workq_entry *entryp;
1736 	int              result;
1737 
1738 	entryp = aio_create_queue_entry(procp, aiocbp, flags);
1739 	if (entryp == NULL) {
1740 		result = EAGAIN;
1741 		goto error_noalloc;
1742 	}
1743 
1744 	aio_proc_lock_spin(procp);
1745 	if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) {
1746 		result = EAGAIN;
1747 		goto error_exit;
1748 	}
1749 	aio_proc_unlock(procp);
1750 	return 0;
1751 
1752 error_exit:
1753 	/*
1754 	 * This entry has not been queued up so no worries about
1755 	 * unlocked state and aio_map
1756 	 */
1757 	aio_proc_unlock(procp);
1758 	aio_free_request(entryp);
1759 error_noalloc:
1760 	return result;
1761 }
1762 
1763 
1764 /*
1765  * aio_free_request - remove our reference on the user land map and
1766  * free the work queue entry resources.  The entry is off all lists
1767  * and has zero refcount, so no one can have a pointer to it.
1768  */
1769 static void
aio_free_request(aio_workq_entry * entryp)1770 aio_free_request(aio_workq_entry *entryp)
1771 {
1772 	if (entryp->aio_proc_link.tqe_prev || entryp->aio_workq_link.tqe_prev) {
1773 		panic("aio_workq_entry %p being freed while still enqueued", entryp);
1774 	}
1775 
1776 	/* remove our reference to the user land map. */
1777 	if (VM_MAP_NULL != entryp->aio_map) {
1778 		vm_map_deallocate(entryp->aio_map);
1779 	}
1780 
1781 	/* remove our reference to thread which enqueued the request */
1782 	if (NULL != entryp->thread) {
1783 		thread_deallocate(entryp->thread);
1784 	}
1785 
1786 	zfree(aio_workq_zonep, entryp);
1787 }
1788 
1789 
1790 /*
1791  * aio_validate
1792  *
1793  * validate the aiocb passed in by one of the aio syscalls.
1794  */
1795 static int
aio_validate(proc_t p,aio_workq_entry * entryp)1796 aio_validate(proc_t p, aio_workq_entry *entryp)
1797 {
1798 	struct fileproc *fp;
1799 	int              flag;
1800 	int              result;
1801 
1802 	result = 0;
1803 
1804 	if ((entryp->flags & AIO_LIO) != 0) {
1805 		if (entryp->aiocb.aio_lio_opcode == LIO_READ) {
1806 			entryp->flags |= AIO_READ;
1807 		} else if (entryp->aiocb.aio_lio_opcode == LIO_WRITE) {
1808 			entryp->flags |= AIO_WRITE;
1809 		} else if (entryp->aiocb.aio_lio_opcode == LIO_NOP) {
1810 			return 0;
1811 		} else {
1812 			return EINVAL;
1813 		}
1814 	}
1815 
1816 	flag = FREAD;
1817 	if ((entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0) {
1818 		flag = FWRITE;
1819 	}
1820 
1821 	if ((entryp->flags & (AIO_READ | AIO_WRITE)) != 0) {
1822 		if (entryp->aiocb.aio_nbytes > INT_MAX ||
1823 		    entryp->aiocb.aio_buf == USER_ADDR_NULL ||
1824 		    entryp->aiocb.aio_offset < 0) {
1825 			return EINVAL;
1826 		}
1827 	}
1828 
1829 	result = aio_sigev_validate(&entryp->aiocb.aio_sigevent);
1830 	if (result) {
1831 		return result;
1832 	}
1833 
1834 	/* validate the file descriptor and that the file was opened
1835 	 * for the appropriate read / write access.
1836 	 */
1837 	proc_fdlock(p);
1838 
1839 	fp = fp_get_noref_locked(p, entryp->aiocb.aio_fildes);
1840 	if (fp == NULL) {
1841 		result = EBADF;
1842 	} else if ((fp->fp_glob->fg_flag & flag) == 0) {
1843 		/* we don't have read or write access */
1844 		result = EBADF;
1845 	} else if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_VNODE) {
1846 		/* this is not a file */
1847 		result = ESPIPE;
1848 	} else {
1849 		fp->fp_flags |= FP_AIOISSUED;
1850 	}
1851 
1852 	proc_fdunlock(p);
1853 
1854 	return result;
1855 }
1856 
1857 /*
1858  * do_aio_completion_and_unlock.  Handle async IO completion.
1859  */
1860 static void
do_aio_completion_and_unlock(proc_t p,aio_workq_entry * entryp)1861 do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp)
1862 {
1863 	aio_workq_entry *leader = entryp->lio_leader;
1864 	int              lio_pending = 0;
1865 	bool             do_signal = false;
1866 
1867 	ASSERT_AIO_PROC_LOCK_OWNED(p);
1868 
1869 	aio_proc_move_done_locked(p, entryp);
1870 
1871 	if (leader) {
1872 		lio_pending = --leader->lio_pending;
1873 		if (lio_pending < 0) {
1874 			panic("lio_pending accounting mistake");
1875 		}
1876 		if (lio_pending == 0 && (leader->flags & AIO_LIO_WAIT)) {
1877 			wakeup(leader);
1878 		}
1879 		entryp->lio_leader = NULL; /* no dangling pointers please */
1880 	}
1881 
1882 	/*
1883 	 * need to handle case where a process is trying to exit, exec, or
1884 	 * close and is currently waiting for active aio requests to complete.
1885 	 * If AIO_CLEANUP_WAIT is set then we need to look to see if there are any
1886 	 * other requests in the active queue for this process.  If there are
1887 	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
1888 	 * If there are some still active then do nothing - we only want to
1889 	 * wakeup when all active aio requests for the process are complete.
1890 	 */
1891 	if (__improbable(entryp->flags & AIO_EXIT_WAIT)) {
1892 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1893 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1894 		    0, 0, 0);
1895 
1896 		if (!aio_has_active_requests_for_process(p)) {
1897 			/*
1898 			 * no active aio requests for this process, continue exiting.  In this
1899 			 * case, there should be no one else waiting ont he proc in AIO...
1900 			 */
1901 			wakeup_one((caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN);
1902 
1903 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1904 			    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1905 			    0, 0, 0);
1906 		}
1907 	} else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
1908 		/*
1909 		 * If this was the last request in the group, or not part of
1910 		 * a group, and that a signal is desired, send one.
1911 		 */
1912 		do_signal = (lio_pending == 0);
1913 	}
1914 
1915 	if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) {
1916 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE,
1917 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1918 		    0, 0, 0);
1919 
1920 		if (!aio_proc_has_active_requests_for_file(p, entryp->aiocb.aio_fildes)) {
1921 			/* Can't wakeup_one(); multiple closes might be in progress. */
1922 			wakeup(&p->AIO_CLEANUP_SLEEP_CHAN);
1923 
1924 			KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake) | DBG_FUNC_NONE,
1925 			    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1926 			    0, 0, 0);
1927 		}
1928 	}
1929 
1930 	aio_proc_unlock(p);
1931 
1932 	if (do_signal) {
1933 		KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig) | DBG_FUNC_NONE,
1934 		    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp),
1935 		    entryp->aiocb.aio_sigevent.sigev_signo, 0, 0);
1936 
1937 		psignal(p, entryp->aiocb.aio_sigevent.sigev_signo);
1938 	}
1939 
1940 	/*
1941 	 * A thread in aio_suspend() wants to known about completed IOs.  If it checked
1942 	 * the done list before we moved our AIO there, then it already asserted its wait,
1943 	 * and we can wake it up without holding the lock.  If it checked the list after
1944 	 * we did our move, then it already has seen the AIO that we moved.  Herego, we
1945 	 * can do our wakeup without holding the lock.
1946 	 */
1947 	wakeup(&p->AIO_SUSPEND_SLEEP_CHAN);
1948 	KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake) | DBG_FUNC_NONE,
1949 	    VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), 0, 0, 0);
1950 
1951 	aio_entry_unref(entryp); /* see aio_try_enqueue_work_locked */
1952 	if (leader) {
1953 		aio_entry_unref(leader); /* see lio_listio */
1954 	}
1955 }
1956 
1957 
1958 /*
1959  * do_aio_read
1960  */
1961 static int
do_aio_read(aio_workq_entry * entryp)1962 do_aio_read(aio_workq_entry *entryp)
1963 {
1964 	struct proc     *p = entryp->procp;
1965 	struct fileproc *fp;
1966 	int error;
1967 
1968 	if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
1969 		return error;
1970 	}
1971 
1972 	if (fp->fp_glob->fg_flag & FREAD) {
1973 		struct vfs_context context = {
1974 			.vc_thread = entryp->thread,     /* XXX */
1975 			.vc_ucred = fp->fp_glob->fg_cred,
1976 		};
1977 
1978 		error = dofileread(&context, fp,
1979 		    entryp->aiocb.aio_buf,
1980 		    entryp->aiocb.aio_nbytes,
1981 		    entryp->aiocb.aio_offset, FOF_OFFSET,
1982 		    &entryp->returnval);
1983 	} else {
1984 		error = EBADF;
1985 	}
1986 
1987 	fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
1988 	return error;
1989 }
1990 
1991 
1992 /*
1993  * do_aio_write
1994  */
1995 static int
do_aio_write(aio_workq_entry * entryp)1996 do_aio_write(aio_workq_entry *entryp)
1997 {
1998 	struct proc     *p = entryp->procp;
1999 	struct fileproc *fp;
2000 	int error;
2001 
2002 	if ((error = fp_lookup(p, entryp->aiocb.aio_fildes, &fp, 0))) {
2003 		return error;
2004 	}
2005 
2006 	if (fp->fp_glob->fg_flag & FWRITE) {
2007 		struct vfs_context context = {
2008 			.vc_thread = entryp->thread,     /* XXX */
2009 			.vc_ucred = fp->fp_glob->fg_cred,
2010 		};
2011 		int flags = FOF_PCRED;
2012 
2013 		if ((fp->fp_glob->fg_flag & O_APPEND) == 0) {
2014 			flags |= FOF_OFFSET;
2015 		}
2016 
2017 		/* NB: tell dofilewrite the offset, and to use the proc cred */
2018 		error = dofilewrite(&context,
2019 		    fp,
2020 		    entryp->aiocb.aio_buf,
2021 		    entryp->aiocb.aio_nbytes,
2022 		    entryp->aiocb.aio_offset,
2023 		    flags,
2024 		    &entryp->returnval);
2025 	} else {
2026 		error = EBADF;
2027 	}
2028 
2029 	fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2030 	return error;
2031 }
2032 
2033 
2034 /*
2035  * aio_has_active_requests_for_process - return whether the process has active
2036  * requests pending.
2037  */
2038 static bool
aio_has_active_requests_for_process(proc_t procp)2039 aio_has_active_requests_for_process(proc_t procp)
2040 {
2041 	return !TAILQ_EMPTY(&procp->p_aio_activeq);
2042 }
2043 
2044 /*
2045  * Called with the proc locked.
2046  */
2047 static bool
aio_proc_has_active_requests_for_file(proc_t procp,int fd)2048 aio_proc_has_active_requests_for_file(proc_t procp, int fd)
2049 {
2050 	aio_workq_entry *entryp;
2051 
2052 	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2053 		if (entryp->aiocb.aio_fildes == fd) {
2054 			return true;
2055 		}
2056 	}
2057 
2058 	return false;
2059 }
2060 
2061 
2062 /*
2063  * do_aio_fsync
2064  */
2065 static int
do_aio_fsync(aio_workq_entry * entryp)2066 do_aio_fsync(aio_workq_entry *entryp)
2067 {
2068 	struct proc            *p = entryp->procp;
2069 	struct vnode           *vp;
2070 	struct fileproc        *fp;
2071 	int                     sync_flag;
2072 	int                     error;
2073 
2074 	/*
2075 	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2076 	 *
2077 	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2078 	 * to mark for update the metadata not strictly necessary for data
2079 	 * retrieval, rather than forcing it to disk.
2080 	 *
2081 	 * If AIO_FSYNC is set, we have to also wait for metadata not really
2082 	 * necessary to data retrival are committed to stable storage (e.g.
2083 	 * atime, mtime, ctime, etc.).
2084 	 *
2085 	 * Metadata necessary for data retrieval ust be committed to stable
2086 	 * storage in either case (file length, etc.).
2087 	 */
2088 	if (entryp->flags & AIO_FSYNC) {
2089 		sync_flag = MNT_WAIT;
2090 	} else {
2091 		sync_flag = MNT_DWAIT;
2092 	}
2093 
2094 	error = fp_get_ftype(p, entryp->aiocb.aio_fildes, DTYPE_VNODE, ENOTSUP, &fp);
2095 	if (error != 0) {
2096 		entryp->returnval = -1;
2097 		return error;
2098 	}
2099 	vp = fp_get_data(fp);
2100 
2101 	if ((error = vnode_getwithref(vp)) == 0) {
2102 		struct vfs_context context = {
2103 			.vc_thread = entryp->thread,     /* XXX */
2104 			.vc_ucred = fp->fp_glob->fg_cred,
2105 		};
2106 
2107 		error = VNOP_FSYNC(vp, sync_flag, &context);
2108 
2109 		(void)vnode_put(vp);
2110 	} else {
2111 		entryp->returnval = -1;
2112 	}
2113 
2114 	fp_drop(p, entryp->aiocb.aio_fildes, fp, 0);
2115 	return error;
2116 }
2117 
2118 
2119 /*
2120  * is_already_queued - runs through our queues to see if the given
2121  * aiocbp / process is there.  Returns TRUE if there is a match
2122  * on any of our aio queues.
2123  *
2124  * Called with proc aio lock held (can be held spin)
2125  */
2126 static boolean_t
is_already_queued(proc_t procp,user_addr_t aiocbp)2127 is_already_queued(proc_t procp, user_addr_t aiocbp)
2128 {
2129 	aio_workq_entry *entryp;
2130 	boolean_t        result;
2131 
2132 	result = FALSE;
2133 
2134 	/* look for matches on our queue of async IO requests that have completed */
2135 	TAILQ_FOREACH(entryp, &procp->p_aio_doneq, aio_proc_link) {
2136 		if (aiocbp == entryp->uaiocbp) {
2137 			result = TRUE;
2138 			goto ExitThisRoutine;
2139 		}
2140 	}
2141 
2142 	/* look for matches on our queue of active async IO requests */
2143 	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2144 		if (aiocbp == entryp->uaiocbp) {
2145 			result = TRUE;
2146 			goto ExitThisRoutine;
2147 		}
2148 	}
2149 
2150 ExitThisRoutine:
2151 	return result;
2152 }
2153 
2154 
2155 /*
2156  * aio initialization
2157  */
2158 __private_extern__ void
aio_init(void)2159 aio_init(void)
2160 {
2161 	for (int i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2162 		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2163 	}
2164 
2165 	_aio_create_worker_threads(aio_worker_threads);
2166 }
2167 
2168 
2169 /*
2170  * aio worker threads created here.
2171  */
2172 __private_extern__ void
_aio_create_worker_threads(int num)2173 _aio_create_worker_threads(int num)
2174 {
2175 	int i;
2176 
2177 	/* create some worker threads to handle the async IO requests */
2178 	for (i = 0; i < num; i++) {
2179 		thread_t                myThread;
2180 
2181 		if (KERN_SUCCESS != kernel_thread_start(aio_work_thread, NULL, &myThread)) {
2182 			printf("%s - failed to create a work thread \n", __FUNCTION__);
2183 		} else {
2184 			thread_deallocate(myThread);
2185 		}
2186 	}
2187 }
2188 
2189 /*
2190  * Return the current activation utask
2191  */
2192 task_t
get_aiotask(void)2193 get_aiotask(void)
2194 {
2195 	return current_uthread()->uu_aio_task;
2196 }
2197 
2198 
2199 /*
2200  * In the case of an aiocb from a
2201  * 32-bit process we need to expand some longs and pointers to the correct
2202  * sizes in order to let downstream code always work on the same type of
2203  * aiocb (in our case that is a user_aiocb)
2204  */
2205 static void
do_munge_aiocb_user32_to_user(struct user32_aiocb * my_aiocbp,struct user_aiocb * the_user_aiocbp)2206 do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2207 {
2208 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2209 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2210 	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2211 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2212 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2213 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2214 
2215 	/* special case here.  since we do not know if sigev_value is an */
2216 	/* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
2217 	/* means if we send this info back to user space we need to remember */
2218 	/* sigev_value was not expanded for the 32-bit case.  */
2219 	/* NOTE - this does NOT affect us since we don't support sigev_value */
2220 	/* yet in the aio context.  */
2221 	//LP64
2222 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2223 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2224 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2225 	    my_aiocbp->aio_sigevent.sigev_value.sival_int;
2226 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2227 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2228 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2229 	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2230 }
2231 
2232 /* Similar for 64-bit user process, so that we don't need to satisfy
2233  * the alignment constraints of the original user64_aiocb
2234  */
2235 #if !__LP64__
2236 __dead2
2237 #endif
2238 static void
do_munge_aiocb_user64_to_user(struct user64_aiocb * my_aiocbp,struct user_aiocb * the_user_aiocbp)2239 do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp)
2240 {
2241 #if __LP64__
2242 	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2243 	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2244 	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2245 	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2246 	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2247 	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2248 
2249 	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2250 	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2251 	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2252 	    my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2253 	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2254 	    my_aiocbp->aio_sigevent.sigev_notify_function;
2255 	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2256 	    my_aiocbp->aio_sigevent.sigev_notify_attributes;
2257 #else
2258 #pragma unused(my_aiocbp, the_user_aiocbp)
2259 	panic("64bit process on 32bit kernel is not supported");
2260 #endif
2261 }
2262