/*
* An async IO implementation for Linux
- * Written by Benjamin LaHaise <bcrl@redhat.com>
+ * Written by Benjamin LaHaise <bcrl@kvack.org>
*
* Implements an efficient asynchronous io interface.
*
#define dprintk(x...) do { ; } while (0)
#endif
-long aio_run = 0; /* for testing only */
-long aio_wakeups = 0; /* for testing only */
-
/*------ sysctl variables----*/
-atomic_t aio_nr = ATOMIC_INIT(0); /* current system wide number of aio requests */
-unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+static DEFINE_SPINLOCK(aio_nr_lock);
+unsigned long aio_nr; /* current system wide number of aio requests */
+unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
static kmem_cache_t *kiocb_cachep;
static void aio_fput_routine(void *);
static DECLARE_WORK(fput_work, aio_fput_routine, NULL);
-static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED;
-LIST_HEAD(fput_head);
+static DEFINE_SPINLOCK(fput_lock);
+static LIST_HEAD(fput_head);
static void aio_kick_handler(void *);
+static void aio_queue_work(struct kioctx *);
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
info->nr = 0;
info->ring_pages = info->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
- info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+ info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!info->ring_pages)
return -ENOMEM;
- memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
}
info->mmap_size = nr_pages * PAGE_SIZE;
return ERR_PTR(-EINVAL);
}
- if (nr_events > aio_max_nr)
+ if ((unsigned long)nr_events > aio_max_nr)
return ERR_PTR(-EAGAIN);
ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
goto out_freectx;
/* limit the number of system wide aios */
- atomic_add(ctx->max_reqs, &aio_nr); /* undone by __put_ioctx */
- if (unlikely(atomic_read(&aio_nr) > aio_max_nr))
+ spin_lock(&aio_nr_lock);
+ if (aio_nr + ctx->max_reqs > aio_max_nr ||
+ aio_nr + ctx->max_reqs < aio_nr)
+ ctx->max_reqs = 0;
+ else
+ aio_nr += ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
+ if (ctx->max_reqs == 0)
goto out_cleanup;
/* now link into global list. kludge. FIXME */
return ctx;
out_cleanup:
- atomic_sub(ctx->max_reqs, &aio_nr);
- ctx->max_reqs = 0; /* prevent __put_ioctx from sub'ing aio_nr */
__put_ioctx(ctx);
return ERR_PTR(-EAGAIN);
spin_unlock_irq(&ctx->ctx_lock);
}
-void wait_for_all_aios(struct kioctx *ctx)
+static void wait_for_all_aios(struct kioctx *ctx)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
pr_debug("__put_ioctx: freeing %p\n", ctx);
kmem_cache_free(kioctx_cachep, ctx);
- atomic_sub(nr_events, &aio_nr);
+ if (nr_events) {
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - nr_events > aio_nr);
+ aio_nr -= nr_events;
+ spin_unlock(&aio_nr_lock);
+ }
}
/* aio_get_req
if (unlikely(!req))
return NULL;
- req->ki_flags = 1 << KIF_LOCKED;
+ req->ki_flags = 0;
req->ki_users = 2;
req->ki_key = 0;
req->ki_ctx = ctx;
req->ki_cancel = NULL;
req->ki_retry = NULL;
- req->ki_obj.user = NULL;
req->ki_dtor = NULL;
req->private = NULL;
INIT_LIST_HEAD(&req->ki_run_list);
static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
{
+ assert_spin_locked(&ctx->ctx_lock);
+
if (req->ki_dtor)
req->ki_dtor(req);
- req->ki_ctx = NULL;
- req->ki_filp = NULL;
- req->ki_obj.user = NULL;
- req->ki_dtor = NULL;
- req->private = NULL;
kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--;
dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
req, atomic_read(&req->ki_filp->f_count));
+ assert_spin_locked(&ctx->ctx_lock);
+
req->ki_users --;
if (unlikely(req->ki_users < 0))
BUG();
atomic_inc(&mm->mm_count);
tsk->mm = mm;
tsk->active_mm = mm;
+ /*
+ * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise
+ * it won't work. Update it accordingly if you change it here
+ */
activate_mm(active_mm, mm);
task_unlock(tsk);
* Comments: Called with ctx->ctx_lock held. This nests
* task_lock instead ctx_lock.
*/
-void unuse_mm(struct mm_struct *mm)
+static void unuse_mm(struct mm_struct *mm)
{
struct task_struct *tsk = current;
* the kiocb (to tell the caller to activate the work
* queue to process it), or 0, if it found that it was
* already queued.
- *
- * Should be called with the spin lock iocb->ki_ctx->ctx_lock
- * held
*/
static inline int __queue_kicked_iocb(struct kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
+ assert_spin_locked(&ctx->ctx_lock);
+
if (list_empty(&iocb->ki_run_list)) {
list_add_tail(&iocb->ki_run_list,
&ctx->run_list);
- iocb->ki_queued++;
return 1;
}
return 0;
}
if (!(iocb->ki_retried & 0xff)) {
- pr_debug("%ld retry: %d of %d (kick %ld, Q %ld run %ld, wake %ld)\n",
- iocb->ki_retried,
- iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes,
- iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups);
+ pr_debug("%ld retry: %d of %d\n", iocb->ki_retried,
+ iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
}
if (!(retry = iocb->ki_retry)) {
ret = retry(iocb);
current->io_wait = NULL;
- if (-EIOCBRETRY != ret) {
- if (-EIOCBQUEUED != ret) {
- BUG_ON(!list_empty(&iocb->ki_wait.task_list));
- aio_complete(iocb, ret, 0);
- /* must not access the iocb after this */
- }
- } else {
- /*
- * Issue an additional retry to avoid waiting forever if
- * no waits were queued (e.g. in case of a short read).
- */
- if (list_empty(&iocb->ki_wait.task_list))
- kiocbSetKicked(iocb);
+ if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+ BUG_ON(!list_empty(&iocb->ki_wait.task_list));
+ aio_complete(iocb, ret, 0);
}
out:
spin_lock_irq(&ctx->ctx_lock);
* has already been kicked */
if (kiocbIsKicked(iocb)) {
__queue_kicked_iocb(iocb);
+
+ /*
+ * __queue_kicked_iocb will always return 1 here, because
+ * iocb->ki_run_list is empty at this point so it should
+ * be safe to unconditionally queue the context into the
+ * work queue.
+ */
+ aio_queue_work(ctx);
}
}
return ret;
* Process all pending retries queued on the ioctx
* run list.
* Assumes it is operating within the aio issuer's mm
- * context. Expects to be called with ctx->ctx_lock held
+ * context.
*/
static int __aio_run_iocbs(struct kioctx *ctx)
{
struct kiocb *iocb;
- int count = 0;
LIST_HEAD(run_list);
+ assert_spin_locked(&ctx->ctx_lock);
+
list_splice_init(&ctx->run_list, &run_list);
while (!list_empty(&run_list)) {
iocb = list_entry(run_list.next, struct kiocb,
aio_run_iocb(iocb);
if (__aio_put_req(ctx, iocb)) /* drop extra ref */
put_ioctx(ctx);
- count++;
}
- aio_run++;
if (!list_empty(&ctx->run_list))
return 1;
return 0;
* and if required activate the aio work queue to process
* it
*/
-void queue_kicked_iocb(struct kiocb *iocb)
+static void try_queue_kicked_iocb(struct kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
unsigned long flags;
int run = 0;
- WARN_ON((!list_empty(&iocb->ki_wait.task_list)));
+ /* We're supposed to be the only path putting the iocb back on the run
+ * list. If we find that the iocb is *back* on a wait queue already
+ * than retry has happened before we could queue the iocb. This also
+ * means that the retry could have completed and freed our iocb, no
+ * good. */
+ BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
spin_lock_irqsave(&ctx->ctx_lock, flags);
- run = __queue_kicked_iocb(iocb);
+ /* set this inside the lock so that we can't race with aio_run_iocb()
+ * testing it and putting the iocb on the run list under the lock */
+ if (!kiocbTryKick(iocb))
+ run = __queue_kicked_iocb(iocb);
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- if (run) {
+ if (run)
aio_queue_work(ctx);
- aio_wakeups++;
- }
}
/*
return;
}
- iocb->ki_kicked++;
- /* If its already kicked we shouldn't queue it again */
- if (!kiocbTryKick(iocb)) {
- queue_kicked_iocb(iocb);
- }
+ try_queue_kicked_iocb(iocb);
}
EXPORT_SYMBOL(kick_iocb);
unsigned long tail;
int ret;
- /* Special case handling for sync iocbs: events go directly
- * into the iocb for fast handling. Note that this will not
- * work if we allow sync kiocbs to be cancelled. in which
- * case the usage count checks will have to move under ctx_lock
- * for all cases.
+ /*
+ * Special case handling for sync iocbs:
+ * - events go directly into the iocb for fast handling
+ * - the sync task with the iocb in its stack holds the single iocb
+ * ref, no other paths have a way to get another ref
+ * - the sync task helpfully left a reference to itself in the iocb
*/
if (is_sync_kiocb(iocb)) {
- int ret;
-
+ BUG_ON(iocb->ki_users != 1);
iocb->ki_user_data = res;
- if (iocb->ki_users == 1) {
- iocb->ki_users = 0;
- ret = 1;
- } else {
- spin_lock_irq(&ctx->ctx_lock);
- iocb->ki_users--;
- ret = (0 == iocb->ki_users);
- spin_unlock_irq(&ctx->ctx_lock);
- }
- /* sync iocbs put the task here for us */
+ iocb->ki_users = 0;
wake_up_process(iocb->ki_obj.tsk);
- return ret;
+ return 1;
}
info = &ctx->ring_info;
tail = info->tail;
event = aio_ring_event(info, tail, KM_IRQ0);
- tail = (tail + 1) % info->nr;
+ if (++tail >= info->nr)
+ tail = 0;
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
pr_debug("added to ring %p at [%lu]\n", iocb, tail);
- pr_debug("%ld retries: %d of %d (kicked %ld, Q %ld run %ld wake %ld)\n",
- iocb->ki_retried,
- iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes,
- iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups);
+ pr_debug("%ld retries: %d of %d\n", iocb->ki_retried,
+ iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
put_rq:
/* everything turned out well, dispose of the aiocb. */
ret = __aio_put_req(ctx, iocb);
int i = 0;
struct io_event ent;
struct aio_timeout to;
- int event_loop = 0; /* testing only */
int retry = 0;
/* needed to zero any padding within an entry (there shouldn't be
if (to.timed_out) /* Only check after read evt */
break;
schedule();
- event_loop++;
if (signal_pending(tsk)) {
ret = -EINTR;
break;
if (timeout)
clear_timeout(&to);
out:
- pr_debug("event loop executed %d times\n", event_loop);
- pr_debug("aio_run %ld\n", aio_run);
- pr_debug("aio_wakeups %ld\n", aio_wakeups);
return i ? i : ret;
}
goto out;
ret = -EINVAL;
- if (unlikely(ctx || (int)nr_events <= 0)) {
- pr_debug("EINVAL: io_setup: ctx or nr_events > max\n");
+ if (unlikely(ctx || nr_events == 0)) {
+ pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
+ ctx, nr_events);
goto out;
}
if (!ret)
return 0;
+ get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
io_destroy(ioctx);
}
}
/*
- * Default retry method for aio_read (also used for first time submit)
- * Responsible for updating iocb state as retries progress
+ * aio_p{read,write} are the default ki_retry methods for
+ * IO_CMD_P{READ,WRITE}. They maintains kiocb retry state around potentially
+ * multiple calls to f_op->aio_read(). They loop around partial progress
+ * instead of returning -EIOCBRETRY because they don't have the means to call
+ * kick_iocb().
*/
static ssize_t aio_pread(struct kiocb *iocb)
{
struct inode *inode = mapping->host;
ssize_t ret = 0;
- ret = file->f_op->aio_read(iocb, iocb->ki_buf,
- iocb->ki_left, iocb->ki_pos);
+ do {
+ ret = file->f_op->aio_read(iocb, iocb->ki_buf,
+ iocb->ki_left, iocb->ki_pos);
+ /*
+ * Can't just depend on iocb->ki_left to determine
+ * whether we are done. This may have been a short read.
+ */
+ if (ret > 0) {
+ iocb->ki_buf += ret;
+ iocb->ki_left -= ret;
+ }
- /*
- * Can't just depend on iocb->ki_left to determine
- * whether we are done. This may have been a short read.
- */
- if (ret > 0) {
- iocb->ki_buf += ret;
- iocb->ki_left -= ret;
/*
- * For pipes and sockets we return once we have
- * some data; for regular files we retry till we
- * complete the entire read or find that we can't
- * read any more data (e.g short reads).
+ * For pipes and sockets we return once we have some data; for
+ * regular files we retry till we complete the entire read or
+ * find that we can't read any more data (e.g short reads).
*/
- if (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))
- ret = -EIOCBRETRY;
- }
+ } while (ret > 0 && iocb->ki_left > 0 &&
+ !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
/* This means we must have transferred all that we could */
/* No need to retry anymore */
return ret;
}
-/*
- * Default retry method for aio_write (also used for first time submit)
- * Responsible for updating iocb state as retries progress
- */
+/* see aio_pread() */
static ssize_t aio_pwrite(struct kiocb *iocb)
{
struct file *file = iocb->ki_filp;
ssize_t ret = 0;
- ret = file->f_op->aio_write(iocb, iocb->ki_buf,
- iocb->ki_left, iocb->ki_pos);
-
- if (ret > 0) {
- iocb->ki_buf += ret;
- iocb->ki_left -= ret;
-
- ret = -EIOCBRETRY;
- }
+ do {
+ ret = file->f_op->aio_write(iocb, iocb->ki_buf,
+ iocb->ki_left, iocb->ki_pos);
+ if (ret > 0) {
+ iocb->ki_buf += ret;
+ iocb->ki_left -= ret;
+ }
+ } while (ret > 0 && iocb->ki_left > 0);
- /* This means we must have transferred all that we could */
- /* No need to retry anymore */
if ((ret == 0) || (iocb->ki_left == 0))
ret = iocb->ki_nbytes - iocb->ki_left;
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb)
{
struct file *file = kiocb->ki_filp;
ssize_t ret = 0;
if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
kiocb->ki_left)))
break;
+ ret = security_file_permission(file, MAY_READ);
+ if (unlikely(ret))
+ break;
ret = -EINVAL;
if (file->f_op->aio_read)
kiocb->ki_retry = aio_pread;
if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
kiocb->ki_left)))
break;
+ ret = security_file_permission(file, MAY_WRITE);
+ if (unlikely(ret))
+ break;
ret = -EINVAL;
if (file->f_op->aio_write)
kiocb->ki_retry = aio_pwrite;
* because this callback isn't used for wait queues which
* are nested inside ioctx lock (i.e. ctx->wait)
*/
-int aio_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int aio_wake_function(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
{
struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
}
req->ki_filp = file;
- iocb->aio_key = req->ki_key;
- ret = put_user(iocb->aio_key, &user_iocb->aio_key);
+ ret = put_user(req->ki_key, &user_iocb->aio_key);
if (unlikely(ret)) {
dprintk("EFAULT: aio_key\n");
goto out_put_req;
req->ki_opcode = iocb->aio_lio_opcode;
init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
INIT_LIST_HEAD(&req->ki_wait.task_list);
- req->ki_run_list.next = req->ki_run_list.prev = NULL;
- req->ki_retry = NULL;
req->ki_retried = 0;
- req->ki_kicked = 0;
- req->ki_queued = 0;
- aio_run = 0;
- aio_wakeups = 0;
ret = aio_setup_iocb(req);
goto out_put_req;
spin_lock_irq(&ctx->ctx_lock);
- list_add_tail(&req->ki_run_list, &ctx->run_list);
- /* drain the run list */
- while (__aio_run_iocbs(ctx))
- ;
+ aio_run_iocb(req);
+ if (!list_empty(&ctx->run_list)) {
+ /* drain the run list */
+ while (__aio_run_iocbs(ctx))
+ ;
+ }
spin_unlock_irq(&ctx->ctx_lock);
aio_put_req(req); /* drop extra ref to req */
return 0;
/* lookup_kiocb
* Finds a given iocb for cancellation.
- * MUST be called with ctx->ctx_lock held.
*/
-struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
+static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
+ u32 key)
{
struct list_head *pos;
+
+ assert_spin_locked(&ctx->ctx_lock);
+
/* TODO: use a hash or array, this sucks. */
list_for_each(pos, &ctx->active_reqs) {
struct kiocb *kiocb = list_kiocb(pos);
ret = -EFAULT;
}
} else
- printk(KERN_DEBUG "iocb has no cancel operation\n");
+ ret = -EINVAL;
put_ioctx(ctx);