+diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
+index f128427..3616042 100644
+--- a/fs/btrfs/acl.c
++++ b/fs/btrfs/acl.c
+@@ -27,7 +27,7 @@
+ #include "btrfs_inode.h"
+ #include "xattr.h"
+
+-#ifdef CONFIG_FS_POSIX_ACL
++#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+
+ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+ {
+@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
+ .set = btrfs_xattr_acl_access_set,
+ };
+
+-#else /* CONFIG_FS_POSIX_ACL */
++#else /* CONFIG_BTRFS_FS_POSIX_ACL */
+
+ int btrfs_acl_chmod(struct inode *inode)
+ {
+@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
+ return 0;
+ }
+
+-#endif /* CONFIG_FS_POSIX_ACL */
++#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
+diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
+index 019e8af..c0861e7 100644
+--- a/fs/btrfs/async-thread.c
++++ b/fs/btrfs/async-thread.c
+@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
+ /* number of things on the pending list */
+ atomic_t num_pending;
+
++ /* reference counter for this struct */
++ atomic_t refs;
++
+ unsigned long sequence;
+
+ /* protects the pending list. */
+@@ -61,6 +64,51 @@ struct btrfs_worker_thread {
+ };
+
+ /*
++ * btrfs_start_workers uses kthread_run, which can block waiting for memory
++ * for a very long time. It will actually throttle on page writeback,
++ * and so it may not make progress until after our btrfs worker threads
++ * process all of the pending work structs in their queue.
++ *
++ * This means we can't use btrfs_start_workers from inside a btrfs worker
++ * thread that is used as part of cleaning dirty memory, which pretty much
++ * involves all of the worker threads.
++ *
++ * Instead we have a helper queue that never has more than one thread,
++ * where we schedule thread start operations. This worker_start struct
++ * is used to contain the work and hold a pointer to the queue that needs
++ * another worker.
++ */
++struct worker_start {
++ struct btrfs_work work;
++ struct btrfs_workers *queue;
++};
++
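++/* run from the helper queue: actually start one worker for the target queue */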
++static void start_new_worker_func(struct btrfs_work *work)
++{
++ struct worker_start *start;
++ start = container_of(work, struct worker_start, work);
++ btrfs_start_workers(start->queue, 1);
++ kfree(start);
++}
++
++static int start_new_worker(struct btrfs_workers *queue)
++{
++ struct worker_start *start;
++ int ret;
++
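++ /* NOFS allocation: we may be running on behalf of dirty page cleaning */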
++ start = kzalloc(sizeof(*start), GFP_NOFS);
++ if (!start)
++ return -ENOMEM;
++
++ start->work.func = start_new_worker_func;
++ start->queue = queue;
++ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
++ if (ret)
++ kfree(start);
++ return ret;
++}
++
++/*
+ * helper function to move a thread onto the idle list after it
+ * has finished some requests.
+ */
+@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
+ unsigned long flags;
+ spin_lock_irqsave(&worker->workers->lock, flags);
+ worker->idle = 1;
+- list_move(&worker->worker_list, &worker->workers->idle_list);
++
++ /* the list may be empty if the worker is just starting */
++ if (!list_empty(&worker->worker_list)) {
++ list_move(&worker->worker_list,
++ &worker->workers->idle_list);
++ }
+ spin_unlock_irqrestore(&worker->workers->lock, flags);
+ }
+ }
+@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
+ unsigned long flags;
+ spin_lock_irqsave(&worker->workers->lock, flags);
+ worker->idle = 0;
+- list_move_tail(&worker->worker_list,
+- &worker->workers->worker_list);
++
++ if (!list_empty(&worker->worker_list)) {
++ list_move_tail(&worker->worker_list,
++ &worker->workers->worker_list);
++ }
+ spin_unlock_irqrestore(&worker->workers->lock, flags);
+ }
+ }
+
+-static noinline int run_ordered_completions(struct btrfs_workers *workers,
+- struct btrfs_work *work)
++static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+ {
++ struct btrfs_workers *workers = worker->workers;
+ unsigned long flags;
+
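++ /* unlocked peek at the flag; it is rechecked under workers->lock below */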
++ rmb();
++ if (!workers->atomic_start_pending)
++ return;
++
++ spin_lock_irqsave(&workers->lock, flags);
++ if (!workers->atomic_start_pending)
++ goto out;
++
++ workers->atomic_start_pending = 0;
++ if (workers->num_workers + workers->num_workers_starting >=
++ workers->max_workers)
++ goto out;
++
++ workers->num_workers_starting += 1;
++ spin_unlock_irqrestore(&workers->lock, flags);
++ start_new_worker(workers);
++ return;
++
++out:
++ spin_unlock_irqrestore(&workers->lock, flags);
++}
++
++static noinline int run_ordered_completions(struct btrfs_workers *workers,
++ struct btrfs_work *work)
++{
+ if (!workers->ordered)
+ return 0;
+
+ set_bit(WORK_DONE_BIT, &work->flags);
+
+- spin_lock_irqsave(&workers->lock, flags);
++ spin_lock(&workers->order_lock);
+
+ while (1) {
+ if (!list_empty(&workers->prio_order_list)) {
+@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
+ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+ break;
+
+- spin_unlock_irqrestore(&workers->lock, flags);
++ spin_unlock(&workers->order_lock);
+
+ work->ordered_func(work);
+
+ /* now take the lock again and call the freeing code */
+- spin_lock_irqsave(&workers->lock, flags);
++ spin_lock(&workers->order_lock);
+ list_del(&work->order_list);
+ work->ordered_free(work);
+ }
+
+- spin_unlock_irqrestore(&workers->lock, flags);
++ spin_unlock(&workers->order_lock);
+ return 0;
+ }
+
++static void put_worker(struct btrfs_worker_thread *worker)
++{
++ if (atomic_dec_and_test(&worker->refs))
++ kfree(worker);
++}
++
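++/*
++ * an idle worker with nothing pending may take itself off the lists and
++ * exit; returns 1 if the caller must stop touching the worker struct
++ */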
++static int try_worker_shutdown(struct btrfs_worker_thread *worker)
++{
++ int freeit = 0;
++
++ spin_lock_irq(&worker->lock);
++ spin_lock(&worker->workers->lock);
++ if (worker->workers->num_workers > 1 &&
++ worker->idle &&
++ !worker->working &&
++ !list_empty(&worker->worker_list) &&
++ list_empty(&worker->prio_pending) &&
++ list_empty(&worker->pending) &&
++ atomic_read(&worker->num_pending) == 0) {
++ freeit = 1;
++ list_del_init(&worker->worker_list);
++ worker->workers->num_workers--;
++ }
++ spin_unlock(&worker->workers->lock);
++ spin_unlock_irq(&worker->lock);
++
++ if (freeit)
++ put_worker(worker);
++ return freeit;
++}
++
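++/*
++ * grab the next work item, preferring the local prio_head/head staging
++ * lists and refilling them from the worker's shared lists (under
++ * worker->lock) when they run dry
++ */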
++static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
++ struct list_head *prio_head,
++ struct list_head *head)
++{
++ struct btrfs_work *work = NULL;
++ struct list_head *cur = NULL;
++
++ if (!list_empty(prio_head))
++ cur = prio_head->next;
++
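++ /* freshly queued priority work should run before anything staged locally */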
++ smp_mb();
++ if (!list_empty(&worker->prio_pending))
++ goto refill;
++
++ if (!list_empty(head))
++ cur = head->next;
++
++ if (cur)
++ goto out;
++
++refill:
++ spin_lock_irq(&worker->lock);
++ list_splice_tail_init(&worker->prio_pending, prio_head);
++ list_splice_tail_init(&worker->pending, head);
++
++ if (!list_empty(prio_head))
++ cur = prio_head->next;
++ else if (!list_empty(head))
++ cur = head->next;
++ spin_unlock_irq(&worker->lock);
++
++ if (!cur)
++ goto out_fail;
++
++out:
++ work = list_entry(cur, struct btrfs_work, list);
++
++out_fail:
++ return work;
++}
++
+ /*
+ * main loop for servicing work items
+ */
+ static int worker_loop(void *arg)
+ {
+ struct btrfs_worker_thread *worker = arg;
+- struct list_head *cur;
++ struct list_head head;
++ struct list_head prio_head;
+ struct btrfs_work *work;
++
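++ /* per-thread staging lists, refilled in batches from the shared lists */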
++ INIT_LIST_HEAD(&head);
++ INIT_LIST_HEAD(&prio_head);
++
+ do {
+- spin_lock_irq(&worker->lock);
+-again_locked:
++again:
+ while (1) {
+- if (!list_empty(&worker->prio_pending))
+- cur = worker->prio_pending.next;
+- else if (!list_empty(&worker->pending))
+- cur = worker->pending.next;
+- else
++
++ work = get_next_work(worker, &prio_head, &head);
++ if (!work)
+ break;
+
+- work = list_entry(cur, struct btrfs_work, list);
+ list_del(&work->list);
+ clear_bit(WORK_QUEUED_BIT, &work->flags);
+
+ work->worker = worker;
+- spin_unlock_irq(&worker->lock);
+
+ work->func(work);
+
+@@ -175,9 +329,13 @@ again_locked:
+ */
+ run_ordered_completions(worker->workers, work);
+
+- spin_lock_irq(&worker->lock);
+- check_idle_worker(worker);
++ check_pending_worker_creates(worker);
++
+ }
++
++ spin_lock_irq(&worker->lock);
++ check_idle_worker(worker);
++
+ if (freezing(current)) {
+ worker->working = 0;
+ spin_unlock_irq(&worker->lock);
+@@ -216,8 +374,10 @@ again_locked:
+ spin_lock_irq(&worker->lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!list_empty(&worker->pending) ||
+- !list_empty(&worker->prio_pending))
+- goto again_locked;
++ !list_empty(&worker->prio_pending)) {
++ spin_unlock_irq(&worker->lock);
++ goto again;
++ }
+
+ /*
+ * this makes sure we get a wakeup when someone
+@@ -226,8 +386,13 @@ again_locked:
+ worker->working = 0;
+ spin_unlock_irq(&worker->lock);
+
+- if (!kthread_should_stop())
+- schedule();
++ if (!kthread_should_stop()) {
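++ /* sleep for up to two minutes, then see if this idle thread can exit */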
++ schedule_timeout(HZ * 120);
++ if (!worker->working &&
++ try_worker_shutdown(worker)) {
++ return 0;
++ }
++ }
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
+ {
+ struct list_head *cur;
+ struct btrfs_worker_thread *worker;
++ int can_stop;
+
++ spin_lock_irq(&workers->lock);
+ list_splice_init(&workers->idle_list, &workers->worker_list);
+ while (!list_empty(&workers->worker_list)) {
+ cur = workers->worker_list.next;
+ worker = list_entry(cur, struct btrfs_worker_thread,
+ worker_list);
+- kthread_stop(worker->task);
+- list_del(&worker->worker_list);
+- kfree(worker);
++
++ atomic_inc(&worker->refs);
++ workers->num_workers -= 1;
++ if (!list_empty(&worker->worker_list)) {
++ list_del_init(&worker->worker_list);
++ put_worker(worker);
++ can_stop = 1;
++ } else
++ can_stop = 0;
++ spin_unlock_irq(&workers->lock);
++ if (can_stop)
++ kthread_stop(worker->task);
++ spin_lock_irq(&workers->lock);
++ put_worker(worker);
+ }
++ spin_unlock_irq(&workers->lock);
+ return 0;
+ }
+
+ /*
+ * simple init on struct btrfs_workers
+ */
+-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
++ struct btrfs_workers *async_helper)
+ {
+ workers->num_workers = 0;
++ workers->num_workers_starting = 0;
+ INIT_LIST_HEAD(&workers->worker_list);
+ INIT_LIST_HEAD(&workers->idle_list);
+ INIT_LIST_HEAD(&workers->order_list);
+ INIT_LIST_HEAD(&workers->prio_order_list);
+ spin_lock_init(&workers->lock);
++ spin_lock_init(&workers->order_lock);
+ workers->max_workers = max;
+ workers->idle_thresh = 32;
+ workers->name = name;
+ workers->ordered = 0;
++ workers->atomic_start_pending = 0;
++ workers->atomic_worker_start = async_helper;
+ }
+
+ /*
+ * starts new worker threads. This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ */
+-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
++static int __btrfs_start_workers(struct btrfs_workers *workers,
++ int num_workers)
+ {
+ struct btrfs_worker_thread *worker;
+ int ret = 0;
+@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+ INIT_LIST_HEAD(&worker->prio_pending);
+ INIT_LIST_HEAD(&worker->worker_list);
+ spin_lock_init(&worker->lock);
++
+ atomic_set(&worker->num_pending, 0);
++ atomic_set(&worker->refs, 1);
+ worker->workers = workers;
+ worker->task = kthread_run(worker_loop, worker,
+ "btrfs-%s-%d", workers->name,
+@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+ kfree(worker);
+ goto fail;
+ }
+-
+ spin_lock_irq(&workers->lock);
+ list_add_tail(&worker->worker_list, &workers->idle_list);
+ worker->idle = 1;
+ workers->num_workers++;
++ workers->num_workers_starting--;
++ WARN_ON(workers->num_workers_starting < 0);
+ spin_unlock_irq(&workers->lock);
+ }
+ return 0;
+@@ -316,6 +504,14 @@ fail:
+ return ret;
+ }
+
++int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
++{
++ spin_lock_irq(&workers->lock);
++ workers->num_workers_starting += num_workers;
++ spin_unlock_irq(&workers->lock);
++ return __btrfs_start_workers(workers, num_workers);
++}
++
+ /*
+ * run through the list and find a worker thread that doesn't have a lot
+ * to do right now. This can return null if we aren't yet at the thread
+@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+ {
+ struct btrfs_worker_thread *worker;
+ struct list_head *next;
+- int enforce_min = workers->num_workers < workers->max_workers;
++ int enforce_min;
++
++ enforce_min = (workers->num_workers + workers->num_workers_starting) <
++ workers->max_workers;
+
+ /*
+ * if we find an idle thread, don't move it to the end of the
+@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+ */
+ next = workers->worker_list.next;
+ worker = list_entry(next, struct btrfs_worker_thread, worker_list);
+- atomic_inc(&worker->num_pending);
+ worker->sequence++;
+
+ if (worker->sequence % workers->idle_thresh == 0)
+@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+ {
+ struct btrfs_worker_thread *worker;
+ unsigned long flags;
++ struct list_head *fallback;
+
+ again:
+ spin_lock_irqsave(&workers->lock, flags);
+ worker = next_worker(workers);
+- spin_unlock_irqrestore(&workers->lock, flags);
+
+ if (!worker) {
+- spin_lock_irqsave(&workers->lock, flags);
+- if (workers->num_workers >= workers->max_workers) {
+- struct list_head *fallback = NULL;
+- /*
+- * we have failed to find any workers, just
+- * return the force one
+- */
+- if (!list_empty(&workers->worker_list))
+- fallback = workers->worker_list.next;
+- if (!list_empty(&workers->idle_list))
+- fallback = workers->idle_list.next;
+- BUG_ON(!fallback);
+- worker = list_entry(fallback,
+- struct btrfs_worker_thread, worker_list);
+- spin_unlock_irqrestore(&workers->lock, flags);
++ if (workers->num_workers + workers->num_workers_starting >=
++ workers->max_workers) {
++ goto fallback;
++ } else if (workers->atomic_worker_start) {
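++ /* we can't sleep here; flag it so a worker starts the thread later */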
++ workers->atomic_start_pending = 1;
++ goto fallback;
+ } else {
++ workers->num_workers_starting++;
+ spin_unlock_irqrestore(&workers->lock, flags);
+ /* we're below the limit, start another worker */
+- btrfs_start_workers(workers, 1);
++ __btrfs_start_workers(workers, 1);
+ goto again;
+ }
+ }
++ goto found;
++
++fallback:
++ fallback = NULL;
++ /*
++ * we have failed to find any workers, just
++ * return the first one we can find.
++ */
++ if (!list_empty(&workers->worker_list))
++ fallback = workers->worker_list.next;
++ if (!list_empty(&workers->idle_list))
++ fallback = workers->idle_list.next;
++ BUG_ON(!fallback);
++ worker = list_entry(fallback,
++ struct btrfs_worker_thread, worker_list);
++found:
++ /*
++ * this makes sure the worker doesn't exit before it is placed
++ * onto a busy/idle list
++ */
++ atomic_inc(&worker->num_pending);
++ spin_unlock_irqrestore(&workers->lock, flags);
+ return worker;
+ }
+
+@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
+ spin_lock(&worker->workers->lock);
+ worker->idle = 0;
+ list_move_tail(&worker->worker_list,
+- &worker->workers->worker_list);
++ &worker->workers->worker_list);
+ spin_unlock(&worker->workers->lock);
+ }
+ if (!worker->working) {
+@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
+ worker->working = 1;
+ }
+
+- spin_unlock_irqrestore(&worker->lock, flags);
+ if (wake)
+ wake_up_process(worker->task);
++ spin_unlock_irqrestore(&worker->lock, flags);
+ out:
+
+ return 0;
+@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+
+ worker = find_worker(workers);
+ if (workers->ordered) {
+- spin_lock_irqsave(&workers->lock, flags);
++ /*
++ * you're not allowed to do ordered queues from an
++ * interrupt handler
++ */
++ spin_lock(&workers->order_lock);
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+ list_add_tail(&work->order_list,
+ &workers->prio_order_list);
+ } else {
+ list_add_tail(&work->order_list, &workers->order_list);
+ }
+- spin_unlock_irqrestore(&workers->lock, flags);
++ spin_unlock(&workers->order_lock);
+ } else {
+ INIT_LIST_HEAD(&work->order_list);
+ }
+@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+ list_add_tail(&work->list, &worker->prio_pending);
+ else
+ list_add_tail(&work->list, &worker->pending);
+- atomic_inc(&worker->num_pending);
+ check_busy_worker(worker);
+
+ /*
+@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+ wake = 1;
+ worker->working = 1;
+
+- spin_unlock_irqrestore(&worker->lock, flags);
+-
+ if (wake)
+ wake_up_process(worker->task);
++ spin_unlock_irqrestore(&worker->lock, flags);
++
+ out:
+ return 0;
+ }
+diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
+index 1b511c1..5077746 100644
+--- a/fs/btrfs/async-thread.h
++++ b/fs/btrfs/async-thread.h
+@@ -64,6 +64,8 @@ struct btrfs_workers {
+ /* current number of running workers */
+ int num_workers;
+
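++ /* workers requested but not yet on the idle/busy lists */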
++ int num_workers_starting;
++
+ /* max number of workers allowed. changed by btrfs_start_workers */
+ int max_workers;
+
+@@ -73,6 +75,16 @@ struct btrfs_workers {
+ /* force completions in the order they were queued */
+ int ordered;
+
++ /* more workers are needed, but the caller can't sleep to start them */
++ int atomic_start_pending;
++
++ /*
++ * are we allowed to sleep while starting workers or are we required
++ * to start them at a later time? If we can't sleep, this indicates
++ * which queue we need to use to schedule thread creation.
++ */
++ struct btrfs_workers *atomic_worker_start;
++
+ /* list with all the work threads. The workers on the idle thread
+ * may be actively servicing jobs, but they haven't yet hit the
+ * idle thresh limit above.
+@@ -90,6 +102,9 @@ struct btrfs_workers {
+ /* lock for finding the next worker thread to queue on */
+ spinlock_t lock;
+
++ /* lock for the ordered lists */
++ spinlock_t order_lock;
++
+ /* extra name for this worker, used for current->name */
+ char *name;
+ };
+@@ -97,7 +112,8 @@ struct btrfs_workers {
+ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+ int btrfs_stop_workers(struct btrfs_workers *workers);
+-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
++ struct btrfs_workers *async_starter);
+ int btrfs_requeue_work(struct btrfs_work *work);
+ void btrfs_set_work_high_prio(struct btrfs_work *work);
+ #endif
+diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
+index ea1ea0a..f6783a4 100644
+--- a/fs/btrfs/btrfs_inode.h
++++ b/fs/btrfs/btrfs_inode.h
+@@ -86,6 +86,12 @@ struct btrfs_inode {
+ * transid of the trans_handle that last modified this inode
+ */
+ u64 last_trans;
++
++ /*
++ * log transid when this inode was last modified
++ */
++ u64 last_sub_trans;
++
+ /*
+ * transid that last logged this inode
+ */
+@@ -128,6 +134,16 @@ struct btrfs_inode {
+ u64 last_unlink_trans;
+
+ /*
++ * Counters to keep track of the number of extent items we may use due
++ * to delalloc and such. outstanding_extents is the number of extent
++ * items we think we'll end up using, and reserved_extents is the number
++ * of extent items we've reserved metadata for.
++ */
++ spinlock_t accounting_lock;
++ int reserved_extents;
++ int outstanding_extents;
++
++ /*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+@@ -138,6 +154,7 @@ struct btrfs_inode {
+ * of these.
+ */
+ unsigned ordered_data_close:1;
++ unsigned dummy_inode:1;
+
+ struct inode vfs_inode;
+ };
+diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
+index 9d8ba4d..a11a320 100644
+--- a/fs/btrfs/compression.c
++++ b/fs/btrfs/compression.c
+@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
+ */
+ set_page_extent_mapped(page);
+ lock_extent(tree, last_offset, end, GFP_NOFS);
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, last_offset,
+ PAGE_CACHE_SIZE);
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ if (!em || last_offset < em->start ||
+ (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ /* we need the actual starting offset of this extent in the file */
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree,
+ page_offset(bio->bi_io_vec->bv_page),
+ PAGE_CACHE_SIZE);
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ compressed_len = em->block_len;
+ cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
+index 3fdcc05..ec96f3a 100644
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
+ int split;
+ int num_doubles = 0;
+
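++ /* extending this item past what fits in an empty leaf can never succeed */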
++ l = path->nodes[0];
++ slot = path->slots[0];
++ if (extend && data_size + btrfs_item_size_nr(l, slot) +
++ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
++ return -EOVERFLOW;
++
+ /* first try to make some room by pushing left and right */
+ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+ wret = push_leaf_right(trans, root, path, data_size, 0);
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 837435c..e5dd628 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
+ */
+ #define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
++#define BTRFS_BTREE_INODE_OBJECTID 1
++
++#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
++
+ /*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+@@ -670,21 +674,29 @@ struct btrfs_space_info {
+ u64 bytes_reserved; /* total bytes the allocator has reserved for
+ current allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+-
+- /* delalloc accounting */
+- u64 bytes_delalloc; /* number of bytes reserved for allocation,
+- this space is not necessarily reserved yet
+- by the allocator */
++ u64 bytes_super; /* total bytes reserved for the super blocks */
++ u64 bytes_root; /* the number of bytes needed to commit a
++ transaction */
+ u64 bytes_may_use; /* number of bytes that may be used for
+- delalloc */
++ delalloc/allocations */
++ u64 bytes_delalloc; /* number of bytes currently reserved for
++ delayed allocation */
+
+ int full; /* indicates that we cannot allocate any more
+ chunks for this space */
+ int force_alloc; /* set if we need to force a chunk alloc for
+ this space */
++ int force_delalloc; /* make people start doing filemap_flush until
++ we're under a threshold */
+
+ struct list_head list;
+
++ /* for controlling how we free up space for allocations */
++ wait_queue_head_t allocate_wait;
++ wait_queue_head_t flush_wait;
++ int allocating_chunk;
++ int flushing;
++
+ /* for block groups in our same type */
+ struct list_head block_groups;
+ spinlock_t lock;
+@@ -726,6 +738,15 @@ enum btrfs_caching_type {
+ BTRFS_CACHE_FINISHED = 2,
+ };
+
++struct btrfs_caching_control {
++ struct list_head list;
++ struct mutex mutex;
++ wait_queue_head_t wait;
++ struct btrfs_block_group_cache *block_group;
++ u64 progress;
++ atomic_t count;
++};
++
+ struct btrfs_block_group_cache {
+ struct btrfs_key key;
+ struct btrfs_block_group_item item;
+@@ -733,6 +754,7 @@ struct btrfs_block_group_cache {
+ spinlock_t lock;
+ u64 pinned;
+ u64 reserved;
++ u64 bytes_super;
+ u64 flags;
+ u64 sectorsize;
+ int extents_thresh;
+@@ -742,8 +764,9 @@ struct btrfs_block_group_cache {
+ int dirty;
+
+ /* cache tracking stuff */
+- wait_queue_head_t caching_q;
+ int cached;
++ struct btrfs_caching_control *caching_ctl;
++ u64 last_byte_to_unpin;
+
+ struct btrfs_space_info *space_info;
+
+@@ -782,13 +805,16 @@ struct btrfs_fs_info {
+
+ /* the log root tree is a directory of all the other log roots */
+ struct btrfs_root *log_root_tree;
++
++ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
+
+ /* block group cache stuff */
+ spinlock_t block_group_cache_lock;
+ struct rb_root block_group_cache_tree;
+
+- struct extent_io_tree pinned_extents;
++ struct extent_io_tree freed_extents[2];
++ struct extent_io_tree *pinned_extents;
+
+ /* logical->physical extent mapping */
+ struct btrfs_mapping_tree mapping_tree;
+@@ -822,11 +848,7 @@ struct btrfs_fs_info {
+ struct mutex transaction_kthread_mutex;
+ struct mutex cleaner_mutex;
+ struct mutex chunk_mutex;
+- struct mutex drop_mutex;
+ struct mutex volume_mutex;
+- struct mutex tree_reloc_mutex;
+- struct rw_semaphore extent_commit_sem;
+-
+ /*
+ * this protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make
+@@ -835,10 +857,16 @@ struct btrfs_fs_info {
+ * before jumping into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
++ struct rw_semaphore extent_commit_sem;
++
++ struct rw_semaphore subvol_sem;
++
++ struct srcu_struct subvol_srcu;
+
+ struct list_head trans_list;
+ struct list_head hashers;
+ struct list_head dead_roots;
++ struct list_head caching_block_groups;
+
+ atomic_t nr_async_submits;
+ atomic_t async_submit_draining;
+@@ -882,6 +910,7 @@ struct btrfs_fs_info {
+ * A third pool does submit_bio to avoid deadlocking with the other
+ * two
+ */
++ struct btrfs_workers generic_worker;
+ struct btrfs_workers workers;
+ struct btrfs_workers delalloc_workers;
+ struct btrfs_workers endio_workers;
+@@ -889,6 +918,7 @@ struct btrfs_fs_info {
+ struct btrfs_workers endio_meta_write_workers;
+ struct btrfs_workers endio_write_workers;
+ struct btrfs_workers submit_workers;
++ struct btrfs_workers enospc_workers;
+ /*
+ * fixup workers take dirty pages that didn't properly go through
+ * the cow mechanism and make them safe to write. It happens
+@@ -979,7 +1009,10 @@ struct btrfs_root {
+ atomic_t log_writers;
+ atomic_t log_commit[2];
+ unsigned long log_transid;
++ unsigned long last_log_commit;
+ unsigned long log_batch;
++ pid_t log_start_pid;
++ bool log_multiple_pids;
+
+ u64 objectid;
+ u64 last_trans;
+@@ -996,10 +1029,12 @@ struct btrfs_root {
+ u32 stripesize;
+
+ u32 type;
+- u64 highest_inode;
+- u64 last_inode_alloc;
++
++ u64 highest_objectid;
+ int ref_cows;
+ int track_dirty;
++ int in_radix;
++
+ u64 defrag_trans_start;
+ struct btrfs_key defrag_progress;
+ struct btrfs_key defrag_max;
+@@ -1118,6 +1153,7 @@ struct btrfs_root {
+ #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
+ #define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
+ #define BTRFS_MOUNT_NOSSD (1 << 9)
++#define BTRFS_MOUNT_DISCARD (1 << 10)
+
+ #define BTRFS_MOUNT_TAGGED (1 << 24)
+
+@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count);
+ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+-int btrfs_update_pinned_extents(struct btrfs_root *root,
+- u64 bytenr, u64 num, int pin);
++int btrfs_pin_extent(struct btrfs_root *root,
++ u64 bytenr, u64 num, int reserved);
+ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *leaf);
+ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ u64 root_objectid, u64 owner, u64 offset);
+
+ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root);
+ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- struct extent_io_tree *unpin);
++ struct btrfs_root *root);
+ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
+ int btrfs_free_block_groups(struct btrfs_fs_info *info);
+ int btrfs_read_block_groups(struct btrfs_root *root);
++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
+ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytes_used,
+ u64 type, u64 chunk_objectid, u64 chunk_offset,
+@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+ void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+-int btrfs_check_metadata_free_space(struct btrfs_root *root);
++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
++ struct inode *inode, int num_items);
++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
++ struct inode *inode, int num_items);
+ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+ void btrfs_free_reserved_data_space(struct btrfs_root *root,
+@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+-void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
+ /* ctree.c */
+ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot);
+@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct extent_buffer *parent);
+ /* root-item.c */
+ int btrfs_find_root_ref(struct btrfs_root *tree_root,
+- struct btrfs_path *path,
+- u64 root_id, u64 ref_id);
++ struct btrfs_path *path,
++ u64 root_id, u64 ref_id);
+ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+- u64 root_id, u8 type, u64 ref_id,
+- u64 dirid, u64 sequence,
++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
++ const char *name, int name_len);
++int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
++ struct btrfs_root *tree_root,
++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len);
+ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key);
+@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
+ int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+ u64 *found_objectid);
+ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
++int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
+ int btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
+ /* dir-item.c */
+@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 dir,
+ u64 objectid, const char *name, int name_len,
+ int mod);
++struct btrfs_dir_item *
++btrfs_search_dir_index_item(struct btrfs_root *root,
++ struct btrfs_path *path, u64 dirid,
++ const char *name, int name_len);
+ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len);
+@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
+
+ /* inode-map.c */
+ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index);
++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct inode *dir, u64 objectid,
++ const char *name, int name_len);
+ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 new_size,
+@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+ int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
+ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+- struct btrfs_root *new_root, struct dentry *dentry,
++ struct btrfs_root *new_root,
+ u64 new_dirid, u64 alloc_hint);
+ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+ size_t size, struct bio *bio, unsigned long bio_flags);
+@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
+ void btrfs_dirty_inode(struct inode *inode);
+ struct inode *btrfs_alloc_inode(struct super_block *sb);
+ void btrfs_destroy_inode(struct inode *inode);
++void btrfs_drop_inode(struct inode *inode);
+ int btrfs_init_cachep(void);
+ void btrfs_destroy_cachep(void);
+ long btrfs_ioctl_trans_end(struct file *file);
+@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
+ void btrfs_orphan_cleanup(struct btrfs_root *root);
+ int btrfs_cont_expand(struct inode *inode, loff_t size);
++int btrfs_invalidate_inodes(struct btrfs_root *root);
++extern const struct dentry_operations btrfs_dentry_operations;
+
+ /* ioctl.c */
+ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+@@ -2290,7 +2348,7 @@ extern struct file_operations btrfs_file_operations;
+ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, u64 end, u64 locked_end,
+- u64 inline_limit, u64 *hint_block);
++ u64 inline_limit, u64 *hint_block, int drop_cache);
+ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 start, u64 end);
+@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
+ int btrfs_sync_fs(struct super_block *sb, int wait);
+
+ /* acl.c */
+-#ifdef CONFIG_FS_POSIX_ACL
++#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ int btrfs_check_acl(struct inode *inode, int mask);
+ #else
+ #define btrfs_check_acl NULL
+diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
+index 1d70236..f3a6075 100644
+--- a/fs/btrfs/dir-item.c
++++ b/fs/btrfs/dir-item.c
+@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+ }
+
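++/*
++ * scan all DIR_INDEX items for the given dirid and return the first one
++ * whose name matches, or NULL if none does (ERR_PTR on search errors)
++ */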
++struct btrfs_dir_item *
++btrfs_search_dir_index_item(struct btrfs_root *root,
++ struct btrfs_path *path, u64 dirid,
++ const char *name, int name_len)
++{
++ struct extent_buffer *leaf;
++ struct btrfs_dir_item *di;
++ struct btrfs_key key;
++ u32 nritems;
++ int ret;
++
++ key.objectid = dirid;
++ key.type = BTRFS_DIR_INDEX_KEY;
++ key.offset = 0;
++
++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++ if (ret < 0)
++ return ERR_PTR(ret);
++
++ leaf = path->nodes[0];
++ nritems = btrfs_header_nritems(leaf);
++
++ while (1) {
++ if (path->slots[0] >= nritems) {
++ ret = btrfs_next_leaf(root, path);
++ if (ret < 0)
++ return ERR_PTR(ret);
++ if (ret > 0)
++ break;
++ leaf = path->nodes[0];
++ nritems = btrfs_header_nritems(leaf);
++ continue;
++ }
++
++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
++ break;
++
++ di = btrfs_match_dir_item_name(root, path, name, name_len);
++ if (di)
++ return di;
++
++ path->slots[0]++;
++ }
++ return NULL;
++}
++
+ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index e83be2e..d4132aa 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -41,6 +41,7 @@
+
+ static struct extent_io_ops btree_extent_io_ops;
+ static void end_workqueue_fn(struct btrfs_work *work);
++static void free_fs_root(struct btrfs_root *root);
+
+ static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
+
+@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
+ struct extent_map *em;
+ int ret;
+
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ em->bdev =
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+ goto out;
+ }
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
+ em->block_start = 0;
+ em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ if (ret == -EEXIST) {
+ u64 failed_start = em->start;
+@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
+ free_extent_map(em);
+ em = NULL;
+ }
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+
+ if (ret)
+ em = ERR_PTR(ret);
+@@ -828,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
+ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+ {
+ return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
+- buf->start, buf->start + buf->len - 1);
++ buf->start >> PAGE_CACHE_SHIFT,
++ (buf->start + buf->len - 1) >>
++ PAGE_CACHE_SHIFT);
+ }
+
+ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+@@ -895,8 +898,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ root->fs_info = fs_info;
+ root->objectid = objectid;
+ root->last_trans = 0;
+- root->highest_inode = 0;
+- root->last_inode_alloc = 0;
++ root->highest_objectid = 0;
+ root->name = NULL;
+ root->in_sysfs = 0;
+ root->inode_tree.rb_node = NULL;
+@@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ atomic_set(&root->log_writers, 0);
+ root->log_batch = 0;
+ root->log_transid = 0;
++ root->last_log_commit = 0;
+ extent_io_tree_init(&root->dirty_log_pages,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+
+@@ -952,14 +955,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
+ root, fs_info, objectid);
+ ret = btrfs_find_last_root(tree_root, objectid,
+ &root->root_item, &root->root_key);
++ if (ret > 0)
++ return -ENOENT;
+ BUG_ON(ret);
+
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+- root->commit_root = btrfs_root_node(root);
+ BUG_ON(!root->node);
++ root->commit_root = btrfs_root_node(root);
+ return 0;
+ }
+
+@@ -1085,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ WARN_ON(root->log_root);
+ root->log_root = log_root;
+ root->log_transid = 0;
++ root->last_log_commit = 0;
+ return 0;
+ }
+
+@@ -1095,7 +1101,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+- u64 highest_inode;
+ u64 generation;
+ u32 blocksize;
+ int ret = 0;
+@@ -1110,7 +1115,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+- goto insert;
++ goto out;
+ }
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+@@ -1120,39 +1125,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
+- if (ret != 0) {
+- if (ret > 0)
+- ret = -ENOENT;
+- goto out;
++ if (ret == 0) {
++ l = path->nodes[0];
++ read_extent_buffer(l, &root->root_item,
++ btrfs_item_ptr_offset(l, path->slots[0]),
++ sizeof(root->root_item));
++ memcpy(&root->root_key, location, sizeof(*location));
+ }
+- l = path->nodes[0];
+- read_extent_buffer(l, &root->root_item,
+- btrfs_item_ptr_offset(l, path->slots[0]),
+- sizeof(root->root_item));
+- memcpy(&root->root_key, location, sizeof(*location));
+- ret = 0;
+-out:
+- btrfs_release_path(root, path);
+ btrfs_free_path(path);
+ if (ret) {
+- kfree(root);
++ if (ret > 0)
++ ret = -ENOENT;
+ return ERR_PTR(ret);
+ }
++
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+ root->commit_root = btrfs_root_node(root);
+ BUG_ON(!root->node);
+-insert:
+- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
++out:
++ if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
+ root->ref_cows = 1;
+- ret = btrfs_find_highest_inode(root, &highest_inode);
+- if (ret == 0) {
+- root->highest_inode = highest_inode;
+- root->last_inode_alloc = highest_inode;
+- }
+- }
++
+ return root;
+ }
+
+@@ -1187,39 +1183,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ return fs_info->dev_root;
+ if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return fs_info->csum_root;
+-
++again:
++ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)location->objectid);
++ spin_unlock(&fs_info->fs_roots_radix_lock);
+ if (root)
+ return root;
+
++ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
++ if (ret == 0)
++ ret = -ENOENT;
++ if (ret < 0)
++ return ERR_PTR(ret);
++
+ root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
++ WARN_ON(btrfs_root_refs(&root->root_item) == 0);
+ set_anon_super(&root->anon_super, NULL);
+
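++ /* preallocate radix nodes so the insert below can run under the spinlock */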
++ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
++ if (ret)
++ goto fail;
++
++ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
++ if (ret == 0)
++ root->in_radix = 1;
++ spin_unlock(&fs_info->fs_roots_radix_lock);
++ radix_tree_preload_end();
+ if (ret) {
+- free_extent_buffer(root->node);
+- kfree(root);
+- return ERR_PTR(ret);
++ if (ret == -EEXIST) {
++ free_fs_root(root);
++ goto again;
++ }
++ goto fail;
+ }
+- if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+- ret = btrfs_find_dead_roots(fs_info->tree_root,
+- root->root_key.objectid);
+- BUG_ON(ret);
++
++ ret = btrfs_find_dead_roots(fs_info->tree_root,
++ root->root_key.objectid);
++ WARN_ON(ret);
++
++ if (!(fs_info->sb->s_flags & MS_RDONLY))
+ btrfs_orphan_cleanup(root);
+- }
++
+ return root;
++fail:
++ free_fs_root(root);
++ return ERR_PTR(ret);
+ }
+
+ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ const char *name, int namelen)
+ {
++ return btrfs_read_fs_root_no_name(fs_info, location);
++#if 0
+ struct btrfs_root *root;
+ int ret;
+
+@@ -1236,7 +1259,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+-#if 0
++
+ ret = btrfs_sysfs_add_root(root);
+ if (ret) {
+ free_extent_buffer(root->node);
+@@ -1244,9 +1267,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+-#endif
+ root->in_sysfs = 1;
+ return root;
++#endif
+ }
+
+ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+@@ -1325,9 +1348,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+ offset = page_offset(page);
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+ if (!em) {
+ __unplug_io_fn(bdi, page);
+ return;
+@@ -1359,8 +1382,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
+
+ err = bdi_register(bdi, NULL, "btrfs-%d",
+ atomic_inc_return(&btrfs_bdi_num));
+- if (err)
++ if (err) {
++ bdi_destroy(bdi);
+ return err;
++ }
+
+ bdi->ra_pages = default_backing_dev_info.ra_pages;
+ bdi->unplug_io_fn = btrfs_unplug_io_fn;
+@@ -1450,9 +1475,12 @@ static int cleaner_kthread(void *arg)
+ break;
+
+ vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+- mutex_lock(&root->fs_info->cleaner_mutex);
+- btrfs_clean_old_snapshots(root);
+- mutex_unlock(&root->fs_info->cleaner_mutex);
++
++ if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
++ mutex_trylock(&root->fs_info->cleaner_mutex)) {
++ btrfs_clean_old_snapshots(root);
++ mutex_unlock(&root->fs_info->cleaner_mutex);
++ }
+
+ if (freezing(current)) {
+ refrigerator();
+@@ -1557,15 +1585,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+ err = -ENOMEM;
+ goto fail;
+ }
+- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
++
++ ret = init_srcu_struct(&fs_info->subvol_srcu);
++ if (ret) {
++ err = ret;
++ goto fail;
++ }
++
++ ret = setup_bdi(fs_info, &fs_info->bdi);
++ if (ret) {
++ err = ret;
++ goto fail_srcu;
++ }
++
++ fs_info->btree_inode = new_inode(sb);
++ if (!fs_info->btree_inode) {
++ err = -ENOMEM;
++ goto fail_bdi;
++ }
++
++ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_LIST_HEAD(&fs_info->trans_list);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->hashers);
+ INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->ordered_operations);
++ INIT_LIST_HEAD(&fs_info->caching_block_groups);
+ spin_lock_init(&fs_info->delalloc_lock);
+ spin_lock_init(&fs_info->new_trans_lock);
+ spin_lock_init(&fs_info->ref_cache_lock);
++ spin_lock_init(&fs_info->fs_roots_radix_lock);
+
+ init_completion(&fs_info->kobj_unregister);
+ fs_info->tree_root = tree_root;
+@@ -1584,12 +1633,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+ fs_info->sb = sb;
+ fs_info->max_extent = (u64)-1;
+ fs_info->max_inline = 8192 * 1024;
+- if (setup_bdi(fs_info, &fs_info->bdi))
+- goto fail_bdi;
+- fs_info->btree_inode = new_inode(sb);
+- fs_info->btree_inode->i_ino = 1;
+- fs_info->btree_inode->i_nlink = 1;
+- fs_info->metadata_ratio = 8;
++ fs_info->metadata_ratio = 0;
+
+ fs_info->thread_pool_size = min_t(unsigned long,
+ num_online_cpus() + 2, 8);
+@@ -1600,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+ sb->s_blocksize = 4096;
+ sb->s_blocksize_bits = blksize_bits(4096);
+
++ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
++ fs_info->btree_inode->i_nlink = 1;
+ /*
+ * we set the i_size on the btree inode to the max possible int.
+ * the real end of the address space is determined by all of
+@@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+
+ BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
++ BTRFS_I(fs_info->btree_inode)->root = tree_root;
++ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
++ sizeof(struct btrfs_key));
++ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
++ insert_inode_hash(fs_info->btree_inode);
++
+ spin_lock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree.rb_node = NULL;
+
+- extent_io_tree_init(&fs_info->pinned_extents,
++ extent_io_tree_init(&fs_info->freed_extents[0],
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
++ extent_io_tree_init(&fs_info->freed_extents[1],
++ fs_info->btree_inode->i_mapping, GFP_NOFS);
++ fs_info->pinned_extents = &fs_info->freed_extents[0];
+ fs_info->do_barriers = 1;
+
+- BTRFS_I(fs_info->btree_inode)->root = tree_root;
+- memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+- sizeof(struct btrfs_key));
+- insert_inode_hash(fs_info->btree_inode);
+
+ mutex_init(&fs_info->trans_mutex);
+ mutex_init(&fs_info->ordered_operations_mutex);
+ mutex_init(&fs_info->tree_log_mutex);
+- mutex_init(&fs_info->drop_mutex);
+ mutex_init(&fs_info->chunk_mutex);
+ mutex_init(&fs_info->transaction_kthread_mutex);
+ mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->volume_mutex);
+- mutex_init(&fs_info->tree_reloc_mutex);
+ init_rwsem(&fs_info->extent_commit_sem);
++ init_rwsem(&fs_info->subvol_sem);
+
+ btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+ btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+@@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+ goto fail_iput;
+ }
+
+- /*
+- * we need to start all the end_io workers up front because the
+- * queue work function gets called at interrupt time, and so it
+- * cannot dynamically grow.
+- */
++ btrfs_init_workers(&fs_info->generic_worker,
++ "genwork", 1, NULL);
++
+ btrfs_init_workers(&fs_info->workers, "worker",
+- fs_info->thread_pool_size);
++ fs_info->thread_pool_size,
++ &fs_info->generic_worker);
+
+ btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
+- fs_info->thread_pool_size);
++ fs_info->thread_pool_size,
++ &fs_info->generic_worker);
+
+ btrfs_init_workers(&fs_info->submit_workers, "submit",
+ min_t(u64, fs_devices->num_devices,
+- fs_info->thread_pool_size));
++ fs_info->thread_pool_size),
++ &fs_info->generic_worker);
++ btrfs_init_workers(&fs_info->enospc_workers, "enospc",
++ fs_info->thread_pool_size,
++ &fs_info->generic_worker);
+
+ /* a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be send down in a sane order to the
+@@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+ fs_info->delalloc_workers.idle_thresh = 2;
+ fs_info->delalloc_workers.ordered = 1;
+
+- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
++ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
++ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_workers, "endio",
+- fs_info->thread_pool_size);
++ fs_info->thread_pool_size,
++ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
+- fs_info->thread_pool_size);
++ fs_info->thread_pool_size,
++ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_meta_write_workers,
+- "endio-meta-write", fs_info->thread_pool_size);
++ "endio-meta-write", fs_info->thread_pool_size,
++ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
+- fs_info->thread_pool_size);
++ fs_info->thread_pool_size,
++ &fs_info->generic_worker);
+
+ /*
+ * endios are largely parallel and should have a very
+@@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+ fs_info->endio_workers.idle_thresh = 4;
+ fs_info->endio_meta_workers.idle_thresh = 4;
+
+- fs_info->endio_write_workers.idle_thresh = 64;
+- fs_info->endio_meta_write_workers.idle_thresh = 64;
++ fs_info->endio_write_workers.idle_thresh = 2;
++ fs_info->endio_meta_write_workers.idle_thresh = 2;
+
+ btrfs_start_workers(&fs_info->workers, 1);
++ btrfs_start_workers(&fs_info->generic_worker, 1);
+ btrfs_start_workers(&fs_info->submit_workers, 1);
+ btrfs_start_workers(&fs_info->delalloc_workers, 1);
+ btrfs_start_workers(&fs_info->fixup_workers, 1);
+- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+- btrfs_start_workers(&fs_info->endio_meta_workers,
+- fs_info->thread_pool_size);
+- btrfs_start_workers(&fs_info->endio_meta_write_workers,
+- fs_info->thread_pool_size);
+- btrfs_start_workers(&fs_info->endio_write_workers,
+- fs_info->thread_pool_size);
++ btrfs_start_workers(&fs_info->endio_workers, 1);
++ btrfs_start_workers(&fs_info->endio_meta_workers, 1);
++ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
++ btrfs_start_workers(&fs_info->endio_write_workers, 1);
++ btrfs_start_workers(&fs_info->enospc_workers, 1);
+
+ fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+ fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+@@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
+ }
+ }
+
++ ret = btrfs_find_orphan_roots(tree_root);
++ BUG_ON(ret);
++
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_recover_relocation(tree_root);
+ BUG_ON(ret);
+@@ -1959,6 +2020,7 @@ fail_chunk_root:
+ free_extent_buffer(chunk_root->node);
+ free_extent_buffer(chunk_root->commit_root);
+ fail_sb_buffer:
++ btrfs_stop_workers(&fs_info->generic_worker);
+ btrfs_stop_workers(&fs_info->fixup_workers);
+ btrfs_stop_workers(&fs_info->delalloc_workers);
+ btrfs_stop_workers(&fs_info->workers);
+@@ -1967,6 +2029,7 @@ fail_sb_buffer:
+ btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->submit_workers);
++ btrfs_stop_workers(&fs_info->enospc_workers);
+ fail_iput:
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ iput(fs_info->btree_inode);
+@@ -1975,6 +2038,8 @@ fail_iput:
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ fail_bdi:
+ bdi_destroy(&fs_info->bdi);
++fail_srcu:
++ cleanup_srcu_struct(&fs_info->subvol_srcu);
+ fail:
+ kfree(extent_root);
+ kfree(tree_root);
+@@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
+
+ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+ {
+- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
++ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
++ spin_unlock(&fs_info->fs_roots_radix_lock);
++
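++ /* a dropped root may still have srcu readers; wait for them to finish */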
++ if (btrfs_root_refs(&root->root_item) == 0)
++ synchronize_srcu(&fs_info->subvol_srcu);
++
++ free_fs_root(root);
++ return 0;
++}
++
++static void free_fs_root(struct btrfs_root *root)
++{
++ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ if (root->anon_super.s_dev) {
+ down_write(&root->anon_super.s_umount);
+ kill_anon_super(&root->anon_super);
+ }
+- if (root->node)
+- free_extent_buffer(root->node);
+- if (root->commit_root)
+- free_extent_buffer(root->commit_root);
++ free_extent_buffer(root->node);
++ free_extent_buffer(root->commit_root);
+ kfree(root->name);
+ kfree(root);
+- return 0;
+ }
+
+ static int del_fs_roots(struct btrfs_fs_info *fs_info)
+@@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
+ struct btrfs_root *gang[8];
+ int i;
+
++ while (!list_empty(&fs_info->dead_roots)) {
++ gang[0] = list_entry(fs_info->dead_roots.next,
++ struct btrfs_root, root_list);
++ list_del(&gang[0]->root_list);
++
++ if (gang[0]->in_radix) {
++ btrfs_free_fs_root(fs_info, gang[0]);
++ } else {
++ free_extent_buffer(gang[0]->node);
++ free_extent_buffer(gang[0]->commit_root);
++ kfree(gang[0]);
++ }
++ }
++
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+@@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
+ for (i = 0; i < ret; i++) {
+ root_objectid = gang[i]->root_key.objectid;
+- ret = btrfs_find_dead_roots(fs_info->tree_root,
+- root_objectid);
+- BUG_ON(ret);
+ btrfs_orphan_cleanup(gang[i]);
+ }
+ root_objectid++;
+@@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root)
+ free_extent_buffer(root->fs_info->csum_root->commit_root);
+
+ btrfs_free_block_groups(root->fs_info);
+- btrfs_free_pinned_extents(root->fs_info);
+
+ del_fs_roots(fs_info);
+
+ iput(fs_info->btree_inode);
+
++ btrfs_stop_workers(&fs_info->generic_worker);
+ btrfs_stop_workers(&fs_info->fixup_workers);
+ btrfs_stop_workers(&fs_info->delalloc_workers);
+ btrfs_stop_workers(&fs_info->workers);
+@@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root)
+ btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->submit_workers);
++ btrfs_stop_workers(&fs_info->enospc_workers);
+
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ bdi_destroy(&fs_info->bdi);
++ cleanup_srcu_struct(&fs_info->subvol_srcu);
+
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
+index 9596b40..ba5c3fd 100644
+--- a/fs/btrfs/export.c
++++ b/fs/btrfs/export.c
+@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+ len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ type = FILEID_BTRFS_WITHOUT_PARENT;
+
+- fid->objectid = BTRFS_I(inode)->location.objectid;
++ fid->objectid = inode->i_ino;
+ fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->gen = inode->i_generation;
+
+@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+ }
+
+ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+- u64 root_objectid, u32 generation)
++ u64 root_objectid, u32 generation,
++ int check_generation)
+ {
++ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+ struct btrfs_root *root;
++ struct dentry *dentry;
+ struct inode *inode;
+ struct btrfs_key key;
++ int index;
++ int err = 0;
++
++ if (objectid < BTRFS_FIRST_FREE_OBJECTID)
++ return ERR_PTR(-ESTALE);
+
+ key.objectid = root_objectid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+
+- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
+- if (IS_ERR(root))
+- return ERR_CAST(root);
++ index = srcu_read_lock(&fs_info->subvol_srcu);
++
++ root = btrfs_read_fs_root_no_name(fs_info, &key);
++ if (IS_ERR(root)) {
++ err = PTR_ERR(root);
++ goto fail;
++ }
++
++ if (btrfs_root_refs(&root->root_item) == 0) {
++ err = -ENOENT;
++ goto fail;
++ }
+
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ inode = btrfs_iget(sb, &key, root);
+- if (IS_ERR(inode))
+- return (void *)inode;
++ if (IS_ERR(inode)) {
++ err = PTR_ERR(inode);
++ goto fail;
++ }
++
++ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+- if (generation != inode->i_generation) {
++ if (check_generation && generation != inode->i_generation) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+- return d_obtain_alias(inode);
++ dentry = d_obtain_alias(inode);
++ if (!IS_ERR(dentry))
++ dentry->d_op = &btrfs_dentry_operations;
++ return dentry;
++fail:
++ srcu_read_unlock(&fs_info->subvol_srcu, index);
++ return ERR_PTR(err);
+ }
+
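Worth noting in btrfs_get_dentry() above: the SRCU read lock spans from the root lookup until btrfs_iget() has returned, because the inode reference is what keeps the subvolume alive afterwards; both error paths still hold the lock and funnel through fail:. A condensed shape sketch of that hand-off (one key reused for brevity, unlike the real function):

	static struct dentry *get_dentry_shape(struct super_block *sb,
					       struct btrfs_key *key)
	{
		struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
		struct btrfs_root *root;
		struct inode *inode;
		int err, idx;

		idx = srcu_read_lock(&fs_info->subvol_srcu);

		root = btrfs_read_fs_root_no_name(fs_info, key); /* no ref taken */
		if (IS_ERR(root)) {
			err = PTR_ERR(root);
			goto fail;		/* still under SRCU: unlock below */
		}

		inode = btrfs_iget(sb, key, root);	/* takes an inode reference */
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto fail;
		}

		/* the inode now pins the subvolume, so the SRCU lock can go */
		srcu_read_unlock(&fs_info->subvol_srcu, idx);
		return d_obtain_alias(inode);
	fail:
		srcu_read_unlock(&fs_info->subvol_srcu, idx);
		return ERR_PTR(err);
	}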
+ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+ objectid = fid->parent_objectid;
+ generation = fid->parent_gen;
+
+- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+ }
+
+ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ root_objectid = fid->root_objectid;
+ generation = fid->gen;
+
+- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+ }
+
+ static struct dentry *btrfs_get_parent(struct dentry *child)
+ {
+ struct inode *dir = child->d_inode;
++	struct dentry *dentry;

+ struct btrfs_root *root = BTRFS_I(dir)->root;
+- struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+- int slot;
+- u64 objectid;
++ struct btrfs_root_ref *ref;
++ struct btrfs_key key;
++ struct btrfs_key found_key;
+ int ret;
+
+ path = btrfs_alloc_path();
+
+- key.objectid = dir->i_ino;
+- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+- key.offset = (u64)-1;
++ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
++ key.objectid = root->root_key.objectid;
++ key.type = BTRFS_ROOT_BACKREF_KEY;
++ key.offset = (u64)-1;
++ root = root->fs_info->tree_root;
++ } else {
++ key.objectid = dir->i_ino;
++ key.type = BTRFS_INODE_REF_KEY;
++ key.offset = (u64)-1;
++ }
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+- if (ret < 0) {
+- /* Error */
+- btrfs_free_path(path);
+- return ERR_PTR(ret);
++ if (ret < 0)
++ goto fail;
++
++ BUG_ON(ret == 0);
++ if (path->slots[0] == 0) {
++ ret = -ENOENT;
++ goto fail;
+ }
++
++ path->slots[0]--;
+ leaf = path->nodes[0];
+- slot = path->slots[0];
+- if (ret) {
+- /* btrfs_search_slot() returns the slot where we'd want to
+- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
+- The _real_ backref, telling us what the parent inode
+- _actually_ is, will be in the slot _before_ the one
+- that btrfs_search_slot() returns. */
+- if (!slot) {
+- /* Unless there is _no_ key in the tree before... */
+- btrfs_free_path(path);
+- return ERR_PTR(-EIO);
+- }
+- slot--;
++
++ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
++ if (found_key.objectid != key.objectid || found_key.type != key.type) {
++ ret = -ENOENT;
++ goto fail;
+ }
+
+- btrfs_item_key_to_cpu(leaf, &key, slot);
++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
++ ref = btrfs_item_ptr(leaf, path->slots[0],
++ struct btrfs_root_ref);
++ key.objectid = btrfs_root_ref_dirid(leaf, ref);
++ } else {
++ key.objectid = found_key.offset;
++ }
+ btrfs_free_path(path);
+
+- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
+- return ERR_PTR(-EINVAL);
+-
+- objectid = key.offset;
+-
+- /* If we are already at the root of a subvol, return the real root */
+- if (objectid == dir->i_ino)
+- return dget(dir->i_sb->s_root);
++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
++ return btrfs_get_dentry(root->fs_info->sb, key.objectid,
++ found_key.offset, 0, 0);
++ }
+
+- /* Build a new key for the inode item */
+- key.objectid = objectid;
+- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
++ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+-
+- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
++ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
++ if (!IS_ERR(dentry))
++ dentry->d_op = &btrfs_dentry_operations;
++ return dentry;
++fail:
++ btrfs_free_path(path);
++ return ERR_PTR(ret);
+ }
+
+ const struct export_operations btrfs_export_ops = {
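The rewritten btrfs_get_parent() relies on a common btree idiom: search for (objectid, type, (u64)-1), which can never match a real item, so btrfs_search_slot() always lands one past the highest existing key and stepping slots[0] back yields the highest matching item, if any. A hypothetical helper showing the idiom on its own (with the same simplification as the code above: it does not walk back across leaf boundaries):

	/*
	 * find_last_item() is illustrative; the btree helpers are used
	 * exactly as in btrfs_get_parent() above.
	 */
	static int find_last_item(struct btrfs_root *root, u64 objectid, u8 type,
				  struct btrfs_path *path, struct btrfs_key *found)
	{
		struct btrfs_key key;
		int ret;

		key.objectid = objectid;
		key.type = type;
		key.offset = (u64)-1;	/* larger than any real offset */

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			return ret;
		BUG_ON(ret == 0);	/* (u64)-1 is never an actual key */
		if (path->slots[0] == 0)
			return -ENOENT;	/* nothing sorts before the search key */

		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
		if (found->objectid != objectid || found->type != type)
			return -ENOENT;
		return 0;
	}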
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 72a2b9c..c56f916 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -32,12 +32,12 @@
+ #include "locking.h"
+ #include "free-space-cache.h"
+
+-static int update_reserved_extents(struct btrfs_root *root,
+- u64 bytenr, u64 num, int reserve);
+ static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int alloc,
+ int mark_free);
++static int update_reserved_extents(struct btrfs_block_group_cache *cache,
++ u64 num_bytes, int reserve);
+ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+ u64 parent, u64 root_objectid,
+ u64 flags, struct btrfs_disk_key *key,
+ int level, struct btrfs_key *ins);
+-
+ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 alloc_bytes,
+ u64 flags, int force);
++static int pin_down_bytes(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct btrfs_path *path,
++ u64 bytenr, u64 num_bytes,
++ int is_data, int reserved,
++ struct extent_buffer **must_clean);
++static int find_next_key(struct btrfs_path *path, int level,
++ struct btrfs_key *key);
++static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
++ int dump_block_groups);
+
+ static noinline int
+ block_group_cache_done(struct btrfs_block_group_cache *cache)
+@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+ return ret;
+ }
+
+-/*
+- * We always set EXTENT_LOCKED for the super mirror extents so we don't
+- * overwrite them, so those bits need to be unset. Also, if we are unmounting
+- * with pinned extents still sitting there because we had a block group caching,
+- * we need to clear those now, since we are done.
+- */
+-void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
++static int add_excluded_extent(struct btrfs_root *root,
++ u64 start, u64 num_bytes)
+ {
+- u64 start, end, last = 0;
+- int ret;
++ u64 end = start + num_bytes - 1;
++ set_extent_bits(&root->fs_info->freed_extents[0],
++ start, end, EXTENT_UPTODATE, GFP_NOFS);
++ set_extent_bits(&root->fs_info->freed_extents[1],
++ start, end, EXTENT_UPTODATE, GFP_NOFS);
++ return 0;
++}
+
+- while (1) {
+- ret = find_first_extent_bit(&info->pinned_extents, last,
+- &start, &end,
+- EXTENT_LOCKED|EXTENT_DIRTY);
+- if (ret)
+- break;
++static void free_excluded_extents(struct btrfs_root *root,
++ struct btrfs_block_group_cache *cache)
++{
++ u64 start, end;
+
+- clear_extent_bits(&info->pinned_extents, start, end,
+- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
+- last = end+1;
+- }
++ start = cache->key.objectid;
++ end = start + cache->key.offset - 1;
++
++ clear_extent_bits(&root->fs_info->freed_extents[0],
++ start, end, EXTENT_UPTODATE, GFP_NOFS);
++ clear_extent_bits(&root->fs_info->freed_extents[1],
++ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ }
+
+-static int remove_sb_from_cache(struct btrfs_root *root,
+- struct btrfs_block_group_cache *cache)
++static int exclude_super_stripes(struct btrfs_root *root,
++ struct btrfs_block_group_cache *cache)
+ {
+- struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
+ cache->key.objectid, bytenr,
+ 0, &logical, &nr, &stripe_len);
+ BUG_ON(ret);
++
+ while (nr--) {
+- try_lock_extent(&fs_info->pinned_extents,
+- logical[nr],
+- logical[nr] + stripe_len - 1, GFP_NOFS);
++ cache->bytes_super += stripe_len;
++ ret = add_excluded_extent(root, logical[nr],
++ stripe_len);
++ BUG_ON(ret);
+ }
++
+ kfree(logical);
+ }
+-
+ return 0;
+ }
+
++static struct btrfs_caching_control *
++get_caching_control(struct btrfs_block_group_cache *cache)
++{
++ struct btrfs_caching_control *ctl;
++
++ spin_lock(&cache->lock);
++ if (cache->cached != BTRFS_CACHE_STARTED) {
++ spin_unlock(&cache->lock);
++ return NULL;
++ }
++
++ ctl = cache->caching_ctl;
++ atomic_inc(&ctl->count);
++ spin_unlock(&cache->lock);
++ return ctl;
++}
++
++static void put_caching_control(struct btrfs_caching_control *ctl)
++{
++ if (atomic_dec_and_test(&ctl->count))
++ kfree(ctl);
++}
++
+ /*
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
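get_caching_control()/put_caching_control() implement a minimal refcount: the atomic_inc is only safe because it happens under cache->lock, the same lock that guards the BTRFS_CACHE_STARTED check, so the ctl cannot be freed between the check and the increment. Consumers all follow one pattern, sketched here (it matches wait_block_group_cache_done(), added further down in this patch):

	static void wait_for_caching(struct btrfs_block_group_cache *cache)
	{
		struct btrfs_caching_control *ctl;

		ctl = get_caching_control(cache);
		if (!ctl)
			return;		/* caching never started or already done */

		wait_event(ctl->wait, block_group_cache_done(cache));
		put_caching_control(ctl);	/* drops the ref; may kfree(ctl) */
	}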
+@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ int ret;
+
+ while (start < end) {
+- ret = find_first_extent_bit(&info->pinned_extents, start,
++ ret = find_first_extent_bit(info->pinned_extents, start,
+ &extent_start, &extent_end,
+- EXTENT_DIRTY|EXTENT_LOCKED);
++ EXTENT_DIRTY | EXTENT_UPTODATE);
+ if (ret)
+ break;
+
+@@ -249,22 +283,27 @@ static int caching_kthread(void *data)
+ {
+ struct btrfs_block_group_cache *block_group = data;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+- u64 last = 0;
++ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
++ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_path *path;
+- int ret = 0;
+- struct btrfs_key key;
+ struct extent_buffer *leaf;
+- int slot;
++ struct btrfs_key key;
+ u64 total_found = 0;
+-
+- BUG_ON(!fs_info);
++ u64 last = 0;
++ u32 nritems;
++ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+- atomic_inc(&block_group->space_info->caching_threads);
++ exclude_super_stripes(extent_root, block_group);
++ spin_lock(&block_group->space_info->lock);
++ block_group->space_info->bytes_super += block_group->bytes_super;
++ spin_unlock(&block_group->space_info->lock);
++
+ last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
++
+ /*
+ * We don't want to deadlock with somebody trying to allocate a new
+ * extent for the extent root while also trying to search the extent
+@@ -277,74 +316,64 @@ static int caching_kthread(void *data)
+
+ key.objectid = last;
+ key.offset = 0;
+- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
++ key.type = BTRFS_EXTENT_ITEM_KEY;
+ again:
++ mutex_lock(&caching_ctl->mutex);
+ /* need to make sure the commit_root doesn't disappear */
+ down_read(&fs_info->extent_commit_sem);
+
+- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
++ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+
++ leaf = path->nodes[0];
++ nritems = btrfs_header_nritems(leaf);
++
+ while (1) {
+ smp_mb();
+- if (block_group->fs_info->closing > 1) {
++ if (fs_info->closing > 1) {
+ last = (u64)-1;
+ break;
+ }
+
+- leaf = path->nodes[0];
+- slot = path->slots[0];
+- if (slot >= btrfs_header_nritems(leaf)) {
+- ret = btrfs_next_leaf(fs_info->extent_root, path);
+- if (ret < 0)
+- goto err;
+- else if (ret)
++ if (path->slots[0] < nritems) {
++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++ } else {
++ ret = find_next_key(path, 0, &key);
++ if (ret)
+ break;
+
+- if (need_resched() ||
+- btrfs_transaction_in_commit(fs_info)) {
+- leaf = path->nodes[0];
+-
+- /* this shouldn't happen, but if the
+- * leaf is empty just move on.
+- */
+- if (btrfs_header_nritems(leaf) == 0)
+- break;
+- /*
+- * we need to copy the key out so that
+- * we are sure the next search advances
+- * us forward in the btree.
+- */
+- btrfs_item_key_to_cpu(leaf, &key, 0);
+- btrfs_release_path(fs_info->extent_root, path);
+- up_read(&fs_info->extent_commit_sem);
++ caching_ctl->progress = last;
++ btrfs_release_path(extent_root, path);
++ up_read(&fs_info->extent_commit_sem);
++ mutex_unlock(&caching_ctl->mutex);
++ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+- goto again;
+- }
++ else
++ cond_resched();
++ goto again;
++ }
+
++ if (key.objectid < block_group->key.objectid) {
++ path->slots[0]++;
+ continue;
+ }
+- btrfs_item_key_to_cpu(leaf, &key, slot);
+- if (key.objectid < block_group->key.objectid)
+- goto next;
+
+ if (key.objectid >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
+
+- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
++ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ total_found += add_new_free_space(block_group,
+ fs_info, last,
+ key.objectid);
+ last = key.objectid + key.offset;
+- }
+
+- if (total_found > (1024 * 1024 * 2)) {
+- total_found = 0;
+- wake_up(&block_group->caching_q);
++ if (total_found > (1024 * 1024 * 2)) {
++ total_found = 0;
++ wake_up(&caching_ctl->wait);
++ }
+ }
+-next:
+ path->slots[0]++;
+ }
+ ret = 0;
+@@ -352,33 +381,65 @@ next:
+ total_found += add_new_free_space(block_group, fs_info, last,
+ block_group->key.objectid +
+ block_group->key.offset);
++ caching_ctl->progress = (u64)-1;
+
+ spin_lock(&block_group->lock);
++ block_group->caching_ctl = NULL;
+ block_group->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&block_group->lock);
+
+ err:
+ btrfs_free_path(path);
+ up_read(&fs_info->extent_commit_sem);
+- atomic_dec(&block_group->space_info->caching_threads);
+- wake_up(&block_group->caching_q);
+
++ free_excluded_extents(extent_root, block_group);
++
++ mutex_unlock(&caching_ctl->mutex);
++ wake_up(&caching_ctl->wait);
++
++ put_caching_control(caching_ctl);
++ atomic_dec(&block_group->space_info->caching_threads);
+ return 0;
+ }
+
+ static int cache_block_group(struct btrfs_block_group_cache *cache)
+ {
++ struct btrfs_fs_info *fs_info = cache->fs_info;
++ struct btrfs_caching_control *caching_ctl;
+ struct task_struct *tsk;
+ int ret = 0;
+
++ smp_mb();
++ if (cache->cached != BTRFS_CACHE_NO)
++ return 0;
++
++ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
++ BUG_ON(!caching_ctl);
++
++ INIT_LIST_HEAD(&caching_ctl->list);
++ mutex_init(&caching_ctl->mutex);
++ init_waitqueue_head(&caching_ctl->wait);
++ caching_ctl->block_group = cache;
++ caching_ctl->progress = cache->key.objectid;
++ /* one for caching kthread, one for caching block group list */
++ atomic_set(&caching_ctl->count, 2);
++
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+- return ret;
++ kfree(caching_ctl);
++ return 0;
+ }
++ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&cache->lock);
+
++ down_write(&fs_info->extent_commit_sem);
++ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
++ up_write(&fs_info->extent_commit_sem);
++
++ atomic_inc(&cache->space_info->caching_threads);
++
+ tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
+ cache->key.objectid);
+ if (IS_ERR(tsk)) {
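cache_block_group() above is a textbook check / allocate / re-check-under-lock sequence: the unlocked test filters the common case, the allocation happens without the spinlock held, and a second locked test catches the race where another thread started caching first, in which case the freshly allocated ctl is simply freed. The generic shape, with illustrative names (struct my_obj, my_ctl, STATE_*):

	static int start_caching_shape(struct my_obj *obj)
	{
		struct my_ctl *ctl;

		smp_mb();				/* pairs with the setter */
		if (obj->state != STATE_NONE)		/* unlocked fast path */
			return 0;

		ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);	/* no lock held */
		if (!ctl)
			return -ENOMEM;		/* the patch BUG_ON()s instead */

		spin_lock(&obj->lock);
		if (obj->state != STATE_NONE) {		/* lost the race */
			spin_unlock(&obj->lock);
			kfree(ctl);
			return 0;
		}
		obj->ctl = ctl;
		obj->state = STATE_STARTED;
		spin_unlock(&obj->lock);
		return 1;				/* we own the start-up */
	}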
+@@ -1507,22 +1568,22 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
+ return ret;
+ }
+
+-#ifdef BIO_RW_DISCARD
+ static void btrfs_issue_discard(struct block_device *bdev,
+ u64 start, u64 len)
+ {
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+ }
+-#endif
+
+ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+ {
+-#ifdef BIO_RW_DISCARD
+ int ret;
+ u64 map_length = num_bytes;
+ struct btrfs_multi_bio *multi = NULL;
+
++ if (!btrfs_test_opt(root, DISCARD))
++ return 0;
++
+ /* Tell the block device(s) that the sectors can be discarded */
+ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+ bytenr, &map_length, &multi, 0);
+@@ -1542,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ }
+
+ return ret;
+-#else
+- return 0;
+-#endif
+ }
+
+ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
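A unit note on btrfs_issue_discard(): blkdev_issue_discard() speaks 512-byte sectors, hence the >> 9 shifts (2^9 = 512), and the four-argument call is the signature of this kernel generation (later kernels add a flags argument). The conversion on its own, mirroring the call above:

	/* byte range -> 512-byte sector units; assumes 512-byte alignment */
	static void discard_bytes(struct block_device *bdev, u64 start, u64 len)
	{
		blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
	}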
+@@ -1656,7 +1714,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ parent, ref_root, flags,
+ ref->objectid, ref->offset,
+ &ins, node->ref_mod);
+- update_reserved_extents(root, ins.objectid, ins.offset, 0);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+ node->num_bytes, parent,
+@@ -1782,7 +1839,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ extent_op->flags_to_set,
+ &extent_op->key,
+ ref->level, &ins);
+- update_reserved_extents(root, ins.objectid, ins.offset, 0);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+ node->num_bytes, parent, ref_root,
+@@ -1817,16 +1873,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ BUG_ON(extent_op);
+ head = btrfs_delayed_node_to_head(node);
+ if (insert_reserved) {
++ int mark_free = 0;
++ struct extent_buffer *must_clean = NULL;
++
++ ret = pin_down_bytes(trans, root, NULL,
++ node->bytenr, node->num_bytes,
++ head->is_data, 1, &must_clean);
++ if (ret > 0)
++ mark_free = 1;
++
++ if (must_clean) {
++ clean_tree_block(NULL, root, must_clean);
++ btrfs_tree_unlock(must_clean);
++ free_extent_buffer(must_clean);
++ }
+ if (head->is_data) {
+ ret = btrfs_del_csums(trans, root,
+ node->bytenr,
+ node->num_bytes);
+ BUG_ON(ret);
+ }
+- btrfs_update_pinned_extents(root, node->bytenr,
+- node->num_bytes, 1);
+- update_reserved_extents(root, node->bytenr,
+- node->num_bytes, 0);
++ if (mark_free) {
++ ret = btrfs_free_reserved_extent(root,
++ node->bytenr,
++ node->num_bytes);
++ BUG_ON(ret);
++ }
+ }
+ mutex_unlock(&head->mutex);
+ return 0;
+@@ -2691,60 +2763,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+ alloc_target);
+ }
+
++static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
++{
++ u64 num_bytes;
++ int level;
++
++ level = BTRFS_MAX_LEVEL - 2;
++ /*
++ * NOTE: these calculations are absolutely the worst possible case.
++ * This assumes that _every_ item we insert will require a new leaf, and
++ * that the tree has grown to its maximum level size.
++ */
++
++ /*
++	 * for every item we insert we could insert both an extent item and an
++	 * extent ref item. Then for every item we insert, we will need to cow
++ * both the original leaf, plus the leaf to the left and right of it.
++ *
++ * Unless we are talking about the extent root, then we just want the
++ * number of items * 2, since we just need the extent item plus its ref.
++ */
++ if (root == root->fs_info->extent_root)
++ num_bytes = num_items * 2;
++ else
++ num_bytes = (num_items + (2 * num_items)) * 3;
++
++ /*
++	 * num_bytes is the total number of leaves we could need times the leaf
++ * size, and then for every leaf we could end up cow'ing 2 nodes per
++ * level, down to the leaf level.
++ */
++ num_bytes = (num_bytes * root->leafsize) +
++ (num_bytes * (level * 2)) * root->nodesize;
++
++ return num_bytes;
++}
++
+ /*
+- * for now this just makes sure we have at least 5% of our metadata space free
+- * for use.
++ * Unreserve metadata space for delalloc. If we have fewer reserved credits
++ * than we have extents, this function does nothing.
+ */
+-int btrfs_check_metadata_free_space(struct btrfs_root *root)
++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
++ struct inode *inode, int num_items)
+ {
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+- u64 alloc_target, thresh;
+- int committed = 0, ret;
++ u64 num_bytes;
++ u64 alloc_target;
++ bool bug = false;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+-again:
++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
++ num_items);
++
+ spin_lock(&meta_sinfo->lock);
+- if (!meta_sinfo->full)
+- thresh = meta_sinfo->total_bytes * 80;
+- else
+- thresh = meta_sinfo->total_bytes * 95;
++ spin_lock(&BTRFS_I(inode)->accounting_lock);
++ if (BTRFS_I(inode)->reserved_extents <=
++ BTRFS_I(inode)->outstanding_extents) {
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
++ spin_unlock(&meta_sinfo->lock);
++ return 0;
++ }
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
++
++ BTRFS_I(inode)->reserved_extents--;
++ BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
++
++ if (meta_sinfo->bytes_delalloc < num_bytes) {
++ bug = true;
++ meta_sinfo->bytes_delalloc = 0;
++ } else {
++ meta_sinfo->bytes_delalloc -= num_bytes;
++ }
++ spin_unlock(&meta_sinfo->lock);
++
++ BUG_ON(bug);
++
++ return 0;
++}
++
++static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
++{
++ u64 thresh;
++
++ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
++ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
++ meta_sinfo->bytes_may_use;
+
++ thresh = meta_sinfo->total_bytes - thresh;
++ thresh *= 80;
+ do_div(thresh, 100);
++ if (thresh <= meta_sinfo->bytes_delalloc)
++ meta_sinfo->force_delalloc = 1;
++ else
++ meta_sinfo->force_delalloc = 0;
++}
+
+- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+- struct btrfs_trans_handle *trans;
+- if (!meta_sinfo->full) {
+- meta_sinfo->force_alloc = 1;
+- spin_unlock(&meta_sinfo->lock);
++struct async_flush {
++ struct btrfs_root *root;
++ struct btrfs_space_info *info;
++ struct btrfs_work work;
++};
+
+- trans = btrfs_start_transaction(root, 1);
+- if (!trans)
+- return -ENOMEM;
++static noinline void flush_delalloc_async(struct btrfs_work *work)
++{
++ struct async_flush *async;
++ struct btrfs_root *root;
++ struct btrfs_space_info *info;
+
+- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+- 2 * 1024 * 1024, alloc_target, 0);
+- btrfs_end_transaction(trans, root);
++ async = container_of(work, struct async_flush, work);
++ root = async->root;
++ info = async->info;
++
++ btrfs_start_delalloc_inodes(root);
++ wake_up(&info->flush_wait);
++ btrfs_wait_ordered_extents(root, 0);
++
++ spin_lock(&info->lock);
++ info->flushing = 0;
++ spin_unlock(&info->lock);
++ wake_up(&info->flush_wait);
++
++ kfree(async);
++}
++
++static void wait_on_flush(struct btrfs_space_info *info)
++{
++ DEFINE_WAIT(wait);
++ u64 used;
++
++ while (1) {
++ prepare_to_wait(&info->flush_wait, &wait,
++ TASK_UNINTERRUPTIBLE);
++ spin_lock(&info->lock);
++ if (!info->flushing) {
++ spin_unlock(&info->lock);
++ break;
++ }
++
++ used = info->bytes_used + info->bytes_reserved +
++ info->bytes_pinned + info->bytes_readonly +
++ info->bytes_super + info->bytes_root +
++ info->bytes_may_use + info->bytes_delalloc;
++ if (used < info->total_bytes) {
++ spin_unlock(&info->lock);
++ break;
++ }
++ spin_unlock(&info->lock);
++ schedule();
++ }
++ finish_wait(&info->flush_wait, &wait);
++}
++
++static void flush_delalloc(struct btrfs_root *root,
++ struct btrfs_space_info *info)
++{
++ struct async_flush *async;
++ bool wait = false;
++
++ spin_lock(&info->lock);
++
++ if (!info->flushing) {
++ info->flushing = 1;
++ init_waitqueue_head(&info->flush_wait);
++ } else {
++ wait = true;
++ }
++
++ spin_unlock(&info->lock);
++
++ if (wait) {
++ wait_on_flush(info);
++ return;
++ }
++
++ async = kzalloc(sizeof(*async), GFP_NOFS);
++ if (!async)
++ goto flush;
++
++ async->root = root;
++ async->info = info;
++ async->work.func = flush_delalloc_async;
++
++ btrfs_queue_worker(&root->fs_info->enospc_workers,
++ &async->work);
++ wait_on_flush(info);
++ return;
++
++flush:
++ btrfs_start_delalloc_inodes(root);
++ btrfs_wait_ordered_extents(root, 0);
++
++ spin_lock(&info->lock);
++ info->flushing = 0;
++ spin_unlock(&info->lock);
++ wake_up(&info->flush_wait);
++}
++
++static int maybe_allocate_chunk(struct btrfs_root *root,
++ struct btrfs_space_info *info)
++{
++ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
++ struct btrfs_trans_handle *trans;
++ bool wait = false;
++ int ret = 0;
++ u64 min_metadata;
++ u64 free_space;
++
++ free_space = btrfs_super_total_bytes(disk_super);
++ /*
++	 * we allow the metadata to grow to a max of either 5GB or 5% of the
++ * space in the volume.
++ */
++ min_metadata = min((u64)5 * 1024 * 1024 * 1024,
++ div64_u64(free_space * 5, 100));
++ if (info->total_bytes >= min_metadata) {
++ spin_unlock(&info->lock);
++ return 0;
++ }
++
++ if (info->full) {
++ spin_unlock(&info->lock);
++ return 0;
++ }
++
++ if (!info->allocating_chunk) {
++ info->force_alloc = 1;
++ info->allocating_chunk = 1;
++ init_waitqueue_head(&info->allocate_wait);
++ } else {
++ wait = true;
++ }
++
++ spin_unlock(&info->lock);
++
++ if (wait) {
++ wait_event(info->allocate_wait,
++ !info->allocating_chunk);
++ return 1;
++ }
++
++ trans = btrfs_start_transaction(root, 1);
++ if (!trans) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
++ 4096 + 2 * 1024 * 1024,
++ info->flags, 0);
++ btrfs_end_transaction(trans, root);
++ if (ret)
++ goto out;
++out:
++ spin_lock(&info->lock);
++ info->allocating_chunk = 0;
++ spin_unlock(&info->lock);
++ wake_up(&info->allocate_wait);
++
++ if (ret)
++ return 0;
++ return 1;
++}
++
++/*
++ * Reserve metadata space for delalloc.
++ */
++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
++ struct inode *inode, int num_items)
++{
++ struct btrfs_fs_info *info = root->fs_info;
++ struct btrfs_space_info *meta_sinfo;
++ u64 num_bytes;
++ u64 used;
++ u64 alloc_target;
++ int flushed = 0;
++ int force_delalloc;
++
++ /* get the space info for where the metadata will live */
++ alloc_target = btrfs_get_alloc_profile(root, 0);
++ meta_sinfo = __find_space_info(info, alloc_target);
++
++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
++ num_items);
++again:
++ spin_lock(&meta_sinfo->lock);
++
++ force_delalloc = meta_sinfo->force_delalloc;
++
++ if (unlikely(!meta_sinfo->bytes_root))
++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
++
++ if (!flushed)
++ meta_sinfo->bytes_delalloc += num_bytes;
++
++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
++ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
++
++ if (used > meta_sinfo->total_bytes) {
++ flushed++;
++
++ if (flushed == 1) {
++ if (maybe_allocate_chunk(root, meta_sinfo))
++ goto again;
++ flushed++;
++ } else {
++ spin_unlock(&meta_sinfo->lock);
++ }
++
++ if (flushed == 2) {
++ filemap_flush(inode->i_mapping);
++ goto again;
++ } else if (flushed == 3) {
++ flush_delalloc(root, meta_sinfo);
+ goto again;
+ }
++ spin_lock(&meta_sinfo->lock);
++ meta_sinfo->bytes_delalloc -= num_bytes;
+ spin_unlock(&meta_sinfo->lock);
++ printk(KERN_ERR "enospc, has %d, reserved %d\n",
++ BTRFS_I(inode)->outstanding_extents,
++ BTRFS_I(inode)->reserved_extents);
++ dump_space_info(meta_sinfo, 0, 0);
++ return -ENOSPC;
++ }
+
+- if (!committed) {
+- committed = 1;
+- trans = btrfs_join_transaction(root, 1);
+- if (!trans)
+- return -ENOMEM;
+- ret = btrfs_commit_transaction(trans, root);
+- if (ret)
+- return ret;
++ BTRFS_I(inode)->reserved_extents++;
++ check_force_delalloc(meta_sinfo);
++ spin_unlock(&meta_sinfo->lock);
++
++ if (!flushed && force_delalloc)
++ filemap_flush(inode->i_mapping);
++
++ return 0;
++}
++
++/*
++ * Unreserve num_items number of items worth of metadata space. This needs to
++ * be paired with btrfs_reserve_metadata_space.
++ *
++ * NOTE: if you have the option, run this _AFTER_ you do a
++ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
++ * operations which will result in more used metadata, so we want to make sure we
++ * can do that without issue.
++ */
++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
++{
++ struct btrfs_fs_info *info = root->fs_info;
++ struct btrfs_space_info *meta_sinfo;
++ u64 num_bytes;
++ u64 alloc_target;
++ bool bug = false;
++
++ /* get the space info for where the metadata will live */
++ alloc_target = btrfs_get_alloc_profile(root, 0);
++ meta_sinfo = __find_space_info(info, alloc_target);
++
++ num_bytes = calculate_bytes_needed(root, num_items);
++
++ spin_lock(&meta_sinfo->lock);
++ if (meta_sinfo->bytes_may_use < num_bytes) {
++ bug = true;
++ meta_sinfo->bytes_may_use = 0;
++ } else {
++ meta_sinfo->bytes_may_use -= num_bytes;
++ }
++ spin_unlock(&meta_sinfo->lock);
++
++ BUG_ON(bug);
++
++ return 0;
++}
++
++/*
++ * Reserve some metadata space for use. We'll calculate the worst-case number
++ * of bytes that would be needed to modify num_items number of items. If we
++ * have space, fantastic; if not, you get -ENOSPC. Please call
++ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
++ * items you reserved, since whatever metadata you needed should have already
++ * been allocated.
++ *
++ * This will commit the transaction to make more space if we don't have enough
++ * metadata space. The only time we don't do this is if we're reserving space
++ * inside of a transaction; then we will just return -ENOSPC and it is the
++ * caller's responsibility to handle it properly.
++ */
++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
++{
++ struct btrfs_fs_info *info = root->fs_info;
++ struct btrfs_space_info *meta_sinfo;
++ u64 num_bytes;
++ u64 used;
++ u64 alloc_target;
++ int retries = 0;
++
++ /* get the space info for where the metadata will live */
++ alloc_target = btrfs_get_alloc_profile(root, 0);
++ meta_sinfo = __find_space_info(info, alloc_target);
++
++ num_bytes = calculate_bytes_needed(root, num_items);
++again:
++ spin_lock(&meta_sinfo->lock);
++
++ if (unlikely(!meta_sinfo->bytes_root))
++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
++
++ if (!retries)
++ meta_sinfo->bytes_may_use += num_bytes;
++
++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
++ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
++
++ if (used > meta_sinfo->total_bytes) {
++ retries++;
++ if (retries == 1) {
++ if (maybe_allocate_chunk(root, meta_sinfo))
++ goto again;
++ retries++;
++ } else {
++ spin_unlock(&meta_sinfo->lock);
++ }
++
++ if (retries == 2) {
++ flush_delalloc(root, meta_sinfo);
+ goto again;
+ }
++ spin_lock(&meta_sinfo->lock);
++ meta_sinfo->bytes_may_use -= num_bytes;
++ spin_unlock(&meta_sinfo->lock);
++
++ dump_space_info(meta_sinfo, 0, 0);
+ return -ENOSPC;
+ }
++
++ check_force_delalloc(meta_sinfo);
+ spin_unlock(&meta_sinfo->lock);
+
+ return 0;
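Both reservation entry points above share one escalation ladder: optimistically bump the counter, and if the space_info is over-committed, first try to grow the metadata chunk, then flush dirty data (the delalloc variant also flushes just the one inode first), and only after all of that back the counter out and fail. Condensed control flow, under the simplifying assumption of a space_used() helper that sums the byte counters the way the code above does inline:

	static int reserve_metadata_sketch(struct btrfs_root *root,
					   struct btrfs_space_info *sinfo,
					   u64 num_bytes)
	{
		int retries = 0;
	again:
		spin_lock(&sinfo->lock);
		if (!retries)
			sinfo->bytes_may_use += num_bytes;	/* optimistic */

		if (space_used(sinfo) > sinfo->total_bytes) {	/* over-committed */
			retries++;
			if (retries == 1) {
				/* maybe_allocate_chunk() drops sinfo->lock */
				if (maybe_allocate_chunk(root, sinfo))
					goto again;
				retries++;
			} else {
				spin_unlock(&sinfo->lock);
			}
			if (retries == 2) {
				flush_delalloc(root, sinfo);
				goto again;
			}
			spin_lock(&sinfo->lock);
			sinfo->bytes_may_use -= num_bytes;	/* back out */
			spin_unlock(&sinfo->lock);
			return -ENOSPC;
		}
		spin_unlock(&sinfo->lock);
		return 0;
	}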
+@@ -2764,13 +3224,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+ data_sinfo = BTRFS_I(inode)->space_info;
++ if (!data_sinfo)
++ goto alloc;
++
+ again:
+ /* make sure we have enough space to handle the data first */
+ spin_lock(&data_sinfo->lock);
+ if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+ data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+ data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+- data_sinfo->bytes_may_use < bytes) {
++ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+ struct btrfs_trans_handle *trans;
+
+ /*
+@@ -2782,7 +3245,7 @@ again:
+
+ data_sinfo->force_alloc = 1;
+ spin_unlock(&data_sinfo->lock);
+-
++alloc:
+ alloc_target = btrfs_get_alloc_profile(root, 1);
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+@@ -2794,12 +3257,17 @@ again:
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ return ret;
++
++ if (!data_sinfo) {
++ btrfs_set_inode_space_info(root, inode);
++ data_sinfo = BTRFS_I(inode)->space_info;
++ }
+ goto again;
+ }
+ spin_unlock(&data_sinfo->lock);
+
+ /* commit the current transaction and try again */
+- if (!committed) {
++ if (!committed && !root->fs_info->open_ioctl_trans) {
+ committed = 1;
+ trans = btrfs_join_transaction(root, 1);
+ if (!trans)
+@@ -2827,7 +3295,7 @@ again:
+ BTRFS_I(inode)->reserved_bytes += bytes;
+ spin_unlock(&data_sinfo->lock);
+
+- return btrfs_check_metadata_free_space(root);
++ return 0;
+ }
+
+ /*
+@@ -2926,17 +3394,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ BUG_ON(!space_info);
+
+ spin_lock(&space_info->lock);
+- if (space_info->force_alloc) {
++ if (space_info->force_alloc)
+ force = 1;
+- space_info->force_alloc = 0;
+- }
+ if (space_info->full) {
+ spin_unlock(&space_info->lock);
+ goto out;
+ }
+
+ thresh = space_info->total_bytes - space_info->bytes_readonly;
+- thresh = div_factor(thresh, 6);
++ thresh = div_factor(thresh, 8);
+ if (!force &&
+ (space_info->bytes_used + space_info->bytes_pinned +
+ space_info->bytes_reserved + alloc_bytes) < thresh) {
+@@ -2950,7 +3416,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ * we keep a reasonable number of metadata chunks allocated in the
+ * FS as well.
+ */
+- if (flags & BTRFS_BLOCK_GROUP_DATA) {
++ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+ fs_info->data_chunk_allocations++;
+ if (!(fs_info->data_chunk_allocations %
+ fs_info->metadata_ratio))
+@@ -2958,8 +3424,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ }
+
+ ret = btrfs_alloc_chunk(trans, extent_root, flags);
++ spin_lock(&space_info->lock);
+ if (ret)
+ space_info->full = 1;
++ space_info->force_alloc = 0;
++ spin_unlock(&space_info->lock);
+ out:
+ mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ return ret;
+@@ -3008,10 +3477,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
+ num_bytes = min(total, cache->key.offset - byte_in_group);
+ if (alloc) {
+ old_val += num_bytes;
++ btrfs_set_block_group_used(&cache->item, old_val);
++ cache->reserved -= num_bytes;
+ cache->space_info->bytes_used += num_bytes;
++ cache->space_info->bytes_reserved -= num_bytes;
+ if (cache->ro)
+ cache->space_info->bytes_readonly -= num_bytes;
+- btrfs_set_block_group_used(&cache->item, old_val);
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ } else {
+@@ -3056,127 +3527,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+ return bytenr;
+ }
+
+-int btrfs_update_pinned_extents(struct btrfs_root *root,
+- u64 bytenr, u64 num, int pin)
++/*
++ * this function must be called within transaction
++ */
++int btrfs_pin_extent(struct btrfs_root *root,
++ u64 bytenr, u64 num_bytes, int reserved)
+ {
+- u64 len;
+- struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *fs_info = root->fs_info;
++ struct btrfs_block_group_cache *cache;
+
+- if (pin)
+- set_extent_dirty(&fs_info->pinned_extents,
+- bytenr, bytenr + num - 1, GFP_NOFS);
+-
+- while (num > 0) {
+- cache = btrfs_lookup_block_group(fs_info, bytenr);
+- BUG_ON(!cache);
+- len = min(num, cache->key.offset -
+- (bytenr - cache->key.objectid));
+- if (pin) {
+- spin_lock(&cache->space_info->lock);
+- spin_lock(&cache->lock);
+- cache->pinned += len;
+- cache->space_info->bytes_pinned += len;
+- spin_unlock(&cache->lock);
+- spin_unlock(&cache->space_info->lock);
+- fs_info->total_pinned += len;
+- } else {
+- int unpin = 0;
++ cache = btrfs_lookup_block_group(fs_info, bytenr);
++ BUG_ON(!cache);
+
+- /*
+- * in order to not race with the block group caching, we
+- * only want to unpin the extent if we are cached. If
+- * we aren't cached, we want to start async caching this
+- * block group so we can free the extent the next time
+- * around.
+- */
+- spin_lock(&cache->space_info->lock);
+- spin_lock(&cache->lock);
+- unpin = (cache->cached == BTRFS_CACHE_FINISHED);
+- if (likely(unpin)) {
+- cache->pinned -= len;
+- cache->space_info->bytes_pinned -= len;
+- fs_info->total_pinned -= len;
+- }
+- spin_unlock(&cache->lock);
+- spin_unlock(&cache->space_info->lock);
++ spin_lock(&cache->space_info->lock);
++ spin_lock(&cache->lock);
++ cache->pinned += num_bytes;
++ cache->space_info->bytes_pinned += num_bytes;
++ if (reserved) {
++ cache->reserved -= num_bytes;
++ cache->space_info->bytes_reserved -= num_bytes;
++ }
++ spin_unlock(&cache->lock);
++ spin_unlock(&cache->space_info->lock);
+
+- if (likely(unpin))
+- clear_extent_dirty(&fs_info->pinned_extents,
+- bytenr, bytenr + len -1,
+- GFP_NOFS);
+- else
+- cache_block_group(cache);
++ btrfs_put_block_group(cache);
+
+- if (unpin)
+- btrfs_add_free_space(cache, bytenr, len);
+- }
+- btrfs_put_block_group(cache);
+- bytenr += len;
+- num -= len;
++ set_extent_dirty(fs_info->pinned_extents,
++ bytenr, bytenr + num_bytes - 1, GFP_NOFS);
++ return 0;
++}
++
++static int update_reserved_extents(struct btrfs_block_group_cache *cache,
++ u64 num_bytes, int reserve)
++{
++ spin_lock(&cache->space_info->lock);
++ spin_lock(&cache->lock);
++ if (reserve) {
++ cache->reserved += num_bytes;
++ cache->space_info->bytes_reserved += num_bytes;
++ } else {
++ cache->reserved -= num_bytes;
++ cache->space_info->bytes_reserved -= num_bytes;
+ }
++ spin_unlock(&cache->lock);
++ spin_unlock(&cache->space_info->lock);
+ return 0;
+ }
+
+-static int update_reserved_extents(struct btrfs_root *root,
+- u64 bytenr, u64 num, int reserve)
++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root)
+ {
+- u64 len;
+- struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *fs_info = root->fs_info;
++ struct btrfs_caching_control *next;
++ struct btrfs_caching_control *caching_ctl;
++ struct btrfs_block_group_cache *cache;
+
+- while (num > 0) {
+- cache = btrfs_lookup_block_group(fs_info, bytenr);
+- BUG_ON(!cache);
+- len = min(num, cache->key.offset -
+- (bytenr - cache->key.objectid));
++ down_write(&fs_info->extent_commit_sem);
+
+- spin_lock(&cache->space_info->lock);
+- spin_lock(&cache->lock);
+- if (reserve) {
+- cache->reserved += len;
+- cache->space_info->bytes_reserved += len;
++ list_for_each_entry_safe(caching_ctl, next,
++ &fs_info->caching_block_groups, list) {
++ cache = caching_ctl->block_group;
++ if (block_group_cache_done(cache)) {
++ cache->last_byte_to_unpin = (u64)-1;
++ list_del_init(&caching_ctl->list);
++ put_caching_control(caching_ctl);
+ } else {
+- cache->reserved -= len;
+- cache->space_info->bytes_reserved -= len;
++ cache->last_byte_to_unpin = caching_ctl->progress;
+ }
+- spin_unlock(&cache->lock);
+- spin_unlock(&cache->space_info->lock);
+- btrfs_put_block_group(cache);
+- bytenr += len;
+- num -= len;
+ }
++
++ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
++ fs_info->pinned_extents = &fs_info->freed_extents[1];
++ else
++ fs_info->pinned_extents = &fs_info->freed_extents[0];
++
++ up_write(&fs_info->extent_commit_sem);
+ return 0;
+ }
+
+-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
++static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+ {
+- u64 last = 0;
+- u64 start;
+- u64 end;
+- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
+- int ret;
++ struct btrfs_fs_info *fs_info = root->fs_info;
++ struct btrfs_block_group_cache *cache = NULL;
++ u64 len;
+
+- while (1) {
+- ret = find_first_extent_bit(pinned_extents, last,
+- &start, &end, EXTENT_DIRTY);
+- if (ret)
+- break;
++ while (start <= end) {
++ if (!cache ||
++ start >= cache->key.objectid + cache->key.offset) {
++ if (cache)
++ btrfs_put_block_group(cache);
++ cache = btrfs_lookup_block_group(fs_info, start);
++ BUG_ON(!cache);
++ }
+
+- set_extent_dirty(copy, start, end, GFP_NOFS);
+- last = end + 1;
++ len = cache->key.objectid + cache->key.offset - start;
++ len = min(len, end + 1 - start);
++
++ if (start < cache->last_byte_to_unpin) {
++ len = min(len, cache->last_byte_to_unpin - start);
++ btrfs_add_free_space(cache, start, len);
++ }
++
++ spin_lock(&cache->space_info->lock);
++ spin_lock(&cache->lock);
++ cache->pinned -= len;
++ cache->space_info->bytes_pinned -= len;
++ spin_unlock(&cache->lock);
++ spin_unlock(&cache->space_info->lock);
++
++ start += len;
+ }
++
++ if (cache)
++ btrfs_put_block_group(cache);
+ return 0;
+ }
+
+ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- struct extent_io_tree *unpin)
++ struct btrfs_root *root)
+ {
++ struct btrfs_fs_info *fs_info = root->fs_info;
++ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
++ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
++ unpin = &fs_info->freed_extents[1];
++ else
++ unpin = &fs_info->freed_extents[0];
++
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY);
+@@ -3185,10 +3665,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+
+ ret = btrfs_discard_extent(root, start, end + 1 - start);
+
+- /* unlocks the pinned mutex */
+- btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+-
++ unpin_extent_range(root, start, end);
+ cond_resched();
+ }
+
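The reworked pin/unpin scheme double-buffers pinned extents: fs_info->pinned_extents is just a pointer that btrfs_prepare_extent_commit() flips between freed_extents[0] and [1] under extent_commit_sem, so extents pinned for the next transaction accumulate in one tree while btrfs_finish_extent_commit() drains the other. The whole lifecycle, condensed from the two functions above into one shape sketch:

	static void commit_pinned_shape(struct btrfs_root *root)
	{
		struct btrfs_fs_info *fs_info = root->fs_info;
		struct extent_io_tree *unpin;
		u64 start, end;

		/* prepare: flip so new pins land in the other tree */
		down_write(&fs_info->extent_commit_sem);
		if (fs_info->pinned_extents == &fs_info->freed_extents[0])
			fs_info->pinned_extents = &fs_info->freed_extents[1];
		else
			fs_info->pinned_extents = &fs_info->freed_extents[0];
		up_write(&fs_info->extent_commit_sem);

		/* finish: drain the tree that is no longer current */
		if (fs_info->pinned_extents == &fs_info->freed_extents[0])
			unpin = &fs_info->freed_extents[1];
		else
			unpin = &fs_info->freed_extents[0];

		while (!find_first_extent_bit(unpin, 0, &start, &end,
					      EXTENT_DIRTY)) {
			clear_extent_dirty(unpin, start, end, GFP_NOFS);
			unpin_extent_range(root, start, end);
		}
	}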
+@@ -3198,7 +3676,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+ static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+- u64 bytenr, u64 num_bytes, int is_data,
++ u64 bytenr, u64 num_bytes,
++ int is_data, int reserved,
+ struct extent_buffer **must_clean)
+ {
+ int err = 0;
+@@ -3207,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ if (is_data)
+ goto pinit;
+
++ /*
++ * discard is sloooow, and so triggering discards on
++ * individual btree blocks isn't a good plan. Just
++ * pin everything in discard mode.
++ */
++ if (btrfs_test_opt(root, DISCARD))
++ goto pinit;
++
+ buf = btrfs_find_tree_block(root, bytenr, num_bytes);
+ if (!buf)
+ goto pinit;
+@@ -3230,15 +3717,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ }
+ free_extent_buffer(buf);
+ pinit:
+- btrfs_set_path_blocking(path);
++ if (path)
++ btrfs_set_path_blocking(path);
+ /* unlocks the pinned mutex */
+- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
++ btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+
+ BUG_ON(err < 0);
+ return 0;
+ }
+
+-
+ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+@@ -3412,7 +3899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ }
+
+ ret = pin_down_bytes(trans, root, path, bytenr,
+- num_bytes, is_data, &must_clean);
++ num_bytes, is_data, 0, &must_clean);
+ if (ret > 0)
+ mark_free = 1;
+ BUG_ON(ret < 0);
+@@ -3543,8 +4030,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
+ WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
+ /* unlocks the pinned mutex */
+- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+- update_reserved_extents(root, bytenr, num_bytes, 0);
++ btrfs_pin_extent(root, bytenr, num_bytes, 1);
+ ret = 0;
+ } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+@@ -3584,19 +4070,33 @@ static noinline int
+ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+ u64 num_bytes)
+ {
++ struct btrfs_caching_control *caching_ctl;
+ DEFINE_WAIT(wait);
+
+- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
+-
+- if (block_group_cache_done(cache)) {
+- finish_wait(&cache->caching_q, &wait);
++ caching_ctl = get_caching_control(cache);
++ if (!caching_ctl)
+ return 0;
+- }
+- schedule();
+- finish_wait(&cache->caching_q, &wait);
+
+- wait_event(cache->caching_q, block_group_cache_done(cache) ||
++ wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
+ (cache->free_space >= num_bytes));
++
++ put_caching_control(caching_ctl);
++ return 0;
++}
++
++static noinline int
++wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
++{
++ struct btrfs_caching_control *caching_ctl;
++ DEFINE_WAIT(wait);
++
++ caching_ctl = get_caching_control(cache);
++ if (!caching_ctl)
++ return 0;
++
++ wait_event(caching_ctl->wait, block_group_cache_done(cache));
++
++ put_caching_control(caching_ctl);
+ return 0;
+ }
+
+@@ -3634,6 +4134,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
+ int last_ptr_loop = 0;
+ int loop = 0;
+ bool found_uncached_bg = false;
++ bool failed_cluster_refill = false;
++ bool failed_alloc = false;
+
+ WARN_ON(num_bytes < root->sectorsize);
+ btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+@@ -3731,7 +4233,16 @@ have_block_group:
+ if (unlikely(block_group->ro))
+ goto loop;
+
+- if (last_ptr) {
++ /*
++		 * Ok, we want to try the cluster allocator, so let's look
++		 * there first, unless we are on LOOP_NO_EMPTY_SIZE: by that
++		 * point we will have tried the cluster allocator plenty of
++		 * times and found nothing, so we are likely way too
++		 * fragmented for the clustering code to find anything.
++		 * Just skip it and let the allocator find whatever block
++		 * it can.
++ */
++ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+ /*
+ * the refill lock keeps out other
+ * people trying to start a new cluster
+@@ -3806,9 +4317,11 @@ refill_cluster:
+ spin_unlock(&last_ptr->refill_lock);
+ goto checks;
+ }
+- } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
++ } else if (!cached && loop > LOOP_CACHING_NOWAIT
++ && !failed_cluster_refill) {
+ spin_unlock(&last_ptr->refill_lock);
+
++ failed_cluster_refill = true;
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_cluster + empty_size);
+ goto have_block_group;
+@@ -3820,25 +4333,30 @@ refill_cluster:
+ * cluster. Free the cluster we've been trying
+ * to use, and go to the next block group
+ */
+- if (loop < LOOP_NO_EMPTY_SIZE) {
+- btrfs_return_cluster_to_free_space(NULL,
+- last_ptr);
+- spin_unlock(&last_ptr->refill_lock);
+- goto loop;
+- }
++ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
++ goto loop;
+ }
+
+ offset = btrfs_find_space_for_alloc(block_group, search_start,
+ num_bytes, empty_size);
+- if (!offset && (cached || (!cached &&
+- loop == LOOP_CACHING_NOWAIT))) {
+- goto loop;
+- } else if (!offset && (!cached &&
+- loop > LOOP_CACHING_NOWAIT)) {
++ /*
++ * If we didn't find a chunk, and we haven't failed on this
++ * block group before, and this block group is in the middle of
++ * caching and we are ok with waiting, then go ahead and wait
++ * for progress to be made, and set failed_alloc to true.
++ *
++ * If failed_alloc is true then we've already waited on this
++ * block group once and should move on to the next block group.
++ */
++ if (!offset && !failed_alloc && !cached &&
++ loop > LOOP_CACHING_NOWAIT) {
+ wait_block_group_cache_progress(block_group,
+- num_bytes + empty_size);
++ num_bytes + empty_size);
++ failed_alloc = true;
+ goto have_block_group;
++ } else if (!offset) {
++ goto loop;
+ }
+ checks:
+ search_start = stripe_align(root, offset);
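The failed_cluster_refill and failed_alloc flags above give each block group at most one "wait for caching progress" retry per pass; both are reset at the loop: label before moving to the next group. The generic retry-once-per-candidate pattern, with hypothetical helpers (try_alloc, still_caching, wait_for_progress):

	static struct bg *pick_group_shape(struct list_head *groups, u64 num_bytes)
	{
		struct bg *bg;
		bool waited = false;

		list_for_each_entry(bg, groups, list) {
	retry:
			if (try_alloc(bg, num_bytes))
				return bg;		/* success */
			if (!waited && still_caching(bg)) {
				wait_for_progress(bg, num_bytes);
				waited = true;		/* at most one wait per group */
				goto retry;
			}
			waited = false;			/* reset for the next group */
		}
		return NULL;
	}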
+@@ -3880,9 +4398,13 @@ checks:
+ search_start - offset);
+ BUG_ON(offset > search_start);
+
++ update_reserved_extents(block_group, num_bytes, 1);
++
+ /* we are all good, lets return */
+ break;
+ loop:
++ failed_cluster_refill = false;
++ failed_alloc = false;
+ btrfs_put_block_group(block_group);
+ }
+ up_read(&space_info->groups_sem);
+@@ -3940,21 +4462,32 @@ loop:
+ return ret;
+ }
+
+-static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
++static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
++ int dump_block_groups)
+ {
+ struct btrfs_block_group_cache *cache;
+
++ spin_lock(&info->lock);
+ printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+ (unsigned long long)(info->total_bytes - info->bytes_used -
+- info->bytes_pinned - info->bytes_reserved),
++ info->bytes_pinned - info->bytes_reserved -
++ info->bytes_super),
+ (info->full) ? "" : "not ");
+ printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+- " may_use=%llu, used=%llu\n",
++ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
++ "\n",
+ (unsigned long long)info->total_bytes,
+ (unsigned long long)info->bytes_pinned,
+ (unsigned long long)info->bytes_delalloc,
+ (unsigned long long)info->bytes_may_use,
+- (unsigned long long)info->bytes_used);
++ (unsigned long long)info->bytes_used,
++ (unsigned long long)info->bytes_root,
++ (unsigned long long)info->bytes_super,
++ (unsigned long long)info->bytes_reserved);
++ spin_unlock(&info->lock);
++
++ if (!dump_block_groups)
++ return;
+
+ down_read(&info->groups_sem);
+ list_for_each_entry(cache, &info->block_groups, list) {
+@@ -3972,12 +4505,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+ up_read(&info->groups_sem);
+ }
+
+-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- u64 num_bytes, u64 min_alloc_size,
+- u64 empty_size, u64 hint_byte,
+- u64 search_end, struct btrfs_key *ins,
+- u64 data)
++int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ u64 num_bytes, u64 min_alloc_size,
++ u64 empty_size, u64 hint_byte,
++ u64 search_end, struct btrfs_key *ins,
++ u64 data)
+ {
+ int ret;
+ u64 search_start = 0;
+@@ -4022,7 +4555,7 @@ again:
+ printk(KERN_ERR "btrfs allocation failed flags %llu, "
+ "wanted %llu\n", (unsigned long long)data,
+ (unsigned long long)num_bytes);
+- dump_space_info(sinfo, num_bytes);
++ dump_space_info(sinfo, num_bytes, 1);
+ }
+
+ return ret;
+@@ -4043,25 +4576,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+ ret = btrfs_discard_extent(root, start, len);
+
+ btrfs_add_free_space(cache, start, len);
++ update_reserved_extents(cache, len, 0);
+ btrfs_put_block_group(cache);
+- update_reserved_extents(root, start, len, 0);
+-
+- return ret;
+-}
+-
+-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- u64 num_bytes, u64 min_alloc_size,
+- u64 empty_size, u64 hint_byte,
+- u64 search_end, struct btrfs_key *ins,
+- u64 data)
+-{
+- int ret;
+- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+- empty_size, hint_byte, search_end, ins,
+- data);
+- if (!ret)
+- update_reserved_extents(root, ins->objectid, ins->offset, 1);
+
+ return ret;
+ }
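With __btrfs_reserve_extent() folded into btrfs_reserve_extent(), the reserved-byte accounting becomes symmetric per block group: find_free_extent() charges the group at allocation time, and exactly one later event balances it, without a second block-group lookup. Sketched as an invariant (function names are the ones in this patch):

	/*
	 * update_reserved_extents(cache, num_bytes, 1)  <- find_free_extent()
	 *
	 * ...balanced later by exactly one of:
	 *
	 * update_reserved_extents(cache, len, 0)   <- btrfs_free_reserved_extent()
	 * btrfs_pin_extent(root, bytenr, len, 1)   <- pinned with reserved == 1
	 * update_block_group(..., alloc == 1)      <- reserved bytes become used
	 */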
+@@ -4222,15 +4738,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ {
+ int ret;
+ struct btrfs_block_group_cache *block_group;
++ struct btrfs_caching_control *caching_ctl;
++ u64 start = ins->objectid;
++ u64 num_bytes = ins->offset;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ cache_block_group(block_group);
+- wait_event(block_group->caching_q,
+- block_group_cache_done(block_group));
++ caching_ctl = get_caching_control(block_group);
+
+- ret = btrfs_remove_free_space(block_group, ins->objectid,
+- ins->offset);
+- BUG_ON(ret);
++ if (!caching_ctl) {
++ BUG_ON(!block_group_cache_done(block_group));
++ ret = btrfs_remove_free_space(block_group, start, num_bytes);
++ BUG_ON(ret);
++ } else {
++ mutex_lock(&caching_ctl->mutex);
++
++ if (start >= caching_ctl->progress) {
++ ret = add_excluded_extent(root, start, num_bytes);
++ BUG_ON(ret);
++ } else if (start + num_bytes <= caching_ctl->progress) {
++ ret = btrfs_remove_free_space(block_group,
++ start, num_bytes);
++ BUG_ON(ret);
++ } else {
++ num_bytes = caching_ctl->progress - start;
++ ret = btrfs_remove_free_space(block_group,
++ start, num_bytes);
++ BUG_ON(ret);
++
++ start = caching_ctl->progress;
++ num_bytes = ins->objectid + ins->offset -
++ caching_ctl->progress;
++ ret = add_excluded_extent(root, start, num_bytes);
++ BUG_ON(ret);
++ }
++
++ mutex_unlock(&caching_ctl->mutex);
++ put_caching_control(caching_ctl);
++ }
++
++ update_reserved_extents(block_group, ins->offset, 1);
+ btrfs_put_block_group(block_group);
+ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
+ 0, owner, offset, ins, 1);
+@@ -4254,9 +4801,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
+ int ret;
+ u64 flags = 0;
+
+- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+- empty_size, hint_byte, search_end,
+- ins, 0);
++ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
++ empty_size, hint_byte, search_end,
++ ins, 0);
+ if (ret)
+ return ret;
+
+@@ -4267,7 +4814,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
+ } else
+ BUG_ON(parent > 0);
+
+- update_reserved_extents(root, ins->objectid, ins->offset, 1);
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+@@ -4346,452 +4892,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+ return buf;
+ }
+
+-#if 0
+-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root, struct extent_buffer *leaf)
+-{
+- u64 disk_bytenr;
+- u64 num_bytes;
+- struct btrfs_key key;
+- struct btrfs_file_extent_item *fi;
+- u32 nritems;
+- int i;
+- int ret;
+-
+- BUG_ON(!btrfs_is_leaf(leaf));
+- nritems = btrfs_header_nritems(leaf);
+-
+- for (i = 0; i < nritems; i++) {
+- cond_resched();
+- btrfs_item_key_to_cpu(leaf, &key, i);
+-
+- /* only extents have references, skip everything else */
+- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+- continue;
+-
+- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+-
+- /* inline extents live in the btree, they don't have refs */
+- if (btrfs_file_extent_type(leaf, fi) ==
+- BTRFS_FILE_EXTENT_INLINE)
+- continue;
+-
+- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+-
+- /* holes don't have refs */
+- if (disk_bytenr == 0)
+- continue;
+-
+- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
+- leaf->start, 0, key.objectid, 0);
+- BUG_ON(ret);
+- }
+- return 0;
+-}
+-
+-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- struct btrfs_leaf_ref *ref)
+-{
+- int i;
+- int ret;
+- struct btrfs_extent_info *info;
+- struct refsort *sorted;
+-
+- if (ref->nritems == 0)
+- return 0;
+-
+- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
+- for (i = 0; i < ref->nritems; i++) {
+- sorted[i].bytenr = ref->extents[i].bytenr;
+- sorted[i].slot = i;
+- }
+- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+-
+- /*
+- * the items in the ref were sorted when the ref was inserted
+- * into the ref cache, so this is already in order
+- */
+- for (i = 0; i < ref->nritems; i++) {
+- info = ref->extents + sorted[i].slot;
+- ret = btrfs_free_extent(trans, root, info->bytenr,
+- info->num_bytes, ref->bytenr,
+- ref->owner, ref->generation,
+- info->objectid, 0);
+-
+- atomic_inc(&root->fs_info->throttle_gen);
+- wake_up(&root->fs_info->transaction_throttle);
+- cond_resched();
+-
+- BUG_ON(ret);
+- info++;
+- }
+-
+- kfree(sorted);
+- return 0;
+-}
+-
+-
+-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root, u64 start,
+- u64 len, u32 *refs)
+-{
+- int ret;
+-
+- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
+- BUG_ON(ret);
+-
+-#if 0 /* some debugging code in case we see problems here */
+- /* if the refs count is one, it won't get increased again. But
+- * if the ref count is > 1, someone may be decreasing it at
+- * the same time we are.
+- */
+- if (*refs != 1) {
+- struct extent_buffer *eb = NULL;
+- eb = btrfs_find_create_tree_block(root, start, len);
+- if (eb)
+- btrfs_tree_lock(eb);
+-
+- mutex_lock(&root->fs_info->alloc_mutex);
+- ret = lookup_extent_ref(NULL, root, start, len, refs);
+- BUG_ON(ret);
+- mutex_unlock(&root->fs_info->alloc_mutex);
+-
+- if (eb) {
+- btrfs_tree_unlock(eb);
+- free_extent_buffer(eb);
+- }
+- if (*refs == 1) {
+- printk(KERN_ERR "btrfs block %llu went down to one "
+- "during drop_snap\n", (unsigned long long)start);
+- }
+-
+- }
+-#endif
+-
+- cond_resched();
+- return ret;
+-}
++struct walk_control {
++ u64 refs[BTRFS_MAX_LEVEL];
++ u64 flags[BTRFS_MAX_LEVEL];
++ struct btrfs_key update_progress;
++ int stage;
++ int level;
++ int shared_level;
++ int update_ref;
++ int keep_locks;
++ int reada_slot;
++ int reada_count;
++};
+
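++/*
++ * a note on the two stages (grounded in the walk helpers below):
++ * DROP_REFERENCE is the normal stage, dropping references as the walk
++ * goes down.  UPDATE_BACKREF is entered when a shared subtree is found
++ * and back refs for blocks below the shared block must be updated
++ * before their references can be dropped.
++ */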
++#define DROP_REFERENCE 1
++#define UPDATE_BACKREF 2
+
+-/*
+- * this is used while deleting old snapshots, and it drops the refs
+- * on a whole subtree starting from a level 1 node.
+- *
+- * The idea is to sort all the leaf pointers, and then drop the
+- * ref on all the leaves in order. Most of the time the leaves
+- * will have ref cache entries, so no leaf IOs will be required to
+- * find the extents they have references on.
+- *
+- * For each leaf, any references it has are also dropped in order
+- *
+- * This ends up dropping the references in something close to optimal
+- * order for reading and modifying the extent allocation tree.
+- */
+-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- struct btrfs_path *path)
++static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct walk_control *wc,
++ struct btrfs_path *path)
+ {
+ u64 bytenr;
+- u64 root_owner;
+- u64 root_gen;
+- struct extent_buffer *eb = path->nodes[1];
+- struct extent_buffer *leaf;
+- struct btrfs_leaf_ref *ref;
+- struct refsort *sorted = NULL;
+- int nritems = btrfs_header_nritems(eb);
++ u64 generation;
++ u64 refs;
++ u64 flags;
++ u64 last = 0;
++ u32 nritems;
++ u32 blocksize;
++ struct btrfs_key key;
++ struct extent_buffer *eb;
+ int ret;
+- int i;
+- int refi = 0;
+- int slot = path->slots[1];
+- u32 blocksize = btrfs_level_size(root, 0);
+- u32 refs;
+-
+- if (nritems == 0)
+- goto out;
+-
+- root_owner = btrfs_header_owner(eb);
+- root_gen = btrfs_header_generation(eb);
+- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
++ int slot;
++ int nread = 0;
+
+- /*
+- * step one, sort all the leaf pointers so we don't scribble
+- * randomly into the extent allocation tree
+- */
+- for (i = slot; i < nritems; i++) {
+- sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+- sorted[refi].slot = i;
+- refi++;
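++ /*
++ * scale the readahead window down while the walk is still inside
++ * the range covered by the last pass, and back up once it has
++ * caught up
++ */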
++ if (path->slots[wc->level] < wc->reada_slot) {
++ wc->reada_count = wc->reada_count * 2 / 3;
++ wc->reada_count = max(wc->reada_count, 2);
++ } else {
++ wc->reada_count = wc->reada_count * 3 / 2;
++ wc->reada_count = min_t(int, wc->reada_count,
++ BTRFS_NODEPTRS_PER_BLOCK(root));
+ }
+
+- /*
+- * nritems won't be zero, but if we're picking up drop_snapshot
+- * after a crash, slot might be > 0, so double check things
+- * just in case.
+- */
+- if (refi == 0)
+- goto out;
++ eb = path->nodes[wc->level];
++ nritems = btrfs_header_nritems(eb);
++ blocksize = btrfs_level_size(root, wc->level - 1);
+
+- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
++ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
++ if (nread >= wc->reada_count)
++ break;
+
+- /*
+- * the first loop frees everything the leaves point to
+- */
+- for (i = 0; i < refi; i++) {
+- u64 ptr_gen;
++ cond_resched();
++ bytenr = btrfs_node_blockptr(eb, slot);
++ generation = btrfs_node_ptr_generation(eb, slot);
+
+- bytenr = sorted[i].bytenr;
++ if (slot == path->slots[wc->level])
++ goto reada;
+
+- /*
+- * check the reference count on this leaf. If it is > 1
+- * we just decrement it below and don't update any
+- * of the refs the leaf points to.
+- */
+- ret = drop_snap_lookup_refcount(trans, root, bytenr,
+- blocksize, &refs);
+- BUG_ON(ret);
+- if (refs != 1)
++ if (wc->stage == UPDATE_BACKREF &&
++ generation <= root->root_key.offset)
+ continue;
+
+- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+-
+- /*
+- * the leaf only had one reference, which means the
+- * only thing pointing to this leaf is the snapshot
+- * we're deleting. It isn't possible for the reference
+- * count to increase again later
+- *
+- * The reference cache is checked for the leaf,
+- * and if found we'll be able to drop any refs held by
+- * the leaf without needing to read it in.
+- */
+- ref = btrfs_lookup_leaf_ref(root, bytenr);
+- if (ref && ref->generation != ptr_gen) {
+- btrfs_free_leaf_ref(root, ref);
+- ref = NULL;
+- }
+- if (ref) {
+- ret = cache_drop_leaf_ref(trans, root, ref);
+- BUG_ON(ret);
+- btrfs_remove_leaf_ref(root, ref);
+- btrfs_free_leaf_ref(root, ref);
+- } else {
+- /*
+- * the leaf wasn't in the reference cache, so
+- * we have to read it.
+- */
+- leaf = read_tree_block(root, bytenr, blocksize,
+- ptr_gen);
+- ret = btrfs_drop_leaf_ref(trans, root, leaf);
+- BUG_ON(ret);
+- free_extent_buffer(leaf);
+- }
+- atomic_inc(&root->fs_info->throttle_gen);
+- wake_up(&root->fs_info->transaction_throttle);
+- cond_resched();
+- }
+-
+- /*
+- * run through the loop again to free the refs on the leaves.
+- * This is faster than doing it in the loop above because
+- * the leaves are likely to be clustered together. We end up
+- * working in nice chunks on the extent allocation tree.
+- */
+- for (i = 0; i < refi; i++) {
+- bytenr = sorted[i].bytenr;
+- ret = btrfs_free_extent(trans, root, bytenr,
+- blocksize, eb->start,
+- root_owner, root_gen, 0, 1);
++ /* We don't lock the tree block, it's OK to be racy here */
++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
++ &refs, &flags);
+ BUG_ON(ret);
++ BUG_ON(refs == 0);
+
+- atomic_inc(&root->fs_info->throttle_gen);
+- wake_up(&root->fs_info->transaction_throttle);
+- cond_resched();
+- }
+-out:
+- kfree(sorted);
+-
+- /*
+- * update the path to show we've processed the entire level 1
+- * node. This will get saved into the root's drop_snapshot_progress
+- * field so these drops are not repeated again if this transaction
+- * commits.
+- */
+- path->slots[1] = nritems;
+- return 0;
+-}
+-
+-/*
+- * helper function for drop_snapshot, this walks down the tree dropping ref
+- * counts as it goes.
+- */
+-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- struct btrfs_path *path, int *level)
+-{
+- u64 root_owner;
+- u64 root_gen;
+- u64 bytenr;
+- u64 ptr_gen;
+- struct extent_buffer *next;
+- struct extent_buffer *cur;
+- struct extent_buffer *parent;
+- u32 blocksize;
+- int ret;
+- u32 refs;
+-
+- WARN_ON(*level < 0);
+- WARN_ON(*level >= BTRFS_MAX_LEVEL);
+- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
+- path->nodes[*level]->len, &refs);
+- BUG_ON(ret);
+- if (refs > 1)
+- goto out;
+-
+- /*
+- * walk down to the last node level and free all the leaves
+- */
+- while (*level >= 0) {
+- WARN_ON(*level < 0);
+- WARN_ON(*level >= BTRFS_MAX_LEVEL);
+- cur = path->nodes[*level];
+-
+- if (btrfs_header_level(cur) != *level)
+- WARN_ON(1);
+-
+- if (path->slots[*level] >=
+- btrfs_header_nritems(cur))
+- break;
++ if (wc->stage == DROP_REFERENCE) {
++ if (refs == 1)
++ goto reada;
+
+- /* the new code goes down to level 1 and does all the
+- * leaves pointed to that node in bulk. So, this check
+- * for level 0 will always be false.
+- *
+- * But, the disk format allows the drop_snapshot_progress
+- * field in the root to leave things in a state where
+- * a leaf will need cleaning up here. If someone crashes
+- * with the old code and then boots with the new code,
+- * we might find a leaf here.
+- */
+- if (*level == 0) {
+- ret = btrfs_drop_leaf_ref(trans, root, cur);
+- BUG_ON(ret);
+- break;
++ if (wc->level == 1 &&
++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
++ continue;
++ if (!wc->update_ref ||
++ generation <= root->root_key.offset)
++ continue;
++ btrfs_node_key_to_cpu(eb, &key, slot);
++ ret = btrfs_comp_cpu_keys(&key,
++ &wc->update_progress);
++ if (ret < 0)
++ continue;
++ } else {
++ if (wc->level == 1 &&
++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
++ continue;
+ }
+-
+- /*
+- * once we get to level one, process the whole node
+- * at once, including everything below it.
+- */
+- if (*level == 1) {
+- ret = drop_level_one_refs(trans, root, path);
+- BUG_ON(ret);
++reada:
++ ret = readahead_tree_block(root, bytenr, blocksize,
++ generation);
++ if (ret)
+ break;
+- }
+-
+- bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+- blocksize = btrfs_level_size(root, *level - 1);
+-
+- ret = drop_snap_lookup_refcount(trans, root, bytenr,
+- blocksize, &refs);
+- BUG_ON(ret);
+-
+- /*
+- * if there is more than one reference, we don't need
+- * to read that node to drop any references it has. We
+- * just drop the ref we hold on that node and move on to the
+- * next slot in this level.
+- */
+- if (refs != 1) {
+- parent = path->nodes[*level];
+- root_owner = btrfs_header_owner(parent);
+- root_gen = btrfs_header_generation(parent);
+- path->slots[*level]++;
+-
+- ret = btrfs_free_extent(trans, root, bytenr,
+- blocksize, parent->start,
+- root_owner, root_gen,
+- *level - 1, 1);
+- BUG_ON(ret);
+-
+- atomic_inc(&root->fs_info->throttle_gen);
+- wake_up(&root->fs_info->transaction_throttle);
+- cond_resched();
+-
+- continue;
+- }
+-
+- /*
+- * we need to keep freeing things in the next level down.
+- * read the block and loop around to process it
+- */
+- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+- WARN_ON(*level <= 0);
+- if (path->nodes[*level-1])
+- free_extent_buffer(path->nodes[*level-1]);
+- path->nodes[*level-1] = next;
+- *level = btrfs_header_level(next);
+- path->slots[*level] = 0;
+- cond_resched();
++ last = bytenr + blocksize;
++ nread++;
+ }
+-out:
+- WARN_ON(*level < 0);
+- WARN_ON(*level >= BTRFS_MAX_LEVEL);
+-
+- if (path->nodes[*level] == root->node) {
+- parent = path->nodes[*level];
+- bytenr = path->nodes[*level]->start;
+- } else {
+- parent = path->nodes[*level + 1];
+- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
+- }
+-
+- blocksize = btrfs_level_size(root, *level);
+- root_owner = btrfs_header_owner(parent);
+- root_gen = btrfs_header_generation(parent);
+-
+- /*
+- * cleanup and free the reference on the last node
+- * we processed
+- */
+- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+- parent->start, root_owner, root_gen,
+- *level, 1);
+- free_extent_buffer(path->nodes[*level]);
+- path->nodes[*level] = NULL;
+-
+- *level += 1;
+- BUG_ON(ret);
+-
+- cond_resched();
+- return 0;
++ wc->reada_slot = slot;
+ }
+-#endif
+-
+-struct walk_control {
+- u64 refs[BTRFS_MAX_LEVEL];
+- u64 flags[BTRFS_MAX_LEVEL];
+- struct btrfs_key update_progress;
+- int stage;
+- int level;
+- int shared_level;
+- int update_ref;
+- int keep_locks;
+-};
+-
+-#define DROP_REFERENCE 1
+-#define UPDATE_BACKREF 2
+
+ /*
+ * helper to process tree block while walking down the tree.
+ *
+- * when wc->stage == DROP_REFERENCE, this function checks
+- * reference count of the block. if the block is shared and
+- * we need update back refs for the subtree rooted at the
+- * block, this function changes wc->stage to UPDATE_BACKREF
+- *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+@@ -4800,11 +5002,10 @@ struct walk_control {
+ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+- struct walk_control *wc)
++ struct walk_control *wc, int lookup_info)
+ {
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+- struct btrfs_key key;
+ u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ int ret;
+
+@@ -4816,8 +5017,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ * when reference count of tree block is 1, it won't increase
+ * again. once full backref flag is set, we never clear it.
+ */
+- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
++ if (lookup_info &&
++ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
++ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_lookup_extent_info(trans, root,
+ eb->start, eb->len,
+@@ -4827,21 +5029,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ BUG_ON(wc->refs[level] == 0);
+ }
+
+- if (wc->stage == DROP_REFERENCE &&
+- wc->update_ref && wc->refs[level] > 1) {
+- BUG_ON(eb == root->node);
+- BUG_ON(path->slots[level] > 0);
+- if (level == 0)
+- btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
+- else
+- btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
+- if (btrfs_header_owner(eb) == root->root_key.objectid &&
+- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
+- wc->stage = UPDATE_BACKREF;
+- wc->shared_level = level;
+- }
+- }
+-
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level] > 1)
+ return 1;
+@@ -4878,6 +5065,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ }
+
+ /*
++ * helper to process a tree block pointer.
++ *
++ * when wc->stage == DROP_REFERENCE, this function checks the
++ * reference count of the block pointed to. if the block is
++ * shared and we need to update back refs for the subtree
++ * rooted at the block, this function changes wc->stage to
++ * UPDATE_BACKREF. if the block is shared and there is no
++ * need to update back refs, this function drops the reference
++ * to the block.
++ *
++ * NOTE: return value 1 means we should stop walking down.
++ */
++static noinline int do_walk_down(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct btrfs_path *path,
++ struct walk_control *wc, int *lookup_info)
++{
++ u64 bytenr;
++ u64 generation;
++ u64 parent;
++ u32 blocksize;
++ struct btrfs_key key;
++ struct extent_buffer *next;
++ int level = wc->level;
++ int reada = 0;
++ int ret = 0;
++
++ generation = btrfs_node_ptr_generation(path->nodes[level],
++ path->slots[level]);
++ /*
++ * if the lower level block was created before the snapshot
++ * was created, we know there is no need to update back refs
++ * for the subtree
++ */
++ if (wc->stage == UPDATE_BACKREF &&
++ generation <= root->root_key.offset) {
++ *lookup_info = 1;
++ return 1;
++ }
++
++ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
++ blocksize = btrfs_level_size(root, level - 1);
++
++ next = btrfs_find_tree_block(root, bytenr, blocksize);
++ if (!next) {
++ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
++ reada = 1;
++ }
++ btrfs_tree_lock(next);
++ btrfs_set_lock_blocking(next);
++
++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
++ &wc->refs[level - 1],
++ &wc->flags[level - 1]);
++ BUG_ON(ret);
++ BUG_ON(wc->refs[level - 1] == 0);
++ *lookup_info = 0;
++
++ if (wc->stage == DROP_REFERENCE) {
++ if (wc->refs[level - 1] > 1) {
++ if (level == 1 &&
++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
++ goto skip;
++
++ if (!wc->update_ref ||
++ generation <= root->root_key.offset)
++ goto skip;
++
++ btrfs_node_key_to_cpu(path->nodes[level], &key,
++ path->slots[level]);
++ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
++ if (ret < 0)
++ goto skip;
++
++ wc->stage = UPDATE_BACKREF;
++ wc->shared_level = level - 1;
++ }
++ } else {
++ if (level == 1 &&
++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
++ goto skip;
++ }
++
++ if (!btrfs_buffer_uptodate(next, generation)) {
++ btrfs_tree_unlock(next);
++ free_extent_buffer(next);
++ next = NULL;
++ *lookup_info = 1;
++ }
++
++ if (!next) {
++ if (reada && level == 1)
++ reada_walk_down(trans, root, wc, path);
++ next = read_tree_block(root, bytenr, blocksize, generation);
++ btrfs_tree_lock(next);
++ btrfs_set_lock_blocking(next);
++ }
++
++ level--;
++ BUG_ON(level != btrfs_header_level(next));
++ path->nodes[level] = next;
++ path->slots[level] = 0;
++ path->locks[level] = 1;
++ wc->level = level;
++ if (wc->level == 1)
++ wc->reada_slot = 0;
++ return 0;
++skip:
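++ /*
++ * we are not going to descend into this block; in the
++ * DROP_REFERENCE stage our reference on it is dropped here instead
++ */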
++ wc->refs[level - 1] = 0;
++ wc->flags[level - 1] = 0;
++ if (wc->stage == DROP_REFERENCE) {
++ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
++ parent = path->nodes[level]->start;
++ } else {
++ BUG_ON(root->root_key.objectid !=
++ btrfs_header_owner(path->nodes[level]));
++ parent = 0;
++ }
++
++ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
++ root->root_key.objectid, level - 1, 0);
++ BUG_ON(ret);
++ }
++ btrfs_tree_unlock(next);
++ free_extent_buffer(next);
++ *lookup_info = 1;
++ return 1;
++}
++
++/*
++ * helper to process a tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+@@ -4904,7 +5221,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+ if (level < wc->shared_level)
+ goto out;
+
+- BUG_ON(wc->refs[level] <= 1);
+ ret = find_next_key(path, level + 1, &wc->update_progress);
+ if (ret > 0)
+ wc->update_ref = 0;
+@@ -4935,8 +5251,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+ path->locks[level] = 0;
+ return 1;
+ }
+- } else {
+- BUG_ON(level != 0);
+ }
+ }
+
+@@ -4989,39 +5303,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct walk_control *wc)
+ {
+- struct extent_buffer *next;
+- struct extent_buffer *cur;
+- u64 bytenr;
+- u64 ptr_gen;
+- u32 blocksize;
+ int level = wc->level;
++ int lookup_info = 1;
+ int ret;
+
+ while (level >= 0) {
+- cur = path->nodes[level];
+- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
++ if (path->slots[level] >=
++ btrfs_header_nritems(path->nodes[level]))
++ break;
+
+- ret = walk_down_proc(trans, root, path, wc);
++ ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ if (ret > 0)
+ break;
+
+ if (level == 0)
+ break;
+
+- bytenr = btrfs_node_blockptr(cur, path->slots[level]);
+- blocksize = btrfs_level_size(root, level - 1);
+- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
+-
+- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+- btrfs_tree_lock(next);
+- btrfs_set_lock_blocking(next);
+-
+- level--;
+- BUG_ON(level != btrfs_header_level(next));
+- path->nodes[level] = next;
+- path->slots[level] = 0;
+- path->locks[level] = 1;
+- wc->level = level;
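++ /* do_walk_down returns 1 when the child block was skipped */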
++ ret = do_walk_down(trans, root, path, wc, &lookup_info);
++ if (ret > 0) {
++ path->slots[level]++;
++ continue;
++ }
++ level = wc->level;
+ }
+ return 0;
+ }
+@@ -5111,9 +5414,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+ err = ret;
+ goto out;
+ }
+- btrfs_node_key_to_cpu(path->nodes[level], &key,
+- path->slots[level]);
+- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
++ WARN_ON(ret > 0);
+
+ /*
+ * unlock our path, this is safe because only this
+@@ -5148,6 +5449,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = update_ref;
+ wc->keep_locks = 0;
++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+ while (1) {
+ ret = walk_down_tree(trans, root, path, wc);
+@@ -5200,9 +5502,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+ ret = btrfs_del_root(trans, tree_root, &root->root_key);
+ BUG_ON(ret);
+
+- free_extent_buffer(root->node);
+- free_extent_buffer(root->commit_root);
+- kfree(root);
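++ /*
++ * for anything but the reloc tree, remove the orphan item that
++ * tracked this deletion once no root item remains
++ */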
++ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
++ ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
++ NULL, NULL);
++ BUG_ON(ret < 0);
++ if (ret > 0) {
++ ret = btrfs_del_orphan_item(trans, tree_root,
++ root->root_key.objectid);
++ BUG_ON(ret);
++ }
++ }
++
++ if (root->in_radix) {
++ btrfs_free_fs_root(tree_root->fs_info, root);
++ } else {
++ free_extent_buffer(root->node);
++ free_extent_buffer(root->commit_root);
++ kfree(root);
++ }
+ out:
+ btrfs_end_transaction(trans, tree_root);
+ kfree(wc);
+@@ -5254,6 +5571,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = 0;
+ wc->keep_locks = 1;
++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+ while (1) {
+ wret = walk_down_tree(trans, root, path, wc);
+@@ -5396,9 +5714,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
+ lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
+ while (1) {
+ int ret;
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+@@ -6841,287 +7159,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+ return 0;
+ }
+
+-#if 0
+-static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- u64 objectid, u64 size)
+-{
+- struct btrfs_path *path;
+- struct btrfs_inode_item *item;
+- struct extent_buffer *leaf;
+- int ret;
+-
+- path = btrfs_alloc_path();
+- if (!path)
+- return -ENOMEM;
+-
+- path->leave_spinning = 1;
+- ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+- if (ret)
+- goto out;
+-
+- leaf = path->nodes[0];
+- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+- btrfs_set_inode_generation(leaf, item, 1);
+- btrfs_set_inode_size(leaf, item, size);
+- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+- btrfs_mark_buffer_dirty(leaf);
+- btrfs_release_path(root, path);
+-out:
+- btrfs_free_path(path);
+- return ret;
+-}
+-
+-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+- struct btrfs_block_group_cache *group)
++/*
++ * checks to see if it's even possible to relocate this block group.
++ *
++ * @return - -1 if it's not a good idea to relocate this block group, 0 if
++ * it's ok to go ahead and try.
++ */
++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
+ {
+- struct inode *inode = NULL;
+- struct btrfs_trans_handle *trans;
+- struct btrfs_root *root;
+- struct btrfs_key root_key;
+- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+- int err = 0;
++ struct btrfs_block_group_cache *block_group;
++ struct btrfs_space_info *space_info;
++ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
++ struct btrfs_device *device;
++ int full = 0;
++ int ret = 0;
+
+- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+- root_key.type = BTRFS_ROOT_ITEM_KEY;
+- root_key.offset = (u64)-1;
+- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+- if (IS_ERR(root))
+- return ERR_CAST(root);
++ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+
+- trans = btrfs_start_transaction(root, 1);
+- BUG_ON(!trans);
++ /* odd, couldn't find the block group, leave it alone */
++ if (!block_group)
++ return -1;
+
+- err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+- if (err)
++ /* no bytes used, we're good */
++ if (!btrfs_block_group_used(&block_group->item))
+ goto out;
+
+- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+- BUG_ON(err);
+-
+- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+- group->key.offset, 0, group->key.offset,
+- 0, 0, 0);
+- BUG_ON(err);
+-
+- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+- if (inode->i_state & I_NEW) {
+- BTRFS_I(inode)->root = root;
+- BTRFS_I(inode)->location.objectid = objectid;
+- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+- BTRFS_I(inode)->location.offset = 0;
+- btrfs_read_locked_inode(inode);
+- unlock_new_inode(inode);
+- BUG_ON(is_bad_inode(inode));
+- } else {
+- BUG_ON(1);
+- }
+- BTRFS_I(inode)->index_cnt = group->key.objectid;
+-
+- err = btrfs_orphan_add(trans, inode);
+-out:
+- btrfs_end_transaction(trans, root);
+- if (err) {
+- if (inode)
+- iput(inode);
+- inode = ERR_PTR(err);
+- }
+- return inode;
+-}
+-
+-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+-{
+-
+- struct btrfs_ordered_sum *sums;
+- struct btrfs_sector_sum *sector_sum;
+- struct btrfs_ordered_extent *ordered;
+- struct btrfs_root *root = BTRFS_I(inode)->root;
+- struct list_head list;
+- size_t offset;
+- int ret;
+- u64 disk_bytenr;
+-
+- INIT_LIST_HEAD(&list);
+-
+- ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+- BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+-
+- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+- disk_bytenr + len - 1, &list);
+-
+- while (!list_empty(&list)) {
+- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+- list_del_init(&sums->list);
+-
+- sector_sum = sums->sums;
+- sums->bytenr = ordered->start;
++ space_info = block_group->space_info;
++ spin_lock(&space_info->lock);
+
+- offset = 0;
+- while (offset < sums->len) {
+- sector_sum->bytenr += ordered->start - disk_bytenr;
+- sector_sum++;
+- offset += root->sectorsize;
+- }
++ full = space_info->full;
+
+- btrfs_add_ordered_sum(inode, ordered, sums);
++ /*
++ * if this is the last block group we have in this space, we can't
++ * relocate it unless we're able to allocate a new chunk below.
++ *
++ * Otherwise, we need to make sure we have room in the space to handle
++ * all of the extents from this block group. If we can, we're good.
++ */
++ if ((space_info->total_bytes != block_group->key.offset) &&
++ (space_info->bytes_used + space_info->bytes_reserved +
++ space_info->bytes_pinned + space_info->bytes_readonly +
++ btrfs_block_group_used(&block_group->item) <
++ space_info->total_bytes)) {
++ spin_unlock(&space_info->lock);
++ goto out;
+ }
+- btrfs_put_ordered_extent(ordered);
+- return 0;
+-}
+-
+-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
+-{
+- struct btrfs_trans_handle *trans;
+- struct btrfs_path *path;
+- struct btrfs_fs_info *info = root->fs_info;
+- struct extent_buffer *leaf;
+- struct inode *reloc_inode;
+- struct btrfs_block_group_cache *block_group;
+- struct btrfs_key key;
+- u64 skipped;
+- u64 cur_byte;
+- u64 total_found;
+- u32 nritems;
+- int ret;
+- int progress;
+- int pass = 0;
+-
+- root = root->fs_info->extent_root;
+-
+- block_group = btrfs_lookup_block_group(info, group_start);
+- BUG_ON(!block_group);
+-
+- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
+- (unsigned long long)block_group->key.objectid,
+- (unsigned long long)block_group->flags);
+-
+- path = btrfs_alloc_path();
+- BUG_ON(!path);
+-
+- reloc_inode = create_reloc_inode(info, block_group);
+- BUG_ON(IS_ERR(reloc_inode));
+-
+- __alloc_chunk_for_shrink(root, block_group, 1);
+- set_block_group_readonly(block_group);
+-
+- btrfs_start_delalloc_inodes(info->tree_root);
+- btrfs_wait_ordered_extents(info->tree_root, 0);
+-again:
+- skipped = 0;
+- total_found = 0;
+- progress = 0;
+- key.objectid = block_group->key.objectid;
+- key.offset = 0;
+- key.type = 0;
+- cur_byte = key.objectid;
+-
+- trans = btrfs_start_transaction(info->tree_root, 1);
+- btrfs_commit_transaction(trans, info->tree_root);
++ spin_unlock(&space_info->lock);
+
+- mutex_lock(&root->fs_info->cleaner_mutex);
+- btrfs_clean_old_snapshots(info->tree_root);
+- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
+- mutex_unlock(&root->fs_info->cleaner_mutex);
++ /*
++ * ok we don't have enough space, but maybe we have free space on our
++ * devices to allocate new chunks for relocation, so loop through our
++ * alloc devices and guess if we have enough space. However, if we
++ * were marked as full, then we know there aren't enough chunks, and we
++ * can just return.
++ */
++ ret = -1;
++ if (full)
++ goto out;
+
+- trans = btrfs_start_transaction(info->tree_root, 1);
+- btrfs_commit_transaction(trans, info->tree_root);
++ mutex_lock(&root->fs_info->chunk_mutex);
++ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
++ u64 min_free = btrfs_block_group_used(&block_group->item);
++ u64 dev_offset, max_avail;
+
+- while (1) {
+- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+- if (ret < 0)
+- goto out;
+-next:
+- leaf = path->nodes[0];
+- nritems = btrfs_header_nritems(leaf);
+- if (path->slots[0] >= nritems) {
+- ret = btrfs_next_leaf(root, path);
+- if (ret < 0)
+- goto out;
+- if (ret == 1) {
+- ret = 0;
++ /*
++ * check to make sure we can actually find a chunk with enough
++ * space to fit our block group in.
++ */
++ if (device->total_bytes > device->bytes_used + min_free) {
++ ret = find_free_dev_extent(NULL, device, min_free,
++ &dev_offset, &max_avail);
++ if (!ret)
+ break;
+- }
+- leaf = path->nodes[0];
+- nritems = btrfs_header_nritems(leaf);
+- }
+-
+- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+-
+- if (key.objectid >= block_group->key.objectid +
+- block_group->key.offset)
+- break;
+-
+- if (progress && need_resched()) {
+- btrfs_release_path(root, path);
+- cond_resched();
+- progress = 0;
+- continue;
+- }
+- progress = 1;
+-
+- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
+- key.objectid + key.offset <= cur_byte) {
+- path->slots[0]++;
+- goto next;
++ ret = -1;
+ }
+-
+- total_found++;
+- cur_byte = key.objectid + key.offset;
+- btrfs_release_path(root, path);
+-
+- __alloc_chunk_for_shrink(root, block_group, 0);
+- ret = relocate_one_extent(root, path, &key, block_group,
+- reloc_inode, pass);
+- BUG_ON(ret < 0);
+- if (ret > 0)
+- skipped++;
+-
+- key.objectid = cur_byte;
+- key.type = 0;
+- key.offset = 0;
+ }
+-
+- btrfs_release_path(root, path);
+-
+- if (pass == 0) {
+- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
+- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
+- }
+-
+- if (total_found > 0) {
+- printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
+- (unsigned long long)total_found, pass);
+- pass++;
+- if (total_found == skipped && pass > 2) {
+- iput(reloc_inode);
+- reloc_inode = create_reloc_inode(info, block_group);
+- pass = 0;
+- }
+- goto again;
+- }
+-
+- /* delete reloc_inode */
+- iput(reloc_inode);
+-
+- /* unpin extents in this range */
+- trans = btrfs_start_transaction(info->tree_root, 1);
+- btrfs_commit_transaction(trans, info->tree_root);
+-
+- spin_lock(&block_group->lock);
+- WARN_ON(block_group->pinned > 0);
+- WARN_ON(block_group->reserved > 0);
+- WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
+- spin_unlock(&block_group->lock);
+- btrfs_put_block_group(block_group);
+- ret = 0;
++ mutex_unlock(&root->fs_info->chunk_mutex);
+ out:
+- btrfs_free_path(path);
++ btrfs_put_block_group(block_group);
+ return ret;
+ }
+-#endif
+
+ static int find_first_block_group(struct btrfs_root *root,
+ struct btrfs_path *path, struct btrfs_key *key)
+@@ -7164,8 +7281,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
+ {
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
++ struct btrfs_caching_control *caching_ctl;
+ struct rb_node *n;
+
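++ /* release the refs still held by queued caching controls */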
++ down_write(&info->extent_commit_sem);
++ while (!list_empty(&info->caching_block_groups)) {
++ caching_ctl = list_entry(info->caching_block_groups.next,
++ struct btrfs_caching_control, list);
++ list_del(&caching_ctl->list);
++ put_caching_control(caching_ctl);
++ }
++ up_write(&info->extent_commit_sem);
++
+ spin_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ block_group = rb_entry(n, struct btrfs_block_group_cache,
+@@ -7179,8 +7306,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
+ up_write(&block_group->space_info->groups_sem);
+
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+- wait_event(block_group->caching_q,
+- block_group_cache_done(block_group));
++ wait_block_group_cache_done(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+
+@@ -7250,7 +7376,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
+ spin_lock_init(&cache->lock);
+ spin_lock_init(&cache->tree_lock);
+ cache->fs_info = info;
+- init_waitqueue_head(&cache->caching_q);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+
+@@ -7272,8 +7397,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
+ cache->flags = btrfs_block_group_flags(&cache->item);
+ cache->sectorsize = root->sectorsize;
+
+- remove_sb_from_cache(root, cache);
+-
+ /*
+ * check for two cases, either we are full, and therefore
+ * don't need to bother with the caching work since we won't
+@@ -7282,13 +7405,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
+ * time, particularly in the full case.
+ */
+ if (found_key.offset == btrfs_block_group_used(&cache->item)) {
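++ /* still exclude the super stripes so bytes_super is accounted */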
++ exclude_super_stripes(root, cache);
++ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
++ free_excluded_extents(root, cache);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
++ exclude_super_stripes(root, cache);
++ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, root->fs_info,
+ found_key.objectid,
+ found_key.objectid +
+ found_key.offset);
++ free_excluded_extents(root, cache);
+ }
+
+ ret = update_space_info(info, cache->flags, found_key.offset,
+@@ -7296,6 +7425,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
+ &space_info);
+ BUG_ON(ret);
+ cache->space_info = space_info;
++ spin_lock(&cache->space_info->lock);
++ cache->space_info->bytes_super += cache->bytes_super;
++ spin_unlock(&cache->space_info->lock);
++
+ down_write(&space_info->groups_sem);
+ list_add_tail(&cache->list, &space_info->block_groups);
+ up_write(&space_info->groups_sem);
+@@ -7345,7 +7478,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ spin_lock_init(&cache->tree_lock);
+- init_waitqueue_head(&cache->caching_q);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+
+@@ -7354,15 +7486,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ cache->flags = type;
+ btrfs_set_block_group_flags(&cache->item, type);
+
++ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+- remove_sb_from_cache(root, cache);
++ exclude_super_stripes(root, cache);
+
+ add_new_free_space(cache, root->fs_info, chunk_offset,
+ chunk_offset + size);
+
++ free_excluded_extents(root, cache);
++
+ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+ &cache->space_info);
+ BUG_ON(ret);
++
++ spin_lock(&cache->space_info->lock);
++ cache->space_info->bytes_super += cache->bytes_super;
++ spin_unlock(&cache->space_info->lock);
++
+ down_write(&cache->space_info->groups_sem);
+ list_add_tail(&cache->list, &cache->space_info->block_groups);
+ up_write(&cache->space_info->groups_sem);
+@@ -7428,8 +7568,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ up_write(&block_group->space_info->groups_sem);
+
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+- wait_event(block_group->caching_q,
+- block_group_cache_done(block_group));
++ wait_block_group_cache_done(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+
+diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
+index 6826018..96577e8 100644
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
+ return NULL;
+ }
+
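++/* notify the tree owner before two adjacent extent states are merged */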
++static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
++ struct extent_state *other)
++{
++ if (tree->ops && tree->ops->merge_extent_hook)
++ tree->ops->merge_extent_hook(tree->mapping->host, new,
++ other);
++}
++
+ /*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->end == state->start - 1 &&
+ other->state == state->state) {
++ merge_cb(tree, state, other);
+ state->start = other->start;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->start == state->end + 1 &&
+ other->state == state->state) {
++ merge_cb(tree, state, other);
+ other->start = state->start;
+ state->tree = NULL;
+ rb_erase(&state->rb_node, &tree->state);
+ free_extent_state(state);
++ state = NULL;
+ }
+ }
++
+ return 0;
+ }
+
+-static void set_state_cb(struct extent_io_tree *tree,
++static int set_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned long bits)
+ {
+ if (tree->ops && tree->ops->set_bit_hook) {
+- tree->ops->set_bit_hook(tree->mapping->host, state->start,
+- state->end, state->state, bits);
++ return tree->ops->set_bit_hook(tree->mapping->host,
++ state->start, state->end,
++ state->state, bits);
+ }
++
++ return 0;
+ }
+
+ static void clear_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned long bits)
+ {
+- if (tree->ops && tree->ops->clear_bit_hook) {
+- tree->ops->clear_bit_hook(tree->mapping->host, state->start,
+- state->end, state->state, bits);
+- }
++ if (tree->ops && tree->ops->clear_bit_hook)
++ tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
+ }
+
+ /*
+@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
+ int bits)
+ {
+ struct rb_node *node;
++ int ret;
+
+ if (end < start) {
+ printk(KERN_ERR "btrfs end < start %llu %llu\n",
+@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
+ (unsigned long long)start);
+ WARN_ON(1);
+ }
++ state->start = start;
++ state->end = end;
++ ret = set_state_cb(tree, state, bits);
++ if (ret)
++ return ret;
++
+ if (bits & EXTENT_DIRTY)
+ tree->dirty_bytes += end - start + 1;
+- set_state_cb(tree, state, bits);
+ state->state |= bits;
+- state->start = start;
+- state->end = end;
+ node = tree_insert(&tree->state, end, &state->rb_node);
+ if (node) {
+ struct extent_state *found;
+@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
+ return 0;
+ }
+
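++/* notify the tree owner before an extent state is split at 'split' */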
++static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
++ u64 split)
++{
++ if (tree->ops && tree->ops->split_extent_hook)
++ return tree->ops->split_extent_hook(tree->mapping->host,
++ orig, split);
++ return 0;
++}
++
+ /*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+ {
+ struct rb_node *node;
++
++ split_cb(tree, orig, split);
++
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state, int bits, int wake,
+ int delete)
+ {
+- int ret = state->state & bits;
++ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
++ int ret = state->state & bits_to_clear;
+
+ if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
+ tree->dirty_bytes -= range;
+ }
+ clear_state_cb(tree, state, bits);
+- state->state &= ~bits;
++ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (delete || state->state == 0) {
+@@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
+ * bits were already set, or zero if none of the bits were already set.
+ */
+ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+- int bits, int wake, int delete, gfp_t mask)
++ int bits, int wake, int delete,
++ struct extent_state **cached_state,
++ gfp_t mask)
+ {
+ struct extent_state *state;
++ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
++ struct rb_node *next_node;
+ struct rb_node *node;
+ u64 last_end;
+ int err;
+@@ -488,6 +522,17 @@ again:
+ }
+
+ spin_lock(&tree->lock);
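++ /*
++ * use the state cached by a previous call when it still covers
++ * the start of the range, skipping the tree search
++ */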
++ if (cached_state) {
++ cached = *cached_state;
++ *cached_state = NULL;
++ cached_state = NULL;
++ if (cached && cached->tree && cached->start == start) {
++ atomic_dec(&cached->refs);
++ state = cached;
++ goto hit_next;
++ }
++ free_extent_state(cached);
++ }
+ /*
+ * this search will find the extents that end after
+ * our range starts
+@@ -496,6 +541,7 @@ again:
+ if (!node)
+ goto out;
+ state = rb_entry(node, struct extent_state, rb_node);
++hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+@@ -526,13 +572,11 @@ again:
+ if (err)
+ goto out;
+ if (state->end <= end) {
+- set |= clear_state_bit(tree, state, bits,
+- wake, delete);
++ set |= clear_state_bit(tree, state, bits, wake,
++ delete);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+- } else {
+- start = state->start;
+ }
+ goto search_again;
+ }
+@@ -547,19 +591,30 @@ again:
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+-
+ if (wake)
+ wake_up(&state->wq);
+- set |= clear_state_bit(tree, prealloc, bits,
+- wake, delete);
++
++ set |= clear_state_bit(tree, prealloc, bits, wake, delete);
++
+ prealloc = NULL;
+ goto out;
+ }
+
++ if (state->end < end && prealloc && !need_resched())
++ next_node = rb_next(&state->rb_node);
++ else
++ next_node = NULL;
++
+ set |= clear_state_bit(tree, state, bits, wake, delete);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
++ if (start <= end && next_node) {
++ state = rb_entry(next_node, struct extent_state,
++ rb_node);
++ if (state->start == start)
++ goto hit_next;
++ }
+ goto search_again;
+
+ out:
+@@ -641,40 +696,59 @@ out:
+ return 0;
+ }
+
+-static void set_state_bits(struct extent_io_tree *tree,
++static int set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ int bits)
+ {
++ int ret;
++
++ ret = set_state_cb(tree, state, bits);
++ if (ret)
++ return ret;
++
+ if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ tree->dirty_bytes += range;
+ }
+- set_state_cb(tree, state, bits);
+ state->state |= bits;
++
++ return 0;
++}
++
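++/*
++ * stash a reference to this extent state so a later call over the same
++ * range can find it without searching the tree
++ */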
++static void cache_state(struct extent_state *state,
++ struct extent_state **cached_ptr)
++{
++ if (cached_ptr && !(*cached_ptr)) {
++ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
++ *cached_ptr = state;
++ atomic_inc(&state->refs);
++ }
++ }
+ }
+
+ /*
+- * set some bits on a range in the tree. This may require allocations
+- * or sleeping, so the gfp mask is used to indicate what is allowed.
++ * set some bits on a range in the tree. This may require allocations or
++ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+- * range already has the desired bits set. The start of the existing
+- * range is returned in failed_start in this case.
++ * If any of the exclusive bits are set, this will fail with -EEXIST if some
++ * part of the range already has the desired bits set. The start of the
++ * existing range is returned in failed_start in this case.
+ *
+- * [start, end] is inclusive
+- * This takes the tree lock.
++ * [start, end] is inclusive. This takes the tree lock.
+ */
++
+ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+- int bits, int exclusive, u64 *failed_start,
++ int bits, int exclusive_bits, u64 *failed_start,
++ struct extent_state **cached_state,
+ gfp_t mask)
+ {
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err = 0;
+- int set;
+ u64 last_start;
+ u64 last_end;
++
+ again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+@@ -683,6 +757,13 @@ again:
+ }
+
+ spin_lock(&tree->lock);
++ if (cached_state && *cached_state) {
++ state = *cached_state;
++ if (state->start == start && state->tree) {
++ node = &state->rb_node;
++ goto hit_next;
++ }
++ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+@@ -694,8 +775,8 @@ again:
+ BUG_ON(err == -EEXIST);
+ goto out;
+ }
+-
+ state = rb_entry(node, struct extent_state, rb_node);
++hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+@@ -706,17 +787,32 @@ again:
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+- set = state->state & bits;
+- if (set && exclusive) {
++ struct rb_node *next_node;
++ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+- set_state_bits(tree, state, bits);
++
++ err = set_state_bits(tree, state, bits);
++ if (err)
++ goto out;
++
++ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
++
+ start = last_end + 1;
++ if (start < end && prealloc && !need_resched()) {
++ next_node = rb_next(node);
++ if (next_node) {
++ state = rb_entry(next_node, struct extent_state,
++ rb_node);
++ if (state->start == start)
++ goto hit_next;
++ }
++ }
+ goto search_again;
+ }
+
+@@ -737,8 +833,7 @@ again:
+ * desired bit on it.
+ */
+ if (state->start < start) {
+- set = state->state & bits;
+- if (exclusive && set) {
++ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+@@ -749,13 +844,14 @@ again:
+ if (err)
+ goto out;
+ if (state->end <= end) {
+- set_state_bits(tree, state, bits);
++ err = set_state_bits(tree, state, bits);
++ if (err)
++ goto out;
++ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+- } else {
+- start = state->start;
+ }
+ goto search_again;
+ }
+@@ -774,10 +870,13 @@ again:
+ this_end = last_start - 1;
+ err = insert_state(tree, prealloc, start, this_end,
+ bits);
+- prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+- if (err)
++ if (err) {
++ prealloc = NULL;
+ goto out;
++ }
++ cache_state(prealloc, cached_state);
++ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+@@ -788,8 +887,7 @@ again:
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+- set = state->state & bits;
+- if (exclusive && set) {
++ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+@@ -797,7 +895,12 @@ again:
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+- set_state_bits(tree, prealloc, bits);
++ err = set_state_bits(tree, prealloc, bits);
++ if (err) {
++ prealloc = NULL;
++ goto out;
++ }
++ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+@@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+- mask);
+-}
+-
+-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+- gfp_t mask)
+-{
+- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
++ NULL, mask);
+ }
+
+ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask)
+ {
+ return set_extent_bit(tree, start, end, bits, 0, NULL,
+- mask);
++ NULL, mask);
+ }
+
+ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask)
+ {
+- return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
++ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+ }
+
+ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+ return set_extent_bit(tree, start, end,
+- EXTENT_DELALLOC | EXTENT_DIRTY,
+- 0, NULL, mask);
++ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
++ 0, NULL, NULL, mask);
+ }
+
+ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+ return clear_extent_bit(tree, start, end,
+- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
+-}
+-
+-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+- gfp_t mask)
+-{
+- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
++ EXTENT_DIRTY | EXTENT_DELALLOC |
++ EXTENT_DO_ACCOUNTING, 0, 0,
++ NULL, mask);
+ }
+
+ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+ return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+- mask);
++ NULL, mask);
+ }
+
+ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
++ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
++ NULL, mask);
+ }
+
+ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+- mask);
++ NULL, mask);
+ }
+
+ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+ {
+- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
+-}
+-
+-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+- gfp_t mask)
+-{
+- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
+- 0, NULL, mask);
+-}
+-
+-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
+- u64 end, gfp_t mask)
+-{
+- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
++ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
++ NULL, mask);
+ }
+
+ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+@@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+ * either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
++ int bits, struct extent_state **cached_state, gfp_t mask)
+ {
+ int err;
+ u64 failed_start;
+ while (1) {
+- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+- &failed_start, mask);
++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
++ EXTENT_LOCKED, &failed_start,
++ cached_state, mask);
+ if (err == -EEXIST && (mask & __GFP_WAIT)) {
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ start = failed_start;
+@@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+ return err;
+ }
+
++int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
++{
++ return lock_extent_bits(tree, start, end, 0, NULL, mask);
++}
++
+ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+ int err;
+ u64 failed_start;
+
+- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+- &failed_start, mask);
++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
++ &failed_start, NULL, mask);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+- EXTENT_LOCKED, 1, 0, mask);
++ EXTENT_LOCKED, 1, 0, NULL, mask);
+ return 0;
+ }
+ return 1;
+ }
+
++int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
++ struct extent_state **cached, gfp_t mask)
++{
++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
++ mask);
++}
++
+ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+ {
+- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
++ mask);
+ }
+
+ /*
+@@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
+ page_cache_release(page);
+ index++;
+ }
+- set_extent_dirty(tree, start, end, GFP_NOFS);
+ return 0;
+ }
+
+@@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+ page_cache_release(page);
+ index++;
+ }
+- set_extent_writeback(tree, start, end, GFP_NOFS);
+ return 0;
+ }
+
+@@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 found;
++ struct extent_state *cached_state = NULL;
+ int ret;
+ int loops = 0;
+
+@@ -1269,6 +1365,7 @@ again:
+ /* some of the pages are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching
+ */
++ free_extent_state(cached_state);
+ if (!loops) {
+ unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+ max_bytes = PAGE_CACHE_SIZE - offset;
+@@ -1282,18 +1379,21 @@ again:
+ BUG_ON(ret);
+
+ /* step three, lock the state bits for the whole range */
+- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
++ lock_extent_bits(tree, delalloc_start, delalloc_end,
++ 0, &cached_state, GFP_NOFS);
+
+ /* then test to make sure it is all still delalloc */
+ ret = test_range_bit(tree, delalloc_start, delalloc_end,
+- EXTENT_DELALLOC, 1);
++ EXTENT_DELALLOC, 1, cached_state);
+ if (!ret) {
+- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
++ unlock_extent_cached(tree, delalloc_start, delalloc_end,
++ &cached_state, GFP_NOFS);
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start, delalloc_end);
+ cond_resched();
+ goto again;
+ }
++ free_extent_state(cached_state);
+ *start = delalloc_start;
+ *end = delalloc_end;
+ out_failed:
+@@ -1303,11 +1403,7 @@ out_failed:
+ int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+- int unlock_pages,
+- int clear_unlock,
+- int clear_delalloc, int clear_dirty,
+- int set_writeback,
+- int end_writeback)
++ unsigned long op)
+ {
+ int ret;
+ struct page *pages[16];
+@@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
+ int i;
+ int clear_bits = 0;
+
+- if (clear_unlock)
++ if (op & EXTENT_CLEAR_UNLOCK)
+ clear_bits |= EXTENT_LOCKED;
+- if (clear_dirty)
++ if (op & EXTENT_CLEAR_DIRTY)
+ clear_bits |= EXTENT_DIRTY;
+
+- if (clear_delalloc)
++ if (op & EXTENT_CLEAR_DELALLOC)
+ clear_bits |= EXTENT_DELALLOC;
+
+- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
++ if (op & EXTENT_CLEAR_ACCOUNTING)
++ clear_bits |= EXTENT_DO_ACCOUNTING;
++
++ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
++ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
++ EXTENT_SET_PRIVATE2)))
+ return 0;
+
+ while (nr_pages > 0) {
+@@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
++
++ if (op & EXTENT_SET_PRIVATE2)
++ SetPagePrivate2(pages[i]);
++
+ if (pages[i] == locked_page) {
+ page_cache_release(pages[i]);
+ continue;
+ }
+- if (clear_dirty)
++ if (op & EXTENT_CLEAR_DIRTY)
+ clear_page_dirty_for_io(pages[i]);
+- if (set_writeback)
++ if (op & EXTENT_SET_WRITEBACK)
+ set_page_writeback(pages[i]);
+- if (end_writeback)
++ if (op & EXTENT_END_WRITEBACK)
+ end_page_writeback(pages[i]);
+- if (unlock_pages)
++ if (op & EXTENT_CLEAR_UNLOCK_PAGE)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+@@ -1476,14 +1581,17 @@ out:
+ * range is found set.
+ */
+ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+- int bits, int filled)
++ int bits, int filled, struct extent_state *cached)
+ {
+ struct extent_state *state = NULL;
+ struct rb_node *node;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+- node = tree_search(tree, start);
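++ /* start from the cached state when it still matches the range */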
++ if (cached && cached->tree && cached->start == start)
++ node = &cached->rb_node;
++ else
++ node = tree_search(tree, start);
+ while (node && start <= end) {
+ state = rb_entry(node, struct extent_state, rb_node);
+
+@@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ bitset = 0;
+ break;
+ }
++
++ if (state->end == (u64)-1)
++ break;
++
+ start = state->end + 1;
+ if (start > end)
+ break;
+@@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
+ {
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
++ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+ SetPageUptodate(page);
+ return 0;
+ }
+@@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree,
+ {
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
++ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
+ unlock_page(page);
+ return 0;
+ }
+@@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree,
+ static int check_page_writeback(struct extent_io_tree *tree,
+ struct page *page)
+ {
+- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+- u64 end = start + PAGE_CACHE_SIZE - 1;
+- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
+- end_page_writeback(page);
++ end_page_writeback(page);
+ return 0;
+ }
+
+@@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
+ }
+
+ if (!uptodate) {
+- clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
++ clear_extent_uptodate(tree, start, end, GFP_NOFS);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+- clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+-
+ if (whole_page)
+ end_page_writeback(page);
+ else
+@@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
+ continue;
+ }
+ /* the get_extent function already copied into the page */
+- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
++ if (test_range_bit(tree, cur, cur_end,
++ EXTENT_UPTODATE, 1, NULL)) {
+ check_page_uptodate(tree, page);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ cur = cur + iosize;
+@@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ u64 iosize;
+ u64 unlock_start;
+ sector_t sector;
++ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ struct block_device *bdev;
+ int ret;
+@@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ delalloc_end = 0;
+ page_started = 0;
+ if (!epd->extent_locked) {
++ u64 delalloc_to_write = 0;
+ /*
+ * make sure the wbc mapping index is at least updated
+ * to this page.
+@@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ tree->ops->fill_delalloc(inode, page, delalloc_start,
+ delalloc_end, &page_started,
+ &nr_written);
++ /*
++ * delalloc_end is already one less than the total
++ * length, so we don't subtract one from
++ * PAGE_CACHE_SIZE
++ */
++ delalloc_to_write += (delalloc_end - delalloc_start +
++ PAGE_CACHE_SIZE) >>
++ PAGE_CACHE_SHIFT;
+ delalloc_start = delalloc_end + 1;
+ }
++ if (wbc->nr_to_write < delalloc_to_write) {
++ int thresh = 8192;
++
++ if (delalloc_to_write < thresh * 2)
++ thresh = delalloc_to_write;
++ wbc->nr_to_write = min_t(u64, delalloc_to_write,
++ thresh);
++ }
+
+ /* did the fill delalloc function already unlock and start
+ * the IO?
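The delalloc accounting in the hunk above turns an inclusive byte range into a page count: since delalloc_end is the last byte (already start + len - 1), adding a full PAGE_CACHE_SIZE before shifting rounds up without a further -1. A standalone check of the arithmetic, assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)

    int main(void)
    {
        unsigned long long delalloc_start = 0;
        unsigned long long delalloc_end = 3 * PAGE_SIZE - 1; /* last byte */

        unsigned long long pages =
            (delalloc_end - delalloc_start + PAGE_SIZE) >> PAGE_SHIFT;
        printf("%llu pages\n", pages);  /* 3 */

        delalloc_end = PAGE_SIZE;       /* one byte spills into page two */
        pages = (delalloc_end - delalloc_start + PAGE_SIZE) >> PAGE_SHIFT;
        printf("%llu pages\n", pages);  /* 2: rounds up, no extra -1 */
        return 0;
    }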
+@@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ goto done_unlocked;
+ }
+ }
+- lock_extent(tree, start, page_end, GFP_NOFS);
+-
+- unlock_start = start;
+-
+ if (tree->ops && tree->ops->writepage_start_hook) {
+ ret = tree->ops->writepage_start_hook(page, start,
+ page_end);
+ if (ret == -EAGAIN) {
+- unlock_extent(tree, start, page_end, GFP_NOFS);
+ redirty_page_for_writepage(wbc, page);
+ update_nr_written(page, wbc, nr_written);
+ unlock_page(page);
+@@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ update_nr_written(page, wbc, nr_written + 1);
+
+ end = page_end;
+- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
+- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
+-
+ if (last_byte <= start) {
+- clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+- unlock_extent(tree, start, page_end, GFP_NOFS);
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start,
+ page_end, NULL, 1);
+@@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ goto done;
+ }
+
+- set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+ blocksize = inode->i_sb->s_blocksize;
+
+ while (cur <= end) {
+ if (cur >= last_byte) {
+- clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, cur,
+ page_end, NULL, 1);
+@@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ */
+ if (compressed || block_start == EXTENT_MAP_HOLE ||
+ block_start == EXTENT_MAP_INLINE) {
+- clear_extent_dirty(tree, cur,
+- cur + iosize - 1, GFP_NOFS);
+-
+- unlock_extent(tree, unlock_start, cur + iosize - 1,
+- GFP_NOFS);
+-
+ /*
+ * end_io notification does not happen here for
+ * compressed extents
+@@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ }
+ /* leave this out until we have a page_mkwrite call */
+ if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+- EXTENT_DIRTY, 0)) {
++ EXTENT_DIRTY, 0, NULL)) {
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+
+- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+ if (tree->ops && tree->ops->writepage_io_hook) {
+ ret = tree->ops->writepage_io_hook(page, cur,
+ cur + iosize - 1);
+@@ -2309,12 +2415,12 @@ done:
+ set_page_writeback(page);
+ end_page_writeback(page);
+ }
+- if (unlock_start <= page_end)
+- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+ unlock_page(page);
+
+ done_unlocked:
+
++ /* drop our reference on any cached states */
++ free_extent_state(cached_state);
+ return 0;
+ }
+
+@@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
+ writepage_t writepage, void *data,
+ void (*flush_fn)(void *))
+ {
+- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
++ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+@@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
+ scanned = 1;
+ }
+ retry:
+- while (!done && (index <= end) &&
++ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY, min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+@@ -2412,12 +2518,15 @@ retry:
+ unlock_page(page);
+ ret = 0;
+ }
+- if (ret || wbc->nr_to_write <= 0)
+- done = 1;
+- if (wbc->nonblocking && bdi_write_congested(bdi)) {
+- wbc->encountered_congestion = 1;
++ if (ret)
+ done = 1;
+- }
++
++ /*
++ * the filesystem may choose to bump up nr_to_write.
++ * We have to make sure to honor the new nr_to_write
++ * at any time
++ */
++ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
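The loop above stops latching `done` when the write budget runs out and instead recomputes nr_to_write_done on every pass, so a filesystem that bumps wbc->nr_to_write back up mid-walk (as __extent_writepage now does) keeps the loop alive. The difference in miniature:

    #include <stdio.h>

    int main(void)
    {
        long nr_to_write = 2;
        int page;

        for (page = 0; page < 8; page++) {
            /* recomputed each iteration, never latched */
            int nr_to_write_done = nr_to_write <= 0;

            if (nr_to_write_done)
                break;
            printf("writing page %d\n", page);
            nr_to_write--;

            if (page == 1)
                nr_to_write += 3;   /* fs raised the budget mid-loop */
        }
        return 0;                   /* pages 0..4 get written */
    }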
+@@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
+ return 0;
+
+ lock_extent(tree, start, end, GFP_NOFS);
+- wait_on_extent_writeback(tree, start, end);
++ wait_on_page_writeback(page);
+ clear_extent_bit(tree, start, end,
+- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+- 1, 1, GFP_NOFS);
++ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
++ EXTENT_DO_ACCOUNTING,
++ 1, 1, NULL, GFP_NOFS);
+ return 0;
+ }
+
+@@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
+ !isnew && !PageUptodate(page) &&
+ (block_off_end > to || block_off_start < from) &&
+ !test_range_bit(tree, block_start, cur_end,
+- EXTENT_UPTODATE, 1)) {
++ EXTENT_UPTODATE, 1, NULL)) {
+ u64 sector;
+ u64 extent_offset = block_start - em->start;
+ size_t iosize;
+@@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
+ */
+ set_extent_bit(tree, block_start,
+ block_start + iosize - 1,
+- EXTENT_LOCKED, 0, NULL, GFP_NOFS);
++ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
+ ret = submit_extent_page(READ, tree, page,
+ sector, iosize, page_offset, em->bdev,
+ NULL, 1,
+@@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map,
+ int ret = 1;
+
+ if (test_range_bit(tree, start, end,
+- EXTENT_IOBITS | EXTENT_ORDERED, 0))
++ EXTENT_IOBITS, 0, NULL))
+ ret = 0;
+ else {
+ if ((mask & GFP_NOFS) == GFP_NOFS)
+ mask = GFP_NOFS;
+- clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+- 1, 1, mask);
++ /*
++ * at this point we can safely clear everything except the
++ * locked bit and the nodatasum bit
++ */
++ clear_extent_bit(tree, start, end,
++ ~(EXTENT_LOCKED | EXTENT_NODATASUM),
++ 0, 0, NULL, mask);
+ }
+ return ret;
+ }
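try_release_extent_state now clears by complement: everything except the bits that must survive release. A tiny demonstration of the mask idiom; the bit positions are illustrative, only loosely following extent_io.h:

    #include <stdio.h>

    #define EXTENT_LOCKED    (1 << 1)
    #define EXTENT_DELALLOC  (1 << 5)
    #define EXTENT_UPTODATE  (1 << 6)
    #define EXTENT_NODATASUM (1 << 10)

    int main(void)
    {
        int state = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE |
                    EXTENT_NODATASUM;

        /* everything except the two survivors */
        int clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM);

        state &= ~clear_bits;
        printf("0x%x\n", state);   /* only LOCKED and NODATASUM remain */
        return 0;
    }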
+@@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
+ u64 len;
+ while (start <= end) {
+ len = end - start + 1;
+- spin_lock(&map->lock);
++ write_lock(&map->lock);
+ em = lookup_extent_mapping(map, start, len);
+ if (!em || IS_ERR(em)) {
+- spin_unlock(&map->lock);
++ write_unlock(&map->lock);
+ break;
+ }
+ if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ em->start != start) {
+- spin_unlock(&map->lock);
++ write_unlock(&map->lock);
+ free_extent_map(em);
+ break;
+ }
+ if (!test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+- EXTENT_LOCKED | EXTENT_WRITEBACK |
+- EXTENT_ORDERED,
+- 0)) {
++ EXTENT_LOCKED | EXTENT_WRITEBACK,
++ 0, NULL)) {
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+ }
+ start = extent_map_end(em);
+- spin_unlock(&map->lock);
++ write_unlock(&map->lock);
+
+ /* once for us */
+ free_extent_map(em);
+@@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
+ int uptodate;
+ unsigned long index;
+
+- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
++ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
+ if (ret)
+ return 1;
+ while (start <= end) {
+@@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
+ return 1;
+
+ ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+- EXTENT_UPTODATE, 1);
++ EXTENT_UPTODATE, 1, NULL);
+ if (ret)
+ return ret;
+
+@@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
+ return 0;
+
+ if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+- EXTENT_UPTODATE, 1)) {
++ EXTENT_UPTODATE, 1, NULL)) {
+ return 0;
+ }
+
+diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
+index 5bc20ab..36de250 100644
+--- a/fs/btrfs/extent_io.h
++++ b/fs/btrfs/extent_io.h
+@@ -13,10 +13,9 @@
+ #define EXTENT_DEFRAG (1 << 6)
+ #define EXTENT_DEFRAG_DONE (1 << 7)
+ #define EXTENT_BUFFER_FILLED (1 << 8)
+-#define EXTENT_ORDERED (1 << 9)
+-#define EXTENT_ORDERED_METADATA (1 << 10)
+-#define EXTENT_BOUNDARY (1 << 11)
+-#define EXTENT_NODATASUM (1 << 12)
++#define EXTENT_BOUNDARY (1 << 9)
++#define EXTENT_NODATASUM (1 << 10)
++#define EXTENT_DO_ACCOUNTING (1 << 11)
+ #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+ /* flags for bio submission */
+@@ -27,6 +26,16 @@
+ #define EXTENT_BUFFER_BLOCKING 1
+ #define EXTENT_BUFFER_DIRTY 2
+
++/* these are flags for extent_clear_unlock_delalloc */
++#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
++#define EXTENT_CLEAR_UNLOCK 0x2
++#define EXTENT_CLEAR_DELALLOC 0x4
++#define EXTENT_CLEAR_DIRTY 0x8
++#define EXTENT_SET_WRITEBACK 0x10
++#define EXTENT_END_WRITEBACK 0x20
++#define EXTENT_SET_PRIVATE2 0x40
++#define EXTENT_CLEAR_ACCOUNTING 0x80
++
+ /*
+ * page->private values. Every page that is controlled by the extent
+ * map has page->private set to one.
+@@ -62,8 +71,13 @@ struct extent_io_ops {
+ struct extent_state *state, int uptodate);
+ int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits);
+- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
+- unsigned long old, unsigned long bits);
++ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
++ unsigned long bits);
++ int (*merge_extent_hook)(struct inode *inode,
++ struct extent_state *new,
++ struct extent_state *other);
++ int (*split_extent_hook)(struct inode *inode,
++ struct extent_state *orig, u64 split);
+ int (*write_cache_pages_lock_hook)(struct page *page);
+ };
+
+@@ -81,10 +95,14 @@ struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
++
++ /* ADD NEW ELEMENTS AFTER THIS */
+ struct extent_io_tree *tree;
+ wait_queue_head_t wq;
+ atomic_t refs;
+ unsigned long state;
++ u64 split_start;
++ u64 split_end;
+
+ /* for use by the FS */
+ u64 private;
+@@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask);
+ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
++ int bits, struct extent_state **cached, gfp_t mask);
+ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+@@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
+ u64 max_bytes, unsigned long bits);
+
+ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+- int bits, int filled);
++ int bits, int filled, struct extent_state *cached_state);
+ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask);
+ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+- int bits, int wake, int delete, gfp_t mask);
++ int bits, int wake, int delete, struct extent_state **cached,
++ gfp_t mask);
+ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask);
+ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+@@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
+ int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+- int unlock_page,
+- int clear_unlock,
+- int clear_delalloc, int clear_dirty,
+- int set_writeback,
+- int end_writeback);
++ unsigned long op);
+ #endif
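Several of the interfaces above grow a struct extent_state **cached out-parameter: the first call fills it in, later calls over the same range reuse it to skip the search, and the caller eventually drops the reference (as __extent_writepage's free_extent_state(cached_state) does). A minimal userspace sketch of that convention with a refcount; the names here are hypothetical, not kernel API:

    #include <stdio.h>
    #include <stdlib.h>

    struct cstate {
        unsigned long long start;
        int refs;
    };

    static void put_cstate(struct cstate *s)
    {
        if (s && --s->refs == 0)
            free(s);
    }

    static void op_with_cache(unsigned long long start, struct cstate **cached)
    {
        if (*cached && (*cached)->start == start) {
            printf("reused cached state for %llu\n", start);
            return;
        }
        if (*cached)
            put_cstate(*cached);      /* drop a stale reference */
        *cached = malloc(sizeof(**cached));
        (*cached)->start = start;
        (*cached)->refs = 1;          /* reference handed to the caller */
        printf("filled cache for %llu\n", start);
    }

    int main(void)
    {
        struct cstate *cached = NULL;

        op_with_cache(4096, &cached);  /* fills */
        op_with_cache(4096, &cached);  /* reuses, no search */
        put_cstate(cached);            /* caller drops its reference */
        return 0;
    }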
+diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
+index 30c9365..2c726b7 100644
+--- a/fs/btrfs/extent_map.c
++++ b/fs/btrfs/extent_map.c
+@@ -36,7 +36,7 @@ void extent_map_exit(void)
+ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
+ {
+ tree->map.rb_node = NULL;
+- spin_lock_init(&tree->lock);
++ rwlock_init(&tree->lock);
+ }
+
+ /**
+@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+ return 0;
+ }
+
++int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
++{
++ int ret = 0;
++ struct extent_map *merge = NULL;
++ struct rb_node *rb;
++ struct extent_map *em;
++
++ write_lock(&tree->lock);
++ em = lookup_extent_mapping(tree, start, len);
++
++ WARN_ON(!em || em->start != start);
++
++ if (!em)
++ goto out;
++
++ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
++
++ if (em->start != 0) {
++ rb = rb_prev(&em->rb_node);
++ if (rb)
++ merge = rb_entry(rb, struct extent_map, rb_node);
++ if (rb && mergable_maps(merge, em)) {
++ em->start = merge->start;
++ em->len += merge->len;
++ em->block_len += merge->block_len;
++ em->block_start = merge->block_start;
++ merge->in_tree = 0;
++ rb_erase(&merge->rb_node, &tree->map);
++ free_extent_map(merge);
++ }
++ }
++
++ rb = rb_next(&em->rb_node);
++ if (rb)
++ merge = rb_entry(rb, struct extent_map, rb_node);
++ if (rb && mergable_maps(em, merge)) {
++ em->len += merge->len;
++ em->block_len += merge->block_len;
++ rb_erase(&merge->rb_node, &tree->map);
++ merge->in_tree = 0;
++ free_extent_map(merge);
++ }
++
++ free_extent_map(em);
++out:
++ write_unlock(&tree->lock);
++ return ret;
++
++}
++
+ /**
+ * add_extent_mapping - add new extent map to the extent tree
+ * @tree: tree to insert new map in
+@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
+ ret = -EEXIST;
+ goto out;
+ }
+- assert_spin_locked(&tree->lock);
+ rb = tree_insert(&tree->map, em->start, &em->rb_node);
+ if (rb) {
+ ret = -EEXIST;
+@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ struct rb_node *next = NULL;
+ u64 end = range_end(start, len);
+
+- assert_spin_locked(&tree->lock);
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node && prev) {
+ em = rb_entry(prev, struct extent_map, rb_node);
+@@ -319,6 +367,54 @@ out:
+ }
+
+ /**
++ * search_extent_mapping - find a nearby extent map
++ * @tree: tree to lookup in
++ * @start: byte offset to start the search
++ * @len: length of the lookup range
++ *
++ * Find and return the first extent_map struct in @tree that intersects the
++ * [start, len] range.
++ *
++ * If one can't be found, any nearby extent may be returned
++ */
++struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
++ u64 start, u64 len)
++{
++ struct extent_map *em;
++ struct rb_node *rb_node;
++ struct rb_node *prev = NULL;
++ struct rb_node *next = NULL;
++
++ rb_node = __tree_search(&tree->map, start, &prev, &next);
++ if (!rb_node && prev) {
++ em = rb_entry(prev, struct extent_map, rb_node);
++ goto found;
++ }
++ if (!rb_node && next) {
++ em = rb_entry(next, struct extent_map, rb_node);
++ goto found;
++ }
++ if (!rb_node) {
++ em = NULL;
++ goto out;
++ }
++ if (IS_ERR(rb_node)) {
++ em = ERR_PTR(PTR_ERR(rb_node));
++ goto out;
++ }
++ em = rb_entry(rb_node, struct extent_map, rb_node);
++ goto found;
++
++found:
++ atomic_inc(&em->refs);
++out:
++ return em;
++}
++
++/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree: extent tree to remove from
+ * @em: extent map being removed
+@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+ int ret = 0;
+
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+- assert_spin_locked(&tree->lock);
+ rb_erase(&em->rb_node, &tree->map);
+ em->in_tree = 0;
+ return ret;
+diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
+index fb6eeef..ab6d74b 100644
+--- a/fs/btrfs/extent_map.h
++++ b/fs/btrfs/extent_map.h
+@@ -31,7 +31,7 @@ struct extent_map {
+
+ struct extent_map_tree {
+ struct rb_root map;
+- spinlock_t lock;
++ rwlock_t lock;
+ };
+
+ static inline u64 extent_map_end(struct extent_map *em)
+@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
+ void free_extent_map(struct extent_map *em);
+ int __init extent_map_init(void);
+ void extent_map_exit(void);
++int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
++struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
++ u64 start, u64 len);
+ #endif
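With the extent map tree lock now an rwlock, pure lookups such as the alloc_hint probe cow_file_range gains later in this patch can run concurrently under read_lock, while insert and remove still serialize under write_lock. A pthread sketch of the read/write split, with the tree reduced to a single field:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
    static unsigned long long block_start = 12345;  /* stands in for the tree */

    /* readers may overlap; writers are exclusive */
    static unsigned long long probe_alloc_hint(void)
    {
        unsigned long long hint;

        pthread_rwlock_rdlock(&tree_lock);
        hint = block_start;              /* search_extent_mapping() here */
        pthread_rwlock_unlock(&tree_lock);
        return hint;
    }

    static void insert_mapping(unsigned long long bs)
    {
        pthread_rwlock_wrlock(&tree_lock);
        block_start = bs;                /* add_extent_mapping() here */
        pthread_rwlock_unlock(&tree_lock);
    }

    int main(void)
    {
        insert_mapping(99999);
        printf("hint %llu\n", probe_alloc_hint());
        return 0;
    }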
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index 4b83397..4599113 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+ int err = 0;
+ int i;
+ struct inode *inode = fdentry(file)->d_inode;
+- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+- u64 hint_byte;
+ u64 num_bytes;
+ u64 start_pos;
+ u64 end_of_last_block;
+@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+ root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+ end_of_last_block = start_pos + num_bytes - 1;
++ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
++ if (err)
++ return err;
+
+- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
+- trans = btrfs_join_transaction(root, 1);
+- if (!trans) {
+- err = -ENOMEM;
+- goto out_unlock;
+- }
+- btrfs_set_trans_block_group(trans, inode);
+- hint_byte = 0;
+-
+- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
+-
+- /* check for reserved extents on each page, we don't want
+- * to reset the delalloc bit on things that already have
+- * extents reserved.
+- */
+- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+ SetPageUptodate(p);
+@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+ * at this time.
+ */
+ }
+- err = btrfs_end_transaction(trans, root);
+-out_unlock:
+- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
+ return err;
+ }
+
+@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ if (!split2)
+ split2 = alloc_extent_map(GFP_NOFS);
+
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+ break;
+ }
+ flags = em->flags;
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+- spin_unlock(&em_tree->lock);
+ if (em->start <= start &&
+ (!testend || em->start + em->len >= start + len)) {
+ free_extent_map(em);
++ write_unlock(&em_tree->lock);
+ break;
+ }
+ if (start < em->start) {
+@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ start = em->start + em->len;
+ }
+ free_extent_map(em);
++ write_unlock(&em_tree->lock);
+ continue;
+ }
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ free_extent_map(split);
+ split = NULL;
+ }
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+
+ /* once for us */
+ free_extent_map(em);
+@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, u64 end, u64 locked_end,
+- u64 inline_limit, u64 *hint_byte)
++ u64 inline_limit, u64 *hint_byte, int drop_cache)
+ {
+ u64 extent_end = 0;
+ u64 search_start = start;
+@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ int ret;
+
+ inline_limit = 0;
+- btrfs_drop_extent_cache(inode, start, end - 1, 0);
++ if (drop_cache)
++ btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+ path = btrfs_alloc_path();
+ if (!path)
+@@ -894,7 +878,8 @@ again:
+ btrfs_put_ordered_extent(ordered);
+
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
++ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
++ EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, GFP_NOFS);
+@@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
+ start_pos = pos;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
++
++ /* do the reserve before the mutex lock in case we have to do some
++ * flushing. We wouldn't deadlock, but this is more polite.
++ */
++ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
++ if (err)
++ goto out_nolock;
++
++ mutex_lock(&inode->i_mutex);
++
+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+- goto out_nolock;
++ goto out;
++
+ if (count == 0)
+- goto out_nolock;
++ goto out;
+
+ err = file_remove_suid(file);
+ if (err)
+- goto out_nolock;
++ goto out;
++
+ file_update_time(file);
+
+ pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+
+- mutex_lock(&inode->i_mutex);
++ /* generic_write_checks can change our pos */
++ start_pos = pos;
++
+ BTRFS_I(inode)->sequence++;
+ first_index = pos >> PAGE_CACHE_SHIFT;
+ last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+@@ -1047,6 +1046,7 @@ out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ err = ret;
++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+
+ out_nolock:
+ kfree(pages);
+@@ -1087,8 +1087,10 @@ out_nolock:
+ btrfs_end_transaction(trans, root);
+ else
+ btrfs_commit_transaction(trans, root);
+- } else {
++ } else if (ret != BTRFS_NO_LOG_SYNC) {
+ btrfs_commit_transaction(trans, root);
++ } else {
++ btrfs_end_transaction(trans, root);
+ }
+ }
+ if (file->f_flags & O_DIRECT) {
+@@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
+
++
++ /* we wait first, since the writeback may change the inode */
++ root->log_batch++;
++ /* the VFS called filemap_fdatawrite for us */
++ btrfs_wait_ordered_range(inode, 0, (u64)-1);
++ root->log_batch++;
++
+ /*
+ * check the transaction that last modified this inode
+ * and see if its already been committed
+@@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+ if (!BTRFS_I(inode)->last_trans)
+ goto out;
+
++ /*
++ * if the last transaction that changed this file was before
++ * the current transaction, we can bail out now without any
++ * syncing
++ */
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed) {
+@@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+- root->log_batch++;
+- filemap_fdatawrite(inode->i_mapping);
+- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+- root->log_batch++;
+-
+- if (datasync && !(inode->i_state & I_DIRTY_PAGES))
+- goto out;
+ /*
+ * ok we haven't committed the transaction yet, lets do a commit
+ */
+@@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+ */
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+- if (ret > 0) {
+- ret = btrfs_commit_transaction(trans, root);
+- } else {
+- ret = btrfs_sync_log(trans, root);
+- if (ret == 0)
+- ret = btrfs_end_transaction(trans, root);
+- else
++ if (ret != BTRFS_NO_LOG_SYNC) {
++ if (ret > 0) {
+ ret = btrfs_commit_transaction(trans, root);
++ } else {
++ ret = btrfs_sync_log(trans, root);
++ if (ret == 0)
++ ret = btrfs_end_transaction(trans, root);
++ else
++ ret = btrfs_commit_transaction(trans, root);
++ }
++ } else {
++ ret = btrfs_end_transaction(trans, root);
+ }
+ mutex_lock(&dentry->d_inode->i_mutex);
+ out:
+diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
+index 5edcee3..5c2caad 100644
+--- a/fs/btrfs/free-space-cache.c
++++ b/fs/btrfs/free-space-cache.c
+@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
+
+ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+ {
+- u64 max_bytes, possible_bytes;
++ u64 max_bytes;
++ u64 bitmap_bytes;
++ u64 extent_bytes;
+
+ /*
+ * The goal is to keep the total amount of memory used per 1gb of space
+@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+ max_bytes = MAX_CACHE_BYTES_PER_GIG *
+ (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+
+- possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
+- (sizeof(struct btrfs_free_space) *
+- block_group->extents_thresh);
++ /*
++ * we want to account for 1 more bitmap than what we have so we can make
++ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
++ * we add more bitmaps.
++ */
++ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+
+- if (possible_bytes > max_bytes) {
+- int extent_bytes = max_bytes -
+- (block_group->total_bitmaps * PAGE_CACHE_SIZE);
++ if (bitmap_bytes >= max_bytes) {
++ block_group->extents_thresh = 0;
++ return;
++ }
+
+- if (extent_bytes <= 0) {
+- block_group->extents_thresh = 0;
+- return;
+- }
++ /*
++ * we want the extent entry threshold to always be at most 1/2 the max
++ * bytes we can have, or whatever is less than that.
++ */
++ extent_bytes = max_bytes - bitmap_bytes;
++ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
+
+- block_group->extents_thresh = extent_bytes /
+- (sizeof(struct btrfs_free_space));
+- }
++ block_group->extents_thresh =
++ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
+ }
+
+ static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+ BUG_ON(block_group->total_bitmaps >= max_bitmaps);
+
+ info->offset = offset_to_bitmap(block_group, offset);
++ info->bytes = 0;
+ link_free_space(block_group, info);
+ block_group->total_bitmaps++;
+
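recalculate_thresholds above budgets free-space cache memory per GiB of block group: bitmaps are charged first (plus one spare page so growth stays in budget), extent entries get the remainder, clamped to half the budget. A worked version of the arithmetic; MAX_CACHE_BYTES_PER_GIG and the entry size are assumed here, not taken from this patch:

    #include <stdio.h>

    #define MAX_CACHE_BYTES_PER_GIG (32 * 1024)  /* assumed constant */
    #define PAGE_CACHE_SIZE 4096
    #define FREE_SPACE_ENTRY 48   /* rough sizeof(struct btrfs_free_space) */

    int main(void)
    {
        unsigned long long gigs = 1, total_bitmaps = 2;
        unsigned long long max_bytes = MAX_CACHE_BYTES_PER_GIG * gigs;

        /* charge one bitmap beyond what we hold */
        unsigned long long bitmap_bytes =
            (total_bitmaps + 1) * PAGE_CACHE_SIZE;

        if (bitmap_bytes >= max_bytes) {
            printf("extents_thresh = 0\n");
            return 0;
        }

        unsigned long long extent_bytes = max_bytes - bitmap_bytes;
        if (extent_bytes > max_bytes / 2)
            extent_bytes = max_bytes / 2;   /* at most half the budget */

        printf("extents_thresh = %llu\n", extent_bytes / FREE_SPACE_ENTRY);
        return 0;
    }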
+diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
+index 6b627c6..72ce3c1 100644
+--- a/fs/btrfs/inode-item.c
++++ b/fs/btrfs/inode-item.c
+@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ ptr = (unsigned long)(ref + 1);
+ ret = 0;
+ } else if (ret < 0) {
++ if (ret == -EOVERFLOW)
++ ret = -EMLINK;
+ goto out;
+ } else {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_inode_item));
+- if (ret == 0 && objectid > root->highest_inode)
+- root->highest_inode = objectid;
+ return ret;
+ }
+
+diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
+index 9abbced..c56eb59 100644
+--- a/fs/btrfs/inode-map.c
++++ b/fs/btrfs/inode-map.c
+@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+- *objectid = found_key.objectid;
++ *objectid = max_t(u64, found_key.objectid,
++ BTRFS_FIRST_FREE_OBJECTID - 1);
+ } else {
+- *objectid = BTRFS_FIRST_FREE_OBJECTID;
++ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+ }
+ ret = 0;
+ error:
+@@ -53,91 +54,27 @@ error:
+ return ret;
+ }
+
+-/*
+- * walks the btree of allocated inodes and find a hole.
+- */
+ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 dirid, u64 *objectid)
+ {
+- struct btrfs_path *path;
+- struct btrfs_key key;
+ int ret;
+- int slot = 0;
+- u64 last_ino = 0;
+- int start_found;
+- struct extent_buffer *l;
+- struct btrfs_key search_key;
+- u64 search_start = dirid;
+-
+ mutex_lock(&root->objectid_mutex);
+- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
+- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
+- *objectid = ++root->last_inode_alloc;
+- mutex_unlock(&root->objectid_mutex);
+- return 0;
+- }
+- path = btrfs_alloc_path();
+- BUG_ON(!path);
+- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
+- search_key.objectid = search_start;
+- search_key.type = 0;
+- search_key.offset = 0;
+-
+- start_found = 0;
+- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
+- if (ret < 0)
+- goto error;
+
+- while (1) {
+- l = path->nodes[0];
+- slot = path->slots[0];
+- if (slot >= btrfs_header_nritems(l)) {
+- ret = btrfs_next_leaf(root, path);
+- if (ret == 0)
+- continue;
+- if (ret < 0)
+- goto error;
+- if (!start_found) {
+- *objectid = search_start;
+- start_found = 1;
+- goto found;
+- }
+- *objectid = last_ino > search_start ?
+- last_ino : search_start;
+- goto found;
+- }
+- btrfs_item_key_to_cpu(l, &key, slot);
+- if (key.objectid >= search_start) {
+- if (start_found) {
+- if (last_ino < search_start)
+- last_ino = search_start;
+- if (key.objectid > last_ino) {
+- *objectid = last_ino;
+- goto found;
+- }
+- } else if (key.objectid > search_start) {
+- *objectid = search_start;
+- goto found;
+- }
+- }
+- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
+- break;
++ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
++ ret = btrfs_find_highest_inode(root, &root->highest_objectid);
++ if (ret)
++ goto out;
++ }
+
+- start_found = 1;
+- last_ino = key.objectid + 1;
+- path->slots[0]++;
++ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
++ ret = -ENOSPC;
++ goto out;
+ }
+- BUG_ON(1);
+-found:
+- btrfs_release_path(root, path);
+- btrfs_free_path(path);
+- BUG_ON(*objectid < search_start);
+- mutex_unlock(&root->objectid_mutex);
+- return 0;
+-error:
+- btrfs_release_path(root, path);
+- btrfs_free_path(path);
++
++ *objectid = ++root->highest_objectid;
++ ret = 0;
++out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+ }
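The hole-hunting btree walk is gone: object ids now come from a cached highest_objectid, lazily seeded from the highest key in the tree (note that btrfs_find_highest_inode above returns at least BTRFS_FIRST_FREE_OBJECTID - 1, so the first allocation lands exactly on BTRFS_FIRST_FREE_OBJECTID). A sketch of the simplified allocator with locking elided; the OBJECTID constants are assumed values:

    #include <stdio.h>

    #define FIRST_FREE_OBJECTID 256ULL            /* assumed header values */
    #define LAST_FREE_OBJECTID  ((unsigned long long)-256)

    static unsigned long long highest_objectid;   /* 0 until initialized */

    /* stand-in for btrfs_find_highest_inode()'s tree lookup */
    static unsigned long long find_highest_inode(void)
    {
        return FIRST_FREE_OBJECTID - 1;            /* empty root */
    }

    static int find_free_objectid(unsigned long long *objectid)
    {
        if (highest_objectid < FIRST_FREE_OBJECTID)
            highest_objectid = find_highest_inode();

        if (highest_objectid >= LAST_FREE_OBJECTID)
            return -1;                             /* -ENOSPC in the kernel */

        *objectid = ++highest_objectid;
        return 0;
    }

    int main(void)
    {
        unsigned long long id;

        if (!find_free_objectid(&id))
            printf("first inode: %llu\n", id);     /* 256 */
        return 0;
    }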
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 59cba18..f69e5e0 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
+ }
+
+ ret = btrfs_drop_extents(trans, root, inode, start,
+- aligned_end, aligned_end, start, &hint_byte);
++ aligned_end, aligned_end, start,
++ &hint_byte, 1);
+ BUG_ON(ret);
+
+ if (isize > actual_end)
+@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
+ inline_len, compressed_size,
+ compressed_pages);
+ BUG_ON(ret);
+- btrfs_drop_extent_cache(inode, start, aligned_end, 0);
++ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
+ return 0;
+ }
+
+@@ -423,9 +424,12 @@ again:
+ * and free up our temp pages.
+ */
+ extent_clear_unlock_delalloc(inode,
+- &BTRFS_I(inode)->io_tree,
+- start, end, NULL, 1, 0,
+- 0, 1, 1, 1);
++ &BTRFS_I(inode)->io_tree,
++ start, end, NULL,
++ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
++ EXTENT_CLEAR_DELALLOC |
++ EXTENT_CLEAR_ACCOUNTING |
++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
+ ret = 0;
+ goto free_pages_out;
+ }
+@@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ while (1) {
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+@@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
+ * clear dirty, set writeback and unlock the pages.
+ */
+ extent_clear_unlock_delalloc(inode,
+- &BTRFS_I(inode)->io_tree,
+- async_extent->start,
+- async_extent->start +
+- async_extent->ram_size - 1,
+- NULL, 1, 1, 0, 1, 1, 0);
++ &BTRFS_I(inode)->io_tree,
++ async_extent->start,
++ async_extent->start +
++ async_extent->ram_size - 1,
++ NULL, EXTENT_CLEAR_UNLOCK_PAGE |
++ EXTENT_CLEAR_UNLOCK |
++ EXTENT_CLEAR_DELALLOC |
++ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
+
+ ret = btrfs_submit_compressed_write(inode,
+ async_extent->start,
+@@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode,
+ start, end, 0, NULL);
+ if (ret == 0) {
+ extent_clear_unlock_delalloc(inode,
+- &BTRFS_I(inode)->io_tree,
+- start, end, NULL, 1, 1,
+- 1, 1, 1, 1);
++ &BTRFS_I(inode)->io_tree,
++ start, end, NULL,
++ EXTENT_CLEAR_UNLOCK_PAGE |
++ EXTENT_CLEAR_UNLOCK |
++ EXTENT_CLEAR_DELALLOC |
++ EXTENT_CLEAR_ACCOUNTING |
++ EXTENT_CLEAR_DIRTY |
++ EXTENT_SET_WRITEBACK |
++ EXTENT_END_WRITEBACK);
+ *nr_written = *nr_written +
+ (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+ *page_started = 1;
+@@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode,
+ BUG_ON(disk_num_bytes >
+ btrfs_super_total_bytes(&root->fs_info->super_copy));
+
++
++ read_lock(&BTRFS_I(inode)->extent_tree.lock);
++ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
++ start, num_bytes);
++ if (em) {
++ alloc_hint = em->block_start;
++ free_extent_map(em);
++ }
++ read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+ while (disk_num_bytes > 0) {
++ unsigned long op;
++
+ cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+ ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+ root->sectorsize, 0, alloc_hint,
+@@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode,
+ em = alloc_extent_map(GFP_NOFS);
+ em->start = start;
+ em->orig_start = em->start;
+-
+ ram_size = ins.offset;
+ em->len = ins.offset;
+
+@@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode,
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ while (1) {
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+@@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode,
+ /* we're not doing compressed IO, don't unlock the first
+ * page (which the caller expects to stay locked), don't
+ * clear any dirty bits and don't set any writeback bits
++ *
++ * Do set the Private2 bit so we know this page was properly
++ * set up for writepage
+ */
++ op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
++ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
++ EXTENT_SET_PRIVATE2;
++
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, start + ram_size - 1,
+- locked_page, unlock, 1,
+- 1, 0, 0, 0);
++ locked_page, op);
+ disk_num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
+ alloc_hint = ins.objectid + ins.offset;
+@@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+ u64 cur_end;
+ int limit = 10 * 1024 * 1042;
+
+- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
+- EXTENT_DELALLOC, 1, 0, GFP_NOFS);
++ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
++ 1, 0, NULL, GFP_NOFS);
+ while (start < end) {
+ async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+ async_cow->inode = inode;
+@@ -994,6 +1023,7 @@ next_slot:
+
+ if (found_key.offset > cur_offset) {
+ extent_end = found_key.offset;
++ extent_type = 0;
+ goto out_check;
+ }
+
+@@ -1080,9 +1110,9 @@ out_check:
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ while (1) {
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+@@ -1100,8 +1130,10 @@ out_check:
+ BUG_ON(ret);
+
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+- cur_offset, cur_offset + num_bytes - 1,
+- locked_page, 1, 1, 1, 0, 0, 0);
++ cur_offset, cur_offset + num_bytes - 1,
++ locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
++ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
++ EXTENT_SET_PRIVATE2);
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+@@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+ return ret;
+ }
+
++static int btrfs_split_extent_hook(struct inode *inode,
++ struct extent_state *orig, u64 split)
++{
++ struct btrfs_root *root = BTRFS_I(inode)->root;
++ u64 size;
++
++ if (!(orig->state & EXTENT_DELALLOC))
++ return 0;
++
++ size = orig->end - orig->start + 1;
++ if (size > root->fs_info->max_extent) {
++ u64 num_extents;
++ u64 new_size;
++
++ new_size = orig->end - split + 1;
++ num_extents = div64_u64(size + root->fs_info->max_extent - 1,
++ root->fs_info->max_extent);
++
++ /*
++ * if we break a large extent up then leave outstanding_extents
++ * be, since we've already accounted for the large extent.
++ */
++ if (div64_u64(new_size + root->fs_info->max_extent - 1,
++ root->fs_info->max_extent) < num_extents)
++ return 0;
++ }
++
++ spin_lock(&BTRFS_I(inode)->accounting_lock);
++ BTRFS_I(inode)->outstanding_extents++;
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
++
++ return 0;
++}
++
++/*
++ * extent_io.c merge_extent_hook, used to track merged delayed allocation
++ * extents so we can keep track of new extents that are just merged onto old
++ * extents, such as when we are doing sequential writes, so we can properly
++ * account for the metadata space we'll need.
++ */
++static int btrfs_merge_extent_hook(struct inode *inode,
++ struct extent_state *new,
++ struct extent_state *other)
++{
++ struct btrfs_root *root = BTRFS_I(inode)->root;
++ u64 new_size, old_size;
++ u64 num_extents;
++
++ /* not delalloc, ignore it */
++ if (!(other->state & EXTENT_DELALLOC))
++ return 0;
++
++ old_size = other->end - other->start + 1;
++ if (new->start < other->start)
++ new_size = other->end - new->start + 1;
++ else
++ new_size = new->end - other->start + 1;
++
++ /* we're not bigger than the max, unreserve the space and go */
++ if (new_size <= root->fs_info->max_extent) {
++ spin_lock(&BTRFS_I(inode)->accounting_lock);
++ BTRFS_I(inode)->outstanding_extents--;
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
++ return 0;
++ }
++
++ /*
++ * If we grew by another max_extent, just return, we want to keep that
++ * reserved amount.
++ */
++ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
++ root->fs_info->max_extent);
++ if (div64_u64(new_size + root->fs_info->max_extent - 1,
++ root->fs_info->max_extent) > num_extents)
++ return 0;
++
++ spin_lock(&BTRFS_I(inode)->accounting_lock);
++ BTRFS_I(inode)->outstanding_extents--;
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
++
++ return 0;
++}
++
+ /*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
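Both hooks above decide whether a split or merge crosses a reservation boundary by comparing ceiling divisions of the extent size by max_extent before and after. The div64_u64(size + max_extent - 1, max_extent) idiom is the usual ceil(size / max_extent); worked through:

    #include <stdio.h>

    static unsigned long long ceil_div(unsigned long long n,
                                       unsigned long long d)
    {
        return (n + d - 1) / d;  /* div64_u64(n + d - 1, d) in the kernel */
    }

    int main(void)
    {
        unsigned long long max_extent = 128 * 1024;
        unsigned long long old_size = 300 * 1024;  /* 3 reservations */
        unsigned long long new_size = 320 * 1024;  /* still 3: merge is free */

        printf("before: %llu after: %llu\n",
               ceil_div(old_size, max_extent),
               ceil_div(new_size, max_extent));

        /* growing past another max_extent boundary needs one more */
        printf("grown:  %llu\n", ceil_div(400 * 1024, max_extent));
        return 0;
    }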
+@@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits)
+ {
++
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testing for the DELALLOC
+@@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+ */
+ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
++
++ spin_lock(&BTRFS_I(inode)->accounting_lock);
++ BTRFS_I(inode)->outstanding_extents++;
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+ spin_lock(&root->fs_info->delalloc_lock);
+ BTRFS_I(inode)->delalloc_bytes += end - start + 1;
+@@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+ /*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
+-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+- unsigned long old, unsigned long bits)
++static int btrfs_clear_bit_hook(struct inode *inode,
++ struct extent_state *state, unsigned long bits)
+ {
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
++ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
++ if (bits & EXTENT_DO_ACCOUNTING) {
++ spin_lock(&BTRFS_I(inode)->accounting_lock);
++ BTRFS_I(inode)->outstanding_extents--;
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
++ }
++
+ spin_lock(&root->fs_info->delalloc_lock);
+- if (end - start + 1 > root->fs_info->delalloc_bytes) {
++ if (state->end - state->start + 1 >
++ root->fs_info->delalloc_bytes) {
+ printk(KERN_INFO "btrfs warning: delalloc account "
+ "%llu %llu\n",
+- (unsigned long long)end - start + 1,
++ (unsigned long long)
++ state->end - state->start + 1,
+ (unsigned long long)
+ root->fs_info->delalloc_bytes);
+ btrfs_delalloc_free_space(root, inode, (u64)-1);
+@@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+ BTRFS_I(inode)->delalloc_bytes = 0;
+ } else {
+ btrfs_delalloc_free_space(root, inode,
+- end - start + 1);
+- root->fs_info->delalloc_bytes -= end - start + 1;
+- BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
++ state->end -
++ state->start + 1);
++ root->fs_info->delalloc_bytes -= state->end -
++ state->start + 1;
++ BTRFS_I(inode)->delalloc_bytes -= state->end -
++ state->start + 1;
+ }
+ if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+ !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+@@ -1374,10 +1506,8 @@ again:
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+
+ /* already ordered? We're done */
+- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+- EXTENT_ORDERED, 0)) {
++ if (PagePrivate2(page))
+ goto out;
+- }
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+@@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+ struct inode *inode = page->mapping->host;
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+- int ret;
+
+- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+- EXTENT_ORDERED, 0);
+- if (ret)
++ /* this page is properly in the ordered list */
++ if (TestClearPagePrivate2(page))
+ return 0;
+
+ if (PageChecked(page))
+@@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+ BUG_ON(!path);
+
+ path->leave_spinning = 1;
++
++ /*
++ * we may be replacing one extent in the tree with another.
++ * The new extent is pinned in the extent map, and we don't want
++ * to drop it from the cache until it is completely in the btree.
++ *
++ * So, tell btrfs_drop_extents to leave this extent in the cache.
++ * The caller is expected to unpin it and allow it to be merged
++ * with the others.
++ */
+ ret = btrfs_drop_extents(trans, root, inode, file_pos,
+ file_pos + num_bytes, locked_end,
+- file_pos, &hint);
++ file_pos, &hint, 0);
+ BUG_ON(ret);
+
+ ins.objectid = inode->i_ino;
+@@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+ btrfs_mark_buffer_dirty(leaf);
+
+ inode_add_bytes(inode, num_bytes);
+- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
+
+ ins.objectid = disk_bytenr;
+ ins.offset = disk_num_bytes;
+@@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+ ordered_extent->len,
+ compressed, 0, 0,
+ BTRFS_FILE_EXTENT_REG);
++ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
++ ordered_extent->file_offset,
++ ordered_extent->len);
+ BUG_ON(ret);
+ }
+ unlock_extent(io_tree, ordered_extent->file_offset,
+@@ -1623,6 +1763,7 @@ nocow:
+ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate)
+ {
++ ClearPagePrivate2(page);
+ return btrfs_finish_ordered_io(page->mapping->host, start, end);
+ }
+
+@@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
+ failrec->last_mirror = 0;
+ failrec->bio_flags = 0;
+
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (em->start > start || em->start + em->len < start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ if (!em || IS_ERR(em)) {
+ kfree(failrec);
+@@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+ return 0;
+
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
++ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
+ clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
+ GFP_NOFS);
+ return 0;
+@@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+ return ret;
+ }
+
++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct inode *dir, u64 objectid,
++ const char *name, int name_len)
++{
++ struct btrfs_path *path;
++ struct extent_buffer *leaf;
++ struct btrfs_dir_item *di;
++ struct btrfs_key key;
++ u64 index;
++ int ret;
++
++ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
++
++ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
++ name, name_len, -1);
++ BUG_ON(!di || IS_ERR(di));
++
++ leaf = path->nodes[0];
++ btrfs_dir_item_key_to_cpu(leaf, di, &key);
++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
++ ret = btrfs_delete_one_dir_name(trans, root, path, di);
++ BUG_ON(ret);
++ btrfs_release_path(root, path);
++
++ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
++ objectid, root->root_key.objectid,
++ dir->i_ino, &index, name, name_len);
++ if (ret < 0) {
++ BUG_ON(ret != -ENOENT);
++ di = btrfs_search_dir_index_item(root, path, dir->i_ino,
++ name, name_len);
++ BUG_ON(!di || IS_ERR(di));
++
++ leaf = path->nodes[0];
++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++ btrfs_release_path(root, path);
++ index = key.offset;
++ }
++
++ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
++ index, name, name_len, -1);
++ BUG_ON(!di || IS_ERR(di));
++
++ leaf = path->nodes[0];
++ btrfs_dir_item_key_to_cpu(leaf, di, &key);
++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
++ ret = btrfs_delete_one_dir_name(trans, root, path, di);
++ BUG_ON(ret);
++ btrfs_release_path(root, path);
++
++ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ ret = btrfs_update_inode(trans, root, dir);
++ BUG_ON(ret);
++ dir->i_sb->s_dirt = 1;
++
++ btrfs_free_path(path);
++ return 0;
++}
++
+ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+ {
+ struct inode *inode = dentry->d_inode;
+@@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+ struct btrfs_trans_handle *trans;
+ unsigned long nr = 0;
+
+- /*
+- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
+- * the root of a subvolume or snapshot
+- */
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
+- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
++ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return -ENOTEMPTY;
+- }
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, dir);
+
++ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
++ err = btrfs_unlink_subvol(trans, root, dir,
++ BTRFS_I(inode)->location.objectid,
++ dentry->d_name.name,
++ dentry->d_name.len);
++ goto out;
++ }
++
+ err = btrfs_orphan_add(trans, inode);
+ if (err)
+- goto fail_trans;
++ goto out;
+
+ /* now the directory is empty */
+ err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ dentry->d_name.name, dentry->d_name.len);
+ if (!err)
+ btrfs_i_size_write(inode, 0);
+-
+-fail_trans:
++out:
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction_throttle(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+@@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+
+ if ((offset & (blocksize - 1)) == 0)
+ goto out;
++ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
++ if (ret)
++ goto out;
++
++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
++ if (ret)
++ goto out;
+
+ ret = -ENOMEM;
+ again:
+ page = grab_cache_page(mapping, index);
+- if (!page)
++ if (!page) {
++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ goto out;
++ }
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+@@ -2864,7 +3080,16 @@ again:
+ goto again;
+ }
+
+- btrfs_set_extent_delalloc(inode, page_start, page_end);
++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
++ GFP_NOFS);
++
++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
++ if (ret) {
++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
++ goto out_unlock;
++ }
++
+ ret = 0;
+ if (offset != PAGE_CACHE_SIZE) {
+ kaddr = kmap(page);
+@@ -2877,6 +3102,9 @@ again:
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+ out_unlock:
++ if (ret)
++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ unlock_page(page);
+ page_cache_release(page);
+ out:
+@@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
+ u64 last_byte;
+ u64 cur_offset;
+ u64 hole_size;
+- int err;
++ int err = 0;
+
+ if (size <= hole_start)
+ return 0;
+
+- err = btrfs_check_metadata_free_space(root);
++ err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ if (err)
+ return err;
+
+- btrfs_truncate_page(inode->i_mapping, inode->i_size);
+-
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ btrfs_wait_ordered_range(inode, hole_start,
+@@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
+ cur_offset,
+ cur_offset + hole_size,
+ block_end,
+- cur_offset, &hint_byte);
++ cur_offset, &hint_byte, 1);
+ if (err)
+ break;
++
++ err = btrfs_reserve_metadata_space(root, 1);
++ if (err)
++ break;
++
+ err = btrfs_insert_file_extent(trans, root,
+ inode->i_ino, cur_offset, 0,
+ 0, hole_size, 0, hole_size,
+ 0, 0, 0);
+ btrfs_drop_extent_cache(inode, hole_start,
+ last_byte - 1, 0);
++ btrfs_unreserve_metadata_space(root, 1);
+ }
+ free_extent_map(em);
+ cur_offset = last_byte;
+@@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode)
+ }
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+
++ if (inode->i_nlink > 0) {
++ BUG_ON(btrfs_root_refs(&root->root_item) != 0);
++ goto no_delete;
++ }
++
+ btrfs_i_size_write(inode, 0);
+ trans = btrfs_join_transaction(root, 1);
+
+@@ -3070,29 +3307,67 @@ out_err:
+ * is kind of like crossing a mount point.
+ */
+ static int fixup_tree_root_location(struct btrfs_root *root,
+- struct btrfs_key *location,
+- struct btrfs_root **sub_root,
+- struct dentry *dentry)
++ struct inode *dir,
++ struct dentry *dentry,
++ struct btrfs_key *location,
++ struct btrfs_root **sub_root)
+ {
+- struct btrfs_root_item *ri;
++ struct btrfs_path *path;
++ struct btrfs_root *new_root;
++ struct btrfs_root_ref *ref;
++ struct extent_buffer *leaf;
++ int ret;
++ int err = 0;
+
+- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
+- return 0;
+- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+- return 0;
++ path = btrfs_alloc_path();
++ if (!path) {
++ err = -ENOMEM;
++ goto out;
++ }
+
+- *sub_root = btrfs_read_fs_root(root->fs_info, location,
+- dentry->d_name.name,
+- dentry->d_name.len);
+- if (IS_ERR(*sub_root))
+- return PTR_ERR(*sub_root);
++ err = -ENOENT;
++ ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
++ BTRFS_I(dir)->root->root_key.objectid,
++ location->objectid);
++ if (ret) {
++ if (ret < 0)
++ err = ret;
++ goto out;
++ }
+
+- ri = &(*sub_root)->root_item;
+- location->objectid = btrfs_root_dirid(ri);
+- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+- location->offset = 0;
++ leaf = path->nodes[0];
++ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
++ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
++ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
++ goto out;
+
+- return 0;
++ ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
++ (unsigned long)(ref + 1),
++ dentry->d_name.len);
++ if (ret)
++ goto out;
++
++ btrfs_release_path(root->fs_info->tree_root, path);
++
++ new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
++ if (IS_ERR(new_root)) {
++ err = PTR_ERR(new_root);
++ goto out;
++ }
++
++ if (btrfs_root_refs(&new_root->root_item) == 0) {
++ err = -ENOENT;
++ goto out;
++ }
++
++ *sub_root = new_root;
++ location->objectid = btrfs_root_dirid(&new_root->root_item);
++ location->type = BTRFS_INODE_ITEM_KEY;
++ location->offset = 0;
++ err = 0;
++out:
++ btrfs_free_path(path);
++ return err;
+ }
+
+ static void inode_tree_add(struct inode *inode)
+@@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode)
+ struct btrfs_inode *entry;
+ struct rb_node **p;
+ struct rb_node *parent;
+-
+ again:
+ p = &root->inode_tree.rb_node;
+ parent = NULL;
+
++ if (hlist_unhashed(&inode->i_hash))
++ return;
++
+ spin_lock(&root->inode_lock);
+ while (*p) {
+ parent = *p;
+@@ -3132,13 +3409,87 @@ again:
+ static void inode_tree_del(struct inode *inode)
+ {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
++ int empty = 0;
+
+ spin_lock(&root->inode_lock);
+ if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
+ rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
++ empty = RB_EMPTY_ROOT(&root->inode_tree);
++ }
++ spin_unlock(&root->inode_lock);
++
++ if (empty && btrfs_root_refs(&root->root_item) == 0) {
++ synchronize_srcu(&root->fs_info->subvol_srcu);
++ spin_lock(&root->inode_lock);
++ empty = RB_EMPTY_ROOT(&root->inode_tree);
++ spin_unlock(&root->inode_lock);
++ if (empty)
++ btrfs_add_dead_root(root);
++ }
++}
++
++int btrfs_invalidate_inodes(struct btrfs_root *root)
++{
++ struct rb_node *node;
++ struct rb_node *prev;
++ struct btrfs_inode *entry;
++ struct inode *inode;
++ u64 objectid = 0;
++
++ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
++
++ spin_lock(&root->inode_lock);
++again:
++ node = root->inode_tree.rb_node;
++ prev = NULL;
++ while (node) {
++ prev = node;
++ entry = rb_entry(node, struct btrfs_inode, rb_node);
++
++ if (objectid < entry->vfs_inode.i_ino)
++ node = node->rb_left;
++ else if (objectid > entry->vfs_inode.i_ino)
++ node = node->rb_right;
++ else
++ break;
++ }
++ if (!node) {
++ while (prev) {
++ entry = rb_entry(prev, struct btrfs_inode, rb_node);
++ if (objectid <= entry->vfs_inode.i_ino) {
++ node = prev;
++ break;
++ }
++ prev = rb_next(prev);
++ }
++ }
++ while (node) {
++ entry = rb_entry(node, struct btrfs_inode, rb_node);
++ objectid = entry->vfs_inode.i_ino + 1;
++ inode = igrab(&entry->vfs_inode);
++ if (inode) {
++ spin_unlock(&root->inode_lock);
++ if (atomic_read(&inode->i_count) > 1)
++ d_prune_aliases(inode);
++ /*
++ * btrfs_drop_inode will remove it from
++ * the inode cache when its usage count
++ * hits zero.
++ */
++ iput(inode);
++ cond_resched();
++ spin_lock(&root->inode_lock);
++ goto again;
++ }
++
++ if (cond_resched_lock(&root->inode_lock))
++ goto again;
++
++ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
++ return 0;
+ }
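The scan above is a standard restart idiom: d_prune_aliases() and iput() can sleep, so the spinlock must be dropped before calling them, and once it is dropped the rb-tree may change underneath us. The loop therefore remembers a resume key (objectid = i_ino + 1) and re-walks from scratch. The idiom in isolation, with hypothetical names standing in for the btrfs specifics:

    spin_lock(&lock);
    again:
            node = first_node_ge(&tree, resume_key);   /* hypothetical */
            while (node) {
                    entry = rb_entry(node, struct item, rb_node);
                    resume_key = entry->key + 1;       /* where to resume */
                    if (needs_blocking_work(entry)) {  /* hypothetical */
                            spin_unlock(&lock);
                            do_blocking_work(entry);   /* e.g. iput() */
                            spin_lock(&lock);
                            goto again;                /* tree may have changed */
                    }
                    node = rb_next(node);
            }
            spin_unlock(&lock);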
+
+ static noinline void init_btrfs_i(struct inode *inode)
+@@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode)
+ bi->generation = 0;
+ bi->sequence = 0;
+ bi->last_trans = 0;
++ bi->last_sub_trans = 0;
+ bi->logged_trans = 0;
+ bi->delalloc_bytes = 0;
+ bi->reserved_bytes = 0;
+@@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ return inode;
+ }
+
++static struct inode *new_simple_dir(struct super_block *s,
++ struct btrfs_key *key,
++ struct btrfs_root *root)
++{
++ struct inode *inode = new_inode(s);
++
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++
++ init_btrfs_i(inode);
++
++ BTRFS_I(inode)->root = root;
++ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
++ BTRFS_I(inode)->dummy_inode = 1;
++
++ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
++ inode->i_op = &simple_dir_inode_operations;
++ inode->i_fop = &simple_dir_operations;
++ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++
++ return inode;
++}
++
+ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+ {
+ struct inode *inode;
+- struct btrfs_inode *bi = BTRFS_I(dir);
+- struct btrfs_root *root = bi->root;
++ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *sub_root = root;
+ struct btrfs_key location;
++ int index;
+ int ret;
+
++ dentry->d_op = &btrfs_dentry_operations;
++
+ if (dentry->d_name.len > BTRFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+@@ -3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+- inode = NULL;
+- if (location.objectid) {
+- ret = fixup_tree_root_location(root, &location, &sub_root,
+- dentry);
+- if (ret < 0)
+- return ERR_PTR(ret);
+- if (ret > 0)
+- return ERR_PTR(-ENOENT);
++ if (location.objectid == 0)
++ return NULL;
++
++ if (location.type == BTRFS_INODE_ITEM_KEY) {
++ inode = btrfs_iget(dir->i_sb, &location, root);
++ return inode;
++ }
++
++ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
++
++ index = srcu_read_lock(&root->fs_info->subvol_srcu);
++ ret = fixup_tree_root_location(root, dir, dentry,
++ &location, &sub_root);
++ if (ret < 0) {
++ if (ret != -ENOENT)
++ inode = ERR_PTR(ret);
++ else
++ inode = new_simple_dir(dir->i_sb, &location, sub_root);
++ } else {
+ inode = btrfs_iget(dir->i_sb, &location, sub_root);
+- if (IS_ERR(inode))
+- return ERR_CAST(inode);
+ }
++ srcu_read_unlock(&root->fs_info->subvol_srcu, index);
++
+ return inode;
+ }
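The srcu_read_lock()/srcu_read_unlock() pair above is what keeps a subvolume root usable across fixup_tree_root_location() and btrfs_iget(): the teardown side (inode_tree_del() earlier in this patch) calls synchronize_srcu() before treating a root as dead, so it cannot finish while any reader is inside the window. The read-side shape, reduced to its essentials:

    int idx;

    idx = srcu_read_lock(&fs_info->subvol_srcu);
    /* a subvolume root looked up here stays valid: a concurrent
       destroyer blocks in synchronize_srcu() until every reader
       has left this section */
    srcu_read_unlock(&fs_info->subvol_srcu, idx);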
+
++static int btrfs_dentry_delete(struct dentry *dentry)
++{
++ struct btrfs_root *root;
++
++ if (!dentry->d_inode && !IS_ROOT(dentry))
++ dentry = dentry->d_parent;
++
++ if (dentry->d_inode) {
++ root = BTRFS_I(dentry->d_inode)->root;
++ if (btrfs_root_refs(&root->root_item) == 0)
++ return 1;
++ }
++ return 0;
++}
++
+ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+ {
+ struct inode *inode;
+
+- if (dentry->d_name.len > BTRFS_NAME_LEN)
+- return ERR_PTR(-ENAMETOOLONG);
+-
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+@@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+ if (ret != 0)
+ goto fail;
+
+- if (objectid > root->highest_inode)
+- root->highest_inode = objectid;
+-
+ inode->i_uid = current_fsuid();
+
+ if (dir && (dir->i_mode & S_ISGID)) {
+@@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index)
+ {
+- int ret;
++ int ret = 0;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+
+- key.objectid = inode->i_ino;
+- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+- key.offset = 0;
++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
++ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
++ } else {
++ key.objectid = inode->i_ino;
++ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
++ key.offset = 0;
++ }
++
++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
++ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
++ key.objectid, root->root_key.objectid,
++ parent_inode->i_ino,
++ index, name, name_len);
++ } else if (add_backref) {
++ ret = btrfs_insert_inode_ref(trans, root,
++ name, name_len, inode->i_ino,
++ parent_inode->i_ino, index);
++ }
+
+- ret = btrfs_insert_dir_item(trans, root, name, name_len,
+- parent_inode->i_ino,
+- &key, btrfs_inode_type(inode),
+- index);
+ if (ret == 0) {
+- if (add_backref) {
+- ret = btrfs_insert_inode_ref(trans, root,
+- name, name_len,
+- inode->i_ino,
+- parent_inode->i_ino,
+- index);
+- }
++ ret = btrfs_insert_dir_item(trans, root, name, name_len,
++ parent_inode->i_ino, &key,
++ btrfs_inode_type(inode), index);
++ BUG_ON(ret);
++
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ name_len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+@@ -3732,11 +4139,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+- err = btrfs_check_metadata_free_space(root);
++ /*
++ * 2 for inode item and ref
++ * 2 for dir items
++ * 1 for xattr if selinux is on
++ */
++ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+- goto fail;
++ return err;
+
+ trans = btrfs_start_transaction(root, 1);
++ if (!trans)
++ goto fail;
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+@@ -3774,6 +4188,7 @@ out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+ fail:
++ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
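The mknod change above establishes the pattern this series repeats for create, link, mkdir, symlink and rename: reserve a fixed number of metadata units before opening the transaction, then release them unconditionally on every exit path. A condensed skeleton (the surrounding function is illustrative; the two reservation calls are the API added by this series):

    static int btrfs_namespace_op(struct btrfs_root *root /* , ... */)
    {
            int err;

            /* one unit per tree item the operation may insert */
            err = btrfs_reserve_metadata_space(root, 5);
            if (err)
                    return err;     /* nothing reserved yet, nothing to undo */

            /* start the transaction, do the work, end the transaction */

            /* released on success and failure alike */
            btrfs_unreserve_metadata_space(root, 5);
            return err;
    }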
+@@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
+ u64 objectid;
+ u64 index = 0;
+
+- err = btrfs_check_metadata_free_space(root);
++ /*
++ * 2 for inode item and ref
++ * 2 for dir items
++ * 1 for xattr if selinux is on
++ */
++ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+- goto fail;
++ return err;
++
+ trans = btrfs_start_transaction(root, 1);
++ if (!trans)
++ goto fail;
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+@@ -3838,6 +4261,7 @@ out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+ fail:
++ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+@@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+ if (inode->i_nlink == 0)
+ return -ENOENT;
+
+- btrfs_inc_nlink(inode);
+- err = btrfs_check_metadata_free_space(root);
++ /*
++ * 1 item for inode ref
++ * 2 items for dir items
++ */
++ err = btrfs_reserve_metadata_space(root, 3);
+ if (err)
+- goto fail;
++ return err;
++
++ btrfs_inc_nlink(inode);
++
+ err = btrfs_set_inode_index(dir, &index);
+ if (err)
+ goto fail;
+@@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+
+ err = btrfs_add_nondir(trans, dentry, inode, 1, index);
+
+- if (err)
+- drop_inode = 1;
+-
+- btrfs_update_inode_block_group(trans, dir);
+- err = btrfs_update_inode(trans, root, inode);
+-
+- if (err)
++ if (err) {
+ drop_inode = 1;
++ } else {
++ btrfs_update_inode_block_group(trans, dir);
++ err = btrfs_update_inode(trans, root, inode);
++ BUG_ON(err);
++ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
++ }
+
+ nr = trans->blocks_used;
+-
+- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+ btrfs_end_transaction_throttle(trans, root);
+ fail:
++ btrfs_unreserve_metadata_space(root, 3);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+@@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+ u64 index = 0;
+ unsigned long nr = 1;
+
+- err = btrfs_check_metadata_free_space(root);
++ /*
++ * 2 items for inode and ref
++ * 2 items for dir items
++ * 1 for xattr if selinux is on
++ */
++ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+- goto out_unlock;
++ return err;
+
+ trans = btrfs_start_transaction(root, 1);
+- btrfs_set_trans_block_group(trans, dir);
+-
+- if (IS_ERR(trans)) {
+- err = PTR_ERR(trans);
++ if (!trans) {
++ err = -ENOMEM;
+ goto out_unlock;
+ }
++ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+@@ -3967,6 +4400,7 @@ out_fail:
+ btrfs_end_transaction_throttle(trans, root);
+
+ out_unlock:
++ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_on_err)
+ iput(inode);
+ btrfs_btree_balance_dirty(root, nr);
+@@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+ int compressed;
+
+ again:
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em)
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ if (em) {
+ if (em->start > start || em->start + em->len <= start)
+@@ -4215,6 +4649,11 @@ again:
+ map = kmap(page);
+ read_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
++ if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
++ memset(map + pg_offset + copy_size, 0,
++ PAGE_CACHE_SIZE - pg_offset -
++ copy_size);
++ }
+ kunmap(page);
+ }
+ flush_dcache_page(page);
+@@ -4259,7 +4698,7 @@ insert:
+ }
+
+ err = 0;
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ /* it is possible that someone inserted the extent into the tree
+ * while we had the lock dropped. It is also possible that
+@@ -4299,7 +4738,7 @@ insert:
+ err = 0;
+ }
+ }
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+ out:
+ if (path)
+ btrfs_free_path(path);
+@@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+ u64 page_start = page_offset(page);
+ u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
++
++ /*
++ * we have the page locked, so new writeback can't start,
++ * and the dirty bit won't be cleared while we are here.
++ *
++ * Wait for IO on this page so that we can safely clear
++ * the PagePrivate2 bit and do ordered accounting
++ */
+ wait_on_page_writeback(page);
++
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (offset) {
+ btrfs_releasepage(page, GFP_NOFS);
+ return;
+ }
+-
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+ page_offset(page));
+@@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+ */
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+- EXTENT_LOCKED, 1, 0, GFP_NOFS);
+- btrfs_finish_ordered_io(page->mapping->host,
+- page_start, page_end);
++ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
++ NULL, GFP_NOFS);
++ /*
++ * whoever cleared the private bit is responsible
++ * for the finish_ordered_io
++ */
++ if (TestClearPagePrivate2(page)) {
++ btrfs_finish_ordered_io(page->mapping->host,
++ page_start, page_end);
++ }
+ btrfs_put_ordered_extent(ordered);
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ }
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+- EXTENT_ORDERED,
+- 1, 1, GFP_NOFS);
++ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
+ __btrfs_releasepage(page, GFP_NOFS);
+
+ ClearPageChecked(page);
+@@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+ goto out;
+ }
+
++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
++ if (ret) {
++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
++ ret = VM_FAULT_SIGBUS;
++ goto out;
++ }
++
+ ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+ again:
+ lock_page(page);
+@@ -4504,7 +4964,24 @@ again:
+ goto again;
+ }
+
+- btrfs_set_extent_delalloc(inode, page_start, page_end);
++ /*
++ * XXX - page_mkwrite gets called every time the page is dirtied, even
++ * if it was already dirty, so for space accounting reasons we need to
++ * clear any delalloc bits for the range we are about to save. There
++ * is probably a better way to do this, but for now keep it consistent
++ * with prepare_pages in the normal write path.
++ */
++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
++ GFP_NOFS);
++
++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
++ if (ret) {
++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
++ ret = VM_FAULT_SIGBUS;
++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
++ goto out_unlock;
++ }
+ ret = 0;
+
+ /* page is wholly or partially inside EOF */
+@@ -4521,11 +4998,17 @@ again:
+ }
+ ClearPageChecked(page);
+ set_page_dirty(page);
++ SetPageUptodate(page);
++
++ BTRFS_I(inode)->last_trans = root->fs_info->generation;
++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+
+- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+ out_unlock:
++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
++ if (!ret)
++ return VM_FAULT_LOCKED;
+ unlock_page(page);
+ out:
+ return ret;
+@@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode)
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+
+- btrfs_truncate_page(inode->i_mapping, inode->i_size);
++ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
++ if (ret)
++ return;
+ btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+
+ trans = btrfs_start_transaction(root, 1);
+@@ -4594,11 +5079,11 @@ out:
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
+ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+- struct btrfs_root *new_root, struct dentry *dentry,
++ struct btrfs_root *new_root,
+ u64 new_dirid, u64 alloc_hint)
+ {
+ struct inode *inode;
+- int error;
++ int err;
+ u64 index = 0;
+
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
+@@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+ inode->i_nlink = 1;
+ btrfs_i_size_write(inode, 0);
+
+- error = btrfs_update_inode(trans, new_root, inode);
+- if (error)
+- return error;
++ err = btrfs_update_inode(trans, new_root, inode);
++ BUG_ON(err);
+
+- d_instantiate(dentry, inode);
++ iput(inode);
+ return 0;
+ }
+
+@@ -4640,7 +5124,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
+ if (!ei)
+ return NULL;
+ ei->last_trans = 0;
++ ei->last_sub_trans = 0;
+ ei->logged_trans = 0;
++ ei->outstanding_extents = 0;
++ ei->reserved_extents = 0;
++ ei->root = NULL;
++ spin_lock_init(&ei->accounting_lock);
+ btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->ordered_operations);
+@@ -4656,6 +5145,14 @@ void btrfs_destroy_inode(struct inode *inode)
+ WARN_ON(inode->i_data.nrpages);
+
+ /*
++ * This can happen when we create an inode, but somebody else also
++ * created the same inode and we need to destroy the one we already
++ * created.
++ */
++ if (!root)
++ goto free;
++
++ /*
+ * Make sure we're properly removed from the ordered operation
+ * lists.
+ */
+@@ -4690,9 +5187,20 @@ void btrfs_destroy_inode(struct inode *inode)
+ }
+ inode_tree_del(inode);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
++free:
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+ }
+
++void btrfs_drop_inode(struct inode *inode)
++{
++ struct btrfs_root *root = BTRFS_I(inode)->root;
++
++ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
++ generic_delete_inode(inode);
++ else
++ generic_drop_inode(inode);
++}
++
+ static void init_once(void *foo)
+ {
+ struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+@@ -4761,31 +5269,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
++ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct timespec ctime = CURRENT_TIME;
+ u64 index = 0;
++ u64 root_objectid;
+ int ret;
+
+- /* we're not allowed to rename between subvolumes */
+- if (BTRFS_I(old_inode)->root->root_key.objectid !=
+- BTRFS_I(new_dir)->root->root_key.objectid)
++ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
++ return -EPERM;
++
++ /* we only allow rename subvolume link between subvolumes */
++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ return -EXDEV;
+
++ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
++ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
++ return -ENOTEMPTY;
++
+ if (S_ISDIR(old_inode->i_mode) && new_inode &&
+- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
++ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ return -ENOTEMPTY;
+- }
+
+- /* to rename a snapshot or subvolume, we need to juggle the
+- * backrefs. This isn't coded yet
++ /*
++ * 2 items for dir items
++ * 1 item for orphan entry
++ * 1 item for ref
+ */
+- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+- return -EXDEV;
+-
+- ret = btrfs_check_metadata_free_space(root);
++ ret = btrfs_reserve_metadata_space(root, 4);
+ if (ret)
+- goto out_unlock;
++ return ret;
+
+ /*
+ * we're using rename to replace one file with another.
+@@ -4796,8 +5310,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(old_inode->i_mapping);
+
++ /* close the racy window with snapshot create/destroy ioctl */
++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
++ down_read(&root->fs_info->subvol_sem);
++
+ trans = btrfs_start_transaction(root, 1);
++ btrfs_set_trans_block_group(trans, new_dir);
++
++ if (dest != root)
++ btrfs_record_root_in_trans(trans, dest);
++
++ ret = btrfs_set_inode_index(new_dir, &index);
++ if (ret)
++ goto out_fail;
+
++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
++ /* force full log commit if subvolume involved. */
++ root->fs_info->last_trans_log_full_commit = trans->transid;
++ } else {
++ ret = btrfs_insert_inode_ref(trans, dest,
++ new_dentry->d_name.name,
++ new_dentry->d_name.len,
++ old_inode->i_ino,
++ new_dir->i_ino, index);
++ if (ret)
++ goto out_fail;
++ /*
++ * this is an ugly little race, but the rename is required
++ * to make sure that if we crash, the inode is either at the
++ * old name or the new one. pinning the log transaction lets
++ * us make sure we don't allow a log commit to come in after
++ * we unlink the name but before we add the new name back in.
++ */
++ btrfs_pin_log_trans(root);
++ }
+ /*
+ * make sure the inode gets flushed if it is replacing
+ * something.
+@@ -4807,18 +5353,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ btrfs_add_ordered_operation(trans, root, old_inode);
+ }
+
+- /*
+- * this is an ugly little race, but the rename is required to make
+- * sure that if we crash, the inode is either at the old name
+- * or the new one. pinning the log transaction lets us make sure
+- * we don't allow a log commit to come in after we unlink the
+- * name but before we add the new name back in.
+- */
+- btrfs_pin_log_trans(root);
+-
+- btrfs_set_trans_block_group(trans, new_dir);
+-
+- btrfs_inc_nlink(old_dentry->d_inode);
+ old_dir->i_ctime = old_dir->i_mtime = ctime;
+ new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_inode->i_ctime = ctime;
+@@ -4826,47 +5360,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
+- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
+- old_dentry->d_name.name,
+- old_dentry->d_name.len);
+- if (ret)
+- goto out_fail;
++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
++ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
++ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
++ old_dentry->d_name.name,
++ old_dentry->d_name.len);
++ } else {
++ btrfs_inc_nlink(old_dentry->d_inode);
++ ret = btrfs_unlink_inode(trans, root, old_dir,
++ old_dentry->d_inode,
++ old_dentry->d_name.name,
++ old_dentry->d_name.len);
++ }
++ BUG_ON(ret);
+
+ if (new_inode) {
+ new_inode->i_ctime = CURRENT_TIME;
+- ret = btrfs_unlink_inode(trans, root, new_dir,
+- new_dentry->d_inode,
+- new_dentry->d_name.name,
+- new_dentry->d_name.len);
+- if (ret)
+- goto out_fail;
++ if (unlikely(new_inode->i_ino ==
++ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
++ root_objectid = BTRFS_I(new_inode)->location.objectid;
++ ret = btrfs_unlink_subvol(trans, dest, new_dir,
++ root_objectid,
++ new_dentry->d_name.name,
++ new_dentry->d_name.len);
++ BUG_ON(new_inode->i_nlink == 0);
++ } else {
++ ret = btrfs_unlink_inode(trans, dest, new_dir,
++ new_dentry->d_inode,
++ new_dentry->d_name.name,
++ new_dentry->d_name.len);
++ }
++ BUG_ON(ret);
+ if (new_inode->i_nlink == 0) {
+ ret = btrfs_orphan_add(trans, new_dentry->d_inode);
+- if (ret)
+- goto out_fail;
++ BUG_ON(ret);
+ }
+-
+ }
+- ret = btrfs_set_inode_index(new_dir, &index);
+- if (ret)
+- goto out_fail;
+
+- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
+- old_inode, new_dentry->d_name.name,
+- new_dentry->d_name.len, 1, index);
+- if (ret)
+- goto out_fail;
++ ret = btrfs_add_link(trans, new_dir, old_inode,
++ new_dentry->d_name.name,
++ new_dentry->d_name.len, 0, index);
++ BUG_ON(ret);
+
+- btrfs_log_new_name(trans, old_inode, old_dir,
+- new_dentry->d_parent);
++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
++ btrfs_log_new_name(trans, old_inode, old_dir,
++ new_dentry->d_parent);
++ btrfs_end_log_trans(root);
++ }
+ out_fail:
+-
+- /* this btrfs_end_log_trans just allows the current
+- * log-sub transaction to complete
+- */
+- btrfs_end_log_trans(root);
+ btrfs_end_transaction_throttle(trans, root);
+-out_unlock:
++
++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
++ up_read(&root->fs_info->subvol_sem);
++
++ btrfs_unreserve_metadata_space(root, 4);
+ return ret;
+ }
+
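The pinned-log window that the relocated comment describes is easier to see as a bare sequence. For an ordinary rename, no log commit may land between removing the old name and adding the new one; a hypothetical skeleton with the argument lists of the real calls omitted for brevity:

    static void rename_log_window(struct btrfs_root *root)
    {
            btrfs_pin_log_trans(root);   /* log commits now wait       */
            /* btrfs_unlink_inode():  old name removed                 */
            /* btrfs_add_link():      new name inserted                */
            /* btrfs_log_new_name():  rename recorded in the log       */
            btrfs_end_log_trans(root);   /* unpin, commits may proceed */
    }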
+@@ -4938,11 +5485,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+ return -ENAMETOOLONG;
+
+- err = btrfs_check_metadata_free_space(root);
++ /*
++ * 2 items for inode item and ref
++ * 2 items for dir items
++ * 1 item for xattr if selinux is on
++ */
++ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+- goto out_fail;
++ return err;
+
+ trans = btrfs_start_transaction(root, 1);
++ if (!trans)
++ goto out_fail;
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+@@ -5023,6 +5577,7 @@ out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+ out_fail:
++ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+@@ -5044,6 +5599,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
+
+ while (num_bytes > 0) {
+ alloc_size = min(num_bytes, root->fs_info->max_extent);
++
++ ret = btrfs_reserve_metadata_space(root, 1);
++ if (ret)
++ goto out;
++
+ ret = btrfs_reserve_extent(trans, root, alloc_size,
+ root->sectorsize, 0, alloc_hint,
+ (u64)-1, &ins, 1);
+@@ -5058,9 +5618,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
+ 0, 0, 0,
+ BTRFS_FILE_EXTENT_PREALLOC);
+ BUG_ON(ret);
++ btrfs_drop_extent_cache(inode, cur_offset,
++ cur_offset + ins.offset - 1, 0);
+ num_bytes -= ins.offset;
+ cur_offset += ins.offset;
+ alloc_hint = ins.objectid + ins.offset;
++ btrfs_unreserve_metadata_space(root, 1);
+ }
+ out:
+ if (cur_offset > start) {
+@@ -5223,6 +5786,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
+ .lookup = btrfs_lookup,
+ .permission = btrfs_permission,
+ };
++
+ static struct file_operations btrfs_dir_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+@@ -5245,6 +5809,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
+ .readpage_io_failed_hook = btrfs_io_failed_hook,
+ .set_bit_hook = btrfs_set_bit_hook,
+ .clear_bit_hook = btrfs_clear_bit_hook,
++ .merge_extent_hook = btrfs_merge_extent_hook,
++ .split_extent_hook = btrfs_split_extent_hook,
+ };
+
+ /*
+@@ -5309,3 +5875,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ };
++
++const struct dentry_operations btrfs_dentry_operations = {
++ .d_delete = btrfs_dentry_delete,
++};
+diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
+index bd88f25..cdbb054 100644
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
+ struct btrfs_root_item root_item;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+- struct btrfs_root *new_root = root;
+- struct inode *dir;
++ struct btrfs_root *new_root;
++ struct inode *dir = dentry->d_parent->d_inode;
+ int ret;
+ int err;
+ u64 objectid;
+@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
+ u64 index = 0;
+ unsigned long nr = 1;
+
+- ret = btrfs_check_metadata_free_space(root);
++ /*
++ * 1 - inode item
++ * 2 - refs
++ * 1 - root item
++ * 2 - dir items
++ */
++ ret = btrfs_reserve_metadata_space(root, 6);
+ if (ret)
+- goto fail_commit;
++ return ret;
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
+ if (ret)
+ goto fail;
+
++ key.offset = (u64)-1;
++ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
++ BUG_ON(IS_ERR(new_root));
++
++ btrfs_record_root_in_trans(trans, new_root);
++
++ ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
++ BTRFS_I(dir)->block_group);
+ /*
+ * insert the directory item
+ */
+- key.offset = (u64)-1;
+- dir = dentry->d_parent->d_inode;
+ ret = btrfs_set_inode_index(dir, &index);
+ BUG_ON(ret);
+
+@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+
+- /* add the backref first */
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+- objectid, BTRFS_ROOT_BACKREF_KEY,
+- root->root_key.objectid,
++ objectid, root->root_key.objectid,
+ dir->i_ino, index, name, namelen);
+
+ BUG_ON(ret);
+
+- /* now add the forward ref */
+- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+- root->root_key.objectid, BTRFS_ROOT_REF_KEY,
+- objectid,
+- dir->i_ino, index, name, namelen);
+-
+- BUG_ON(ret);
+-
+- ret = btrfs_commit_transaction(trans, root);
+- if (ret)
+- goto fail_commit;
+-
+- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+- BUG_ON(!new_root);
+-
+- trans = btrfs_start_transaction(new_root, 1);
+- BUG_ON(!trans);
+-
+- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
+- BTRFS_I(dir)->block_group);
+- if (ret)
+- goto fail;
+-
++ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+ fail:
+ nr = trans->blocks_used;
+- err = btrfs_commit_transaction(trans, new_root);
++ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+-fail_commit:
++
++ btrfs_unreserve_metadata_space(root, 6);
+ btrfs_btree_balance_dirty(root, nr);
+ return ret;
+ }
+@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+ if (!root->ref_cows)
+ return -EINVAL;
+
+- ret = btrfs_check_metadata_free_space(root);
++ /*
++ * 1 - inode item
++ * 2 - refs
++ * 1 - root item
++ * 2 - dir items
++ */
++ ret = btrfs_reserve_metadata_space(root, 6);
+ if (ret)
+ goto fail_unlock;
+
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
++ btrfs_unreserve_metadata_space(root, 6);
+ goto fail_unlock;
+ }
+ pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
+ if (!pending_snapshot->name) {
+ ret = -ENOMEM;
+ kfree(pending_snapshot);
++ btrfs_unreserve_metadata_space(root, 6);
+ goto fail_unlock;
+ }
+ memcpy(pending_snapshot->name, name, namelen);
+@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ */
+-static noinline int btrfs_mksubvol(struct path *parent, char *name,
+- int mode, int namelen,
++static noinline int btrfs_mksubvol(struct path *parent,
++ char *name, int namelen,
+ struct btrfs_root *snap_src)
+ {
++ struct inode *dir = parent->dentry->d_inode;
+ struct dentry *dentry;
+ int error;
+
+- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+
+ dentry = lookup_one_len(name, parent->dentry, namelen);
+ error = PTR_ERR(dentry);
+@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
+ if (dentry->d_inode)
+ goto out_dput;
+
+- if (!IS_POSIXACL(parent->dentry->d_inode))
+- mode &= ~current_umask();
+-
+ error = mnt_want_write(parent->mnt);
+ if (error)
+ goto out_dput;
+
+- error = btrfs_may_create(parent->dentry->d_inode, dentry);
++ error = btrfs_may_create(dir, dentry);
+ if (error)
+ goto out_drop_write;
+
+- /*
+- * Actually perform the low-level subvolume creation after all
+- * this VFS fuzz.
+- *
+- * Eventually we want to pass in an inode under which we create this
+- * subvolume, but for now all are under the filesystem root.
+- *
+- * Also we should pass on the mode eventually to allow creating new
+- * subvolume with specific mode bits.
+- */
++ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
++
++ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
++ goto out_up_read;
++
+ if (snap_src) {
+- struct dentry *dir = dentry->d_parent;
+- struct dentry *test = dir->d_parent;
+- struct btrfs_path *path = btrfs_alloc_path();
+- int ret;
+- u64 test_oid;
+- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
+-
+- test_oid = snap_src->root_key.objectid;
+-
+- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+- path, parent_oid, test_oid);
+- if (ret == 0)
+- goto create;
+- btrfs_release_path(snap_src->fs_info->tree_root, path);
+-
+- /* we need to make sure we aren't creating a directory loop
+- * by taking a snapshot of something that has our current
+- * subvol in its directory tree. So, this loops through
+- * the dentries and checks the forward refs for each subvolume
+- * to see if is references the subvolume where we are
+- * placing this new snapshot.
+- */
+- while (1) {
+- if (!test ||
+- dir == snap_src->fs_info->sb->s_root ||
+- test == snap_src->fs_info->sb->s_root ||
+- test->d_inode->i_sb != snap_src->fs_info->sb) {
+- break;
+- }
+- if (S_ISLNK(test->d_inode->i_mode)) {
+- printk(KERN_INFO "Btrfs symlink in snapshot "
+- "path, failed\n");
+- error = -EMLINK;
+- btrfs_free_path(path);
+- goto out_drop_write;
+- }
+- test_oid =
+- BTRFS_I(test->d_inode)->root->root_key.objectid;
+- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+- path, test_oid, parent_oid);
+- if (ret == 0) {
+- printk(KERN_INFO "Btrfs snapshot creation "
+- "failed, looping\n");
+- error = -EMLINK;
+- btrfs_free_path(path);
+- goto out_drop_write;
+- }
+- btrfs_release_path(snap_src->fs_info->tree_root, path);
+- test = test->d_parent;
+- }
+-create:
+- btrfs_free_path(path);
+- error = create_snapshot(snap_src, dentry, name, namelen);
++ error = create_snapshot(snap_src, dentry,
++ name, namelen);
+ } else {
+- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
+- dentry, name, namelen);
++ error = create_subvol(BTRFS_I(dir)->root, dentry,
++ name, namelen);
+ }
+- if (error)
+- goto out_drop_write;
+-
+- fsnotify_mkdir(parent->dentry->d_inode, dentry);
++ if (!error)
++ fsnotify_mkdir(dir, dentry);
++out_up_read:
++ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+ out_drop_write:
+ mnt_drop_write(parent->mnt);
+ out_dput:
+ dput(dentry);
+ out_unlock:
+- mutex_unlock(&parent->dentry->d_inode->i_mutex);
++ mutex_unlock(&dir->i_mutex);
+ return error;
+ }
+
+-
+ static int btrfs_defrag_file(struct file *file)
+ {
+ struct inode *inode = fdentry(file)->d_inode;
+@@ -596,9 +534,8 @@ again:
+ clear_page_dirty_for_io(page);
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+-
+- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ set_page_dirty(page);
++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+@@ -609,7 +546,8 @@ out_unlock:
+ return 0;
+ }
+
+-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
++static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
++ void __user *arg)
+ {
+ u64 new_size;
+ u64 old_size;
+@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
+ {
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+- struct btrfs_dir_item *di;
+- struct btrfs_path *path;
+ struct file *src_file;
+- u64 root_dirid;
+ int namelen;
+ int ret = 0;
+
+@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
+ goto out;
+ }
+
+- path = btrfs_alloc_path();
+- if (!path) {
+- ret = -ENOMEM;
+- goto out;
+- }
+-
+- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
+- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+- path, root_dirid,
+- vol_args->name, namelen, 0);
+- btrfs_free_path(path);
+-
+- if (di && !IS_ERR(di)) {
+- ret = -EEXIST;
+- goto out;
+- }
+-
+- if (IS_ERR(di)) {
+- ret = PTR_ERR(di);
+- goto out;
+- }
+-
+ if (subvol) {
+- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+- file->f_path.dentry->d_inode->i_mode,
+- namelen, NULL);
++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
++ NULL);
+ } else {
+ struct inode *src_inode;
+ src_file = fget(vol_args->fd);
+@@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
+ fput(src_file);
+ goto out;
+ }
+- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+- file->f_path.dentry->d_inode->i_mode,
+- namelen, BTRFS_I(src_inode)->root);
++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
++ BTRFS_I(src_inode)->root);
+ fput(src_file);
+ }
+-
+ out:
+ kfree(vol_args);
+ return ret;
+ }
+
++/*
++ * helper to check if the subvolume references other subvolumes
++ */
++static noinline int may_destroy_subvol(struct btrfs_root *root)
++{
++ struct btrfs_path *path;
++ struct btrfs_key key;
++ int ret;
++
++ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
++
++ key.objectid = root->root_key.objectid;
++ key.type = BTRFS_ROOT_REF_KEY;
++ key.offset = (u64)-1;
++
++ ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
++ &key, path, 0, 0);
++ if (ret < 0)
++ goto out;
++ BUG_ON(ret == 0);
++
++ ret = 0;
++ if (path->slots[0] > 0) {
++ path->slots[0]--;
++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
++ if (key.objectid == root->root_key.objectid &&
++ key.type == BTRFS_ROOT_REF_KEY)
++ ret = -ENOTEMPTY;
++ }
++out:
++ btrfs_free_path(path);
++ return ret;
++}
++
++static noinline int btrfs_ioctl_snap_destroy(struct file *file,
++ void __user *arg)
++{
++ struct dentry *parent = fdentry(file);
++ struct dentry *dentry;
++ struct inode *dir = parent->d_inode;
++ struct inode *inode;
++ struct btrfs_root *root = BTRFS_I(dir)->root;
++ struct btrfs_root *dest = NULL;
++ struct btrfs_ioctl_vol_args *vol_args;
++ struct btrfs_trans_handle *trans;
++ int namelen;
++ int ret;
++ int err = 0;
++
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ vol_args = memdup_user(arg, sizeof(*vol_args));
++ if (IS_ERR(vol_args))
++ return PTR_ERR(vol_args);
++
++ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
++ namelen = strlen(vol_args->name);
++ if (strchr(vol_args->name, '/') ||
++ strncmp(vol_args->name, "..", namelen) == 0) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ err = mnt_want_write(file->f_path.mnt);
++ if (err)
++ goto out;
++
++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
++ dentry = lookup_one_len(vol_args->name, parent, namelen);
++ if (IS_ERR(dentry)) {
++ err = PTR_ERR(dentry);
++ goto out_unlock_dir;
++ }
++
++ if (!dentry->d_inode) {
++ err = -ENOENT;
++ goto out_dput;
++ }
++
++ inode = dentry->d_inode;
++ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
++ err = -EINVAL;
++ goto out_dput;
++ }
++
++ dest = BTRFS_I(inode)->root;
++
++ mutex_lock(&inode->i_mutex);
++ err = d_invalidate(dentry);
++ if (err)
++ goto out_unlock;
++
++ down_write(&root->fs_info->subvol_sem);
++
++ err = may_destroy_subvol(dest);
++ if (err)
++ goto out_up_write;
++
++ trans = btrfs_start_transaction(root, 1);
++ ret = btrfs_unlink_subvol(trans, root, dir,
++ dest->root_key.objectid,
++ dentry->d_name.name,
++ dentry->d_name.len);
++ BUG_ON(ret);
++
++ btrfs_record_root_in_trans(trans, dest);
++
++ memset(&dest->root_item.drop_progress, 0,
++ sizeof(dest->root_item.drop_progress));
++ dest->root_item.drop_level = 0;
++ btrfs_set_root_refs(&dest->root_item, 0);
++
++ ret = btrfs_insert_orphan_item(trans,
++ root->fs_info->tree_root,
++ dest->root_key.objectid);
++ BUG_ON(ret);
++
++ ret = btrfs_commit_transaction(trans, root);
++ BUG_ON(ret);
++ inode->i_flags |= S_DEAD;
++out_up_write:
++ up_write(&root->fs_info->subvol_sem);
++out_unlock:
++ mutex_unlock(&inode->i_mutex);
++ if (!err) {
++ shrink_dcache_sb(root->fs_info->sb);
++ btrfs_invalidate_inodes(dest);
++ d_delete(dentry);
++ }
++out_dput:
++ dput(dentry);
++out_unlock_dir:
++ mutex_unlock(&dir->i_mutex);
++ mnt_drop_write(file->f_path.mnt);
++out:
++ kfree(vol_args);
++ return err;
++}
++
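The destroy path above deliberately defers the expensive work: it only unlinks the name, zeroes root_refs, and records an orphan item in the tree root, so the dead tree can be dropped later by the cleaner, or rediscovered after a crash. The sequence, condensed from the function above:

    ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid,
                              dentry->d_name.name, dentry->d_name.len);
    btrfs_set_root_refs(&dest->root_item, 0);          /* mark the root dead */
    ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
                                   dest->root_key.objectid);
    ret = btrfs_commit_transaction(trans, root);       /* persist the marker */
    /* later, btrfs_find_orphan_item() == 0 lets cleanup resume */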
+ static int btrfs_ioctl_defrag(struct file *file)
+ {
+ struct inode *inode = fdentry(file)->d_inode;
+@@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+ return ret;
+ }
+
+-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+- u64 off, u64 olen, u64 destoff)
++static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
++ u64 off, u64 olen, u64 destoff)
+ {
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+@@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+
+ /* punch hole in destination first */
+ btrfs_drop_extents(trans, root, inode, off, off + len,
+- off + len, 0, &hint_byte);
++ off + len, 0, &hint_byte, 1);
+
+ /* clone data */
+ key.objectid = src->i_ino;
+@@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+- if (key.offset + datao + datal + key.offset >
+- off + len)
+- datal = off + len - key.offset - datao;
++
++ if (key.offset + datal > off + len)
++ datal = off + len - key.offset;
++
+ /* disko == 0 means it's a hole */
+ if (!disko)
+ datao = 0;
+@@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+- int ret = 0;
++ int ret;
+
++ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+- return -EPERM;
++ goto out;
+
+- if (file->private_data) {
+- ret = -EINPROGRESS;
++ ret = -EINPROGRESS;
++ if (file->private_data)
+ goto out;
+- }
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+@@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
+ root->fs_info->open_ioctl_trans++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
++ ret = -ENOMEM;
+ trans = btrfs_start_ioctl_transaction(root, 0);
+- if (trans)
+- file->private_data = trans;
+- else
+- ret = -ENOMEM;
+- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
++ if (!trans)
++ goto out_drop;
++
++ file->private_data = trans;
++ return 0;
++
++out_drop:
++ mutex_lock(&root->fs_info->trans_mutex);
++ root->fs_info->open_ioctl_trans--;
++ mutex_unlock(&root->fs_info->trans_mutex);
++ mnt_drop_write(file->f_path.mnt);
+ out:
+ return ret;
+ }
+@@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+- int ret = 0;
+
+ trans = file->private_data;
+- if (!trans) {
+- ret = -EINVAL;
+- goto out;
+- }
+- btrfs_end_transaction(trans, root);
++ if (!trans)
++ return -EINVAL;
+ file->private_data = NULL;
+
++ btrfs_end_transaction(trans, root);
++
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans--;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mnt_drop_write(file->f_path.mnt);
+-
+-out:
+- return ret;
++ return 0;
+ }
+
+ long btrfs_ioctl(struct file *file, unsigned int
+@@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int
+ return btrfs_ioctl_snap_create(file, argp, 0);
+ case BTRFS_IOC_SUBVOL_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 1);
++ case BTRFS_IOC_SNAP_DESTROY:
++ return btrfs_ioctl_snap_destroy(file, argp);
+ case BTRFS_IOC_DEFRAG:
+ return btrfs_ioctl_defrag(file);
+ case BTRFS_IOC_RESIZE:
+diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
+index b320b10..bc49914 100644
+--- a/fs/btrfs/ioctl.h
++++ b/fs/btrfs/ioctl.h
+@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
+
+ #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+ struct btrfs_ioctl_vol_args)
+-
++#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
++ struct btrfs_ioctl_vol_args)
+ #endif
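For completeness, this is roughly how user space drives the new ioctl; a hypothetical sketch assuming only the number and argument struct defined above (parent_dir_fd is an open descriptor on the snapshot's parent directory, and the kernel NUL-terminates name defensively):

    /* assumes the BTRFS_IOC_* and struct definitions from
       fs/btrfs/ioctl.h above are visible to user space */
    #include <string.h>
    #include <sys/ioctl.h>

    static int destroy_snapshot(int parent_dir_fd, const char *name)
    {
            struct btrfs_ioctl_vol_args args;

            memset(&args, 0, sizeof(args));
            strncpy(args.name, name, BTRFS_PATH_NAME_MAX);
            return ioctl(parent_dir_fd, BTRFS_IOC_SNAP_DESTROY, &args);
    }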
+diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
+index d6f0806..ab21c29 100644
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+ *
+ * len is the length of the extent
+ *
+- * This also sets the EXTENT_ORDERED bit on the range in the inode.
+- *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted.
+ */
+@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ entry->start = start;
+ entry->len = len;
+ entry->disk_len = disk_len;
++ entry->bytes_left = len;
+ entry->inode = inode;
+ if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
+ set_bit(type, &entry->flags);
+@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ &entry->rb_node);
+ BUG_ON(node);
+
+- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
+- entry_end(entry) - 1, GFP_NOFS);
+-
+ spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ list_add_tail(&entry->root_extent_list,
+ &BTRFS_I(inode)->root->fs_info->ordered_extents);
+@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int ret;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
+- GFP_NOFS);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ ret = 1;
+@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
+ goto out;
+ }
+
+- ret = test_range_bit(io_tree, entry->file_offset,
+- entry->file_offset + entry->len - 1,
+- EXTENT_ORDERED, 0);
+- if (ret == 0)
++ if (io_size > entry->bytes_left) {
++ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
++ (unsigned long long)entry->bytes_left,
++ (unsigned long long)io_size);
++ }
++ entry->bytes_left -= io_size;
++ if (entry->bytes_left == 0)
+ ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
++ else
++ ret = 1;
+ out:
+ mutex_unlock(&tree->mutex);
+ return ret == 0;
+@@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
+ tree->last = NULL;
+ set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+
++ spin_lock(&BTRFS_I(inode)->accounting_lock);
++ BTRFS_I(inode)->outstanding_extents--;
++ spin_unlock(&BTRFS_I(inode)->accounting_lock);
++ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
++ inode, 1);
++
+ spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ list_del_init(&entry->root_extent_list);
+
+@@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+ u64 orig_end;
+ u64 wait_end;
+ struct btrfs_ordered_extent *ordered;
++ int found;
+
+ if (start + len < start) {
+ orig_end = INT_LIMIT(loff_t);
+@@ -502,6 +507,7 @@ again:
+ orig_end >> PAGE_CACHE_SHIFT);
+
+ end = orig_end;
++ found = 0;
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ if (!ordered)
+@@ -514,6 +520,7 @@ again:
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
++ found++;
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ end = ordered->file_offset;
+ btrfs_put_ordered_extent(ordered);
+@@ -521,8 +528,8 @@ again:
+ break;
+ end--;
+ }
+- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
++ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
++ EXTENT_DELALLOC, 0, NULL)) {
+ schedule_timeout(1);
+ goto again;
+ }
+@@ -613,7 +620,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
+ */
+ if (test_range_bit(io_tree, disk_i_size,
+ ordered->file_offset + ordered->len - 1,
+- EXTENT_DELALLOC, 0)) {
++ EXTENT_DELALLOC, 0, NULL)) {
+ goto out;
+ }
+ /*
+@@ -664,7 +671,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
+ */
+ if (i_size_test > entry_end(ordered) &&
+ !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
+- EXTENT_DELALLOC, 0)) {
++ EXTENT_DELALLOC, 0, NULL)) {
+ new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+ }
+ BTRFS_I(inode)->disk_i_size = new_i_size;
+diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
+index 3d31c88..993a7ea 100644
+--- a/fs/btrfs/ordered-data.h
++++ b/fs/btrfs/ordered-data.h
+@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
+ /* extent length on disk */
+ u64 disk_len;
+
++ /* number of bytes that still need writing */
++ u64 bytes_left;
++
+ /* flags (described above) */
+ unsigned long flags;
+
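Taken together, the ordered-data changes replace the per-range EXTENT_ORDERED bit with plain byte accounting: bytes_left starts at the extent length, each completed IO subtracts its size, and completion is a counter reaching zero instead of a bit test over the io_tree. Reduced to the two touch points (condensed from the hunks above):

    /* at creation, in btrfs_add_ordered_extent() */
    entry->bytes_left = len;

    /* at each IO completion, in btrfs_dec_test_ordered_pending() */
    entry->bytes_left -= io_size;
    if (entry->bytes_left == 0)
            done = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
    else
            done = 0;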
+diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
+index 3c0d52a..79cba5f 100644
+--- a/fs/btrfs/orphan.c
++++ b/fs/btrfs/orphan.c
+@@ -65,3 +65,23 @@ out:
+ btrfs_free_path(path);
+ return ret;
+ }
++
++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
++{
++ struct btrfs_path *path;
++ struct btrfs_key key;
++ int ret;
++
++ key.objectid = BTRFS_ORPHAN_OBJECTID;
++ key.type = BTRFS_ORPHAN_ITEM_KEY;
++ key.offset = offset;
++
++ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
++
++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++
++ btrfs_free_path(path);
++ return ret;
++}
+diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
+index c04f7f2..cfcc93c 100644
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -121,6 +121,15 @@ struct inodevec {
+ int nr;
+ };
+
++#define MAX_EXTENTS 128
++
++struct file_extent_cluster {
++ u64 start;
++ u64 end;
++ u64 boundary[MAX_EXTENTS];
++ unsigned int nr;
++};
++
+ struct reloc_control {
+ /* block group to relocate */
+ struct btrfs_block_group_cache *block_group;
+@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
+ struct reloc_control *rc)
+ {
+ if (test_range_bit(&rc->processed_blocks, bytenr,
+- bytenr + blocksize - 1, EXTENT_DIRTY, 1))
++ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ return 1;
+ return 0;
+ }
+@@ -2529,56 +2538,94 @@ out:
+ }
+
+ static noinline_for_stack
+-int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
++int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
++ u64 block_start)
++{
++ struct btrfs_root *root = BTRFS_I(inode)->root;
++ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
++ struct extent_map *em;
++ int ret = 0;
++
++ em = alloc_extent_map(GFP_NOFS);
++ if (!em)
++ return -ENOMEM;
++
++ em->start = start;
++ em->len = end + 1 - start;
++ em->block_len = em->len;
++ em->block_start = block_start;
++ em->bdev = root->fs_info->fs_devices->latest_bdev;
++ set_bit(EXTENT_FLAG_PINNED, &em->flags);
++
++ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
++ while (1) {
++ write_lock(&em_tree->lock);
++ ret = add_extent_mapping(em_tree, em);
++ write_unlock(&em_tree->lock);
++ if (ret != -EEXIST) {
++ free_extent_map(em);
++ break;
++ }
++ btrfs_drop_extent_cache(inode, start, end, 0);
++ }
++ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
++ return ret;
++}
++
++static int relocate_file_extent_cluster(struct inode *inode,
++ struct file_extent_cluster *cluster)
+ {
+ u64 page_start;
+ u64 page_end;
+- unsigned long i;
+- unsigned long first_index;
++ u64 offset = BTRFS_I(inode)->index_cnt;
++ unsigned long index;
+ unsigned long last_index;
+- unsigned int total_read = 0;
+- unsigned int total_dirty = 0;
++ unsigned int dirty_page = 0;
+ struct page *page;
+ struct file_ra_state *ra;
+- struct btrfs_ordered_extent *ordered;
+- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
++ int nr = 0;
+ int ret = 0;
+
++ if (!cluster->nr)
++ return 0;
++
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+
++ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
++ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
++
+ mutex_lock(&inode->i_mutex);
+- first_index = start >> PAGE_CACHE_SHIFT;
+- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+
+- /* make sure the dirty trick played by the caller work */
+- while (1) {
+- ret = invalidate_inode_pages2_range(inode->i_mapping,
+- first_index, last_index);
+- if (ret != -EBUSY)
+- break;
+- schedule_timeout(HZ/10);
+- }
++ i_size_write(inode, cluster->end + 1 - offset);
++ ret = setup_extent_mapping(inode, cluster->start - offset,
++ cluster->end - offset, cluster->start);
+ if (ret)
+ goto out_unlock;
+
+ file_ra_state_init(ra, inode->i_mapping);
+
+- for (i = first_index ; i <= last_index; i++) {
+- if (total_read % ra->ra_pages == 0) {
+- btrfs_force_ra(inode->i_mapping, ra, NULL, i,
+- min(last_index, ra->ra_pages + i - 1));
+- }
+- total_read++;
+-again:
+- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
+- BUG_ON(1);
+- page = grab_cache_page(inode->i_mapping, i);
++ WARN_ON(cluster->start != cluster->boundary[0]);
++ while (index <= last_index) {
++ page = find_lock_page(inode->i_mapping, index);
+ if (!page) {
+- ret = -ENOMEM;
+- goto out_unlock;
++ page_cache_sync_readahead(inode->i_mapping,
++ ra, NULL, index,
++ last_index + 1 - index);
++ page = grab_cache_page(inode->i_mapping, index);
++ if (!page) {
++ ret = -ENOMEM;
++ goto out_unlock;
++ }
++ }
++
++ if (PageReadahead(page)) {
++ page_cache_async_readahead(inode->i_mapping,
++ ra, NULL, page, index,
++ last_index + 1 - index);
+ }
++
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+@@ -2589,75 +2636,79 @@ again:
+ goto out_unlock;
+ }
+ }
+- wait_on_page_writeback(page);
+
+ page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+-
+- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+- if (ordered) {
+- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+- unlock_page(page);
+- page_cache_release(page);
+- btrfs_start_ordered_extent(inode, ordered, 1);
+- btrfs_put_ordered_extent(ordered);
+- goto again;
+- }
++
++ lock_extent(&BTRFS_I(inode)->io_tree,
++ page_start, page_end, GFP_NOFS);
++
+ set_page_extent_mapped(page);
+
+- if (i == first_index)
+- set_extent_bits(io_tree, page_start, page_end,
++ if (nr < cluster->nr &&
++ page_start + offset == cluster->boundary[nr]) {
++ set_extent_bits(&BTRFS_I(inode)->io_tree,
++ page_start, page_end,
+ EXTENT_BOUNDARY, GFP_NOFS);
++ nr++;
++ }
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+
+ set_page_dirty(page);
+- total_dirty++;
++ dirty_page++;
+
+- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
++ unlock_extent(&BTRFS_I(inode)->io_tree,
++ page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
++
++ index++;
++ if (nr < cluster->nr &&
++ page_end + 1 + offset == cluster->boundary[nr]) {
++ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
++ dirty_page);
++ dirty_page = 0;
++ }
++ }
++ if (dirty_page) {
++ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
++ dirty_page);
+ }
++ WARN_ON(nr != cluster->nr);
+ out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ kfree(ra);
+- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+ return ret;
+ }
+
+ static noinline_for_stack
+-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
++int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
++ struct file_extent_cluster *cluster)
+ {
+- struct btrfs_root *root = BTRFS_I(inode)->root;
+- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+- struct extent_map *em;
+- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
+- u64 end = start + extent_key->offset - 1;
+-
+- em = alloc_extent_map(GFP_NOFS);
+- em->start = start;
+- em->len = extent_key->offset;
+- em->block_len = extent_key->offset;
+- em->block_start = extent_key->objectid;
+- em->bdev = root->fs_info->fs_devices->latest_bdev;
+- set_bit(EXTENT_FLAG_PINNED, &em->flags);
++ int ret;
+
+- /* setup extent map to cheat btrfs_readpage */
+- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+- while (1) {
+- int ret;
+- spin_lock(&em_tree->lock);
+- ret = add_extent_mapping(em_tree, em);
+- spin_unlock(&em_tree->lock);
+- if (ret != -EEXIST) {
+- free_extent_map(em);
+- break;
+- }
+- btrfs_drop_extent_cache(inode, start, end, 0);
++ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
++ ret = relocate_file_extent_cluster(inode, cluster);
++ if (ret)
++ return ret;
++ cluster->nr = 0;
+ }
+- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+
+- return relocate_inode_pages(inode, start, extent_key->offset);
++ if (!cluster->nr)
++ cluster->start = extent_key->objectid;
++ else
++ BUG_ON(cluster->nr >= MAX_EXTENTS);
++ cluster->end = extent_key->objectid + extent_key->offset - 1;
++ cluster->boundary[cluster->nr] = extent_key->objectid;
++ cluster->nr++;
++
++ if (cluster->nr >= MAX_EXTENTS) {
++ ret = relocate_file_extent_cluster(inode, cluster);
++ if (ret)
++ return ret;
++ cluster->nr = 0;
++ }
++ return 0;
+ }
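relocate_data_extent() is now just the feeder for a small batching machine: contiguous extents accumulate into one cluster, each extent's start is recorded in boundary[], and the cluster is written out when a gap appears or the array fills. The rule, stripped of error handling (flush_cluster() is a hypothetical stand-in for relocate_file_extent_cluster()):

    if (cluster->nr > 0 && start != cluster->end + 1)
            flush_cluster(cluster);             /* gap: write the batch out */

    if (!cluster->nr)
            cluster->start = start;
    cluster->end = start + len - 1;
    cluster->boundary[cluster->nr++] = start;

    if (cluster->nr >= MAX_EXTENTS)
            flush_cluster(cluster);             /* batch full               */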
+
+ #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
+ return 0;
+ }
+
++
+ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+ {
+ struct rb_root blocks = RB_ROOT;
+ struct btrfs_key key;
++ struct file_extent_cluster *cluster;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+ int ret;
+ int err = 0;
+
++ cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
++ if (!cluster)
++ return -ENOMEM;
++
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
++ rc->extents_found = 0;
++ rc->extents_skipped = 0;
++
+ rc->search_start = rc->block_group->key.objectid;
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+ GFP_NOFS);
+@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+ }
+
+ nr = trans->blocks_used;
+- btrfs_end_transaction_throttle(trans, rc->extent_root);
++ btrfs_end_transaction(trans, rc->extent_root);
+ trans = NULL;
+ btrfs_btree_balance_dirty(rc->extent_root, nr);
+
+ if (rc->stage == MOVE_DATA_EXTENTS &&
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ rc->found_file_extent = 1;
+- ret = relocate_data_extent(rc->data_inode, &key);
++ ret = relocate_data_extent(rc->data_inode,
++ &key, cluster);
+ if (ret < 0) {
+ err = ret;
+ break;
+@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+ btrfs_btree_balance_dirty(rc->extent_root, nr);
+ }
+
++ if (!err) {
++ ret = relocate_file_extent_cluster(rc->data_inode, cluster);
++ if (ret < 0)
++ err = ret;
++ }
++
++ kfree(cluster);
++
+ rc->create_reloc_root = 0;
+ smp_mb();
+
+@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+ }
+
+ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- u64 objectid, u64 size)
++ struct btrfs_root *root, u64 objectid)
+ {
+ struct btrfs_path *path;
+ struct btrfs_inode_item *item;
+@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+ btrfs_set_inode_generation(leaf, item, 1);
+- btrfs_set_inode_size(leaf, item, size);
++ btrfs_set_inode_size(leaf, item, 0);
+ btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+ btrfs_mark_buffer_dirty(leaf);
+@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ if (err)
+ goto out;
+
+- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+- BUG_ON(err);
+-
+- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+- group->key.offset, 0, group->key.offset,
+- 0, 0, 0);
++ err = __insert_orphan_inode(trans, root, objectid);
+ BUG_ON(err);
+
+ key.objectid = objectid;
+@@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+ BUG_ON(!rc->block_group);
+
+ btrfs_init_workers(&rc->workers, "relocate",
+- fs_info->thread_pool_size);
++ fs_info->thread_pool_size, NULL);
+
+ rc->extent_root = extent_root;
+ btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+ btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+
+ while (1) {
+- mutex_lock(&fs_info->cleaner_mutex);
+- btrfs_clean_old_snapshots(fs_info->tree_root);
+- mutex_unlock(&fs_info->cleaner_mutex);
+-
+ rc->extents_found = 0;
+ rc->extents_skipped = 0;
+
++ mutex_lock(&fs_info->cleaner_mutex);
++
++ btrfs_clean_old_snapshots(fs_info->tree_root);
+ ret = relocate_block_group(rc);
++
++ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ err = ret;
+ break;
+@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+ }
+ }
+
+- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
+- rc->block_group->key.objectid,
+- rc->block_group->key.objectid +
+- rc->block_group->key.offset - 1);
++ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
++ rc->block_group->key.objectid,
++ rc->block_group->key.objectid +
++ rc->block_group->key.offset - 1);
+
+ WARN_ON(rc->block_group->pinned > 0);
+ WARN_ON(rc->block_group->reserved > 0);
+@@ -3530,6 +3594,26 @@ out:
+ return err;
+ }
+
++static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
++{
++ struct btrfs_trans_handle *trans;
++ int ret;
++
++ trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
++
++ memset(&root->root_item.drop_progress, 0,
++ sizeof(root->root_item.drop_progress));
++ root->root_item.drop_level = 0;
++ btrfs_set_root_refs(&root->root_item, 0);
++ ret = btrfs_update_root(trans, root->fs_info->tree_root,
++ &root->root_key, &root->root_item);
++ BUG_ON(ret);
++
++ ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
++ BUG_ON(ret);
++ return 0;
++}
++
+ /*
+ * recover relocation interrupted by system crash.
+ *
+@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
+ fs_root = read_fs_root(root->fs_info,
+ reloc_root->root_key.offset);
+ if (IS_ERR(fs_root)) {
+- err = PTR_ERR(fs_root);
+- goto out;
++ ret = PTR_ERR(fs_root);
++ if (ret != -ENOENT) {
++ err = ret;
++ goto out;
++ }
++ mark_garbage_root(reloc_root);
+ }
+ }
+
+@@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
+ mapping_tree_init(&rc->reloc_root_tree);
+ INIT_LIST_HEAD(&rc->reloc_roots);
+ btrfs_init_workers(&rc->workers, "relocate",
+- root->fs_info->thread_pool_size);
++ root->fs_info->thread_pool_size, NULL);
+ rc->extent_root = root->fs_info->extent_root;
+
+ set_reloc_control(rc);
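
Note on the relocation rework above: instead of relocating one extent's pages at a time, relocate_data_extent() now only accumulates extents into a file_extent_cluster, and relocate_file_extent_cluster() handles readahead, page dirtying, and dirty-page throttling for a whole run of contiguous extents at once. Below is a minimal standalone C sketch of that accumulate-and-flush pattern; flush_cluster() is a printf stub standing in for relocate_file_extent_cluster(), and the MAX_EXTENTS value and sample offsets in main() are purely illustrative.

#include <stdio.h>
#include <string.h>

#define MAX_EXTENTS 128    /* illustrative flush threshold */

struct file_extent_cluster {
    unsigned long long start;                 /* logical start of first extent */
    unsigned long long end;                   /* logical end of last extent */
    unsigned long long boundary[MAX_EXTENTS]; /* start offset of each extent */
    unsigned int nr;                          /* extents accumulated so far */
};

/* stand-in for relocate_file_extent_cluster(): just report the batch */
static int flush_cluster(struct file_extent_cluster *c)
{
    printf("flush %u extents spanning [%llu..%llu]\n", c->nr, c->start, c->end);
    c->nr = 0;
    return 0;
}

/*
 * mirrors relocate_data_extent(): an extent contiguous with the running
 * cluster is appended; a gap (or a full boundary array) flushes first
 */
static int add_extent(struct file_extent_cluster *c,
                      unsigned long long objectid, unsigned long long len)
{
    int ret;

    if (c->nr > 0 && objectid != c->end + 1) {
        ret = flush_cluster(c);
        if (ret)
            return ret;
    }
    if (!c->nr)
        c->start = objectid;
    c->end = objectid + len - 1;
    c->boundary[c->nr++] = objectid;

    if (c->nr >= MAX_EXTENTS)
        return flush_cluster(c);
    return 0;
}

int main(void)
{
    struct file_extent_cluster c;

    memset(&c, 0, sizeof(c));
    add_extent(&c, 4096, 4096);   /* opens a cluster */
    add_extent(&c, 8192, 4096);   /* contiguous: appended */
    add_extent(&c, 65536, 4096);  /* gap: previous cluster flushed first */
    if (c.nr)
        flush_cluster(&c);        /* final flush, as relocate_block_group() does */
    return 0;
}

Batching this way means dirty-page balancing happens once per cluster boundary rather than once per relocated extent, which is what the balance_dirty_pages_ratelimited_nr() calls keyed on cluster->boundary[nr] above are doing.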
+diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
+index 0ddc6d6..9351428 100644
+--- a/fs/btrfs/root-tree.c
++++ b/fs/btrfs/root-tree.c
+@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
+ goto out;
+
+ BUG_ON(ret == 0);
++ if (path->slots[0] == 0) {
++ ret = 1;
++ goto out;
++ }
+ l = path->nodes[0];
+- BUG_ON(path->slots[0] == 0);
+ slot = path->slots[0] - 1;
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+- if (found_key.objectid != objectid) {
++ if (found_key.objectid != objectid ||
++ found_key.type != BTRFS_ROOT_ITEM_KEY) {
+ ret = 1;
+ goto out;
+ }
+- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+- sizeof(*item));
+- memcpy(key, &found_key, sizeof(found_key));
++ if (item)
++ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
++ sizeof(*item));
++ if (key)
++ memcpy(key, &found_key, sizeof(found_key));
+ ret = 0;
+ out:
+ btrfs_free_path(path);
+@@ -249,6 +255,59 @@ err:
+ return ret;
+ }
+
++int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
++{
++ struct extent_buffer *leaf;
++ struct btrfs_path *path;
++ struct btrfs_key key;
++ int err = 0;
++ int ret;
++
++ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
++
++ key.objectid = BTRFS_ORPHAN_OBJECTID;
++ key.type = BTRFS_ORPHAN_ITEM_KEY;
++ key.offset = 0;
++
++ while (1) {
++ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
++ if (ret < 0) {
++ err = ret;
++ break;
++ }
++
++ leaf = path->nodes[0];
++ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
++ ret = btrfs_next_leaf(tree_root, path);
++ if (ret < 0)
++ err = ret;
++ if (ret != 0)
++ break;
++ leaf = path->nodes[0];
++ }
++
++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++ btrfs_release_path(tree_root, path);
++
++ if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
++ key.type != BTRFS_ORPHAN_ITEM_KEY)
++ break;
++
++ ret = btrfs_find_dead_roots(tree_root, key.offset);
++ if (ret) {
++ err = ret;
++ break;
++ }
++
++ key.offset++;
++ }
++
++ btrfs_free_path(path);
++ return err;
++}
++
+ /* drop the root item for 'key' from 'root' */
+ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key)
+@@ -278,31 +337,58 @@ out:
+ return ret;
+ }
+
+-#if 0 /* this will get used when snapshot deletion is implemented */
+ int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+- u64 root_id, u8 type, u64 ref_id)
++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
++ const char *name, int name_len)
++
+ {
++ struct btrfs_path *path;
++ struct btrfs_root_ref *ref;
++ struct extent_buffer *leaf;
+ struct btrfs_key key;
++ unsigned long ptr;
++ int err = 0;
+ int ret;
+- struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
+
+ key.objectid = root_id;
+- key.type = type;
++ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+-
++again:
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+- BUG_ON(ret);
+-
+- ret = btrfs_del_item(trans, tree_root, path);
+- BUG_ON(ret);
++ BUG_ON(ret < 0);
++ if (ret == 0) {
++ leaf = path->nodes[0];
++ ref = btrfs_item_ptr(leaf, path->slots[0],
++ struct btrfs_root_ref);
++
++ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
++ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
++ ptr = (unsigned long)(ref + 1);
++ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
++ *sequence = btrfs_root_ref_sequence(leaf, ref);
++
++ ret = btrfs_del_item(trans, tree_root, path);
++ BUG_ON(ret);
++ } else {
++ err = -ENOENT;
++ }
++
++ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
++ btrfs_release_path(tree_root, path);
++ key.objectid = ref_id;
++ key.type = BTRFS_ROOT_REF_KEY;
++ key.offset = root_id;
++ goto again;
++ }
+
+ btrfs_free_path(path);
+- return ret;
++ return err;
+ }
+-#endif
+
+ int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ return ret;
+ }
+
+-
+ /*
+ * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ */
+ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+- u64 root_id, u8 type, u64 ref_id,
+- u64 dirid, u64 sequence,
++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len)
+ {
+ struct btrfs_key key;
+@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+-
+ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
+
+ key.objectid = root_id;
+- key.type = type;
++ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+-
++again:
+ ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+ sizeof(*ref) + name_len);
+ BUG_ON(ret);
+@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ write_extent_buffer(leaf, name, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+
++ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
++ btrfs_release_path(tree_root, path);
++ key.objectid = ref_id;
++ key.type = BTRFS_ROOT_REF_KEY;
++ key.offset = root_id;
++ goto again;
++ }
++
+ btrfs_free_path(path);
+- return ret;
++ return 0;
+ }
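
Note: btrfs_add_root_ref() and btrfs_del_root_ref() now maintain both directions of a root reference in a single call via the goto-again pattern above: the item is inserted (or deleted) under BTRFS_ROOT_BACKREF_KEY first, then the key's objectid and offset are swapped, the type flipped to BTRFS_ROOT_REF_KEY, and the same code path runs once more. A minimal sketch of that key-swap round trip; insert_ref_item() is a stub for the btrfs_insert_empty_item() call plus the ref body writes, and the key-type constants are shown only for illustration.

#include <stdio.h>

/* illustrative stand-ins for the on-disk key-type constants */
#define BTRFS_ROOT_BACKREF_KEY 144
#define BTRFS_ROOT_REF_KEY     156

struct key {
    unsigned long long objectid;
    unsigned int type;
    unsigned long long offset;
};

/* stub for btrfs_insert_empty_item() plus the btrfs_root_ref field writes */
static void insert_ref_item(const struct key *k)
{
    printf("insert key (%llu, %u, %llu)\n", k->objectid, k->type, k->offset);
}

/* mirrors the reworked btrfs_add_root_ref(): backref first, then the
 * forward ref with objectid/offset swapped */
static int add_root_ref(unsigned long long root_id, unsigned long long ref_id)
{
    struct key key;

    key.objectid = root_id;
    key.type = BTRFS_ROOT_BACKREF_KEY;
    key.offset = ref_id;
again:
    insert_ref_item(&key);
    if (key.type == BTRFS_ROOT_BACKREF_KEY) {
        key.objectid = ref_id;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = root_id;
        goto again;
    }
    return 0;
}

int main(void)
{
    return add_root_ref(5, 260);
}

Folding the forward ref into the same helper is why the snapshot-creation path later in this patch can drop its explicit second btrfs_add_root_ref() call.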
+diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
+index 6d6d06c..939b68f 100644
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -66,7 +66,7 @@ enum {
+ Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
+ Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
+ Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
+- Opt_tag, Opt_notag, Opt_tagid, Opt_err,
++ Opt_tag, Opt_notag, Opt_tagid, Opt_discard, Opt_err,
+ };
+
+ static match_table_t tokens = {
+@@ -88,6 +89,7 @@ static match_table_t tokens = {
+ {Opt_notreelog, "notreelog"},
+ {Opt_flushoncommit, "flushoncommit"},
+ {Opt_ratio, "metadata_ratio=%d"},
++ {Opt_discard, "discard"},
+ {Opt_tag, "tag"},
+ {Opt_notag, "notag"},
+ {Opt_tagid, "tagid=%u"},
+@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
+ info->metadata_ratio);
+ }
+ break;
++ case Opt_discard:
++ btrfs_set_opt(info->mount_opt, DISCARD);
++ break;
+ #ifndef CONFIG_TAGGING_NONE
+ case Opt_tag:
+ printk(KERN_INFO "btrfs: use tagging\n");
+@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb,
+ sb->s_export_op = &btrfs_export_ops;
+ sb->s_xattr = btrfs_xattr_handlers;
+ sb->s_time_gran = 1;
++#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ sb->s_flags |= MS_POSIXACL;
++#endif
+
+ tree_root = open_ctree(sb, fs_devices, (char *)data);
+
+@@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb)
+ }
+
+ static struct super_operations btrfs_super_ops = {
++ .drop_inode = btrfs_drop_inode,
+ .delete_inode = btrfs_delete_inode,
+ .put_super = btrfs_put_super,
+ .sync_fs = btrfs_sync_fs,
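
Note: the super.c hunk adds a "discard" token to the existing match_table_t so btrfs_parse_options() can set a DISCARD bit in fs_info->mount_opt. A rough userspace sketch of that token-to-bitmask loop follows; the OPT_* bits are invented for the example, and a plain strtok() walk stands in for the kernel's match_token() machinery.

#include <stdio.h>
#include <string.h>

/* invented mount-option bits; the kernel sets bits in
 * fs_info->mount_opt via btrfs_set_opt() in the same spirit */
#define OPT_DISCARD (1UL << 0)
#define OPT_SSD     (1UL << 1)

struct token { const char *name; unsigned long bit; };

static const struct token tokens[] = {
    { "discard", OPT_DISCARD },
    { "ssd",     OPT_SSD },
};

/* walk a comma-separated option string, setting one bit per
 * recognized token, as btrfs_parse_options() does with its table */
static unsigned long parse_options(char *options)
{
    unsigned long opts = 0;
    char *p = strtok(options, ",");
    size_t i;

    while (p) {
        for (i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++) {
            if (!strcmp(p, tokens[i].name))
                opts |= tokens[i].bit;
        }
        p = strtok(NULL, ",");
    }
    return opts;
}

int main(void)
{
    char opts[] = "ssd,discard";

    printf("mount_opt = 0x%lx\n", parse_options(opts));
    return 0;
}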
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index cdbb502..bca82a4 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
+ {
+ if (root->ref_cows && root->last_trans < trans->transid) {
+ WARN_ON(root == root->fs_info->extent_root);
+- WARN_ON(root->root_item.refs == 0);
+ WARN_ON(root->commit_root != root->node);
+
+ radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+ h->alloc_exclude_start = 0;
+ h->delayed_ref_updates = 0;
+
++ if (!current->journal_info)
++ current->journal_info = h;
++
+ root->fs_info->running_transaction->use_count++;
+ record_root_in_trans(h, root);
+ mutex_unlock(&root->fs_info->trans_mutex);
+@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ wake_up(&cur_trans->writer_wait);
+ put_transaction(cur_trans);
+ mutex_unlock(&info->trans_mutex);
++
++ if (current->journal_info == trans)
++ current->journal_info = NULL;
+ memset(trans, 0, sizeof(*trans));
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+@@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+ /*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+- * those extents are on disk for transaction or log commit
++ * those extents are sent to disk, but it does not wait on them
+ */
+-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+- struct extent_io_tree *dirty_pages)
++int btrfs_write_marked_extents(struct btrfs_root *root,
++ struct extent_io_tree *dirty_pages)
+ {
+ int ret;
+ int err = 0;
+@@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ page_cache_release(page);
+ }
+ }
++ if (err)
++ werr = err;
++ return werr;
++}
++
++/*
++ * when btree blocks are allocated, they have some corresponding bits set for
++ * them in one of two extent_io trees. This is used to make sure all of
++ * those extents are on disk for transaction or log commit. We wait
++ * on all the pages and clear them from the dirty pages state tree
++ */
++int btrfs_wait_marked_extents(struct btrfs_root *root,
++ struct extent_io_tree *dirty_pages)
++{
++ int ret;
++ int err = 0;
++ int werr = 0;
++ struct page *page;
++ struct inode *btree_inode = root->fs_info->btree_inode;
++ u64 start = 0;
++ u64 end;
++ unsigned long index;
++
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+ EXTENT_DIRTY);
+@@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ return werr;
+ }
+
++/*
++ * when btree blocks are allocated, they have some corresponding bits set for
++ * them in one of two extent_io trees. This is used to make sure all of
++ * those extents are on disk for transaction or log commit
++ */
++int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
++ struct extent_io_tree *dirty_pages)
++{
++ int ret;
++ int ret2;
++
++ ret = btrfs_write_marked_extents(root, dirty_pages);
++ ret2 = btrfs_wait_marked_extents(root, dirty_pages);
++ return ret || ret2;
++}
++
+ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+ {
+@@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+ memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+
+ key.objectid = objectid;
+- key.offset = 0;
++ /* record when the snapshot was created in key.offset */
++ key.offset = trans->transid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+ old = btrfs_lock_root_node(root);
+@@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+ memcpy(&pending->root_key, &key, sizeof(key));
+ fail:
+ kfree(new_root_item);
++ btrfs_unreserve_metadata_space(root, 6);
+ return ret;
+ }
+
+@@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
+ ret = btrfs_update_inode(trans, parent_root, parent_inode);
+ BUG_ON(ret);
+
+- /* add the backref first */
+ ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+ pending->root_key.objectid,
+- BTRFS_ROOT_BACKREF_KEY,
+ parent_root->root_key.objectid,
+ parent_inode->i_ino, index, pending->name,
+ namelen);
+
+ BUG_ON(ret);
+
+- /* now add the forward ref */
+- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+- parent_root->root_key.objectid,
+- BTRFS_ROOT_REF_KEY,
+- pending->root_key.objectid,
+- parent_inode->i_ino, index, pending->name,
+- namelen);
+-
+ inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
+ d_instantiate(pending->dentry, inode);
+ fail:
+@@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ unsigned long timeout = 1;
+ struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *prev_trans = NULL;
+- struct extent_io_tree *pinned_copy;
+ DEFINE_WAIT(wait);
+ int ret;
+ int should_grow = 0;
+@@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ return 0;
+ }
+
+- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
+- if (!pinned_copy)
+- return -ENOMEM;
+-
+- extent_io_tree_init(pinned_copy,
+- root->fs_info->btree_inode->i_mapping, GFP_NOFS);
+-
+ trans->transaction->in_commit = 1;
+ trans->transaction->blocked = 1;
+ if (cur_trans->list.prev != &root->fs_info->trans_list) {
+@@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ ret = commit_cowonly_roots(trans, root);
+ BUG_ON(ret);
+
++ btrfs_prepare_extent_commit(trans, root);
++
+ cur_trans = root->fs_info->running_transaction;
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = NULL;
+@@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
+ sizeof(root->fs_info->super_copy));
+
+- btrfs_copy_pinned(root, pinned_copy);
+-
+ trans->transaction->blocked = 0;
+
+ wake_up(&root->fs_info->transaction_wait);
+@@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ */
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+
+- btrfs_finish_extent_commit(trans, root, pinned_copy);
+- kfree(pinned_copy);
++ btrfs_finish_extent_commit(trans, root);
+
+ /* do the directory inserts of any pending snapshot creations */
+ finish_pending_snapshots(trans, root->fs_info);
+@@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+
+ mutex_unlock(&root->fs_info->trans_mutex);
+
++ if (current->journal_info == trans)
++ current->journal_info = NULL;
++
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ return ret;
+ }
+@@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
+
+ while (!list_empty(&list)) {
+ root = list_entry(list.next, struct btrfs_root, root_list);
+- list_del_init(&root->root_list);
+- btrfs_drop_snapshot(root, 0);
++ list_del(&root->root_list);
++
++ if (btrfs_header_backref_rev(root->node) <
++ BTRFS_MIXED_BACKREF_REV)
++ btrfs_drop_snapshot(root, 0);
++ else
++ btrfs_drop_snapshot(root, 1);
+ }
+ return 0;
+ }
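
Note: btrfs_write_and_wait_marked_extents() is split above so that starting IO (btrfs_write_marked_extents) and waiting on it (btrfs_wait_marked_extents) can be driven separately; the tree-log changes later in this patch use the split to kick off IO on the log blocks early and wait only after the log root tree has been written too. A toy model of the split and its composition, with printf calls standing in for the writepage and wait_on_page_writeback work:

#include <stdio.h>

#define NR_EXTENTS 4

/* start IO on every dirty range, but do not wait here */
static int write_marked(int *dirty, int n)
{
    int i, werr = 0;

    for (i = 0; i < n; i++)
        if (dirty[i])
            printf("start IO on extent %d\n", i);
    return werr;
}

/* wait on every dirty range and clear it, as the new
 * btrfs_wait_marked_extents() clears EXTENT_DIRTY */
static int wait_marked(int *dirty, int n)
{
    int i, werr = 0;

    for (i = 0; i < n; i++)
        if (dirty[i]) {
            printf("wait on extent %d\n", i);
            dirty[i] = 0;
        }
    return werr;
}

/* the combined helper is now just the composition of the two */
static int write_and_wait_marked(int *dirty, int n)
{
    int ret = write_marked(dirty, n);
    int ret2 = wait_marked(dirty, n);

    return ret || ret2;
}

int main(void)
{
    int dirty[NR_EXTENTS] = { 1, 0, 1, 1 };

    return write_and_wait_marked(dirty, NR_EXTENTS);
}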
+diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
+index 663c674..d4e3e7a 100644
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+ {
+ BTRFS_I(inode)->last_trans = trans->transaction->transid;
++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ }
+
+ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
++int btrfs_write_marked_extents(struct btrfs_root *root,
++ struct extent_io_tree *dirty_pages);
++int btrfs_wait_marked_extents(struct btrfs_root *root,
++ struct extent_io_tree *dirty_pages);
+ int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+ #endif
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index d91b0de..f51bf13 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
+
+ mutex_lock(&root->log_mutex);
+ if (root->log_root) {
++ if (!root->log_start_pid) {
++ root->log_start_pid = current->pid;
++ root->log_multiple_pids = false;
++ } else if (root->log_start_pid != current->pid) {
++ root->log_multiple_pids = true;
++ }
++
+ root->log_batch++;
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return 0;
+ }
++ root->log_multiple_pids = false;
++ root->log_start_pid = current->pid;
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (!root->fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, root->fs_info);
+@@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log,
+ struct walk_control *wc, u64 gen)
+ {
+ if (wc->pin)
+- btrfs_update_pinned_extents(log->fs_info->extent_root,
+- eb->start, eb->len, 1);
++ btrfs_pin_extent(log->fs_info->extent_root,
++ eb->start, eb->len, 0);
+
+ if (btrfs_buffer_uptodate(eb, gen)) {
+ if (wc->write)
+@@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+ saved_nbytes = inode_get_bytes(inode);
+ /* drop any overlapping extents */
+ ret = btrfs_drop_extents(trans, root, inode,
+- start, extent_end, extent_end, start, &alloc_hint);
++ start, extent_end, extent_end, start, &alloc_hint, 1);
+ BUG_ON(ret);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ int ret;
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
++ u64 log_transid = 0;
+
+ mutex_lock(&root->log_mutex);
+ index1 = root->log_transid % 2;
+@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+
+ while (1) {
+ unsigned long batch = root->log_batch;
+- mutex_unlock(&root->log_mutex);
+- schedule_timeout_uninterruptible(1);
+- mutex_lock(&root->log_mutex);
+-
++ if (root->log_multiple_pids) {
++ mutex_unlock(&root->log_mutex);
++ schedule_timeout_uninterruptible(1);
++ mutex_lock(&root->log_mutex);
++ }
+ wait_for_writer(trans, root);
+ if (batch == root->log_batch)
+ break;
+@@ -2003,14 +2014,20 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ goto out;
+ }
+
+- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
++ /*
++ * we start IO on all the marked extents here, but we don't actually
++ * wait for them until later.
++ */
++ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
+ BUG_ON(ret);
+
+ btrfs_set_root_node(&log->root_item, log->node);
+
+ root->log_batch = 0;
++ log_transid = root->log_transid;
+ root->log_transid++;
+ log->log_transid = root->log_transid;
++ root->log_start_pid = 0;
+ smp_mb();
+ /*
+ * log tree has been flushed to disk, new modifications of
+@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+
+ index2 = log_root_tree->log_transid % 2;
+ if (atomic_read(&log_root_tree->log_commit[index2])) {
++ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ * check the full commit flag again
+ */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
++ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out_wake_log_root;
+@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ ret = btrfs_write_and_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages);
+ BUG_ON(ret);
++ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+
+ btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+ log_root_tree->node->start);
+@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ * the running transaction open, so a full commit can't hop
+ * in and cause problems either.
+ */
+- write_ctree_super(trans, root->fs_info->tree_root, 2);
++ write_ctree_super(trans, root->fs_info->tree_root, 1);
+ ret = 0;
+
++ mutex_lock(&root->log_mutex);
++ if (root->last_log_commit < log_transid)
++ root->last_log_commit = log_transid;
++ mutex_unlock(&root->log_mutex);
++
+ out_wake_log_root:
+ atomic_set(&log_root_tree->log_commit[index2], 0);
+ smp_mb();
+@@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ break;
+
+- if (parent == sb->s_root)
++ if (IS_ROOT(parent))
+ break;
+
+ parent = parent->d_parent;
+@@ -2852,6 +2876,20 @@ out:
+ return ret;
+ }
+
++static int inode_in_log(struct btrfs_trans_handle *trans,
++ struct inode *inode)
++{
++ struct btrfs_root *root = BTRFS_I(inode)->root;
++ int ret = 0;
++
++ mutex_lock(&root->log_mutex);
++ if (BTRFS_I(inode)->logged_trans == trans->transid &&
++ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
++ ret = 1;
++ mutex_unlock(&root->log_mutex);
++ return ret;
++}
++
+ /*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log. A minimal inode and backref
+@@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ goto end_no_trans;
+ }
+
++ if (root != BTRFS_I(inode)->root ||
++ btrfs_root_refs(&root->root_item) == 0) {
++ ret = 1;
++ goto end_no_trans;
++ }
++
+ ret = check_parent_dirs_for_sync(trans, inode, parent,
+ sb, last_committed);
+ if (ret)
+ goto end_no_trans;
+
++ if (inode_in_log(trans, inode)) {
++ ret = BTRFS_NO_LOG_SYNC;
++ goto end_no_trans;
++ }
++
+ start_log_trans(trans, root);
+
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+@@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ break;
+
+ inode = parent->d_inode;
++ if (root != BTRFS_I(inode)->root)
++ break;
++
+ if (BTRFS_I(inode)->generation >
+ root->fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+ }
+- if (parent == sb->s_root)
++ if (IS_ROOT(parent))
+ break;
+
+ parent = parent->d_parent;
+@@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+ struct btrfs_key tmp_key;
+ struct btrfs_root *log;
+ struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+- u64 highest_inode;
+ struct walk_control wc = {
+ .process_func = process_one_buffer,
+ .stage = 0,
+@@ -3010,11 +3062,6 @@ again:
+ path);
+ BUG_ON(ret);
+ }
+- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
+- if (ret == 0) {
+- wc.replay_dest->highest_inode = highest_inode;
+- wc.replay_dest->last_inode_alloc = highest_inode;
+- }
+
+ key.offset = found_key.offset - 1;
+ wc.replay_dest->log_root = NULL;
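
Note: the new inode_in_log() test lets an fsync return BTRFS_NO_LOG_SYNC without writing anything when the inode was already logged in the running transaction and no sub-transaction newer than the last log commit has dirtied it (tracked through last_sub_trans and last_log_commit). A simplified model of the check; the toy structs stand in for btrfs_inode and btrfs_root, and the transid values are arbitrary.

#include <stdio.h>

struct toy_root  { unsigned long long last_log_commit; };
struct toy_inode {
    struct toy_root *root;
    unsigned long long logged_trans;   /* transaction the inode was logged in */
    unsigned long long last_sub_trans; /* log sub-transaction that last dirtied it */
};

/* mirrors inode_in_log(): logged in this transaction, and nothing
 * newer than the last log commit has touched the inode */
static int inode_in_log(unsigned long long transid, struct toy_inode *inode)
{
    return inode->logged_trans == transid &&
           inode->last_sub_trans <= inode->root->last_log_commit;
}

int main(void)
{
    struct toy_root root = { .last_log_commit = 7 };
    struct toy_inode inode = {
        .root = &root,
        .logged_trans = 42,  /* logged in the running transaction... */
        .last_sub_trans = 7, /* ...with nothing newer pending */
    };

    printf("skip log sync: %d\n", inode_in_log(42, &inode));
    inode.last_sub_trans = 9;    /* dirtied again since the last log commit */
    printf("skip log sync: %d\n", inode_in_log(42, &inode));
    return 0;
}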
+diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
+index d09c760..0776eac 100644
+--- a/fs/btrfs/tree-log.h
++++ b/fs/btrfs/tree-log.h
+@@ -19,6 +19,12 @@
+ #ifndef __TREE_LOG_
+ #define __TREE_LOG_
+
++/*
++ * return value for btrfs_log_dentry_safe that means we don't need to
++ * log it at all
++ */
++#define BTRFS_NO_LOG_SYNC 256
++
+ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
+index 5dbefd1..20cbd2e 100644
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -276,7 +276,7 @@ loop_lock:
+ * is now congested. Back off and let other work structs
+ * run instead
+ */
+- if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
++ if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
+ fs_info->fs_devices->open_devices > 1) {
+ struct io_context *ioc;
+
+@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+ goto error;
+
+ device->name = kstrdup(orig_dev->name, GFP_NOFS);
+- if (!device->name)
++ if (!device->name) {
++ kfree(device);
+ goto error;
++ }
+
+ device->devid = orig_dev->devid;
+ device->work.func = pending_bios_fn;
+@@ -719,10 +721,9 @@ error:
+ * called very infrequently and that a given device has a small number
+ * of extents
+ */
+-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
+- struct btrfs_device *device,
+- u64 num_bytes, u64 *start,
+- u64 *max_avail)
++int find_free_dev_extent(struct btrfs_trans_handle *trans,
++ struct btrfs_device *device, u64 num_bytes,
++ u64 *start, u64 *max_avail)
+ {
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
+ extent_root = root->fs_info->extent_root;
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+
++ ret = btrfs_can_relocate(extent_root, chunk_offset);
++ if (ret)
++ return -ENOSPC;
++
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ BUG_ON(ret);
+@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ BUG_ON(em->start > chunk_offset ||
+ em->start + em->len < chunk_offset);
+@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
+ ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+ BUG_ON(ret);
+
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+
+ kfree(map);
+ em->bdev = NULL;
+@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+ struct btrfs_key found_key;
+ u64 chunk_tree = chunk_root->root_key.objectid;
+ u64 chunk_type;
++ bool retried = false;
++ int failed = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
++again:
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+ ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+ found_key.objectid,
+ found_key.offset);
+- BUG_ON(ret);
++ if (ret == -ENOSPC)
++ failed++;
++ else if (ret)
++ BUG();
+ }
+
+ if (found_key.offset == 0)
+@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+ key.offset = found_key.offset - 1;
+ }
+ ret = 0;
++ if (failed && !retried) {
++ failed = 0;
++ retried = true;
++ goto again;
++ } else if (failed && retried) {
++ WARN_ON(1);
++ ret = -ENOSPC;
++ }
+ error:
+ btrfs_free_path(path);
+ return ret;
+@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
+ continue;
+
+ ret = btrfs_shrink_device(device, old_size - size_to_free);
++ if (ret == -ENOSPC)
++ break;
+ BUG_ON(ret);
+
+ trans = btrfs_start_transaction(dev_root, 1);
+@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
+ chunk = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_chunk);
+- key.offset = found_key.offset;
+ /* chunk zero is special */
+- if (key.offset == 0)
++ if (found_key.offset == 0)
+ break;
+
+ btrfs_release_path(chunk_root, path);
+@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
+ chunk_root->root_key.objectid,
+ found_key.objectid,
+ found_key.offset);
+- BUG_ON(ret);
++ BUG_ON(ret && ret != -ENOSPC);
++ key.offset = found_key.offset - 1;
+ }
+ ret = 0;
+ error:
+@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+ u64 chunk_offset;
+ int ret;
+ int slot;
++ int failed = 0;
++ bool retried = false;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ u64 old_total = btrfs_super_total_bytes(super_copy);
++ u64 old_size = device->total_bytes;
+ u64 diff = device->total_bytes - new_size;
+
+ if (new_size >= device->total_bytes)
+@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+ if (!path)
+ return -ENOMEM;
+
+- trans = btrfs_start_transaction(root, 1);
+- if (!trans) {
+- ret = -ENOMEM;
+- goto done;
+- }
+-
+ path->reada = 2;
+
+ lock_chunks(root);
+@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes -= diff;
+ unlock_chunks(root);
+- btrfs_end_transaction(trans, root);
+
++again:
+ key.objectid = device->devid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+ goto done;
+ if (ret) {
+ ret = 0;
++ btrfs_release_path(root, path);
+ break;
+ }
+
+@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
+- if (key.objectid != device->devid)
++ if (key.objectid != device->devid) {
++ btrfs_release_path(root, path);
+ break;
++ }
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ length = btrfs_dev_extent_length(l, dev_extent);
+
+- if (key.offset + length <= new_size)
++ if (key.offset + length <= new_size) {
++ btrfs_release_path(root, path);
+ break;
++ }
+
+ chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+ chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+
+ ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
+ chunk_offset);
+- if (ret)
++ if (ret && ret != -ENOSPC)
+ goto done;
++ if (ret == -ENOSPC)
++ failed++;
++ key.offset -= 1;
++ }
++
++ if (failed && !retried) {
++ failed = 0;
++ retried = true;
++ goto again;
++ } else if (failed && retried) {
++ ret = -ENOSPC;
++ lock_chunks(root);
++
++ device->total_bytes = old_size;
++ if (device->writeable)
++ device->fs_devices->total_rw_bytes += diff;
++ unlock_chunks(root);
++ goto done;
+ }
+
+ /* Shrinking succeeded, else we would be at "done". */
+@@ -2294,9 +2335,9 @@ again:
+ em->block_len = em->len;
+
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+- spin_lock(&em_tree->lock);
++ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+- spin_unlock(&em_tree->lock);
++ write_unlock(&em_tree->lock);
+ BUG_ON(ret);
+ free_extent_map(em);
+
+@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+ int readonly = 0;
+ int i;
+
+- spin_lock(&map_tree->map_tree.lock);
++ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+- spin_unlock(&map_tree->map_tree.lock);
++ read_unlock(&map_tree->map_tree.lock);
+ if (!em)
+ return 1;
+
+@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+ struct extent_map *em;
+
+ while (1) {
+- spin_lock(&tree->map_tree.lock);
++ write_lock(&tree->map_tree.lock);
+ em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+ if (em)
+ remove_extent_mapping(&tree->map_tree, em);
+- spin_unlock(&tree->map_tree.lock);
++ write_unlock(&tree->map_tree.lock);
+ if (!em)
+ break;
+ kfree(em->bdev);
+@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret;
+
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+@@ -2604,9 +2645,9 @@ again:
+ atomic_set(&multi->error, 0);
+ }
+
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, *length);
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ if (!em && unplug_page)
+ return 0;
+@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+ u64 stripe_nr;
+ int i, j, nr = 0;
+
+- spin_lock(&em_tree->lock);
++ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_start, 1);
+- spin_unlock(&em_tree->lock);
++ read_unlock(&em_tree->lock);
+
+ BUG_ON(!em || em->start != chunk_start);
+ map = (struct map_lookup *)em->bdev;
+@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+ logical = key->offset;
+ length = btrfs_chunk_length(leaf, chunk);
+
+- spin_lock(&map_tree->map_tree.lock);
++ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+- spin_unlock(&map_tree->map_tree.lock);
++ read_unlock(&map_tree->map_tree.lock);
+
+ /* already mapped? */
+ if (em && em->start <= logical && em->start + em->len > logical) {
+@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+ map->stripes[i].dev->in_fs_metadata = 1;
+ }
+
+- spin_lock(&map_tree->map_tree.lock);
++ write_lock(&map_tree->map_tree.lock);
+ ret = add_extent_mapping(&map_tree->map_tree, em);
+- spin_unlock(&map_tree->map_tree.lock);
++ write_unlock(&map_tree->map_tree.lock);
+ BUG_ON(ret);
+ free_extent_map(em);
+
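
Note: btrfs_shrink_device() and btrfs_relocate_sys_chunks() above stop treating -ENOSPC from btrfs_relocate_chunk() as fatal on the first sweep: failures are counted, the whole range is walked once more (earlier relocations may have freed space), and only a failure on the retry pass is reported. A standalone sketch of that failed/retried goto-again pattern; relocate_chunk() here is a stub whose failure schedule is invented for the example.

#include <stdio.h>

#define ENOSPC 28

/* pretend chunk 2 only succeeds once pass 0 has freed space */
static int relocate_chunk(int chunk, int pass)
{
    if (chunk == 2 && pass == 0)
        return -ENOSPC;
    printf("pass %d: relocated chunk %d\n", pass, chunk);
    return 0;
}

int main(void)
{
    int failed = 0, retried = 0, pass = 0, ret, chunk;

again:
    for (chunk = 0; chunk < 4; chunk++) {
        ret = relocate_chunk(chunk, pass);
        if (ret == -ENOSPC)
            failed++;           /* not fatal on the first sweep */
        else if (ret)
            return ret;
    }
    ret = 0;
    if (failed && !retried) {
        failed = 0;
        retried = 1;
        pass++;
        goto again;             /* one retry over the whole range */
    } else if (failed && retried) {
        ret = -ENOSPC;          /* give up only after the retry pass */
    }
    printf("done, ret = %d\n", ret);
    return ret;
}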
+diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
+index 5139a83..31b0fab 100644
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
+ void btrfs_unlock_volumes(void);
+ void btrfs_lock_volumes(void);
+ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
++int find_free_dev_extent(struct btrfs_trans_handle *trans,
++ struct btrfs_device *device, u64 num_bytes,
++ u64 *start, u64 *max_avail);
+ #endif
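
Note on a theme running through the volumes.c hunks above: the extent_map tree's spinlock becomes a read/write lock, so the frequent lookup_extent_mapping() callers take it shared while only add_extent_mapping() and remove_extent_mapping() take it exclusive. A rough POSIX-rwlock model of that reader/writer split (build with -lpthread); the plain counter stands in for the actual extent-map rbtree.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
static int nr_mappings;

/* readers take the lock shared: read_lock(&em_tree->lock) */
static int lookup_mapping(void)
{
    int n;

    pthread_rwlock_rdlock(&map_lock);
    n = nr_mappings;
    pthread_rwlock_unlock(&map_lock);
    return n;
}

/* modifiers take it exclusive: write_lock(&em_tree->lock) */
static void add_mapping(void)
{
    pthread_rwlock_wrlock(&map_lock);
    nr_mappings++;
    pthread_rwlock_unlock(&map_lock);
}

int main(void)
{
    add_mapping();
    printf("mappings: %d\n", lookup_mapping());
    return 0;
}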
+diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
+index a9d3bf4..b6dd596 100644
+--- a/fs/btrfs/xattr.c
++++ b/fs/btrfs/xattr.c
+@@ -260,7 +260,7 @@ err:
+ * attributes are handled directly.
+ */
+ struct xattr_handler *btrfs_xattr_handlers[] = {
+-#ifdef CONFIG_FS_POSIX_ACL
++#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &btrfs_xattr_acl_access_handler,
+ &btrfs_xattr_acl_default_handler,
+ #endif