1 diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
2 index f128427..3616042 100644
6 #include "btrfs_inode.h"
9 -#ifdef CONFIG_FS_POSIX_ACL
10 +#ifdef CONFIG_BTRFS_FS_POSIX_ACL
12 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
14 @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
15 .set = btrfs_xattr_acl_access_set,
18 -#else /* CONFIG_FS_POSIX_ACL */
19 +#else /* CONFIG_BTRFS_FS_POSIX_ACL */
21 int btrfs_acl_chmod(struct inode *inode)
23 @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
27 -#endif /* CONFIG_FS_POSIX_ACL */
28 +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
29 diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
30 index 019e8af..c0861e7 100644
31 --- a/fs/btrfs/async-thread.c
32 +++ b/fs/btrfs/async-thread.c
33 @@ -48,6 +48,9 @@ struct btrfs_worker_thread {
34 /* number of things on the pending list */
37 + /* reference counter for this struct */
40 unsigned long sequence;
42 /* protects the pending list. */
43 @@ -61,6 +64,51 @@ struct btrfs_worker_thread {
47 + * btrfs_start_workers uses kthread_run, which can block waiting for memory
48 + * for a very long time. It will actually throttle on page writeback,
49 + * and so it may not make progress until after our btrfs worker threads
50 + * process all of the pending work structs in their queue
52 + * This means we can't use btrfs_start_workers from inside a btrfs worker
53 + * thread that is used as part of cleaning dirty memory, which pretty much
54 + * involves all of the worker threads.
56 + * Instead we have a helper queue that never has more than one thread
57 + * where we schedule thread start operations. This worker_start struct
58 + * is used to contain the work and hold a pointer to the queue that needs
61 +struct worker_start {
62 + struct btrfs_work work;
63 + struct btrfs_workers *queue;
66 +static void start_new_worker_func(struct btrfs_work *work)
68 + struct worker_start *start;
69 + start = container_of(work, struct worker_start, work);
70 + btrfs_start_workers(start->queue, 1);
74 +static int start_new_worker(struct btrfs_workers *queue)
76 + struct worker_start *start;
79 + start = kzalloc(sizeof(*start), GFP_NOFS);
83 + start->work.func = start_new_worker_func;
84 + start->queue = queue;
85 + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
92 * helper function to move a thread onto the idle list after it
93 * has finished some requests.
95 @@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
97 spin_lock_irqsave(&worker->workers->lock, flags);
99 - list_move(&worker->worker_list, &worker->workers->idle_list);
101 + /* the list may be empty if the worker is just starting */
102 + if (!list_empty(&worker->worker_list)) {
103 + list_move(&worker->worker_list,
104 + &worker->workers->idle_list);
106 spin_unlock_irqrestore(&worker->workers->lock, flags);
109 @@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
111 spin_lock_irqsave(&worker->workers->lock, flags);
113 - list_move_tail(&worker->worker_list,
114 - &worker->workers->worker_list);
116 + if (!list_empty(&worker->worker_list)) {
117 + list_move_tail(&worker->worker_list,
118 + &worker->workers->worker_list);
120 spin_unlock_irqrestore(&worker->workers->lock, flags);
124 -static noinline int run_ordered_completions(struct btrfs_workers *workers,
125 - struct btrfs_work *work)
126 +static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
128 + struct btrfs_workers *workers = worker->workers;
132 + if (!workers->atomic_start_pending)
135 + spin_lock_irqsave(&workers->lock, flags);
136 + if (!workers->atomic_start_pending)
139 + workers->atomic_start_pending = 0;
140 + if (workers->num_workers + workers->num_workers_starting >=
141 + workers->max_workers)
144 + workers->num_workers_starting += 1;
145 + spin_unlock_irqrestore(&workers->lock, flags);
146 + start_new_worker(workers);
150 + spin_unlock_irqrestore(&workers->lock, flags);
153 +static noinline int run_ordered_completions(struct btrfs_workers *workers,
154 + struct btrfs_work *work)
156 if (!workers->ordered)
159 set_bit(WORK_DONE_BIT, &work->flags);
161 - spin_lock_irqsave(&workers->lock, flags);
162 + spin_lock(&workers->order_lock);
165 if (!list_empty(&workers->prio_order_list)) {
166 @@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
167 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
170 - spin_unlock_irqrestore(&workers->lock, flags);
171 + spin_unlock(&workers->order_lock);
173 work->ordered_func(work);
175 /* now take the lock again and call the freeing code */
176 - spin_lock_irqsave(&workers->lock, flags);
177 + spin_lock(&workers->order_lock);
178 list_del(&work->order_list);
179 work->ordered_free(work);
182 - spin_unlock_irqrestore(&workers->lock, flags);
183 + spin_unlock(&workers->order_lock);
187 +static void put_worker(struct btrfs_worker_thread *worker)
189 + if (atomic_dec_and_test(&worker->refs))
193 +static int try_worker_shutdown(struct btrfs_worker_thread *worker)
197 + spin_lock_irq(&worker->lock);
198 + spin_lock(&worker->workers->lock);
199 + if (worker->workers->num_workers > 1 &&
201 + !worker->working &&
202 + !list_empty(&worker->worker_list) &&
203 + list_empty(&worker->prio_pending) &&
204 + list_empty(&worker->pending) &&
205 + atomic_read(&worker->num_pending) == 0) {
207 + list_del_init(&worker->worker_list);
208 + worker->workers->num_workers--;
210 + spin_unlock(&worker->workers->lock);
211 + spin_unlock_irq(&worker->lock);
214 + put_worker(worker);
218 +static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
219 + struct list_head *prio_head,
220 + struct list_head *head)
222 + struct btrfs_work *work = NULL;
223 + struct list_head *cur = NULL;
225 + if(!list_empty(prio_head))
226 + cur = prio_head->next;
229 + if (!list_empty(&worker->prio_pending))
232 + if (!list_empty(head))
239 + spin_lock_irq(&worker->lock);
240 + list_splice_tail_init(&worker->prio_pending, prio_head);
241 + list_splice_tail_init(&worker->pending, head);
243 + if (!list_empty(prio_head))
244 + cur = prio_head->next;
245 + else if (!list_empty(head))
247 + spin_unlock_irq(&worker->lock);
253 + work = list_entry(cur, struct btrfs_work, list);
260 * main loop for servicing work items
262 static int worker_loop(void *arg)
264 struct btrfs_worker_thread *worker = arg;
265 - struct list_head *cur;
266 + struct list_head head;
267 + struct list_head prio_head;
268 struct btrfs_work *work;
270 + INIT_LIST_HEAD(&head);
271 + INIT_LIST_HEAD(&prio_head);
274 - spin_lock_irq(&worker->lock);
278 - if (!list_empty(&worker->prio_pending))
279 - cur = worker->prio_pending.next;
280 - else if (!list_empty(&worker->pending))
281 - cur = worker->pending.next;
285 + work = get_next_work(worker, &prio_head, &head);
289 - work = list_entry(cur, struct btrfs_work, list);
290 list_del(&work->list);
291 clear_bit(WORK_QUEUED_BIT, &work->flags);
293 work->worker = worker;
294 - spin_unlock_irq(&worker->lock);
298 @@ -175,9 +329,13 @@ again_locked:
300 run_ordered_completions(worker->workers, work);
302 - spin_lock_irq(&worker->lock);
303 - check_idle_worker(worker);
304 + check_pending_worker_creates(worker);
308 + spin_lock_irq(&worker->lock);
309 + check_idle_worker(worker);
311 if (freezing(current)) {
313 spin_unlock_irq(&worker->lock);
314 @@ -216,8 +374,10 @@ again_locked:
315 spin_lock_irq(&worker->lock);
316 set_current_state(TASK_INTERRUPTIBLE);
317 if (!list_empty(&worker->pending) ||
318 - !list_empty(&worker->prio_pending))
320 + !list_empty(&worker->prio_pending)) {
321 + spin_unlock_irq(&worker->lock);
326 * this makes sure we get a wakeup when someone
327 @@ -226,8 +386,13 @@ again_locked:
329 spin_unlock_irq(&worker->lock);
331 - if (!kthread_should_stop())
333 + if (!kthread_should_stop()) {
334 + schedule_timeout(HZ * 120);
335 + if (!worker->working &&
336 + try_worker_shutdown(worker)) {
341 __set_current_state(TASK_RUNNING);
343 @@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
345 struct list_head *cur;
346 struct btrfs_worker_thread *worker;
349 + spin_lock_irq(&workers->lock);
350 list_splice_init(&workers->idle_list, &workers->worker_list);
351 while (!list_empty(&workers->worker_list)) {
352 cur = workers->worker_list.next;
353 worker = list_entry(cur, struct btrfs_worker_thread,
355 - kthread_stop(worker->task);
356 - list_del(&worker->worker_list);
359 + atomic_inc(&worker->refs);
360 + workers->num_workers -= 1;
361 + if (!list_empty(&worker->worker_list)) {
362 + list_del_init(&worker->worker_list);
363 + put_worker(worker);
367 + spin_unlock_irq(&workers->lock);
369 + kthread_stop(worker->task);
370 + spin_lock_irq(&workers->lock);
371 + put_worker(worker);
373 + spin_unlock_irq(&workers->lock);
378 * simple init on struct btrfs_workers
380 -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
381 +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
382 + struct btrfs_workers *async_helper)
384 workers->num_workers = 0;
385 + workers->num_workers_starting = 0;
386 INIT_LIST_HEAD(&workers->worker_list);
387 INIT_LIST_HEAD(&workers->idle_list);
388 INIT_LIST_HEAD(&workers->order_list);
389 INIT_LIST_HEAD(&workers->prio_order_list);
390 spin_lock_init(&workers->lock);
391 + spin_lock_init(&workers->order_lock);
392 workers->max_workers = max;
393 workers->idle_thresh = 32;
394 workers->name = name;
395 workers->ordered = 0;
396 + workers->atomic_start_pending = 0;
397 + workers->atomic_worker_start = async_helper;
401 * starts new worker threads. This does not enforce the max worker
402 * count in case you need to temporarily go past it.
404 -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
405 +static int __btrfs_start_workers(struct btrfs_workers *workers,
408 struct btrfs_worker_thread *worker;
410 @@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
411 INIT_LIST_HEAD(&worker->prio_pending);
412 INIT_LIST_HEAD(&worker->worker_list);
413 spin_lock_init(&worker->lock);
415 atomic_set(&worker->num_pending, 0);
416 + atomic_set(&worker->refs, 1);
417 worker->workers = workers;
418 worker->task = kthread_run(worker_loop, worker,
419 "btrfs-%s-%d", workers->name,
420 @@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
425 spin_lock_irq(&workers->lock);
426 list_add_tail(&worker->worker_list, &workers->idle_list);
428 workers->num_workers++;
429 + workers->num_workers_starting--;
430 + WARN_ON(workers->num_workers_starting < 0);
431 spin_unlock_irq(&workers->lock);
434 @@ -316,6 +504,14 @@ fail:
438 +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
440 + spin_lock_irq(&workers->lock);
441 + workers->num_workers_starting += num_workers;
442 + spin_unlock_irq(&workers->lock);
443 + return __btrfs_start_workers(workers, num_workers);
447 * run through the list and find a worker thread that doesn't have a lot
448 * to do right now. This can return null if we aren't yet at the thread
449 @@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
451 struct btrfs_worker_thread *worker;
452 struct list_head *next;
453 - int enforce_min = workers->num_workers < workers->max_workers;
456 + enforce_min = (workers->num_workers + workers->num_workers_starting) <
457 + workers->max_workers;
460 * if we find an idle thread, don't move it to the end of the
461 @@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
463 next = workers->worker_list.next;
464 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
465 - atomic_inc(&worker->num_pending);
468 if (worker->sequence % workers->idle_thresh == 0)
469 @@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
471 struct btrfs_worker_thread *worker;
473 + struct list_head *fallback;
476 spin_lock_irqsave(&workers->lock, flags);
477 worker = next_worker(workers);
478 - spin_unlock_irqrestore(&workers->lock, flags);
481 - spin_lock_irqsave(&workers->lock, flags);
482 - if (workers->num_workers >= workers->max_workers) {
483 - struct list_head *fallback = NULL;
485 - * we have failed to find any workers, just
486 - * return the force one
488 - if (!list_empty(&workers->worker_list))
489 - fallback = workers->worker_list.next;
490 - if (!list_empty(&workers->idle_list))
491 - fallback = workers->idle_list.next;
493 - worker = list_entry(fallback,
494 - struct btrfs_worker_thread, worker_list);
495 - spin_unlock_irqrestore(&workers->lock, flags);
496 + if (workers->num_workers + workers->num_workers_starting >=
497 + workers->max_workers) {
499 + } else if (workers->atomic_worker_start) {
500 + workers->atomic_start_pending = 1;
503 + workers->num_workers_starting++;
504 spin_unlock_irqrestore(&workers->lock, flags);
505 /* we're below the limit, start another worker */
506 - btrfs_start_workers(workers, 1);
507 + __btrfs_start_workers(workers, 1);
516 + * we have failed to find any workers, just
517 + * return the first one we can find.
519 + if (!list_empty(&workers->worker_list))
520 + fallback = workers->worker_list.next;
521 + if (!list_empty(&workers->idle_list))
522 + fallback = workers->idle_list.next;
524 + worker = list_entry(fallback,
525 + struct btrfs_worker_thread, worker_list);
528 + * this makes sure the worker doesn't exit before it is placed
529 + * onto a busy/idle list
531 + atomic_inc(&worker->num_pending);
532 + spin_unlock_irqrestore(&workers->lock, flags);
536 @@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
537 spin_lock(&worker->workers->lock);
539 list_move_tail(&worker->worker_list,
540 - &worker->workers->worker_list);
541 + &worker->workers->worker_list);
542 spin_unlock(&worker->workers->lock);
544 if (!worker->working) {
545 @@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
549 - spin_unlock_irqrestore(&worker->lock, flags);
551 wake_up_process(worker->task);
552 + spin_unlock_irqrestore(&worker->lock, flags);
556 @@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
558 worker = find_worker(workers);
559 if (workers->ordered) {
560 - spin_lock_irqsave(&workers->lock, flags);
562 + * you're not allowed to do ordered queues from an
563 + * interrupt handler
565 + spin_lock(&workers->order_lock);
566 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
567 list_add_tail(&work->order_list,
568 &workers->prio_order_list);
570 list_add_tail(&work->order_list, &workers->order_list);
572 - spin_unlock_irqrestore(&workers->lock, flags);
573 + spin_unlock(&workers->order_lock);
575 INIT_LIST_HEAD(&work->order_list);
577 @@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
578 list_add_tail(&work->list, &worker->prio_pending);
580 list_add_tail(&work->list, &worker->pending);
581 - atomic_inc(&worker->num_pending);
582 check_busy_worker(worker);
585 @@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
589 - spin_unlock_irqrestore(&worker->lock, flags);
592 wake_up_process(worker->task);
593 + spin_unlock_irqrestore(&worker->lock, flags);
598 diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
599 index 1b511c1..5077746 100644
600 --- a/fs/btrfs/async-thread.h
601 +++ b/fs/btrfs/async-thread.h
602 @@ -64,6 +64,8 @@ struct btrfs_workers {
603 /* current number of running workers */
606 + int num_workers_starting;
608 /* max number of workers allowed. changed by btrfs_start_workers */
611 @@ -73,6 +75,16 @@ struct btrfs_workers {
612 /* force completions in the order they were queued */
615 + /* more workers required, but in an interrupt handler */
616 + int atomic_start_pending;
619 + * are we allowed to sleep while starting workers or are we required
620 + * to start them at a later time? If we can't sleep, this indicates
621 + * which queue we need to use to schedule thread creation.
623 + struct btrfs_workers *atomic_worker_start;
625 /* list with all the work threads. The workers on the idle thread
626 * may be actively servicing jobs, but they haven't yet hit the
627 * idle thresh limit above.
628 @@ -90,6 +102,9 @@ struct btrfs_workers {
629 /* lock for finding the next worker thread to queue on */
632 + /* lock for the ordered lists */
633 + spinlock_t order_lock;
635 /* extra name for this worker, used for current->name */
638 @@ -97,7 +112,8 @@ struct btrfs_workers {
639 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
640 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
641 int btrfs_stop_workers(struct btrfs_workers *workers);
642 -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
643 +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
644 + struct btrfs_workers *async_starter);
645 int btrfs_requeue_work(struct btrfs_work *work);
646 void btrfs_set_work_high_prio(struct btrfs_work *work);
648 diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
649 index ea1ea0a..f6783a4 100644
650 --- a/fs/btrfs/btrfs_inode.h
651 +++ b/fs/btrfs/btrfs_inode.h
652 @@ -86,6 +86,12 @@ struct btrfs_inode {
653 * transid of the trans_handle that last modified this inode
658 + * log transid when this inode was last modified
660 + u64 last_sub_trans;
663 * transid that last logged this inode
665 @@ -128,6 +134,16 @@ struct btrfs_inode {
666 u64 last_unlink_trans;
669 + * Counters to keep track of the number of extent items we may use due
670 + * to delalloc and such. outstanding_extents is the number of extent
671 + * items we think we'll end up using, and reserved_extents is the number
672 + * of extent items we've reserved metadata for.
674 + spinlock_t accounting_lock;
675 + int reserved_extents;
676 + int outstanding_extents;
679 * ordered_data_close is set by truncate when a file that used
680 * to have good data has been truncated to zero. When it is set
681 * the btrfs file release call will add this inode to the
682 @@ -138,6 +154,7 @@ struct btrfs_inode {
685 unsigned ordered_data_close:1;
686 + unsigned dummy_inode:1;
688 struct inode vfs_inode;
690 diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
691 index 9d8ba4d..a11a320 100644
692 --- a/fs/btrfs/compression.c
693 +++ b/fs/btrfs/compression.c
694 @@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
696 set_page_extent_mapped(page);
697 lock_extent(tree, last_offset, end, GFP_NOFS);
698 - spin_lock(&em_tree->lock);
699 + read_lock(&em_tree->lock);
700 em = lookup_extent_mapping(em_tree, last_offset,
702 - spin_unlock(&em_tree->lock);
703 + read_unlock(&em_tree->lock);
705 if (!em || last_offset < em->start ||
706 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
707 @@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
708 em_tree = &BTRFS_I(inode)->extent_tree;
710 /* we need the actual starting offset of this extent in the file */
711 - spin_lock(&em_tree->lock);
712 + read_lock(&em_tree->lock);
713 em = lookup_extent_mapping(em_tree,
714 page_offset(bio->bi_io_vec->bv_page),
716 - spin_unlock(&em_tree->lock);
717 + read_unlock(&em_tree->lock);
719 compressed_len = em->block_len;
720 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
721 diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
722 index 3fdcc05..ec96f3a 100644
723 --- a/fs/btrfs/ctree.c
724 +++ b/fs/btrfs/ctree.c
725 @@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
729 + l = path->nodes[0];
730 + slot = path->slots[0];
731 + if (extend && data_size + btrfs_item_size_nr(l, slot) +
732 + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
735 /* first try to make some room by pushing left and right */
736 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
737 wret = push_leaf_right(trans, root, path, data_size, 0);
738 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
739 index 837435c..e5dd628 100644
740 --- a/fs/btrfs/ctree.h
741 +++ b/fs/btrfs/ctree.h
742 @@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
744 #define BTRFS_DEV_ITEMS_OBJECTID 1ULL
746 +#define BTRFS_BTREE_INODE_OBJECTID 1
748 +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
751 * we can actually store much bigger names, but lets not confuse the rest
753 @@ -670,21 +674,29 @@ struct btrfs_space_info {
754 u64 bytes_reserved; /* total bytes the allocator has reserved for
755 current allocations */
756 u64 bytes_readonly; /* total bytes that are read only */
758 - /* delalloc accounting */
759 - u64 bytes_delalloc; /* number of bytes reserved for allocation,
760 - this space is not necessarily reserved yet
761 - by the allocator */
762 + u64 bytes_super; /* total bytes reserved for the super blocks */
763 + u64 bytes_root; /* the number of bytes needed to commit a
765 u64 bytes_may_use; /* number of bytes that may be used for
767 + delalloc/allocations */
768 + u64 bytes_delalloc; /* number of bytes currently reserved for
769 + delayed allocation */
771 int full; /* indicates that we cannot allocate any more
772 chunks for this space */
773 int force_alloc; /* set if we need to force a chunk alloc for
775 + int force_delalloc; /* make people start doing filemap_flush until
776 + we're under a threshold */
778 struct list_head list;
780 + /* for controlling how we free up space for allocations */
781 + wait_queue_head_t allocate_wait;
782 + wait_queue_head_t flush_wait;
783 + int allocating_chunk;
786 /* for block groups in our same type */
787 struct list_head block_groups;
789 @@ -726,6 +738,15 @@ enum btrfs_caching_type {
790 BTRFS_CACHE_FINISHED = 2,
793 +struct btrfs_caching_control {
794 + struct list_head list;
795 + struct mutex mutex;
796 + wait_queue_head_t wait;
797 + struct btrfs_block_group_cache *block_group;
802 struct btrfs_block_group_cache {
803 struct btrfs_key key;
804 struct btrfs_block_group_item item;
805 @@ -733,6 +754,7 @@ struct btrfs_block_group_cache {
813 @@ -742,8 +764,9 @@ struct btrfs_block_group_cache {
816 /* cache tracking stuff */
817 - wait_queue_head_t caching_q;
819 + struct btrfs_caching_control *caching_ctl;
820 + u64 last_byte_to_unpin;
822 struct btrfs_space_info *space_info;
824 @@ -782,13 +805,16 @@ struct btrfs_fs_info {
826 /* the log root tree is a directory of all the other log roots */
827 struct btrfs_root *log_root_tree;
829 + spinlock_t fs_roots_radix_lock;
830 struct radix_tree_root fs_roots_radix;
832 /* block group cache stuff */
833 spinlock_t block_group_cache_lock;
834 struct rb_root block_group_cache_tree;
836 - struct extent_io_tree pinned_extents;
837 + struct extent_io_tree freed_extents[2];
838 + struct extent_io_tree *pinned_extents;
840 /* logical->physical extent mapping */
841 struct btrfs_mapping_tree mapping_tree;
842 @@ -822,11 +848,7 @@ struct btrfs_fs_info {
843 struct mutex transaction_kthread_mutex;
844 struct mutex cleaner_mutex;
845 struct mutex chunk_mutex;
846 - struct mutex drop_mutex;
847 struct mutex volume_mutex;
848 - struct mutex tree_reloc_mutex;
849 - struct rw_semaphore extent_commit_sem;
852 * this protects the ordered operations list only while we are
853 * processing all of the entries on it. This way we make
854 @@ -835,10 +857,16 @@ struct btrfs_fs_info {
855 * before jumping into the main commit.
857 struct mutex ordered_operations_mutex;
858 + struct rw_semaphore extent_commit_sem;
860 + struct rw_semaphore subvol_sem;
862 + struct srcu_struct subvol_srcu;
864 struct list_head trans_list;
865 struct list_head hashers;
866 struct list_head dead_roots;
867 + struct list_head caching_block_groups;
869 atomic_t nr_async_submits;
870 atomic_t async_submit_draining;
871 @@ -882,6 +910,7 @@ struct btrfs_fs_info {
872 * A third pool does submit_bio to avoid deadlocking with the other
875 + struct btrfs_workers generic_worker;
876 struct btrfs_workers workers;
877 struct btrfs_workers delalloc_workers;
878 struct btrfs_workers endio_workers;
879 @@ -889,6 +918,7 @@ struct btrfs_fs_info {
880 struct btrfs_workers endio_meta_write_workers;
881 struct btrfs_workers endio_write_workers;
882 struct btrfs_workers submit_workers;
883 + struct btrfs_workers enospc_workers;
885 * fixup workers take dirty pages that didn't properly go through
886 * the cow mechanism and make them safe to write. It happens
887 @@ -979,7 +1009,10 @@ struct btrfs_root {
888 atomic_t log_writers;
889 atomic_t log_commit[2];
890 unsigned long log_transid;
891 + unsigned long last_log_commit;
892 unsigned long log_batch;
893 + pid_t log_start_pid;
894 + bool log_multiple_pids;
898 @@ -996,10 +1029,12 @@ struct btrfs_root {
903 - u64 last_inode_alloc;
905 + u64 highest_objectid;
910 u64 defrag_trans_start;
911 struct btrfs_key defrag_progress;
912 struct btrfs_key defrag_max;
913 @@ -1118,6 +1153,7 @@ struct btrfs_root {
914 #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
915 #define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
916 #define BTRFS_MOUNT_NOSSD (1 << 9)
917 +#define BTRFS_MOUNT_DISCARD (1 << 10)
919 #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
920 #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
921 @@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
922 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
923 struct btrfs_root *root, unsigned long count);
924 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
925 -int btrfs_update_pinned_extents(struct btrfs_root *root,
926 - u64 bytenr, u64 num, int pin);
927 +int btrfs_pin_extent(struct btrfs_root *root,
928 + u64 bytenr, u64 num, int reserved);
929 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
930 struct btrfs_root *root, struct extent_buffer *leaf);
931 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
932 @@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
933 u64 root_objectid, u64 owner, u64 offset);
935 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
936 +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
937 + struct btrfs_root *root);
938 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
939 - struct btrfs_root *root,
940 - struct extent_io_tree *unpin);
941 + struct btrfs_root *root);
942 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
943 struct btrfs_root *root,
944 u64 bytenr, u64 num_bytes, u64 parent,
945 @@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
946 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
947 int btrfs_free_block_groups(struct btrfs_fs_info *info);
948 int btrfs_read_block_groups(struct btrfs_root *root);
949 +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
950 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
951 struct btrfs_root *root, u64 bytes_used,
952 u64 type, u64 chunk_objectid, u64 chunk_offset,
953 @@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
954 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
955 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
957 -int btrfs_check_metadata_free_space(struct btrfs_root *root);
958 +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
959 +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
960 +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
961 + struct inode *inode, int num_items);
962 +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
963 + struct inode *inode, int num_items);
964 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
966 void btrfs_free_reserved_data_space(struct btrfs_root *root,
967 @@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
969 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
971 -void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
973 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
974 int level, int *slot);
975 @@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
976 struct extent_buffer *parent);
978 int btrfs_find_root_ref(struct btrfs_root *tree_root,
979 - struct btrfs_path *path,
980 - u64 root_id, u64 ref_id);
981 + struct btrfs_path *path,
982 + u64 root_id, u64 ref_id);
983 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
984 struct btrfs_root *tree_root,
985 - u64 root_id, u8 type, u64 ref_id,
986 - u64 dirid, u64 sequence,
987 + u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
988 + const char *name, int name_len);
989 +int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
990 + struct btrfs_root *tree_root,
991 + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
992 const char *name, int name_len);
993 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
994 struct btrfs_key *key);
995 @@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
996 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
997 u64 *found_objectid);
998 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
999 +int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
1000 int btrfs_set_root_node(struct btrfs_root_item *item,
1001 struct extent_buffer *node);
1003 @@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1004 struct btrfs_path *path, u64 dir,
1005 u64 objectid, const char *name, int name_len,
1007 +struct btrfs_dir_item *
1008 +btrfs_search_dir_index_item(struct btrfs_root *root,
1009 + struct btrfs_path *path, u64 dirid,
1010 + const char *name, int name_len);
1011 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1012 struct btrfs_path *path,
1013 const char *name, int name_len);
1014 @@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1015 struct btrfs_root *root, u64 offset);
1016 int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1017 struct btrfs_root *root, u64 offset);
1018 +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
1021 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1022 @@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1023 int btrfs_add_link(struct btrfs_trans_handle *trans,
1024 struct inode *parent_inode, struct inode *inode,
1025 const char *name, int name_len, int add_backref, u64 index);
1026 +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
1027 + struct btrfs_root *root,
1028 + struct inode *dir, u64 objectid,
1029 + const char *name, int name_len);
1030 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1031 struct btrfs_root *root,
1032 struct inode *inode, u64 new_size,
1033 @@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
1034 int btrfs_writepages(struct address_space *mapping,
1035 struct writeback_control *wbc);
1036 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
1037 - struct btrfs_root *new_root, struct dentry *dentry,
1038 + struct btrfs_root *new_root,
1039 u64 new_dirid, u64 alloc_hint);
1040 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1041 size_t size, struct bio *bio, unsigned long bio_flags);
1042 @@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
1043 void btrfs_dirty_inode(struct inode *inode);
1044 struct inode *btrfs_alloc_inode(struct super_block *sb);
1045 void btrfs_destroy_inode(struct inode *inode);
1046 +void btrfs_drop_inode(struct inode *inode);
1047 int btrfs_init_cachep(void);
1048 void btrfs_destroy_cachep(void);
1049 long btrfs_ioctl_trans_end(struct file *file);
1050 @@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
1051 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
1052 void btrfs_orphan_cleanup(struct btrfs_root *root);
1053 int btrfs_cont_expand(struct inode *inode, loff_t size);
1054 +int btrfs_invalidate_inodes(struct btrfs_root *root);
1055 +extern const struct dentry_operations btrfs_dentry_operations;
1058 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1059 @@ -2290,7 +2348,7 @@ extern struct file_operations btrfs_file_operations;
1060 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1061 struct btrfs_root *root, struct inode *inode,
1062 u64 start, u64 end, u64 locked_end,
1063 - u64 inline_limit, u64 *hint_block);
1064 + u64 inline_limit, u64 *hint_block, int drop_cache);
1065 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1066 struct btrfs_root *root,
1067 struct inode *inode, u64 start, u64 end);
1068 @@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
1069 int btrfs_sync_fs(struct super_block *sb, int wait);
1072 -#ifdef CONFIG_FS_POSIX_ACL
1073 +#ifdef CONFIG_BTRFS_FS_POSIX_ACL
1074 int btrfs_check_acl(struct inode *inode, int mask);
1076 #define btrfs_check_acl NULL
1077 diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
1078 index 1d70236..f3a6075 100644
1079 --- a/fs/btrfs/dir-item.c
1080 +++ b/fs/btrfs/dir-item.c
1081 @@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1082 return btrfs_match_dir_item_name(root, path, name, name_len);
1085 +struct btrfs_dir_item *
1086 +btrfs_search_dir_index_item(struct btrfs_root *root,
1087 + struct btrfs_path *path, u64 dirid,
1088 + const char *name, int name_len)
1090 + struct extent_buffer *leaf;
1091 + struct btrfs_dir_item *di;
1092 + struct btrfs_key key;
1096 + key.objectid = dirid;
1097 + key.type = BTRFS_DIR_INDEX_KEY;
1100 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1102 + return ERR_PTR(ret);
1104 + leaf = path->nodes[0];
1105 + nritems = btrfs_header_nritems(leaf);
1108 + if (path->slots[0] >= nritems) {
1109 + ret = btrfs_next_leaf(root, path);
1111 + return ERR_PTR(ret);
1114 + leaf = path->nodes[0];
1115 + nritems = btrfs_header_nritems(leaf);
1119 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1120 + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
1123 + di = btrfs_match_dir_item_name(root, path, name, name_len);
1132 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1133 struct btrfs_root *root,
1134 struct btrfs_path *path, u64 dir,
1135 diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
1136 index e83be2e..d4132aa 100644
1137 --- a/fs/btrfs/disk-io.c
1138 +++ b/fs/btrfs/disk-io.c
1141 static struct extent_io_ops btree_extent_io_ops;
1142 static void end_workqueue_fn(struct btrfs_work *work);
1143 +static void free_fs_root(struct btrfs_root *root);
1145 static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
1147 @@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
1148 struct extent_map *em;
1151 - spin_lock(&em_tree->lock);
1152 + read_lock(&em_tree->lock);
1153 em = lookup_extent_mapping(em_tree, start, len);
1156 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
1157 - spin_unlock(&em_tree->lock);
1158 + read_unlock(&em_tree->lock);
1161 - spin_unlock(&em_tree->lock);
1162 + read_unlock(&em_tree->lock);
1164 em = alloc_extent_map(GFP_NOFS);
1166 @@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
1167 em->block_start = 0;
1168 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
1170 - spin_lock(&em_tree->lock);
1171 + write_lock(&em_tree->lock);
1172 ret = add_extent_mapping(em_tree, em);
1173 if (ret == -EEXIST) {
1174 u64 failed_start = em->start;
1175 @@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
1176 free_extent_map(em);
1179 - spin_unlock(&em_tree->lock);
1180 + write_unlock(&em_tree->lock);
1184 @@ -828,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
1185 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1187 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
1188 - buf->start, buf->start + buf->len - 1);
1189 + buf->start >> PAGE_CACHE_SHIFT,
1190 + (buf->start + buf->len - 1) >>
1191 + PAGE_CACHE_SHIFT);
1194 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1195 @@ -895,8 +898,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1196 root->fs_info = fs_info;
1197 root->objectid = objectid;
1198 root->last_trans = 0;
1199 - root->highest_inode = 0;
1200 - root->last_inode_alloc = 0;
1201 + root->highest_objectid = 0;
1204 root->inode_tree.rb_node = NULL;
1205 @@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1206 atomic_set(&root->log_writers, 0);
1207 root->log_batch = 0;
1208 root->log_transid = 0;
1209 + root->last_log_commit = 0;
1210 extent_io_tree_init(&root->dirty_log_pages,
1211 fs_info->btree_inode->i_mapping, GFP_NOFS);
1213 @@ -952,14 +955,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1214 root, fs_info, objectid);
1215 ret = btrfs_find_last_root(tree_root, objectid,
1216 &root->root_item, &root->root_key);
1221 generation = btrfs_root_generation(&root->root_item);
1222 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1223 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1224 blocksize, generation);
1225 - root->commit_root = btrfs_root_node(root);
1226 BUG_ON(!root->node);
1227 + root->commit_root = btrfs_root_node(root);
1231 @@ -1085,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1232 WARN_ON(root->log_root);
1233 root->log_root = log_root;
1234 root->log_transid = 0;
1235 + root->last_log_commit = 0;
1239 @@ -1095,7 +1101,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1240 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1241 struct btrfs_path *path;
1242 struct extent_buffer *l;
1243 - u64 highest_inode;
1247 @@ -1110,7 +1115,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1249 return ERR_PTR(ret);
1255 __setup_root(tree_root->nodesize, tree_root->leafsize,
1256 @@ -1120,39 +1125,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1257 path = btrfs_alloc_path();
1259 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1265 + l = path->nodes[0];
1266 + read_extent_buffer(l, &root->root_item,
1267 + btrfs_item_ptr_offset(l, path->slots[0]),
1268 + sizeof(root->root_item));
1269 + memcpy(&root->root_key, location, sizeof(*location));
1271 - l = path->nodes[0];
1272 - read_extent_buffer(l, &root->root_item,
1273 - btrfs_item_ptr_offset(l, path->slots[0]),
1274 - sizeof(root->root_item));
1275 - memcpy(&root->root_key, location, sizeof(*location));
1278 - btrfs_release_path(root, path);
1279 btrfs_free_path(path);
1284 return ERR_PTR(ret);
1287 generation = btrfs_root_generation(&root->root_item);
1288 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1289 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1290 blocksize, generation);
1291 root->commit_root = btrfs_root_node(root);
1292 BUG_ON(!root->node);
1294 - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1296 + if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
1298 - ret = btrfs_find_highest_inode(root, &highest_inode);
1300 - root->highest_inode = highest_inode;
1301 - root->last_inode_alloc = highest_inode;
1308 @@ -1187,39 +1183,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1309 return fs_info->dev_root;
1310 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1311 return fs_info->csum_root;
1314 + spin_lock(&fs_info->fs_roots_radix_lock);
1315 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1316 (unsigned long)location->objectid);
1317 + spin_unlock(&fs_info->fs_roots_radix_lock);
1321 + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1325 + return ERR_PTR(ret);
1327 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1331 + WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1332 set_anon_super(&root->anon_super, NULL);
1334 + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1338 + spin_lock(&fs_info->fs_roots_radix_lock);
1339 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1340 (unsigned long)root->root_key.objectid,
1343 + root->in_radix = 1;
1344 + spin_unlock(&fs_info->fs_roots_radix_lock);
1345 + radix_tree_preload_end();
1347 - free_extent_buffer(root->node);
1349 - return ERR_PTR(ret);
1350 + if (ret == -EEXIST) {
1351 + free_fs_root(root);
1356 - if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1357 - ret = btrfs_find_dead_roots(fs_info->tree_root,
1358 - root->root_key.objectid);
1361 + ret = btrfs_find_dead_roots(fs_info->tree_root,
1362 + root->root_key.objectid);
1365 + if (!(fs_info->sb->s_flags & MS_RDONLY))
1366 btrfs_orphan_cleanup(root);
1371 + free_fs_root(root);
1372 + return ERR_PTR(ret);
1375 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1376 struct btrfs_key *location,
1377 const char *name, int namelen)
1379 + return btrfs_read_fs_root_no_name(fs_info, location);
1381 struct btrfs_root *root;
1384 @@ -1236,7 +1259,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1386 return ERR_PTR(ret);
1390 ret = btrfs_sysfs_add_root(root);
1392 free_extent_buffer(root->node);
1393 @@ -1244,9 +1267,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1395 return ERR_PTR(ret);
1403 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1404 @@ -1325,9 +1348,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1405 offset = page_offset(page);
1407 em_tree = &BTRFS_I(inode)->extent_tree;
1408 - spin_lock(&em_tree->lock);
1409 + read_lock(&em_tree->lock);
1410 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1411 - spin_unlock(&em_tree->lock);
1412 + read_unlock(&em_tree->lock);
1414 __unplug_io_fn(bdi, page);
1416 @@ -1359,8 +1382,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1418 err = bdi_register(bdi, NULL, "btrfs-%d",
1419 atomic_inc_return(&btrfs_bdi_num));
1426 bdi->ra_pages = default_backing_dev_info.ra_pages;
1427 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1428 @@ -1450,9 +1475,12 @@ static int cleaner_kthread(void *arg)
1431 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1432 - mutex_lock(&root->fs_info->cleaner_mutex);
1433 - btrfs_clean_old_snapshots(root);
1434 - mutex_unlock(&root->fs_info->cleaner_mutex);
1436 + if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1437 + mutex_trylock(&root->fs_info->cleaner_mutex)) {
1438 + btrfs_clean_old_snapshots(root);
1439 + mutex_unlock(&root->fs_info->cleaner_mutex);
1442 if (freezing(current)) {
1444 @@ -1557,15 +1585,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1448 - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1450 + ret = init_srcu_struct(&fs_info->subvol_srcu);
1456 + ret = setup_bdi(fs_info, &fs_info->bdi);
1462 + fs_info->btree_inode = new_inode(sb);
1463 + if (!fs_info->btree_inode) {
1468 + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1469 INIT_LIST_HEAD(&fs_info->trans_list);
1470 INIT_LIST_HEAD(&fs_info->dead_roots);
1471 INIT_LIST_HEAD(&fs_info->hashers);
1472 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1473 INIT_LIST_HEAD(&fs_info->ordered_operations);
1474 + INIT_LIST_HEAD(&fs_info->caching_block_groups);
1475 spin_lock_init(&fs_info->delalloc_lock);
1476 spin_lock_init(&fs_info->new_trans_lock);
1477 spin_lock_init(&fs_info->ref_cache_lock);
1478 + spin_lock_init(&fs_info->fs_roots_radix_lock);
1480 init_completion(&fs_info->kobj_unregister);
1481 fs_info->tree_root = tree_root;
1482 @@ -1584,12 +1633,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1484 fs_info->max_extent = (u64)-1;
1485 fs_info->max_inline = 8192 * 1024;
1486 - if (setup_bdi(fs_info, &fs_info->bdi))
1488 - fs_info->btree_inode = new_inode(sb);
1489 - fs_info->btree_inode->i_ino = 1;
1490 - fs_info->btree_inode->i_nlink = 1;
1491 - fs_info->metadata_ratio = 8;
1492 + fs_info->metadata_ratio = 0;
1494 fs_info->thread_pool_size = min_t(unsigned long,
1495 num_online_cpus() + 2, 8);
1496 @@ -1600,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1497 sb->s_blocksize = 4096;
1498 sb->s_blocksize_bits = blksize_bits(4096);
1500 + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1501 + fs_info->btree_inode->i_nlink = 1;
1503 * we set the i_size on the btree inode to the max possible int.
1504 * the real end of the address space is determined by all of
1505 @@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1507 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1509 + BTRFS_I(fs_info->btree_inode)->root = tree_root;
1510 + memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1511 + sizeof(struct btrfs_key));
1512 + BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
1513 + insert_inode_hash(fs_info->btree_inode);
1515 spin_lock_init(&fs_info->block_group_cache_lock);
1516 fs_info->block_group_cache_tree.rb_node = NULL;
1518 - extent_io_tree_init(&fs_info->pinned_extents,
1519 + extent_io_tree_init(&fs_info->freed_extents[0],
1520 fs_info->btree_inode->i_mapping, GFP_NOFS);
1521 + extent_io_tree_init(&fs_info->freed_extents[1],
1522 + fs_info->btree_inode->i_mapping, GFP_NOFS);
1523 + fs_info->pinned_extents = &fs_info->freed_extents[0];
1524 fs_info->do_barriers = 1;
1526 - BTRFS_I(fs_info->btree_inode)->root = tree_root;
1527 - memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1528 - sizeof(struct btrfs_key));
1529 - insert_inode_hash(fs_info->btree_inode);
1531 mutex_init(&fs_info->trans_mutex);
1532 mutex_init(&fs_info->ordered_operations_mutex);
1533 mutex_init(&fs_info->tree_log_mutex);
1534 - mutex_init(&fs_info->drop_mutex);
1535 mutex_init(&fs_info->chunk_mutex);
1536 mutex_init(&fs_info->transaction_kthread_mutex);
1537 mutex_init(&fs_info->cleaner_mutex);
1538 mutex_init(&fs_info->volume_mutex);
1539 - mutex_init(&fs_info->tree_reloc_mutex);
1540 init_rwsem(&fs_info->extent_commit_sem);
1541 + init_rwsem(&fs_info->subvol_sem);
1543 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1544 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1545 @@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1550 - * we need to start all the end_io workers up front because the
1551 - * queue work function gets called at interrupt time, and so it
1552 - * cannot dynamically grow.
1554 + btrfs_init_workers(&fs_info->generic_worker,
1555 + "genwork", 1, NULL);
1557 btrfs_init_workers(&fs_info->workers, "worker",
1558 - fs_info->thread_pool_size);
1559 + fs_info->thread_pool_size,
1560 + &fs_info->generic_worker);
1562 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1563 - fs_info->thread_pool_size);
1564 + fs_info->thread_pool_size,
1565 + &fs_info->generic_worker);
1567 btrfs_init_workers(&fs_info->submit_workers, "submit",
1568 min_t(u64, fs_devices->num_devices,
1569 - fs_info->thread_pool_size));
1570 + fs_info->thread_pool_size),
1571 + &fs_info->generic_worker);
1572 + btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1573 + fs_info->thread_pool_size,
1574 + &fs_info->generic_worker);
1576 /* a higher idle thresh on the submit workers makes it much more
1577 * likely that bios will be send down in a sane order to the
1578 @@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1579 fs_info->delalloc_workers.idle_thresh = 2;
1580 fs_info->delalloc_workers.ordered = 1;
1582 - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1583 + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
1584 + &fs_info->generic_worker);
1585 btrfs_init_workers(&fs_info->endio_workers, "endio",
1586 - fs_info->thread_pool_size);
1587 + fs_info->thread_pool_size,
1588 + &fs_info->generic_worker);
1589 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1590 - fs_info->thread_pool_size);
1591 + fs_info->thread_pool_size,
1592 + &fs_info->generic_worker);
1593 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1594 - "endio-meta-write", fs_info->thread_pool_size);
1595 + "endio-meta-write", fs_info->thread_pool_size,
1596 + &fs_info->generic_worker);
1597 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1598 - fs_info->thread_pool_size);
1599 + fs_info->thread_pool_size,
1600 + &fs_info->generic_worker);
1603 * endios are largely parallel and should have a very
1604 @@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1605 fs_info->endio_workers.idle_thresh = 4;
1606 fs_info->endio_meta_workers.idle_thresh = 4;
1608 - fs_info->endio_write_workers.idle_thresh = 64;
1609 - fs_info->endio_meta_write_workers.idle_thresh = 64;
1610 + fs_info->endio_write_workers.idle_thresh = 2;
1611 + fs_info->endio_meta_write_workers.idle_thresh = 2;
1613 btrfs_start_workers(&fs_info->workers, 1);
1614 + btrfs_start_workers(&fs_info->generic_worker, 1);
1615 btrfs_start_workers(&fs_info->submit_workers, 1);
1616 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1617 btrfs_start_workers(&fs_info->fixup_workers, 1);
1618 - btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1619 - btrfs_start_workers(&fs_info->endio_meta_workers,
1620 - fs_info->thread_pool_size);
1621 - btrfs_start_workers(&fs_info->endio_meta_write_workers,
1622 - fs_info->thread_pool_size);
1623 - btrfs_start_workers(&fs_info->endio_write_workers,
1624 - fs_info->thread_pool_size);
1625 + btrfs_start_workers(&fs_info->endio_workers, 1);
1626 + btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1627 + btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1628 + btrfs_start_workers(&fs_info->endio_write_workers, 1);
1629 + btrfs_start_workers(&fs_info->enospc_workers, 1);
1631 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1632 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1633 @@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1637 + ret = btrfs_find_orphan_roots(tree_root);
1640 if (!(sb->s_flags & MS_RDONLY)) {
1641 ret = btrfs_recover_relocation(tree_root);
1643 @@ -1959,6 +2020,7 @@ fail_chunk_root:
1644 free_extent_buffer(chunk_root->node);
1645 free_extent_buffer(chunk_root->commit_root);
1647 + btrfs_stop_workers(&fs_info->generic_worker);
1648 btrfs_stop_workers(&fs_info->fixup_workers);
1649 btrfs_stop_workers(&fs_info->delalloc_workers);
1650 btrfs_stop_workers(&fs_info->workers);
1651 @@ -1967,6 +2029,7 @@ fail_sb_buffer:
1652 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1653 btrfs_stop_workers(&fs_info->endio_write_workers);
1654 btrfs_stop_workers(&fs_info->submit_workers);
1655 + btrfs_stop_workers(&fs_info->enospc_workers);
1657 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1658 iput(fs_info->btree_inode);
1659 @@ -1975,6 +2038,8 @@ fail_iput:
1660 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1662 bdi_destroy(&fs_info->bdi);
1664 + cleanup_srcu_struct(&fs_info->subvol_srcu);
1668 @@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
1670 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
1672 - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
1673 + spin_lock(&fs_info->fs_roots_radix_lock);
1674 radix_tree_delete(&fs_info->fs_roots_radix,
1675 (unsigned long)root->root_key.objectid);
1676 + spin_unlock(&fs_info->fs_roots_radix_lock);
1678 + if (btrfs_root_refs(&root->root_item) == 0)
1679 + synchronize_srcu(&fs_info->subvol_srcu);
1681 + free_fs_root(root);
1685 +static void free_fs_root(struct btrfs_root *root)
1687 + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
1688 if (root->anon_super.s_dev) {
1689 down_write(&root->anon_super.s_umount);
1690 kill_anon_super(&root->anon_super);
1693 - free_extent_buffer(root->node);
1694 - if (root->commit_root)
1695 - free_extent_buffer(root->commit_root);
1696 + free_extent_buffer(root->node);
1697 + free_extent_buffer(root->commit_root);
1703 static int del_fs_roots(struct btrfs_fs_info *fs_info)
1704 @@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
1705 struct btrfs_root *gang[8];
1708 + while (!list_empty(&fs_info->dead_roots)) {
1709 + gang[0] = list_entry(fs_info->dead_roots.next,
1710 + struct btrfs_root, root_list);
1711 + list_del(&gang[0]->root_list);
1713 + if (gang[0]->in_radix) {
1714 + btrfs_free_fs_root(fs_info, gang[0]);
1716 + free_extent_buffer(gang[0]->node);
1717 + free_extent_buffer(gang[0]->commit_root);
1723 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1725 @@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
1726 root_objectid = gang[ret - 1]->root_key.objectid + 1;
1727 for (i = 0; i < ret; i++) {
1728 root_objectid = gang[i]->root_key.objectid;
1729 - ret = btrfs_find_dead_roots(fs_info->tree_root,
1732 btrfs_orphan_cleanup(gang[i]);
1735 @@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root)
1736 free_extent_buffer(root->fs_info->csum_root->commit_root);
1738 btrfs_free_block_groups(root->fs_info);
1739 - btrfs_free_pinned_extents(root->fs_info);
1741 del_fs_roots(fs_info);
1743 iput(fs_info->btree_inode);
1745 + btrfs_stop_workers(&fs_info->generic_worker);
1746 btrfs_stop_workers(&fs_info->fixup_workers);
1747 btrfs_stop_workers(&fs_info->delalloc_workers);
1748 btrfs_stop_workers(&fs_info->workers);
1749 @@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root)
1750 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1751 btrfs_stop_workers(&fs_info->endio_write_workers);
1752 btrfs_stop_workers(&fs_info->submit_workers);
1753 + btrfs_stop_workers(&fs_info->enospc_workers);
1755 btrfs_close_devices(fs_info->fs_devices);
1756 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1758 bdi_destroy(&fs_info->bdi);
1759 + cleanup_srcu_struct(&fs_info->subvol_srcu);
1761 kfree(fs_info->extent_root);
1762 kfree(fs_info->tree_root);
1763 diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
1764 index 9596b40..ba5c3fd 100644
1765 --- a/fs/btrfs/export.c
1766 +++ b/fs/btrfs/export.c
1767 @@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
1768 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
1769 type = FILEID_BTRFS_WITHOUT_PARENT;
1771 - fid->objectid = BTRFS_I(inode)->location.objectid;
1772 + fid->objectid = inode->i_ino;
1773 fid->root_objectid = BTRFS_I(inode)->root->objectid;
1774 fid->gen = inode->i_generation;
1776 @@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
1779 static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
1780 - u64 root_objectid, u32 generation)
1781 + u64 root_objectid, u32 generation,
1782 + int check_generation)
1784 + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
1785 struct btrfs_root *root;
1786 + struct dentry *dentry;
1787 struct inode *inode;
1788 struct btrfs_key key;
1792 + if (objectid < BTRFS_FIRST_FREE_OBJECTID)
1793 + return ERR_PTR(-ESTALE);
1795 key.objectid = root_objectid;
1796 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
1797 key.offset = (u64)-1;
1799 - root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
1801 - return ERR_CAST(root);
1802 + index = srcu_read_lock(&fs_info->subvol_srcu);
1804 + root = btrfs_read_fs_root_no_name(fs_info, &key);
1805 + if (IS_ERR(root)) {
1806 + err = PTR_ERR(root);
1810 + if (btrfs_root_refs(&root->root_item) == 0) {
1815 key.objectid = objectid;
1816 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
1819 inode = btrfs_iget(sb, &key, root);
1820 - if (IS_ERR(inode))
1821 - return (void *)inode;
1822 + if (IS_ERR(inode)) {
1823 + err = PTR_ERR(inode);
1827 + srcu_read_unlock(&fs_info->subvol_srcu, index);
1829 - if (generation != inode->i_generation) {
1830 + if (check_generation && generation != inode->i_generation) {
1832 return ERR_PTR(-ESTALE);
1835 - return d_obtain_alias(inode);
1836 + dentry = d_obtain_alias(inode);
1837 + if (!IS_ERR(dentry))
1838 + dentry->d_op = &btrfs_dentry_operations;
1841 + srcu_read_unlock(&fs_info->subvol_srcu, index);
1842 + return ERR_PTR(err);
1845 static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
1846 @@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
1847 objectid = fid->parent_objectid;
1848 generation = fid->parent_gen;
1850 - return btrfs_get_dentry(sb, objectid, root_objectid, generation);
1851 + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
1854 static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
1855 @@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
1856 root_objectid = fid->root_objectid;
1857 generation = fid->gen;
1859 - return btrfs_get_dentry(sb, objectid, root_objectid, generation);
1860 + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
1863 static struct dentry *btrfs_get_parent(struct dentry *child)
1865 struct inode *dir = child->d_inode;
1866 + static struct dentry *dentry;
1867 struct btrfs_root *root = BTRFS_I(dir)->root;
1868 - struct btrfs_key key;
1869 struct btrfs_path *path;
1870 struct extent_buffer *leaf;
1873 + struct btrfs_root_ref *ref;
1874 + struct btrfs_key key;
1875 + struct btrfs_key found_key;
1878 path = btrfs_alloc_path();
1880 - key.objectid = dir->i_ino;
1881 - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
1882 - key.offset = (u64)-1;
1883 + if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
1884 + key.objectid = root->root_key.objectid;
1885 + key.type = BTRFS_ROOT_BACKREF_KEY;
1886 + key.offset = (u64)-1;
1887 + root = root->fs_info->tree_root;
1889 + key.objectid = dir->i_ino;
1890 + key.type = BTRFS_INODE_REF_KEY;
1891 + key.offset = (u64)-1;
1894 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1897 - btrfs_free_path(path);
1898 - return ERR_PTR(ret);
1903 + if (path->slots[0] == 0) {
1909 leaf = path->nodes[0];
1910 - slot = path->slots[0];
1912 - /* btrfs_search_slot() returns the slot where we'd want to
1913 - insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
1914 - The _real_ backref, telling us what the parent inode
1915 - _actually_ is, will be in the slot _before_ the one
1916 - that btrfs_search_slot() returns. */
1918 - /* Unless there is _no_ key in the tree before... */
1919 - btrfs_free_path(path);
1920 - return ERR_PTR(-EIO);
1924 + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1925 + if (found_key.objectid != key.objectid || found_key.type != key.type) {
1930 - btrfs_item_key_to_cpu(leaf, &key, slot);
1931 + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
1932 + ref = btrfs_item_ptr(leaf, path->slots[0],
1933 + struct btrfs_root_ref);
1934 + key.objectid = btrfs_root_ref_dirid(leaf, ref);
1936 + key.objectid = found_key.offset;
1938 btrfs_free_path(path);
1940 - if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
1941 - return ERR_PTR(-EINVAL);
1943 - objectid = key.offset;
1945 - /* If we are already at the root of a subvol, return the real root */
1946 - if (objectid == dir->i_ino)
1947 - return dget(dir->i_sb->s_root);
1948 + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
1949 + return btrfs_get_dentry(root->fs_info->sb, key.objectid,
1950 + found_key.offset, 0, 0);
1953 - /* Build a new key for the inode item */
1954 - key.objectid = objectid;
1955 - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
1956 + key.type = BTRFS_INODE_ITEM_KEY;
1959 - return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
1960 + dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
1961 + if (!IS_ERR(dentry))
1962 + dentry->d_op = &btrfs_dentry_operations;
1965 + btrfs_free_path(path);
1966 + return ERR_PTR(ret);
1969 const struct export_operations btrfs_export_ops = {
1970 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
1971 index 72a2b9c..c56f916 100644
1972 --- a/fs/btrfs/extent-tree.c
1973 +++ b/fs/btrfs/extent-tree.c
1975 #include "locking.h"
1976 #include "free-space-cache.h"
1978 -static int update_reserved_extents(struct btrfs_root *root,
1979 - u64 bytenr, u64 num, int reserve);
1980 static int update_block_group(struct btrfs_trans_handle *trans,
1981 struct btrfs_root *root,
1982 u64 bytenr, u64 num_bytes, int alloc,
1984 +static int update_reserved_extents(struct btrfs_block_group_cache *cache,
1985 + u64 num_bytes, int reserve);
1986 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
1987 struct btrfs_root *root,
1988 u64 bytenr, u64 num_bytes, u64 parent,
1989 @@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
1990 u64 parent, u64 root_objectid,
1991 u64 flags, struct btrfs_disk_key *key,
1992 int level, struct btrfs_key *ins);
1994 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1995 struct btrfs_root *extent_root, u64 alloc_bytes,
1996 u64 flags, int force);
1997 +static int pin_down_bytes(struct btrfs_trans_handle *trans,
1998 + struct btrfs_root *root,
1999 + struct btrfs_path *path,
2000 + u64 bytenr, u64 num_bytes,
2001 + int is_data, int reserved,
2002 + struct extent_buffer **must_clean);
2003 +static int find_next_key(struct btrfs_path *path, int level,
2004 + struct btrfs_key *key);
2005 +static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
2006 + int dump_block_groups);
2009 block_group_cache_done(struct btrfs_block_group_cache *cache)
2010 @@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
2015 - * We always set EXTENT_LOCKED for the super mirror extents so we don't
2016 - * overwrite them, so those bits need to be unset. Also, if we are unmounting
2017 - * with pinned extents still sitting there because we had a block group caching,
2018 - * we need to clear those now, since we are done.
2020 -void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
2021 +static int add_excluded_extent(struct btrfs_root *root,
2022 + u64 start, u64 num_bytes)
2024 - u64 start, end, last = 0;
2026 + u64 end = start + num_bytes - 1;
2027 + set_extent_bits(&root->fs_info->freed_extents[0],
2028 + start, end, EXTENT_UPTODATE, GFP_NOFS);
2029 + set_extent_bits(&root->fs_info->freed_extents[1],
2030 + start, end, EXTENT_UPTODATE, GFP_NOFS);
2035 - ret = find_first_extent_bit(&info->pinned_extents, last,
2037 - EXTENT_LOCKED|EXTENT_DIRTY);
2040 +static void free_excluded_extents(struct btrfs_root *root,
2041 + struct btrfs_block_group_cache *cache)
2045 - clear_extent_bits(&info->pinned_extents, start, end,
2046 - EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
2049 + start = cache->key.objectid;
2050 + end = start + cache->key.offset - 1;
2052 + clear_extent_bits(&root->fs_info->freed_extents[0],
2053 + start, end, EXTENT_UPTODATE, GFP_NOFS);
2054 + clear_extent_bits(&root->fs_info->freed_extents[1],
2055 + start, end, EXTENT_UPTODATE, GFP_NOFS);
2058 -static int remove_sb_from_cache(struct btrfs_root *root,
2059 - struct btrfs_block_group_cache *cache)
2060 +static int exclude_super_stripes(struct btrfs_root *root,
2061 + struct btrfs_block_group_cache *cache)
2063 - struct btrfs_fs_info *fs_info = root->fs_info;
2067 @@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
2068 cache->key.objectid, bytenr,
2069 0, &logical, &nr, &stripe_len);
2073 - try_lock_extent(&fs_info->pinned_extents,
2075 - logical[nr] + stripe_len - 1, GFP_NOFS);
2076 + cache->bytes_super += stripe_len;
2077 + ret = add_excluded_extent(root, logical[nr],
2088 +static struct btrfs_caching_control *
2089 +get_caching_control(struct btrfs_block_group_cache *cache)
2091 + struct btrfs_caching_control *ctl;
2093 + spin_lock(&cache->lock);
2094 + if (cache->cached != BTRFS_CACHE_STARTED) {
2095 + spin_unlock(&cache->lock);
2099 + ctl = cache->caching_ctl;
2100 + atomic_inc(&ctl->count);
2101 + spin_unlock(&cache->lock);
2105 +static void put_caching_control(struct btrfs_caching_control *ctl)
2107 + if (atomic_dec_and_test(&ctl->count))
2112 * this is only called by cache_block_group, since we could have freed extents
2113 * we need to check the pinned_extents for any extents that can't be used yet
2114 @@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
2117 while (start < end) {
2118 - ret = find_first_extent_bit(&info->pinned_extents, start,
2119 + ret = find_first_extent_bit(info->pinned_extents, start,
2120 &extent_start, &extent_end,
2121 - EXTENT_DIRTY|EXTENT_LOCKED);
2122 + EXTENT_DIRTY | EXTENT_UPTODATE);
2126 @@ -249,22 +283,27 @@ static int caching_kthread(void *data)
2128 struct btrfs_block_group_cache *block_group = data;
2129 struct btrfs_fs_info *fs_info = block_group->fs_info;
2131 + struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
2132 + struct btrfs_root *extent_root = fs_info->extent_root;
2133 struct btrfs_path *path;
2135 - struct btrfs_key key;
2136 struct extent_buffer *leaf;
2138 + struct btrfs_key key;
2139 u64 total_found = 0;
2146 path = btrfs_alloc_path();
2150 - atomic_inc(&block_group->space_info->caching_threads);
2151 + exclude_super_stripes(extent_root, block_group);
2152 + spin_lock(&block_group->space_info->lock);
2153 + block_group->space_info->bytes_super += block_group->bytes_super;
2154 + spin_unlock(&block_group->space_info->lock);
2156 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
2159 * We don't want to deadlock with somebody trying to allocate a new
2160 * extent for the extent root while also trying to search the extent
2161 @@ -277,74 +316,64 @@ static int caching_kthread(void *data)
2163 key.objectid = last;
2165 - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2166 + key.type = BTRFS_EXTENT_ITEM_KEY;
2168 + mutex_lock(&caching_ctl->mutex);
2169 /* need to make sure the commit_root doesn't disappear */
2170 down_read(&fs_info->extent_commit_sem);
2172 - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
2173 + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2177 + leaf = path->nodes[0];
2178 + nritems = btrfs_header_nritems(leaf);
2182 - if (block_group->fs_info->closing > 1) {
2183 + if (fs_info->closing > 1) {
2188 - leaf = path->nodes[0];
2189 - slot = path->slots[0];
2190 - if (slot >= btrfs_header_nritems(leaf)) {
2191 - ret = btrfs_next_leaf(fs_info->extent_root, path);
2195 + if (path->slots[0] < nritems) {
2196 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2198 + ret = find_next_key(path, 0, &key);
2202 - if (need_resched() ||
2203 - btrfs_transaction_in_commit(fs_info)) {
2204 - leaf = path->nodes[0];
2206 - /* this shouldn't happen, but if the
2207 - * leaf is empty just move on.
2209 - if (btrfs_header_nritems(leaf) == 0)
2212 - * we need to copy the key out so that
2213 - * we are sure the next search advances
2214 - * us forward in the btree.
2216 - btrfs_item_key_to_cpu(leaf, &key, 0);
2217 - btrfs_release_path(fs_info->extent_root, path);
2218 - up_read(&fs_info->extent_commit_sem);
2219 + caching_ctl->progress = last;
2220 + btrfs_release_path(extent_root, path);
2221 + up_read(&fs_info->extent_commit_sem);
2222 + mutex_unlock(&caching_ctl->mutex);
2223 + if (btrfs_transaction_in_commit(fs_info))
2224 schedule_timeout(1);
2232 + if (key.objectid < block_group->key.objectid) {
2236 - btrfs_item_key_to_cpu(leaf, &key, slot);
2237 - if (key.objectid < block_group->key.objectid)
2240 if (key.objectid >= block_group->key.objectid +
2241 block_group->key.offset)
2244 - if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
2245 + if (key.type == BTRFS_EXTENT_ITEM_KEY) {
2246 total_found += add_new_free_space(block_group,
2249 last = key.objectid + key.offset;
2252 - if (total_found > (1024 * 1024 * 2)) {
2254 - wake_up(&block_group->caching_q);
2255 + if (total_found > (1024 * 1024 * 2)) {
2257 + wake_up(&caching_ctl->wait);
2264 @@ -352,33 +381,65 @@ next:
2265 total_found += add_new_free_space(block_group, fs_info, last,
2266 block_group->key.objectid +
2267 block_group->key.offset);
2268 + caching_ctl->progress = (u64)-1;
2270 spin_lock(&block_group->lock);
2271 + block_group->caching_ctl = NULL;
2272 block_group->cached = BTRFS_CACHE_FINISHED;
2273 spin_unlock(&block_group->lock);
2276 btrfs_free_path(path);
2277 up_read(&fs_info->extent_commit_sem);
2278 - atomic_dec(&block_group->space_info->caching_threads);
2279 - wake_up(&block_group->caching_q);
2281 + free_excluded_extents(extent_root, block_group);
2283 + mutex_unlock(&caching_ctl->mutex);
2284 + wake_up(&caching_ctl->wait);
2286 + put_caching_control(caching_ctl);
2287 + atomic_dec(&block_group->space_info->caching_threads);
2291 static int cache_block_group(struct btrfs_block_group_cache *cache)
2293 + struct btrfs_fs_info *fs_info = cache->fs_info;
2294 + struct btrfs_caching_control *caching_ctl;
2295 struct task_struct *tsk;
2299 + if (cache->cached != BTRFS_CACHE_NO)
2302 + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
2303 + BUG_ON(!caching_ctl);
2305 + INIT_LIST_HEAD(&caching_ctl->list);
2306 + mutex_init(&caching_ctl->mutex);
2307 + init_waitqueue_head(&caching_ctl->wait);
2308 + caching_ctl->block_group = cache;
2309 + caching_ctl->progress = cache->key.objectid;
2310 + /* one for caching kthread, one for caching block group list */
2311 + atomic_set(&caching_ctl->count, 2);
2313 spin_lock(&cache->lock);
2314 if (cache->cached != BTRFS_CACHE_NO) {
2315 spin_unlock(&cache->lock);
2317 + kfree(caching_ctl);
2320 + cache->caching_ctl = caching_ctl;
2321 cache->cached = BTRFS_CACHE_STARTED;
2322 spin_unlock(&cache->lock);
2324 + down_write(&fs_info->extent_commit_sem);
2325 + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
2326 + up_write(&fs_info->extent_commit_sem);
2328 + atomic_inc(&cache->space_info->caching_threads);
2330 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
2331 cache->key.objectid);
2333 @@ -1507,22 +1568,22 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
2337 -#ifdef BIO_RW_DISCARD
2338 static void btrfs_issue_discard(struct block_device *bdev,
2341 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
2345 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2348 -#ifdef BIO_RW_DISCARD
2350 u64 map_length = num_bytes;
2351 struct btrfs_multi_bio *multi = NULL;
2353 + if (!btrfs_test_opt(root, DISCARD))
2356 /* Tell the block device(s) that the sectors can be discarded */
2357 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2358 bytenr, &map_length, &multi, 0);
2359 @@ -1542,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2368 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2369 @@ -1656,7 +1714,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2370 parent, ref_root, flags,
2371 ref->objectid, ref->offset,
2372 &ins, node->ref_mod);
2373 - update_reserved_extents(root, ins.objectid, ins.offset, 0);
2374 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2375 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2376 node->num_bytes, parent,
2377 @@ -1782,7 +1839,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2378 extent_op->flags_to_set,
2381 - update_reserved_extents(root, ins.objectid, ins.offset, 0);
2382 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2383 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2384 node->num_bytes, parent, ref_root,
2385 @@ -1817,16 +1873,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2387 head = btrfs_delayed_node_to_head(node);
2388 if (insert_reserved) {
2389 + int mark_free = 0;
2390 + struct extent_buffer *must_clean = NULL;
2392 + ret = pin_down_bytes(trans, root, NULL,
2393 + node->bytenr, node->num_bytes,
2394 + head->is_data, 1, &must_clean);
2399 + clean_tree_block(NULL, root, must_clean);
2400 + btrfs_tree_unlock(must_clean);
2401 + free_extent_buffer(must_clean);
2403 if (head->is_data) {
2404 ret = btrfs_del_csums(trans, root,
2409 - btrfs_update_pinned_extents(root, node->bytenr,
2410 - node->num_bytes, 1);
2411 - update_reserved_extents(root, node->bytenr,
2412 - node->num_bytes, 0);
2414 + ret = btrfs_free_reserved_extent(root,
2420 mutex_unlock(&head->mutex);
2422 @@ -2691,60 +2763,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2426 +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2431 + level = BTRFS_MAX_LEVEL - 2;
2433 + * NOTE: these calculations are absolutely the worst possible case.
2434 + * This assumes that _every_ item we insert will require a new leaf, and
2435 + * that the tree has grown to its maximum level size.
2439 + * for every item we insert we could insert both an extent item and a
2440 + * extent ref item. Then for every item we insert, we will need to cow
2441 + * both the original leaf, plus the leaf to the left and right of it.
2443 + * Unless we are talking about the extent root, then we just want the
2444 + * number of items * 2, since we just need the extent item plus its ref.
2446 + if (root == root->fs_info->extent_root)
2447 + num_bytes = num_items * 2;
2449 + num_bytes = (num_items + (2 * num_items)) * 3;
2452 + * num_bytes is total number of leaves we could need times the leaf
2453 + * size, and then for every leaf we could end up cow'ing 2 nodes per
2454 + * level, down to the leaf level.
2456 + num_bytes = (num_bytes * root->leafsize) +
2457 + (num_bytes * (level * 2)) * root->nodesize;
2463 - * for now this just makes sure we have at least 5% of our metadata space free
2465 + * Unreserve metadata space for delalloc. If we have less reserved credits than
2466 + * we have extents, this function does nothing.
2468 -int btrfs_check_metadata_free_space(struct btrfs_root *root)
2469 +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2470 + struct inode *inode, int num_items)
2472 struct btrfs_fs_info *info = root->fs_info;
2473 struct btrfs_space_info *meta_sinfo;
2474 - u64 alloc_target, thresh;
2475 - int committed = 0, ret;
2480 /* get the space info for where the metadata will live */
2481 alloc_target = btrfs_get_alloc_profile(root, 0);
2482 meta_sinfo = __find_space_info(info, alloc_target);
2485 + num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2488 spin_lock(&meta_sinfo->lock);
2489 - if (!meta_sinfo->full)
2490 - thresh = meta_sinfo->total_bytes * 80;
2492 - thresh = meta_sinfo->total_bytes * 95;
2493 + spin_lock(&BTRFS_I(inode)->accounting_lock);
2494 + if (BTRFS_I(inode)->reserved_extents <=
2495 + BTRFS_I(inode)->outstanding_extents) {
2496 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
2497 + spin_unlock(&meta_sinfo->lock);
2500 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
2502 + BTRFS_I(inode)->reserved_extents--;
2503 + BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2505 + if (meta_sinfo->bytes_delalloc < num_bytes) {
2507 + meta_sinfo->bytes_delalloc = 0;
2509 + meta_sinfo->bytes_delalloc -= num_bytes;
2511 + spin_unlock(&meta_sinfo->lock);
2518 +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2522 + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2523 + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2524 + meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2525 + meta_sinfo->bytes_may_use;
2527 + thresh = meta_sinfo->total_bytes - thresh;
2529 do_div(thresh, 100);
2530 + if (thresh <= meta_sinfo->bytes_delalloc)
2531 + meta_sinfo->force_delalloc = 1;
2533 + meta_sinfo->force_delalloc = 0;
2536 - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2537 - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2538 - struct btrfs_trans_handle *trans;
2539 - if (!meta_sinfo->full) {
2540 - meta_sinfo->force_alloc = 1;
2541 - spin_unlock(&meta_sinfo->lock);
2542 +struct async_flush {
2543 + struct btrfs_root *root;
2544 + struct btrfs_space_info *info;
2545 + struct btrfs_work work;
2548 - trans = btrfs_start_transaction(root, 1);
2551 +static noinline void flush_delalloc_async(struct btrfs_work *work)
2553 + struct async_flush *async;
2554 + struct btrfs_root *root;
2555 + struct btrfs_space_info *info;
2557 - ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2558 - 2 * 1024 * 1024, alloc_target, 0);
2559 - btrfs_end_transaction(trans, root);
2560 + async = container_of(work, struct async_flush, work);
2561 + root = async->root;
2562 + info = async->info;
2564 + btrfs_start_delalloc_inodes(root);
2565 + wake_up(&info->flush_wait);
2566 + btrfs_wait_ordered_extents(root, 0);
2568 + spin_lock(&info->lock);
2569 + info->flushing = 0;
2570 + spin_unlock(&info->lock);
2571 + wake_up(&info->flush_wait);
2576 +static void wait_on_flush(struct btrfs_space_info *info)
2578 + DEFINE_WAIT(wait);
2582 + prepare_to_wait(&info->flush_wait, &wait,
2583 + TASK_UNINTERRUPTIBLE);
2584 + spin_lock(&info->lock);
2585 + if (!info->flushing) {
2586 + spin_unlock(&info->lock);
2590 + used = info->bytes_used + info->bytes_reserved +
2591 + info->bytes_pinned + info->bytes_readonly +
2592 + info->bytes_super + info->bytes_root +
2593 + info->bytes_may_use + info->bytes_delalloc;
2594 + if (used < info->total_bytes) {
2595 + spin_unlock(&info->lock);
2598 + spin_unlock(&info->lock);
2601 + finish_wait(&info->flush_wait, &wait);
2604 +static void flush_delalloc(struct btrfs_root *root,
2605 + struct btrfs_space_info *info)
2607 + struct async_flush *async;
2608 + bool wait = false;
2610 + spin_lock(&info->lock);
2612 + if (!info->flushing) {
2613 + info->flushing = 1;
2614 + init_waitqueue_head(&info->flush_wait);
2619 + spin_unlock(&info->lock);
2622 + wait_on_flush(info);
2626 + async = kzalloc(sizeof(*async), GFP_NOFS);
2630 + async->root = root;
2631 + async->info = info;
2632 + async->work.func = flush_delalloc_async;
2634 + btrfs_queue_worker(&root->fs_info->enospc_workers,
2636 + wait_on_flush(info);
2640 + btrfs_start_delalloc_inodes(root);
2641 + btrfs_wait_ordered_extents(root, 0);
2643 + spin_lock(&info->lock);
2644 + info->flushing = 0;
2645 + spin_unlock(&info->lock);
2646 + wake_up(&info->flush_wait);
2649 +static int maybe_allocate_chunk(struct btrfs_root *root,
2650 + struct btrfs_space_info *info)
2652 + struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2653 + struct btrfs_trans_handle *trans;
2654 + bool wait = false;
2659 + free_space = btrfs_super_total_bytes(disk_super);
2661 + * we allow the metadata to grow to a max of either 5gb or 5% of the
2662 + * space in the volume.
2664 + min_metadata = min((u64)5 * 1024 * 1024 * 1024,
2665 + div64_u64(free_space * 5, 100));
2666 + if (info->total_bytes >= min_metadata) {
2667 + spin_unlock(&info->lock);
2672 + spin_unlock(&info->lock);
2676 + if (!info->allocating_chunk) {
2677 + info->force_alloc = 1;
2678 + info->allocating_chunk = 1;
2679 + init_waitqueue_head(&info->allocate_wait);
2684 + spin_unlock(&info->lock);
2687 + wait_event(info->allocate_wait,
2688 + !info->allocating_chunk);
2692 + trans = btrfs_start_transaction(root, 1);
2698 + ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2699 + 4096 + 2 * 1024 * 1024,
2701 + btrfs_end_transaction(trans, root);
2705 + spin_lock(&info->lock);
2706 + info->allocating_chunk = 0;
2707 + spin_unlock(&info->lock);
2708 + wake_up(&info->allocate_wait);
2716 + * Reserve metadata space for delalloc.
2718 +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2719 + struct inode *inode, int num_items)
2721 + struct btrfs_fs_info *info = root->fs_info;
2722 + struct btrfs_space_info *meta_sinfo;
2727 + int force_delalloc;
2729 + /* get the space info for where the metadata will live */
2730 + alloc_target = btrfs_get_alloc_profile(root, 0);
2731 + meta_sinfo = __find_space_info(info, alloc_target);
2733 + num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2736 + spin_lock(&meta_sinfo->lock);
2738 + force_delalloc = meta_sinfo->force_delalloc;
2740 + if (unlikely(!meta_sinfo->bytes_root))
2741 + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2744 + meta_sinfo->bytes_delalloc += num_bytes;
2746 + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2747 + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2748 + meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2749 + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2751 + if (used > meta_sinfo->total_bytes) {
2754 + if (flushed == 1) {
2755 + if (maybe_allocate_chunk(root, meta_sinfo))
2759 + spin_unlock(&meta_sinfo->lock);
2762 + if (flushed == 2) {
2763 + filemap_flush(inode->i_mapping);
2765 + } else if (flushed == 3) {
2766 + flush_delalloc(root, meta_sinfo);
2769 + spin_lock(&meta_sinfo->lock);
2770 + meta_sinfo->bytes_delalloc -= num_bytes;
2771 spin_unlock(&meta_sinfo->lock);
2772 + printk(KERN_ERR "enospc, has %d, reserved %d\n",
2773 + BTRFS_I(inode)->outstanding_extents,
2774 + BTRFS_I(inode)->reserved_extents);
2775 + dump_space_info(meta_sinfo, 0, 0);
2781 - trans = btrfs_join_transaction(root, 1);
2784 - ret = btrfs_commit_transaction(trans, root);
2787 + BTRFS_I(inode)->reserved_extents++;
2788 + check_force_delalloc(meta_sinfo);
2789 + spin_unlock(&meta_sinfo->lock);
2791 + if (!flushed && force_delalloc)
2792 + filemap_flush(inode->i_mapping);
2798 + * unreserve num_items number of items worth of metadata space. This needs to
2799 + * be paired with btrfs_reserve_metadata_space.
2801 + * NOTE: if you have the option, run this _AFTER_ you do a
2802 + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
2803 + * operations which will result in more used metadata, so we want to make sure we
2804 + * can do that without issue.
2806 +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
2808 + struct btrfs_fs_info *info = root->fs_info;
2809 + struct btrfs_space_info *meta_sinfo;
2814 + /* get the space info for where the metadata will live */
2815 + alloc_target = btrfs_get_alloc_profile(root, 0);
2816 + meta_sinfo = __find_space_info(info, alloc_target);
2818 + num_bytes = calculate_bytes_needed(root, num_items);
2820 + spin_lock(&meta_sinfo->lock);
2821 + if (meta_sinfo->bytes_may_use < num_bytes) {
2823 + meta_sinfo->bytes_may_use = 0;
2825 + meta_sinfo->bytes_may_use -= num_bytes;
2827 + spin_unlock(&meta_sinfo->lock);
2835 + * Reserve some metadata space for use. We'll calculate the worst case number
2836 + * of bytes that would be needed to modify num_items number of items. If we
2837 + * have space, fantastic, if not, you get -ENOSPC. Please call
2838 + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
2839 + * items you reserved, since whatever metadata you needed should have already
2842 + * This will commit the transaction to make more space if we don't have enough
2843 + * metadata space. The only time we don't do this is if we're reserving space
2844 + * inside of a transaction, then we will just return -ENOSPC and it is the
2845 + * caller's responsibility to handle it properly.
2847 +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
2849 + struct btrfs_fs_info *info = root->fs_info;
2850 + struct btrfs_space_info *meta_sinfo;
2856 + /* get the space info for where the metadata will live */
2857 + alloc_target = btrfs_get_alloc_profile(root, 0);
2858 + meta_sinfo = __find_space_info(info, alloc_target);
2860 + num_bytes = calculate_bytes_needed(root, num_items);
2862 + spin_lock(&meta_sinfo->lock);
2864 + if (unlikely(!meta_sinfo->bytes_root))
2865 + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2868 + meta_sinfo->bytes_may_use += num_bytes;
2870 + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2871 + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2872 + meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2873 + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2875 + if (used > meta_sinfo->total_bytes) {
2877 + if (retries == 1) {
2878 + if (maybe_allocate_chunk(root, meta_sinfo))
2882 + spin_unlock(&meta_sinfo->lock);
2885 + if (retries == 2) {
2886 + flush_delalloc(root, meta_sinfo);
2889 + spin_lock(&meta_sinfo->lock);
2890 + meta_sinfo->bytes_may_use -= num_bytes;
2891 + spin_unlock(&meta_sinfo->lock);
2893 + dump_space_info(meta_sinfo, 0, 0);
2897 + check_force_delalloc(meta_sinfo);
2898 spin_unlock(&meta_sinfo->lock);
2901 @@ -2764,13 +3224,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2902 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2904 data_sinfo = BTRFS_I(inode)->space_info;
2909 /* make sure we have enough space to handle the data first */
2910 spin_lock(&data_sinfo->lock);
2911 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2912 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2913 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2914 - data_sinfo->bytes_may_use < bytes) {
2915 + data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
2916 struct btrfs_trans_handle *trans;
2919 @@ -2782,7 +3245,7 @@ again:
2921 data_sinfo->force_alloc = 1;
2922 spin_unlock(&data_sinfo->lock);
2925 alloc_target = btrfs_get_alloc_profile(root, 1);
2926 trans = btrfs_start_transaction(root, 1);
2928 @@ -2794,12 +3257,17 @@ again:
2929 btrfs_end_transaction(trans, root);
2933 + if (!data_sinfo) {
2934 + btrfs_set_inode_space_info(root, inode);
2935 + data_sinfo = BTRFS_I(inode)->space_info;
2939 spin_unlock(&data_sinfo->lock);
2941 /* commit the current transaction and try again */
2943 + if (!committed && !root->fs_info->open_ioctl_trans) {
2945 trans = btrfs_join_transaction(root, 1);
2947 @@ -2827,7 +3295,7 @@ again:
2948 BTRFS_I(inode)->reserved_bytes += bytes;
2949 spin_unlock(&data_sinfo->lock);
2951 - return btrfs_check_metadata_free_space(root);
2956 @@ -2926,17 +3394,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2957 BUG_ON(!space_info);
2959 spin_lock(&space_info->lock);
2960 - if (space_info->force_alloc) {
2961 + if (space_info->force_alloc)
2963 - space_info->force_alloc = 0;
2965 if (space_info->full) {
2966 spin_unlock(&space_info->lock);
2970 thresh = space_info->total_bytes - space_info->bytes_readonly;
2971 - thresh = div_factor(thresh, 6);
2972 + thresh = div_factor(thresh, 8);
2974 (space_info->bytes_used + space_info->bytes_pinned +
2975 space_info->bytes_reserved + alloc_bytes) < thresh) {
2976 @@ -2950,7 +3416,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2977 * we keep a reasonable number of metadata chunks allocated in the
2980 - if (flags & BTRFS_BLOCK_GROUP_DATA) {
2981 + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
2982 fs_info->data_chunk_allocations++;
2983 if (!(fs_info->data_chunk_allocations %
2984 fs_info->metadata_ratio))
2985 @@ -2958,8 +3424,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2988 ret = btrfs_alloc_chunk(trans, extent_root, flags);
2989 + spin_lock(&space_info->lock);
2991 space_info->full = 1;
2992 + space_info->force_alloc = 0;
2993 + spin_unlock(&space_info->lock);
2995 mutex_unlock(&extent_root->fs_info->chunk_mutex);
2997 @@ -3008,10 +3477,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2998 num_bytes = min(total, cache->key.offset - byte_in_group);
3000 old_val += num_bytes;
3001 + btrfs_set_block_group_used(&cache->item, old_val);
3002 + cache->reserved -= num_bytes;
3003 cache->space_info->bytes_used += num_bytes;
3004 + cache->space_info->bytes_reserved -= num_bytes;
3006 cache->space_info->bytes_readonly -= num_bytes;
3007 - btrfs_set_block_group_used(&cache->item, old_val);
3008 spin_unlock(&cache->lock);
3009 spin_unlock(&cache->space_info->lock);
3011 @@ -3056,127 +3527,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3015 -int btrfs_update_pinned_extents(struct btrfs_root *root,
3016 - u64 bytenr, u64 num, int pin)
3018 + * this function must be called within transaction
3020 +int btrfs_pin_extent(struct btrfs_root *root,
3021 + u64 bytenr, u64 num_bytes, int reserved)
3024 - struct btrfs_block_group_cache *cache;
3025 struct btrfs_fs_info *fs_info = root->fs_info;
3026 + struct btrfs_block_group_cache *cache;
3029 - set_extent_dirty(&fs_info->pinned_extents,
3030 - bytenr, bytenr + num - 1, GFP_NOFS);
3033 - cache = btrfs_lookup_block_group(fs_info, bytenr);
3035 - len = min(num, cache->key.offset -
3036 - (bytenr - cache->key.objectid));
3038 - spin_lock(&cache->space_info->lock);
3039 - spin_lock(&cache->lock);
3040 - cache->pinned += len;
3041 - cache->space_info->bytes_pinned += len;
3042 - spin_unlock(&cache->lock);
3043 - spin_unlock(&cache->space_info->lock);
3044 - fs_info->total_pinned += len;
3047 + cache = btrfs_lookup_block_group(fs_info, bytenr);
3051 - * in order to not race with the block group caching, we
3052 - * only want to unpin the extent if we are cached. If
3053 - * we aren't cached, we want to start async caching this
3054 - * block group so we can free the extent the next time
3057 - spin_lock(&cache->space_info->lock);
3058 - spin_lock(&cache->lock);
3059 - unpin = (cache->cached == BTRFS_CACHE_FINISHED);
3060 - if (likely(unpin)) {
3061 - cache->pinned -= len;
3062 - cache->space_info->bytes_pinned -= len;
3063 - fs_info->total_pinned -= len;
3065 - spin_unlock(&cache->lock);
3066 - spin_unlock(&cache->space_info->lock);
3067 + spin_lock(&cache->space_info->lock);
3068 + spin_lock(&cache->lock);
3069 + cache->pinned += num_bytes;
3070 + cache->space_info->bytes_pinned += num_bytes;
3072 + cache->reserved -= num_bytes;
3073 + cache->space_info->bytes_reserved -= num_bytes;
3075 + spin_unlock(&cache->lock);
3076 + spin_unlock(&cache->space_info->lock);
3078 - if (likely(unpin))
3079 - clear_extent_dirty(&fs_info->pinned_extents,
3080 - bytenr, bytenr + len -1,
3083 - cache_block_group(cache);
3084 + btrfs_put_block_group(cache);
3087 - btrfs_add_free_space(cache, bytenr, len);
3089 - btrfs_put_block_group(cache);
3092 + set_extent_dirty(fs_info->pinned_extents,
3093 + bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3097 +static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3098 + u64 num_bytes, int reserve)
3100 + spin_lock(&cache->space_info->lock);
3101 + spin_lock(&cache->lock);
3103 + cache->reserved += num_bytes;
3104 + cache->space_info->bytes_reserved += num_bytes;
3106 + cache->reserved -= num_bytes;
3107 + cache->space_info->bytes_reserved -= num_bytes;
3109 + spin_unlock(&cache->lock);
3110 + spin_unlock(&cache->space_info->lock);
3114 -static int update_reserved_extents(struct btrfs_root *root,
3115 - u64 bytenr, u64 num, int reserve)
3116 +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3117 + struct btrfs_root *root)
3120 - struct btrfs_block_group_cache *cache;
3121 struct btrfs_fs_info *fs_info = root->fs_info;
3122 + struct btrfs_caching_control *next;
3123 + struct btrfs_caching_control *caching_ctl;
3124 + struct btrfs_block_group_cache *cache;
3127 - cache = btrfs_lookup_block_group(fs_info, bytenr);
3129 - len = min(num, cache->key.offset -
3130 - (bytenr - cache->key.objectid));
3131 + down_write(&fs_info->extent_commit_sem);
3133 - spin_lock(&cache->space_info->lock);
3134 - spin_lock(&cache->lock);
3136 - cache->reserved += len;
3137 - cache->space_info->bytes_reserved += len;
3138 + list_for_each_entry_safe(caching_ctl, next,
3139 + &fs_info->caching_block_groups, list) {
3140 + cache = caching_ctl->block_group;
3141 + if (block_group_cache_done(cache)) {
3142 + cache->last_byte_to_unpin = (u64)-1;
3143 + list_del_init(&caching_ctl->list);
3144 + put_caching_control(caching_ctl);
3146 - cache->reserved -= len;
3147 - cache->space_info->bytes_reserved -= len;
3148 + cache->last_byte_to_unpin = caching_ctl->progress;
3150 - spin_unlock(&cache->lock);
3151 - spin_unlock(&cache->space_info->lock);
3152 - btrfs_put_block_group(cache);
3157 + if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3158 + fs_info->pinned_extents = &fs_info->freed_extents[1];
3160 + fs_info->pinned_extents = &fs_info->freed_extents[0];
3162 + up_write(&fs_info->extent_commit_sem);
3166 -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
3167 +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3172 - struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
3174 + struct btrfs_fs_info *fs_info = root->fs_info;
3175 + struct btrfs_block_group_cache *cache = NULL;
3179 - ret = find_first_extent_bit(pinned_extents, last,
3180 - &start, &end, EXTENT_DIRTY);
3183 + while (start <= end) {
3185 + start >= cache->key.objectid + cache->key.offset) {
3187 + btrfs_put_block_group(cache);
3188 + cache = btrfs_lookup_block_group(fs_info, start);
3192 - set_extent_dirty(copy, start, end, GFP_NOFS);
3194 + len = cache->key.objectid + cache->key.offset - start;
3195 + len = min(len, end + 1 - start);
3197 + if (start < cache->last_byte_to_unpin) {
3198 + len = min(len, cache->last_byte_to_unpin - start);
3199 + btrfs_add_free_space(cache, start, len);
3202 + spin_lock(&cache->space_info->lock);
3203 + spin_lock(&cache->lock);
3204 + cache->pinned -= len;
3205 + cache->space_info->bytes_pinned -= len;
3206 + spin_unlock(&cache->lock);
3207 + spin_unlock(&cache->space_info->lock);
3213 + btrfs_put_block_group(cache);
3217 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3218 - struct btrfs_root *root,
3219 - struct extent_io_tree *unpin)
3220 + struct btrfs_root *root)
3222 + struct btrfs_fs_info *fs_info = root->fs_info;
3223 + struct extent_io_tree *unpin;
3228 + if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3229 + unpin = &fs_info->freed_extents[1];
3231 + unpin = &fs_info->freed_extents[0];
3234 ret = find_first_extent_bit(unpin, 0, &start, &end,
3236 @@ -3185,10 +3665,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3238 ret = btrfs_discard_extent(root, start, end + 1 - start);
3240 - /* unlocks the pinned mutex */
3241 - btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
3242 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3244 + unpin_extent_range(root, start, end);
3248 @@ -3198,7 +3676,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3249 static int pin_down_bytes(struct btrfs_trans_handle *trans,
3250 struct btrfs_root *root,
3251 struct btrfs_path *path,
3252 - u64 bytenr, u64 num_bytes, int is_data,
3253 + u64 bytenr, u64 num_bytes,
3254 + int is_data, int reserved,
3255 struct extent_buffer **must_clean)
3258 @@ -3207,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3263 + * discard is sloooow, and so triggering discards on
3264 + * individual btree blocks isn't a good plan. Just
3265 + * pin everything in discard mode.
3267 + if (btrfs_test_opt(root, DISCARD))
3270 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3273 @@ -3230,15 +3717,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3275 free_extent_buffer(buf);
3277 - btrfs_set_path_blocking(path);
3279 + btrfs_set_path_blocking(path);
3280 /* unlocks the pinned mutex */
3281 - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3282 + btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3289 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3290 struct btrfs_root *root,
3291 u64 bytenr, u64 num_bytes, u64 parent,
3292 @@ -3412,7 +3899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3295 ret = pin_down_bytes(trans, root, path, bytenr,
3296 - num_bytes, is_data, &must_clean);
3297 + num_bytes, is_data, 0, &must_clean);
3301 @@ -3543,8 +4030,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3302 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
3303 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3304 /* unlocks the pinned mutex */
3305 - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3306 - update_reserved_extents(root, bytenr, num_bytes, 0);
3307 + btrfs_pin_extent(root, bytenr, num_bytes, 1);
3309 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
3310 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
3311 @@ -3584,19 +4070,33 @@ static noinline int
3312 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3315 + struct btrfs_caching_control *caching_ctl;
3318 - prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3320 - if (block_group_cache_done(cache)) {
3321 - finish_wait(&cache->caching_q, &wait);
3322 + caching_ctl = get_caching_control(cache);
3327 - finish_wait(&cache->caching_q, &wait);
3329 - wait_event(cache->caching_q, block_group_cache_done(cache) ||
3330 + wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
3331 (cache->free_space >= num_bytes));
3333 + put_caching_control(caching_ctl);
3337 +static noinline int
3338 +wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
3340 + struct btrfs_caching_control *caching_ctl;
3341 + DEFINE_WAIT(wait);
3343 + caching_ctl = get_caching_control(cache);
3347 + wait_event(caching_ctl->wait, block_group_cache_done(cache));
3349 + put_caching_control(caching_ctl);
3353 @@ -3634,6 +4134,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3354 int last_ptr_loop = 0;
3356 bool found_uncached_bg = false;
3357 + bool failed_cluster_refill = false;
3358 + bool failed_alloc = false;
3360 WARN_ON(num_bytes < root->sectorsize);
3361 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3362 @@ -3731,7 +4233,16 @@ have_block_group:
3363 if (unlikely(block_group->ro))
3368 + * Ok we want to try and use the cluster allocator, so let's look
3369 + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
3370 + * have tried the cluster allocator plenty of times at this
3371 + * point and not have found anything, so we are likely way too
3372 + * fragmented for the clustering stuff to find anything, so let's
3373 + * just skip it and let the allocator find whatever block it can
3376 + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
3378 * the refill lock keeps out other
3379 * people trying to start a new cluster
3380 @@ -3806,9 +4317,11 @@ refill_cluster:
3381 spin_unlock(&last_ptr->refill_lock);
3384 - } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3385 + } else if (!cached && loop > LOOP_CACHING_NOWAIT
3386 + && !failed_cluster_refill) {
3387 spin_unlock(&last_ptr->refill_lock);
3389 + failed_cluster_refill = true;
3390 wait_block_group_cache_progress(block_group,
3391 num_bytes + empty_cluster + empty_size);
3392 goto have_block_group;
3393 @@ -3820,25 +4333,30 @@ refill_cluster:
3394 * cluster. Free the cluster we've been trying
3395 * to use, and go to the next block group
3397 - if (loop < LOOP_NO_EMPTY_SIZE) {
3398 - btrfs_return_cluster_to_free_space(NULL,
3400 - spin_unlock(&last_ptr->refill_lock);
3403 + btrfs_return_cluster_to_free_space(NULL, last_ptr);
3404 spin_unlock(&last_ptr->refill_lock);
3408 offset = btrfs_find_space_for_alloc(block_group, search_start,
3409 num_bytes, empty_size);
3410 - if (!offset && (cached || (!cached &&
3411 - loop == LOOP_CACHING_NOWAIT))) {
3413 - } else if (!offset && (!cached &&
3414 - loop > LOOP_CACHING_NOWAIT)) {
3416 + * If we didn't find a chunk, and we haven't failed on this
3417 + * block group before, and this block group is in the middle of
3418 + * caching and we are ok with waiting, then go ahead and wait
3419 + * for progress to be made, and set failed_alloc to true.
3421 + * If failed_alloc is true then we've already waited on this
3422 + * block group once and should move on to the next block group.
3424 + if (!offset && !failed_alloc && !cached &&
3425 + loop > LOOP_CACHING_NOWAIT) {
3426 wait_block_group_cache_progress(block_group,
3427 - num_bytes + empty_size);
3428 + num_bytes + empty_size);
3429 + failed_alloc = true;
3430 goto have_block_group;
3431 + } else if (!offset) {
3435 search_start = stripe_align(root, offset);
3436 @@ -3880,9 +4398,13 @@ checks:
3437 search_start - offset);
3438 BUG_ON(offset > search_start);
3440 + update_reserved_extents(block_group, num_bytes, 1);
3442 /* we are all good, lets return */
3445 + failed_cluster_refill = false;
3446 + failed_alloc = false;
3447 btrfs_put_block_group(block_group);
3449 up_read(&space_info->groups_sem);
3450 @@ -3940,21 +4462,32 @@ loop:
3454 -static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3455 +static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
3456 + int dump_block_groups)
3458 struct btrfs_block_group_cache *cache;
3460 + spin_lock(&info->lock);
3461 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3462 (unsigned long long)(info->total_bytes - info->bytes_used -
3463 - info->bytes_pinned - info->bytes_reserved),
3464 + info->bytes_pinned - info->bytes_reserved -
3465 + info->bytes_super),
3466 (info->full) ? "" : "not ");
3467 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3468 - " may_use=%llu, used=%llu\n",
3469 + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
3471 (unsigned long long)info->total_bytes,
3472 (unsigned long long)info->bytes_pinned,
3473 (unsigned long long)info->bytes_delalloc,
3474 (unsigned long long)info->bytes_may_use,
3475 - (unsigned long long)info->bytes_used);
3476 + (unsigned long long)info->bytes_used,
3477 + (unsigned long long)info->bytes_root,
3478 + (unsigned long long)info->bytes_super,
3479 + (unsigned long long)info->bytes_reserved);
3480 + spin_unlock(&info->lock);
3482 + if (!dump_block_groups)
3485 down_read(&info->groups_sem);
3486 list_for_each_entry(cache, &info->block_groups, list) {
3487 @@ -3972,12 +4505,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3488 up_read(&info->groups_sem);
3491 -static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3492 - struct btrfs_root *root,
3493 - u64 num_bytes, u64 min_alloc_size,
3494 - u64 empty_size, u64 hint_byte,
3495 - u64 search_end, struct btrfs_key *ins,
3497 +int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3498 + struct btrfs_root *root,
3499 + u64 num_bytes, u64 min_alloc_size,
3500 + u64 empty_size, u64 hint_byte,
3501 + u64 search_end, struct btrfs_key *ins,
3505 u64 search_start = 0;
3506 @@ -4022,7 +4555,7 @@ again:
3507 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3508 "wanted %llu\n", (unsigned long long)data,
3509 (unsigned long long)num_bytes);
3510 - dump_space_info(sinfo, num_bytes);
3511 + dump_space_info(sinfo, num_bytes, 1);
3515 @@ -4043,25 +4576,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3516 ret = btrfs_discard_extent(root, start, len);
3518 btrfs_add_free_space(cache, start, len);
3519 + update_reserved_extents(cache, len, 0);
3520 btrfs_put_block_group(cache);
3521 - update_reserved_extents(root, start, len, 0);
3526 -int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3527 - struct btrfs_root *root,
3528 - u64 num_bytes, u64 min_alloc_size,
3529 - u64 empty_size, u64 hint_byte,
3530 - u64 search_end, struct btrfs_key *ins,
3534 - ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3535 - empty_size, hint_byte, search_end, ins,
3538 - update_reserved_extents(root, ins->objectid, ins->offset, 1);
3542 @@ -4222,15 +4738,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
3545 struct btrfs_block_group_cache *block_group;
3546 + struct btrfs_caching_control *caching_ctl;
3547 + u64 start = ins->objectid;
3548 + u64 num_bytes = ins->offset;
3550 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3551 cache_block_group(block_group);
3552 - wait_event(block_group->caching_q,
3553 - block_group_cache_done(block_group));
3554 + caching_ctl = get_caching_control(block_group);
3556 - ret = btrfs_remove_free_space(block_group, ins->objectid,
3559 + if (!caching_ctl) {
3560 + BUG_ON(!block_group_cache_done(block_group));
3561 + ret = btrfs_remove_free_space(block_group, start, num_bytes);
3564 + mutex_lock(&caching_ctl->mutex);
3566 + if (start >= caching_ctl->progress) {
3567 + ret = add_excluded_extent(root, start, num_bytes);
3569 + } else if (start + num_bytes <= caching_ctl->progress) {
3570 + ret = btrfs_remove_free_space(block_group,
3571 + start, num_bytes);
3574 + num_bytes = caching_ctl->progress - start;
3575 + ret = btrfs_remove_free_space(block_group,
3576 + start, num_bytes);
3579 + start = caching_ctl->progress;
3580 + num_bytes = ins->objectid + ins->offset -
3581 + caching_ctl->progress;
3582 + ret = add_excluded_extent(root, start, num_bytes);
3586 + mutex_unlock(&caching_ctl->mutex);
3587 + put_caching_control(caching_ctl);
3590 + update_reserved_extents(block_group, ins->offset, 1);
3591 btrfs_put_block_group(block_group);
3592 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
3593 0, owner, offset, ins, 1);
3594 @@ -4254,9 +4801,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
3598 - ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
3599 - empty_size, hint_byte, search_end,
3601 + ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
3602 + empty_size, hint_byte, search_end,
3607 @@ -4267,7 +4814,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
3611 - update_reserved_extents(root, ins->objectid, ins->offset, 1);
3612 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3613 struct btrfs_delayed_extent_op *extent_op;
3614 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3615 @@ -4346,452 +4892,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3620 -int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3621 - struct btrfs_root *root, struct extent_buffer *leaf)
3625 - struct btrfs_key key;
3626 - struct btrfs_file_extent_item *fi;
3631 - BUG_ON(!btrfs_is_leaf(leaf));
3632 - nritems = btrfs_header_nritems(leaf);
3634 - for (i = 0; i < nritems; i++) {
3636 - btrfs_item_key_to_cpu(leaf, &key, i);
3638 - /* only extents have references, skip everything else */
3639 - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3642 - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3644 - /* inline extents live in the btree, they don't have refs */
3645 - if (btrfs_file_extent_type(leaf, fi) ==
3646 - BTRFS_FILE_EXTENT_INLINE)
3649 - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3651 - /* holes don't have refs */
3652 - if (disk_bytenr == 0)
3655 - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
3656 - ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
3657 - leaf->start, 0, key.objectid, 0);
3663 -static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3664 - struct btrfs_root *root,
3665 - struct btrfs_leaf_ref *ref)
3669 - struct btrfs_extent_info *info;
3670 - struct refsort *sorted;
3672 - if (ref->nritems == 0)
3675 - sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3676 - for (i = 0; i < ref->nritems; i++) {
3677 - sorted[i].bytenr = ref->extents[i].bytenr;
3678 - sorted[i].slot = i;
3680 - sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3683 - * the items in the ref were sorted when the ref was inserted
3684 - * into the ref cache, so this is already in order
3686 - for (i = 0; i < ref->nritems; i++) {
3687 - info = ref->extents + sorted[i].slot;
3688 - ret = btrfs_free_extent(trans, root, info->bytenr,
3689 - info->num_bytes, ref->bytenr,
3690 - ref->owner, ref->generation,
3691 - info->objectid, 0);
3693 - atomic_inc(&root->fs_info->throttle_gen);
3694 - wake_up(&root->fs_info->transaction_throttle);
3706 -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3707 - struct btrfs_root *root, u64 start,
3708 - u64 len, u32 *refs)
3712 - ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
3715 -#if 0 /* some debugging code in case we see problems here */
3716 - /* if the refs count is one, it won't get increased again. But
3717 - * if the ref count is > 1, someone may be decreasing it at
3718 - * the same time we are.
3721 - struct extent_buffer *eb = NULL;
3722 - eb = btrfs_find_create_tree_block(root, start, len);
3724 - btrfs_tree_lock(eb);
3726 - mutex_lock(&root->fs_info->alloc_mutex);
3727 - ret = lookup_extent_ref(NULL, root, start, len, refs);
3729 - mutex_unlock(&root->fs_info->alloc_mutex);
3732 - btrfs_tree_unlock(eb);
3733 - free_extent_buffer(eb);
3736 - printk(KERN_ERR "btrfs block %llu went down to one "
3737 - "during drop_snap\n", (unsigned long long)start);
3746 +struct walk_control {
3747 + u64 refs[BTRFS_MAX_LEVEL];
3748 + u64 flags[BTRFS_MAX_LEVEL];
3749 + struct btrfs_key update_progress;
3759 +#define DROP_REFERENCE 1
3760 +#define UPDATE_BACKREF 2
3763 - * this is used while deleting old snapshots, and it drops the refs
3764 - * on a whole subtree starting from a level 1 node.
3766 - * The idea is to sort all the leaf pointers, and then drop the
3767 - * ref on all the leaves in order. Most of the time the leaves
3768 - * will have ref cache entries, so no leaf IOs will be required to
3769 - * find the extents they have references on.
3771 - * For each leaf, any references it has are also dropped in order
3773 - * This ends up dropping the references in something close to optimal
3774 - * order for reading and modifying the extent allocation tree.
3776 -static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3777 - struct btrfs_root *root,
3778 - struct btrfs_path *path)
3779 +static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
3780 + struct btrfs_root *root,
3781 + struct walk_control *wc,
3782 + struct btrfs_path *path)
3787 - struct extent_buffer *eb = path->nodes[1];
3788 - struct extent_buffer *leaf;
3789 - struct btrfs_leaf_ref *ref;
3790 - struct refsort *sorted = NULL;
3791 - int nritems = btrfs_header_nritems(eb);
3798 + struct btrfs_key key;
3799 + struct extent_buffer *eb;
3803 - int slot = path->slots[1];
3804 - u32 blocksize = btrfs_level_size(root, 0);
3810 - root_owner = btrfs_header_owner(eb);
3811 - root_gen = btrfs_header_generation(eb);
3812 - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3817 - * step one, sort all the leaf pointers so we don't scribble
3818 - * randomly into the extent allocation tree
3820 - for (i = slot; i < nritems; i++) {
3821 - sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3822 - sorted[refi].slot = i;
3824 + if (path->slots[wc->level] < wc->reada_slot) {
3825 + wc->reada_count = wc->reada_count * 2 / 3;
3826 + wc->reada_count = max(wc->reada_count, 2);
3828 + wc->reada_count = wc->reada_count * 3 / 2;
3829 + wc->reada_count = min_t(int, wc->reada_count,
3830 + BTRFS_NODEPTRS_PER_BLOCK(root));
3834 - * nritems won't be zero, but if we're picking up drop_snapshot
3835 - * after a crash, slot might be > 0, so double check things
3840 + eb = path->nodes[wc->level];
3841 + nritems = btrfs_header_nritems(eb);
3842 + blocksize = btrfs_level_size(root, wc->level - 1);
3844 - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3845 + for (slot = path->slots[wc->level]; slot < nritems; slot++) {
3846 + if (nread >= wc->reada_count)
3850 - * the first loop frees everything the leaves point to
3852 - for (i = 0; i < refi; i++) {
3855 + bytenr = btrfs_node_blockptr(eb, slot);
3856 + generation = btrfs_node_ptr_generation(eb, slot);
3858 - bytenr = sorted[i].bytenr;
3859 + if (slot == path->slots[wc->level])
3863 - * check the reference count on this leaf. If it is > 1
3864 - * we just decrement it below and don't update any
3865 - * of the refs the leaf points to.
3867 - ret = drop_snap_lookup_refcount(trans, root, bytenr,
3868 - blocksize, &refs);
3871 + if (wc->stage == UPDATE_BACKREF &&
3872 + generation <= root->root_key.offset)
3875 - ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3878 - * the leaf only had one reference, which means the
3879 - * only thing pointing to this leaf is the snapshot
3880 - * we're deleting. It isn't possible for the reference
3881 - * count to increase again later
3883 - * The reference cache is checked for the leaf,
3884 - * and if found we'll be able to drop any refs held by
3885 - * the leaf without needing to read it in.
3887 - ref = btrfs_lookup_leaf_ref(root, bytenr);
3888 - if (ref && ref->generation != ptr_gen) {
3889 - btrfs_free_leaf_ref(root, ref);
3893 - ret = cache_drop_leaf_ref(trans, root, ref);
3895 - btrfs_remove_leaf_ref(root, ref);
3896 - btrfs_free_leaf_ref(root, ref);
3899 - * the leaf wasn't in the reference cache, so
3900 - * we have to read it.
3902 - leaf = read_tree_block(root, bytenr, blocksize,
3904 - ret = btrfs_drop_leaf_ref(trans, root, leaf);
3906 - free_extent_buffer(leaf);
3908 - atomic_inc(&root->fs_info->throttle_gen);
3909 - wake_up(&root->fs_info->transaction_throttle);
3914 - * run through the loop again to free the refs on the leaves.
3915 - * This is faster than doing it in the loop above because
3916 - * the leaves are likely to be clustered together. We end up
3917 - * working in nice chunks on the extent allocation tree.
3919 - for (i = 0; i < refi; i++) {
3920 - bytenr = sorted[i].bytenr;
3921 - ret = btrfs_free_extent(trans, root, bytenr,
3922 - blocksize, eb->start,
3923 - root_owner, root_gen, 0, 1);
3924 + /* We don't lock the tree block, it's OK to be racy here */
3925 + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
3928 + BUG_ON(refs == 0);
3930 - atomic_inc(&root->fs_info->throttle_gen);
3931 - wake_up(&root->fs_info->transaction_throttle);
3938 - * update the path to show we've processed the entire level 1
3939 - * node. This will get saved into the root's drop_snapshot_progress
3940 - * field so these drops are not repeated again if this transaction
3943 - path->slots[1] = nritems;
3948 - * helper function for drop_snapshot, this walks down the tree dropping ref
3949 - * counts as it goes.
3951 -static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3952 - struct btrfs_root *root,
3953 - struct btrfs_path *path, int *level)
3959 - struct extent_buffer *next;
3960 - struct extent_buffer *cur;
3961 - struct extent_buffer *parent;
3966 - WARN_ON(*level < 0);
3967 - WARN_ON(*level >= BTRFS_MAX_LEVEL);
3968 - ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
3969 - path->nodes[*level]->len, &refs);
3975 - * walk down to the last node level and free all the leaves
3977 - while (*level >= 0) {
3978 - WARN_ON(*level < 0);
3979 - WARN_ON(*level >= BTRFS_MAX_LEVEL);
3980 - cur = path->nodes[*level];
3982 - if (btrfs_header_level(cur) != *level)
3985 - if (path->slots[*level] >=
3986 - btrfs_header_nritems(cur))
3988 + if (wc->stage == DROP_REFERENCE) {
3992 - /* the new code goes down to level 1 and does all the
3993 - * leaves pointed to that node in bulk. So, this check
3994 - * for level 0 will always be false.
3996 - * But, the disk format allows the drop_snapshot_progress
3997 - * field in the root to leave things in a state where
3998 - * a leaf will need cleaning up here. If someone crashes
3999 - * with the old code and then boots with the new code,
4000 - * we might find a leaf here.
4002 - if (*level == 0) {
4003 - ret = btrfs_drop_leaf_ref(trans, root, cur);
4006 + if (wc->level == 1 &&
4007 + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4009 + if (!wc->update_ref ||
4010 + generation <= root->root_key.offset)
4012 + btrfs_node_key_to_cpu(eb, &key, slot);
4013 + ret = btrfs_comp_cpu_keys(&key,
4014 + &wc->update_progress);
4018 + if (wc->level == 1 &&
4019 + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4024 - * once we get to level one, process the whole node
4025 - * at once, including everything below it.
4027 - if (*level == 1) {
4028 - ret = drop_level_one_refs(trans, root, path);
4031 + ret = readahead_tree_block(root, bytenr, blocksize,
4037 - bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4038 - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4039 - blocksize = btrfs_level_size(root, *level - 1);
4041 - ret = drop_snap_lookup_refcount(trans, root, bytenr,
4042 - blocksize, &refs);
4046 - * if there is more than one reference, we don't need
4047 - * to read that node to drop any references it has. We
4048 - * just drop the ref we hold on that node and move on to the
4049 - * next slot in this level.
4052 - parent = path->nodes[*level];
4053 - root_owner = btrfs_header_owner(parent);
4054 - root_gen = btrfs_header_generation(parent);
4055 - path->slots[*level]++;
4057 - ret = btrfs_free_extent(trans, root, bytenr,
4058 - blocksize, parent->start,
4059 - root_owner, root_gen,
4063 - atomic_inc(&root->fs_info->throttle_gen);
4064 - wake_up(&root->fs_info->transaction_throttle);
4071 - * we need to keep freeing things in the next level down.
4072 - * read the block and loop around to process it
4074 - next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4075 - WARN_ON(*level <= 0);
4076 - if (path->nodes[*level-1])
4077 - free_extent_buffer(path->nodes[*level-1]);
4078 - path->nodes[*level-1] = next;
4079 - *level = btrfs_header_level(next);
4080 - path->slots[*level] = 0;
4082 + last = bytenr + blocksize;
4086 - WARN_ON(*level < 0);
4087 - WARN_ON(*level >= BTRFS_MAX_LEVEL);
4089 - if (path->nodes[*level] == root->node) {
4090 - parent = path->nodes[*level];
4091 - bytenr = path->nodes[*level]->start;
4093 - parent = path->nodes[*level + 1];
4094 - bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4097 - blocksize = btrfs_level_size(root, *level);
4098 - root_owner = btrfs_header_owner(parent);
4099 - root_gen = btrfs_header_generation(parent);
4102 - * cleanup and free the reference on the last node
4105 - ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4106 - parent->start, root_owner, root_gen,
4108 - free_extent_buffer(path->nodes[*level]);
4109 - path->nodes[*level] = NULL;
4116 + wc->reada_slot = slot;
4120 -struct walk_control {
4121 - u64 refs[BTRFS_MAX_LEVEL];
4122 - u64 flags[BTRFS_MAX_LEVEL];
4123 - struct btrfs_key update_progress;
4131 -#define DROP_REFERENCE 1
4132 -#define UPDATE_BACKREF 2
4135 * helper to process tree block while walking down the tree.
4137 - * when wc->stage == DROP_REFERENCE, this function checks
4138 - * reference count of the block. if the block is shared and
4139 - * we need update back refs for the subtree rooted at the
4140 - * block, this function changes wc->stage to UPDATE_BACKREF
4142 * when wc->stage == UPDATE_BACKREF, this function updates
4143 * back refs for pointers in the block.
4145 @@ -4800,11 +5002,10 @@ struct walk_control {
4146 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4147 struct btrfs_root *root,
4148 struct btrfs_path *path,
4149 - struct walk_control *wc)
4150 + struct walk_control *wc, int lookup_info)
4152 int level = wc->level;
4153 struct extent_buffer *eb = path->nodes[level];
4154 - struct btrfs_key key;
4155 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4158 @@ -4816,8 +5017,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4159 * when reference count of tree block is 1, it won't increase
4160 * again. once full backref flag is set, we never clear it.
4162 - if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4163 - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
4164 + if (lookup_info &&
4165 + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4166 + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4167 BUG_ON(!path->locks[level]);
4168 ret = btrfs_lookup_extent_info(trans, root,
4170 @@ -4827,21 +5029,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4171 BUG_ON(wc->refs[level] == 0);
4174 - if (wc->stage == DROP_REFERENCE &&
4175 - wc->update_ref && wc->refs[level] > 1) {
4176 - BUG_ON(eb == root->node);
4177 - BUG_ON(path->slots[level] > 0);
4179 - btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4181 - btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4182 - if (btrfs_header_owner(eb) == root->root_key.objectid &&
4183 - btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4184 - wc->stage = UPDATE_BACKREF;
4185 - wc->shared_level = level;
4189 if (wc->stage == DROP_REFERENCE) {
4190 if (wc->refs[level] > 1)
4192 @@ -4878,6 +5065,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4196 + * helper to process tree block pointer.
4198 + * when wc->stage == DROP_REFERENCE, this function checks
4199 + * reference count of the block pointed to. if the block
4200 + * is shared and we need update back refs for the subtree
4201 + * rooted at the block, this function changes wc->stage to
4202 + * UPDATE_BACKREF. if the block is shared and there is no
4203 + * need to update back, this function drops the reference
4206 + * NOTE: return value 1 means we should stop walking down.
4208 +static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4209 + struct btrfs_root *root,
4210 + struct btrfs_path *path,
4211 + struct walk_control *wc, int *lookup_info)
4217 + struct btrfs_key key;
4218 + struct extent_buffer *next;
4219 + int level = wc->level;
4223 + generation = btrfs_node_ptr_generation(path->nodes[level],
4224 + path->slots[level]);
4226 + * if the lower level block was created before the snapshot
4227 + * was created, we know there is no need to update back refs
4230 + if (wc->stage == UPDATE_BACKREF &&
4231 + generation <= root->root_key.offset) {
4236 + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4237 + blocksize = btrfs_level_size(root, level - 1);
4239 + next = btrfs_find_tree_block(root, bytenr, blocksize);
4241 + next = btrfs_find_create_tree_block(root, bytenr, blocksize);
4244 + btrfs_tree_lock(next);
4245 + btrfs_set_lock_blocking(next);
4247 + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4248 + &wc->refs[level - 1],
4249 + &wc->flags[level - 1]);
4251 + BUG_ON(wc->refs[level - 1] == 0);
4254 + if (wc->stage == DROP_REFERENCE) {
4255 + if (wc->refs[level - 1] > 1) {
4257 + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4260 + if (!wc->update_ref ||
4261 + generation <= root->root_key.offset)
4264 + btrfs_node_key_to_cpu(path->nodes[level], &key,
4265 + path->slots[level]);
4266 + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
4270 + wc->stage = UPDATE_BACKREF;
4271 + wc->shared_level = level - 1;
4275 + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4279 + if (!btrfs_buffer_uptodate(next, generation)) {
4280 + btrfs_tree_unlock(next);
4281 + free_extent_buffer(next);
4287 + if (reada && level == 1)
4288 + reada_walk_down(trans, root, wc, path);
4289 + next = read_tree_block(root, bytenr, blocksize, generation);
4290 + btrfs_tree_lock(next);
4291 + btrfs_set_lock_blocking(next);
4295 + BUG_ON(level != btrfs_header_level(next));
4296 + path->nodes[level] = next;
4297 + path->slots[level] = 0;
4298 + path->locks[level] = 1;
4299 + wc->level = level;
4300 + if (wc->level == 1)
4301 + wc->reada_slot = 0;
4304 + wc->refs[level - 1] = 0;
4305 + wc->flags[level - 1] = 0;
4306 + if (wc->stage == DROP_REFERENCE) {
4307 + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
4308 + parent = path->nodes[level]->start;
4310 + BUG_ON(root->root_key.objectid !=
4311 + btrfs_header_owner(path->nodes[level]));
4315 + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4316 + root->root_key.objectid, level - 1, 0);
4319 + btrfs_tree_unlock(next);
4320 + free_extent_buffer(next);
4326 * helper to process tree block while walking up the tree.
4328 * when wc->stage == DROP_REFERENCE, this function drops
4329 @@ -4904,7 +5221,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4330 if (level < wc->shared_level)
4333 - BUG_ON(wc->refs[level] <= 1);
4334 ret = find_next_key(path, level + 1, &wc->update_progress);
4337 @@ -4935,8 +5251,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4338 path->locks[level] = 0;
4342 - BUG_ON(level != 0);
4346 @@ -4989,39 +5303,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4347 struct btrfs_path *path,
4348 struct walk_control *wc)
4350 - struct extent_buffer *next;
4351 - struct extent_buffer *cur;
4355 int level = wc->level;
4356 + int lookup_info = 1;
4359 while (level >= 0) {
4360 - cur = path->nodes[level];
4361 - BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
4362 + if (path->slots[level] >=
4363 + btrfs_header_nritems(path->nodes[level]))
4366 - ret = walk_down_proc(trans, root, path, wc);
4367 + ret = walk_down_proc(trans, root, path, wc, lookup_info);
4374 - bytenr = btrfs_node_blockptr(cur, path->slots[level]);
4375 - blocksize = btrfs_level_size(root, level - 1);
4376 - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
4378 - next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4379 - btrfs_tree_lock(next);
4380 - btrfs_set_lock_blocking(next);
4383 - BUG_ON(level != btrfs_header_level(next));
4384 - path->nodes[level] = next;
4385 - path->slots[level] = 0;
4386 - path->locks[level] = 1;
4387 - wc->level = level;
4388 + ret = do_walk_down(trans, root, path, wc, &lookup_info);
4390 + path->slots[level]++;
4393 + level = wc->level;
4397 @@ -5111,9 +5414,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4401 - btrfs_node_key_to_cpu(path->nodes[level], &key,
4402 - path->slots[level]);
4403 - WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
4407 * unlock our path, this is safe because only this
4408 @@ -5148,6 +5449,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4409 wc->stage = DROP_REFERENCE;
4410 wc->update_ref = update_ref;
4412 + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
4415 ret = walk_down_tree(trans, root, path, wc);
4416 @@ -5200,9 +5502,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4417 ret = btrfs_del_root(trans, tree_root, &root->root_key);
4420 - free_extent_buffer(root->node);
4421 - free_extent_buffer(root->commit_root);
4423 + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
4424 + ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
4428 + ret = btrfs_del_orphan_item(trans, tree_root,
4429 + root->root_key.objectid);
4434 + if (root->in_radix) {
4435 + btrfs_free_fs_root(tree_root->fs_info, root);
4437 + free_extent_buffer(root->node);
4438 + free_extent_buffer(root->commit_root);
4442 btrfs_end_transaction(trans, tree_root);
4444 @@ -5254,6 +5571,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4445 wc->stage = DROP_REFERENCE;
4448 + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
4451 wret = walk_down_tree(trans, root, path, wc);
4452 @@ -5396,9 +5714,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
4453 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4456 - spin_lock(&em_tree->lock);
4457 + write_lock(&em_tree->lock);
4458 ret = add_extent_mapping(em_tree, em);
4459 - spin_unlock(&em_tree->lock);
4460 + write_unlock(&em_tree->lock);
4461 if (ret != -EEXIST) {
4462 free_extent_map(em);
4464 @@ -6841,287 +7159,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
4469 -static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4470 - struct btrfs_root *root,
4471 - u64 objectid, u64 size)
4473 - struct btrfs_path *path;
4474 - struct btrfs_inode_item *item;
4475 - struct extent_buffer *leaf;
4478 - path = btrfs_alloc_path();
4482 - path->leave_spinning = 1;
4483 - ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4487 - leaf = path->nodes[0];
4488 - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4489 - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4490 - btrfs_set_inode_generation(leaf, item, 1);
4491 - btrfs_set_inode_size(leaf, item, size);
4492 - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4493 - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
4494 - btrfs_mark_buffer_dirty(leaf);
4495 - btrfs_release_path(root, path);
4497 - btrfs_free_path(path);
4501 -static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4502 - struct btrfs_block_group_cache *group)
4504 + * checks to see if its even possible to relocate this block group.
4506 + * @return - -1 if it's not a good idea to relocate this block group, 0 if its
4507 + * ok to go ahead and try.
4509 +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
4511 - struct inode *inode = NULL;
4512 - struct btrfs_trans_handle *trans;
4513 - struct btrfs_root *root;
4514 - struct btrfs_key root_key;
4515 - u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
4517 + struct btrfs_block_group_cache *block_group;
4518 + struct btrfs_space_info *space_info;
4519 + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4520 + struct btrfs_device *device;
4524 - root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
4525 - root_key.type = BTRFS_ROOT_ITEM_KEY;
4526 - root_key.offset = (u64)-1;
4527 - root = btrfs_read_fs_root_no_name(fs_info, &root_key);
4529 - return ERR_CAST(root);
4530 + block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
4532 - trans = btrfs_start_transaction(root, 1);
4534 + /* odd, couldn't find the block group, leave it alone */
4538 - err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
4540 + /* no bytes used, we're good */
4541 + if (!btrfs_block_group_used(&block_group->item))
4544 - err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
4547 - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
4548 - group->key.offset, 0, group->key.offset,
4552 - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
4553 - if (inode->i_state & I_NEW) {
4554 - BTRFS_I(inode)->root = root;
4555 - BTRFS_I(inode)->location.objectid = objectid;
4556 - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
4557 - BTRFS_I(inode)->location.offset = 0;
4558 - btrfs_read_locked_inode(inode);
4559 - unlock_new_inode(inode);
4560 - BUG_ON(is_bad_inode(inode));
4564 - BTRFS_I(inode)->index_cnt = group->key.objectid;
4566 - err = btrfs_orphan_add(trans, inode);
4568 - btrfs_end_transaction(trans, root);
4572 - inode = ERR_PTR(err);
4577 -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4580 - struct btrfs_ordered_sum *sums;
4581 - struct btrfs_sector_sum *sector_sum;
4582 - struct btrfs_ordered_extent *ordered;
4583 - struct btrfs_root *root = BTRFS_I(inode)->root;
4584 - struct list_head list;
4589 - INIT_LIST_HEAD(&list);
4591 - ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4592 - BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4594 - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4595 - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4596 - disk_bytenr + len - 1, &list);
4598 - while (!list_empty(&list)) {
4599 - sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4600 - list_del_init(&sums->list);
4602 - sector_sum = sums->sums;
4603 - sums->bytenr = ordered->start;
4604 + space_info = block_group->space_info;
4605 + spin_lock(&space_info->lock);
4608 - while (offset < sums->len) {
4609 - sector_sum->bytenr += ordered->start - disk_bytenr;
4611 - offset += root->sectorsize;
4613 + full = space_info->full;
4615 - btrfs_add_ordered_sum(inode, ordered, sums);
4617 + * if this is the last block group we have in this space, we can't
4618 + * relocate it unless we're able to allocate a new chunk below.
4620 + * Otherwise, we need to make sure we have room in the space to handle
4621 + * all of the extents from this block group. If we can, we're good
4623 + if ((space_info->total_bytes != block_group->key.offset) &&
4624 + (space_info->bytes_used + space_info->bytes_reserved +
4625 + space_info->bytes_pinned + space_info->bytes_readonly +
4626 + btrfs_block_group_used(&block_group->item) <
4627 + space_info->total_bytes)) {
4628 + spin_unlock(&space_info->lock);
4631 - btrfs_put_ordered_extent(ordered);
4635 -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
4637 - struct btrfs_trans_handle *trans;
4638 - struct btrfs_path *path;
4639 - struct btrfs_fs_info *info = root->fs_info;
4640 - struct extent_buffer *leaf;
4641 - struct inode *reloc_inode;
4642 - struct btrfs_block_group_cache *block_group;
4643 - struct btrfs_key key;
4652 - root = root->fs_info->extent_root;
4654 - block_group = btrfs_lookup_block_group(info, group_start);
4655 - BUG_ON(!block_group);
4657 - printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
4658 - (unsigned long long)block_group->key.objectid,
4659 - (unsigned long long)block_group->flags);
4661 - path = btrfs_alloc_path();
4664 - reloc_inode = create_reloc_inode(info, block_group);
4665 - BUG_ON(IS_ERR(reloc_inode));
4667 - __alloc_chunk_for_shrink(root, block_group, 1);
4668 - set_block_group_readonly(block_group);
4670 - btrfs_start_delalloc_inodes(info->tree_root);
4671 - btrfs_wait_ordered_extents(info->tree_root, 0);
4676 - key.objectid = block_group->key.objectid;
4679 - cur_byte = key.objectid;
4681 - trans = btrfs_start_transaction(info->tree_root, 1);
4682 - btrfs_commit_transaction(trans, info->tree_root);
4683 + spin_unlock(&space_info->lock);
4685 - mutex_lock(&root->fs_info->cleaner_mutex);
4686 - btrfs_clean_old_snapshots(info->tree_root);
4687 - btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
4688 - mutex_unlock(&root->fs_info->cleaner_mutex);
4690 + * ok we don't have enough space, but maybe we have free space on our
4691 + * devices to allocate new chunks for relocation, so loop through our
4692 + * alloc devices and guess if we have enough space. However, if we
4693 + * were marked as full, then we know there aren't enough chunks, and we
4694 + * can just return.
4700 - trans = btrfs_start_transaction(info->tree_root, 1);
4701 - btrfs_commit_transaction(trans, info->tree_root);
4702 + mutex_lock(&root->fs_info->chunk_mutex);
4703 + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4704 + u64 min_free = btrfs_block_group_used(&block_group->item);
4705 + u64 dev_offset, max_avail;
4708 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4712 - leaf = path->nodes[0];
4713 - nritems = btrfs_header_nritems(leaf);
4714 - if (path->slots[0] >= nritems) {
4715 - ret = btrfs_next_leaf(root, path);
4721 + * check to make sure we can actually find a chunk with enough
4722 + * space to fit our block group in.
4724 + if (device->total_bytes > device->bytes_used + min_free) {
4725 + ret = find_free_dev_extent(NULL, device, min_free,
4726 + &dev_offset, &max_avail);
4730 - leaf = path->nodes[0];
4731 - nritems = btrfs_header_nritems(leaf);
4734 - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4736 - if (key.objectid >= block_group->key.objectid +
4737 - block_group->key.offset)
4740 - if (progress && need_resched()) {
4741 - btrfs_release_path(root, path);
4748 - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
4749 - key.objectid + key.offset <= cur_byte) {
4756 - cur_byte = key.objectid + key.offset;
4757 - btrfs_release_path(root, path);
4759 - __alloc_chunk_for_shrink(root, block_group, 0);
4760 - ret = relocate_one_extent(root, path, &key, block_group,
4761 - reloc_inode, pass);
4766 - key.objectid = cur_byte;
4771 - btrfs_release_path(root, path);
4774 - btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
4775 - invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
4778 - if (total_found > 0) {
4779 - printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
4780 - (unsigned long long)total_found, pass);
4782 - if (total_found == skipped && pass > 2) {
4783 - iput(reloc_inode);
4784 - reloc_inode = create_reloc_inode(info, block_group);
4790 - /* delete reloc_inode */
4791 - iput(reloc_inode);
4793 - /* unpin extents in this range */
4794 - trans = btrfs_start_transaction(info->tree_root, 1);
4795 - btrfs_commit_transaction(trans, info->tree_root);
4797 - spin_lock(&block_group->lock);
4798 - WARN_ON(block_group->pinned > 0);
4799 - WARN_ON(block_group->reserved > 0);
4800 - WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
4801 - spin_unlock(&block_group->lock);
4802 - btrfs_put_block_group(block_group);
4804 + mutex_unlock(&root->fs_info->chunk_mutex);
4806 - btrfs_free_path(path);
4807 + btrfs_put_block_group(block_group);
4812 static int find_first_block_group(struct btrfs_root *root,
4813 struct btrfs_path *path, struct btrfs_key *key)
4814 @@ -7164,8 +7281,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
4816 struct btrfs_block_group_cache *block_group;
4817 struct btrfs_space_info *space_info;
4818 + struct btrfs_caching_control *caching_ctl;
4821 + down_write(&info->extent_commit_sem);
4822 + while (!list_empty(&info->caching_block_groups)) {
4823 + caching_ctl = list_entry(info->caching_block_groups.next,
4824 + struct btrfs_caching_control, list);
4825 + list_del(&caching_ctl->list);
4826 + put_caching_control(caching_ctl);
4828 + up_write(&info->extent_commit_sem);
4830 spin_lock(&info->block_group_cache_lock);
4831 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
4832 block_group = rb_entry(n, struct btrfs_block_group_cache,
4833 @@ -7179,8 +7306,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
4834 up_write(&block_group->space_info->groups_sem);
4836 if (block_group->cached == BTRFS_CACHE_STARTED)
4837 - wait_event(block_group->caching_q,
4838 - block_group_cache_done(block_group));
4839 + wait_block_group_cache_done(block_group);
4841 btrfs_remove_free_space_cache(block_group);
4843 @@ -7250,7 +7376,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4844 spin_lock_init(&cache->lock);
4845 spin_lock_init(&cache->tree_lock);
4846 cache->fs_info = info;
4847 - init_waitqueue_head(&cache->caching_q);
4848 INIT_LIST_HEAD(&cache->list);
4849 INIT_LIST_HEAD(&cache->cluster_list);
4851 @@ -7272,8 +7397,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4852 cache->flags = btrfs_block_group_flags(&cache->item);
4853 cache->sectorsize = root->sectorsize;
4855 - remove_sb_from_cache(root, cache);
4858 * check for two cases, either we are full, and therefore
4859 * don't need to bother with the caching work since we won't
4860 @@ -7282,13 +7405,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4861 * time, particularly in the full case.
4863 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
4864 + exclude_super_stripes(root, cache);
4865 + cache->last_byte_to_unpin = (u64)-1;
4866 cache->cached = BTRFS_CACHE_FINISHED;
4867 + free_excluded_extents(root, cache);
4868 } else if (btrfs_block_group_used(&cache->item) == 0) {
4869 + exclude_super_stripes(root, cache);
4870 + cache->last_byte_to_unpin = (u64)-1;
4871 cache->cached = BTRFS_CACHE_FINISHED;
4872 add_new_free_space(cache, root->fs_info,
4874 found_key.objectid +
4876 + free_excluded_extents(root, cache);
4879 ret = update_space_info(info, cache->flags, found_key.offset,
4880 @@ -7296,6 +7425,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4883 cache->space_info = space_info;
4884 + spin_lock(&cache->space_info->lock);
4885 + cache->space_info->bytes_super += cache->bytes_super;
4886 + spin_unlock(&cache->space_info->lock);
4888 down_write(&space_info->groups_sem);
4889 list_add_tail(&cache->list, &space_info->block_groups);
4890 up_write(&space_info->groups_sem);
4891 @@ -7345,7 +7478,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
4892 atomic_set(&cache->count, 1);
4893 spin_lock_init(&cache->lock);
4894 spin_lock_init(&cache->tree_lock);
4895 - init_waitqueue_head(&cache->caching_q);
4896 INIT_LIST_HEAD(&cache->list);
4897 INIT_LIST_HEAD(&cache->cluster_list);
4899 @@ -7354,15 +7486,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
4900 cache->flags = type;
4901 btrfs_set_block_group_flags(&cache->item, type);
4903 + cache->last_byte_to_unpin = (u64)-1;
4904 cache->cached = BTRFS_CACHE_FINISHED;
4905 - remove_sb_from_cache(root, cache);
4906 + exclude_super_stripes(root, cache);
4908 add_new_free_space(cache, root->fs_info, chunk_offset,
4909 chunk_offset + size);
4911 + free_excluded_extents(root, cache);
4913 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
4914 &cache->space_info);
4917 + spin_lock(&cache->space_info->lock);
4918 + cache->space_info->bytes_super += cache->bytes_super;
4919 + spin_unlock(&cache->space_info->lock);
4921 down_write(&cache->space_info->groups_sem);
4922 list_add_tail(&cache->list, &cache->space_info->block_groups);
4923 up_write(&cache->space_info->groups_sem);
4924 @@ -7428,8 +7568,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
4925 up_write(&block_group->space_info->groups_sem);
4927 if (block_group->cached == BTRFS_CACHE_STARTED)
4928 - wait_event(block_group->caching_q,
4929 - block_group_cache_done(block_group));
4930 + wait_block_group_cache_done(block_group);
4932 btrfs_remove_free_space_cache(block_group);
4934 diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
4935 index 6826018..96577e8 100644
4936 --- a/fs/btrfs/extent_io.c
4937 +++ b/fs/btrfs/extent_io.c
4938 @@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
4942 +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
4943 + struct extent_state *other)
4945 + if (tree->ops && tree->ops->merge_extent_hook)
4946 + tree->ops->merge_extent_hook(tree->mapping->host, new,
4951 * utility function to look for merge candidates inside a given range.
4952 * Any extents with matching state are merged together into a single
4953 @@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
4954 other = rb_entry(other_node, struct extent_state, rb_node);
4955 if (other->end == state->start - 1 &&
4956 other->state == state->state) {
4957 + merge_cb(tree, state, other);
4958 state->start = other->start;
4960 rb_erase(&other->rb_node, &tree->state);
4961 @@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
4962 other = rb_entry(other_node, struct extent_state, rb_node);
4963 if (other->start == state->end + 1 &&
4964 other->state == state->state) {
4965 + merge_cb(tree, state, other);
4966 other->start = state->start;
4968 rb_erase(&state->rb_node, &tree->state);
4969 free_extent_state(state);
4977 -static void set_state_cb(struct extent_io_tree *tree,
4978 +static int set_state_cb(struct extent_io_tree *tree,
4979 struct extent_state *state,
4982 if (tree->ops && tree->ops->set_bit_hook) {
4983 - tree->ops->set_bit_hook(tree->mapping->host, state->start,
4984 - state->end, state->state, bits);
4985 + return tree->ops->set_bit_hook(tree->mapping->host,
4986 + state->start, state->end,
4987 + state->state, bits);
4993 static void clear_state_cb(struct extent_io_tree *tree,
4994 struct extent_state *state,
4997 - if (tree->ops && tree->ops->clear_bit_hook) {
4998 - tree->ops->clear_bit_hook(tree->mapping->host, state->start,
4999 - state->end, state->state, bits);
5001 + if (tree->ops && tree->ops->clear_bit_hook)
5002 + tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
5006 @@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
5009 struct rb_node *node;
5013 printk(KERN_ERR "btrfs end < start %llu %llu\n",
5014 @@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
5015 (unsigned long long)start);
5018 + state->start = start;
5020 + ret = set_state_cb(tree, state, bits);
5024 if (bits & EXTENT_DIRTY)
5025 tree->dirty_bytes += end - start + 1;
5026 - set_state_cb(tree, state, bits);
5027 state->state |= bits;
5028 - state->start = start;
5030 node = tree_insert(&tree->state, end, &state->rb_node);
5032 struct extent_state *found;
5033 @@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
5037 +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
5040 + if (tree->ops && tree->ops->split_extent_hook)
5041 + return tree->ops->split_extent_hook(tree->mapping->host,
5047 * split a given extent state struct in two, inserting the preallocated
5048 * struct 'prealloc' as the newly created second half. 'split' indicates an
5049 @@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
5050 struct extent_state *prealloc, u64 split)
5052 struct rb_node *node;
5054 + split_cb(tree, orig, split);
5056 prealloc->start = orig->start;
5057 prealloc->end = split - 1;
5058 prealloc->state = orig->state;
5059 @@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
5060 struct extent_state *state, int bits, int wake,
5063 - int ret = state->state & bits;
5064 + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
5065 + int ret = state->state & bits_to_clear;
5067 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
5068 u64 range = state->end - state->start + 1;
5069 @@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
5070 tree->dirty_bytes -= range;
5072 clear_state_cb(tree, state, bits);
5073 - state->state &= ~bits;
5074 + state->state &= ~bits_to_clear;
5076 wake_up(&state->wq);
5077 if (delete || state->state == 0) {
5078 @@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
5079 * bits were already set, or zero if none of the bits were already set.
5081 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
5082 - int bits, int wake, int delete, gfp_t mask)
5083 + int bits, int wake, int delete,
5084 + struct extent_state **cached_state,
5087 struct extent_state *state;
5088 + struct extent_state *cached;
5089 struct extent_state *prealloc = NULL;
5090 + struct rb_node *next_node;
5091 struct rb_node *node;
5094 @@ -488,6 +522,17 @@ again:
5097 spin_lock(&tree->lock);
5098 + if (cached_state) {
5099 + cached = *cached_state;
5100 + *cached_state = NULL;
5101 + cached_state = NULL;
5102 + if (cached && cached->tree && cached->start == start) {
5103 + atomic_dec(&cached->refs);
5107 + free_extent_state(cached);
5110 * this search will find the extents that end after
5112 @@ -496,6 +541,7 @@ again:
5115 state = rb_entry(node, struct extent_state, rb_node);
5117 if (state->start > end)
5119 WARN_ON(state->end < start);
5120 @@ -526,13 +572,11 @@ again:
5123 if (state->end <= end) {
5124 - set |= clear_state_bit(tree, state, bits,
5126 + set |= clear_state_bit(tree, state, bits, wake,
5128 if (last_end == (u64)-1)
5130 start = last_end + 1;
5132 - start = state->start;
5136 @@ -547,19 +591,30 @@ again:
5137 prealloc = alloc_extent_state(GFP_ATOMIC);
5138 err = split_state(tree, state, prealloc, end + 1);
5139 BUG_ON(err == -EEXIST);
5142 wake_up(&state->wq);
5143 - set |= clear_state_bit(tree, prealloc, bits,
5146 + set |= clear_state_bit(tree, prealloc, bits, wake, delete);
5152 + if (state->end < end && prealloc && !need_resched())
5153 + next_node = rb_next(&state->rb_node);
5157 set |= clear_state_bit(tree, state, bits, wake, delete);
5158 if (last_end == (u64)-1)
5160 start = last_end + 1;
5161 + if (start <= end && next_node) {
5162 + state = rb_entry(next_node, struct extent_state,
5164 + if (state->start == start)
5170 @@ -641,40 +696,59 @@ out:
5174 -static void set_state_bits(struct extent_io_tree *tree,
5175 +static int set_state_bits(struct extent_io_tree *tree,
5176 struct extent_state *state,
5181 + ret = set_state_cb(tree, state, bits);
5185 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
5186 u64 range = state->end - state->start + 1;
5187 tree->dirty_bytes += range;
5189 - set_state_cb(tree, state, bits);
5190 state->state |= bits;
5195 +static void cache_state(struct extent_state *state,
5196 + struct extent_state **cached_ptr)
5198 + if (cached_ptr && !(*cached_ptr)) {
5199 + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
5200 + *cached_ptr = state;
5201 + atomic_inc(&state->refs);
5207 - * set some bits on a range in the tree. This may require allocations
5208 - * or sleeping, so the gfp mask is used to indicate what is allowed.
5209 + * set some bits on a range in the tree. This may require allocations or
5210 + * sleeping, so the gfp mask is used to indicate what is allowed.
5212 - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
5213 - * range already has the desired bits set. The start of the existing
5214 - * range is returned in failed_start in this case.
5215 + * If any of the exclusive bits are set, this will fail with -EEXIST if some
5216 + * part of the range already has the desired bits set. The start of the
5217 + * existing range is returned in failed_start in this case.
5219 - * [start, end] is inclusive
5220 - * This takes the tree lock.
5221 + * [start, end] is inclusive This takes the tree lock.
5224 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
5225 - int bits, int exclusive, u64 *failed_start,
5226 + int bits, int exclusive_bits, u64 *failed_start,
5227 + struct extent_state **cached_state,
5230 struct extent_state *state;
5231 struct extent_state *prealloc = NULL;
5232 struct rb_node *node;
5239 if (!prealloc && (mask & __GFP_WAIT)) {
5240 prealloc = alloc_extent_state(mask);
5241 @@ -683,6 +757,13 @@ again:
5244 spin_lock(&tree->lock);
5245 + if (cached_state && *cached_state) {
5246 + state = *cached_state;
5247 + if (state->start == start && state->tree) {
5248 + node = &state->rb_node;
5253 * this search will find all the extents that end after
5255 @@ -694,8 +775,8 @@ again:
5256 BUG_ON(err == -EEXIST);
5260 state = rb_entry(node, struct extent_state, rb_node);
5262 last_start = state->start;
5263 last_end = state->end;
5265 @@ -706,17 +787,32 @@ again:
5266 * Just lock what we found and keep going
5268 if (state->start == start && state->end <= end) {
5269 - set = state->state & bits;
5270 - if (set && exclusive) {
5271 + struct rb_node *next_node;
5272 + if (state->state & exclusive_bits) {
5273 *failed_start = state->start;
5277 - set_state_bits(tree, state, bits);
5279 + err = set_state_bits(tree, state, bits);
5283 + cache_state(state, cached_state);
5284 merge_state(tree, state);
5285 if (last_end == (u64)-1)
5288 start = last_end + 1;
5289 + if (start < end && prealloc && !need_resched()) {
5290 + next_node = rb_next(node);
5292 + state = rb_entry(next_node, struct extent_state,
5294 + if (state->start == start)
5301 @@ -737,8 +833,7 @@ again:
5302 * desired bit on it.
5304 if (state->start < start) {
5305 - set = state->state & bits;
5306 - if (exclusive && set) {
5307 + if (state->state & exclusive_bits) {
5308 *failed_start = start;
5311 @@ -749,13 +844,14 @@ again:
5314 if (state->end <= end) {
5315 - set_state_bits(tree, state, bits);
5316 + err = set_state_bits(tree, state, bits);
5319 + cache_state(state, cached_state);
5320 merge_state(tree, state);
5321 if (last_end == (u64)-1)
5323 start = last_end + 1;
5325 - start = state->start;
5329 @@ -774,10 +870,13 @@ again:
5330 this_end = last_start - 1;
5331 err = insert_state(tree, prealloc, start, this_end,
5334 BUG_ON(err == -EEXIST);
5340 + cache_state(prealloc, cached_state);
5342 start = this_end + 1;
5345 @@ -788,8 +887,7 @@ again:
5348 if (state->start <= end && state->end > end) {
5349 - set = state->state & bits;
5350 - if (exclusive && set) {
5351 + if (state->state & exclusive_bits) {
5352 *failed_start = start;
5355 @@ -797,7 +895,12 @@ again:
5356 err = split_state(tree, state, prealloc, end + 1);
5357 BUG_ON(err == -EEXIST);
5359 - set_state_bits(tree, prealloc, bits);
5360 + err = set_state_bits(tree, prealloc, bits);
5365 + cache_state(prealloc, cached_state);
5366 merge_state(tree, prealloc);
5369 @@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
5372 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
5376 -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
5379 - return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
5383 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
5384 int bits, gfp_t mask)
5386 return set_extent_bit(tree, start, end, bits, 0, NULL,
5391 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
5392 int bits, gfp_t mask)
5394 - return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
5395 + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
5398 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
5401 return set_extent_bit(tree, start, end,
5402 - EXTENT_DELALLOC | EXTENT_DIRTY,
5404 + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
5405 + 0, NULL, NULL, mask);
5408 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
5411 return clear_extent_bit(tree, start, end,
5412 - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
5415 -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
5418 - return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
5419 + EXTENT_DIRTY | EXTENT_DELALLOC |
5420 + EXTENT_DO_ACCOUNTING, 0, 0,
5424 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
5427 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
5432 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
5435 - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
5436 + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
5440 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
5443 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
5448 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
5449 u64 end, gfp_t mask)
5451 - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
5454 -static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
5457 - return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
5461 -static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
5462 - u64 end, gfp_t mask)
5464 - return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
5465 + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
5469 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
5470 @@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
5471 * either insert or lock state struct between start and end use mask to tell
5472 * us if waiting is desired.
5474 -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
5475 +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
5476 + int bits, struct extent_state **cached_state, gfp_t mask)
5481 - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
5482 - &failed_start, mask);
5483 + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
5484 + EXTENT_LOCKED, &failed_start,
5485 + cached_state, mask);
5486 if (err == -EEXIST && (mask & __GFP_WAIT)) {
5487 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
5488 start = failed_start;
5489 @@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
5493 +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
5495 + return lock_extent_bits(tree, start, end, 0, NULL, mask);
5498 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
5504 - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
5505 - &failed_start, mask);
5506 + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
5507 + &failed_start, NULL, mask);
5508 if (err == -EEXIST) {
5509 if (failed_start > start)
5510 clear_extent_bit(tree, start, failed_start - 1,
5511 - EXTENT_LOCKED, 1, 0, mask);
5512 + EXTENT_LOCKED, 1, 0, NULL, mask);
5518 +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
5519 + struct extent_state **cached, gfp_t mask)
5521 + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
5525 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
5528 - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
5529 + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
5534 @@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
5535 page_cache_release(page);
5538 - set_extent_dirty(tree, start, end, GFP_NOFS);
5542 @@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
5543 page_cache_release(page);
5546 - set_extent_writeback(tree, start, end, GFP_NOFS);
5550 @@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
5554 + struct extent_state *cached_state = NULL;
5558 @@ -1269,6 +1365,7 @@ again:
5559 /* some of the pages are gone, lets avoid looping by
5560 * shortening the size of the delalloc range we're searching
5562 + free_extent_state(cached_state);
5564 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
5565 max_bytes = PAGE_CACHE_SIZE - offset;
5566 @@ -1282,18 +1379,21 @@ again:
5569 /* step three, lock the state bits for the whole range */
5570 - lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
5571 + lock_extent_bits(tree, delalloc_start, delalloc_end,
5572 + 0, &cached_state, GFP_NOFS);
5574 /* then test to make sure it is all still delalloc */
5575 ret = test_range_bit(tree, delalloc_start, delalloc_end,
5576 - EXTENT_DELALLOC, 1);
5577 + EXTENT_DELALLOC, 1, cached_state);
5579 - unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
5580 + unlock_extent_cached(tree, delalloc_start, delalloc_end,
5581 + &cached_state, GFP_NOFS);
5582 __unlock_for_delalloc(inode, locked_page,
5583 delalloc_start, delalloc_end);
5587 + free_extent_state(cached_state);
5588 *start = delalloc_start;
5589 *end = delalloc_end;
5591 @@ -1303,11 +1403,7 @@ out_failed:
5592 int extent_clear_unlock_delalloc(struct inode *inode,
5593 struct extent_io_tree *tree,
5594 u64 start, u64 end, struct page *locked_page,
5597 - int clear_delalloc, int clear_dirty,
5598 - int set_writeback,
5599 - int end_writeback)
5603 struct page *pages[16];
5604 @@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
5609 + if (op & EXTENT_CLEAR_UNLOCK)
5610 clear_bits |= EXTENT_LOCKED;
5612 + if (op & EXTENT_CLEAR_DIRTY)
5613 clear_bits |= EXTENT_DIRTY;
5615 - if (clear_delalloc)
5616 + if (op & EXTENT_CLEAR_DELALLOC)
5617 clear_bits |= EXTENT_DELALLOC;
5619 - clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
5620 - if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
5621 + if (op & EXTENT_CLEAR_ACCOUNTING)
5622 + clear_bits |= EXTENT_DO_ACCOUNTING;
5624 + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
5625 + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
5626 + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
5627 + EXTENT_SET_PRIVATE2)))
5630 while (nr_pages > 0) {
5631 @@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
5632 min_t(unsigned long,
5633 nr_pages, ARRAY_SIZE(pages)), pages);
5634 for (i = 0; i < ret; i++) {
5636 + if (op & EXTENT_SET_PRIVATE2)
5637 + SetPagePrivate2(pages[i]);
5639 if (pages[i] == locked_page) {
5640 page_cache_release(pages[i]);
5644 + if (op & EXTENT_CLEAR_DIRTY)
5645 clear_page_dirty_for_io(pages[i]);
5646 - if (set_writeback)
5647 + if (op & EXTENT_SET_WRITEBACK)
5648 set_page_writeback(pages[i]);
5649 - if (end_writeback)
5650 + if (op & EXTENT_END_WRITEBACK)
5651 end_page_writeback(pages[i]);
5653 + if (op & EXTENT_CLEAR_UNLOCK_PAGE)
5654 unlock_page(pages[i]);
5655 page_cache_release(pages[i]);
5657 @@ -1476,14 +1581,17 @@ out:
5658 * range is found set.
5660 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
5661 - int bits, int filled)
5662 + int bits, int filled, struct extent_state *cached)
5664 struct extent_state *state = NULL;
5665 struct rb_node *node;
5668 spin_lock(&tree->lock);
5669 - node = tree_search(tree, start);
5670 + if (cached && cached->tree && cached->start == start)
5671 + node = &cached->rb_node;
5673 + node = tree_search(tree, start);
5674 while (node && start <= end) {
5675 state = rb_entry(node, struct extent_state, rb_node);
5677 @@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
5682 + if (state->end == (u64)-1)
5685 start = state->end + 1;
5688 @@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
5690 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
5691 u64 end = start + PAGE_CACHE_SIZE - 1;
5692 - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
5693 + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
5694 SetPageUptodate(page);
5697 @@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree,
5699 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
5700 u64 end = start + PAGE_CACHE_SIZE - 1;
5701 - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
5702 + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
5706 @@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree,
5707 static int check_page_writeback(struct extent_io_tree *tree,
5710 - u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
5711 - u64 end = start + PAGE_CACHE_SIZE - 1;
5712 - if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
5713 - end_page_writeback(page);
5714 + end_page_writeback(page);
5718 @@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
5722 - clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
5723 + clear_extent_uptodate(tree, start, end, GFP_NOFS);
5724 ClearPageUptodate(page);
5728 - clear_extent_writeback(tree, start, end, GFP_ATOMIC);
5731 end_page_writeback(page);
5733 @@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
5736 /* the get_extent function already copied into the page */
5737 - if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
5738 + if (test_range_bit(tree, cur, cur_end,
5739 + EXTENT_UPTODATE, 1, NULL)) {
5740 check_page_uptodate(tree, page);
5741 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
5743 @@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5747 + struct extent_state *cached_state = NULL;
5748 struct extent_map *em;
5749 struct block_device *bdev;
5751 @@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5754 if (!epd->extent_locked) {
5755 + u64 delalloc_to_write = 0;
5757 * make sure the wbc mapping index is at least updated
5759 @@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5760 tree->ops->fill_delalloc(inode, page, delalloc_start,
5761 delalloc_end, &page_started,
5764 + * delalloc_end is already one less than the total
5765 + * length, so we don't subtract one from PAGE_CACHE_SIZE
5768 + delalloc_to_write += (delalloc_end - delalloc_start +
5769 + PAGE_CACHE_SIZE) >>
5771 delalloc_start = delalloc_end + 1;
5773 + if (wbc->nr_to_write < delalloc_to_write) {
5774 + int thresh = 8192;
5776 + if (delalloc_to_write < thresh * 2)
5777 + thresh = delalloc_to_write;
5778 + wbc->nr_to_write = min_t(u64, delalloc_to_write,
5782 /* did the fill delalloc function already unlock and start
5784 @@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5788 - lock_extent(tree, start, page_end, GFP_NOFS);
5790 - unlock_start = start;
5792 if (tree->ops && tree->ops->writepage_start_hook) {
5793 ret = tree->ops->writepage_start_hook(page, start,
5795 if (ret == -EAGAIN) {
5796 - unlock_extent(tree, start, page_end, GFP_NOFS);
5797 redirty_page_for_writepage(wbc, page);
5798 update_nr_written(page, wbc, nr_written);
5800 @@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5801 update_nr_written(page, wbc, nr_written + 1);
5804 - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
5805 - printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
5807 if (last_byte <= start) {
5808 - clear_extent_dirty(tree, start, page_end, GFP_NOFS);
5809 - unlock_extent(tree, start, page_end, GFP_NOFS);
5810 if (tree->ops && tree->ops->writepage_end_io_hook)
5811 tree->ops->writepage_end_io_hook(page, start,
5813 @@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5817 - set_extent_uptodate(tree, start, page_end, GFP_NOFS);
5818 blocksize = inode->i_sb->s_blocksize;
5820 while (cur <= end) {
5821 if (cur >= last_byte) {
5822 - clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
5823 - unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
5824 if (tree->ops && tree->ops->writepage_end_io_hook)
5825 tree->ops->writepage_end_io_hook(page, cur,
5827 @@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5829 if (compressed || block_start == EXTENT_MAP_HOLE ||
5830 block_start == EXTENT_MAP_INLINE) {
5831 - clear_extent_dirty(tree, cur,
5832 - cur + iosize - 1, GFP_NOFS);
5834 - unlock_extent(tree, unlock_start, cur + iosize - 1,
5838 * end_io notification does not happen here for
5839 * compressed extents
5840 @@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5842 /* leave this out until we have a page_mkwrite call */
5843 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
5844 - EXTENT_DIRTY, 0)) {
5845 + EXTENT_DIRTY, 0, NULL)) {
5847 pg_offset += iosize;
5851 - clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
5852 if (tree->ops && tree->ops->writepage_io_hook) {
5853 ret = tree->ops->writepage_io_hook(page, cur,
5855 @@ -2309,12 +2415,12 @@ done:
5856 set_page_writeback(page);
5857 end_page_writeback(page);
5859 - if (unlock_start <= page_end)
5860 - unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
5865 + /* drop our reference on any cached states */
5866 + free_extent_state(cached_state);
5870 @@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
5871 writepage_t writepage, void *data,
5872 void (*flush_fn)(void *))
5874 - struct backing_dev_info *bdi = mapping->backing_dev_info;
5877 + int nr_to_write_done = 0;
5878 struct pagevec pvec;
5881 @@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
5885 - while (!done && (index <= end) &&
5886 + while (!done && !nr_to_write_done && (index <= end) &&
5887 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
5888 PAGECACHE_TAG_DIRTY, min(end - index,
5889 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
5890 @@ -2412,12 +2518,15 @@ retry:
5894 - if (ret || wbc->nr_to_write <= 0)
5896 - if (wbc->nonblocking && bdi_write_congested(bdi)) {
5897 - wbc->encountered_congestion = 1;
5903 + * the filesystem may choose to bump up nr_to_write.
5904 + * We have to make sure to honor the new nr_to_write
5907 + nr_to_write_done = wbc->nr_to_write <= 0;
5909 pagevec_release(&pvec);
5911 @@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
5914 lock_extent(tree, start, end, GFP_NOFS);
5915 - wait_on_extent_writeback(tree, start, end);
5916 + wait_on_page_writeback(page);
5917 clear_extent_bit(tree, start, end,
5918 - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
5920 + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5921 + EXTENT_DO_ACCOUNTING,
5922 + 1, 1, NULL, GFP_NOFS);
5926 @@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
5927 !isnew && !PageUptodate(page) &&
5928 (block_off_end > to || block_off_start < from) &&
5929 !test_range_bit(tree, block_start, cur_end,
5930 - EXTENT_UPTODATE, 1)) {
5931 + EXTENT_UPTODATE, 1, NULL)) {
5933 u64 extent_offset = block_start - em->start;
5935 @@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
5937 set_extent_bit(tree, block_start,
5938 block_start + iosize - 1,
5939 - EXTENT_LOCKED, 0, NULL, GFP_NOFS);
5940 + EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
5941 ret = submit_extent_page(READ, tree, page,
5942 sector, iosize, page_offset, em->bdev,
5944 @@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map,
5947 if (test_range_bit(tree, start, end,
5948 - EXTENT_IOBITS | EXTENT_ORDERED, 0))
5949 + EXTENT_IOBITS, 0, NULL))
5952 if ((mask & GFP_NOFS) == GFP_NOFS)
5954 - clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
5957 + * at this point we can safely clear everything except the
5958 + * locked bit and the nodatasum bit
5960 + clear_extent_bit(tree, start, end,
5961 + ~(EXTENT_LOCKED | EXTENT_NODATASUM),
5962 + 0, 0, NULL, mask);
5966 @@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
5968 while (start <= end) {
5969 len = end - start + 1;
5970 - spin_lock(&map->lock);
5971 + write_lock(&map->lock);
5972 em = lookup_extent_mapping(map, start, len);
5973 if (!em || IS_ERR(em)) {
5974 - spin_unlock(&map->lock);
5975 + write_unlock(&map->lock);
5978 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5979 em->start != start) {
5980 - spin_unlock(&map->lock);
5981 + write_unlock(&map->lock);
5982 free_extent_map(em);
5985 if (!test_range_bit(tree, em->start,
5986 extent_map_end(em) - 1,
5987 - EXTENT_LOCKED | EXTENT_WRITEBACK |
5990 + EXTENT_LOCKED | EXTENT_WRITEBACK,
5992 remove_extent_mapping(map, em);
5993 /* once for the rb tree */
5994 free_extent_map(em);
5996 start = extent_map_end(em);
5997 - spin_unlock(&map->lock);
5998 + write_unlock(&map->lock);
6001 free_extent_map(em);
6002 @@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
6004 unsigned long index;
6006 - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
6007 + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
6010 while (start <= end) {
6011 @@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
6014 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
6015 - EXTENT_UPTODATE, 1);
6016 + EXTENT_UPTODATE, 1, NULL);
6020 @@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
6023 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
6024 - EXTENT_UPTODATE, 1)) {
6025 + EXTENT_UPTODATE, 1, NULL)) {
6029 diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
6030 index 5bc20ab..36de250 100644
6031 --- a/fs/btrfs/extent_io.h
6032 +++ b/fs/btrfs/extent_io.h
6034 #define EXTENT_DEFRAG (1 << 6)
6035 #define EXTENT_DEFRAG_DONE (1 << 7)
6036 #define EXTENT_BUFFER_FILLED (1 << 8)
6037 -#define EXTENT_ORDERED (1 << 9)
6038 -#define EXTENT_ORDERED_METADATA (1 << 10)
6039 -#define EXTENT_BOUNDARY (1 << 11)
6040 -#define EXTENT_NODATASUM (1 << 12)
6041 +#define EXTENT_BOUNDARY (1 << 9)
6042 +#define EXTENT_NODATASUM (1 << 10)
6043 +#define EXTENT_DO_ACCOUNTING (1 << 11)
6044 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
6046 /* flags for bio submission */
6048 #define EXTENT_BUFFER_BLOCKING 1
6049 #define EXTENT_BUFFER_DIRTY 2
6051 +/* these are flags for extent_clear_unlock_delalloc */
6052 +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
6053 +#define EXTENT_CLEAR_UNLOCK 0x2
6054 +#define EXTENT_CLEAR_DELALLOC 0x4
6055 +#define EXTENT_CLEAR_DIRTY 0x8
6056 +#define EXTENT_SET_WRITEBACK 0x10
6057 +#define EXTENT_END_WRITEBACK 0x20
6058 +#define EXTENT_SET_PRIVATE2 0x40
6059 +#define EXTENT_CLEAR_ACCOUNTING 0x80
6062 * page->private values. Every page that is controlled by the extent
6063 * map has page->private set to one.
6064 @@ -62,8 +71,13 @@ struct extent_io_ops {
6065 struct extent_state *state, int uptodate);
6066 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
6067 unsigned long old, unsigned long bits);
6068 - int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
6069 - unsigned long old, unsigned long bits);
6070 + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
6071 + unsigned long bits);
6072 + int (*merge_extent_hook)(struct inode *inode,
6073 + struct extent_state *new,
6074 + struct extent_state *other);
6075 + int (*split_extent_hook)(struct inode *inode,
6076 + struct extent_state *orig, u64 split);
6077 int (*write_cache_pages_lock_hook)(struct page *page);
6080 @@ -81,10 +95,14 @@ struct extent_state {
6082 u64 end; /* inclusive */
6083 struct rb_node rb_node;
6085 + /* ADD NEW ELEMENTS AFTER THIS */
6086 struct extent_io_tree *tree;
6087 wait_queue_head_t wq;
6089 unsigned long state;
6093 /* for use by the FS */
6095 @@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map,
6096 struct extent_io_tree *tree, struct page *page,
6098 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
6099 +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
6100 + int bits, struct extent_state **cached, gfp_t mask);
6101 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
6102 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
6104 @@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
6105 u64 max_bytes, unsigned long bits);
6107 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
6108 - int bits, int filled);
6109 + int bits, int filled, struct extent_state *cached_state);
6110 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
6111 int bits, gfp_t mask);
6112 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
6113 - int bits, int wake, int delete, gfp_t mask);
6114 + int bits, int wake, int delete, struct extent_state **cached,
6116 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
6117 int bits, gfp_t mask);
6118 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
6119 @@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
6120 int extent_clear_unlock_delalloc(struct inode *inode,
6121 struct extent_io_tree *tree,
6122 u64 start, u64 end, struct page *locked_page,
6125 - int clear_delalloc, int clear_dirty,
6126 - int set_writeback,
6127 - int end_writeback);
6128 + unsigned long op);
6130 diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
6131 index 30c9365..2c726b7 100644
6132 --- a/fs/btrfs/extent_map.c
6133 +++ b/fs/btrfs/extent_map.c
6134 @@ -36,7 +36,7 @@ void extent_map_exit(void)
6135 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
6137 tree->map.rb_node = NULL;
6138 - spin_lock_init(&tree->lock);
6139 + rwlock_init(&tree->lock);
6143 @@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
6147 +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
6150 + struct extent_map *merge = NULL;
6151 + struct rb_node *rb;
6152 + struct extent_map *em;
6154 + write_lock(&tree->lock);
6155 + em = lookup_extent_mapping(tree, start, len);
6157 + WARN_ON(em->start != start || !em);
6162 + clear_bit(EXTENT_FLAG_PINNED, &em->flags);
6164 + if (em->start != 0) {
6165 + rb = rb_prev(&em->rb_node);
6167 + merge = rb_entry(rb, struct extent_map, rb_node);
6168 + if (rb && mergable_maps(merge, em)) {
6169 + em->start = merge->start;
6170 + em->len += merge->len;
6171 + em->block_len += merge->block_len;
6172 + em->block_start = merge->block_start;
6173 + merge->in_tree = 0;
6174 + rb_erase(&merge->rb_node, &tree->map);
6175 + free_extent_map(merge);
6179 + rb = rb_next(&em->rb_node);
6181 + merge = rb_entry(rb, struct extent_map, rb_node);
6182 + if (rb && mergable_maps(em, merge)) {
6183 + em->len += merge->len;
6184 + em->block_len += merge->len;
6185 + rb_erase(&merge->rb_node, &tree->map);
6186 + merge->in_tree = 0;
6187 + free_extent_map(merge);
6190 + free_extent_map(em);
6192 + write_unlock(&tree->lock);
6198 * add_extent_mapping - add new extent map to the extent tree
6199 * @tree: tree to insert new map in
6200 @@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
6204 - assert_spin_locked(&tree->lock);
6205 rb = tree_insert(&tree->map, em->start, &em->rb_node);
6208 @@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
6209 struct rb_node *next = NULL;
6210 u64 end = range_end(start, len);
6212 - assert_spin_locked(&tree->lock);
6213 rb_node = __tree_search(&tree->map, start, &prev, &next);
6214 if (!rb_node && prev) {
6215 em = rb_entry(prev, struct extent_map, rb_node);
6216 @@ -319,6 +367,54 @@ out:
6220 + * search_extent_mapping - find a nearby extent map
6221 + * @tree: tree to lookup in
6222 + * @start: byte offset to start the search
6223 + * @len: length of the lookup range
6225 + * Find and return the first extent_map struct in @tree that intersects the
6226 + * [start, len] range.
6228 + * If one can't be found, any nearby extent may be returned
6230 +struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
6231 + u64 start, u64 len)
6233 + struct extent_map *em;
6234 + struct rb_node *rb_node;
6235 + struct rb_node *prev = NULL;
6236 + struct rb_node *next = NULL;
6238 + rb_node = __tree_search(&tree->map, start, &prev, &next);
6239 + if (!rb_node && prev) {
6240 + em = rb_entry(prev, struct extent_map, rb_node);
6243 + if (!rb_node && next) {
6244 + em = rb_entry(next, struct extent_map, rb_node);
6251 + if (IS_ERR(rb_node)) {
6252 + em = ERR_PTR(PTR_ERR(rb_node));
6255 + em = rb_entry(rb_node, struct extent_map, rb_node);
6262 + atomic_inc(&em->refs);
6268 * remove_extent_mapping - removes an extent_map from the extent tree
6269 * @tree: extent tree to remove from
6270 * @em: extent map being removed
6271 @@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
6274 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
6275 - assert_spin_locked(&tree->lock);
6276 rb_erase(&em->rb_node, &tree->map);
6279 diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
6280 index fb6eeef..ab6d74b 100644
6281 --- a/fs/btrfs/extent_map.h
6282 +++ b/fs/btrfs/extent_map.h
6283 @@ -31,7 +31,7 @@ struct extent_map {
6285 struct extent_map_tree {
6291 static inline u64 extent_map_end(struct extent_map *em)
6292 @@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
6293 void free_extent_map(struct extent_map *em);
6294 int __init extent_map_init(void);
6295 void extent_map_exit(void);
6296 +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
6297 +struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
6298 + u64 start, u64 len);
6300 diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
6301 index 4b83397..4599113 100644
6302 --- a/fs/btrfs/file.c
6303 +++ b/fs/btrfs/file.c
6304 @@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
6307 struct inode *inode = fdentry(file)->d_inode;
6308 - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6312 u64 end_of_last_block;
6313 @@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
6314 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
6316 end_of_last_block = start_pos + num_bytes - 1;
6317 + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
6321 - lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
6322 - trans = btrfs_join_transaction(root, 1);
6327 - btrfs_set_trans_block_group(trans, inode);
6330 - set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
6332 - /* check for reserved extents on each page, we don't want
6333 - * to reset the delalloc bit on things that already have
6334 - * extents reserved.
6336 - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
6337 for (i = 0; i < num_pages; i++) {
6338 struct page *p = pages[i];
6340 @@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
6344 - err = btrfs_end_transaction(trans, root);
6346 - unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
6350 @@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6352 split2 = alloc_extent_map(GFP_NOFS);
6354 - spin_lock(&em_tree->lock);
6355 + write_lock(&em_tree->lock);
6356 em = lookup_extent_mapping(em_tree, start, len);
6358 - spin_unlock(&em_tree->lock);
6359 + write_unlock(&em_tree->lock);
6363 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
6364 - spin_unlock(&em_tree->lock);
6365 if (em->start <= start &&
6366 (!testend || em->start + em->len >= start + len)) {
6367 free_extent_map(em);
6368 + write_unlock(&em_tree->lock);
6371 if (start < em->start) {
6372 @@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6373 start = em->start + em->len;
6375 free_extent_map(em);
6376 + write_unlock(&em_tree->lock);
6379 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6380 @@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6381 free_extent_map(split);
6384 - spin_unlock(&em_tree->lock);
6385 + write_unlock(&em_tree->lock);
6388 free_extent_map(em);
6389 @@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6390 noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
6391 struct btrfs_root *root, struct inode *inode,
6392 u64 start, u64 end, u64 locked_end,
6393 - u64 inline_limit, u64 *hint_byte)
6394 + u64 inline_limit, u64 *hint_byte, int drop_cache)
6397 u64 search_start = start;
6398 @@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
6402 - btrfs_drop_extent_cache(inode, start, end - 1, 0);
6404 + btrfs_drop_extent_cache(inode, start, end - 1, 0);
6406 path = btrfs_alloc_path();
6408 @@ -894,7 +878,8 @@ again:
6409 btrfs_put_ordered_extent(ordered);
6411 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
6412 - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
6413 + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
6414 + EXTENT_DO_ACCOUNTING,
6416 unlock_extent(&BTRFS_I(inode)->io_tree,
6417 start_pos, last_pos - 1, GFP_NOFS);
6418 @@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
6421 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
6423 + /* do the reserve before the mutex lock in case we have to do some
6424 + * flushing. We wouldn't deadlock, but this is more polite.
6426 + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
6430 + mutex_lock(&inode->i_mutex);
6432 current->backing_dev_info = inode->i_mapping->backing_dev_info;
6433 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
6442 err = file_remove_suid(file);
6447 file_update_time(file);
6449 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
6451 - mutex_lock(&inode->i_mutex);
6452 + /* generic_write_checks can change our pos */
6455 BTRFS_I(inode)->sequence++;
6456 first_index = pos >> PAGE_CACHE_SHIFT;
6457 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
6458 @@ -1047,6 +1046,7 @@ out:
6459 mutex_unlock(&inode->i_mutex);
6462 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
6466 @@ -1087,8 +1087,10 @@ out_nolock:
6467 btrfs_end_transaction(trans, root);
6469 btrfs_commit_transaction(trans, root);
6471 + } else if (ret != BTRFS_NO_LOG_SYNC) {
6472 btrfs_commit_transaction(trans, root);
6474 + btrfs_end_transaction(trans, root);
6477 if (file->f_flags & O_DIRECT) {
6478 @@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6480 struct btrfs_trans_handle *trans;
6483 + /* we wait first, since the writeback may change the inode */
6484 + root->log_batch++;
6485 + /* the VFS called filemap_fdatawrite for us */
6486 + btrfs_wait_ordered_range(inode, 0, (u64)-1);
6487 + root->log_batch++;
6490 * check the transaction that last modified this inode
6491 * and see if its already been committed
6492 @@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6493 if (!BTRFS_I(inode)->last_trans)
6497 + * if the last transaction that changed this file was before
6498 + * the current transaction, we can bail out now without any
6501 mutex_lock(&root->fs_info->trans_mutex);
6502 if (BTRFS_I(inode)->last_trans <=
6503 root->fs_info->last_trans_committed) {
6504 @@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6506 mutex_unlock(&root->fs_info->trans_mutex);
6508 - root->log_batch++;
6509 - filemap_fdatawrite(inode->i_mapping);
6510 - btrfs_wait_ordered_range(inode, 0, (u64)-1);
6511 - root->log_batch++;
6513 - if (datasync && !(inode->i_state & I_DIRTY_PAGES))
6516 * ok we haven't committed the transaction yet, lets do a commit
6518 @@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6520 mutex_unlock(&dentry->d_inode->i_mutex);
6523 - ret = btrfs_commit_transaction(trans, root);
6525 - ret = btrfs_sync_log(trans, root);
6527 - ret = btrfs_end_transaction(trans, root);
6529 + if (ret != BTRFS_NO_LOG_SYNC) {
6531 ret = btrfs_commit_transaction(trans, root);
6533 + ret = btrfs_sync_log(trans, root);
6535 + ret = btrfs_end_transaction(trans, root);
6537 + ret = btrfs_commit_transaction(trans, root);
6540 + ret = btrfs_end_transaction(trans, root);
6542 mutex_lock(&dentry->d_inode->i_mutex);
6544 diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
6545 index 5edcee3..5c2caad 100644
6546 --- a/fs/btrfs/free-space-cache.c
6547 +++ b/fs/btrfs/free-space-cache.c
6548 @@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
6550 static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
6552 - u64 max_bytes, possible_bytes;
6558 * The goal is to keep the total amount of memory used per 1gb of space
6559 @@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
6560 max_bytes = MAX_CACHE_BYTES_PER_GIG *
6561 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
6563 - possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
6564 - (sizeof(struct btrfs_free_space) *
6565 - block_group->extents_thresh);
6567 + * we want to account for 1 more bitmap than what we have so we can make
6568 + * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
6569 + * we add more bitmaps.
6571 + bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
6573 - if (possible_bytes > max_bytes) {
6574 - int extent_bytes = max_bytes -
6575 - (block_group->total_bitmaps * PAGE_CACHE_SIZE);
6576 + if (bitmap_bytes >= max_bytes) {
6577 + block_group->extents_thresh = 0;
6581 - if (extent_bytes <= 0) {
6582 - block_group->extents_thresh = 0;
6586 + * we want the extent entry threshold to always be at most 1/2 the max
6587 + * bytes we can have, or whatever is less than that.
6589 + extent_bytes = max_bytes - bitmap_bytes;
6590 + extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
6592 - block_group->extents_thresh = extent_bytes /
6593 - (sizeof(struct btrfs_free_space));
6595 + block_group->extents_thresh =
6596 + div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
6599 static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
6600 @@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
6601 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
6603 info->offset = offset_to_bitmap(block_group, offset);
6605 link_free_space(block_group, info);
6606 block_group->total_bitmaps++;
6608 diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
6609 index 6b627c6..72ce3c1 100644
6610 --- a/fs/btrfs/inode-item.c
6611 +++ b/fs/btrfs/inode-item.c
6612 @@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
6613 ptr = (unsigned long)(ref + 1);
6615 } else if (ret < 0) {
6616 + if (ret == -EOVERFLOW)
6620 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
6621 @@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
6623 ret = btrfs_insert_empty_item(trans, root, path, &key,
6624 sizeof(struct btrfs_inode_item));
6625 - if (ret == 0 && objectid > root->highest_inode)
6626 - root->highest_inode = objectid;
6630 diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
6631 index 9abbced..c56eb59 100644
6632 --- a/fs/btrfs/inode-map.c
6633 +++ b/fs/btrfs/inode-map.c
6634 @@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
6635 slot = path->slots[0] - 1;
6637 btrfs_item_key_to_cpu(l, &found_key, slot);
6638 - *objectid = found_key.objectid;
6639 + *objectid = max_t(u64, found_key.objectid,
6640 + BTRFS_FIRST_FREE_OBJECTID - 1);
6642 - *objectid = BTRFS_FIRST_FREE_OBJECTID;
6643 + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
6647 @@ -53,91 +54,27 @@ error:
6652 - * walks the btree of allocated inodes and find a hole.
6654 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
6655 struct btrfs_root *root,
6656 u64 dirid, u64 *objectid)
6658 - struct btrfs_path *path;
6659 - struct btrfs_key key;
6664 - struct extent_buffer *l;
6665 - struct btrfs_key search_key;
6666 - u64 search_start = dirid;
6668 mutex_lock(&root->objectid_mutex);
6669 - if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
6670 - root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
6671 - *objectid = ++root->last_inode_alloc;
6672 - mutex_unlock(&root->objectid_mutex);
6675 - path = btrfs_alloc_path();
6677 - search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
6678 - search_key.objectid = search_start;
6679 - search_key.type = 0;
6680 - search_key.offset = 0;
6683 - ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
6688 - l = path->nodes[0];
6689 - slot = path->slots[0];
6690 - if (slot >= btrfs_header_nritems(l)) {
6691 - ret = btrfs_next_leaf(root, path);
6696 - if (!start_found) {
6697 - *objectid = search_start;
6701 - *objectid = last_ino > search_start ?
6702 - last_ino : search_start;
6705 - btrfs_item_key_to_cpu(l, &key, slot);
6706 - if (key.objectid >= search_start) {
6707 - if (start_found) {
6708 - if (last_ino < search_start)
6709 - last_ino = search_start;
6710 - if (key.objectid > last_ino) {
6711 - *objectid = last_ino;
6714 - } else if (key.objectid > search_start) {
6715 - *objectid = search_start;
6719 - if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
6721 + if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
6722 + ret = btrfs_find_highest_inode(root, &root->highest_objectid);
6728 - last_ino = key.objectid + 1;
6730 + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
6736 - btrfs_release_path(root, path);
6737 - btrfs_free_path(path);
6738 - BUG_ON(*objectid < search_start);
6739 - mutex_unlock(&root->objectid_mutex);
6742 - btrfs_release_path(root, path);
6743 - btrfs_free_path(path);
6745 + *objectid = ++root->highest_objectid;
6748 mutex_unlock(&root->objectid_mutex);
6751 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
6752 index 59cba18..f69e5e0 100644
6753 --- a/fs/btrfs/inode.c
6754 +++ b/fs/btrfs/inode.c
6755 @@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
6758 ret = btrfs_drop_extents(trans, root, inode, start,
6759 - aligned_end, aligned_end, start, &hint_byte);
6760 + aligned_end, aligned_end, start,
6764 if (isize > actual_end)
6765 @@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
6766 inline_len, compressed_size,
6769 - btrfs_drop_extent_cache(inode, start, aligned_end, 0);
6770 + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
6774 @@ -423,9 +424,12 @@ again:
6775 * and free up our temp pages.
6777 extent_clear_unlock_delalloc(inode,
6778 - &BTRFS_I(inode)->io_tree,
6779 - start, end, NULL, 1, 0,
6781 + &BTRFS_I(inode)->io_tree,
6783 + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
6784 + EXTENT_CLEAR_DELALLOC |
6785 + EXTENT_CLEAR_ACCOUNTING |
6786 + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
6788 goto free_pages_out;
6790 @@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
6791 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6794 - spin_lock(&em_tree->lock);
6795 + write_lock(&em_tree->lock);
6796 ret = add_extent_mapping(em_tree, em);
6797 - spin_unlock(&em_tree->lock);
6798 + write_unlock(&em_tree->lock);
6799 if (ret != -EEXIST) {
6800 free_extent_map(em);
6802 @@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
6803 * clear dirty, set writeback and unlock the pages.
6805 extent_clear_unlock_delalloc(inode,
6806 - &BTRFS_I(inode)->io_tree,
6807 - async_extent->start,
6808 - async_extent->start +
6809 - async_extent->ram_size - 1,
6810 - NULL, 1, 1, 0, 1, 1, 0);
6811 + &BTRFS_I(inode)->io_tree,
6812 + async_extent->start,
6813 + async_extent->start +
6814 + async_extent->ram_size - 1,
6815 + NULL, EXTENT_CLEAR_UNLOCK_PAGE |
6816 + EXTENT_CLEAR_UNLOCK |
6817 + EXTENT_CLEAR_DELALLOC |
6818 + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
6820 ret = btrfs_submit_compressed_write(inode,
6821 async_extent->start,
6822 @@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode,
6823 start, end, 0, NULL);
6825 extent_clear_unlock_delalloc(inode,
6826 - &BTRFS_I(inode)->io_tree,
6827 - start, end, NULL, 1, 1,
6829 + &BTRFS_I(inode)->io_tree,
6831 + EXTENT_CLEAR_UNLOCK_PAGE |
6832 + EXTENT_CLEAR_UNLOCK |
6833 + EXTENT_CLEAR_DELALLOC |
6834 + EXTENT_CLEAR_ACCOUNTING |
6835 + EXTENT_CLEAR_DIRTY |
6836 + EXTENT_SET_WRITEBACK |
6837 + EXTENT_END_WRITEBACK);
6838 *nr_written = *nr_written +
6839 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
6841 @@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode,
6842 BUG_ON(disk_num_bytes >
6843 btrfs_super_total_bytes(&root->fs_info->super_copy));
6846 + read_lock(&BTRFS_I(inode)->extent_tree.lock);
6847 + em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
6848 + start, num_bytes);
6850 + alloc_hint = em->block_start;
6851 + free_extent_map(em);
6853 + read_unlock(&BTRFS_I(inode)->extent_tree.lock);
6854 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
6856 while (disk_num_bytes > 0) {
6859 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
6860 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
6861 root->sectorsize, 0, alloc_hint,
6862 @@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode,
6863 em = alloc_extent_map(GFP_NOFS);
6865 em->orig_start = em->start;
6867 ram_size = ins.offset;
6868 em->len = ins.offset;
6870 @@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode,
6871 set_bit(EXTENT_FLAG_PINNED, &em->flags);
6874 - spin_lock(&em_tree->lock);
6875 + write_lock(&em_tree->lock);
6876 ret = add_extent_mapping(em_tree, em);
6877 - spin_unlock(&em_tree->lock);
6878 + write_unlock(&em_tree->lock);
6879 if (ret != -EEXIST) {
6880 free_extent_map(em);
6882 @@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode,
6883 /* we're not doing compressed IO, don't unlock the first
6884 * page (which the caller expects to stay locked), don't
6885 * clear any dirty bits and don't set any writeback bits
6887 + * Do set the Private2 bit so we know this page was properly
6888 + * setup for writepage
6890 + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
6891 + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
6892 + EXTENT_SET_PRIVATE2;
6894 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
6895 start, start + ram_size - 1,
6896 - locked_page, unlock, 1,
6899 disk_num_bytes -= cur_alloc_size;
6900 num_bytes -= cur_alloc_size;
6901 alloc_hint = ins.objectid + ins.offset;
6902 @@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
6904 int limit = 10 * 1024 * 1042;
6906 - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
6907 - EXTENT_DELALLOC, 1, 0, GFP_NOFS);
6908 + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
6909 + 1, 0, NULL, GFP_NOFS);
6910 while (start < end) {
6911 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
6912 async_cow->inode = inode;
6913 @@ -994,6 +1023,7 @@ next_slot:
6915 if (found_key.offset > cur_offset) {
6916 extent_end = found_key.offset;
6921 @@ -1080,9 +1110,9 @@ out_check:
6922 em->bdev = root->fs_info->fs_devices->latest_bdev;
6923 set_bit(EXTENT_FLAG_PINNED, &em->flags);
6925 - spin_lock(&em_tree->lock);
6926 + write_lock(&em_tree->lock);
6927 ret = add_extent_mapping(em_tree, em);
6928 - spin_unlock(&em_tree->lock);
6929 + write_unlock(&em_tree->lock);
6930 if (ret != -EEXIST) {
6931 free_extent_map(em);
6933 @@ -1100,8 +1130,10 @@ out_check:
6936 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
6937 - cur_offset, cur_offset + num_bytes - 1,
6938 - locked_page, 1, 1, 1, 0, 0, 0);
6939 + cur_offset, cur_offset + num_bytes - 1,
6940 + locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
6941 + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
6942 + EXTENT_SET_PRIVATE2);
6943 cur_offset = extent_end;
6944 if (cur_offset > end)
6946 @@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
6950 +static int btrfs_split_extent_hook(struct inode *inode,
6951 + struct extent_state *orig, u64 split)
6953 + struct btrfs_root *root = BTRFS_I(inode)->root;
6956 + if (!(orig->state & EXTENT_DELALLOC))
6959 + size = orig->end - orig->start + 1;
6960 + if (size > root->fs_info->max_extent) {
6964 + new_size = orig->end - split + 1;
6965 + num_extents = div64_u64(size + root->fs_info->max_extent - 1,
6966 + root->fs_info->max_extent);
 6969 + * if we break a large extent up then leave outstanding_extents
6970 + * be, since we've already accounted for the large extent.
6972 + if (div64_u64(new_size + root->fs_info->max_extent - 1,
6973 + root->fs_info->max_extent) < num_extents)
6977 + spin_lock(&BTRFS_I(inode)->accounting_lock);
6978 + BTRFS_I(inode)->outstanding_extents++;
6979 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
6985 + * extent_io.c merge_extent_hook, used to track merged delayed allocation
6986 + * extents so we can keep track of new extents that are just merged onto old
6987 + * extents, such as when we are doing sequential writes, so we can properly
6988 + * account for the metadata space we'll need.
6990 +static int btrfs_merge_extent_hook(struct inode *inode,
6991 + struct extent_state *new,
6992 + struct extent_state *other)
6994 + struct btrfs_root *root = BTRFS_I(inode)->root;
6995 + u64 new_size, old_size;
6998 + /* not delalloc, ignore it */
6999 + if (!(other->state & EXTENT_DELALLOC))
7002 + old_size = other->end - other->start + 1;
7003 + if (new->start < other->start)
7004 + new_size = other->end - new->start + 1;
7006 + new_size = new->end - other->start + 1;
7008 + /* we're not bigger than the max, unreserve the space and go */
7009 + if (new_size <= root->fs_info->max_extent) {
7010 + spin_lock(&BTRFS_I(inode)->accounting_lock);
7011 + BTRFS_I(inode)->outstanding_extents--;
7012 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
7017 + * If we grew by another max_extent, just return, we want to keep that
7018 + * reserved amount.
7020 + num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
7021 + root->fs_info->max_extent);
7022 + if (div64_u64(new_size + root->fs_info->max_extent - 1,
7023 + root->fs_info->max_extent) > num_extents)
7026 + spin_lock(&BTRFS_I(inode)->accounting_lock);
7027 + BTRFS_I(inode)->outstanding_extents--;
7028 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
7034 * extent_io.c set_bit_hook, used to track delayed allocation
7035 * bytes in this file, and to maintain the list of inodes that
7036 @@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
7037 static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
7038 unsigned long old, unsigned long bits)
7042 * set_bit and clear bit hooks normally require _irqsave/restore
7043 * but in this case, we are only testeing for the DELALLOC
7044 @@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
7046 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
7047 struct btrfs_root *root = BTRFS_I(inode)->root;
7049 + spin_lock(&BTRFS_I(inode)->accounting_lock);
7050 + BTRFS_I(inode)->outstanding_extents++;
7051 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
7052 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
7053 spin_lock(&root->fs_info->delalloc_lock);
7054 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
7055 @@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
7057 * extent_io.c clear_bit_hook, see set_bit_hook for why
7059 -static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
7060 - unsigned long old, unsigned long bits)
7061 +static int btrfs_clear_bit_hook(struct inode *inode,
7062 + struct extent_state *state, unsigned long bits)
7065 * set_bit and clear bit hooks normally require _irqsave/restore
7066 * but in this case, we are only testeing for the DELALLOC
7067 * bit, which is only set or cleared with irqs on
7069 - if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
7070 + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
7071 struct btrfs_root *root = BTRFS_I(inode)->root;
7073 + if (bits & EXTENT_DO_ACCOUNTING) {
7074 + spin_lock(&BTRFS_I(inode)->accounting_lock);
7075 + BTRFS_I(inode)->outstanding_extents--;
7076 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
7077 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
7080 spin_lock(&root->fs_info->delalloc_lock);
7081 - if (end - start + 1 > root->fs_info->delalloc_bytes) {
7082 + if (state->end - state->start + 1 >
7083 + root->fs_info->delalloc_bytes) {
7084 printk(KERN_INFO "btrfs warning: delalloc account "
7086 - (unsigned long long)end - start + 1,
7087 + (unsigned long long)
7088 + state->end - state->start + 1,
7089 (unsigned long long)
7090 root->fs_info->delalloc_bytes);
7091 btrfs_delalloc_free_space(root, inode, (u64)-1);
7092 @@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
7093 BTRFS_I(inode)->delalloc_bytes = 0;
7095 btrfs_delalloc_free_space(root, inode,
7097 - root->fs_info->delalloc_bytes -= end - start + 1;
7098 - BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
7100 + state->start + 1);
7101 + root->fs_info->delalloc_bytes -= state->end -
7103 + BTRFS_I(inode)->delalloc_bytes -= state->end -
7106 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
7107 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
7108 @@ -1374,10 +1506,8 @@ again:
7109 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
7111 /* already ordered? We're done */
7112 - if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7113 - EXTENT_ORDERED, 0)) {
7114 + if (PagePrivate2(page))
7118 ordered = btrfs_lookup_ordered_extent(inode, page_start);
7120 @@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
7121 struct inode *inode = page->mapping->host;
7122 struct btrfs_writepage_fixup *fixup;
7123 struct btrfs_root *root = BTRFS_I(inode)->root;
7126 - ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
7127 - EXTENT_ORDERED, 0);
7129 + /* this page is properly in the ordered list */
7130 + if (TestClearPagePrivate2(page))
7133 if (PageChecked(page))
7134 @@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
7137 path->leave_spinning = 1;
7140 + * we may be replacing one extent in the tree with another.
7141 + * The new extent is pinned in the extent map, and we don't want
7142 + * to drop it from the cache until it is completely in the btree.
7144 + * So, tell btrfs_drop_extents to leave this extent in the cache.
7145 + * the caller is expected to unpin it and allow it to be merged
7146 + * with the others.
7148 ret = btrfs_drop_extents(trans, root, inode, file_pos,
7149 file_pos + num_bytes, locked_end,
7151 + file_pos, &hint, 0);
7154 ins.objectid = inode->i_ino;
7155 @@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
7156 btrfs_mark_buffer_dirty(leaf);
7158 inode_add_bytes(inode, num_bytes);
7159 - btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
7161 ins.objectid = disk_bytenr;
7162 ins.offset = disk_num_bytes;
7163 @@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
7164 ordered_extent->len,
7166 BTRFS_FILE_EXTENT_REG);
7167 + unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
7168 + ordered_extent->file_offset,
7169 + ordered_extent->len);
7172 unlock_extent(io_tree, ordered_extent->file_offset,
7173 @@ -1623,6 +1763,7 @@ nocow:
7174 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
7175 struct extent_state *state, int uptodate)
7177 + ClearPagePrivate2(page);
7178 return btrfs_finish_ordered_io(page->mapping->host, start, end);
7181 @@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
7182 failrec->last_mirror = 0;
7183 failrec->bio_flags = 0;
7185 - spin_lock(&em_tree->lock);
7186 + read_lock(&em_tree->lock);
7187 em = lookup_extent_mapping(em_tree, start, failrec->len);
7188 if (em->start > start || em->start + em->len < start) {
7189 free_extent_map(em);
7192 - spin_unlock(&em_tree->lock);
7193 + read_unlock(&em_tree->lock);
7195 if (!em || IS_ERR(em)) {
7197 @@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
7200 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
7201 - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
7202 + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
7203 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
7206 @@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
7210 +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
7211 + struct btrfs_root *root,
7212 + struct inode *dir, u64 objectid,
7213 + const char *name, int name_len)
7215 + struct btrfs_path *path;
7216 + struct extent_buffer *leaf;
7217 + struct btrfs_dir_item *di;
7218 + struct btrfs_key key;
7222 + path = btrfs_alloc_path();
7226 + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
7227 + name, name_len, -1);
7228 + BUG_ON(!di || IS_ERR(di));
7230 + leaf = path->nodes[0];
7231 + btrfs_dir_item_key_to_cpu(leaf, di, &key);
7232 + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
7233 + ret = btrfs_delete_one_dir_name(trans, root, path, di);
7235 + btrfs_release_path(root, path);
7237 + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
7238 + objectid, root->root_key.objectid,
7239 + dir->i_ino, &index, name, name_len);
7241 + BUG_ON(ret != -ENOENT);
7242 + di = btrfs_search_dir_index_item(root, path, dir->i_ino,
7244 + BUG_ON(!di || IS_ERR(di));
7246 + leaf = path->nodes[0];
7247 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7248 + btrfs_release_path(root, path);
7249 + index = key.offset;
7252 + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
7253 + index, name, name_len, -1);
7254 + BUG_ON(!di || IS_ERR(di));
7256 + leaf = path->nodes[0];
7257 + btrfs_dir_item_key_to_cpu(leaf, di, &key);
7258 + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
7259 + ret = btrfs_delete_one_dir_name(trans, root, path, di);
7261 + btrfs_release_path(root, path);
7263 + btrfs_i_size_write(dir, dir->i_size - name_len * 2);
7264 + dir->i_mtime = dir->i_ctime = CURRENT_TIME;
7265 + ret = btrfs_update_inode(trans, root, dir);
7267 + dir->i_sb->s_dirt = 1;
7269 + btrfs_free_path(path);
7273 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
7275 struct inode *inode = dentry->d_inode;
7276 @@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
7277 struct btrfs_trans_handle *trans;
7278 unsigned long nr = 0;
7281 - * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
7282 - * the root of a subvolume or snapshot
7284 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
7285 - inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
7286 + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
7290 trans = btrfs_start_transaction(root, 1);
7291 btrfs_set_trans_block_group(trans, dir);
7293 + if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
7294 + err = btrfs_unlink_subvol(trans, root, dir,
7295 + BTRFS_I(inode)->location.objectid,
7296 + dentry->d_name.name,
7297 + dentry->d_name.len);
7301 err = btrfs_orphan_add(trans, inode);
7306 /* now the directory is empty */
7307 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
7308 dentry->d_name.name, dentry->d_name.len);
7310 btrfs_i_size_write(inode, 0);
7314 nr = trans->blocks_used;
7315 ret = btrfs_end_transaction_throttle(trans, root);
7316 btrfs_btree_balance_dirty(root, nr);
7317 @@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
7319 if ((offset & (blocksize - 1)) == 0)
7321 + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
7325 + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
7331 page = grab_cache_page(mapping, index);
7334 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
7335 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
7339 page_start = page_offset(page);
7340 page_end = page_start + PAGE_CACHE_SIZE - 1;
7341 @@ -2864,7 +3080,16 @@ again:
7345 - btrfs_set_extent_delalloc(inode, page_start, page_end);
7346 + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
7347 + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
7350 + ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
7352 + unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
7357 if (offset != PAGE_CACHE_SIZE) {
7359 @@ -2877,6 +3102,9 @@ again:
7360 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
7364 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
7365 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
7367 page_cache_release(page);
7369 @@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
7376 if (size <= hole_start)
7379 - err = btrfs_check_metadata_free_space(root);
7380 + err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
7384 - btrfs_truncate_page(inode->i_mapping, inode->i_size);
7387 struct btrfs_ordered_extent *ordered;
7388 btrfs_wait_ordered_range(inode, hole_start,
7389 @@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
7391 cur_offset + hole_size,
7393 - cur_offset, &hint_byte);
7394 + cur_offset, &hint_byte, 1);
7398 + err = btrfs_reserve_metadata_space(root, 1);
7402 err = btrfs_insert_file_extent(trans, root,
7403 inode->i_ino, cur_offset, 0,
7404 0, hole_size, 0, hole_size,
7406 btrfs_drop_extent_cache(inode, hole_start,
7408 + btrfs_unreserve_metadata_space(root, 1);
7410 free_extent_map(em);
7411 cur_offset = last_byte;
7412 @@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode)
7414 btrfs_wait_ordered_range(inode, 0, (u64)-1);
7416 + if (inode->i_nlink > 0) {
7417 + BUG_ON(btrfs_root_refs(&root->root_item) != 0);
7421 btrfs_i_size_write(inode, 0);
7422 trans = btrfs_join_transaction(root, 1);
7424 @@ -3070,29 +3307,67 @@ out_err:
7425 * is kind of like crossing a mount point.
7427 static int fixup_tree_root_location(struct btrfs_root *root,
7428 - struct btrfs_key *location,
7429 - struct btrfs_root **sub_root,
7430 - struct dentry *dentry)
7431 + struct inode *dir,
7432 + struct dentry *dentry,
7433 + struct btrfs_key *location,
7434 + struct btrfs_root **sub_root)
7436 - struct btrfs_root_item *ri;
7437 + struct btrfs_path *path;
7438 + struct btrfs_root *new_root;
7439 + struct btrfs_root_ref *ref;
7440 + struct extent_buffer *leaf;
7444 - if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
7446 - if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
7448 + path = btrfs_alloc_path();
7454 - *sub_root = btrfs_read_fs_root(root->fs_info, location,
7455 - dentry->d_name.name,
7456 - dentry->d_name.len);
7457 - if (IS_ERR(*sub_root))
7458 - return PTR_ERR(*sub_root);
7460 + ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
7461 + BTRFS_I(dir)->root->root_key.objectid,
7462 + location->objectid);
7469 - ri = &(*sub_root)->root_item;
7470 - location->objectid = btrfs_root_dirid(ri);
7471 - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
7472 - location->offset = 0;
7473 + leaf = path->nodes[0];
7474 + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
7475 + if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
7476 + btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
7480 + ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
7481 + (unsigned long)(ref + 1),
7482 + dentry->d_name.len);
7486 + btrfs_release_path(root->fs_info->tree_root, path);
7488 + new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
7489 + if (IS_ERR(new_root)) {
7490 + err = PTR_ERR(new_root);
7494 + if (btrfs_root_refs(&new_root->root_item) == 0) {
7499 + *sub_root = new_root;
7500 + location->objectid = btrfs_root_dirid(&new_root->root_item);
7501 + location->type = BTRFS_INODE_ITEM_KEY;
7502 + location->offset = 0;
7505 + btrfs_free_path(path);
7509 static void inode_tree_add(struct inode *inode)
7510 @@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode)
7511 struct btrfs_inode *entry;
7513 struct rb_node *parent;
7516 p = &root->inode_tree.rb_node;
7519 + if (hlist_unhashed(&inode->i_hash))
7522 spin_lock(&root->inode_lock);
7525 @@ -3132,13 +3409,87 @@ again:
7526 static void inode_tree_del(struct inode *inode)
7528 struct btrfs_root *root = BTRFS_I(inode)->root;
7531 spin_lock(&root->inode_lock);
7532 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
7533 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
7534 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
7535 + empty = RB_EMPTY_ROOT(&root->inode_tree);
7537 + spin_unlock(&root->inode_lock);
7539 + if (empty && btrfs_root_refs(&root->root_item) == 0) {
7540 + synchronize_srcu(&root->fs_info->subvol_srcu);
7541 + spin_lock(&root->inode_lock);
7542 + empty = RB_EMPTY_ROOT(&root->inode_tree);
7543 + spin_unlock(&root->inode_lock);
7545 + btrfs_add_dead_root(root);
7549 +int btrfs_invalidate_inodes(struct btrfs_root *root)
7551 + struct rb_node *node;
7552 + struct rb_node *prev;
7553 + struct btrfs_inode *entry;
7554 + struct inode *inode;
7557 + WARN_ON(btrfs_root_refs(&root->root_item) != 0);
7559 + spin_lock(&root->inode_lock);
7561 + node = root->inode_tree.rb_node;
7565 + entry = rb_entry(node, struct btrfs_inode, rb_node);
7567 + if (objectid < entry->vfs_inode.i_ino)
7568 + node = node->rb_left;
7569 + else if (objectid > entry->vfs_inode.i_ino)
7570 + node = node->rb_right;
7576 + entry = rb_entry(prev, struct btrfs_inode, rb_node);
7577 + if (objectid <= entry->vfs_inode.i_ino) {
7581 + prev = rb_next(prev);
7585 + entry = rb_entry(node, struct btrfs_inode, rb_node);
7586 + objectid = entry->vfs_inode.i_ino + 1;
7587 + inode = igrab(&entry->vfs_inode);
7589 + spin_unlock(&root->inode_lock);
7590 + if (atomic_read(&inode->i_count) > 1)
7591 + d_prune_aliases(inode);
7593 + * btrfs_drop_inode will remove it from
7594 + * the inode cache when its usage count
7599 + spin_lock(&root->inode_lock);
7603 + if (cond_resched_lock(&root->inode_lock))
7606 + node = rb_next(node);
7608 spin_unlock(&root->inode_lock);
7612 static noinline void init_btrfs_i(struct inode *inode)
7613 @@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode)
7617 + bi->last_sub_trans = 0;
7618 bi->logged_trans = 0;
7619 bi->delalloc_bytes = 0;
7620 bi->reserved_bytes = 0;
7621 @@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
7625 +static struct inode *new_simple_dir(struct super_block *s,
7626 + struct btrfs_key *key,
7627 + struct btrfs_root *root)
7629 + struct inode *inode = new_inode(s);
7632 + return ERR_PTR(-ENOMEM);
7634 + init_btrfs_i(inode);
7636 + BTRFS_I(inode)->root = root;
7637 + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
7638 + BTRFS_I(inode)->dummy_inode = 1;
7640 + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
7641 + inode->i_op = &simple_dir_inode_operations;
7642 + inode->i_fop = &simple_dir_operations;
7643 + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
7644 + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
7649 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
7651 struct inode *inode;
7652 - struct btrfs_inode *bi = BTRFS_I(dir);
7653 - struct btrfs_root *root = bi->root;
7654 + struct btrfs_root *root = BTRFS_I(dir)->root;
7655 struct btrfs_root *sub_root = root;
7656 struct btrfs_key location;
7660 + dentry->d_op = &btrfs_dentry_operations;
7662 if (dentry->d_name.len > BTRFS_NAME_LEN)
7663 return ERR_PTR(-ENAMETOOLONG);
7665 @@ -3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
7667 return ERR_PTR(ret);
7670 - if (location.objectid) {
7671 - ret = fixup_tree_root_location(root, &location, &sub_root,
7674 - return ERR_PTR(ret);
7676 - return ERR_PTR(-ENOENT);
7677 + if (location.objectid == 0)
7680 + if (location.type == BTRFS_INODE_ITEM_KEY) {
7681 + inode = btrfs_iget(dir->i_sb, &location, root);
7685 + BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
7687 + index = srcu_read_lock(&root->fs_info->subvol_srcu);
7688 + ret = fixup_tree_root_location(root, dir, dentry,
7689 + &location, &sub_root);
7691 + if (ret != -ENOENT)
7692 + inode = ERR_PTR(ret);
7694 + inode = new_simple_dir(dir->i_sb, &location, sub_root);
7696 inode = btrfs_iget(dir->i_sb, &location, sub_root);
7697 - if (IS_ERR(inode))
7698 - return ERR_CAST(inode);
7700 + srcu_read_unlock(&root->fs_info->subvol_srcu, index);
7705 +static int btrfs_dentry_delete(struct dentry *dentry)
7707 + struct btrfs_root *root;
7709 + if (!dentry->d_inode && !IS_ROOT(dentry))
7710 + dentry = dentry->d_parent;
7712 + if (dentry->d_inode) {
7713 + root = BTRFS_I(dentry->d_inode)->root;
7714 + if (btrfs_root_refs(&root->root_item) == 0)
7720 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
7721 struct nameidata *nd)
7723 struct inode *inode;
7725 - if (dentry->d_name.len > BTRFS_NAME_LEN)
7726 - return ERR_PTR(-ENAMETOOLONG);
7728 inode = btrfs_lookup_dentry(dir, dentry);
7730 return ERR_CAST(inode);
7731 @@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
7735 - if (objectid > root->highest_inode)
7736 - root->highest_inode = objectid;
7738 inode->i_uid = current_fsuid();
7740 if (dir && (dir->i_mode & S_ISGID)) {
7741 @@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
7742 struct inode *parent_inode, struct inode *inode,
7743 const char *name, int name_len, int add_backref, u64 index)
7747 struct btrfs_key key;
7748 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
7750 - key.objectid = inode->i_ino;
7751 - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
7753 + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7754 + memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
7756 + key.objectid = inode->i_ino;
7757 + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
7761 + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7762 + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
7763 + key.objectid, root->root_key.objectid,
7764 + parent_inode->i_ino,
7765 + index, name, name_len);
7766 + } else if (add_backref) {
7767 + ret = btrfs_insert_inode_ref(trans, root,
7768 + name, name_len, inode->i_ino,
7769 + parent_inode->i_ino, index);
7772 - ret = btrfs_insert_dir_item(trans, root, name, name_len,
7773 - parent_inode->i_ino,
7774 - &key, btrfs_inode_type(inode),
7777 - if (add_backref) {
7778 - ret = btrfs_insert_inode_ref(trans, root,
7781 - parent_inode->i_ino,
7784 + ret = btrfs_insert_dir_item(trans, root, name, name_len,
7785 + parent_inode->i_ino, &key,
7786 + btrfs_inode_type(inode), index);
7789 btrfs_i_size_write(parent_inode, parent_inode->i_size +
7791 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
7792 @@ -3732,11 +4139,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
7793 if (!new_valid_dev(rdev))
7796 - err = btrfs_check_metadata_free_space(root);
7798 + * 2 for inode item and ref
7800 + * 1 for xattr if selinux is on
7802 + err = btrfs_reserve_metadata_space(root, 5);
7807 trans = btrfs_start_transaction(root, 1);
7810 btrfs_set_trans_block_group(trans, dir);
7812 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
7813 @@ -3774,6 +4188,7 @@ out_unlock:
7814 nr = trans->blocks_used;
7815 btrfs_end_transaction_throttle(trans, root);
7817 + btrfs_unreserve_metadata_space(root, 5);
7819 inode_dec_link_count(inode);
7821 @@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
7825 - err = btrfs_check_metadata_free_space(root);
7827 + * 2 for inode item and ref
7829 + * 1 for xattr if selinux is on
7831 + err = btrfs_reserve_metadata_space(root, 5);
7836 trans = btrfs_start_transaction(root, 1);
7839 btrfs_set_trans_block_group(trans, dir);
7841 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
7842 @@ -3838,6 +4261,7 @@ out_unlock:
7843 nr = trans->blocks_used;
7844 btrfs_end_transaction_throttle(trans, root);
7846 + btrfs_unreserve_metadata_space(root, 5);
7848 inode_dec_link_count(inode);
7850 @@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
7851 if (inode->i_nlink == 0)
7854 - btrfs_inc_nlink(inode);
7855 - err = btrfs_check_metadata_free_space(root);
7857 + * 1 item for inode ref
7858 + * 2 items for dir items
7860 + err = btrfs_reserve_metadata_space(root, 3);
7865 + btrfs_inc_nlink(inode);
7867 err = btrfs_set_inode_index(dir, &index);
7870 @@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
7872 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
7877 - btrfs_update_inode_block_group(trans, dir);
7878 - err = btrfs_update_inode(trans, root, inode);
7884 + btrfs_update_inode_block_group(trans, dir);
7885 + err = btrfs_update_inode(trans, root, inode);
7887 + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
7890 nr = trans->blocks_used;
7892 - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
7893 btrfs_end_transaction_throttle(trans, root);
7895 + btrfs_unreserve_metadata_space(root, 3);
7897 inode_dec_link_count(inode);
7899 @@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
7901 unsigned long nr = 1;
7903 - err = btrfs_check_metadata_free_space(root);
7905 + * 2 items for inode and ref
7906 + * 2 items for dir items
7907 + * 1 for xattr if selinux is on
7909 + err = btrfs_reserve_metadata_space(root, 5);
7914 trans = btrfs_start_transaction(root, 1);
7915 - btrfs_set_trans_block_group(trans, dir);
7917 - if (IS_ERR(trans)) {
7918 - err = PTR_ERR(trans);
7923 + btrfs_set_trans_block_group(trans, dir);
7925 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
7927 @@ -3967,6 +4400,7 @@ out_fail:
7928 btrfs_end_transaction_throttle(trans, root);
7931 + btrfs_unreserve_metadata_space(root, 5);
7934 btrfs_btree_balance_dirty(root, nr);
7935 @@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
7939 - spin_lock(&em_tree->lock);
7940 + read_lock(&em_tree->lock);
7941 em = lookup_extent_mapping(em_tree, start, len);
7943 em->bdev = root->fs_info->fs_devices->latest_bdev;
7944 - spin_unlock(&em_tree->lock);
7945 + read_unlock(&em_tree->lock);
7948 if (em->start > start || em->start + em->len <= start)
7949 @@ -4215,6 +4649,11 @@ again:
7951 read_extent_buffer(leaf, map + pg_offset, ptr,
7953 + if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
7954 + memset(map + pg_offset + copy_size, 0,
7955 + PAGE_CACHE_SIZE - pg_offset -
7960 flush_dcache_page(page);
7961 @@ -4259,7 +4698,7 @@ insert:
7965 - spin_lock(&em_tree->lock);
7966 + write_lock(&em_tree->lock);
7967 ret = add_extent_mapping(em_tree, em);
7968 /* it is possible that someone inserted the extent into the tree
7969 * while we had the lock dropped. It is also possible that
7970 @@ -4299,7 +4738,7 @@ insert:
7974 - spin_unlock(&em_tree->lock);
7975 + write_unlock(&em_tree->lock);
7978 btrfs_free_path(path);
7979 @@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
7980 u64 page_start = page_offset(page);
7981 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7985 + * we have the page locked, so new writeback can't start,
7986 + * and the dirty bit won't be cleared while we are here.
7988 + * Wait for IO on this page so that we can safely clear
7989 + * the PagePrivate2 bit and do ordered accounting
7991 wait_on_page_writeback(page);
7993 tree = &BTRFS_I(page->mapping->host)->io_tree;
7995 btrfs_releasepage(page, GFP_NOFS);
7999 lock_extent(tree, page_start, page_end, GFP_NOFS);
8000 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
8002 @@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
8004 clear_extent_bit(tree, page_start, page_end,
8005 EXTENT_DIRTY | EXTENT_DELALLOC |
8006 - EXTENT_LOCKED, 1, 0, GFP_NOFS);
8007 - btrfs_finish_ordered_io(page->mapping->host,
8008 - page_start, page_end);
8009 + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
8012 + * whoever cleared the private bit is responsible
8013 + * for the finish_ordered_io
8015 + if (TestClearPagePrivate2(page)) {
8016 + btrfs_finish_ordered_io(page->mapping->host,
8017 + page_start, page_end);
8019 btrfs_put_ordered_extent(ordered);
8020 lock_extent(tree, page_start, page_end, GFP_NOFS);
8022 clear_extent_bit(tree, page_start, page_end,
8023 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
8026 + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
8027 __btrfs_releasepage(page, GFP_NOFS);
8029 ClearPageChecked(page);
8030 @@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8034 + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
8036 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
8037 + ret = VM_FAULT_SIGBUS;
8041 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8044 @@ -4504,7 +4964,24 @@ again:
8048 - btrfs_set_extent_delalloc(inode, page_start, page_end);
8050 + * XXX - page_mkwrite gets called every time the page is dirtied, even
8051 + * if it was already dirty, so for space accounting reasons we need to
8052 + * clear any delalloc bits for the range we are fixing to save. There
8053 + * is probably a better way to do this, but for now keep consistent with
8054 + * prepare_pages in the normal write path.
8056 + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
8057 + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
8060 + ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
8062 + unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8063 + ret = VM_FAULT_SIGBUS;
8064 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
8069 /* page is wholly or partially inside EOF */
8070 @@ -4521,11 +4998,17 @@ again:
8072 ClearPageChecked(page);
8073 set_page_dirty(page);
8074 + SetPageUptodate(page);
8076 + BTRFS_I(inode)->last_trans = root->fs_info->generation;
8077 + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
8079 - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
8080 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8083 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
8085 + return VM_FAULT_LOCKED;
8089 @@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode)
8090 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
8093 - btrfs_truncate_page(inode->i_mapping, inode->i_size);
8094 + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
8097 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
8099 trans = btrfs_start_transaction(root, 1);
8100 @@ -4594,11 +5079,11 @@ out:
8101 * create a new subvolume directory/inode (helper for the ioctl).
8103 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8104 - struct btrfs_root *new_root, struct dentry *dentry,
8105 + struct btrfs_root *new_root,
8106 u64 new_dirid, u64 alloc_hint)
8108 struct inode *inode;
8113 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
8114 @@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8116 btrfs_i_size_write(inode, 0);
8118 - error = btrfs_update_inode(trans, new_root, inode);
8121 + err = btrfs_update_inode(trans, new_root, inode);
8124 - d_instantiate(dentry, inode);
8129 @@ -4640,7 +5124,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8133 + ei->last_sub_trans = 0;
8134 ei->logged_trans = 0;
8135 + ei->outstanding_extents = 0;
8136 + ei->reserved_extents = 0;
8138 + spin_lock_init(&ei->accounting_lock);
8139 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8140 INIT_LIST_HEAD(&ei->i_orphan);
8141 INIT_LIST_HEAD(&ei->ordered_operations);
8142 @@ -4656,6 +5145,14 @@ void btrfs_destroy_inode(struct inode *inode)
8143 WARN_ON(inode->i_data.nrpages);
8146 + * This can happen where we create an inode, but somebody else also
8147 + * created the same inode and we need to destroy the one we already
8154 * Make sure we're properly removed from the ordered operation
8157 @@ -4690,9 +5187,20 @@ void btrfs_destroy_inode(struct inode *inode)
8159 inode_tree_del(inode);
8160 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8162 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8165 +void btrfs_drop_inode(struct inode *inode)
8167 + struct btrfs_root *root = BTRFS_I(inode)->root;
8169 + if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
8170 + generic_delete_inode(inode);
8172 + generic_drop_inode(inode);
8175 static void init_once(void *foo)
8177 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8178 @@ -4761,31 +5269,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8180 struct btrfs_trans_handle *trans;
8181 struct btrfs_root *root = BTRFS_I(old_dir)->root;
8182 + struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8183 struct inode *new_inode = new_dentry->d_inode;
8184 struct inode *old_inode = old_dentry->d_inode;
8185 struct timespec ctime = CURRENT_TIME;
8187 + u64 root_objectid;
8190 - /* we're not allowed to rename between subvolumes */
8191 - if (BTRFS_I(old_inode)->root->root_key.objectid !=
8192 - BTRFS_I(new_dir)->root->root_key.objectid)
8193 + if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8196 + /* we only allow rename subvolume link between subvolumes */
8197 + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8200 + if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8201 + (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
8202 + return -ENOTEMPTY;
8204 if (S_ISDIR(old_inode->i_mode) && new_inode &&
8205 - new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
8206 + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8210 - /* to rename a snapshot or subvolume, we need to juggle the
8211 - * backrefs. This isn't coded yet
8213 + * 2 items for dir items
8214 + * 1 item for orphan entry
8217 - if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
8220 - ret = btrfs_check_metadata_free_space(root);
8221 + ret = btrfs_reserve_metadata_space(root, 4);
8227 * we're using rename to replace one file with another.
8228 @@ -4796,8 +5310,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8229 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8230 filemap_flush(old_inode->i_mapping);
8232 + /* close the racy window with snapshot create/destroy ioctl */
8233 + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
8234 + down_read(&root->fs_info->subvol_sem);
8236 trans = btrfs_start_transaction(root, 1);
8237 + btrfs_set_trans_block_group(trans, new_dir);
8240 + btrfs_record_root_in_trans(trans, dest);
8242 + ret = btrfs_set_inode_index(new_dir, &index);
8246 + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8247 + /* force full log commit if subvolume involved. */
8248 + root->fs_info->last_trans_log_full_commit = trans->transid;
8250 + ret = btrfs_insert_inode_ref(trans, dest,
8251 + new_dentry->d_name.name,
8252 + new_dentry->d_name.len,
8254 + new_dir->i_ino, index);
8258 + * this is an ugly little race, but the rename is required
8259 + * to make sure that if we crash, the inode is either at the
8260 + * old name or the new one. pinning the log transaction lets
8261 + * us make sure we don't allow a log commit to come in after
8262 + * we unlink the name but before we add the new name back in.
8264 + btrfs_pin_log_trans(root);
8267 * make sure the inode gets flushed if it is replacing
8269 @@ -4807,18 +5353,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8270 btrfs_add_ordered_operation(trans, root, old_inode);
8274 - * this is an ugly little race, but the rename is required to make
8275 - * sure that if we crash, the inode is either at the old name
8276 - * or the new one. pinning the log transaction lets us make sure
8277 - * we don't allow a log commit to come in after we unlink the
8278 - * name but before we add the new name back in.
8280 - btrfs_pin_log_trans(root);
8282 - btrfs_set_trans_block_group(trans, new_dir);
8284 - btrfs_inc_nlink(old_dentry->d_inode);
8285 old_dir->i_ctime = old_dir->i_mtime = ctime;
8286 new_dir->i_ctime = new_dir->i_mtime = ctime;
8287 old_inode->i_ctime = ctime;
8288 @@ -4826,47 +5360,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8289 if (old_dentry->d_parent != new_dentry->d_parent)
8290 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8292 - ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
8293 - old_dentry->d_name.name,
8294 - old_dentry->d_name.len);
8297 + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8298 + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8299 + ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8300 + old_dentry->d_name.name,
8301 + old_dentry->d_name.len);
8303 + btrfs_inc_nlink(old_dentry->d_inode);
8304 + ret = btrfs_unlink_inode(trans, root, old_dir,
8305 + old_dentry->d_inode,
8306 + old_dentry->d_name.name,
8307 + old_dentry->d_name.len);
8312 new_inode->i_ctime = CURRENT_TIME;
8313 - ret = btrfs_unlink_inode(trans, root, new_dir,
8314 - new_dentry->d_inode,
8315 - new_dentry->d_name.name,
8316 - new_dentry->d_name.len);
8319 + if (unlikely(new_inode->i_ino ==
8320 + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8321 + root_objectid = BTRFS_I(new_inode)->location.objectid;
8322 + ret = btrfs_unlink_subvol(trans, dest, new_dir,
8324 + new_dentry->d_name.name,
8325 + new_dentry->d_name.len);
8326 + BUG_ON(new_inode->i_nlink == 0);
8328 + ret = btrfs_unlink_inode(trans, dest, new_dir,
8329 + new_dentry->d_inode,
8330 + new_dentry->d_name.name,
8331 + new_dentry->d_name.len);
8334 if (new_inode->i_nlink == 0) {
8335 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8342 - ret = btrfs_set_inode_index(new_dir, &index);
8346 - ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
8347 - old_inode, new_dentry->d_name.name,
8348 - new_dentry->d_name.len, 1, index);
8351 + ret = btrfs_add_link(trans, new_dir, old_inode,
8352 + new_dentry->d_name.name,
8353 + new_dentry->d_name.len, 0, index);
8356 - btrfs_log_new_name(trans, old_inode, old_dir,
8357 - new_dentry->d_parent);
8358 + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
8359 + btrfs_log_new_name(trans, old_inode, old_dir,
8360 + new_dentry->d_parent);
8361 + btrfs_end_log_trans(root);
8365 - /* this btrfs_end_log_trans just allows the current
8366 - * log-sub transaction to complete
8368 - btrfs_end_log_trans(root);
8369 btrfs_end_transaction_throttle(trans, root);
8372 + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
8373 + up_read(&root->fs_info->subvol_sem);
8375 + btrfs_unreserve_metadata_space(root, 4);
8379 @@ -4938,11 +5485,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8380 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8381 return -ENAMETOOLONG;
8383 - err = btrfs_check_metadata_free_space(root);
8385 + * 2 items for inode item and ref
8386 + * 2 items for dir items
8387 + * 1 item for xattr if selinux is on
8389 + err = btrfs_reserve_metadata_space(root, 5);
8394 trans = btrfs_start_transaction(root, 1);
8397 btrfs_set_trans_block_group(trans, dir);
8399 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
8400 @@ -5023,6 +5577,7 @@ out_unlock:
8401 nr = trans->blocks_used;
8402 btrfs_end_transaction_throttle(trans, root);
8404 + btrfs_unreserve_metadata_space(root, 5);
8406 inode_dec_link_count(inode);
8408 @@ -5044,6 +5599,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
8410 while (num_bytes > 0) {
8411 alloc_size = min(num_bytes, root->fs_info->max_extent);
8413 + ret = btrfs_reserve_metadata_space(root, 1);
8417 ret = btrfs_reserve_extent(trans, root, alloc_size,
8418 root->sectorsize, 0, alloc_hint,
8420 @@ -5058,9 +5618,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
8422 BTRFS_FILE_EXTENT_PREALLOC);
8424 + btrfs_drop_extent_cache(inode, cur_offset,
8425 + cur_offset + ins.offset -1, 0);
8426 num_bytes -= ins.offset;
8427 cur_offset += ins.offset;
8428 alloc_hint = ins.objectid + ins.offset;
8429 + btrfs_unreserve_metadata_space(root, 1);
8432 if (cur_offset > start) {
8433 @@ -5223,6 +5786,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
8434 .lookup = btrfs_lookup,
8435 .permission = btrfs_permission,
8438 static struct file_operations btrfs_dir_file_operations = {
8439 .llseek = generic_file_llseek,
8440 .read = generic_read_dir,
8441 @@ -5245,6 +5809,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
8442 .readpage_io_failed_hook = btrfs_io_failed_hook,
8443 .set_bit_hook = btrfs_set_bit_hook,
8444 .clear_bit_hook = btrfs_clear_bit_hook,
8445 + .merge_extent_hook = btrfs_merge_extent_hook,
8446 + .split_extent_hook = btrfs_split_extent_hook,
8450 @@ -5309,3 +5875,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
8451 .listxattr = btrfs_listxattr,
8452 .removexattr = btrfs_removexattr,
8455 +const struct dentry_operations btrfs_dentry_operations = {
8456 + .d_delete = btrfs_dentry_delete,
8458 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
8459 index bd88f25..cdbb054 100644
8460 --- a/fs/btrfs/ioctl.c
8461 +++ b/fs/btrfs/ioctl.c
8462 @@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
8463 struct btrfs_root_item root_item;
8464 struct btrfs_inode_item *inode_item;
8465 struct extent_buffer *leaf;
8466 - struct btrfs_root *new_root = root;
8467 - struct inode *dir;
8468 + struct btrfs_root *new_root;
8469 + struct inode *dir = dentry->d_parent->d_inode;
8473 @@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
8475 unsigned long nr = 1;
8477 - ret = btrfs_check_metadata_free_space(root);
8484 + ret = btrfs_reserve_metadata_space(root, 6);
8489 trans = btrfs_start_transaction(root, 1);
8491 @@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
8495 + key.offset = (u64)-1;
8496 + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
8497 + BUG_ON(IS_ERR(new_root));
8499 + btrfs_record_root_in_trans(trans, new_root);
8501 + ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
8502 + BTRFS_I(dir)->block_group);
8504 * insert the directory item
8506 - key.offset = (u64)-1;
8507 - dir = dentry->d_parent->d_inode;
8508 ret = btrfs_set_inode_index(dir, &index);
8511 @@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
8512 ret = btrfs_update_inode(trans, root, dir);
8515 - /* add the backref first */
8516 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
8517 - objectid, BTRFS_ROOT_BACKREF_KEY,
8518 - root->root_key.objectid,
8519 + objectid, root->root_key.objectid,
8520 dir->i_ino, index, name, namelen);
8524 - /* now add the forward ref */
8525 - ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
8526 - root->root_key.objectid, BTRFS_ROOT_REF_KEY,
8528 - dir->i_ino, index, name, namelen);
8532 - ret = btrfs_commit_transaction(trans, root);
8536 - new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
8537 - BUG_ON(!new_root);
8539 - trans = btrfs_start_transaction(new_root, 1);
8542 - ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
8543 - BTRFS_I(dir)->block_group);
8547 + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
8549 nr = trans->blocks_used;
8550 - err = btrfs_commit_transaction(trans, new_root);
8551 + err = btrfs_commit_transaction(trans, root);
8556 + btrfs_unreserve_metadata_space(root, 6);
8557 btrfs_btree_balance_dirty(root, nr);
8560 @@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
8561 if (!root->ref_cows)
8564 - ret = btrfs_check_metadata_free_space(root);
8571 + ret = btrfs_reserve_metadata_space(root, 6);
8575 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
8576 if (!pending_snapshot) {
8578 + btrfs_unreserve_metadata_space(root, 6);
8581 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
8582 if (!pending_snapshot->name) {
8584 kfree(pending_snapshot);
8585 + btrfs_unreserve_metadata_space(root, 6);
8588 memcpy(pending_snapshot->name, name, namelen);
8589 @@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
8590 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
8591 * inside this filesystem so it's quite a bit simpler.
8593 -static noinline int btrfs_mksubvol(struct path *parent, char *name,
8594 - int mode, int namelen,
8595 +static noinline int btrfs_mksubvol(struct path *parent,
8596 + char *name, int namelen,
8597 struct btrfs_root *snap_src)
8599 + struct inode *dir = parent->dentry->d_inode;
8600 struct dentry *dentry;
8603 - mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
8604 + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
8606 dentry = lookup_one_len(name, parent->dentry, namelen);
8607 error = PTR_ERR(dentry);
8608 @@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
8609 if (dentry->d_inode)
8612 - if (!IS_POSIXACL(parent->dentry->d_inode))
8613 - mode &= ~current_umask();
8615 error = mnt_want_write(parent->mnt);
8619 - error = btrfs_may_create(parent->dentry->d_inode, dentry);
8620 + error = btrfs_may_create(dir, dentry);
8622 goto out_drop_write;
8625 - * Actually perform the low-level subvolume creation after all
8628 - * Eventually we want to pass in an inode under which we create this
8629 - * subvolume, but for now all are under the filesystem root.
8631 - * Also we should pass on the mode eventually to allow creating new
8632 - * subvolume with specific mode bits.
8634 + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
8636 + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
8640 - struct dentry *dir = dentry->d_parent;
8641 - struct dentry *test = dir->d_parent;
8642 - struct btrfs_path *path = btrfs_alloc_path();
8645 - u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
8647 - test_oid = snap_src->root_key.objectid;
8649 - ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
8650 - path, parent_oid, test_oid);
8653 - btrfs_release_path(snap_src->fs_info->tree_root, path);
8655 - /* we need to make sure we aren't creating a directory loop
8656 - * by taking a snapshot of something that has our current
8657 - * subvol in its directory tree. So, this loops through
8658 - * the dentries and checks the forward refs for each subvolume
8659 - * to see if is references the subvolume where we are
8660 - * placing this new snapshot.
8664 - dir == snap_src->fs_info->sb->s_root ||
8665 - test == snap_src->fs_info->sb->s_root ||
8666 - test->d_inode->i_sb != snap_src->fs_info->sb) {
8669 - if (S_ISLNK(test->d_inode->i_mode)) {
8670 - printk(KERN_INFO "Btrfs symlink in snapshot "
8671 - "path, failed\n");
8673 - btrfs_free_path(path);
8674 - goto out_drop_write;
8677 - BTRFS_I(test->d_inode)->root->root_key.objectid;
8678 - ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
8679 - path, test_oid, parent_oid);
8681 - printk(KERN_INFO "Btrfs snapshot creation "
8682 - "failed, looping\n");
8684 - btrfs_free_path(path);
8685 - goto out_drop_write;
8687 - btrfs_release_path(snap_src->fs_info->tree_root, path);
8688 - test = test->d_parent;
8691 - btrfs_free_path(path);
8692 - error = create_snapshot(snap_src, dentry, name, namelen);
8693 + error = create_snapshot(snap_src, dentry,
8696 - error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
8697 - dentry, name, namelen);
8698 + error = create_subvol(BTRFS_I(dir)->root, dentry,
8702 - goto out_drop_write;
8704 - fsnotify_mkdir(parent->dentry->d_inode, dentry);
8706 + fsnotify_mkdir(dir, dentry);
8708 + up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
8710 mnt_drop_write(parent->mnt);
8714 - mutex_unlock(&parent->dentry->d_inode->i_mutex);
8715 + mutex_unlock(&dir->i_mutex);
8720 static int btrfs_defrag_file(struct file *file)
8722 struct inode *inode = fdentry(file)->d_inode;
8723 @@ -596,9 +534,8 @@ again:
8724 clear_page_dirty_for_io(page);
8726 btrfs_set_extent_delalloc(inode, page_start, page_end);
8728 - unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8729 set_page_dirty(page);
8730 + unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8732 page_cache_release(page);
8733 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
8734 @@ -609,7 +546,8 @@ out_unlock:
8738 -static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
8739 +static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
8744 @@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
8746 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
8747 struct btrfs_ioctl_vol_args *vol_args;
8748 - struct btrfs_dir_item *di;
8749 - struct btrfs_path *path;
8750 struct file *src_file;
8755 @@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
8759 - path = btrfs_alloc_path();
8765 - root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
8766 - di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
8768 - vol_args->name, namelen, 0);
8769 - btrfs_free_path(path);
8771 - if (di && !IS_ERR(di)) {
8777 - ret = PTR_ERR(di);
8782 - ret = btrfs_mksubvol(&file->f_path, vol_args->name,
8783 - file->f_path.dentry->d_inode->i_mode,
8785 + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
8788 struct inode *src_inode;
8789 src_file = fget(vol_args->fd);
8790 @@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
8794 - ret = btrfs_mksubvol(&file->f_path, vol_args->name,
8795 - file->f_path.dentry->d_inode->i_mode,
8796 - namelen, BTRFS_I(src_inode)->root);
8797 + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
8798 + BTRFS_I(src_inode)->root);
8808 + * helper to check if the subvolume references other subvolumes
8810 +static noinline int may_destroy_subvol(struct btrfs_root *root)
8812 + struct btrfs_path *path;
8813 + struct btrfs_key key;
8816 + path = btrfs_alloc_path();
8820 + key.objectid = root->root_key.objectid;
8821 + key.type = BTRFS_ROOT_REF_KEY;
8822 + key.offset = (u64)-1;
8824 + ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
8825 + &key, path, 0, 0);
8831 + if (path->slots[0] > 0) {
8833 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
8834 + if (key.objectid == root->root_key.objectid &&
8835 + key.type == BTRFS_ROOT_REF_KEY)
8839 + btrfs_free_path(path);
8843 +static noinline int btrfs_ioctl_snap_destroy(struct file *file,
8846 + struct dentry *parent = fdentry(file);
8847 + struct dentry *dentry;
8848 + struct inode *dir = parent->d_inode;
8849 + struct inode *inode;
8850 + struct btrfs_root *root = BTRFS_I(dir)->root;
8851 + struct btrfs_root *dest = NULL;
8852 + struct btrfs_ioctl_vol_args *vol_args;
8853 + struct btrfs_trans_handle *trans;
8858 + if (!capable(CAP_SYS_ADMIN))
8861 + vol_args = memdup_user(arg, sizeof(*vol_args));
8862 + if (IS_ERR(vol_args))
8863 + return PTR_ERR(vol_args);
8865 + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
8866 + namelen = strlen(vol_args->name);
8867 + if (strchr(vol_args->name, '/') ||
8868 + strncmp(vol_args->name, "..", namelen) == 0) {
8873 + err = mnt_want_write(file->f_path.mnt);
8877 + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
8878 + dentry = lookup_one_len(vol_args->name, parent, namelen);
8879 + if (IS_ERR(dentry)) {
8880 + err = PTR_ERR(dentry);
8881 + goto out_unlock_dir;
8884 + if (!dentry->d_inode) {
8889 + inode = dentry->d_inode;
8890 + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
8895 + dest = BTRFS_I(inode)->root;
8897 + mutex_lock(&inode->i_mutex);
8898 + err = d_invalidate(dentry);
8902 + down_write(&root->fs_info->subvol_sem);
8904 + err = may_destroy_subvol(dest);
8906 + goto out_up_write;
8908 + trans = btrfs_start_transaction(root, 1);
8909 + ret = btrfs_unlink_subvol(trans, root, dir,
8910 + dest->root_key.objectid,
8911 + dentry->d_name.name,
8912 + dentry->d_name.len);
8915 + btrfs_record_root_in_trans(trans, dest);
8917 + memset(&dest->root_item.drop_progress, 0,
8918 + sizeof(dest->root_item.drop_progress));
8919 + dest->root_item.drop_level = 0;
8920 + btrfs_set_root_refs(&dest->root_item, 0);
8922 + ret = btrfs_insert_orphan_item(trans,
8923 + root->fs_info->tree_root,
8924 + dest->root_key.objectid);
8927 + ret = btrfs_commit_transaction(trans, root);
8929 + inode->i_flags |= S_DEAD;
8931 + up_write(&root->fs_info->subvol_sem);
8933 + mutex_unlock(&inode->i_mutex);
8935 + shrink_dcache_sb(root->fs_info->sb);
8936 + btrfs_invalidate_inodes(dest);
8942 + mutex_unlock(&dir->i_mutex);
8943 + mnt_drop_write(file->f_path.mnt);
8949 static int btrfs_ioctl_defrag(struct file *file)
8951 struct inode *inode = fdentry(file)->d_inode;
8952 @@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
8956 -static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8957 - u64 off, u64 olen, u64 destoff)
8958 +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8959 + u64 off, u64 olen, u64 destoff)
8961 struct inode *inode = fdentry(file)->d_inode;
8962 struct btrfs_root *root = BTRFS_I(inode)->root;
8963 @@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8965 /* punch hole in destination first */
8966 btrfs_drop_extents(trans, root, inode, off, off + len,
8967 - off + len, 0, &hint_byte);
8968 + off + len, 0, &hint_byte, 1);
8971 key.objectid = src->i_ino;
8972 @@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8973 datao += off - key.offset;
8974 datal -= off - key.offset;
8976 - if (key.offset + datao + datal + key.offset >
8978 - datal = off + len - key.offset - datao;
8980 + if (key.offset + datal > off + len)
8981 + datal = off + len - key.offset;
8983 /* disko == 0 means it's a hole */
8986 @@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
8987 struct inode *inode = fdentry(file)->d_inode;
8988 struct btrfs_root *root = BTRFS_I(inode)->root;
8989 struct btrfs_trans_handle *trans;
8994 if (!capable(CAP_SYS_ADMIN))
8998 - if (file->private_data) {
8999 - ret = -EINPROGRESS;
9000 + ret = -EINPROGRESS;
9001 + if (file->private_data)
9005 ret = mnt_want_write(file->f_path.mnt);
9007 @@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
9008 root->fs_info->open_ioctl_trans++;
9009 mutex_unlock(&root->fs_info->trans_mutex);
9012 trans = btrfs_start_ioctl_transaction(root, 0);
9014 - file->private_data = trans;
9017 - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
9021 + file->private_data = trans;
9025 + mutex_lock(&root->fs_info->trans_mutex);
9026 + root->fs_info->open_ioctl_trans--;
9027 + mutex_unlock(&root->fs_info->trans_mutex);
9028 + mnt_drop_write(file->f_path.mnt);
9032 @@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
9033 struct inode *inode = fdentry(file)->d_inode;
9034 struct btrfs_root *root = BTRFS_I(inode)->root;
9035 struct btrfs_trans_handle *trans;
9038 trans = file->private_data;
9043 - btrfs_end_transaction(trans, root);
9046 file->private_data = NULL;
9048 + btrfs_end_transaction(trans, root);
9050 mutex_lock(&root->fs_info->trans_mutex);
9051 root->fs_info->open_ioctl_trans--;
9052 mutex_unlock(&root->fs_info->trans_mutex);
9054 mnt_drop_write(file->f_path.mnt);
9061 long btrfs_ioctl(struct file *file, unsigned int
9062 @@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int
9063 return btrfs_ioctl_snap_create(file, argp, 0);
9064 case BTRFS_IOC_SUBVOL_CREATE:
9065 return btrfs_ioctl_snap_create(file, argp, 1);
9066 + case BTRFS_IOC_SNAP_DESTROY:
9067 + return btrfs_ioctl_snap_destroy(file, argp);
9068 case BTRFS_IOC_DEFRAG:
9069 return btrfs_ioctl_defrag(file);
9070 case BTRFS_IOC_RESIZE:
9071 diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
9072 index b320b10..bc49914 100644
9073 --- a/fs/btrfs/ioctl.h
9074 +++ b/fs/btrfs/ioctl.h
9075 @@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
9077 #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
9078 struct btrfs_ioctl_vol_args)
9080 +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
9081 + struct btrfs_ioctl_vol_args)
9083 diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
9084 index d6f0806..ab21c29 100644
9085 --- a/fs/btrfs/ordered-data.c
9086 +++ b/fs/btrfs/ordered-data.c
9087 @@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
9089 * len is the length of the extent
9091 - * This also sets the EXTENT_ORDERED bit on the range in the inode.
9093 * The tree is given a single reference on the ordered extent that was
9096 @@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
9097 entry->start = start;
9099 entry->disk_len = disk_len;
9100 + entry->bytes_left = len;
9101 entry->inode = inode;
9102 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
9103 set_bit(type, &entry->flags);
9104 @@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
9108 - set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
9109 - entry_end(entry) - 1, GFP_NOFS);
9111 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
9112 list_add_tail(&entry->root_extent_list,
9113 &BTRFS_I(inode)->root->fs_info->ordered_extents);
9114 @@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
9115 struct btrfs_ordered_inode_tree *tree;
9116 struct rb_node *node;
9117 struct btrfs_ordered_extent *entry;
9118 - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9121 tree = &BTRFS_I(inode)->ordered_tree;
9122 mutex_lock(&tree->mutex);
9123 - clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
9125 node = tree_search(tree, file_offset);
9128 @@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
9132 - ret = test_range_bit(io_tree, entry->file_offset,
9133 - entry->file_offset + entry->len - 1,
9134 - EXTENT_ORDERED, 0);
9136 + if (io_size > entry->bytes_left) {
9137 + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
9138 + (unsigned long long)entry->bytes_left,
9139 + (unsigned long long)io_size);
9141 + entry->bytes_left -= io_size;
9142 + if (entry->bytes_left == 0)
9143 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
9147 mutex_unlock(&tree->mutex);
9149 @@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
9151 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
9153 + spin_lock(&BTRFS_I(inode)->accounting_lock);
9154 + BTRFS_I(inode)->outstanding_extents--;
9155 + spin_unlock(&BTRFS_I(inode)->accounting_lock);
9156 + btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
9159 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
9160 list_del_init(&entry->root_extent_list);
9162 @@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
9165 struct btrfs_ordered_extent *ordered;
9168 if (start + len < start) {
9169 orig_end = INT_LIMIT(loff_t);
9170 @@ -502,6 +507,7 @@ again:
9171 orig_end >> PAGE_CACHE_SHIFT);
9176 ordered = btrfs_lookup_first_ordered_extent(inode, end);
9178 @@ -514,6 +520,7 @@ again:
9179 btrfs_put_ordered_extent(ordered);
9183 btrfs_start_ordered_extent(inode, ordered, 1);
9184 end = ordered->file_offset;
9185 btrfs_put_ordered_extent(ordered);
9186 @@ -521,8 +528,8 @@ again:
9190 - if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
9191 - EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
9192 + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
9193 + EXTENT_DELALLOC, 0, NULL)) {
9194 schedule_timeout(1);
9197 @@ -613,7 +620,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
9199 if (test_range_bit(io_tree, disk_i_size,
9200 ordered->file_offset + ordered->len - 1,
9201 - EXTENT_DELALLOC, 0)) {
9202 + EXTENT_DELALLOC, 0, NULL)) {
9206 @@ -664,7 +671,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
9208 if (i_size_test > entry_end(ordered) &&
9209 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
9210 - EXTENT_DELALLOC, 0)) {
9211 + EXTENT_DELALLOC, 0, NULL)) {
9212 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
9214 BTRFS_I(inode)->disk_i_size = new_i_size;
9215 diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
9216 index 3d31c88..993a7ea 100644
9217 --- a/fs/btrfs/ordered-data.h
9218 +++ b/fs/btrfs/ordered-data.h
9219 @@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
9220 /* extent length on disk */
9223 + /* number of bytes that still need writing */
9226 /* flags (described above) */
9227 unsigned long flags;
9229 diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
9230 index 3c0d52a..79cba5f 100644
9231 --- a/fs/btrfs/orphan.c
9232 +++ b/fs/btrfs/orphan.c
9233 @@ -65,3 +65,23 @@ out:
9234 btrfs_free_path(path);
9238 +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
9240 + struct btrfs_path *path;
9241 + struct btrfs_key key;
9244 + key.objectid = BTRFS_ORPHAN_OBJECTID;
9245 + key.type = BTRFS_ORPHAN_ITEM_KEY;
9246 + key.offset = offset;
9248 + path = btrfs_alloc_path();
9252 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9254 + btrfs_free_path(path);
9257 diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
9258 index c04f7f2..cfcc93c 100644
9259 --- a/fs/btrfs/relocation.c
9260 +++ b/fs/btrfs/relocation.c
9261 @@ -121,6 +121,15 @@ struct inodevec {
9265 +#define MAX_EXTENTS 128
9267 +struct file_extent_cluster {
9270 + u64 boundary[MAX_EXTENTS];
9274 struct reloc_control {
9275 /* block group to relocate */
9276 struct btrfs_block_group_cache *block_group;
9277 @@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
9278 struct reloc_control *rc)
9280 if (test_range_bit(&rc->processed_blocks, bytenr,
9281 - bytenr + blocksize - 1, EXTENT_DIRTY, 1))
9282 + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
9286 @@ -2529,56 +2538,94 @@ out:
9289 static noinline_for_stack
9290 -int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
9291 +int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
9294 + struct btrfs_root *root = BTRFS_I(inode)->root;
9295 + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
9296 + struct extent_map *em;
9299 + em = alloc_extent_map(GFP_NOFS);
9303 + em->start = start;
9304 + em->len = end + 1 - start;
9305 + em->block_len = em->len;
9306 + em->block_start = block_start;
9307 + em->bdev = root->fs_info->fs_devices->latest_bdev;
9308 + set_bit(EXTENT_FLAG_PINNED, &em->flags);
9310 + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9312 + write_lock(&em_tree->lock);
9313 + ret = add_extent_mapping(em_tree, em);
9314 + write_unlock(&em_tree->lock);
9315 + if (ret != -EEXIST) {
9316 + free_extent_map(em);
9319 + btrfs_drop_extent_cache(inode, start, end, 0);
9321 + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9325 +static int relocate_file_extent_cluster(struct inode *inode,
9326 + struct file_extent_cluster *cluster)
9331 - unsigned long first_index;
9332 + u64 offset = BTRFS_I(inode)->index_cnt;
9333 + unsigned long index;
9334 unsigned long last_index;
9335 - unsigned int total_read = 0;
9336 - unsigned int total_dirty = 0;
9337 + unsigned int dirty_page = 0;
9339 struct file_ra_state *ra;
9340 - struct btrfs_ordered_extent *ordered;
9341 - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9348 ra = kzalloc(sizeof(*ra), GFP_NOFS);
9352 + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
9353 + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
9355 mutex_lock(&inode->i_mutex);
9356 - first_index = start >> PAGE_CACHE_SHIFT;
9357 - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
9359 - /* make sure the dirty trick played by the caller work */
9361 - ret = invalidate_inode_pages2_range(inode->i_mapping,
9362 - first_index, last_index);
9363 - if (ret != -EBUSY)
9365 - schedule_timeout(HZ/10);
9367 + i_size_write(inode, cluster->end + 1 - offset);
9368 + ret = setup_extent_mapping(inode, cluster->start - offset,
9369 + cluster->end - offset, cluster->start);
9373 file_ra_state_init(ra, inode->i_mapping);
9375 - for (i = first_index ; i <= last_index; i++) {
9376 - if (total_read % ra->ra_pages == 0) {
9377 - btrfs_force_ra(inode->i_mapping, ra, NULL, i,
9378 - min(last_index, ra->ra_pages + i - 1));
9382 - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
9384 - page = grab_cache_page(inode->i_mapping, i);
9385 + WARN_ON(cluster->start != cluster->boundary[0]);
9386 + while (index <= last_index) {
9387 + page = find_lock_page(inode->i_mapping, index);
9391 + page_cache_sync_readahead(inode->i_mapping,
9393 + last_index + 1 - index);
9394 + page = grab_cache_page(inode->i_mapping, index);
9401 + if (PageReadahead(page)) {
9402 + page_cache_async_readahead(inode->i_mapping,
9403 + ra, NULL, page, index,
9404 + last_index + 1 - index);
9407 if (!PageUptodate(page)) {
9408 btrfs_readpage(NULL, page);
9410 @@ -2589,75 +2636,79 @@ again:
9414 - wait_on_page_writeback(page);
9416 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
9417 page_end = page_start + PAGE_CACHE_SIZE - 1;
9418 - lock_extent(io_tree, page_start, page_end, GFP_NOFS);
9420 - ordered = btrfs_lookup_ordered_extent(inode, page_start);
9422 - unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
9423 - unlock_page(page);
9424 - page_cache_release(page);
9425 - btrfs_start_ordered_extent(inode, ordered, 1);
9426 - btrfs_put_ordered_extent(ordered);
9430 + lock_extent(&BTRFS_I(inode)->io_tree,
9431 + page_start, page_end, GFP_NOFS);
9433 set_page_extent_mapped(page);
9435 - if (i == first_index)
9436 - set_extent_bits(io_tree, page_start, page_end,
9437 + if (nr < cluster->nr &&
9438 + page_start + offset == cluster->boundary[nr]) {
9439 + set_extent_bits(&BTRFS_I(inode)->io_tree,
9440 + page_start, page_end,
9441 EXTENT_BOUNDARY, GFP_NOFS);
9444 btrfs_set_extent_delalloc(inode, page_start, page_end);
9446 set_page_dirty(page);
9450 - unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
9451 + unlock_extent(&BTRFS_I(inode)->io_tree,
9452 + page_start, page_end, GFP_NOFS);
9454 page_cache_release(page);
9457 + if (nr < cluster->nr &&
9458 + page_end + 1 + offset == cluster->boundary[nr]) {
9459 + balance_dirty_pages_ratelimited_nr(inode->i_mapping,
9465 + balance_dirty_pages_ratelimited_nr(inode->i_mapping,
9468 + WARN_ON(nr != cluster->nr);
9470 mutex_unlock(&inode->i_mutex);
9472 - balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
9476 static noinline_for_stack
9477 -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
9478 +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
9479 + struct file_extent_cluster *cluster)
9481 - struct btrfs_root *root = BTRFS_I(inode)->root;
9482 - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
9483 - struct extent_map *em;
9484 - u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
9485 - u64 end = start + extent_key->offset - 1;
9487 - em = alloc_extent_map(GFP_NOFS);
9488 - em->start = start;
9489 - em->len = extent_key->offset;
9490 - em->block_len = extent_key->offset;
9491 - em->block_start = extent_key->objectid;
9492 - em->bdev = root->fs_info->fs_devices->latest_bdev;
9493 - set_bit(EXTENT_FLAG_PINNED, &em->flags);
9496 - /* setup extent map to cheat btrfs_readpage */
9497 - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9500 - spin_lock(&em_tree->lock);
9501 - ret = add_extent_mapping(em_tree, em);
9502 - spin_unlock(&em_tree->lock);
9503 - if (ret != -EEXIST) {
9504 - free_extent_map(em);
9507 - btrfs_drop_extent_cache(inode, start, end, 0);
9508 + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
9509 + ret = relocate_file_extent_cluster(inode, cluster);
9514 - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9516 - return relocate_inode_pages(inode, start, extent_key->offset);
9518 + cluster->start = extent_key->objectid;
9520 + BUG_ON(cluster->nr >= MAX_EXTENTS);
9521 + cluster->end = extent_key->objectid + extent_key->offset - 1;
9522 + cluster->boundary[cluster->nr] = extent_key->objectid;
9525 + if (cluster->nr >= MAX_EXTENTS) {
9526 + ret = relocate_file_extent_cluster(inode, cluster);
9534 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
9535 @@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
9540 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9542 struct rb_root blocks = RB_ROOT;
9543 struct btrfs_key key;
9544 + struct file_extent_cluster *cluster;
9545 struct btrfs_trans_handle *trans = NULL;
9546 struct btrfs_path *path;
9547 struct btrfs_extent_item *ei;
9548 @@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9552 + cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
9556 path = btrfs_alloc_path();
9560 + rc->extents_found = 0;
9561 + rc->extents_skipped = 0;
9563 rc->search_start = rc->block_group->key.objectid;
9564 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
9566 @@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9569 nr = trans->blocks_used;
9570 - btrfs_end_transaction_throttle(trans, rc->extent_root);
9571 + btrfs_end_transaction(trans, rc->extent_root);
9573 btrfs_btree_balance_dirty(rc->extent_root, nr);
9575 if (rc->stage == MOVE_DATA_EXTENTS &&
9576 (flags & BTRFS_EXTENT_FLAG_DATA)) {
9577 rc->found_file_extent = 1;
9578 - ret = relocate_data_extent(rc->data_inode, &key);
9579 + ret = relocate_data_extent(rc->data_inode,
9584 @@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9585 btrfs_btree_balance_dirty(rc->extent_root, nr);
9589 + ret = relocate_file_extent_cluster(rc->data_inode, cluster);
9596 rc->create_reloc_root = 0;
9599 @@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9602 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
9603 - struct btrfs_root *root,
9604 - u64 objectid, u64 size)
9605 + struct btrfs_root *root, u64 objectid)
9607 struct btrfs_path *path;
9608 struct btrfs_inode_item *item;
9609 @@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
9610 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
9611 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
9612 btrfs_set_inode_generation(leaf, item, 1);
9613 - btrfs_set_inode_size(leaf, item, size);
9614 + btrfs_set_inode_size(leaf, item, 0);
9615 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
9616 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
9617 btrfs_mark_buffer_dirty(leaf);
9618 @@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
9622 - err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
9625 - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
9626 - group->key.offset, 0, group->key.offset,
9628 + err = __insert_orphan_inode(trans, root, objectid);
9631 key.objectid = objectid;
9632 @@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
9633 BUG_ON(!rc->block_group);
9635 btrfs_init_workers(&rc->workers, "relocate",
9636 - fs_info->thread_pool_size);
9637 + fs_info->thread_pool_size, NULL);
9639 rc->extent_root = extent_root;
9640 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
9641 @@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
9642 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
9645 - mutex_lock(&fs_info->cleaner_mutex);
9646 - btrfs_clean_old_snapshots(fs_info->tree_root);
9647 - mutex_unlock(&fs_info->cleaner_mutex);
9649 rc->extents_found = 0;
9650 rc->extents_skipped = 0;
9652 + mutex_lock(&fs_info->cleaner_mutex);
9654 + btrfs_clean_old_snapshots(fs_info->tree_root);
9655 ret = relocate_block_group(rc);
9657 + mutex_unlock(&fs_info->cleaner_mutex);
9661 @@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
9665 - filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
9666 - rc->block_group->key.objectid,
9667 - rc->block_group->key.objectid +
9668 - rc->block_group->key.offset - 1);
9669 + filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
9670 + rc->block_group->key.objectid,
9671 + rc->block_group->key.objectid +
9672 + rc->block_group->key.offset - 1);
9674 WARN_ON(rc->block_group->pinned > 0);
9675 WARN_ON(rc->block_group->reserved > 0);
9676 @@ -3530,6 +3594,26 @@ out:
9680 +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
9682 + struct btrfs_trans_handle *trans;
9685 + trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
9687 + memset(&root->root_item.drop_progress, 0,
9688 + sizeof(root->root_item.drop_progress));
9689 + root->root_item.drop_level = 0;
9690 + btrfs_set_root_refs(&root->root_item, 0);
9691 + ret = btrfs_update_root(trans, root->fs_info->tree_root,
9692 + &root->root_key, &root->root_item);
9695 + ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
9701 * recover relocation interrupted by system crash.
9703 @@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
9704 fs_root = read_fs_root(root->fs_info,
9705 reloc_root->root_key.offset);
9706 if (IS_ERR(fs_root)) {
9707 - err = PTR_ERR(fs_root);
9709 + ret = PTR_ERR(fs_root);
9710 + if (ret != -ENOENT) {
9714 + mark_garbage_root(reloc_root);
9718 @@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
9719 mapping_tree_init(&rc->reloc_root_tree);
9720 INIT_LIST_HEAD(&rc->reloc_roots);
9721 btrfs_init_workers(&rc->workers, "relocate",
9722 - root->fs_info->thread_pool_size);
9723 + root->fs_info->thread_pool_size, NULL);
9724 rc->extent_root = root->fs_info->extent_root;
9726 set_reloc_control(rc);
9727 diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
9728 index 0ddc6d6..9351428 100644
9729 --- a/fs/btrfs/root-tree.c
9730 +++ b/fs/btrfs/root-tree.c
9731 @@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
9735 + if (path->slots[0] == 0) {
9740 - BUG_ON(path->slots[0] == 0);
9741 slot = path->slots[0] - 1;
9742 btrfs_item_key_to_cpu(l, &found_key, slot);
9743 - if (found_key.objectid != objectid) {
9744 + if (found_key.objectid != objectid ||
9745 + found_key.type != BTRFS_ROOT_ITEM_KEY) {
9749 - read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
9751 - memcpy(key, &found_key, sizeof(found_key));
9753 + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
9756 + memcpy(key, &found_key, sizeof(found_key));
9759 btrfs_free_path(path);
9760 @@ -249,6 +255,59 @@ err:
9764 +int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
9766 + struct extent_buffer *leaf;
9767 + struct btrfs_path *path;
9768 + struct btrfs_key key;
9772 + path = btrfs_alloc_path();
9776 + key.objectid = BTRFS_ORPHAN_OBJECTID;
9777 + key.type = BTRFS_ORPHAN_ITEM_KEY;
9781 + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
9787 + leaf = path->nodes[0];
9788 + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
9789 + ret = btrfs_next_leaf(tree_root, path);
9794 + leaf = path->nodes[0];
9797 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
9798 + btrfs_release_path(tree_root, path);
9800 + if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
9801 + key.type != BTRFS_ORPHAN_ITEM_KEY)
9804 + ret = btrfs_find_dead_roots(tree_root, key.offset);
9813 + btrfs_free_path(path);
9817 /* drop the root item for 'key' from 'root' */
9818 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
9819 struct btrfs_key *key)
9820 @@ -278,31 +337,57 @@ out:
9824 -#if 0 /* this will get used when snapshot deletion is implemented */
9825 int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
9826 struct btrfs_root *tree_root,
9827 - u64 root_id, u8 type, u64 ref_id)
9828 + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
9829 + const char *name, int name_len)
9832 + struct btrfs_path *path;
9833 + struct btrfs_root_ref *ref;
9834 + struct extent_buffer *leaf;
9835 struct btrfs_key key;
9836 + unsigned long ptr;
9839 - struct btrfs_path *path;
9841 path = btrfs_alloc_path();
9845 key.objectid = root_id;
9847 + key.type = BTRFS_ROOT_BACKREF_KEY;
9848 key.offset = ref_id;
9851 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9854 - ret = btrfs_del_item(trans, tree_root, path);
9858 + leaf = path->nodes[0];
9859 + ref = btrfs_item_ptr(leaf, path->slots[0],
9860 + struct btrfs_root_ref);
9862 + WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
9863 + WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
9864 + ptr = (unsigned long)(ref + 1);
9865 + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
9866 + *sequence = btrfs_root_ref_sequence(leaf, ref);
9868 + ret = btrfs_del_item(trans, tree_root, path);
9873 + if (key.type == BTRFS_ROOT_BACKREF_KEY) {
9874 + btrfs_release_path(tree_root, path);
9875 + key.objectid = ref_id;
9876 + key.type = BTRFS_ROOT_REF_KEY;
9877 + key.offset = root_id;
9881 btrfs_free_path(path);
9887 int btrfs_find_root_ref(struct btrfs_root *tree_root,
9888 struct btrfs_path *path,
9889 @@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
9895 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
9896 * or BTRFS_ROOT_BACKREF_KEY.
9897 @@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
9899 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
9900 struct btrfs_root *tree_root,
9901 - u64 root_id, u8 type, u64 ref_id,
9902 - u64 dirid, u64 sequence,
9903 + u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
9904 const char *name, int name_len)
9906 struct btrfs_key key;
9907 @@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
9908 struct extent_buffer *leaf;
9912 path = btrfs_alloc_path();
9916 key.objectid = root_id;
9918 + key.type = BTRFS_ROOT_BACKREF_KEY;
9919 key.offset = ref_id;
9922 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
9923 sizeof(*ref) + name_len);
9925 @@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
9926 write_extent_buffer(leaf, name, ptr, name_len);
9927 btrfs_mark_buffer_dirty(leaf);
9929 + if (key.type == BTRFS_ROOT_BACKREF_KEY) {
9930 + btrfs_release_path(tree_root, path);
9931 + key.objectid = ref_id;
9932 + key.type = BTRFS_ROOT_REF_KEY;
9933 + key.offset = root_id;
9937 btrfs_free_path(path);
9941 diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
9942 index 6d6d06c..939b68f 100644
9943 --- a/fs/btrfs/super.c
9944 +++ b/fs/btrfs/super.c
9945 @@ -66,7 +66,8 @@ enum {
9946 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
9947 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
9948 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
9949 - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
9950 + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
9951 + Opt_discard, Opt_err,
9954 static match_table_t tokens = {
9955 @@ -88,6 +89,7 @@ static match_table_t tokens = {
9956 {Opt_notreelog, "notreelog"},
9957 {Opt_flushoncommit, "flushoncommit"},
9958 {Opt_ratio, "metadata_ratio=%d"},
9959 + {Opt_discard, "discard"},
9963 @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
9964 info->metadata_ratio);
9968 + btrfs_set_opt(info->mount_opt, DISCARD);
9973 @@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb,
9974 sb->s_export_op = &btrfs_export_ops;
9975 sb->s_xattr = btrfs_xattr_handlers;
9976 sb->s_time_gran = 1;
9977 +#ifdef CONFIG_BTRFS_FS_POSIX_ACL
9978 sb->s_flags |= MS_POSIXACL;
9981 tree_root = open_ctree(sb, fs_devices, (char *)data);
9983 @@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb)
9986 static struct super_operations btrfs_super_ops = {
9987 + .drop_inode = btrfs_drop_inode,
9988 .delete_inode = btrfs_delete_inode,
9989 .put_super = btrfs_put_super,
9990 .sync_fs = btrfs_sync_fs,
9991 diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
9992 index cdbb502..bca82a4 100644
9993 --- a/fs/btrfs/transaction.c
9994 +++ b/fs/btrfs/transaction.c
9995 @@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
9997 if (root->ref_cows && root->last_trans < trans->transid) {
9998 WARN_ON(root == root->fs_info->extent_root);
9999 - WARN_ON(root->root_item.refs == 0);
10000 WARN_ON(root->commit_root != root->node);
10002 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
10003 @@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
10004 h->alloc_exclude_start = 0;
10005 h->delayed_ref_updates = 0;
10007 + if (!current->journal_info)
10008 + current->journal_info = h;
10010 root->fs_info->running_transaction->use_count++;
10011 record_root_in_trans(h, root);
10012 mutex_unlock(&root->fs_info->trans_mutex);
10013 @@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
10014 wake_up(&cur_trans->writer_wait);
10015 put_transaction(cur_trans);
10016 mutex_unlock(&info->trans_mutex);
10018 + if (current->journal_info == trans)
10019 + current->journal_info = NULL;
10020 memset(trans, 0, sizeof(*trans));
10021 kmem_cache_free(btrfs_trans_handle_cachep, trans);
10023 @@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
10025 * when btree blocks are allocated, they have some corresponding bits set for
10026 * them in one of two extent_io trees. This is used to make sure all of
10027 - * those extents are on disk for transaction or log commit
10028 + * those extents are sent to disk but does not wait on them
10030 -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10031 - struct extent_io_tree *dirty_pages)
10032 +int btrfs_write_marked_extents(struct btrfs_root *root,
10033 + struct extent_io_tree *dirty_pages)
10037 @@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10038 page_cache_release(page);
10047 + * when btree blocks are allocated, they have some corresponding bits set for
10048 + * them in one of two extent_io trees. This is used to make sure all of
10049 + * those extents are on disk for transaction or log commit. We wait
10050 + * on all the pages and clear them from the dirty pages state tree
10052 +int btrfs_wait_marked_extents(struct btrfs_root *root,
10053 + struct extent_io_tree *dirty_pages)
10058 + struct page *page;
10059 + struct inode *btree_inode = root->fs_info->btree_inode;
10062 + unsigned long index;
10065 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
10067 @@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10072 + * when btree blocks are allocated, they have some corresponding bits set for
10073 + * them in one of two extent_io trees. This is used to make sure all of
10074 + * those extents are on disk for transaction or log commit
10076 +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10077 + struct extent_io_tree *dirty_pages)
10082 + ret = btrfs_write_marked_extents(root, dirty_pages);
10083 + ret2 = btrfs_wait_marked_extents(root, dirty_pages);
10084 + return ret || ret2;
10087 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
10088 struct btrfs_root *root)
10090 @@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
10091 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
10093 key.objectid = objectid;
10095 + /* record when the snapshot was created in key.offset */
10096 + key.offset = trans->transid;
10097 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
10099 old = btrfs_lock_root_node(root);
10100 @@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
10101 memcpy(&pending->root_key, &key, sizeof(key));
10103 kfree(new_root_item);
10104 + btrfs_unreserve_metadata_space(root, 6);
10108 @@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
10109 ret = btrfs_update_inode(trans, parent_root, parent_inode);
10112 - /* add the backref first */
10113 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
10114 pending->root_key.objectid,
10115 - BTRFS_ROOT_BACKREF_KEY,
10116 parent_root->root_key.objectid,
10117 parent_inode->i_ino, index, pending->name,
10122 - /* now add the forward ref */
10123 - ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
10124 - parent_root->root_key.objectid,
10125 - BTRFS_ROOT_REF_KEY,
10126 - pending->root_key.objectid,
10127 - parent_inode->i_ino, index, pending->name,
10130 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
10131 d_instantiate(pending->dentry, inode);
10133 @@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10134 unsigned long timeout = 1;
10135 struct btrfs_transaction *cur_trans;
10136 struct btrfs_transaction *prev_trans = NULL;
10137 - struct extent_io_tree *pinned_copy;
10140 int should_grow = 0;
10141 @@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10145 - pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
10146 - if (!pinned_copy)
10149 - extent_io_tree_init(pinned_copy,
10150 - root->fs_info->btree_inode->i_mapping, GFP_NOFS);
10152 trans->transaction->in_commit = 1;
10153 trans->transaction->blocked = 1;
10154 if (cur_trans->list.prev != &root->fs_info->trans_list) {
10155 @@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10156 ret = commit_cowonly_roots(trans, root);
10159 + btrfs_prepare_extent_commit(trans, root);
10161 cur_trans = root->fs_info->running_transaction;
10162 spin_lock(&root->fs_info->new_trans_lock);
10163 root->fs_info->running_transaction = NULL;
10164 @@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10165 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
10166 sizeof(root->fs_info->super_copy));
10168 - btrfs_copy_pinned(root, pinned_copy);
10170 trans->transaction->blocked = 0;
10172 wake_up(&root->fs_info->transaction_wait);
10173 @@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10175 mutex_unlock(&root->fs_info->tree_log_mutex);
10177 - btrfs_finish_extent_commit(trans, root, pinned_copy);
10178 - kfree(pinned_copy);
10179 + btrfs_finish_extent_commit(trans, root);
10181 /* do the directory inserts of any pending snapshot creations */
10182 finish_pending_snapshots(trans, root->fs_info);
10183 @@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10185 mutex_unlock(&root->fs_info->trans_mutex);
10187 + if (current->journal_info == trans)
10188 + current->journal_info = NULL;
10190 kmem_cache_free(btrfs_trans_handle_cachep, trans);
10193 @@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
10195 while (!list_empty(&list)) {
10196 root = list_entry(list.next, struct btrfs_root, root_list);
10197 - list_del_init(&root->root_list);
10198 - btrfs_drop_snapshot(root, 0);
10199 + list_del(&root->root_list);
10201 + if (btrfs_header_backref_rev(root->node) <
10202 + BTRFS_MIXED_BACKREF_REV)
10203 + btrfs_drop_snapshot(root, 0);
10205 + btrfs_drop_snapshot(root, 1);
10209 diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
10210 index 663c674..d4e3e7a 100644
10211 --- a/fs/btrfs/transaction.h
10212 +++ b/fs/btrfs/transaction.h
10213 @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
10214 struct inode *inode)
10216 BTRFS_I(inode)->last_trans = trans->transaction->transid;
10217 + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
10220 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
10221 @@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
10222 struct btrfs_root *root);
10223 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10224 struct extent_io_tree *dirty_pages);
10225 +int btrfs_write_marked_extents(struct btrfs_root *root,
10226 + struct extent_io_tree *dirty_pages);
10227 +int btrfs_wait_marked_extents(struct btrfs_root *root,
10228 + struct extent_io_tree *dirty_pages);
10229 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
10231 diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
10232 index d91b0de..f51bf13 100644
10233 --- a/fs/btrfs/tree-log.c
10234 +++ b/fs/btrfs/tree-log.c
10235 @@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
10237 mutex_lock(&root->log_mutex);
10238 if (root->log_root) {
10239 + if (!root->log_start_pid) {
10240 + root->log_start_pid = current->pid;
10241 + root->log_multiple_pids = false;
10242 + } else if (root->log_start_pid != current->pid) {
10243 + root->log_multiple_pids = true;
10247 atomic_inc(&root->log_writers);
10248 mutex_unlock(&root->log_mutex);
10251 + root->log_multiple_pids = false;
10252 + root->log_start_pid = current->pid;
10253 mutex_lock(&root->fs_info->tree_log_mutex);
10254 if (!root->fs_info->log_root_tree) {
10255 ret = btrfs_init_log_root_tree(trans, root->fs_info);
10256 @@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log,
10257 struct walk_control *wc, u64 gen)
10260 - btrfs_update_pinned_extents(log->fs_info->extent_root,
10261 - eb->start, eb->len, 1);
10262 + btrfs_pin_extent(log->fs_info->extent_root,
10263 + eb->start, eb->len, 0);
10265 if (btrfs_buffer_uptodate(eb, gen)) {
10267 @@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
10268 saved_nbytes = inode_get_bytes(inode);
10269 /* drop any overlapping extents */
10270 ret = btrfs_drop_extents(trans, root, inode,
10271 - start, extent_end, extent_end, start, &alloc_hint);
10272 + start, extent_end, extent_end, start, &alloc_hint, 1);
10275 if (found_type == BTRFS_FILE_EXTENT_REG ||
10276 @@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10278 struct btrfs_root *log = root->log_root;
10279 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
10280 + u64 log_transid = 0;
10282 mutex_lock(&root->log_mutex);
10283 index1 = root->log_transid % 2;
10284 @@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10287 unsigned long batch = root->log_batch;
10288 - mutex_unlock(&root->log_mutex);
10289 - schedule_timeout_uninterruptible(1);
10290 - mutex_lock(&root->log_mutex);
10292 + if (root->log_multiple_pids) {
10293 + mutex_unlock(&root->log_mutex);
10294 + schedule_timeout_uninterruptible(1);
10295 + mutex_lock(&root->log_mutex);
10297 wait_for_writer(trans, root);
10298 if (batch == root->log_batch)
10300 @@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10304 - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
10305 + /* we start IO on all the marked extents here, but we don't actually
10306 + * wait for them until later.
10308 + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
10311 btrfs_set_root_node(&log->root_item, log->node);
10313 root->log_batch = 0;
10314 + log_transid = root->log_transid;
10315 root->log_transid++;
10316 log->log_transid = root->log_transid;
10317 + root->log_start_pid = 0;
10320 * log tree has been flushed to disk, new modifications of
10321 @@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10323 index2 = log_root_tree->log_transid % 2;
10324 if (atomic_read(&log_root_tree->log_commit[index2])) {
10325 + btrfs_wait_marked_extents(log, &log->dirty_log_pages);
10326 wait_log_commit(trans, log_root_tree,
10327 log_root_tree->log_transid);
10328 mutex_unlock(&log_root_tree->log_mutex);
10329 @@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10330 * check the full commit flag again
10332 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
10333 + btrfs_wait_marked_extents(log, &log->dirty_log_pages);
10334 mutex_unlock(&log_root_tree->log_mutex);
10336 goto out_wake_log_root;
10337 @@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10338 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
10339 &log_root_tree->dirty_log_pages);
10341 + btrfs_wait_marked_extents(log, &log->dirty_log_pages);
10343 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
10344 log_root_tree->node->start);
10345 @@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10346 * the running transaction open, so a full commit can't hop
10347 * in and cause problems either.
10349 - write_ctree_super(trans, root->fs_info->tree_root, 2);
10350 + write_ctree_super(trans, root->fs_info->tree_root, 1);
10353 + mutex_lock(&root->log_mutex);
10354 + if (root->last_log_commit < log_transid)
10355 + root->last_log_commit = log_transid;
10356 + mutex_unlock(&root->log_mutex);
10359 atomic_set(&log_root_tree->log_commit[index2], 0);
10361 @@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
10362 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
10365 - if (parent == sb->s_root)
10366 + if (IS_ROOT(parent))
10369 parent = parent->d_parent;
10370 @@ -2852,6 +2876,21 @@ out:
10374 +static int inode_in_log(struct btrfs_trans_handle *trans,
10375 + struct inode *inode)
10377 + struct btrfs_root *root = BTRFS_I(inode)->root;
10380 + mutex_lock(&root->log_mutex);
10381 + if (BTRFS_I(inode)->logged_trans == trans->transid &&
10382 + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
10384 + mutex_unlock(&root->log_mutex);
10390 * helper function around btrfs_log_inode to make sure newly created
10391 * parent directories also end up in the log. A minimal inode and backref
10392 @@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
10396 + if (root != BTRFS_I(inode)->root ||
10397 + btrfs_root_refs(&root->root_item) == 0) {
10399 + goto end_no_trans;
10402 ret = check_parent_dirs_for_sync(trans, inode, parent,
10403 sb, last_committed);
10407 + if (inode_in_log(trans, inode)) {
10408 + ret = BTRFS_NO_LOG_SYNC;
10409 + goto end_no_trans;
10412 start_log_trans(trans, root);
10414 ret = btrfs_log_inode(trans, root, inode, inode_only);
10415 @@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
10418 inode = parent->d_inode;
10419 + if (root != BTRFS_I(inode)->root)
10422 if (BTRFS_I(inode)->generation >
10423 root->fs_info->last_trans_committed) {
10424 ret = btrfs_log_inode(trans, root, inode, inode_only);
10427 - if (parent == sb->s_root)
10428 + if (IS_ROOT(parent))
10431 parent = parent->d_parent;
10432 @@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
10433 struct btrfs_key tmp_key;
10434 struct btrfs_root *log;
10435 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
10436 - u64 highest_inode;
10437 struct walk_control wc = {
10438 .process_func = process_one_buffer,
10440 @@ -3010,11 +3062,6 @@ again:
10444 - ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
10446 - wc.replay_dest->highest_inode = highest_inode;
10447 - wc.replay_dest->last_inode_alloc = highest_inode;
10450 key.offset = found_key.offset - 1;
10451 wc.replay_dest->log_root = NULL;
10452 diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
10453 index d09c760..0776eac 100644
10454 --- a/fs/btrfs/tree-log.h
10455 +++ b/fs/btrfs/tree-log.h
10457 #ifndef __TREE_LOG_
10458 #define __TREE_LOG_
10460 +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
10461 +#define BTRFS_NO_LOG_SYNC 256
10463 int btrfs_sync_log(struct btrfs_trans_handle *trans,
10464 struct btrfs_root *root);
10465 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
10466 diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
10467 index 5dbefd1..20cbd2e 100644
10468 --- a/fs/btrfs/volumes.c
10469 +++ b/fs/btrfs/volumes.c
10470 @@ -276,7 +276,7 @@ loop_lock:
10471 * is now congested. Back off and let other work structs
10474 - if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
10475 + if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
10476 fs_info->fs_devices->open_devices > 1) {
10477 struct io_context *ioc;
10479 @@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
10482 device->name = kstrdup(orig_dev->name, GFP_NOFS);
10483 - if (!device->name)
10484 + if (!device->name) {
10489 device->devid = orig_dev->devid;
10490 device->work.func = pending_bios_fn;
10491 @@ -719,10 +721,9 @@ error:
10492 * called very infrequently and that a given device has a small number
10495 -static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
10496 - struct btrfs_device *device,
10497 - u64 num_bytes, u64 *start,
10499 +int find_free_dev_extent(struct btrfs_trans_handle *trans,
10500 + struct btrfs_device *device, u64 num_bytes,
10501 + u64 *start, u64 *max_avail)
10503 struct btrfs_key key;
10504 struct btrfs_root *root = device->dev_root;
10505 @@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
10506 extent_root = root->fs_info->extent_root;
10507 em_tree = &root->fs_info->mapping_tree.map_tree;
10509 + ret = btrfs_can_relocate(extent_root, chunk_offset);
10513 /* step one, relocate all the extents inside this chunk */
10514 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
10516 @@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
10517 * step two, delete the device extents and the
10518 * chunk tree entries
10520 - spin_lock(&em_tree->lock);
10521 + read_lock(&em_tree->lock);
10522 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10523 - spin_unlock(&em_tree->lock);
10524 + read_unlock(&em_tree->lock);
10526 BUG_ON(em->start > chunk_offset ||
10527 em->start + em->len < chunk_offset);
10528 @@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
10529 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
10532 - spin_lock(&em_tree->lock);
10533 + write_lock(&em_tree->lock);
10534 remove_extent_mapping(em_tree, em);
10535 - spin_unlock(&em_tree->lock);
10536 + write_unlock(&em_tree->lock);
10540 @@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
10541 struct btrfs_key found_key;
10542 u64 chunk_tree = chunk_root->root_key.objectid;
10544 + bool retried = false;
10548 path = btrfs_alloc_path();
10553 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
10554 key.offset = (u64)-1;
10555 key.type = BTRFS_CHUNK_ITEM_KEY;
10556 @@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
10557 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
10558 found_key.objectid,
10561 + if (ret == -ENOSPC)
10567 if (found_key.offset == 0)
10568 @@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
10569 key.offset = found_key.offset - 1;
10572 + if (failed && !retried) {
10576 + } else if (failed && retried) {
10581 btrfs_free_path(path);
10583 @@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
10586 ret = btrfs_shrink_device(device, old_size - size_to_free);
10587 + if (ret == -ENOSPC)
10591 trans = btrfs_start_transaction(dev_root, 1);
10592 @@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
10593 chunk = btrfs_item_ptr(path->nodes[0],
10595 struct btrfs_chunk);
10596 - key.offset = found_key.offset;
10597 /* chunk zero is special */
10598 - if (key.offset == 0)
10599 + if (found_key.offset == 0)
10602 btrfs_release_path(chunk_root, path);
10603 @@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
10604 chunk_root->root_key.objectid,
10605 found_key.objectid,
10608 + BUG_ON(ret && ret != -ENOSPC);
10609 + key.offset = found_key.offset - 1;
10613 @@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10618 + bool retried = false;
10619 struct extent_buffer *l;
10620 struct btrfs_key key;
10621 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
10622 u64 old_total = btrfs_super_total_bytes(super_copy);
10623 + u64 old_size = device->total_bytes;
10624 u64 diff = device->total_bytes - new_size;
10626 if (new_size >= device->total_bytes)
10627 @@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10631 - trans = btrfs_start_transaction(root, 1);
10640 @@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10641 if (device->writeable)
10642 device->fs_devices->total_rw_bytes -= diff;
10643 unlock_chunks(root);
10644 - btrfs_end_transaction(trans, root);
10647 key.objectid = device->devid;
10648 key.offset = (u64)-1;
10649 key.type = BTRFS_DEV_EXTENT_KEY;
10650 @@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10654 + btrfs_release_path(root, path);
10658 @@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10659 slot = path->slots[0];
10660 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
10662 - if (key.objectid != device->devid)
10663 + if (key.objectid != device->devid) {
10664 + btrfs_release_path(root, path);
10668 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
10669 length = btrfs_dev_extent_length(l, dev_extent);
10671 - if (key.offset + length <= new_size)
10672 + if (key.offset + length <= new_size) {
10673 + btrfs_release_path(root, path);
10677 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
10678 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
10679 @@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10681 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
10684 + if (ret && ret != -ENOSPC)
10686 + if (ret == -ENOSPC)
10691 + if (failed && !retried) {
10695 + } else if (failed && retried) {
10697 + lock_chunks(root);
10699 + device->total_bytes = old_size;
10700 + if (device->writeable)
10701 + device->fs_devices->total_rw_bytes += diff;
10702 + unlock_chunks(root);
10706 /* Shrinking succeeded, else we would be at "done". */
10707 @@ -2294,9 +2335,9 @@ again:
10708 em->block_len = em->len;
10710 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
10711 - spin_lock(&em_tree->lock);
10712 + write_lock(&em_tree->lock);
10713 ret = add_extent_mapping(em_tree, em);
10714 - spin_unlock(&em_tree->lock);
10715 + write_unlock(&em_tree->lock);
10717 free_extent_map(em);
10719 @@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
10723 - spin_lock(&map_tree->map_tree.lock);
10724 + read_lock(&map_tree->map_tree.lock);
10725 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
10726 - spin_unlock(&map_tree->map_tree.lock);
10727 + read_unlock(&map_tree->map_tree.lock);
10731 @@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
10732 struct extent_map *em;
10735 - spin_lock(&tree->map_tree.lock);
10736 + write_lock(&tree->map_tree.lock);
10737 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
10739 remove_extent_mapping(&tree->map_tree, em);
10740 - spin_unlock(&tree->map_tree.lock);
10741 + write_unlock(&tree->map_tree.lock);
10745 @@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
10746 struct extent_map_tree *em_tree = &map_tree->map_tree;
10749 - spin_lock(&em_tree->lock);
10750 + read_lock(&em_tree->lock);
10751 em = lookup_extent_mapping(em_tree, logical, len);
10752 - spin_unlock(&em_tree->lock);
10753 + read_unlock(&em_tree->lock);
10756 BUG_ON(em->start > logical || em->start + em->len < logical);
10757 @@ -2604,9 +2645,9 @@ again:
10758 atomic_set(&multi->error, 0);
10761 - spin_lock(&em_tree->lock);
10762 + read_lock(&em_tree->lock);
10763 em = lookup_extent_mapping(em_tree, logical, *length);
10764 - spin_unlock(&em_tree->lock);
10765 + read_unlock(&em_tree->lock);
10767 if (!em && unplug_page)
10769 @@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
10773 - spin_lock(&em_tree->lock);
10774 + read_lock(&em_tree->lock);
10775 em = lookup_extent_mapping(em_tree, chunk_start, 1);
10776 - spin_unlock(&em_tree->lock);
10777 + read_unlock(&em_tree->lock);
10779 BUG_ON(!em || em->start != chunk_start);
10780 map = (struct map_lookup *)em->bdev;
10781 @@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
10782 logical = key->offset;
10783 length = btrfs_chunk_length(leaf, chunk);
10785 - spin_lock(&map_tree->map_tree.lock);
10786 + read_lock(&map_tree->map_tree.lock);
10787 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
10788 - spin_unlock(&map_tree->map_tree.lock);
10789 + read_unlock(&map_tree->map_tree.lock);
10791 /* already mapped? */
10792 if (em && em->start <= logical && em->start + em->len > logical) {
10793 @@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
10794 map->stripes[i].dev->in_fs_metadata = 1;
10797 - spin_lock(&map_tree->map_tree.lock);
10798 + write_lock(&map_tree->map_tree.lock);
10799 ret = add_extent_mapping(&map_tree->map_tree, em);
10800 - spin_unlock(&map_tree->map_tree.lock);
10801 + write_unlock(&map_tree->map_tree.lock);
10803 free_extent_map(em);
10805 diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
10806 index 5139a83..31b0fab 100644
10807 --- a/fs/btrfs/volumes.h
10808 +++ b/fs/btrfs/volumes.h
10809 @@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
10810 void btrfs_unlock_volumes(void);
10811 void btrfs_lock_volumes(void);
10812 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
10813 +int find_free_dev_extent(struct btrfs_trans_handle *trans,
10814 + struct btrfs_device *device, u64 num_bytes,
10815 + u64 *start, u64 *max_avail);
10817 diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
10818 index a9d3bf4..b6dd596 100644
10819 --- a/fs/btrfs/xattr.c
10820 +++ b/fs/btrfs/xattr.c
10821 @@ -260,7 +260,7 @@ err:
10822 * attributes are handled directly.
10824 struct xattr_handler *btrfs_xattr_handlers[] = {
10825 -#ifdef CONFIG_FS_POSIX_ACL
10826 +#ifdef CONFIG_BTRFS_FS_POSIX_ACL
10827 &btrfs_xattr_acl_access_handler,
10828 &btrfs_xattr_acl_default_handler,