make it compile
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427..3616042 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
        .set    = btrfs_xattr_acl_access_set,
 };
 
-#else /* CONFIG_FS_POSIX_ACL */
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
 
 int btrfs_acl_chmod(struct inode *inode)
 {
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
        return 0;
 }
 
-#endif /* CONFIG_FS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
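
The rename above only compiles if a btrfs-local Kconfig symbol exists, gating
btrfs ACL support separately from the VFS-wide CONFIG_FS_POSIX_ACL. The Kconfig
hunk is not part of this excerpt; a minimal sketch of what such an entry looks
like in fs/btrfs/Kconfig (the exact wording and help text are assumptions):

    config BTRFS_FS_POSIX_ACL
            bool "Btrfs POSIX Access Control Lists"
            depends on BTRFS_FS
            select FS_POSIX_ACL
            help
              POSIX Access Control Lists (ACLs) support additional access
              rights for users and groups beyond the standard
              owner/group/world scheme.
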
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af..c0861e7 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
        /* number of things on the pending list */
        atomic_t num_pending;
 
+       /* reference counter for this struct */
+       atomic_t refs;
+
        unsigned long sequence;
 
        /* protects the pending list. */
@@ -61,6 +64,51 @@ struct btrfs_worker_thread {
 };
 
 /*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time.  It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue.
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue that never has more than one thread,
+ * where we schedule thread start operations.  This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+       struct btrfs_work work;
+       struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+       struct worker_start *start;
+       start = container_of(work, struct worker_start, work);
+       btrfs_start_workers(start->queue, 1);
+       kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+       struct worker_start *start;
+       int ret;
+
+       start = kzalloc(sizeof(*start), GFP_NOFS);
+       if (!start)
+               return -ENOMEM;
+
+       start->work.func = start_new_worker_func;
+       start->queue = queue;
+       ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+       if (ret)
+               kfree(start);
+       return ret;
+}
+
+/*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
  */
@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
                unsigned long flags;
                spin_lock_irqsave(&worker->workers->lock, flags);
                worker->idle = 1;
-               list_move(&worker->worker_list, &worker->workers->idle_list);
+
+               /* the list may be empty if the worker is just starting */
+               if (!list_empty(&worker->worker_list)) {
+                       list_move(&worker->worker_list,
+                                &worker->workers->idle_list);
+               }
                spin_unlock_irqrestore(&worker->workers->lock, flags);
        }
 }
@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
                unsigned long flags;
                spin_lock_irqsave(&worker->workers->lock, flags);
                worker->idle = 0;
-               list_move_tail(&worker->worker_list,
-                              &worker->workers->worker_list);
+
+               if (!list_empty(&worker->worker_list)) {
+                       list_move_tail(&worker->worker_list,
+                                     &worker->workers->worker_list);
+               }
                spin_unlock_irqrestore(&worker->workers->lock, flags);
        }
 }
 
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
-                                           struct btrfs_work *work)
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
+       struct btrfs_workers *workers = worker->workers;
        unsigned long flags;
 
+       rmb();
+       if (!workers->atomic_start_pending)
+               return;
+
+       spin_lock_irqsave(&workers->lock, flags);
+       if (!workers->atomic_start_pending)
+               goto out;
+
+       workers->atomic_start_pending = 0;
+       if (workers->num_workers + workers->num_workers_starting >=
+           workers->max_workers)
+               goto out;
+
+       workers->num_workers_starting += 1;
+       spin_unlock_irqrestore(&workers->lock, flags);
+       start_new_worker(workers);
+       return;
+
+out:
+       spin_unlock_irqrestore(&workers->lock, flags);
+}
+
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+                                           struct btrfs_work *work)
+{
        if (!workers->ordered)
                return 0;
 
        set_bit(WORK_DONE_BIT, &work->flags);
 
-       spin_lock_irqsave(&workers->lock, flags);
+       spin_lock(&workers->order_lock);
 
        while (1) {
                if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
                if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
                        break;
 
-               spin_unlock_irqrestore(&workers->lock, flags);
+               spin_unlock(&workers->order_lock);
 
                work->ordered_func(work);
 
                /* now take the lock again and call the freeing code */
-               spin_lock_irqsave(&workers->lock, flags);
+               spin_lock(&workers->order_lock);
                list_del(&work->order_list);
                work->ordered_free(work);
        }
 
-       spin_unlock_irqrestore(&workers->lock, flags);
+       spin_unlock(&workers->order_lock);
        return 0;
 }
 
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+       if (atomic_dec_and_test(&worker->refs))
+               kfree(worker);
+}
+
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+       int freeit = 0;
+
+       spin_lock_irq(&worker->lock);
+       spin_lock(&worker->workers->lock);
+       if (worker->workers->num_workers > 1 &&
+           worker->idle &&
+           !worker->working &&
+           !list_empty(&worker->worker_list) &&
+           list_empty(&worker->prio_pending) &&
+           list_empty(&worker->pending) &&
+           atomic_read(&worker->num_pending) == 0) {
+               freeit = 1;
+               list_del_init(&worker->worker_list);
+               worker->workers->num_workers--;
+       }
+       spin_unlock(&worker->workers->lock);
+       spin_unlock_irq(&worker->lock);
+
+       if (freeit)
+               put_worker(worker);
+       return freeit;
+}
+
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+                                       struct list_head *prio_head,
+                                       struct list_head *head)
+{
+       struct btrfs_work *work = NULL;
+       struct list_head *cur = NULL;
+       if (!list_empty(prio_head))
+               cur = prio_head->next;
+
+       smp_mb();
+       if (!list_empty(&worker->prio_pending))
+               goto refill;
+
+       if (!list_empty(head))
+               cur = head->next;
+
+       if (cur)
+               goto out;
+
+refill:
+       spin_lock_irq(&worker->lock);
+       list_splice_tail_init(&worker->prio_pending, prio_head);
+       list_splice_tail_init(&worker->pending, head);
+
+       if (!list_empty(prio_head))
+               cur = prio_head->next;
+       else if (!list_empty(head))
+               cur = head->next;
+       spin_unlock_irq(&worker->lock);
+
+       if (!cur)
+               goto out_fail;
+
+out:
+       work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+       return work;
+}
+
 /*
  * main loop for servicing work items
  */
 static int worker_loop(void *arg)
 {
        struct btrfs_worker_thread *worker = arg;
-       struct list_head *cur;
+       struct list_head head;
+       struct list_head prio_head;
        struct btrfs_work *work;
+
+       INIT_LIST_HEAD(&head);
+       INIT_LIST_HEAD(&prio_head);
+
        do {
-               spin_lock_irq(&worker->lock);
-again_locked:
+again:
                while (1) {
-                       if (!list_empty(&worker->prio_pending))
-                               cur = worker->prio_pending.next;
-                       else if (!list_empty(&worker->pending))
-                               cur = worker->pending.next;
-                       else
+
+
+                       work = get_next_work(worker, &prio_head, &head);
+                       if (!work)
                                break;
 
-                       work = list_entry(cur, struct btrfs_work, list);
                        list_del(&work->list);
                        clear_bit(WORK_QUEUED_BIT, &work->flags);
 
                        work->worker = worker;
-                       spin_unlock_irq(&worker->lock);
 
                        work->func(work);
 
@@ -175,9 +329,13 @@ again_locked:
                         */
                        run_ordered_completions(worker->workers, work);
 
-                       spin_lock_irq(&worker->lock);
-                       check_idle_worker(worker);
+                       check_pending_worker_creates(worker);
+
                }
+
+               spin_lock_irq(&worker->lock);
+               check_idle_worker(worker);
+
                if (freezing(current)) {
                        worker->working = 0;
                        spin_unlock_irq(&worker->lock);
@@ -216,8 +374,10 @@ again_locked:
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
                                if (!list_empty(&worker->pending) ||
-                                   !list_empty(&worker->prio_pending))
-                                       goto again_locked;
+                                   !list_empty(&worker->prio_pending)) {
+                                       spin_unlock_irq(&worker->lock);
+                                       goto again;
+                               }
 
                                /*
                                 * this makes sure we get a wakeup when someone
@@ -226,8 +386,13 @@
                                worker->working = 0;
                                spin_unlock_irq(&worker->lock);
 
-                               if (!kthread_should_stop())
-                                       schedule();
+                               if (!kthread_should_stop()) {
+                                       schedule_timeout(HZ * 120);
+                                       if (!worker->working &&
+                                           try_worker_shutdown(worker)) {
+                                               return 0;
+                                       }
+                               }
                        }
                        __set_current_state(TASK_RUNNING);
                }
@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 {
        struct list_head *cur;
        struct btrfs_worker_thread *worker;
+       int can_stop;
 
+       spin_lock_irq(&workers->lock);
        list_splice_init(&workers->idle_list, &workers->worker_list);
        while (!list_empty(&workers->worker_list)) {
                cur = workers->worker_list.next;
                worker = list_entry(cur, struct btrfs_worker_thread,
                                    worker_list);
-               kthread_stop(worker->task);
-               list_del(&worker->worker_list);
-               kfree(worker);
+
+               atomic_inc(&worker->refs);
+               workers->num_workers -= 1;
+               if (!list_empty(&worker->worker_list)) {
+                       list_del_init(&worker->worker_list);
+                       put_worker(worker);
+                       can_stop = 1;
+               } else
+                       can_stop = 0;
+               spin_unlock_irq(&workers->lock);
+               if (can_stop)
+                       kthread_stop(worker->task);
+               spin_lock_irq(&workers->lock);
+               put_worker(worker);
        }
+       spin_unlock_irq(&workers->lock);
        return 0;
 }
 
 /*
  * simple init on struct btrfs_workers
  */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+                       struct btrfs_workers *async_helper)
 {
        workers->num_workers = 0;
+       workers->num_workers_starting = 0;
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
        INIT_LIST_HEAD(&workers->prio_order_list);
        spin_lock_init(&workers->lock);
+       spin_lock_init(&workers->order_lock);
        workers->max_workers = max;
        workers->idle_thresh = 32;
        workers->name = name;
        workers->ordered = 0;
+       workers->atomic_start_pending = 0;
+       workers->atomic_worker_start = async_helper;
 }
 
 /*
  * starts new worker threads.  This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+                                int num_workers)
 {
        struct btrfs_worker_thread *worker;
        int ret = 0;
@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                INIT_LIST_HEAD(&worker->prio_pending);
                INIT_LIST_HEAD(&worker->worker_list);
                spin_lock_init(&worker->lock);
+
                atomic_set(&worker->num_pending, 0);
+               atomic_set(&worker->refs, 1);
                worker->workers = workers;
                worker->task = kthread_run(worker_loop, worker,
                                           "btrfs-%s-%d", workers->name,
@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                        kfree(worker);
                        goto fail;
                }
-
                spin_lock_irq(&workers->lock);
                list_add_tail(&worker->worker_list, &workers->idle_list);
                worker->idle = 1;
                workers->num_workers++;
+               workers->num_workers_starting--;
+               WARN_ON(workers->num_workers_starting < 0);
                spin_unlock_irq(&workers->lock);
        }
        return 0;
@@ -316,6 +504,14 @@ fail:
        return ret;
 }
 
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+       spin_lock_irq(&workers->lock);
+       workers->num_workers_starting += num_workers;
+       spin_unlock_irq(&workers->lock);
+       return __btrfs_start_workers(workers, num_workers);
+}
+
 /*
  * run through the list and find a worker thread that doesn't have a lot
  * to do right now.  This can return null if we aren't yet at the thread
@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
        struct btrfs_worker_thread *worker;
        struct list_head *next;
-       int enforce_min = workers->num_workers < workers->max_workers;
+       int enforce_min;
+
+       enforce_min = (workers->num_workers + workers->num_workers_starting) <
+               workers->max_workers;
 
        /*
         * if we find an idle thread, don't move it to the end of the
@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
         */
        next = workers->worker_list.next;
        worker = list_entry(next, struct btrfs_worker_thread, worker_list);
-       atomic_inc(&worker->num_pending);
        worker->sequence++;
 
        if (worker->sequence % workers->idle_thresh == 0)
@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 {
        struct btrfs_worker_thread *worker;
        unsigned long flags;
+       struct list_head *fallback;
 
 again:
        spin_lock_irqsave(&workers->lock, flags);
        worker = next_worker(workers);
-       spin_unlock_irqrestore(&workers->lock, flags);
 
        if (!worker) {
-               spin_lock_irqsave(&workers->lock, flags);
-               if (workers->num_workers >= workers->max_workers) {
-                       struct list_head *fallback = NULL;
-                       /*
-                        * we have failed to find any workers, just
-                        * return the force one
-                        */
-                       if (!list_empty(&workers->worker_list))
-                               fallback = workers->worker_list.next;
-                       if (!list_empty(&workers->idle_list))
-                               fallback = workers->idle_list.next;
-                       BUG_ON(!fallback);
-                       worker = list_entry(fallback,
-                                 struct btrfs_worker_thread, worker_list);
-                       spin_unlock_irqrestore(&workers->lock, flags);
+               if (workers->num_workers + workers->num_workers_starting >=
+                   workers->max_workers) {
+                       goto fallback;
+               } else if (workers->atomic_worker_start) {
+                       workers->atomic_start_pending = 1;
+                       goto fallback;
                } else {
+                       workers->num_workers_starting++;
                        spin_unlock_irqrestore(&workers->lock, flags);
                        /* we're below the limit, start another worker */
-                       btrfs_start_workers(workers, 1);
+                       __btrfs_start_workers(workers, 1);
                        goto again;
                }
        }
+       goto found;
+
+fallback:
+       fallback = NULL;
+       /*
+        * we have failed to find any workers, just
+        * return the first one we can find.
+        */
+       if (!list_empty(&workers->worker_list))
+               fallback = workers->worker_list.next;
+       if (!list_empty(&workers->idle_list))
+               fallback = workers->idle_list.next;
+       BUG_ON(!fallback);
+       worker = list_entry(fallback,
+                 struct btrfs_worker_thread, worker_list);
+found:
+       /*
+        * this makes sure the worker doesn't exit before it is placed
+        * onto a busy/idle list
+        */
+       atomic_inc(&worker->num_pending);
+       spin_unlock_irqrestore(&workers->lock, flags);
        return worker;
 }
 
@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
                spin_lock(&worker->workers->lock);
                worker->idle = 0;
                list_move_tail(&worker->worker_list,
-                              &worker->workers->worker_list);
+                             &worker->workers->worker_list);
                spin_unlock(&worker->workers->lock);
        }
        if (!worker->working) {
@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
                worker->working = 1;
        }
 
-       spin_unlock_irqrestore(&worker->lock, flags);
        if (wake)
                wake_up_process(worker->task);
+       spin_unlock_irqrestore(&worker->lock, flags);
 out:
 
        return 0;
@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
        worker = find_worker(workers);
        if (workers->ordered) {
-               spin_lock_irqsave(&workers->lock, flags);
+               /*
+                * you're not allowed to do ordered queues from an
+                * interrupt handler
+                */
+               spin_lock(&workers->order_lock);
                if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
                        list_add_tail(&work->order_list,
                                      &workers->prio_order_list);
                } else {
                        list_add_tail(&work->order_list, &workers->order_list);
                }
-               spin_unlock_irqrestore(&workers->lock, flags);
+               spin_unlock(&workers->order_lock);
        } else {
                INIT_LIST_HEAD(&work->order_list);
        }
@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
                list_add_tail(&work->list, &worker->prio_pending);
        else
                list_add_tail(&work->list, &worker->pending);
-       atomic_inc(&worker->num_pending);
        check_busy_worker(worker);
 
        /*
@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
                wake = 1;
        worker->working = 1;
 
-       spin_unlock_irqrestore(&worker->lock, flags);
-
        if (wake)
                wake_up_process(worker->task);
+       spin_unlock_irqrestore(&worker->lock, flags);
+
 out:
        return 0;
 }
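
The deferred-creation machinery above boils down to one pattern: a context
that must not block hands the blocking kthread_run step to a dedicated
single-thread helper queue. A userspace analogue of that pattern, as a sketch
with invented names (pthreads instead of kthreads; this is not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct pool {
        const char *name;
        pthread_mutex_t lock;
        int num_workers;
};

/* one queued request: "pool X wants another worker" */
struct start_req {
        struct pool *pool;
        struct start_req *next;
};

static pthread_mutex_t helper_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t helper_cond = PTHREAD_COND_INITIALIZER;
static struct start_req *helper_head;

static void *worker_main(void *arg)
{
        struct pool *p = arg;
        printf("new worker running for pool %s\n", p->name);
        return NULL;
}

/* may block for a long time (the kthread_run analogue); only the
 * single helper thread ever calls this */
static void start_worker_now(struct pool *p)
{
        pthread_t t;

        if (pthread_create(&t, NULL, worker_main, p) == 0) {
                pthread_detach(t);
                pthread_mutex_lock(&p->lock);
                p->num_workers++;
                pthread_mutex_unlock(&p->lock);
        }
}

/* safe from any worker: just queue the request and wake the helper,
 * mirroring start_new_worker() above */
static void request_worker(struct pool *p)
{
        struct start_req *req = malloc(sizeof(*req));

        if (!req)
                return;
        req->pool = p;
        pthread_mutex_lock(&helper_lock);
        req->next = helper_head;
        helper_head = req;
        pthread_cond_signal(&helper_cond);
        pthread_mutex_unlock(&helper_lock);
}

/* the helper queue's single thread: the only place that blocks
 * inside thread creation */
static void *helper_main(void *arg)
{
        (void)arg;
        for (;;) {
                struct start_req *req;

                pthread_mutex_lock(&helper_lock);
                while (!helper_head)
                        pthread_cond_wait(&helper_cond, &helper_lock);
                req = helper_head;
                helper_head = req->next;
                pthread_mutex_unlock(&helper_lock);

                start_worker_now(req->pool);
                free(req);
        }
        return NULL;
}

int main(void)
{
        pthread_t helper;
        struct pool p = { "demo", PTHREAD_MUTEX_INITIALIZER, 0 };

        pthread_create(&helper, NULL, helper_main, NULL);
        request_worker(&p);
        sleep(1);               /* give the helper time to run */
        return 0;
}
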
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c1..5077746 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
        /* current number of running workers */
        int num_workers;
 
+       int num_workers_starting;
+
        /* max number of workers allowed.  changed by btrfs_start_workers */
        int max_workers;
 
@@ -73,6 +75,16 @@ struct btrfs_workers {
        /* force completions in the order they were queued */
        int ordered;
 
+       /* more workers required, but in an interrupt handler */
+       int atomic_start_pending;
+
+       /*
+        * are we allowed to sleep while starting workers or are we required
+        * to start them at a later time?  If we can't sleep, this indicates
+        * which queue we need to use to schedule thread creation.
+        */
+       struct btrfs_workers *atomic_worker_start;
+
        /* list with all the work threads.  The workers on the idle thread
         * may be actively servicing jobs, but they haven't yet hit the
         * idle thresh limit above.
@@ -90,6 +102,9 @@ struct btrfs_workers {
        /* lock for finding the next worker thread to queue on */
        spinlock_t lock;
 
+       /* lock for the ordered lists */
+       spinlock_t order_lock;
+
        /* extra name for this worker, used for current->name */
        char *name;
 };
@@ -97,7 +112,8 @@ struct btrfs_workers {
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+                       struct btrfs_workers *async_starter);
 int btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
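
Callers wire pools together through the new async_starter argument: the
generic helper pool is allowed to sleep, so it takes NULL, and every other
pool points back at it so worker creation can be deferred. A simplified
sketch of the updated setup (the corresponding disk-io.c hunk is not shown
in this excerpt, so names here follow the declarations above rather than a
verbatim call site):

        btrfs_init_workers(&fs_info->generic_worker, "genwork", 1, NULL);
        btrfs_init_workers(&fs_info->workers, "worker",
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
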
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0a..f6783a4 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@ struct btrfs_inode {
         * transid of the trans_handle that last modified this inode
         */
        u64 last_trans;
+
+       /*
+        * log transid when this inode was last modified
+        */
+       u64 last_sub_trans;
+
        /*
         * transid that last logged this inode
         */
@@ -128,6 +134,16 @@ struct btrfs_inode {
        u64 last_unlink_trans;
 
        /*
+        * Counters to keep track of the number of extent items we may use due
+        * to delalloc and such.  outstanding_extents is the number of extent
+        * items we think we'll end up using, and reserved_extents is the number
+        * of extent items we've reserved metadata for.
+        */
+       spinlock_t accounting_lock;
+       int reserved_extents;
+       int outstanding_extents;
+
+       /*
         * ordered_data_close is set by truncate when a file that used
         * to have good data has been truncated to zero.  When it is set
         * the btrfs file release call will add this inode to the
@@ -138,6 +154,7 @@ struct btrfs_inode {
         * of these.
         */
        unsigned ordered_data_close:1;
+       unsigned dummy_inode:1;
 
        struct inode vfs_inode;
 };
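
The two counters split "extent items we expect to create" from "extent items
we hold metadata reservations for". A sketch of the intended usage (this call
site is invented for illustration; btrfs_reserve_metadata_for_delalloc is
declared later in this patch, in the ctree.h hunk):

        /* a write makes us expect one more extent item for this inode */
        spin_lock(&BTRFS_I(inode)->accounting_lock);
        BTRFS_I(inode)->outstanding_extents++;
        spin_unlock(&BTRFS_I(inode)->accounting_lock);

        /* reserve metadata to back it; only bump reserved_extents once
         * the reservation has actually succeeded */
        if (!btrfs_reserve_metadata_for_delalloc(root, inode, 1)) {
                spin_lock(&BTRFS_I(inode)->accounting_lock);
                BTRFS_I(inode)->reserved_extents++;
                spin_unlock(&BTRFS_I(inode)->accounting_lock);
        }
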
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d..a11a320 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                 */
                set_page_extent_mapped(page);
                lock_extent(tree, last_offset, end, GFP_NOFS);
-               spin_lock(&em_tree->lock);
+               read_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, last_offset,
                                           PAGE_CACHE_SIZE);
-               spin_unlock(&em_tree->lock);
+               read_unlock(&em_tree->lock);
 
                if (!em || last_offset < em->start ||
                    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        em_tree = &BTRFS_I(inode)->extent_tree;
 
        /* we need the actual starting offset of this extent in the file */
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree,
                                   page_offset(bio->bi_io_vec->bv_page),
                                   PAGE_CACHE_SIZE);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
 
        compressed_len = em->block_len;
        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
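
The spin_lock to read_lock switch here only works if the extent map tree's
lock changed type from spinlock_t to rwlock_t, letting concurrent readers
look up mappings while writers still serialize (the disk-io.c hunks below use
write_lock around add_extent_mapping for exactly that reason). The
extent_map.h hunk is not in this excerpt; the implied declaration change is
roughly:

        struct extent_map_tree {
                struct rb_root map;
                rwlock_t lock;          /* was: spinlock_t lock */
        };
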
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fdcc05..ec96f3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        int split;
        int num_doubles = 0;
 
+       l = path->nodes[0];
+       slot = path->slots[0];
+       if (extend && data_size + btrfs_item_size_nr(l, slot) +
+           sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+               return -EOVERFLOW;
+
        /* first try to make some room by pushing left and right */
        if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
                wret = push_leaf_right(trans, root, path, data_size, 0);
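
The new check fails fast when an extend operation could never fit, even in an
empty leaf. A rough worked example (the sizes are illustrative approximations):
with 4K leaves, BTRFS_LEAF_DATA_SIZE(root) is the leaf size minus the roughly
100-byte btrfs_header, so asking split_leaf to grow an item to ~4000 bytes of
data plus the 25-byte struct btrfs_item exceeds the limit; it now returns
-EOVERFLOW instead of doubling the leaf over and over looking for room that
cannot exist.
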
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435c..e5dd628 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_DEV_ITEMS_OBJECTID 1ULL
 
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
@@ -670,21 +674,29 @@ struct btrfs_space_info {
        u64 bytes_reserved;     /* total bytes the allocator has reserved for
                                   current allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
-
-       /* delalloc accounting */
-       u64 bytes_delalloc;     /* number of bytes reserved for allocation,
-                                  this space is not necessarily reserved yet
-                                  by the allocator */
+       u64 bytes_super;        /* total bytes reserved for the super blocks */
+       u64 bytes_root;         /* the number of bytes needed to commit a
+                                  transaction */
        u64 bytes_may_use;      /* number of bytes that may be used for
-                                  delalloc */
+                                  delalloc/allocations */
+       u64 bytes_delalloc;     /* number of bytes currently reserved for
+                                  delayed allocation */
 
        int full;               /* indicates that we cannot allocate any more
                                   chunks for this space */
        int force_alloc;        /* set if we need to force a chunk alloc for
                                   this space */
+       int force_delalloc;     /* make people start doing filemap_flush until
+                                  we're under a threshold */
 
        struct list_head list;
 
+       /* for controlling how we free up space for allocations */
+       wait_queue_head_t allocate_wait;
+       wait_queue_head_t flush_wait;
+       int allocating_chunk;
+       int flushing;
+
        /* for block groups in our same type */
        struct list_head block_groups;
        spinlock_t lock;
@@ -726,6 +738,15 @@ enum btrfs_caching_type {
        BTRFS_CACHE_FINISHED    = 2,
 };
 
+struct btrfs_caching_control {
+       struct list_head list;
+       struct mutex mutex;
+       wait_queue_head_t wait;
+       struct btrfs_block_group_cache *block_group;
+       u64 progress;
+       atomic_t count;
+};
+
 struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
@@ -733,6 +754,7 @@ struct btrfs_block_group_cache {
        spinlock_t lock;
        u64 pinned;
        u64 reserved;
+       u64 bytes_super;
        u64 flags;
        u64 sectorsize;
        int extents_thresh;
@@ -742,8 +764,9 @@ struct btrfs_block_group_cache {
        int dirty;
 
        /* cache tracking stuff */
-       wait_queue_head_t caching_q;
        int cached;
+       struct btrfs_caching_control *caching_ctl;
+       u64 last_byte_to_unpin;
 
        struct btrfs_space_info *space_info;
 
@@ -782,13 +805,16 @@ struct btrfs_fs_info {
 
        /* the log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;
+
+       spinlock_t fs_roots_radix_lock;
        struct radix_tree_root fs_roots_radix;
 
        /* block group cache stuff */
        spinlock_t block_group_cache_lock;
        struct rb_root block_group_cache_tree;
 
-       struct extent_io_tree pinned_extents;
+       struct extent_io_tree freed_extents[2];
+       struct extent_io_tree *pinned_extents;
 
        /* logical->physical extent mapping */
        struct btrfs_mapping_tree mapping_tree;
@@ -822,11 +848,7 @@ struct btrfs_fs_info {
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
        struct mutex chunk_mutex;
-       struct mutex drop_mutex;
        struct mutex volume_mutex;
-       struct mutex tree_reloc_mutex;
-       struct rw_semaphore extent_commit_sem;
-
        /*
         * this protects the ordered operations list only while we are
         * processing all of the entries on it.  This way we make
@@ -835,10 +857,16 @@ struct btrfs_fs_info {
         * before jumping into the main commit.
         */
        struct mutex ordered_operations_mutex;
+       struct rw_semaphore extent_commit_sem;
+
+       struct rw_semaphore subvol_sem;
+
+       struct srcu_struct subvol_srcu;
 
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
+       struct list_head caching_block_groups;
 
        atomic_t nr_async_submits;
        atomic_t async_submit_draining;
@@ -882,6 +910,7 @@ struct btrfs_fs_info {
         * A third pool does submit_bio to avoid deadlocking with the other
         * two
         */
+       struct btrfs_workers generic_worker;
        struct btrfs_workers workers;
        struct btrfs_workers delalloc_workers;
        struct btrfs_workers endio_workers;
@@ -889,6 +918,7 @@ struct btrfs_fs_info {
        struct btrfs_workers endio_meta_write_workers;
        struct btrfs_workers endio_write_workers;
        struct btrfs_workers submit_workers;
+       struct btrfs_workers enospc_workers;
        /*
         * fixup workers take dirty pages that didn't properly go through
         * the cow mechanism and make them safe to write.  It happens
@@ -979,7 +1009,10 @@ struct btrfs_root {
        atomic_t log_writers;
        atomic_t log_commit[2];
        unsigned long log_transid;
+       unsigned long last_log_commit;
        unsigned long log_batch;
+       pid_t log_start_pid;
+       bool log_multiple_pids;
 
        u64 objectid;
        u64 last_trans;
@@ -996,10 +1029,12 @@ struct btrfs_root {
        u32 stripesize;
 
        u32 type;
-       u64 highest_inode;
-       u64 last_inode_alloc;
+
+       u64 highest_objectid;
        int ref_cows;
        int track_dirty;
+       int in_radix;
+
        u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;
@@ -1118,6 +1153,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 #define BTRFS_MOUNT_SSD_SPREAD         (1 << 8)
 #define BTRFS_MOUNT_NOSSD              (1 << 9)
+#define BTRFS_MOUNT_DISCARD            (1 << 10)
 
 #define BTRFS_MOUNT_TAGGED             (1 << 24)
 
@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_pinned_extents(struct btrfs_root *root,
-                               u64 bytenr, u64 num, int pin);
+int btrfs_pin_extent(struct btrfs_root *root,
+                    u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              struct extent_io_tree *unpin);
+                              struct btrfs_root *root);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+                                         struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+                                       struct inode *inode, int num_items);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
                                u64 bytes);
 void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
                                 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
                              u64 bytes);
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct extent_buffer *parent);
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
-                  struct btrfs_path *path,
-                  u64 root_id, u64 ref_id);
+                       struct btrfs_path *path,
+                       u64 root_id, u64 ref_id);
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
                       struct btrfs_root *tree_root,
-                      u64 root_id, u8 type, u64 ref_id,
-                      u64 dirid, u64 sequence,
+                      u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+                      const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *tree_root,
+                      u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
                       const char *name, int name_len);
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_key *key);
@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
                      u64 *found_objectid);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 int btrfs_set_root_node(struct btrfs_root_item *item,
                        struct extent_buffer *node);
 /* dir-item.c */
@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
                            struct btrfs_path *path, u64 dir,
                            u64 objectid, const char *name, int name_len,
                            int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+                           struct btrfs_path *path, u64 dirid,
+                           const char *name, int name_len);
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
                              struct btrfs_path *path,
                              const char *name, int name_len);
@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 offset);
 int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
 
 /* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 int btrfs_add_link(struct btrfs_trans_handle *trans,
                   struct inode *parent_inode, struct inode *inode,
                   const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       struct inode *dir, u64 objectid,
+                       const char *name, int name_len);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct inode *inode, u64 new_size,
@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
                     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *new_root, struct dentry *dentry,
+                            struct btrfs_root *new_root,
                             u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                         size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2290,7 +2348,7 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode,
                       u64 start, u64 end, u64 locked_end,
-                      u64 inline_limit, u64 *hint_block);
+                      u64 inline_limit, u64 *hint_block, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct inode *inode, u64 start, u64 end);
@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236..f3a6075 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
        return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+                           struct btrfs_path *path, u64 dirid,
+                           const char *name, int name_len)
+{
+       struct extent_buffer *leaf;
+       struct btrfs_dir_item *di;
+       struct btrfs_key key;
+       u32 nritems;
+       int ret;
+
+       key.objectid = dirid;
+       key.type = BTRFS_DIR_INDEX_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               return ERR_PTR(ret);
+
+       leaf = path->nodes[0];
+       nritems = btrfs_header_nritems(leaf);
+
+       while (1) {
+               if (path->slots[0] >= nritems) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               return ERR_PTR(ret);
+                       if (ret > 0)
+                               break;
+                       leaf = path->nodes[0];
+                       nritems = btrfs_header_nritems(leaf);
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+                       break;
+
+               di = btrfs_match_dir_item_name(root, path, name, name_len);
+               if (di)
+                       return di;
+
+               path->slots[0]++;
+       }
+       return NULL;
+}
+
 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, u64 dir,
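
btrfs_search_dir_index_item has to walk every DIR_INDEX key of a directory
because, unlike DIR_ITEM lookups, the index offset cannot be derived from the
name hash. A sketch of a caller (names and error handling abbreviated; the
subvolume-unlink path added elsewhere in this patch uses it along these
lines):

        struct btrfs_dir_item *di;
        struct btrfs_key key;
        u64 index;

        di = btrfs_search_dir_index_item(root, path, dir_objectid,
                                         name, name_len);
        if (IS_ERR(di) || !di)
                goto out;       /* search error, or no entry by that name */

        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        index = key.offset;     /* the entry's directory index number */
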
1135 diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
1136 index e83be2e..d4132aa 100644
1137 --- a/fs/btrfs/disk-io.c
1138 +++ b/fs/btrfs/disk-io.c
1139 @@ -41,6 +41,7 @@
1140  
1141  static struct extent_io_ops btree_extent_io_ops;
1142  static void end_workqueue_fn(struct btrfs_work *work);
1143 +static void free_fs_root(struct btrfs_root *root);
1144  
1145  static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
1146  
1147 @@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
1148         struct extent_map *em;
1149         int ret;
1150  
1151 -       spin_lock(&em_tree->lock);
1152 +       read_lock(&em_tree->lock);
1153         em = lookup_extent_mapping(em_tree, start, len);
1154         if (em) {
1155                 em->bdev =
1156                         BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
1157 -               spin_unlock(&em_tree->lock);
1158 +               read_unlock(&em_tree->lock);
1159                 goto out;
1160         }
1161 -       spin_unlock(&em_tree->lock);
1162 +       read_unlock(&em_tree->lock);
1163  
1164         em = alloc_extent_map(GFP_NOFS);
1165         if (!em) {
1166 @@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
1167         em->block_start = 0;
1168         em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
1169  
1170 -       spin_lock(&em_tree->lock);
1171 +       write_lock(&em_tree->lock);
1172         ret = add_extent_mapping(em_tree, em);
1173         if (ret == -EEXIST) {
1174                 u64 failed_start = em->start;
1175 @@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
1176                 free_extent_map(em);
1177                 em = NULL;
1178         }
1179 -       spin_unlock(&em_tree->lock);
1180 +       write_unlock(&em_tree->lock);
1181  
1182         if (ret)
1183                 em = ERR_PTR(ret);
1184 @@ -828,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
1185  int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1186  {
1187         return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
1188 -                                 buf->start, buf->start + buf->len - 1);
1189 +                                 buf->start >> PAGE_CACHE_SHIFT,
1190 +                                 (buf->start + buf->len - 1) >>
1191 +                                  PAGE_CACHE_SHIFT);
1192  }
1193  
1194  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1195 @@ -895,8 +898,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1196         root->fs_info = fs_info;
1197         root->objectid = objectid;
1198         root->last_trans = 0;
1199 -       root->highest_inode = 0;
1200 -       root->last_inode_alloc = 0;
1201 +       root->highest_objectid = 0;
1202         root->name = NULL;
1203         root->in_sysfs = 0;
1204         root->inode_tree.rb_node = NULL;
1205 @@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1206         atomic_set(&root->log_writers, 0);
1207         root->log_batch = 0;
1208         root->log_transid = 0;
1209 +       root->last_log_commit = 0;
1210         extent_io_tree_init(&root->dirty_log_pages,
1211                              fs_info->btree_inode->i_mapping, GFP_NOFS);
1212  
1213 @@ -952,14 +955,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1214                      root, fs_info, objectid);
1215         ret = btrfs_find_last_root(tree_root, objectid,
1216                                    &root->root_item, &root->root_key);
1217 +       if (ret > 0)
1218 +               return -ENOENT;
1219         BUG_ON(ret);
1220  
1221         generation = btrfs_root_generation(&root->root_item);
1222         blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1223         root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1224                                      blocksize, generation);
1225 -       root->commit_root = btrfs_root_node(root);
1226         BUG_ON(!root->node);
1227 +       root->commit_root = btrfs_root_node(root);
1228         return 0;
1229  }
1230  
1231 @@ -1085,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1232         WARN_ON(root->log_root);
1233         root->log_root = log_root;
1234         root->log_transid = 0;
1235 +       root->last_log_commit = 0;
1236         return 0;
1237  }
1238  
1239 @@ -1095,7 +1101,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1240         struct btrfs_fs_info *fs_info = tree_root->fs_info;
1241         struct btrfs_path *path;
1242         struct extent_buffer *l;
1243 -       u64 highest_inode;
1244         u64 generation;
1245         u32 blocksize;
1246         int ret = 0;
1247 @@ -1110,7 +1115,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1248                         kfree(root);
1249                         return ERR_PTR(ret);
1250                 }
1251 -               goto insert;
1252 +               goto out;
1253         }
1254  
1255         __setup_root(tree_root->nodesize, tree_root->leafsize,
1256 @@ -1120,39 +1125,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1257         path = btrfs_alloc_path();
1258         BUG_ON(!path);
1259         ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1260 -       if (ret != 0) {
1261 -               if (ret > 0)
1262 -                       ret = -ENOENT;
1263 -               goto out;
1264 +       if (ret == 0) {
1265 +               l = path->nodes[0];
1266 +               read_extent_buffer(l, &root->root_item,
1267 +                               btrfs_item_ptr_offset(l, path->slots[0]),
1268 +                               sizeof(root->root_item));
1269 +               memcpy(&root->root_key, location, sizeof(*location));
1270         }
1271 -       l = path->nodes[0];
1272 -       read_extent_buffer(l, &root->root_item,
1273 -              btrfs_item_ptr_offset(l, path->slots[0]),
1274 -              sizeof(root->root_item));
1275 -       memcpy(&root->root_key, location, sizeof(*location));
1276 -       ret = 0;
1277 -out:
1278 -       btrfs_release_path(root, path);
1279         btrfs_free_path(path);
1280         if (ret) {
1281 -               kfree(root);
1282 +               if (ret > 0)
1283 +                       ret = -ENOENT;
1284                 return ERR_PTR(ret);
1285         }
1286 +
1287         generation = btrfs_root_generation(&root->root_item);
1288         blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1289         root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1290                                      blocksize, generation);
1291         root->commit_root = btrfs_root_node(root);
1292         BUG_ON(!root->node);
1293 -insert:
1294 -       if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1295 +out:
1296 +       if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
1297                 root->ref_cows = 1;
1298 -               ret = btrfs_find_highest_inode(root, &highest_inode);
1299 -               if (ret == 0) {
1300 -                       root->highest_inode = highest_inode;
1301 -                       root->last_inode_alloc = highest_inode;
1302 -               }
1303 -       }
1304 +
1305         return root;
1306  }
1307  
1308 @@ -1187,39 +1183,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1309                 return fs_info->dev_root;
1310         if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1311                 return fs_info->csum_root;
1312 -
1313 +again:
1314 +       spin_lock(&fs_info->fs_roots_radix_lock);
1315         root = radix_tree_lookup(&fs_info->fs_roots_radix,
1316                                  (unsigned long)location->objectid);
1317 +       spin_unlock(&fs_info->fs_roots_radix_lock);
1318         if (root)
1319                 return root;
1320  
1321 +       ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1322 +       if (ret == 0)
1323 +               ret = -ENOENT;
1324 +       if (ret < 0)
1325 +               return ERR_PTR(ret);
1326 +
1327         root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1328         if (IS_ERR(root))
1329                 return root;
1330  
1331 +       WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1332         set_anon_super(&root->anon_super, NULL);
1333  
1334 +       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1335 +       if (ret)
1336 +               goto fail;
1337 +
1338 +       spin_lock(&fs_info->fs_roots_radix_lock);
1339         ret = radix_tree_insert(&fs_info->fs_roots_radix,
1340                                 (unsigned long)root->root_key.objectid,
1341                                 root);
1342 +       if (ret == 0)
1343 +               root->in_radix = 1;
1344 +       spin_unlock(&fs_info->fs_roots_radix_lock);
1345 +       radix_tree_preload_end();
1346         if (ret) {
1347 -               free_extent_buffer(root->node);
1348 -               kfree(root);
1349 -               return ERR_PTR(ret);
1350 +               if (ret == -EEXIST) {
1351 +                       free_fs_root(root);
1352 +                       goto again;
1353 +               }
1354 +               goto fail;
1355         }
1356 -       if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1357 -               ret = btrfs_find_dead_roots(fs_info->tree_root,
1358 -                                           root->root_key.objectid);
1359 -               BUG_ON(ret);
1360 +
1361 +       ret = btrfs_find_dead_roots(fs_info->tree_root,
1362 +                                   root->root_key.objectid);
1363 +       WARN_ON(ret);
1364 +
1365 +       if (!(fs_info->sb->s_flags & MS_RDONLY))
1366                 btrfs_orphan_cleanup(root);
1367 -       }
1368 +
1369         return root;
1370 +fail:
1371 +       free_fs_root(root);
1372 +       return ERR_PTR(ret);
1373  }
1374  
1375  struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1376                                       struct btrfs_key *location,
1377                                       const char *name, int namelen)
1378  {
1379 +       return btrfs_read_fs_root_no_name(fs_info, location);
1380 +#if 0
1381         struct btrfs_root *root;
1382         int ret;
1383  
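
The hunk above replaces the old unconditional radix-tree insert with the classic optimistic lookup-or-insert dance: look up under fs_roots_radix_lock, build the expensive object (reading the root from disk) with no locks held, preload the radix tree so the insert itself cannot fail on allocation under the spinlock, insert under the lock, and on -EEXIST free the loser's copy and retry the lookup. A minimal userspace sketch of the same shape, assuming a toy fixed array in place of the radix tree; cache_lookup, cache_insert, and get_obj are illustrative names, not btrfs or kernel APIs:

#include <pthread.h>
#include <stdlib.h>

#define CACHE_SLOTS 64

struct obj { long key; };

static struct obj *cache[CACHE_SLOTS];
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller must hold cache_lock */
static struct obj *cache_lookup(long key)
{
        for (int i = 0; i < CACHE_SLOTS; i++)
                if (cache[i] && cache[i]->key == key)
                        return cache[i];
        return NULL;
}

/* caller must hold cache_lock; returns -1 if the key is already present */
static int cache_insert(struct obj *o)
{
        if (cache_lookup(o->key))
                return -1;
        for (int i = 0; i < CACHE_SLOTS; i++) {
                if (!cache[i]) {
                        cache[i] = o;
                        return 0;
                }
        }
        return -1;
}

struct obj *get_obj(long key)
{
        struct obj *o;
again:
        pthread_mutex_lock(&cache_lock);
        o = cache_lookup(key);
        pthread_mutex_unlock(&cache_lock);
        if (o)
                return o;

        o = malloc(sizeof(*o));         /* expensive setup happens unlocked */
        if (!o)
                return NULL;
        o->key = key;

        pthread_mutex_lock(&cache_lock);
        if (cache_insert(o)) {
                pthread_mutex_unlock(&cache_lock);
                free(o);                /* lost the race; use the winner's copy */
                goto again;
        }
        pthread_mutex_unlock(&cache_lock);
        return o;
}

The radix_tree_preload()/radix_tree_preload_end() pair in the real code is the piece the sketch glosses over: it guarantees the insert under the spinlock never needs to allocate, which is why the GFP mask on the tree itself can drop to GFP_ATOMIC later in this patch.
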
1384 @@ -1236,7 +1259,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1385                 kfree(root);
1386                 return ERR_PTR(ret);
1387         }
1388 -#if 0
1389 +
1390         ret = btrfs_sysfs_add_root(root);
1391         if (ret) {
1392                 free_extent_buffer(root->node);
1393 @@ -1244,9 +1267,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1394                 kfree(root);
1395                 return ERR_PTR(ret);
1396         }
1397 -#endif
1398         root->in_sysfs = 1;
1399         return root;
1400 +#endif
1401  }
1402  
1403  static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1404 @@ -1325,9 +1348,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1405         offset = page_offset(page);
1406  
1407         em_tree = &BTRFS_I(inode)->extent_tree;
1408 -       spin_lock(&em_tree->lock);
1409 +       read_lock(&em_tree->lock);
1410         em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1411 -       spin_unlock(&em_tree->lock);
1412 +       read_unlock(&em_tree->lock);
1413         if (!em) {
1414                 __unplug_io_fn(bdi, page);
1415                 return;
1416 @@ -1359,8 +1382,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1417  
1418         err = bdi_register(bdi, NULL, "btrfs-%d",
1419                                 atomic_inc_return(&btrfs_bdi_num));
1420 -       if (err)
1421 +       if (err) {
1422 +               bdi_destroy(bdi);
1423                 return err;
1424 +       }
1425  
1426         bdi->ra_pages   = default_backing_dev_info.ra_pages;
1427         bdi->unplug_io_fn       = btrfs_unplug_io_fn;
1428 @@ -1450,9 +1475,12 @@ static int cleaner_kthread(void *arg)
1429                         break;
1430  
1431                 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1432 -               mutex_lock(&root->fs_info->cleaner_mutex);
1433 -               btrfs_clean_old_snapshots(root);
1434 -               mutex_unlock(&root->fs_info->cleaner_mutex);
1435 +
1436 +               if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1437 +                   mutex_trylock(&root->fs_info->cleaner_mutex)) {
1438 +                       btrfs_clean_old_snapshots(root);
1439 +                       mutex_unlock(&root->fs_info->cleaner_mutex);
1440 +               }
1441  
1442                 if (freezing(current)) {
1443                         refrigerator();
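
The cleaner change above swaps mutex_lock() for mutex_trylock(), so the cleaner kthread skips a pass instead of blocking behind whoever holds cleaner_mutex, and never runs snapshot cleanup on a read-only mount. A small userspace sketch of that pattern, assuming pthread_mutex_trylock's standard return convention (0 on success, EBUSY if already held); maybe_clean is an illustrative name:

#include <pthread.h>

static pthread_mutex_t cleaner_mutex = PTHREAD_MUTEX_INITIALIZER;

static void maybe_clean(int read_only, void (*clean)(void))
{
        if (read_only)
                return;                         /* never clean an RO mount */
        if (pthread_mutex_trylock(&cleaner_mutex) == 0) {
                clean();                        /* we own the lock */
                pthread_mutex_unlock(&cleaner_mutex);
        }
        /* EBUSY: someone else is in there; try again next wakeup */
}
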
1444 @@ -1557,15 +1585,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1445                 err = -ENOMEM;
1446                 goto fail;
1447         }
1448 -       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1449 +
1450 +       ret = init_srcu_struct(&fs_info->subvol_srcu);
1451 +       if (ret) {
1452 +               err = ret;
1453 +               goto fail;
1454 +       }
1455 +
1456 +       ret = setup_bdi(fs_info, &fs_info->bdi);
1457 +       if (ret) {
1458 +               err = ret;
1459 +               goto fail_srcu;
1460 +       }
1461 +
1462 +       fs_info->btree_inode = new_inode(sb);
1463 +       if (!fs_info->btree_inode) {
1464 +               err = -ENOMEM;
1465 +               goto fail_bdi;
1466 +       }
1467 +
1468 +       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1469         INIT_LIST_HEAD(&fs_info->trans_list);
1470         INIT_LIST_HEAD(&fs_info->dead_roots);
1471         INIT_LIST_HEAD(&fs_info->hashers);
1472         INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1473         INIT_LIST_HEAD(&fs_info->ordered_operations);
1474 +       INIT_LIST_HEAD(&fs_info->caching_block_groups);
1475         spin_lock_init(&fs_info->delalloc_lock);
1476         spin_lock_init(&fs_info->new_trans_lock);
1477         spin_lock_init(&fs_info->ref_cache_lock);
1478 +       spin_lock_init(&fs_info->fs_roots_radix_lock);
1479  
1480         init_completion(&fs_info->kobj_unregister);
1481         fs_info->tree_root = tree_root;
1482 @@ -1584,12 +1633,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1483         fs_info->sb = sb;
1484         fs_info->max_extent = (u64)-1;
1485         fs_info->max_inline = 8192 * 1024;
1486 -       if (setup_bdi(fs_info, &fs_info->bdi))
1487 -               goto fail_bdi;
1488 -       fs_info->btree_inode = new_inode(sb);
1489 -       fs_info->btree_inode->i_ino = 1;
1490 -       fs_info->btree_inode->i_nlink = 1;
1491 -       fs_info->metadata_ratio = 8;
1492 +       fs_info->metadata_ratio = 0;
1493  
1494         fs_info->thread_pool_size = min_t(unsigned long,
1495                                           num_online_cpus() + 2, 8);
1496 @@ -1600,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1497         sb->s_blocksize = 4096;
1498         sb->s_blocksize_bits = blksize_bits(4096);
1499  
1500 +       fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1501 +       fs_info->btree_inode->i_nlink = 1;
1502         /*
1503          * we set the i_size on the btree inode to the max possible int.
1504          * the real end of the address space is determined by all of
1505 @@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1506  
1507         BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1508  
1509 +       BTRFS_I(fs_info->btree_inode)->root = tree_root;
1510 +       memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1511 +              sizeof(struct btrfs_key));
1512 +       BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
1513 +       insert_inode_hash(fs_info->btree_inode);
1514 +
1515         spin_lock_init(&fs_info->block_group_cache_lock);
1516         fs_info->block_group_cache_tree.rb_node = NULL;
1517  
1518 -       extent_io_tree_init(&fs_info->pinned_extents,
1519 +       extent_io_tree_init(&fs_info->freed_extents[0],
1520                              fs_info->btree_inode->i_mapping, GFP_NOFS);
1521 +       extent_io_tree_init(&fs_info->freed_extents[1],
1522 +                            fs_info->btree_inode->i_mapping, GFP_NOFS);
1523 +       fs_info->pinned_extents = &fs_info->freed_extents[0];
1524         fs_info->do_barriers = 1;
1525  
1526 -       BTRFS_I(fs_info->btree_inode)->root = tree_root;
1527 -       memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1528 -              sizeof(struct btrfs_key));
1529 -       insert_inode_hash(fs_info->btree_inode);
1530  
1531         mutex_init(&fs_info->trans_mutex);
1532         mutex_init(&fs_info->ordered_operations_mutex);
1533         mutex_init(&fs_info->tree_log_mutex);
1534 -       mutex_init(&fs_info->drop_mutex);
1535         mutex_init(&fs_info->chunk_mutex);
1536         mutex_init(&fs_info->transaction_kthread_mutex);
1537         mutex_init(&fs_info->cleaner_mutex);
1538         mutex_init(&fs_info->volume_mutex);
1539 -       mutex_init(&fs_info->tree_reloc_mutex);
1540         init_rwsem(&fs_info->extent_commit_sem);
1541 +       init_rwsem(&fs_info->subvol_sem);
1542  
1543         btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1544         btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1545 @@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1546                 goto fail_iput;
1547         }
1548  
1549 -       /*
1550 -        * we need to start all the end_io workers up front because the
1551 -        * queue work function gets called at interrupt time, and so it
1552 -        * cannot dynamically grow.
1553 -        */
1554 +       btrfs_init_workers(&fs_info->generic_worker,
1555 +                          "genwork", 1, NULL);
1556 +
1557         btrfs_init_workers(&fs_info->workers, "worker",
1558 -                          fs_info->thread_pool_size);
1559 +                          fs_info->thread_pool_size,
1560 +                          &fs_info->generic_worker);
1561  
1562         btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1563 -                          fs_info->thread_pool_size);
1564 +                          fs_info->thread_pool_size,
1565 +                          &fs_info->generic_worker);
1566  
1567         btrfs_init_workers(&fs_info->submit_workers, "submit",
1568                            min_t(u64, fs_devices->num_devices,
1569 -                          fs_info->thread_pool_size));
1570 +                          fs_info->thread_pool_size),
1571 +                          &fs_info->generic_worker);
1572 +       btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1573 +                          fs_info->thread_pool_size,
1574 +                          &fs_info->generic_worker);
1575  
1576         /* a higher idle thresh on the submit workers makes it much more
1577          * likely that bios will be send down in a sane order to the
1578 @@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1579         fs_info->delalloc_workers.idle_thresh = 2;
1580         fs_info->delalloc_workers.ordered = 1;
1581  
1582 -       btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1583 +       btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
1584 +                          &fs_info->generic_worker);
1585         btrfs_init_workers(&fs_info->endio_workers, "endio",
1586 -                          fs_info->thread_pool_size);
1587 +                          fs_info->thread_pool_size,
1588 +                          &fs_info->generic_worker);
1589         btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1590 -                          fs_info->thread_pool_size);
1591 +                          fs_info->thread_pool_size,
1592 +                          &fs_info->generic_worker);
1593         btrfs_init_workers(&fs_info->endio_meta_write_workers,
1594 -                          "endio-meta-write", fs_info->thread_pool_size);
1595 +                          "endio-meta-write", fs_info->thread_pool_size,
1596 +                          &fs_info->generic_worker);
1597         btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1598 -                          fs_info->thread_pool_size);
1599 +                          fs_info->thread_pool_size,
1600 +                          &fs_info->generic_worker);
1601  
1602         /*
1603          * endios are largely parallel and should have a very
1604 @@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1605         fs_info->endio_workers.idle_thresh = 4;
1606         fs_info->endio_meta_workers.idle_thresh = 4;
1607  
1608 -       fs_info->endio_write_workers.idle_thresh = 64;
1609 -       fs_info->endio_meta_write_workers.idle_thresh = 64;
1610 +       fs_info->endio_write_workers.idle_thresh = 2;
1611 +       fs_info->endio_meta_write_workers.idle_thresh = 2;
1612  
1613         btrfs_start_workers(&fs_info->workers, 1);
1614 +       btrfs_start_workers(&fs_info->generic_worker, 1);
1615         btrfs_start_workers(&fs_info->submit_workers, 1);
1616         btrfs_start_workers(&fs_info->delalloc_workers, 1);
1617         btrfs_start_workers(&fs_info->fixup_workers, 1);
1618 -       btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1619 -       btrfs_start_workers(&fs_info->endio_meta_workers,
1620 -                           fs_info->thread_pool_size);
1621 -       btrfs_start_workers(&fs_info->endio_meta_write_workers,
1622 -                           fs_info->thread_pool_size);
1623 -       btrfs_start_workers(&fs_info->endio_write_workers,
1624 -                           fs_info->thread_pool_size);
1625 +       btrfs_start_workers(&fs_info->endio_workers, 1);
1626 +       btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1627 +       btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1628 +       btrfs_start_workers(&fs_info->endio_write_workers, 1);
1629 +       btrfs_start_workers(&fs_info->enospc_workers, 1);
1630  
1631         fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1632         fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1633 @@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1634                 }
1635         }
1636  
1637 +       ret = btrfs_find_orphan_roots(tree_root);
1638 +       BUG_ON(ret);
1639 +
1640         if (!(sb->s_flags & MS_RDONLY)) {
1641                 ret = btrfs_recover_relocation(tree_root);
1642                 BUG_ON(ret);
1643 @@ -1959,6 +2020,7 @@ fail_chunk_root:
1644         free_extent_buffer(chunk_root->node);
1645         free_extent_buffer(chunk_root->commit_root);
1646  fail_sb_buffer:
1647 +       btrfs_stop_workers(&fs_info->generic_worker);
1648         btrfs_stop_workers(&fs_info->fixup_workers);
1649         btrfs_stop_workers(&fs_info->delalloc_workers);
1650         btrfs_stop_workers(&fs_info->workers);
1651 @@ -1967,6 +2029,7 @@ fail_sb_buffer:
1652         btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1653         btrfs_stop_workers(&fs_info->endio_write_workers);
1654         btrfs_stop_workers(&fs_info->submit_workers);
1655 +       btrfs_stop_workers(&fs_info->enospc_workers);
1656  fail_iput:
1657         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1658         iput(fs_info->btree_inode);
1659 @@ -1975,6 +2038,8 @@ fail_iput:
1660         btrfs_mapping_tree_free(&fs_info->mapping_tree);
1661  fail_bdi:
1662         bdi_destroy(&fs_info->bdi);
1663 +fail_srcu:
1664 +       cleanup_srcu_struct(&fs_info->subvol_srcu);
1665  fail:
1666         kfree(extent_root);
1667         kfree(tree_root);
1668 @@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
1669  
1670  int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
1671  {
1672 -       WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
1673 +       spin_lock(&fs_info->fs_roots_radix_lock);
1674         radix_tree_delete(&fs_info->fs_roots_radix,
1675                           (unsigned long)root->root_key.objectid);
1676 +       spin_unlock(&fs_info->fs_roots_radix_lock);
1677 +
1678 +       if (btrfs_root_refs(&root->root_item) == 0)
1679 +               synchronize_srcu(&fs_info->subvol_srcu);
1680 +
1681 +       free_fs_root(root);
1682 +       return 0;
1683 +}
1684 +
1685 +static void free_fs_root(struct btrfs_root *root)
1686 +{
1687 +       WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
1688         if (root->anon_super.s_dev) {
1689                 down_write(&root->anon_super.s_umount);
1690                 kill_anon_super(&root->anon_super);
1691         }
1692 -       if (root->node)
1693 -               free_extent_buffer(root->node);
1694 -       if (root->commit_root)
1695 -               free_extent_buffer(root->commit_root);
1696 +       free_extent_buffer(root->node);
1697 +       free_extent_buffer(root->commit_root);
1698         kfree(root->name);
1699         kfree(root);
1700 -       return 0;
1701  }
1702  
1703  static int del_fs_roots(struct btrfs_fs_info *fs_info)
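
Note the ordering in btrfs_free_fs_root() above: the root is unhashed from the radix tree first, then, only if its on-disk refcount is zero (the subvolume is really going away), synchronize_srcu() waits out every reader that entered through srcu_read_lock(&subvol_srcu) before free_fs_root() runs; the read side shows up in export.c below. A rough userspace approximation of the guarantee being relied on, using an rwlock where acquiring the write side once plays the role of the grace period (real SRCU readers never block each other or delay anything except this one drain step):

#include <pthread.h>
#include <stdlib.h>

struct root { int refs; };

static pthread_rwlock_t subvol_lock = PTHREAD_RWLOCK_INITIALIZER;

/* read side: resolve and use the root entirely inside the read section,
 * or take a longer-lived reference (as btrfs_iget does) before leaving */
int read_root_field(struct root *(*find)(void))
{
        int val = -1;
        pthread_rwlock_rdlock(&subvol_lock);
        struct root *r = find();
        if (r)
                val = r->refs;  /* safe: the free below is blocked until unlock */
        pthread_rwlock_unlock(&subvol_lock);
        return val;
}

/* write side: unhash first, drain readers, then it is safe to free */
void drop_root(struct root *r, void (*unhash)(struct root *))
{
        unhash(r);                      /* new lookups can no longer find it */
        pthread_rwlock_wrlock(&subvol_lock);    /* "synchronize": drain */
        pthread_rwlock_unlock(&subvol_lock);
        free(r);                        /* no reader still holds a pointer */
}

find and unhash are hypothetical stand-ins for the radix-tree lookup and delete; the point is only the unhash-then-drain-then-free order.
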
1704 @@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
1705         struct btrfs_root *gang[8];
1706         int i;
1707  
1708 +       while (!list_empty(&fs_info->dead_roots)) {
1709 +               gang[0] = list_entry(fs_info->dead_roots.next,
1710 +                                    struct btrfs_root, root_list);
1711 +               list_del(&gang[0]->root_list);
1712 +
1713 +               if (gang[0]->in_radix) {
1714 +                       btrfs_free_fs_root(fs_info, gang[0]);
1715 +               } else {
1716 +                       free_extent_buffer(gang[0]->node);
1717 +                       free_extent_buffer(gang[0]->commit_root);
1718 +                       kfree(gang[0]);
1719 +               }
1720 +       }
1721 +
1722         while (1) {
1723                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1724                                              (void **)gang, 0,
1725 @@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
1726                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
1727                 for (i = 0; i < ret; i++) {
1728                         root_objectid = gang[i]->root_key.objectid;
1729 -                       ret = btrfs_find_dead_roots(fs_info->tree_root,
1730 -                                                   root_objectid);
1731 -                       BUG_ON(ret);
1732                         btrfs_orphan_cleanup(gang[i]);
1733                 }
1734                 root_objectid++;
1735 @@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root)
1736         free_extent_buffer(root->fs_info->csum_root->commit_root);
1737  
1738         btrfs_free_block_groups(root->fs_info);
1739 -       btrfs_free_pinned_extents(root->fs_info);
1740  
1741         del_fs_roots(fs_info);
1742  
1743         iput(fs_info->btree_inode);
1744  
1745 +       btrfs_stop_workers(&fs_info->generic_worker);
1746         btrfs_stop_workers(&fs_info->fixup_workers);
1747         btrfs_stop_workers(&fs_info->delalloc_workers);
1748         btrfs_stop_workers(&fs_info->workers);
1749 @@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root)
1750         btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1751         btrfs_stop_workers(&fs_info->endio_write_workers);
1752         btrfs_stop_workers(&fs_info->submit_workers);
1753 +       btrfs_stop_workers(&fs_info->enospc_workers);
1754  
1755         btrfs_close_devices(fs_info->fs_devices);
1756         btrfs_mapping_tree_free(&fs_info->mapping_tree);
1757  
1758         bdi_destroy(&fs_info->bdi);
1759 +       cleanup_srcu_struct(&fs_info->subvol_srcu);
1760  
1761         kfree(fs_info->extent_root);
1762         kfree(fs_info->tree_root);
1763 diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
1764 index 9596b40..ba5c3fd 100644
1765 --- a/fs/btrfs/export.c
1766 +++ b/fs/btrfs/export.c
1767 @@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
1768         len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
1769         type = FILEID_BTRFS_WITHOUT_PARENT;
1770  
1771 -       fid->objectid = BTRFS_I(inode)->location.objectid;
1772 +       fid->objectid = inode->i_ino;
1773         fid->root_objectid = BTRFS_I(inode)->root->objectid;
1774         fid->gen = inode->i_generation;
1775  
1776 @@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
1777  }
1778  
1779  static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
1780 -                                      u64 root_objectid, u32 generation)
1781 +                                      u64 root_objectid, u32 generation,
1782 +                                      int check_generation)
1783  {
1784 +       struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
1785         struct btrfs_root *root;
1786 +       struct dentry *dentry;
1787         struct inode *inode;
1788         struct btrfs_key key;
1789 +       int index;
1790 +       int err = 0;
1791 +
1792 +       if (objectid < BTRFS_FIRST_FREE_OBJECTID)
1793 +               return ERR_PTR(-ESTALE);
1794  
1795         key.objectid = root_objectid;
1796         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
1797         key.offset = (u64)-1;
1798  
1799 -       root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
1800 -       if (IS_ERR(root))
1801 -               return ERR_CAST(root);
1802 +       index = srcu_read_lock(&fs_info->subvol_srcu);
1803 +
1804 +       root = btrfs_read_fs_root_no_name(fs_info, &key);
1805 +       if (IS_ERR(root)) {
1806 +               err = PTR_ERR(root);
1807 +               goto fail;
1808 +       }
1809 +
1810 +       if (btrfs_root_refs(&root->root_item) == 0) {
1811 +               err = -ENOENT;
1812 +               goto fail;
1813 +       }
1814  
1815         key.objectid = objectid;
1816         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
1817         key.offset = 0;
1818  
1819         inode = btrfs_iget(sb, &key, root);
1820 -       if (IS_ERR(inode))
1821 -               return (void *)inode;
1822 +       if (IS_ERR(inode)) {
1823 +               err = PTR_ERR(inode);
1824 +               goto fail;
1825 +       }
1826 +
1827 +       srcu_read_unlock(&fs_info->subvol_srcu, index);
1828  
1829 -       if (generation != inode->i_generation) {
1830 +       if (check_generation && generation != inode->i_generation) {
1831                 iput(inode);
1832                 return ERR_PTR(-ESTALE);
1833         }
1834  
1835 -       return d_obtain_alias(inode);
1836 +       dentry = d_obtain_alias(inode);
1837 +       if (!IS_ERR(dentry))
1838 +               dentry->d_op = &btrfs_dentry_operations;
1839 +       return dentry;
1840 +fail:
1841 +       srcu_read_unlock(&fs_info->subvol_srcu, index);
1842 +       return ERR_PTR(err);
1843  }
1844  
1845  static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
1846 @@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
1847         objectid = fid->parent_objectid;
1848         generation = fid->parent_gen;
1849  
1850 -       return btrfs_get_dentry(sb, objectid, root_objectid, generation);
1851 +       return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
1852  }
1853  
1854  static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
1855 @@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
1856         root_objectid = fid->root_objectid;
1857         generation = fid->gen;
1858  
1859 -       return btrfs_get_dentry(sb, objectid, root_objectid, generation);
1860 +       return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
1861  }
1862  
1863  static struct dentry *btrfs_get_parent(struct dentry *child)
1864  {
1865         struct inode *dir = child->d_inode;
1866 +       struct dentry *dentry;
1867         struct btrfs_root *root = BTRFS_I(dir)->root;
1868 -       struct btrfs_key key;
1869         struct btrfs_path *path;
1870         struct extent_buffer *leaf;
1871 -       int slot;
1872 -       u64 objectid;
1873 +       struct btrfs_root_ref *ref;
1874 +       struct btrfs_key key;
1875 +       struct btrfs_key found_key;
1876         int ret;
1877  
1878         path = btrfs_alloc_path();
1879  
1880 -       key.objectid = dir->i_ino;
1881 -       btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
1882 -       key.offset = (u64)-1;
1883 +       if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
1884 +               key.objectid = root->root_key.objectid;
1885 +               key.type = BTRFS_ROOT_BACKREF_KEY;
1886 +               key.offset = (u64)-1;
1887 +               root = root->fs_info->tree_root;
1888 +       } else {
1889 +               key.objectid = dir->i_ino;
1890 +               key.type = BTRFS_INODE_REF_KEY;
1891 +               key.offset = (u64)-1;
1892 +       }
1893  
1894         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1895 -       if (ret < 0) {
1896 -               /* Error */
1897 -               btrfs_free_path(path);
1898 -               return ERR_PTR(ret);
1899 +       if (ret < 0)
1900 +               goto fail;
1901 +
1902 +       BUG_ON(ret == 0);
1903 +       if (path->slots[0] == 0) {
1904 +               ret = -ENOENT;
1905 +               goto fail;
1906         }
1907 +
1908 +       path->slots[0]--;
1909         leaf = path->nodes[0];
1910 -       slot = path->slots[0];
1911 -       if (ret) {
1912 -               /* btrfs_search_slot() returns the slot where we'd want to
1913 -                  insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
1914 -                  The _real_ backref, telling us what the parent inode
1915 -                  _actually_ is, will be in the slot _before_ the one
1916 -                  that btrfs_search_slot() returns. */
1917 -               if (!slot) {
1918 -                       /* Unless there is _no_ key in the tree before... */
1919 -                       btrfs_free_path(path);
1920 -                       return ERR_PTR(-EIO);
1921 -               }
1922 -               slot--;
1923 +
1924 +       btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1925 +       if (found_key.objectid != key.objectid || found_key.type != key.type) {
1926 +               ret = -ENOENT;
1927 +               goto fail;
1928         }
1929  
1930 -       btrfs_item_key_to_cpu(leaf, &key, slot);
1931 +       if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
1932 +               ref = btrfs_item_ptr(leaf, path->slots[0],
1933 +                                    struct btrfs_root_ref);
1934 +               key.objectid = btrfs_root_ref_dirid(leaf, ref);
1935 +       } else {
1936 +               key.objectid = found_key.offset;
1937 +       }
1938         btrfs_free_path(path);
1939  
1940 -       if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
1941 -               return ERR_PTR(-EINVAL);
1942 -
1943 -       objectid = key.offset;
1944 -
1945 -       /* If we are already at the root of a subvol, return the real root */
1946 -       if (objectid == dir->i_ino)
1947 -               return dget(dir->i_sb->s_root);
1948 +       if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
1949 +               return btrfs_get_dentry(root->fs_info->sb, key.objectid,
1950 +                                       found_key.offset, 0, 0);
1951 +       }
1952  
1953 -       /* Build a new key for the inode item */
1954 -       key.objectid = objectid;
1955 -       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
1956 +       key.type = BTRFS_INODE_ITEM_KEY;
1957         key.offset = 0;
1958 -
1959 -       return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
1960 +       dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
1961 +       if (!IS_ERR(dentry))
1962 +               dentry->d_op = &btrfs_dentry_operations;
1963 +       return dentry;
1964 +fail:
1965 +       btrfs_free_path(path);
1966 +       return ERR_PTR(ret);
1967  }
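
The rewritten btrfs_get_parent() above leans on the same btree idiom the deleted comment used to describe: search for a key whose offset is (u64)-1, which can never exist, so btrfs_search_slot() lands on the insertion point (ret > 0), and stepping back one slot yields the highest real INODE_REF or ROOT_BACKREF for that objectid. The same trick in miniature over a sorted array, standing in for a btree leaf (find_slot is an illustrative helper, not a btrfs function):

#include <stdio.h>
#include <stdint.h>

static int find_slot(const uint64_t *keys, int n, uint64_t probe)
{
        int slot = 0;
        while (slot < n && keys[slot] < probe)
                slot++;
        return slot;                    /* insertion point, like ret > 0 above */
}

int main(void)
{
        uint64_t keys[] = { 3, 7, 42 };         /* e.g. backref offsets */
        int slot = find_slot(keys, 3, UINT64_MAX);
        if (slot == 0)
                return 1;                       /* nothing before us: -ENOENT */
        slot--;                                 /* path->slots[0]-- above */
        printf("last real key: %llu\n", (unsigned long long)keys[slot]);
        return 0;
}

The ROOT_BACKREF branch then hands off to btrfs_get_dentry() with check_generation == 0, since crossing into the parent subvolume means the cached generation no longer applies.
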
1968  
1969  const struct export_operations btrfs_export_ops = {
1970 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
1971 index 72a2b9c..c56f916 100644
1972 --- a/fs/btrfs/extent-tree.c
1973 +++ b/fs/btrfs/extent-tree.c
1974 @@ -32,12 +32,12 @@
1975  #include "locking.h"
1976  #include "free-space-cache.h"
1977  
1978 -static int update_reserved_extents(struct btrfs_root *root,
1979 -                                  u64 bytenr, u64 num, int reserve);
1980  static int update_block_group(struct btrfs_trans_handle *trans,
1981                               struct btrfs_root *root,
1982                               u64 bytenr, u64 num_bytes, int alloc,
1983                               int mark_free);
1984 +static int update_reserved_extents(struct btrfs_block_group_cache *cache,
1985 +                                  u64 num_bytes, int reserve);
1986  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
1987                                 struct btrfs_root *root,
1988                                 u64 bytenr, u64 num_bytes, u64 parent,
1989 @@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
1990                                      u64 parent, u64 root_objectid,
1991                                      u64 flags, struct btrfs_disk_key *key,
1992                                      int level, struct btrfs_key *ins);
1993 -
1994  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1995                           struct btrfs_root *extent_root, u64 alloc_bytes,
1996                           u64 flags, int force);
1997 +static int pin_down_bytes(struct btrfs_trans_handle *trans,
1998 +                         struct btrfs_root *root,
1999 +                         struct btrfs_path *path,
2000 +                         u64 bytenr, u64 num_bytes,
2001 +                         int is_data, int reserved,
2002 +                         struct extent_buffer **must_clean);
2003 +static int find_next_key(struct btrfs_path *path, int level,
2004 +                        struct btrfs_key *key);
2005 +static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
2006 +                           int dump_block_groups);
2007  
2008  static noinline int
2009  block_group_cache_done(struct btrfs_block_group_cache *cache)
2010 @@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
2011         return ret;
2012  }
2013  
2014 -/*
2015 - * We always set EXTENT_LOCKED for the super mirror extents so we don't
2016 - * overwrite them, so those bits need to be unset.  Also, if we are unmounting
2017 - * with pinned extents still sitting there because we had a block group caching,
2018 - * we need to clear those now, since we are done.
2019 - */
2020 -void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
2021 +static int add_excluded_extent(struct btrfs_root *root,
2022 +                              u64 start, u64 num_bytes)
2023  {
2024 -       u64 start, end, last = 0;
2025 -       int ret;
2026 +       u64 end = start + num_bytes - 1;
2027 +       set_extent_bits(&root->fs_info->freed_extents[0],
2028 +                       start, end, EXTENT_UPTODATE, GFP_NOFS);
2029 +       set_extent_bits(&root->fs_info->freed_extents[1],
2030 +                       start, end, EXTENT_UPTODATE, GFP_NOFS);
2031 +       return 0;
2032 +}
2033  
2034 -       while (1) {
2035 -               ret = find_first_extent_bit(&info->pinned_extents, last,
2036 -                                           &start, &end,
2037 -                                           EXTENT_LOCKED|EXTENT_DIRTY);
2038 -               if (ret)
2039 -                       break;
2040 +static void free_excluded_extents(struct btrfs_root *root,
2041 +                                 struct btrfs_block_group_cache *cache)
2042 +{
2043 +       u64 start, end;
2044  
2045 -               clear_extent_bits(&info->pinned_extents, start, end,
2046 -                                 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
2047 -               last = end+1;
2048 -       }
2049 +       start = cache->key.objectid;
2050 +       end = start + cache->key.offset - 1;
2051 +
2052 +       clear_extent_bits(&root->fs_info->freed_extents[0],
2053 +                         start, end, EXTENT_UPTODATE, GFP_NOFS);
2054 +       clear_extent_bits(&root->fs_info->freed_extents[1],
2055 +                         start, end, EXTENT_UPTODATE, GFP_NOFS);
2056  }
2057  
2058 -static int remove_sb_from_cache(struct btrfs_root *root,
2059 -                               struct btrfs_block_group_cache *cache)
2060 +static int exclude_super_stripes(struct btrfs_root *root,
2061 +                                struct btrfs_block_group_cache *cache)
2062  {
2063 -       struct btrfs_fs_info *fs_info = root->fs_info;
2064         u64 bytenr;
2065         u64 *logical;
2066         int stripe_len;
2067 @@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
2068                                        cache->key.objectid, bytenr,
2069                                        0, &logical, &nr, &stripe_len);
2070                 BUG_ON(ret);
2071 +
2072                 while (nr--) {
2073 -                       try_lock_extent(&fs_info->pinned_extents,
2074 -                                       logical[nr],
2075 -                                       logical[nr] + stripe_len - 1, GFP_NOFS);
2076 +                       cache->bytes_super += stripe_len;
2077 +                       ret = add_excluded_extent(root, logical[nr],
2078 +                                                 stripe_len);
2079 +                       BUG_ON(ret);
2080                 }
2081 +
2082                 kfree(logical);
2083         }
2084 -
2085         return 0;
2086  }
2087  
2088 +static struct btrfs_caching_control *
2089 +get_caching_control(struct btrfs_block_group_cache *cache)
2090 +{
2091 +       struct btrfs_caching_control *ctl;
2092 +
2093 +       spin_lock(&cache->lock);
2094 +       if (cache->cached != BTRFS_CACHE_STARTED) {
2095 +               spin_unlock(&cache->lock);
2096 +               return NULL;
2097 +       }
2098 +
2099 +       ctl = cache->caching_ctl;
2100 +       atomic_inc(&ctl->count);
2101 +       spin_unlock(&cache->lock);
2102 +       return ctl;
2103 +}
2104 +
2105 +static void put_caching_control(struct btrfs_caching_control *ctl)
2106 +{
2107 +       if (atomic_dec_and_test(&ctl->count))
2108 +               kfree(ctl);
2109 +}
2110 +
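
get_caching_control()/put_caching_control() above are a standard refcounted-lookup pair: the get takes a reference under cache->lock (and returns NULL unless caching is actually in flight), so the control structure cannot vanish between "found it" and "used it"; whoever drops the last reference frees it. The equivalent shape in userspace C11 atomics, with ctl_alloc/ctl_get/ctl_put as illustrative names:

#include <stdatomic.h>
#include <stdlib.h>

struct ctl { atomic_int count; };

static struct ctl *ctl_alloc(void)
{
        struct ctl *c = malloc(sizeof(*c));
        if (c)
                atomic_init(&c->count, 2);      /* one for the worker,
                                                   one for the global list */
        return c;
}

static struct ctl *ctl_get(struct ctl *c)      /* caller holds the owner lock */
{
        if (c)
                atomic_fetch_add(&c->count, 1);
        return c;
}

static void ctl_put(struct ctl *c)
{
        /* free only if the previous value was 1, i.e. we were the last user */
        if (c && atomic_fetch_sub(&c->count, 1) == 1)
                free(c);
}
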
2111  /*
2112   * this is only called by cache_block_group, since we could have freed extents
2113   * we need to check the pinned_extents for any extents that can't be used yet
2114 @@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
2115         int ret;
2116  
2117         while (start < end) {
2118 -               ret = find_first_extent_bit(&info->pinned_extents, start,
2119 +               ret = find_first_extent_bit(info->pinned_extents, start,
2120                                             &extent_start, &extent_end,
2121 -                                           EXTENT_DIRTY|EXTENT_LOCKED);
2122 +                                           EXTENT_DIRTY | EXTENT_UPTODATE);
2123                 if (ret)
2124                         break;
2125  
2126 @@ -249,22 +283,27 @@ static int caching_kthread(void *data)
2127  {
2128         struct btrfs_block_group_cache *block_group = data;
2129         struct btrfs_fs_info *fs_info = block_group->fs_info;
2130 -       u64 last = 0;
2131 +       struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
2132 +       struct btrfs_root *extent_root = fs_info->extent_root;
2133         struct btrfs_path *path;
2134 -       int ret = 0;
2135 -       struct btrfs_key key;
2136         struct extent_buffer *leaf;
2137 -       int slot;
2138 +       struct btrfs_key key;
2139         u64 total_found = 0;
2140 -
2141 -       BUG_ON(!fs_info);
2142 +       u64 last = 0;
2143 +       u32 nritems;
2144 +       int ret = 0;
2145  
2146         path = btrfs_alloc_path();
2147         if (!path)
2148                 return -ENOMEM;
2149  
2150 -       atomic_inc(&block_group->space_info->caching_threads);
2151 +       exclude_super_stripes(extent_root, block_group);
2152 +       spin_lock(&block_group->space_info->lock);
2153 +       block_group->space_info->bytes_super += block_group->bytes_super;
2154 +       spin_unlock(&block_group->space_info->lock);
2155 +
2156         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
2157 +
2158         /*
2159          * We don't want to deadlock with somebody trying to allocate a new
2160          * extent for the extent root while also trying to search the extent
2161 @@ -277,74 +316,64 @@ static int caching_kthread(void *data)
2162  
2163         key.objectid = last;
2164         key.offset = 0;
2165 -       btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2166 +       key.type = BTRFS_EXTENT_ITEM_KEY;
2167  again:
2168 +       mutex_lock(&caching_ctl->mutex);
2169         /* need to make sure the commit_root doesn't disappear */
2170         down_read(&fs_info->extent_commit_sem);
2171  
2172 -       ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
2173 +       ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2174         if (ret < 0)
2175                 goto err;
2176  
2177 +       leaf = path->nodes[0];
2178 +       nritems = btrfs_header_nritems(leaf);
2179 +
2180         while (1) {
2181                 smp_mb();
2182 -               if (block_group->fs_info->closing > 1) {
2183 +               if (fs_info->closing > 1) {
2184                         last = (u64)-1;
2185                         break;
2186                 }
2187  
2188 -               leaf = path->nodes[0];
2189 -               slot = path->slots[0];
2190 -               if (slot >= btrfs_header_nritems(leaf)) {
2191 -                       ret = btrfs_next_leaf(fs_info->extent_root, path);
2192 -                       if (ret < 0)
2193 -                               goto err;
2194 -                       else if (ret)
2195 +               if (path->slots[0] < nritems) {
2196 +                       btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2197 +               } else {
2198 +                       ret = find_next_key(path, 0, &key);
2199 +                       if (ret)
2200                                 break;
2201  
2202 -                       if (need_resched() ||
2203 -                           btrfs_transaction_in_commit(fs_info)) {
2204 -                               leaf = path->nodes[0];
2205 -
2206 -                               /* this shouldn't happen, but if the
2207 -                                * leaf is empty just move on.
2208 -                                */
2209 -                               if (btrfs_header_nritems(leaf) == 0)
2210 -                                       break;
2211 -                               /*
2212 -                                * we need to copy the key out so that
2213 -                                * we are sure the next search advances
2214 -                                * us forward in the btree.
2215 -                                */
2216 -                               btrfs_item_key_to_cpu(leaf, &key, 0);
2217 -                               btrfs_release_path(fs_info->extent_root, path);
2218 -                               up_read(&fs_info->extent_commit_sem);
2219 +                       caching_ctl->progress = last;
2220 +                       btrfs_release_path(extent_root, path);
2221 +                       up_read(&fs_info->extent_commit_sem);
2222 +                       mutex_unlock(&caching_ctl->mutex);
2223 +                       if (btrfs_transaction_in_commit(fs_info))
2224                                 schedule_timeout(1);
2225 -                               goto again;
2226 -                       }
2227 +                       else
2228 +                               cond_resched();
2229 +                       goto again;
2230 +               }
2231  
2232 +               if (key.objectid < block_group->key.objectid) {
2233 +                       path->slots[0]++;
2234                         continue;
2235                 }
2236 -               btrfs_item_key_to_cpu(leaf, &key, slot);
2237 -               if (key.objectid < block_group->key.objectid)
2238 -                       goto next;
2239  
2240                 if (key.objectid >= block_group->key.objectid +
2241                     block_group->key.offset)
2242                         break;
2243  
2244 -               if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
2245 +               if (key.type == BTRFS_EXTENT_ITEM_KEY) {
2246                         total_found += add_new_free_space(block_group,
2247                                                           fs_info, last,
2248                                                           key.objectid);
2249                         last = key.objectid + key.offset;
2250 -               }
2251  
2252 -               if (total_found > (1024 * 1024 * 2)) {
2253 -                       total_found = 0;
2254 -                       wake_up(&block_group->caching_q);
2255 +                       if (total_found > (1024 * 1024 * 2)) {
2256 +                               total_found = 0;
2257 +                               wake_up(&caching_ctl->wait);
2258 +                       }
2259                 }
2260 -next:
2261                 path->slots[0]++;
2262         }
2263         ret = 0;
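
The reworked loop above is a restartable scan: when the current leaf is exhausted, it saves its position in caching_ctl->progress, releases the path, extent_commit_sem, and caching_ctl->mutex so writers and committers can make progress, reschedules (schedule_timeout(1) if a commit is running, cond_resched() otherwise), and goes back to "again" to re-search from the saved key. A compilable sketch of that drop-everything-and-retry shape, with a single lock and a toy fetch_batch() standing in for reading one leaf:

#include <pthread.h>
#include <sched.h>

static unsigned long limit = 5;

/* toy stand-in for "process one leaf and report the next key" */
static int fetch_batch(unsigned long cursor, unsigned long *next)
{
        if (cursor >= limit)
                return 1;               /* past the end: scan is done */
        *next = cursor + 1;
        return 0;
}

void scan(pthread_mutex_t *lock)
{
        unsigned long cursor = 0, next;

again:
        pthread_mutex_lock(lock);
        if (fetch_batch(cursor, &next) == 0) {
                cursor = next;              /* save progress first... */
                pthread_mutex_unlock(lock); /* ...then drop every lock */
                sched_yield();              /* cond_resched() analogue */
                goto again;                 /* re-search from the cursor */
        }
        pthread_mutex_unlock(lock);
}

Saving the cursor before dropping the locks is the whole trick: the tree may change underneath, but the fresh search can only move forward.
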
2264 @@ -352,33 +381,65 @@ next:
2265         total_found += add_new_free_space(block_group, fs_info, last,
2266                                           block_group->key.objectid +
2267                                           block_group->key.offset);
2268 +       caching_ctl->progress = (u64)-1;
2269  
2270         spin_lock(&block_group->lock);
2271 +       block_group->caching_ctl = NULL;
2272         block_group->cached = BTRFS_CACHE_FINISHED;
2273         spin_unlock(&block_group->lock);
2274  
2275  err:
2276         btrfs_free_path(path);
2277         up_read(&fs_info->extent_commit_sem);
2278 -       atomic_dec(&block_group->space_info->caching_threads);
2279 -       wake_up(&block_group->caching_q);
2280  
2281 +       free_excluded_extents(extent_root, block_group);
2282 +
2283 +       mutex_unlock(&caching_ctl->mutex);
2284 +       wake_up(&caching_ctl->wait);
2285 +
2286 +       put_caching_control(caching_ctl);
2287 +       atomic_dec(&block_group->space_info->caching_threads);
2288         return 0;
2289  }
2290  
2291  static int cache_block_group(struct btrfs_block_group_cache *cache)
2292  {
2293 +       struct btrfs_fs_info *fs_info = cache->fs_info;
2294 +       struct btrfs_caching_control *caching_ctl;
2295         struct task_struct *tsk;
2296         int ret = 0;
2297  
2298 +       smp_mb();
2299 +       if (cache->cached != BTRFS_CACHE_NO)
2300 +               return 0;
2301 +
2302 +       caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
2303 +       BUG_ON(!caching_ctl);
2304 +
2305 +       INIT_LIST_HEAD(&caching_ctl->list);
2306 +       mutex_init(&caching_ctl->mutex);
2307 +       init_waitqueue_head(&caching_ctl->wait);
2308 +       caching_ctl->block_group = cache;
2309 +       caching_ctl->progress = cache->key.objectid;
2310 +       /* one for caching kthread, one for caching block group list */
2311 +       atomic_set(&caching_ctl->count, 2);
2312 +
2313         spin_lock(&cache->lock);
2314         if (cache->cached != BTRFS_CACHE_NO) {
2315                 spin_unlock(&cache->lock);
2316 -               return ret;
2317 +               kfree(caching_ctl);
2318 +               return 0;
2319         }
2320 +       cache->caching_ctl = caching_ctl;
2321         cache->cached = BTRFS_CACHE_STARTED;
2322         spin_unlock(&cache->lock);
2323  
2324 +       down_write(&fs_info->extent_commit_sem);
2325 +       list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
2326 +       up_write(&fs_info->extent_commit_sem);
2327 +
2328 +       atomic_inc(&cache->space_info->caching_threads);
2329 +
2330         tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
2331                           cache->key.objectid);
2332         if (IS_ERR(tsk)) {
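
cache_block_group() in the hunk above is double-checked initialization: a cheap unlocked early-out (paired with the smp_mb()), the kzalloc done with no locks held, then a re-check under cache->lock that frees the allocation if another thread won the race, and only the winner flips cached to BTRFS_CACHE_STARTED and kicks off the kthread. Reduced to its skeleton in userspace terms (start_caching and struct cache are illustrative; the lock is assumed already initialized):

#include <pthread.h>
#include <stdlib.h>

enum { CACHE_NO, CACHE_STARTED };

struct cache { pthread_mutex_t lock; int state; void *ctl; };

int start_caching(struct cache *c)
{
        if (c->state != CACHE_NO)        /* cheap unlocked check */
                return 0;

        void *ctl = calloc(1, 64);       /* may sleep; done unlocked */
        if (!ctl)
                return -1;

        pthread_mutex_lock(&c->lock);
        if (c->state != CACHE_NO) {      /* re-check: someone beat us */
                pthread_mutex_unlock(&c->lock);
                free(ctl);
                return 0;
        }
        c->ctl = ctl;
        c->state = CACHE_STARTED;
        pthread_mutex_unlock(&c->lock);
        /* ...then start the worker, as kthread_run() does above */
        return 0;
}
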
2333 @@ -1507,22 +1568,22 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
2334         return ret;
2335  }
2336  
2337 -#ifdef BIO_RW_DISCARD
2338  static void btrfs_issue_discard(struct block_device *bdev,
2339                                 u64 start, u64 len)
2340  {
2341         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
2342  }
2343 -#endif
2344  
2345  static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2346                                 u64 num_bytes)
2347  {
2348 -#ifdef BIO_RW_DISCARD
2349         int ret;
2350         u64 map_length = num_bytes;
2351         struct btrfs_multi_bio *multi = NULL;
2352  
2353 +       if (!btrfs_test_opt(root, DISCARD))
2354 +               return 0;
2355 +
2356         /* Tell the block device(s) that the sectors can be discarded */
2357         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2358                               bytenr, &map_length, &multi, 0);
2359 @@ -1542,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2360         }
2361  
2362         return ret;
2363 -#else
2364 -       return 0;
2365 -#endif
2366  }
2367  
2368  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2369 @@ -1656,7 +1714,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2370                                                  parent, ref_root, flags,
2371                                                  ref->objectid, ref->offset,
2372                                                  &ins, node->ref_mod);
2373 -               update_reserved_extents(root, ins.objectid, ins.offset, 0);
2374         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2375                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2376                                              node->num_bytes, parent,
2377 @@ -1782,7 +1839,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2378                                                 extent_op->flags_to_set,
2379                                                 &extent_op->key,
2380                                                 ref->level, &ins);
2381 -               update_reserved_extents(root, ins.objectid, ins.offset, 0);
2382         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2383                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2384                                              node->num_bytes, parent, ref_root,
2385 @@ -1817,16 +1873,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2386                 BUG_ON(extent_op);
2387                 head = btrfs_delayed_node_to_head(node);
2388                 if (insert_reserved) {
2389 +                       int mark_free = 0;
2390 +                       struct extent_buffer *must_clean = NULL;
2391 +
2392 +                       ret = pin_down_bytes(trans, root, NULL,
2393 +                                            node->bytenr, node->num_bytes,
2394 +                                            head->is_data, 1, &must_clean);
2395 +                       if (ret > 0)
2396 +                               mark_free = 1;
2397 +
2398 +                       if (must_clean) {
2399 +                               clean_tree_block(NULL, root, must_clean);
2400 +                               btrfs_tree_unlock(must_clean);
2401 +                               free_extent_buffer(must_clean);
2402 +                       }
2403                         if (head->is_data) {
2404                                 ret = btrfs_del_csums(trans, root,
2405                                                       node->bytenr,
2406                                                       node->num_bytes);
2407                                 BUG_ON(ret);
2408                         }
2409 -                       btrfs_update_pinned_extents(root, node->bytenr,
2410 -                                                   node->num_bytes, 1);
2411 -                       update_reserved_extents(root, node->bytenr,
2412 -                                               node->num_bytes, 0);
2413 +                       if (mark_free) {
2414 +                               ret = btrfs_free_reserved_extent(root,
2415 +                                                       node->bytenr,
2416 +                                                       node->num_bytes);
2417 +                               BUG_ON(ret);
2418 +                       }
2419                 }
2420                 mutex_unlock(&head->mutex);
2421                 return 0;
2422 @@ -2691,60 +2763,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2423                                                        alloc_target);
2424  }
2425  
2426 +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2427 +{
2428 +       u64 num_bytes;
2429 +       int level;
2430 +
2431 +       level = BTRFS_MAX_LEVEL - 2;
2432 +       /*
2433 +        * NOTE: these calculations are absolutely the worst possible case.
2434 +        * This assumes that _every_ item we insert will require a new leaf, and
2435 +        * that the tree has grown to its maximum level size.
2436 +        */
2437 +
2438 +       /*
2439 +        * for every item we insert we could insert both an extent item and an
2440 +        * extent ref item.  Then for every item we insert, we will need to cow
2441 +        * both the original leaf, plus the leaf to the left and right of it.
2442 +        *
2443 +        * Unless we are talking about the extent root, in which case we just
2444 +        * want the number of items * 2, since we only need the extent item
2445 +        * plus its ref.
2445 +        */
2446 +       if (root == root->fs_info->extent_root)
2447 +               num_bytes = num_items * 2;
2448 +       else
2449 +               num_bytes = (num_items + (2 * num_items)) * 3;
2450 +
2451 +       /*
2452 +        * num_bytes is total number of leaves we could need times the leaf
2453 +        * size, and then for every leaf we could end up cow'ing 2 nodes per
2454 +        * level, down to the leaf level.
2455 +        */
2456 +       num_bytes = (num_bytes * root->leafsize) +
2457 +               (num_bytes * (level * 2)) * root->nodesize;
2458 +
2459 +       return num_bytes;
2460 +}
2461 +
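
To make the worst-case estimate above concrete: assuming 4K leaves and nodes and BTRFS_MAX_LEVEL == 8 (so level == 6), one item on a non-extent root costs (1 + 2*1) * 3 = 9 leaves, then 9*4096 + (9 * 12) * 4096 = 479232 bytes, roughly 468K reserved per item. A tiny standalone check of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long long num_items = 1, leafsize = 4096, nodesize = 4096;
        int level = 8 - 2;                      /* BTRFS_MAX_LEVEL - 2 */
        unsigned long long n = (num_items + 2 * num_items) * 3;
        unsigned long long bytes = n * leafsize +
                                   (n * (level * 2)) * nodesize;
        printf("%llu\n", bytes);                /* prints 479232 */
        return 0;
}

Deliberately pessimistic, as the comment says; over-reserving here only forces an earlier flush, while under-reserving risks ENOSPC mid-transaction.
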
2462  /*
2463 - * for now this just makes sure we have at least 5% of our metadata space free
2464 - * for use.
2465 + * Unreserve metadata space for delalloc.  If we have fewer reserved credits than
2466 + * we have extents, this function does nothing.
2467   */
2468 -int btrfs_check_metadata_free_space(struct btrfs_root *root)
2469 +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2470 +                                         struct inode *inode, int num_items)
2471  {
2472         struct btrfs_fs_info *info = root->fs_info;
2473         struct btrfs_space_info *meta_sinfo;
2474 -       u64 alloc_target, thresh;
2475 -       int committed = 0, ret;
2476 +       u64 num_bytes;
2477 +       u64 alloc_target;
2478 +       bool bug = false;
2479  
2480         /* get the space info for where the metadata will live */
2481         alloc_target = btrfs_get_alloc_profile(root, 0);
2482         meta_sinfo = __find_space_info(info, alloc_target);
2483  
2484 -again:
2485 +       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2486 +                                          num_items);
2487 +
2488         spin_lock(&meta_sinfo->lock);
2489 -       if (!meta_sinfo->full)
2490 -               thresh = meta_sinfo->total_bytes * 80;
2491 -       else
2492 -               thresh = meta_sinfo->total_bytes * 95;
2493 +       spin_lock(&BTRFS_I(inode)->accounting_lock);
2494 +       if (BTRFS_I(inode)->reserved_extents <=
2495 +           BTRFS_I(inode)->outstanding_extents) {
2496 +               spin_unlock(&BTRFS_I(inode)->accounting_lock);
2497 +               spin_unlock(&meta_sinfo->lock);
2498 +               return 0;
2499 +       }
2500 +       spin_unlock(&BTRFS_I(inode)->accounting_lock);
2501 +
2502 +       BTRFS_I(inode)->reserved_extents--;
2503 +       BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2504 +
2505 +       if (meta_sinfo->bytes_delalloc < num_bytes) {
2506 +               bug = true;
2507 +               meta_sinfo->bytes_delalloc = 0;
2508 +       } else {
2509 +               meta_sinfo->bytes_delalloc -= num_bytes;
2510 +       }
2511 +       spin_unlock(&meta_sinfo->lock);
2512 +
2513 +       BUG_ON(bug);
2514 +
2515 +       return 0;
2516 +}
2517 +
2518 +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2519 +{
2520 +       u64 thresh;
2521 +
2522 +       thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2523 +               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2524 +               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2525 +               meta_sinfo->bytes_may_use;
2526  
2527 +       thresh = meta_sinfo->total_bytes - thresh;
2528 +       thresh *= 80;
2529         do_div(thresh, 100);
2530 +       if (thresh <= meta_sinfo->bytes_delalloc)
2531 +               meta_sinfo->force_delalloc = 1;
2532 +       else
2533 +               meta_sinfo->force_delalloc = 0;
2534 +}
2535  
2536 -       if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2537 -           meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2538 -               struct btrfs_trans_handle *trans;
2539 -               if (!meta_sinfo->full) {
2540 -                       meta_sinfo->force_alloc = 1;
2541 -                       spin_unlock(&meta_sinfo->lock);
2542 +struct async_flush {
2543 +       struct btrfs_root *root;
2544 +       struct btrfs_space_info *info;
2545 +       struct btrfs_work work;
2546 +};
2547  
2548 -                       trans = btrfs_start_transaction(root, 1);
2549 -                       if (!trans)
2550 -                               return -ENOMEM;
2551 +static noinline void flush_delalloc_async(struct btrfs_work *work)
2552 +{
2553 +       struct async_flush *async;
2554 +       struct btrfs_root *root;
2555 +       struct btrfs_space_info *info;
2556  
2557 -                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2558 -                                            2 * 1024 * 1024, alloc_target, 0);
2559 -                       btrfs_end_transaction(trans, root);
2560 +       async = container_of(work, struct async_flush, work);
2561 +       root = async->root;
2562 +       info = async->info;
2563 +
2564 +       btrfs_start_delalloc_inodes(root);
2565 +       wake_up(&info->flush_wait);
2566 +       btrfs_wait_ordered_extents(root, 0);
2567 +
2568 +       spin_lock(&info->lock);
2569 +       info->flushing = 0;
2570 +       spin_unlock(&info->lock);
2571 +       wake_up(&info->flush_wait);
2572 +
2573 +       kfree(async);
2574 +}
2575 +
2576 +static void wait_on_flush(struct btrfs_space_info *info)
2577 +{
2578 +       DEFINE_WAIT(wait);
2579 +       u64 used;
2580 +
2581 +       while (1) {
2582 +               prepare_to_wait(&info->flush_wait, &wait,
2583 +                               TASK_UNINTERRUPTIBLE);
2584 +               spin_lock(&info->lock);
2585 +               if (!info->flushing) {
2586 +                       spin_unlock(&info->lock);
2587 +                       break;
2588 +               }
2589 +
2590 +               used = info->bytes_used + info->bytes_reserved +
2591 +                       info->bytes_pinned + info->bytes_readonly +
2592 +                       info->bytes_super + info->bytes_root +
2593 +                       info->bytes_may_use + info->bytes_delalloc;
2594 +               if (used < info->total_bytes) {
2595 +                       spin_unlock(&info->lock);
2596 +                       break;
2597 +               }
2598 +               spin_unlock(&info->lock);
2599 +               schedule();
2600 +       }
2601 +       finish_wait(&info->flush_wait, &wait);
2602 +}
2603 +
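The loop above is, in effect, wait_event() with a condition that has to be
sampled under info->lock, which is why it is open-coded; roughly (space_used()
is a hypothetical stand-in for the eight-field sum, not a drop-in replacement):

	wait_event(info->flush_wait,
		   !info->flushing || space_used(info) < info->total_bytes);

Note the second exit: waiters return as soon as enough space frees up, without
waiting out the entire ordered-extent drain.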
2604 +static void flush_delalloc(struct btrfs_root *root,
2605 +                                struct btrfs_space_info *info)
2606 +{
2607 +       struct async_flush *async;
2608 +       bool wait = false;
2609 +
2610 +       spin_lock(&info->lock);
2611 +
2612 +       if (!info->flushing) {
2613 +               info->flushing = 1;
2614 +               init_waitqueue_head(&info->flush_wait);
2615 +       } else {
2616 +               wait = true;
2617 +       }
2618 +
2619 +       spin_unlock(&info->lock);
2620 +
2621 +       if (wait) {
2622 +               wait_on_flush(info);
2623 +               return;
2624 +       }
2625 +
2626 +       async = kzalloc(sizeof(*async), GFP_NOFS);
2627 +       if (!async)
2628 +               goto flush;
2629 +
2630 +       async->root = root;
2631 +       async->info = info;
2632 +       async->work.func = flush_delalloc_async;
2633 +
2634 +       btrfs_queue_worker(&root->fs_info->enospc_workers,
2635 +                          &async->work);
2636 +       wait_on_flush(info);
2637 +       return;
2638 +
2639 +flush:
2640 +       btrfs_start_delalloc_inodes(root);
2641 +       btrfs_wait_ordered_extents(root, 0);
2642 +
2643 +       spin_lock(&info->lock);
2644 +       info->flushing = 0;
2645 +       spin_unlock(&info->lock);
2646 +       wake_up(&info->flush_wait);
2647 +}
2648 +
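	/*
	 * (annotation, not patch text) If the async_flush allocation fails,
	 * the flush simply runs synchronously in the caller via the flush:
	 * label, so the flushing bit is still cleared and every
	 * wait_on_flush() waiter is woken either way.
	 */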
2649 +static int maybe_allocate_chunk(struct btrfs_root *root,
2650 +                                struct btrfs_space_info *info)
2651 +{
2652 +       struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2653 +       struct btrfs_trans_handle *trans;
2654 +       bool wait = false;
2655 +       int ret = 0;
2656 +       u64 min_metadata;
2657 +       u64 free_space;
2658 +
2659 +       free_space = btrfs_super_total_bytes(disk_super);
2660 +       /*
2661 +        * we allow the metadata to grow to a max of 5GB or 5% of the
2662 +        * space in the volume, whichever is smaller.
2663 +        */
2664 +       min_metadata = min((u64)5 * 1024 * 1024 * 1024,
2665 +                            div64_u64(free_space * 5, 100));
2666 +       if (info->total_bytes >= min_metadata) {
2667 +               spin_unlock(&info->lock);
2668 +               return 0;
2669 +       }
2670 +
2671 +       if (info->full) {
2672 +               spin_unlock(&info->lock);
2673 +               return 0;
2674 +       }
2675 +
2676 +       if (!info->allocating_chunk) {
2677 +               info->force_alloc = 1;
2678 +               info->allocating_chunk = 1;
2679 +               init_waitqueue_head(&info->allocate_wait);
2680 +       } else {
2681 +               wait = true;
2682 +       }
2683 +
2684 +       spin_unlock(&info->lock);
2685 +
2686 +       if (wait) {
2687 +               wait_event(info->allocate_wait,
2688 +                          !info->allocating_chunk);
2689 +               return 1;
2690 +       }
2691 +
2692 +       trans = btrfs_start_transaction(root, 1);
2693 +       if (!trans) {
2694 +               ret = -ENOMEM;
2695 +               goto out;
2696 +       }
2697 +
2698 +       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2699 +                            4096 + 2 * 1024 * 1024,
2700 +                            info->flags, 0);
2701 +       btrfs_end_transaction(trans, root);
2702 +       if (ret)
2703 +               goto out;
2704 +out:
2705 +       spin_lock(&info->lock);
2706 +       info->allocating_chunk = 0;
2707 +       spin_unlock(&info->lock);
2708 +       wake_up(&info->allocate_wait);
2709 +
2710 +       if (ret)
2711 +               return 0;
2712 +       return 1;
2713 +}
2714 +
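A quick check of the growth cap above, with invented volume sizes; small
volumes are bounded by the 5% term, large ones by the 5GB constant:

	/* standalone restatement of the min() in maybe_allocate_chunk() */
	u64 cap = min((u64)5 * 1024 * 1024 * 1024,
		      div64_u64(volume_bytes * 5, 100));
	/* volume_bytes = 40GB -> cap = 2GB  (5% term wins)      */
	/* volume_bytes =  1TB -> cap = 5GB  (5GB constant wins) */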
2715 +/*
2716 + * Reserve metadata space for delalloc.
2717 + */
2718 +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2719 +                                       struct inode *inode, int num_items)
2720 +{
2721 +       struct btrfs_fs_info *info = root->fs_info;
2722 +       struct btrfs_space_info *meta_sinfo;
2723 +       u64 num_bytes;
2724 +       u64 used;
2725 +       u64 alloc_target;
2726 +       int flushed = 0;
2727 +       int force_delalloc;
2728 +
2729 +       /* get the space info for where the metadata will live */
2730 +       alloc_target = btrfs_get_alloc_profile(root, 0);
2731 +       meta_sinfo = __find_space_info(info, alloc_target);
2732 +
2733 +       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2734 +                                          num_items);
2735 +again:
2736 +       spin_lock(&meta_sinfo->lock);
2737 +
2738 +       force_delalloc = meta_sinfo->force_delalloc;
2739 +
2740 +       if (unlikely(!meta_sinfo->bytes_root))
2741 +               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2742 +
2743 +       if (!flushed)
2744 +               meta_sinfo->bytes_delalloc += num_bytes;
2745 +
2746 +       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2747 +               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2748 +               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2749 +               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2750 +
2751 +       if (used > meta_sinfo->total_bytes) {
2752 +               flushed++;
2753 +
2754 +               if (flushed == 1) {
2755 +                       if (maybe_allocate_chunk(root, meta_sinfo))
2756 +                               goto again;
2757 +                       flushed++;
2758 +               } else {
2759 +                       spin_unlock(&meta_sinfo->lock);
2760 +               }
2761 +
2762 +               if (flushed == 2) {
2763 +                       filemap_flush(inode->i_mapping);
2764 +                       goto again;
2765 +               } else if (flushed == 3) {
2766 +                       flush_delalloc(root, meta_sinfo);
2767                         goto again;
2768                 }
2769 +               spin_lock(&meta_sinfo->lock);
2770 +               meta_sinfo->bytes_delalloc -= num_bytes;
2771                 spin_unlock(&meta_sinfo->lock);
2772 +               printk(KERN_ERR "enospc, has %d, reserved %d\n",
2773 +                      BTRFS_I(inode)->outstanding_extents,
2774 +                      BTRFS_I(inode)->reserved_extents);
2775 +               dump_space_info(meta_sinfo, 0, 0);
2776 +               return -ENOSPC;
2777 +       }
2778  
2779 -               if (!committed) {
2780 -                       committed = 1;
2781 -                       trans = btrfs_join_transaction(root, 1);
2782 -                       if (!trans)
2783 -                               return -ENOMEM;
2784 -                       ret = btrfs_commit_transaction(trans, root);
2785 -                       if (ret)
2786 -                               return ret;
2787 +       BTRFS_I(inode)->reserved_extents++;
2788 +       check_force_delalloc(meta_sinfo);
2789 +       spin_unlock(&meta_sinfo->lock);
2790 +
2791 +       if (!flushed && force_delalloc)
2792 +               filemap_flush(inode->i_mapping);
2793 +
2794 +       return 0;
2795 +}
2796 +
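The function above escalates through three remedies before returning -ENOSPC:
allocate another metadata chunk, filemap_flush() the one inode, then the full
flush_delalloc().  A hedged caller sketch pairing it with the unreserve path
whose tail opens this hunk (do_delalloc_write() is invented):

	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
	if (ret)
		return ret;		/* -ENOSPC, all remedies spent */

	ret = do_delalloc_write(inode);	/* hypothetical write step */
	if (ret)			/* nothing consumed: back out */
		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
	return ret;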
2797 +/*
2798 + * unreserve num_items number of items worth of metadata space.  This needs to
2799 + * be paired with btrfs_reserve_metadata_space.
2800 + *
2801 + * NOTE: if you have the option, run this _AFTER_ you do a
2802 + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
2803 + * operations which will result in more used metadata, so we want to make sure we
2804 + * can do that without issue.
2805 + */
2806 +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
2807 +{
2808 +       struct btrfs_fs_info *info = root->fs_info;
2809 +       struct btrfs_space_info *meta_sinfo;
2810 +       u64 num_bytes;
2811 +       u64 alloc_target;
2812 +       bool bug = false;
2813 +
2814 +       /* get the space info for where the metadata will live */
2815 +       alloc_target = btrfs_get_alloc_profile(root, 0);
2816 +       meta_sinfo = __find_space_info(info, alloc_target);
2817 +
2818 +       num_bytes = calculate_bytes_needed(root, num_items);
2819 +
2820 +       spin_lock(&meta_sinfo->lock);
2821 +       if (meta_sinfo->bytes_may_use < num_bytes) {
2822 +               bug = true;
2823 +               meta_sinfo->bytes_may_use = 0;
2824 +       } else {
2825 +               meta_sinfo->bytes_may_use -= num_bytes;
2826 +       }
2827 +       spin_unlock(&meta_sinfo->lock);
2828 +
2829 +       BUG_ON(bug);
2830 +
2831 +       return 0;
2832 +}
2833 +
2834 +/*
2835 + * Reserve some metadata space for use.  We'll calculate the worst-case number
2836 + * of bytes that would be needed to modify num_items items.  If we
2837 + * have space, fantastic, if not, you get -ENOSPC.  Please call
2838 + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
2839 + * items you reserved, since whatever metadata you needed should have already
2840 + * been allocated.
2841 + *
2842 + * This will commit the transaction to make more space if we don't have enough
2843 + * metadata space.  The only time we don't do this is if we're reserving space
2844 + * inside of a transaction; then we will just return -ENOSPC and it is the
2845 + * caller's responsibility to handle it properly.
2846 + */
2847 +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
2848 +{
2849 +       struct btrfs_fs_info *info = root->fs_info;
2850 +       struct btrfs_space_info *meta_sinfo;
2851 +       u64 num_bytes;
2852 +       u64 used;
2853 +       u64 alloc_target;
2854 +       int retries = 0;
2855 +
2856 +       /* get the space info for where the metadata will live */
2857 +       alloc_target = btrfs_get_alloc_profile(root, 0);
2858 +       meta_sinfo = __find_space_info(info, alloc_target);
2859 +
2860 +       num_bytes = calculate_bytes_needed(root, num_items);
2861 +again:
2862 +       spin_lock(&meta_sinfo->lock);
2863 +
2864 +       if (unlikely(!meta_sinfo->bytes_root))
2865 +               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2866 +
2867 +       if (!retries)
2868 +               meta_sinfo->bytes_may_use += num_bytes;
2869 +
2870 +       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2871 +               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2872 +               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2873 +               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2874 +
2875 +       if (used > meta_sinfo->total_bytes) {
2876 +               retries++;
2877 +               if (retries == 1) {
2878 +                       if (maybe_allocate_chunk(root, meta_sinfo))
2879 +                               goto again;
2880 +                       retries++;
2881 +               } else {
2882 +                       spin_unlock(&meta_sinfo->lock);
2883 +               }
2884 +
2885 +               if (retries == 2) {
2886 +                       flush_delalloc(root, meta_sinfo);
2887                         goto again;
2888                 }
2889 +               spin_lock(&meta_sinfo->lock);
2890 +               meta_sinfo->bytes_may_use -= num_bytes;
2891 +               spin_unlock(&meta_sinfo->lock);
2892 +
2893 +               dump_space_info(meta_sinfo, 0, 0);
2894                 return -ENOSPC;
2895         }
2896 +
2897 +       check_force_delalloc(meta_sinfo);
2898         spin_unlock(&meta_sinfo->lock);
2899  
2900         return 0;
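Putting the two comments above together, a minimal transaction-wrapped update
might look like this; a sketch only, with error handling trimmed and
modify_one_item() invented:

	ret = btrfs_reserve_metadata_space(root, 1);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 1);
	modify_one_item(trans, root);		/* hypothetical */
	btrfs_end_transaction(trans, root);

	/* unreserve last, per the NOTE: ending the transaction may run
	 * delayed refs that consume metadata */
	btrfs_unreserve_metadata_space(root, 1);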
2901 @@ -2764,13 +3224,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2902         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2903  
2904         data_sinfo = BTRFS_I(inode)->space_info;
2905 +       if (!data_sinfo)
2906 +               goto alloc;
2907 +
2908  again:
2909         /* make sure we have enough space to handle the data first */
2910         spin_lock(&data_sinfo->lock);
2911         if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2912             data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2913             data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2914 -           data_sinfo->bytes_may_use < bytes) {
2915 +           data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
2916                 struct btrfs_trans_handle *trans;
2917  
2918                 /*
2919 @@ -2782,7 +3245,7 @@ again:
2920  
2921                         data_sinfo->force_alloc = 1;
2922                         spin_unlock(&data_sinfo->lock);
2923 -
2924 +alloc:
2925                         alloc_target = btrfs_get_alloc_profile(root, 1);
2926                         trans = btrfs_start_transaction(root, 1);
2927                         if (!trans)
2928 @@ -2794,12 +3257,17 @@ again:
2929                         btrfs_end_transaction(trans, root);
2930                         if (ret)
2931                                 return ret;
2932 +
2933 +                       if (!data_sinfo) {
2934 +                               btrfs_set_inode_space_info(root, inode);
2935 +                               data_sinfo = BTRFS_I(inode)->space_info;
2936 +                       }
2937                         goto again;
2938                 }
2939                 spin_unlock(&data_sinfo->lock);
2940  
2941                 /* commit the current transaction and try again */
2942 -               if (!committed) {
2943 +               if (!committed && !root->fs_info->open_ioctl_trans) {
2944                         committed = 1;
2945                         trans = btrfs_join_transaction(root, 1);
2946                         if (!trans)
2947 @@ -2827,7 +3295,7 @@ again:
2948         BTRFS_I(inode)->reserved_bytes += bytes;
2949         spin_unlock(&data_sinfo->lock);
2950  
2951 -       return btrfs_check_metadata_free_space(root);
2952 +       return 0;
2953  }
2954  
2955  /*
2956 @@ -2926,17 +3394,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2957         BUG_ON(!space_info);
2958  
2959         spin_lock(&space_info->lock);
2960 -       if (space_info->force_alloc) {
2961 +       if (space_info->force_alloc)
2962                 force = 1;
2963 -               space_info->force_alloc = 0;
2964 -       }
2965         if (space_info->full) {
2966                 spin_unlock(&space_info->lock);
2967                 goto out;
2968         }
2969  
2970         thresh = space_info->total_bytes - space_info->bytes_readonly;
2971 -       thresh = div_factor(thresh, 6);
2972 +       thresh = div_factor(thresh, 8);
2973         if (!force &&
2974            (space_info->bytes_used + space_info->bytes_pinned +
2975             space_info->bytes_reserved + alloc_bytes) < thresh) {
2976 @@ -2950,7 +3416,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2977          * we keep a reasonable number of metadata chunks allocated in the
2978          * FS as well.
2979          */
2980 -       if (flags & BTRFS_BLOCK_GROUP_DATA) {
2981 +       if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
2982                 fs_info->data_chunk_allocations++;
2983                 if (!(fs_info->data_chunk_allocations %
2984                       fs_info->metadata_ratio))
2985 @@ -2958,8 +3424,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2986         }
2987  
2988         ret = btrfs_alloc_chunk(trans, extent_root, flags);
2989 +       spin_lock(&space_info->lock);
2990         if (ret)
2991                 space_info->full = 1;
2992 +       space_info->force_alloc = 0;
2993 +       spin_unlock(&space_info->lock);
2994  out:
2995         mutex_unlock(&extent_root->fs_info->chunk_mutex);
2996         return ret;
2997 @@ -3008,10 +3477,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2998                 num_bytes = min(total, cache->key.offset - byte_in_group);
2999                 if (alloc) {
3000                         old_val += num_bytes;
3001 +                       btrfs_set_block_group_used(&cache->item, old_val);
3002 +                       cache->reserved -= num_bytes;
3003                         cache->space_info->bytes_used += num_bytes;
3004 +                       cache->space_info->bytes_reserved -= num_bytes;
3005                         if (cache->ro)
3006                                 cache->space_info->bytes_readonly -= num_bytes;
3007 -                       btrfs_set_block_group_used(&cache->item, old_val);
3008                         spin_unlock(&cache->lock);
3009                         spin_unlock(&cache->space_info->lock);
3010                 } else {
3011 @@ -3056,127 +3527,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3012         return bytenr;
3013  }
3014  
3015 -int btrfs_update_pinned_extents(struct btrfs_root *root,
3016 -                               u64 bytenr, u64 num, int pin)
3017 +/*
3018 + * this function must be called within transaction
3019 + */
3020 +int btrfs_pin_extent(struct btrfs_root *root,
3021 +                    u64 bytenr, u64 num_bytes, int reserved)
3022  {
3023 -       u64 len;
3024 -       struct btrfs_block_group_cache *cache;
3025         struct btrfs_fs_info *fs_info = root->fs_info;
3026 +       struct btrfs_block_group_cache *cache;
3027  
3028 -       if (pin)
3029 -               set_extent_dirty(&fs_info->pinned_extents,
3030 -                               bytenr, bytenr + num - 1, GFP_NOFS);
3031 -
3032 -       while (num > 0) {
3033 -               cache = btrfs_lookup_block_group(fs_info, bytenr);
3034 -               BUG_ON(!cache);
3035 -               len = min(num, cache->key.offset -
3036 -                         (bytenr - cache->key.objectid));
3037 -               if (pin) {
3038 -                       spin_lock(&cache->space_info->lock);
3039 -                       spin_lock(&cache->lock);
3040 -                       cache->pinned += len;
3041 -                       cache->space_info->bytes_pinned += len;
3042 -                       spin_unlock(&cache->lock);
3043 -                       spin_unlock(&cache->space_info->lock);
3044 -                       fs_info->total_pinned += len;
3045 -               } else {
3046 -                       int unpin = 0;
3047 +       cache = btrfs_lookup_block_group(fs_info, bytenr);
3048 +       BUG_ON(!cache);
3049  
3050 -                       /*
3051 -                        * in order to not race with the block group caching, we
3052 -                        * only want to unpin the extent if we are cached.  If
3053 -                        * we aren't cached, we want to start async caching this
3054 -                        * block group so we can free the extent the next time
3055 -                        * around.
3056 -                        */
3057 -                       spin_lock(&cache->space_info->lock);
3058 -                       spin_lock(&cache->lock);
3059 -                       unpin = (cache->cached == BTRFS_CACHE_FINISHED);
3060 -                       if (likely(unpin)) {
3061 -                               cache->pinned -= len;
3062 -                               cache->space_info->bytes_pinned -= len;
3063 -                               fs_info->total_pinned -= len;
3064 -                       }
3065 -                       spin_unlock(&cache->lock);
3066 -                       spin_unlock(&cache->space_info->lock);
3067 +       spin_lock(&cache->space_info->lock);
3068 +       spin_lock(&cache->lock);
3069 +       cache->pinned += num_bytes;
3070 +       cache->space_info->bytes_pinned += num_bytes;
3071 +       if (reserved) {
3072 +               cache->reserved -= num_bytes;
3073 +               cache->space_info->bytes_reserved -= num_bytes;
3074 +       }
3075 +       spin_unlock(&cache->lock);
3076 +       spin_unlock(&cache->space_info->lock);
3077  
3078 -                       if (likely(unpin))
3079 -                               clear_extent_dirty(&fs_info->pinned_extents,
3080 -                                                  bytenr, bytenr + len -1,
3081 -                                                  GFP_NOFS);
3082 -                       else
3083 -                               cache_block_group(cache);
3084 +       btrfs_put_block_group(cache);
3085  
3086 -                       if (unpin)
3087 -                               btrfs_add_free_space(cache, bytenr, len);
3088 -               }
3089 -               btrfs_put_block_group(cache);
3090 -               bytenr += len;
3091 -               num -= len;
3092 +       set_extent_dirty(fs_info->pinned_extents,
3093 +                        bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3094 +       return 0;
3095 +}
3096 +
3097 +static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3098 +                                  u64 num_bytes, int reserve)
3099 +{
3100 +       spin_lock(&cache->space_info->lock);
3101 +       spin_lock(&cache->lock);
3102 +       if (reserve) {
3103 +               cache->reserved += num_bytes;
3104 +               cache->space_info->bytes_reserved += num_bytes;
3105 +       } else {
3106 +               cache->reserved -= num_bytes;
3107 +               cache->space_info->bytes_reserved -= num_bytes;
3108         }
3109 +       spin_unlock(&cache->lock);
3110 +       spin_unlock(&cache->space_info->lock);
3111         return 0;
3112  }
3113  
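	/*
	 * (annotation, my reading of the patch) Rough life cycle of the
	 * counters these helpers now manage:
	 *
	 *   find_free_extent():       free           -> bytes_reserved
	 *   update_block_group():     bytes_reserved -> bytes_used
	 *   btrfs_pin_extent(.., 1):  bytes_reserved -> bytes_pinned
	 *   unpin_extent_range():     bytes_pinned   -> free, post-commit
	 */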
3114 -static int update_reserved_extents(struct btrfs_root *root,
3115 -                                  u64 bytenr, u64 num, int reserve)
3116 +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3117 +                               struct btrfs_root *root)
3118  {
3119 -       u64 len;
3120 -       struct btrfs_block_group_cache *cache;
3121         struct btrfs_fs_info *fs_info = root->fs_info;
3122 +       struct btrfs_caching_control *next;
3123 +       struct btrfs_caching_control *caching_ctl;
3124 +       struct btrfs_block_group_cache *cache;
3125  
3126 -       while (num > 0) {
3127 -               cache = btrfs_lookup_block_group(fs_info, bytenr);
3128 -               BUG_ON(!cache);
3129 -               len = min(num, cache->key.offset -
3130 -                         (bytenr - cache->key.objectid));
3131 +       down_write(&fs_info->extent_commit_sem);
3132  
3133 -               spin_lock(&cache->space_info->lock);
3134 -               spin_lock(&cache->lock);
3135 -               if (reserve) {
3136 -                       cache->reserved += len;
3137 -                       cache->space_info->bytes_reserved += len;
3138 +       list_for_each_entry_safe(caching_ctl, next,
3139 +                                &fs_info->caching_block_groups, list) {
3140 +               cache = caching_ctl->block_group;
3141 +               if (block_group_cache_done(cache)) {
3142 +                       cache->last_byte_to_unpin = (u64)-1;
3143 +                       list_del_init(&caching_ctl->list);
3144 +                       put_caching_control(caching_ctl);
3145                 } else {
3146 -                       cache->reserved -= len;
3147 -                       cache->space_info->bytes_reserved -= len;
3148 +                       cache->last_byte_to_unpin = caching_ctl->progress;
3149                 }
3150 -               spin_unlock(&cache->lock);
3151 -               spin_unlock(&cache->space_info->lock);
3152 -               btrfs_put_block_group(cache);
3153 -               bytenr += len;
3154 -               num -= len;
3155         }
3156 +
3157 +       if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3158 +               fs_info->pinned_extents = &fs_info->freed_extents[1];
3159 +       else
3160 +               fs_info->pinned_extents = &fs_info->freed_extents[0];
3161 +
3162 +       up_write(&fs_info->extent_commit_sem);
3163         return 0;
3164  }
3165  
3166 -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
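	/*
	 * Sketch of the swap above, as I read it: pinned_extents always
	 * points at one of the two freed_extents trees.  After the swap,
	 * extents pinned from here on land in the other tree, while
	 * btrfs_finish_extent_commit() below drains the tree that was
	 * live before the swap.
	 */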
3167 +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3168  {
3169 -       u64 last = 0;
3170 -       u64 start;
3171 -       u64 end;
3172 -       struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
3173 -       int ret;
3174 +       struct btrfs_fs_info *fs_info = root->fs_info;
3175 +       struct btrfs_block_group_cache *cache = NULL;
3176 +       u64 len;
3177  
3178 -       while (1) {
3179 -               ret = find_first_extent_bit(pinned_extents, last,
3180 -                                           &start, &end, EXTENT_DIRTY);
3181 -               if (ret)
3182 -                       break;
3183 +       while (start <= end) {
3184 +               if (!cache ||
3185 +                   start >= cache->key.objectid + cache->key.offset) {
3186 +                       if (cache)
3187 +                               btrfs_put_block_group(cache);
3188 +                       cache = btrfs_lookup_block_group(fs_info, start);
3189 +                       BUG_ON(!cache);
3190 +               }
3191  
3192 -               set_extent_dirty(copy, start, end, GFP_NOFS);
3193 -               last = end + 1;
3194 +               len = cache->key.objectid + cache->key.offset - start;
3195 +               len = min(len, end + 1 - start);
3196 +
3197 +               if (start < cache->last_byte_to_unpin) {
3198 +                       len = min(len, cache->last_byte_to_unpin - start);
3199 +                       btrfs_add_free_space(cache, start, len);
3200 +               }
3201 +
3202 +               spin_lock(&cache->space_info->lock);
3203 +               spin_lock(&cache->lock);
3204 +               cache->pinned -= len;
3205 +               cache->space_info->bytes_pinned -= len;
3206 +               spin_unlock(&cache->lock);
3207 +               spin_unlock(&cache->space_info->lock);
3208 +
3209 +               start += len;
3210         }
3211 +
3212 +       if (cache)
3213 +               btrfs_put_block_group(cache);
3214         return 0;
3215  }
3216  
3217  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3218 -                              struct btrfs_root *root,
3219 -                              struct extent_io_tree *unpin)
3220 +                              struct btrfs_root *root)
3221  {
3222 +       struct btrfs_fs_info *fs_info = root->fs_info;
3223 +       struct extent_io_tree *unpin;
3224         u64 start;
3225         u64 end;
3226         int ret;
3227  
3228 +       if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3229 +               unpin = &fs_info->freed_extents[1];
3230 +       else
3231 +               unpin = &fs_info->freed_extents[0];
3232 +
3233         while (1) {
3234                 ret = find_first_extent_bit(unpin, 0, &start, &end,
3235                                             EXTENT_DIRTY);
3236 @@ -3185,10 +3665,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3237  
3238                 ret = btrfs_discard_extent(root, start, end + 1 - start);
3239  
3240 -               /* unlocks the pinned mutex */
3241 -               btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
3242                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3243 -
3244 +               unpin_extent_range(root, start, end);
3245                 cond_resched();
3246         }
3247  
3248 @@ -3198,7 +3676,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3249  static int pin_down_bytes(struct btrfs_trans_handle *trans,
3250                           struct btrfs_root *root,
3251                           struct btrfs_path *path,
3252 -                         u64 bytenr, u64 num_bytes, int is_data,
3253 +                         u64 bytenr, u64 num_bytes,
3254 +                         int is_data, int reserved,
3255                           struct extent_buffer **must_clean)
3256  {
3257         int err = 0;
3258 @@ -3207,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3259         if (is_data)
3260                 goto pinit;
3261  
3262 +       /*
3263 +        * discard is sloooow, and so triggering discards on
3264 +        * individual btree blocks isn't a good plan.  Just
3265 +        * pin everything in discard mode.
3266 +        */
3267 +       if (btrfs_test_opt(root, DISCARD))
3268 +               goto pinit;
3269 +
3270         buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3271         if (!buf)
3272                 goto pinit;
3273 @@ -3230,15 +3717,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3274         }
3275         free_extent_buffer(buf);
3276  pinit:
3277 -       btrfs_set_path_blocking(path);
3278 +       if (path)
3279 +               btrfs_set_path_blocking(path);
3280         /* unlocks the pinned mutex */
3281 -       btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3282 +       btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3283  
3284         BUG_ON(err < 0);
3285         return 0;
3286  }
3287  
3288 -
3289  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3290                                 struct btrfs_root *root,
3291                                 u64 bytenr, u64 num_bytes, u64 parent,
3292 @@ -3412,7 +3899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3293                 }
3294  
3295                 ret = pin_down_bytes(trans, root, path, bytenr,
3296 -                                    num_bytes, is_data, &must_clean);
3297 +                                    num_bytes, is_data, 0, &must_clean);
3298                 if (ret > 0)
3299                         mark_free = 1;
3300                 BUG_ON(ret < 0);
3301 @@ -3543,8 +4030,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3302         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
3303                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3304                 /* unlocks the pinned mutex */
3305 -               btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3306 -               update_reserved_extents(root, bytenr, num_bytes, 0);
3307 +               btrfs_pin_extent(root, bytenr, num_bytes, 1);
3308                 ret = 0;
3309         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
3310                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
3311 @@ -3584,19 +4070,33 @@ static noinline int
3312  wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3313                                 u64 num_bytes)
3314  {
3315 +       struct btrfs_caching_control *caching_ctl;
3316         DEFINE_WAIT(wait);
3317  
3318 -       prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3319 -
3320 -       if (block_group_cache_done(cache)) {
3321 -               finish_wait(&cache->caching_q, &wait);
3322 +       caching_ctl = get_caching_control(cache);
3323 +       if (!caching_ctl)
3324                 return 0;
3325 -       }
3326 -       schedule();
3327 -       finish_wait(&cache->caching_q, &wait);
3328  
3329 -       wait_event(cache->caching_q, block_group_cache_done(cache) ||
3330 +       wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
3331                    (cache->free_space >= num_bytes));
3332 +
3333 +       put_caching_control(caching_ctl);
3334 +       return 0;
3335 +}
3336 +
3337 +static noinline int
3338 +wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
3339 +{
3340 +       struct btrfs_caching_control *caching_ctl;
3341 +       DEFINE_WAIT(wait);
3342 +
3343 +       caching_ctl = get_caching_control(cache);
3344 +       if (!caching_ctl)
3345 +               return 0;
3346 +
3347 +       wait_event(caching_ctl->wait, block_group_cache_done(cache));
3348 +
3349 +       put_caching_control(caching_ctl);
3350         return 0;
3351  }
3352  
3353 @@ -3634,6 +4134,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3354         int last_ptr_loop = 0;
3355         int loop = 0;
3356         bool found_uncached_bg = false;
3357 +       bool failed_cluster_refill = false;
3358 +       bool failed_alloc = false;
3359  
3360         WARN_ON(num_bytes < root->sectorsize);
3361         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3362 @@ -3731,7 +4233,16 @@ have_block_group:
3363                 if (unlikely(block_group->ro))
3364                         goto loop;
3365  
3366 -               if (last_ptr) {
3367 +               /*
3368 +                * OK, we want to try and use the cluster allocator, so let's
3369 +                * look there, unless we are on LOOP_NO_EMPTY_SIZE, since we
3370 +                * will have tried the cluster allocator plenty of times at
3371 +                * this point and not have found anything.  We are likely way
3372 +                * too fragmented for the clustering stuff to find anything,
3373 +                * so let's just skip it and let the allocator find whatever
3374 +                * block it can find.
3375 +                */
3376 +               if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
3377                         /*
3378                          * the refill lock keeps out other
3379                          * people trying to start a new cluster
3380 @@ -3806,9 +4317,11 @@ refill_cluster:
3381                                         spin_unlock(&last_ptr->refill_lock);
3382                                         goto checks;
3383                                 }
3384 -                       } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3385 +                       } else if (!cached && loop > LOOP_CACHING_NOWAIT
3386 +                                  && !failed_cluster_refill) {
3387                                 spin_unlock(&last_ptr->refill_lock);
3388  
3389 +                               failed_cluster_refill = true;
3390                                 wait_block_group_cache_progress(block_group,
3391                                        num_bytes + empty_cluster + empty_size);
3392                                 goto have_block_group;
3393 @@ -3820,25 +4333,30 @@ refill_cluster:
3394                          * cluster.  Free the cluster we've been trying
3395                          * to use, and go to the next block group
3396                          */
3397 -                       if (loop < LOOP_NO_EMPTY_SIZE) {
3398 -                               btrfs_return_cluster_to_free_space(NULL,
3399 -                                                                  last_ptr);
3400 -                               spin_unlock(&last_ptr->refill_lock);
3401 -                               goto loop;
3402 -                       }
3403 +                       btrfs_return_cluster_to_free_space(NULL, last_ptr);
3404                         spin_unlock(&last_ptr->refill_lock);
3405 +                       goto loop;
3406                 }
3407  
3408                 offset = btrfs_find_space_for_alloc(block_group, search_start,
3409                                                     num_bytes, empty_size);
3410 -               if (!offset && (cached || (!cached &&
3411 -                                          loop == LOOP_CACHING_NOWAIT))) {
3412 -                       goto loop;
3413 -               } else if (!offset && (!cached &&
3414 -                                      loop > LOOP_CACHING_NOWAIT)) {
3415 +               /*
3416 +                * If we didn't find a chunk, and we haven't failed on this
3417 +                * block group before, and this block group is in the middle of
3418 +                * caching and we are ok with waiting, then go ahead and wait
3419 +                * for progress to be made, and set failed_alloc to true.
3420 +                *
3421 +                * If failed_alloc is true then we've already waited on this
3422 +                * block group once and should move on to the next block group.
3423 +                */
3424 +               if (!offset && !failed_alloc && !cached &&
3425 +                   loop > LOOP_CACHING_NOWAIT) {
3426                         wait_block_group_cache_progress(block_group,
3427 -                                       num_bytes + empty_size);
3428 +                                               num_bytes + empty_size);
3429 +                       failed_alloc = true;
3430                         goto have_block_group;
3431 +               } else if (!offset) {
3432 +                       goto loop;
3433                 }
3434  checks:
3435                 search_start = stripe_align(root, offset);
3436 @@ -3880,9 +4398,13 @@ checks:
3437                                              search_start - offset);
3438                 BUG_ON(offset > search_start);
3439  
3440 +               update_reserved_extents(block_group, num_bytes, 1);
3441 +
3442                 /* we are all good, lets return */
3443                 break;
3444  loop:
3445 +               failed_cluster_refill = false;
3446 +               failed_alloc = false;
3447                 btrfs_put_block_group(block_group);
3448         }
3449         up_read(&space_info->groups_sem);
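	/*
	 * (annotation, not patch text) The two bools added above bound the
	 * retries: each block group gets at most one cluster-refill wait
	 * and one free-space wait per pass, and both flags reset at the
	 * loop: label so the next group starts clean.
	 */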
3450 @@ -3940,21 +4462,32 @@ loop:
3451         return ret;
3452  }
3453  
3454 -static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3455 +static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
3456 +                           int dump_block_groups)
3457  {
3458         struct btrfs_block_group_cache *cache;
3459  
3460 +       spin_lock(&info->lock);
3461         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3462                (unsigned long long)(info->total_bytes - info->bytes_used -
3463 -                                   info->bytes_pinned - info->bytes_reserved),
3464 +                                   info->bytes_pinned - info->bytes_reserved -
3465 +                                   info->bytes_super),
3466                (info->full) ? "" : "not ");
3467         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3468 -              " may_use=%llu, used=%llu\n",
3469 +              " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
3470 +              "\n",
3471                (unsigned long long)info->total_bytes,
3472                (unsigned long long)info->bytes_pinned,
3473                (unsigned long long)info->bytes_delalloc,
3474                (unsigned long long)info->bytes_may_use,
3475 -              (unsigned long long)info->bytes_used);
3476 +              (unsigned long long)info->bytes_used,
3477 +              (unsigned long long)info->bytes_root,
3478 +              (unsigned long long)info->bytes_super,
3479 +              (unsigned long long)info->bytes_reserved);
3480 +       spin_unlock(&info->lock);
3481 +
3482 +       if (!dump_block_groups)
3483 +               return;
3484  
3485         down_read(&info->groups_sem);
3486         list_for_each_entry(cache, &info->block_groups, list) {
3487 @@ -3972,12 +4505,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3488         up_read(&info->groups_sem);
3489  }
3490  
3491 -static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3492 -                                 struct btrfs_root *root,
3493 -                                 u64 num_bytes, u64 min_alloc_size,
3494 -                                 u64 empty_size, u64 hint_byte,
3495 -                                 u64 search_end, struct btrfs_key *ins,
3496 -                                 u64 data)
3497 +int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3498 +                        struct btrfs_root *root,
3499 +                        u64 num_bytes, u64 min_alloc_size,
3500 +                        u64 empty_size, u64 hint_byte,
3501 +                        u64 search_end, struct btrfs_key *ins,
3502 +                        u64 data)
3503  {
3504         int ret;
3505         u64 search_start = 0;
3506 @@ -4022,7 +4555,7 @@ again:
3507                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3508                        "wanted %llu\n", (unsigned long long)data,
3509                        (unsigned long long)num_bytes);
3510 -               dump_space_info(sinfo, num_bytes);
3511 +               dump_space_info(sinfo, num_bytes, 1);
3512         }
3513  
3514         return ret;
3515 @@ -4043,25 +4576,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3516         ret = btrfs_discard_extent(root, start, len);
3517  
3518         btrfs_add_free_space(cache, start, len);
3519 +       update_reserved_extents(cache, len, 0);
3520         btrfs_put_block_group(cache);
3521 -       update_reserved_extents(root, start, len, 0);
3522 -
3523 -       return ret;
3524 -}
3525 -
3526 -int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3527 -                                 struct btrfs_root *root,
3528 -                                 u64 num_bytes, u64 min_alloc_size,
3529 -                                 u64 empty_size, u64 hint_byte,
3530 -                                 u64 search_end, struct btrfs_key *ins,
3531 -                                 u64 data)
3532 -{
3533 -       int ret;
3534 -       ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3535 -                                    empty_size, hint_byte, search_end, ins,
3536 -                                    data);
3537 -       if (!ret)
3538 -               update_reserved_extents(root, ins->objectid, ins->offset, 1);
3539  
3540         return ret;
3541  }
3542 @@ -4222,15 +4738,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
3543  {
3544         int ret;
3545         struct btrfs_block_group_cache *block_group;
3546 +       struct btrfs_caching_control *caching_ctl;
3547 +       u64 start = ins->objectid;
3548 +       u64 num_bytes = ins->offset;
3549  
3550         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3551         cache_block_group(block_group);
3552 -       wait_event(block_group->caching_q,
3553 -                  block_group_cache_done(block_group));
3554 +       caching_ctl = get_caching_control(block_group);
3555  
3556 -       ret = btrfs_remove_free_space(block_group, ins->objectid,
3557 -                                     ins->offset);
3558 -       BUG_ON(ret);
3559 +       if (!caching_ctl) {
3560 +               BUG_ON(!block_group_cache_done(block_group));
3561 +               ret = btrfs_remove_free_space(block_group, start, num_bytes);
3562 +               BUG_ON(ret);
3563 +       } else {
3564 +               mutex_lock(&caching_ctl->mutex);
3565 +
3566 +               if (start >= caching_ctl->progress) {
3567 +                       ret = add_excluded_extent(root, start, num_bytes);
3568 +                       BUG_ON(ret);
3569 +               } else if (start + num_bytes <= caching_ctl->progress) {
3570 +                       ret = btrfs_remove_free_space(block_group,
3571 +                                                     start, num_bytes);
3572 +                       BUG_ON(ret);
3573 +               } else {
3574 +                       num_bytes = caching_ctl->progress - start;
3575 +                       ret = btrfs_remove_free_space(block_group,
3576 +                                                     start, num_bytes);
3577 +                       BUG_ON(ret);
3578 +
3579 +                       start = caching_ctl->progress;
3580 +                       num_bytes = ins->objectid + ins->offset -
3581 +                                   caching_ctl->progress;
3582 +                       ret = add_excluded_extent(root, start, num_bytes);
3583 +                       BUG_ON(ret);
3584 +               }
3585 +
3586 +               mutex_unlock(&caching_ctl->mutex);
3587 +               put_caching_control(caching_ctl);
3588 +       }
3589 +
3590 +       update_reserved_extents(block_group, ins->offset, 1);
3591         btrfs_put_block_group(block_group);
3592         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
3593                                          0, owner, offset, ins, 1);
3594 @@ -4254,9 +4801,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
3595         int ret;
3596         u64 flags = 0;
3597  
3598 -       ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
3599 -                                    empty_size, hint_byte, search_end,
3600 -                                    ins, 0);
3601 +       ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
3602 +                                  empty_size, hint_byte, search_end,
3603 +                                  ins, 0);
3604         if (ret)
3605                 return ret;
3606  
3607 @@ -4267,7 +4814,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
3608         } else
3609                 BUG_ON(parent > 0);
3610  
3611 -       update_reserved_extents(root, ins->objectid, ins->offset, 1);
3612         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3613                 struct btrfs_delayed_extent_op *extent_op;
3614                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3615 @@ -4346,452 +4892,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3616         return buf;
3617  }
3618  
3619 -#if 0
3620 -int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3621 -                       struct btrfs_root *root, struct extent_buffer *leaf)
3622 -{
3623 -       u64 disk_bytenr;
3624 -       u64 num_bytes;
3625 -       struct btrfs_key key;
3626 -       struct btrfs_file_extent_item *fi;
3627 -       u32 nritems;
3628 -       int i;
3629 -       int ret;
3630 -
3631 -       BUG_ON(!btrfs_is_leaf(leaf));
3632 -       nritems = btrfs_header_nritems(leaf);
3633 -
3634 -       for (i = 0; i < nritems; i++) {
3635 -               cond_resched();
3636 -               btrfs_item_key_to_cpu(leaf, &key, i);
3637 -
3638 -               /* only extents have references, skip everything else */
3639 -               if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3640 -                       continue;
3641 -
3642 -               fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3643 -
3644 -               /* inline extents live in the btree, they don't have refs */
3645 -               if (btrfs_file_extent_type(leaf, fi) ==
3646 -                   BTRFS_FILE_EXTENT_INLINE)
3647 -                       continue;
3648 -
3649 -               disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3650 -
3651 -               /* holes don't have refs */
3652 -               if (disk_bytenr == 0)
3653 -                       continue;
3654 -
3655 -               num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
3656 -               ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
3657 -                                       leaf->start, 0, key.objectid, 0);
3658 -               BUG_ON(ret);
3659 -       }
3660 -       return 0;
3661 -}
3662 -
3663 -static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3664 -                                       struct btrfs_root *root,
3665 -                                       struct btrfs_leaf_ref *ref)
3666 -{
3667 -       int i;
3668 -       int ret;
3669 -       struct btrfs_extent_info *info;
3670 -       struct refsort *sorted;
3671 -
3672 -       if (ref->nritems == 0)
3673 -               return 0;
3674 -
3675 -       sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3676 -       for (i = 0; i < ref->nritems; i++) {
3677 -               sorted[i].bytenr = ref->extents[i].bytenr;
3678 -               sorted[i].slot = i;
3679 -       }
3680 -       sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3681 -
3682 -       /*
3683 -        * the items in the ref were sorted when the ref was inserted
3684 -        * into the ref cache, so this is already in order
3685 -        */
3686 -       for (i = 0; i < ref->nritems; i++) {
3687 -               info = ref->extents + sorted[i].slot;
3688 -               ret = btrfs_free_extent(trans, root, info->bytenr,
3689 -                                         info->num_bytes, ref->bytenr,
3690 -                                         ref->owner, ref->generation,
3691 -                                         info->objectid, 0);
3692 -
3693 -               atomic_inc(&root->fs_info->throttle_gen);
3694 -               wake_up(&root->fs_info->transaction_throttle);
3695 -               cond_resched();
3696 -
3697 -               BUG_ON(ret);
3698 -               info++;
3699 -       }
3700 -
3701 -       kfree(sorted);
3702 -       return 0;
3703 -}
3704 -
3705 -
3706 -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3707 -                                    struct btrfs_root *root, u64 start,
3708 -                                    u64 len, u32 *refs)
3709 -{
3710 -       int ret;
3711 -
3712 -       ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
3713 -       BUG_ON(ret);
3714 -
3715 -#if 0 /* some debugging code in case we see problems here */
3716 -       /* if the refs count is one, it won't get increased again.  But
3717 -        * if the ref count is > 1, someone may be decreasing it at
3718 -        * the same time we are.
3719 -        */
3720 -       if (*refs != 1) {
3721 -               struct extent_buffer *eb = NULL;
3722 -               eb = btrfs_find_create_tree_block(root, start, len);
3723 -               if (eb)
3724 -                       btrfs_tree_lock(eb);
3725 -
3726 -               mutex_lock(&root->fs_info->alloc_mutex);
3727 -               ret = lookup_extent_ref(NULL, root, start, len, refs);
3728 -               BUG_ON(ret);
3729 -               mutex_unlock(&root->fs_info->alloc_mutex);
3730 -
3731 -               if (eb) {
3732 -                       btrfs_tree_unlock(eb);
3733 -                       free_extent_buffer(eb);
3734 -               }
3735 -               if (*refs == 1) {
3736 -                       printk(KERN_ERR "btrfs block %llu went down to one "
3737 -                              "during drop_snap\n", (unsigned long long)start);
3738 -               }
3739 -
3740 -       }
3741 -#endif
3742 -
3743 -       cond_resched();
3744 -       return ret;
3745 -}
3746 +struct walk_control {
3747 +       u64 refs[BTRFS_MAX_LEVEL];
3748 +       u64 flags[BTRFS_MAX_LEVEL];
3749 +       struct btrfs_key update_progress;
3750 +       int stage;
3751 +       int level;
3752 +       int shared_level;
3753 +       int update_ref;
3754 +       int keep_locks;
3755 +       int reada_slot;
3756 +       int reada_count;
3757 +};
3758  
3759 +#define DROP_REFERENCE 1
3760 +#define UPDATE_BACKREF 2
3761  
3762 -/*
3763 - * this is used while deleting old snapshots, and it drops the refs
3764 - * on a whole subtree starting from a level 1 node.
3765 - *
3766 - * The idea is to sort all the leaf pointers, and then drop the
3767 - * ref on all the leaves in order.  Most of the time the leaves
3768 - * will have ref cache entries, so no leaf IOs will be required to
3769 - * find the extents they have references on.
3770 - *
3771 - * For each leaf, any references it has are also dropped in order
3772 - *
3773 - * This ends up dropping the references in something close to optimal
3774 - * order for reading and modifying the extent allocation tree.
3775 - */
3776 -static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3777 -                                       struct btrfs_root *root,
3778 -                                       struct btrfs_path *path)
3779 +static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
3780 +                                    struct btrfs_root *root,
3781 +                                    struct walk_control *wc,
3782 +                                    struct btrfs_path *path)
3783  {
3784         u64 bytenr;
3785 -       u64 root_owner;
3786 -       u64 root_gen;
3787 -       struct extent_buffer *eb = path->nodes[1];
3788 -       struct extent_buffer *leaf;
3789 -       struct btrfs_leaf_ref *ref;
3790 -       struct refsort *sorted = NULL;
3791 -       int nritems = btrfs_header_nritems(eb);
3792 +       u64 generation;
3793 +       u64 refs;
3794 +       u64 flags;
3795 +       u64 last = 0;
3796 +       u32 nritems;
3797 +       u32 blocksize;
3798 +       struct btrfs_key key;
3799 +       struct extent_buffer *eb;
3800         int ret;
3801 -       int i;
3802 -       int refi = 0;
3803 -       int slot = path->slots[1];
3804 -       u32 blocksize = btrfs_level_size(root, 0);
3805 -       u32 refs;
3806 -
3807 -       if (nritems == 0)
3808 -               goto out;
3809 -
3810 -       root_owner = btrfs_header_owner(eb);
3811 -       root_gen = btrfs_header_generation(eb);
3812 -       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3813 +       int slot;
3814 +       int nread = 0;
3815  
3816 -       /*
3817 -        * step one, sort all the leaf pointers so we don't scribble
3818 -        * randomly into the extent allocation tree
3819 -        */
3820 -       for (i = slot; i < nritems; i++) {
3821 -               sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3822 -               sorted[refi].slot = i;
3823 -               refi++;
3824 +       if (path->slots[wc->level] < wc->reada_slot) {
3825 +               wc->reada_count = wc->reada_count * 2 / 3;
3826 +               wc->reada_count = max(wc->reada_count, 2);
3827 +       } else {
3828 +               wc->reada_count = wc->reada_count * 3 / 2;
3829 +               wc->reada_count = min_t(int, wc->reada_count,
3830 +                                       BTRFS_NODEPTRS_PER_BLOCK(root));
3831         }
3832  
3833 -       /*
3834 -        * nritems won't be zero, but if we're picking up drop_snapshot
3835 -        * after a crash, slot might be > 0, so double check things
3836 -        * just in case.
3837 -        */
3838 -       if (refi == 0)
3839 -               goto out;
3840 +       eb = path->nodes[wc->level];
3841 +       nritems = btrfs_header_nritems(eb);
3842 +       blocksize = btrfs_level_size(root, wc->level - 1);
3843  
3844 -       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3845 +       for (slot = path->slots[wc->level]; slot < nritems; slot++) {
3846 +               if (nread >= wc->reada_count)
3847 +                       break;
3848  
3849 -       /*
3850 -        * the first loop frees everything the leaves point to
3851 -        */
3852 -       for (i = 0; i < refi; i++) {
3853 -               u64 ptr_gen;
3854 +               cond_resched();
3855 +               bytenr = btrfs_node_blockptr(eb, slot);
3856 +               generation = btrfs_node_ptr_generation(eb, slot);
3857  
3858 -               bytenr = sorted[i].bytenr;
3859 +               if (slot == path->slots[wc->level])
3860 +                       goto reada;
3861  
3862 -               /*
3863 -                * check the reference count on this leaf.  If it is > 1
3864 -                * we just decrement it below and don't update any
3865 -                * of the refs the leaf points to.
3866 -                */
3867 -               ret = drop_snap_lookup_refcount(trans, root, bytenr,
3868 -                                               blocksize, &refs);
3869 -               BUG_ON(ret);
3870 -               if (refs != 1)
3871 +               if (wc->stage == UPDATE_BACKREF &&
3872 +                   generation <= root->root_key.offset)
3873                         continue;
3874  
3875 -               ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3876 -
3877 -               /*
3878 -                * the leaf only had one reference, which means the
3879 -                * only thing pointing to this leaf is the snapshot
3880 -                * we're deleting.  It isn't possible for the reference
3881 -                * count to increase again later
3882 -                *
3883 -                * The reference cache is checked for the leaf,
3884 -                * and if found we'll be able to drop any refs held by
3885 -                * the leaf without needing to read it in.
3886 -                */
3887 -               ref = btrfs_lookup_leaf_ref(root, bytenr);
3888 -               if (ref && ref->generation != ptr_gen) {
3889 -                       btrfs_free_leaf_ref(root, ref);
3890 -                       ref = NULL;
3891 -               }
3892 -               if (ref) {
3893 -                       ret = cache_drop_leaf_ref(trans, root, ref);
3894 -                       BUG_ON(ret);
3895 -                       btrfs_remove_leaf_ref(root, ref);
3896 -                       btrfs_free_leaf_ref(root, ref);
3897 -               } else {
3898 -                       /*
3899 -                        * the leaf wasn't in the reference cache, so
3900 -                        * we have to read it.
3901 -                        */
3902 -                       leaf = read_tree_block(root, bytenr, blocksize,
3903 -                                              ptr_gen);
3904 -                       ret = btrfs_drop_leaf_ref(trans, root, leaf);
3905 -                       BUG_ON(ret);
3906 -                       free_extent_buffer(leaf);
3907 -               }
3908 -               atomic_inc(&root->fs_info->throttle_gen);
3909 -               wake_up(&root->fs_info->transaction_throttle);
3910 -               cond_resched();
3911 -       }
3912 -
3913 -       /*
3914 -        * run through the loop again to free the refs on the leaves.
3915 -        * This is faster than doing it in the loop above because
3916 -        * the leaves are likely to be clustered together.  We end up
3917 -        * working in nice chunks on the extent allocation tree.
3918 -        */
3919 -       for (i = 0; i < refi; i++) {
3920 -               bytenr = sorted[i].bytenr;
3921 -               ret = btrfs_free_extent(trans, root, bytenr,
3922 -                                       blocksize, eb->start,
3923 -                                       root_owner, root_gen, 0, 1);
3924 +               /* We don't lock the tree block, it's OK to be racy here */
3925 +               ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
3926 +                                              &refs, &flags);
3927                 BUG_ON(ret);
3928 +               BUG_ON(refs == 0);
3929  
3930 -               atomic_inc(&root->fs_info->throttle_gen);
3931 -               wake_up(&root->fs_info->transaction_throttle);
3932 -               cond_resched();
3933 -       }
3934 -out:
3935 -       kfree(sorted);
3936 -
3937 -       /*
3938 -        * update the path to show we've processed the entire level 1
3939 -        * node.  This will get saved into the root's drop_snapshot_progress
3940 -        * field so these drops are not repeated again if this transaction
3941 -        * commits.
3942 -        */
3943 -       path->slots[1] = nritems;
3944 -       return 0;
3945 -}
3946 -
3947 -/*
3948 - * helper function for drop_snapshot, this walks down the tree dropping ref
3949 - * counts as it goes.
3950 - */
3951 -static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3952 -                                  struct btrfs_root *root,
3953 -                                  struct btrfs_path *path, int *level)
3954 -{
3955 -       u64 root_owner;
3956 -       u64 root_gen;
3957 -       u64 bytenr;
3958 -       u64 ptr_gen;
3959 -       struct extent_buffer *next;
3960 -       struct extent_buffer *cur;
3961 -       struct extent_buffer *parent;
3962 -       u32 blocksize;
3963 -       int ret;
3964 -       u32 refs;
3965 -
3966 -       WARN_ON(*level < 0);
3967 -       WARN_ON(*level >= BTRFS_MAX_LEVEL);
3968 -       ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
3969 -                               path->nodes[*level]->len, &refs);
3970 -       BUG_ON(ret);
3971 -       if (refs > 1)
3972 -               goto out;
3973 -
3974 -       /*
3975 -        * walk down to the last node level and free all the leaves
3976 -        */
3977 -       while (*level >= 0) {
3978 -               WARN_ON(*level < 0);
3979 -               WARN_ON(*level >= BTRFS_MAX_LEVEL);
3980 -               cur = path->nodes[*level];
3981 -
3982 -               if (btrfs_header_level(cur) != *level)
3983 -                       WARN_ON(1);
3984 -
3985 -               if (path->slots[*level] >=
3986 -                   btrfs_header_nritems(cur))
3987 -                       break;
3988 +               if (wc->stage == DROP_REFERENCE) {
3989 +                       if (refs == 1)
3990 +                               goto reada;
3991  
3992 -               /* the new code goes down to level 1 and does all the
3993 -                * leaves pointed to that node in bulk.  So, this check
3994 -                * for level 0 will always be false.
3995 -                *
3996 -                * But, the disk format allows the drop_snapshot_progress
3997 -                * field in the root to leave things in a state where
3998 -                * a leaf will need cleaning up here.  If someone crashes
3999 -                * with the old code and then boots with the new code,
4000 -                * we might find a leaf here.
4001 -                */
4002 -               if (*level == 0) {
4003 -                       ret = btrfs_drop_leaf_ref(trans, root, cur);
4004 -                       BUG_ON(ret);
4005 -                       break;
4006 +                       if (wc->level == 1 &&
4007 +                           (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4008 +                               continue;
4009 +                       if (!wc->update_ref ||
4010 +                           generation <= root->root_key.offset)
4011 +                               continue;
4012 +                       btrfs_node_key_to_cpu(eb, &key, slot);
4013 +                       ret = btrfs_comp_cpu_keys(&key,
4014 +                                                 &wc->update_progress);
4015 +                       if (ret < 0)
4016 +                               continue;
4017 +               } else {
4018 +                       if (wc->level == 1 &&
4019 +                           (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4020 +                               continue;
4021                 }
4022 -
4023 -               /*
4024 -                * once we get to level one, process the whole node
4025 -                * at once, including everything below it.
4026 -                */
4027 -               if (*level == 1) {
4028 -                       ret = drop_level_one_refs(trans, root, path);
4029 -                       BUG_ON(ret);
4030 +reada:
4031 +               ret = readahead_tree_block(root, bytenr, blocksize,
4032 +                                          generation);
4033 +               if (ret)
4034                         break;
4035 -               }
4036 -
4037 -               bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4038 -               ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4039 -               blocksize = btrfs_level_size(root, *level - 1);
4040 -
4041 -               ret = drop_snap_lookup_refcount(trans, root, bytenr,
4042 -                                               blocksize, &refs);
4043 -               BUG_ON(ret);
4044 -
4045 -               /*
4046 -                * if there is more than one reference, we don't need
4047 -                * to read that node to drop any references it has.  We
4048 -                * just drop the ref we hold on that node and move on to the
4049 -                * next slot in this level.
4050 -                */
4051 -               if (refs != 1) {
4052 -                       parent = path->nodes[*level];
4053 -                       root_owner = btrfs_header_owner(parent);
4054 -                       root_gen = btrfs_header_generation(parent);
4055 -                       path->slots[*level]++;
4056 -
4057 -                       ret = btrfs_free_extent(trans, root, bytenr,
4058 -                                               blocksize, parent->start,
4059 -                                               root_owner, root_gen,
4060 -                                               *level - 1, 1);
4061 -                       BUG_ON(ret);
4062 -
4063 -                       atomic_inc(&root->fs_info->throttle_gen);
4064 -                       wake_up(&root->fs_info->transaction_throttle);
4065 -                       cond_resched();
4066 -
4067 -                       continue;
4068 -               }
4069 -
4070 -               /*
4071 -                * we need to keep freeing things in the next level down.
4072 -                * read the block and loop around to process it
4073 -                */
4074 -               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4075 -               WARN_ON(*level <= 0);
4076 -               if (path->nodes[*level-1])
4077 -                       free_extent_buffer(path->nodes[*level-1]);
4078 -               path->nodes[*level-1] = next;
4079 -               *level = btrfs_header_level(next);
4080 -               path->slots[*level] = 0;
4081 -               cond_resched();
4082 +               last = bytenr + blocksize;
4083 +               nread++;
4084         }
4085 -out:
4086 -       WARN_ON(*level < 0);
4087 -       WARN_ON(*level >= BTRFS_MAX_LEVEL);
4088 -
4089 -       if (path->nodes[*level] == root->node) {
4090 -               parent = path->nodes[*level];
4091 -               bytenr = path->nodes[*level]->start;
4092 -       } else {
4093 -               parent = path->nodes[*level + 1];
4094 -               bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4095 -       }
4096 -
4097 -       blocksize = btrfs_level_size(root, *level);
4098 -       root_owner = btrfs_header_owner(parent);
4099 -       root_gen = btrfs_header_generation(parent);
4100 -
4101 -       /*
4102 -        * cleanup and free the reference on the last node
4103 -        * we processed
4104 -        */
4105 -       ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4106 -                                 parent->start, root_owner, root_gen,
4107 -                                 *level, 1);
4108 -       free_extent_buffer(path->nodes[*level]);
4109 -       path->nodes[*level] = NULL;
4110 -
4111 -       *level += 1;
4112 -       BUG_ON(ret);
4113 -
4114 -       cond_resched();
4115 -       return 0;
4116 +       wc->reada_slot = slot;
4117  }
4118 -#endif
4119 -
4120 -struct walk_control {
4121 -       u64 refs[BTRFS_MAX_LEVEL];
4122 -       u64 flags[BTRFS_MAX_LEVEL];
4123 -       struct btrfs_key update_progress;
4124 -       int stage;
4125 -       int level;
4126 -       int shared_level;
4127 -       int update_ref;
4128 -       int keep_locks;
4129 -};
4130 -
4131 -#define DROP_REFERENCE 1
4132 -#define UPDATE_BACKREF 2
4133  
4134  /*
4135   * helper to process tree block while walking down the tree.
4136   *
4137 - * when wc->stage == DROP_REFERENCE, this function checks
4138 - * reference count of the block. if the block is shared and
4139 - * we need update back refs for the subtree rooted at the
4140 - * block, this function changes wc->stage to UPDATE_BACKREF
4141 - *
4142   * when wc->stage == UPDATE_BACKREF, this function updates
4143   * back refs for pointers in the block.
4144   *
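
The reada_count tuning in reada_walk_down above is an adaptive readahead
window: a backward seek shrinks the window to two thirds (never below 2),
forward progress grows it by half, capped at the node fan-out. A minimal
userspace sketch of just that policy, where NODEPTRS is an illustrative
stand-in for BTRFS_NODEPTRS_PER_BLOCK(root), not the kernel macro:

    #include <stdio.h>

    #define NODEPTRS 121    /* ballpark fan-out of a 4K node; illustrative */

    static int clamp_min(int v, int lo) { return v < lo ? lo : v; }
    static int clamp_max(int v, int hi) { return v > hi ? hi : v; }

    /* shrink the window on a backward seek, grow it on forward progress */
    static int adjust_reada(int count, int seeked_back)
    {
            if (seeked_back)
                    return clamp_min(count * 2 / 3, 2);
            return clamp_max(count * 3 / 2, NODEPTRS);
    }

    int main(void)
    {
            int count = NODEPTRS;           /* drop_snapshot starts wide open */

            count = adjust_reada(count, 1); /* 121 -> 80 */
            count = adjust_reada(count, 1); /* 80 -> 53 */
            count = adjust_reada(count, 0); /* 53 -> 79 */
            printf("window is now %d pointers\n", count);
            return 0;
    }
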
4145 @@ -4800,11 +5002,10 @@ struct walk_control {
4146  static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4147                                    struct btrfs_root *root,
4148                                    struct btrfs_path *path,
4149 -                                  struct walk_control *wc)
4150 +                                  struct walk_control *wc, int lookup_info)
4151  {
4152         int level = wc->level;
4153         struct extent_buffer *eb = path->nodes[level];
4154 -       struct btrfs_key key;
4155         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4156         int ret;
4157  
4158 @@ -4816,8 +5017,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4159          * when reference count of tree block is 1, it won't increase
4160          * again. once full backref flag is set, we never clear it.
4161          */
4162 -       if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4163 -           (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
4164 +       if (lookup_info &&
4165 +           ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4166 +            (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4167                 BUG_ON(!path->locks[level]);
4168                 ret = btrfs_lookup_extent_info(trans, root,
4169                                                eb->start, eb->len,
4170 @@ -4827,21 +5029,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4171                 BUG_ON(wc->refs[level] == 0);
4172         }
4173  
4174 -       if (wc->stage == DROP_REFERENCE &&
4175 -           wc->update_ref && wc->refs[level] > 1) {
4176 -               BUG_ON(eb == root->node);
4177 -               BUG_ON(path->slots[level] > 0);
4178 -               if (level == 0)
4179 -                       btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4180 -               else
4181 -                       btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4182 -               if (btrfs_header_owner(eb) == root->root_key.objectid &&
4183 -                   btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4184 -                       wc->stage = UPDATE_BACKREF;
4185 -                       wc->shared_level = level;
4186 -               }
4187 -       }
4188 -
4189         if (wc->stage == DROP_REFERENCE) {
4190                 if (wc->refs[level] > 1)
4191                         return 1;
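
The new lookup_info argument threads a small cache through the walk:
walk_down_proc only re-reads the block's refcount and flags when nobody
has fetched them yet, and do_walk_down clears the flag after doing the
lookup itself for the child it descends into. A toy model of that
handshake (the names here are illustrative, not the kernel API):

    #include <stdio.h>

    struct walk_state {
            unsigned long long refs;
            unsigned long long flags;
    };

    /* pretend lookup; in the kernel this is btrfs_lookup_extent_info() */
    static void lookup_extent_info(struct walk_state *ws)
    {
            ws->refs = 2;
            ws->flags = 0;
            printf("expensive extent-tree lookup\n");
    }

    static void walk_down_proc(struct walk_state *ws, int lookup_info)
    {
            if (lookup_info)
                    lookup_extent_info(ws);  /* cache is stale */
            printf("processing block, refs=%llu\n", ws->refs);
    }

    int main(void)
    {
            struct walk_state ws = { 0, 0 };
            int lookup_info = 1;

            walk_down_proc(&ws, lookup_info); /* first visit: must look up */
            lookup_info = 0;                  /* do_walk_down refreshed ws */
            walk_down_proc(&ws, lookup_info); /* reuses the cached refs */
            return 0;
    }
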
4192 @@ -4878,6 +5065,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4193  }
4194  
4195  /*
4196 + * helper to process a tree block pointer.
4197 + *
4198 + * when wc->stage == DROP_REFERENCE, this function checks the
4199 + * reference count of the block pointed to. if the block is
4200 + * shared and we need to update back refs for the subtree
4201 + * rooted at the block, this function changes wc->stage to
4202 + * UPDATE_BACKREF. if the block is shared and there is no need
4203 + * to update back refs, this function drops the reference to
4204 + * the block.
4205 + *
4206 + * NOTE: return value 1 means we should stop walking down.
4207 + */
4208 +static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4209 +                                struct btrfs_root *root,
4210 +                                struct btrfs_path *path,
4211 +                                struct walk_control *wc, int *lookup_info)
4212 +{
4213 +       u64 bytenr;
4214 +       u64 generation;
4215 +       u64 parent;
4216 +       u32 blocksize;
4217 +       struct btrfs_key key;
4218 +       struct extent_buffer *next;
4219 +       int level = wc->level;
4220 +       int reada = 0;
4221 +       int ret = 0;
4222 +
4223 +       generation = btrfs_node_ptr_generation(path->nodes[level],
4224 +                                              path->slots[level]);
4225 +       /*
4226 +        * if the lower level block was created before the snapshot
4227 +        * was created, we know there is no need to update back refs
4228 +        * for the subtree
4229 +        */
4230 +       if (wc->stage == UPDATE_BACKREF &&
4231 +           generation <= root->root_key.offset) {
4232 +               *lookup_info = 1;
4233 +               return 1;
4234 +       }
4235 +
4236 +       bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4237 +       blocksize = btrfs_level_size(root, level - 1);
4238 +
4239 +       next = btrfs_find_tree_block(root, bytenr, blocksize);
4240 +       if (!next) {
4241 +               next = btrfs_find_create_tree_block(root, bytenr, blocksize);
4242 +               reada = 1;
4243 +       }
4244 +       btrfs_tree_lock(next);
4245 +       btrfs_set_lock_blocking(next);
4246 +
4247 +       ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4248 +                                      &wc->refs[level - 1],
4249 +                                      &wc->flags[level - 1]);
4250 +       BUG_ON(ret);
4251 +       BUG_ON(wc->refs[level - 1] == 0);
4252 +       *lookup_info = 0;
4253 +
4254 +       if (wc->stage == DROP_REFERENCE) {
4255 +               if (wc->refs[level - 1] > 1) {
4256 +                       if (level == 1 &&
4257 +                           (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4258 +                               goto skip;
4259 +
4260 +                       if (!wc->update_ref ||
4261 +                           generation <= root->root_key.offset)
4262 +                               goto skip;
4263 +
4264 +                       btrfs_node_key_to_cpu(path->nodes[level], &key,
4265 +                                             path->slots[level]);
4266 +                       ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
4267 +                       if (ret < 0)
4268 +                               goto skip;
4269 +
4270 +                       wc->stage = UPDATE_BACKREF;
4271 +                       wc->shared_level = level - 1;
4272 +               }
4273 +       } else {
4274 +               if (level == 1 &&
4275 +                   (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4276 +                       goto skip;
4277 +       }
4278 +
4279 +       if (!btrfs_buffer_uptodate(next, generation)) {
4280 +               btrfs_tree_unlock(next);
4281 +               free_extent_buffer(next);
4282 +               next = NULL;
4283 +               *lookup_info = 1;
4284 +       }
4285 +
4286 +       if (!next) {
4287 +               if (reada && level == 1)
4288 +                       reada_walk_down(trans, root, wc, path);
4289 +               next = read_tree_block(root, bytenr, blocksize, generation);
4290 +               btrfs_tree_lock(next);
4291 +               btrfs_set_lock_blocking(next);
4292 +       }
4293 +
4294 +       level--;
4295 +       BUG_ON(level != btrfs_header_level(next));
4296 +       path->nodes[level] = next;
4297 +       path->slots[level] = 0;
4298 +       path->locks[level] = 1;
4299 +       wc->level = level;
4300 +       if (wc->level == 1)
4301 +               wc->reada_slot = 0;
4302 +       return 0;
4303 +skip:
4304 +       wc->refs[level - 1] = 0;
4305 +       wc->flags[level - 1] = 0;
4306 +       if (wc->stage == DROP_REFERENCE) {
4307 +               if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
4308 +                       parent = path->nodes[level]->start;
4309 +               } else {
4310 +                       BUG_ON(root->root_key.objectid !=
4311 +                              btrfs_header_owner(path->nodes[level]));
4312 +                       parent = 0;
4313 +               }
4314 +
4315 +               ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4316 +                                       root->root_key.objectid, level - 1, 0);
4317 +               BUG_ON(ret);
4318 +       }
4319 +       btrfs_tree_unlock(next);
4320 +       free_extent_buffer(next);
4321 +       *lookup_info = 1;
4322 +       return 1;
4323 +}
4324 +
4325 +/*
4326   * helper to process tree block while walking up the tree.
4327   *
4328   * when wc->stage == DROP_REFERENCE, this function drops
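
Condensed, do_walk_down makes a three-way choice per child pointer:
descend (refcount 1, or the walk is mid UPDATE_BACKREF), flip the walk
into UPDATE_BACKREF for a shared subtree that still needs new backrefs,
or drop our reference and skip the subtree entirely. A simplified
decision table, deliberately leaving out the level-1/FULL_BACKREF and
generation special cases:

    #include <stdio.h>

    enum { DROP_REFERENCE = 1, UPDATE_BACKREF = 2 };
    enum action { DESCEND, SKIP_AND_DROP_REF, SWITCH_TO_UPDATE_BACKREF };

    struct ctx {
            int stage;
            int update_ref;           /* shared subtrees need new backrefs? */
            unsigned long long refs;  /* refcount of the child block */
            int key_before_progress;  /* child key < wc->update_progress? */
    };

    static enum action decide(const struct ctx *c)
    {
            if (c->stage != DROP_REFERENCE || c->refs == 1)
                    return DESCEND;
            if (!c->update_ref || c->key_before_progress)
                    return SKIP_AND_DROP_REF;
            return SWITCH_TO_UPDATE_BACKREF;  /* then walk the subtree */
    }

    int main(void)
    {
            struct ctx shared = { DROP_REFERENCE, 1, 2, 0 };
            struct ctx owned  = { DROP_REFERENCE, 1, 1, 0 };

            printf("shared -> %d, owned -> %d\n",
                   decide(&shared), decide(&owned));
            return 0;
    }
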
4329 @@ -4904,7 +5221,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4330                 if (level < wc->shared_level)
4331                         goto out;
4332  
4333 -               BUG_ON(wc->refs[level] <= 1);
4334                 ret = find_next_key(path, level + 1, &wc->update_progress);
4335                 if (ret > 0)
4336                         wc->update_ref = 0;
4337 @@ -4935,8 +5251,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4338                                 path->locks[level] = 0;
4339                                 return 1;
4340                         }
4341 -               } else {
4342 -                       BUG_ON(level != 0);
4343                 }
4344         }
4345  
4346 @@ -4989,39 +5303,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4347                                    struct btrfs_path *path,
4348                                    struct walk_control *wc)
4349  {
4350 -       struct extent_buffer *next;
4351 -       struct extent_buffer *cur;
4352 -       u64 bytenr;
4353 -       u64 ptr_gen;
4354 -       u32 blocksize;
4355         int level = wc->level;
4356 +       int lookup_info = 1;
4357         int ret;
4358  
4359         while (level >= 0) {
4360 -               cur = path->nodes[level];
4361 -               BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
4362 +               if (path->slots[level] >=
4363 +                   btrfs_header_nritems(path->nodes[level]))
4364 +                       break;
4365  
4366 -               ret = walk_down_proc(trans, root, path, wc);
4367 +               ret = walk_down_proc(trans, root, path, wc, lookup_info);
4368                 if (ret > 0)
4369                         break;
4370  
4371                 if (level == 0)
4372                         break;
4373  
4374 -               bytenr = btrfs_node_blockptr(cur, path->slots[level]);
4375 -               blocksize = btrfs_level_size(root, level - 1);
4376 -               ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
4377 -
4378 -               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4379 -               btrfs_tree_lock(next);
4380 -               btrfs_set_lock_blocking(next);
4381 -
4382 -               level--;
4383 -               BUG_ON(level != btrfs_header_level(next));
4384 -               path->nodes[level] = next;
4385 -               path->slots[level] = 0;
4386 -               path->locks[level] = 1;
4387 -               wc->level = level;
4388 +               ret = do_walk_down(trans, root, path, wc, &lookup_info);
4389 +               if (ret > 0) {
4390 +                       path->slots[level]++;
4391 +                       continue;
4392 +               }
4393 +               level = wc->level;
4394         }
4395         return 0;
4396  }
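
The rewritten loop also changes who advances the cursor: when
do_walk_down declines a child (returns 1) the loop bumps the slot and
retries, and running off the end of a node now breaks out so the walk-up
phase can take over, instead of tripping the old BUG_ON. The control
flow, reduced to a userspace schematic:

    #include <stdio.h>

    #define LEVELS 3
    #define FANOUT 2

    /* returns 1 to skip this child, 0 after descending one level */
    static int do_walk_down(int *level, int slots[])
    {
            if (slots[*level] % 2)  /* pretend odd slots are shared */
                    return 1;
            (*level)--;
            slots[*level] = 0;
            return 0;
    }

    int main(void)
    {
            int slots[LEVELS] = { 0 };
            int level = LEVELS - 1;

            while (level >= 0) {
                    if (slots[level] >= FANOUT)
                            break;  /* node exhausted: walk up */
                    if (level == 0)
                            break;  /* leaf reached: walk up handles it */
                    if (do_walk_down(&level, slots)) {
                            slots[level]++;  /* skipped: try next pointer */
                            continue;
                    }
            }
            printf("stopped at level %d, slot %d\n", level, slots[level]);
            return 0;
    }
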
4397 @@ -5111,9 +5414,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4398                         err = ret;
4399                         goto out;
4400                 }
4401 -               btrfs_node_key_to_cpu(path->nodes[level], &key,
4402 -                                     path->slots[level]);
4403 -               WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
4404 +               WARN_ON(ret > 0);
4405  
4406                 /*
4407                  * unlock our path, this is safe because only this
4408 @@ -5148,6 +5449,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4409         wc->stage = DROP_REFERENCE;
4410         wc->update_ref = update_ref;
4411         wc->keep_locks = 0;
4412 +       wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
4413  
4414         while (1) {
4415                 ret = walk_down_tree(trans, root, path, wc);
4416 @@ -5200,9 +5502,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4417         ret = btrfs_del_root(trans, tree_root, &root->root_key);
4418         BUG_ON(ret);
4419  
4420 -       free_extent_buffer(root->node);
4421 -       free_extent_buffer(root->commit_root);
4422 -       kfree(root);
4423 +       if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
4424 +               ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
4425 +                                          NULL, NULL);
4426 +               BUG_ON(ret < 0);
4427 +               if (ret > 0) {
4428 +                       ret = btrfs_del_orphan_item(trans, tree_root,
4429 +                                                   root->root_key.objectid);
4430 +                       BUG_ON(ret);
4431 +               }
4432 +       }
4433 +
4434 +       if (root->in_radix) {
4435 +               btrfs_free_fs_root(tree_root->fs_info, root);
4436 +       } else {
4437 +               free_extent_buffer(root->node);
4438 +               free_extent_buffer(root->commit_root);
4439 +               kfree(root);
4440 +       }
4441  out:
4442         btrfs_end_transaction(trans, tree_root);
4443         kfree(wc);
4444 @@ -5254,6 +5571,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4445         wc->stage = DROP_REFERENCE;
4446         wc->update_ref = 0;
4447         wc->keep_locks = 1;
4448 +       wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
4449  
4450         while (1) {
4451                 wret = walk_down_tree(trans, root, path, wc);
4452 @@ -5396,9 +5714,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
4453         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4454         while (1) {
4455                 int ret;
4456 -               spin_lock(&em_tree->lock);
4457 +               write_lock(&em_tree->lock);
4458                 ret = add_extent_mapping(em_tree, em);
4459 -               spin_unlock(&em_tree->lock);
4460 +               write_unlock(&em_tree->lock);
4461                 if (ret != -EEXIST) {
4462                         free_extent_map(em);
4463                         break;
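
The em_tree locking change just above is a straight spinlock-to-rwlock
conversion: extent map lookups are far more common than insertions, so
readers can now proceed in parallel and only add_extent_mapping() takes
the lock exclusively. The same pattern in portable userspace form
(compile with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t em_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int extent_count;    /* stands in for the extent_map rbtree */

    static void add_extent_mapping(void)
    {
            pthread_rwlock_wrlock(&em_lock);  /* tree is modified */
            extent_count++;
            pthread_rwlock_unlock(&em_lock);
    }

    static int lookup_extent_mapping(void)
    {
            int n;

            pthread_rwlock_rdlock(&em_lock);  /* readers don't block
                                                 each other */
            n = extent_count;
            pthread_rwlock_unlock(&em_lock);
            return n;
    }

    int main(void)
    {
            add_extent_mapping();
            printf("%d mapping(s)\n", lookup_extent_mapping());
            return 0;
    }
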
4464 @@ -6841,287 +7159,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
4465         return 0;
4466  }
4467  
4468 -#if 0
4469 -static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4470 -                                struct btrfs_root *root,
4471 -                                u64 objectid, u64 size)
4472 -{
4473 -       struct btrfs_path *path;
4474 -       struct btrfs_inode_item *item;
4475 -       struct extent_buffer *leaf;
4476 -       int ret;
4477 -
4478 -       path = btrfs_alloc_path();
4479 -       if (!path)
4480 -               return -ENOMEM;
4481 -
4482 -       path->leave_spinning = 1;
4483 -       ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4484 -       if (ret)
4485 -               goto out;
4486 -
4487 -       leaf = path->nodes[0];
4488 -       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4489 -       memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4490 -       btrfs_set_inode_generation(leaf, item, 1);
4491 -       btrfs_set_inode_size(leaf, item, size);
4492 -       btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4493 -       btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
4494 -       btrfs_mark_buffer_dirty(leaf);
4495 -       btrfs_release_path(root, path);
4496 -out:
4497 -       btrfs_free_path(path);
4498 -       return ret;
4499 -}
4500 -
4501 -static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4502 -                                       struct btrfs_block_group_cache *group)
4503 +/*
4504 + * checks to see if it's even possible to relocate this block group.
4505 + *
4506 + * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
4507 + * ok to go ahead and try.
4508 + */
4509 +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
4510  {
4511 -       struct inode *inode = NULL;
4512 -       struct btrfs_trans_handle *trans;
4513 -       struct btrfs_root *root;
4514 -       struct btrfs_key root_key;
4515 -       u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
4516 -       int err = 0;
4517 +       struct btrfs_block_group_cache *block_group;
4518 +       struct btrfs_space_info *space_info;
4519 +       struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4520 +       struct btrfs_device *device;
4521 +       int full = 0;
4522 +       int ret = 0;
4523  
4524 -       root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
4525 -       root_key.type = BTRFS_ROOT_ITEM_KEY;
4526 -       root_key.offset = (u64)-1;
4527 -       root = btrfs_read_fs_root_no_name(fs_info, &root_key);
4528 -       if (IS_ERR(root))
4529 -               return ERR_CAST(root);
4530 +       block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
4531  
4532 -       trans = btrfs_start_transaction(root, 1);
4533 -       BUG_ON(!trans);
4534 +       /* odd, couldn't find the block group, leave it alone */
4535 +       if (!block_group)
4536 +               return -1;
4537  
4538 -       err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
4539 -       if (err)
4540 +       /* no bytes used, we're good */
4541 +       if (!btrfs_block_group_used(&block_group->item))
4542                 goto out;
4543  
4544 -       err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
4545 -       BUG_ON(err);
4546 -
4547 -       err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
4548 -                                      group->key.offset, 0, group->key.offset,
4549 -                                      0, 0, 0);
4550 -       BUG_ON(err);
4551 -
4552 -       inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
4553 -       if (inode->i_state & I_NEW) {
4554 -               BTRFS_I(inode)->root = root;
4555 -               BTRFS_I(inode)->location.objectid = objectid;
4556 -               BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
4557 -               BTRFS_I(inode)->location.offset = 0;
4558 -               btrfs_read_locked_inode(inode);
4559 -               unlock_new_inode(inode);
4560 -               BUG_ON(is_bad_inode(inode));
4561 -       } else {
4562 -               BUG_ON(1);
4563 -       }
4564 -       BTRFS_I(inode)->index_cnt = group->key.objectid;
4565 -
4566 -       err = btrfs_orphan_add(trans, inode);
4567 -out:
4568 -       btrfs_end_transaction(trans, root);
4569 -       if (err) {
4570 -               if (inode)
4571 -                       iput(inode);
4572 -               inode = ERR_PTR(err);
4573 -       }
4574 -       return inode;
4575 -}
4576 -
4577 -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4578 -{
4579 -
4580 -       struct btrfs_ordered_sum *sums;
4581 -       struct btrfs_sector_sum *sector_sum;
4582 -       struct btrfs_ordered_extent *ordered;
4583 -       struct btrfs_root *root = BTRFS_I(inode)->root;
4584 -       struct list_head list;
4585 -       size_t offset;
4586 -       int ret;
4587 -       u64 disk_bytenr;
4588 -
4589 -       INIT_LIST_HEAD(&list);
4590 -
4591 -       ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4592 -       BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4593 -
4594 -       disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4595 -       ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4596 -                                      disk_bytenr + len - 1, &list);
4597 -
4598 -       while (!list_empty(&list)) {
4599 -               sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4600 -               list_del_init(&sums->list);
4601 -
4602 -               sector_sum = sums->sums;
4603 -               sums->bytenr = ordered->start;
4604 +       space_info = block_group->space_info;
4605 +       spin_lock(&space_info->lock);
4606  
4607 -               offset = 0;
4608 -               while (offset < sums->len) {
4609 -                       sector_sum->bytenr += ordered->start - disk_bytenr;
4610 -                       sector_sum++;
4611 -                       offset += root->sectorsize;
4612 -               }
4613 +       full = space_info->full;
4614  
4615 -               btrfs_add_ordered_sum(inode, ordered, sums);
4616 +       /*
4617 +        * if this is the last block group we have in this space, we can't
4618 +        * relocate it unless we're able to allocate a new chunk below.
4619 +        *
4620 +        * Otherwise, we need to make sure we have room in the space to handle
4621 +        * all of the extents from this block group.  If we can, we're good.
4622 +        */
4623 +       if ((space_info->total_bytes != block_group->key.offset) &&
4624 +          (space_info->bytes_used + space_info->bytes_reserved +
4625 +           space_info->bytes_pinned + space_info->bytes_readonly +
4626 +           btrfs_block_group_used(&block_group->item) <
4627 +           space_info->total_bytes)) {
4628 +               spin_unlock(&space_info->lock);
4629 +               goto out;
4630         }
4631 -       btrfs_put_ordered_extent(ordered);
4632 -       return 0;
4633 -}
4634 -
4635 -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
4636 -{
4637 -       struct btrfs_trans_handle *trans;
4638 -       struct btrfs_path *path;
4639 -       struct btrfs_fs_info *info = root->fs_info;
4640 -       struct extent_buffer *leaf;
4641 -       struct inode *reloc_inode;
4642 -       struct btrfs_block_group_cache *block_group;
4643 -       struct btrfs_key key;
4644 -       u64 skipped;
4645 -       u64 cur_byte;
4646 -       u64 total_found;
4647 -       u32 nritems;
4648 -       int ret;
4649 -       int progress;
4650 -       int pass = 0;
4651 -
4652 -       root = root->fs_info->extent_root;
4653 -
4654 -       block_group = btrfs_lookup_block_group(info, group_start);
4655 -       BUG_ON(!block_group);
4656 -
4657 -       printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
4658 -              (unsigned long long)block_group->key.objectid,
4659 -              (unsigned long long)block_group->flags);
4660 -
4661 -       path = btrfs_alloc_path();
4662 -       BUG_ON(!path);
4663 -
4664 -       reloc_inode = create_reloc_inode(info, block_group);
4665 -       BUG_ON(IS_ERR(reloc_inode));
4666 -
4667 -       __alloc_chunk_for_shrink(root, block_group, 1);
4668 -       set_block_group_readonly(block_group);
4669 -
4670 -       btrfs_start_delalloc_inodes(info->tree_root);
4671 -       btrfs_wait_ordered_extents(info->tree_root, 0);
4672 -again:
4673 -       skipped = 0;
4674 -       total_found = 0;
4675 -       progress = 0;
4676 -       key.objectid = block_group->key.objectid;
4677 -       key.offset = 0;
4678 -       key.type = 0;
4679 -       cur_byte = key.objectid;
4680 -
4681 -       trans = btrfs_start_transaction(info->tree_root, 1);
4682 -       btrfs_commit_transaction(trans, info->tree_root);
4683 +       spin_unlock(&space_info->lock);
4684  
4685 -       mutex_lock(&root->fs_info->cleaner_mutex);
4686 -       btrfs_clean_old_snapshots(info->tree_root);
4687 -       btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
4688 -       mutex_unlock(&root->fs_info->cleaner_mutex);
4689 +       /*
4690 +        * ok we don't have enough space, but maybe we have free space on our
4691 +        * devices to allocate new chunks for relocation, so loop through our
4692 +        * alloc devices and guess if we have enough space.  However, if we
4693 +        * were marked as full, then we know there aren't enough chunks, and we
4694 +        * can just return.
4695 +        */
4696 +       ret = -1;
4697 +       if (full)
4698 +               goto out;
4699  
4700 -       trans = btrfs_start_transaction(info->tree_root, 1);
4701 -       btrfs_commit_transaction(trans, info->tree_root);
4702 +       mutex_lock(&root->fs_info->chunk_mutex);
4703 +       list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4704 +               u64 min_free = btrfs_block_group_used(&block_group->item);
4705 +               u64 dev_offset, max_avail;
4706  
4707 -       while (1) {
4708 -               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4709 -               if (ret < 0)
4710 -                       goto out;
4711 -next:
4712 -               leaf = path->nodes[0];
4713 -               nritems = btrfs_header_nritems(leaf);
4714 -               if (path->slots[0] >= nritems) {
4715 -                       ret = btrfs_next_leaf(root, path);
4716 -                       if (ret < 0)
4717 -                               goto out;
4718 -                       if (ret == 1) {
4719 -                               ret = 0;
4720 +               /*
4721 +                * check to make sure we can actually find a chunk with enough
4722 +                * space to fit our block group in.
4723 +                */
4724 +               if (device->total_bytes > device->bytes_used + min_free) {
4725 +                       ret = find_free_dev_extent(NULL, device, min_free,
4726 +                                                  &dev_offset, &max_avail);
4727 +                       if (!ret)
4728                                 break;
4729 -                       }
4730 -                       leaf = path->nodes[0];
4731 -                       nritems = btrfs_header_nritems(leaf);
4732 -               }
4733 -
4734 -               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4735 -
4736 -               if (key.objectid >= block_group->key.objectid +
4737 -                   block_group->key.offset)
4738 -                       break;
4739 -
4740 -               if (progress && need_resched()) {
4741 -                       btrfs_release_path(root, path);
4742 -                       cond_resched();
4743 -                       progress = 0;
4744 -                       continue;
4745 -               }
4746 -               progress = 1;
4747 -
4748 -               if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
4749 -                   key.objectid + key.offset <= cur_byte) {
4750 -                       path->slots[0]++;
4751 -                       goto next;
4752 +                       ret = -1;
4753                 }
4754 -
4755 -               total_found++;
4756 -               cur_byte = key.objectid + key.offset;
4757 -               btrfs_release_path(root, path);
4758 -
4759 -               __alloc_chunk_for_shrink(root, block_group, 0);
4760 -               ret = relocate_one_extent(root, path, &key, block_group,
4761 -                                         reloc_inode, pass);
4762 -               BUG_ON(ret < 0);
4763 -               if (ret > 0)
4764 -                       skipped++;
4765 -
4766 -               key.objectid = cur_byte;
4767 -               key.type = 0;
4768 -               key.offset = 0;
4769         }
4770 -
4771 -       btrfs_release_path(root, path);
4772 -
4773 -       if (pass == 0) {
4774 -               btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
4775 -               invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
4776 -       }
4777 -
4778 -       if (total_found > 0) {
4779 -               printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
4780 -                      (unsigned long long)total_found, pass);
4781 -               pass++;
4782 -               if (total_found == skipped && pass > 2) {
4783 -                       iput(reloc_inode);
4784 -                       reloc_inode = create_reloc_inode(info, block_group);
4785 -                       pass = 0;
4786 -               }
4787 -               goto again;
4788 -       }
4789 -
4790 -       /* delete reloc_inode */
4791 -       iput(reloc_inode);
4792 -
4793 -       /* unpin extents in this range */
4794 -       trans = btrfs_start_transaction(info->tree_root, 1);
4795 -       btrfs_commit_transaction(trans, info->tree_root);
4796 -
4797 -       spin_lock(&block_group->lock);
4798 -       WARN_ON(block_group->pinned > 0);
4799 -       WARN_ON(block_group->reserved > 0);
4800 -       WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
4801 -       spin_unlock(&block_group->lock);
4802 -       btrfs_put_block_group(block_group);
4803 -       ret = 0;
4804 +       mutex_unlock(&root->fs_info->chunk_mutex);
4805  out:
4806 -       btrfs_free_path(path);
4807 +       btrfs_put_block_group(block_group);
4808         return ret;
4809  }
4810 -#endif
4811  
4812  static int find_first_block_group(struct btrfs_root *root,
4813                 struct btrfs_path *path, struct btrfs_key *key)
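
Stripped of locking, the space check in btrfs_can_relocate() is one
inequality: everything already committed in this space_info plus the
bytes we would have to move must still fit under total_bytes, unless the
group being removed is the only one in the space. As plain arithmetic:

    #include <stdio.h>

    struct space_info {
            unsigned long long total_bytes, bytes_used, bytes_reserved,
                               bytes_pinned, bytes_readonly;
    };

    /* 1 if the remaining groups can absorb group_used bytes */
    static int fits_elsewhere(const struct space_info *s,
                              unsigned long long group_size,
                              unsigned long long group_used)
    {
            if (s->total_bytes == group_size)
                    return 0;   /* this group is the whole space */
            return s->bytes_used + s->bytes_reserved + s->bytes_pinned +
                   s->bytes_readonly + group_used < s->total_bytes;
    }

    int main(void)
    {
            struct space_info s = { 1000, 300, 50, 20, 10 };

            /* 300+50+20+10+100 = 480 < 1000: relocation can proceed */
            printf("relocatable: %d\n", fits_elsewhere(&s, 200, 100));
            return 0;
    }
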
4814 @@ -7164,8 +7281,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
4815  {
4816         struct btrfs_block_group_cache *block_group;
4817         struct btrfs_space_info *space_info;
4818 +       struct btrfs_caching_control *caching_ctl;
4819         struct rb_node *n;
4820  
4821 +       down_write(&info->extent_commit_sem);
4822 +       while (!list_empty(&info->caching_block_groups)) {
4823 +               caching_ctl = list_entry(info->caching_block_groups.next,
4824 +                                        struct btrfs_caching_control, list);
4825 +               list_del(&caching_ctl->list);
4826 +               put_caching_control(caching_ctl);
4827 +       }
4828 +       up_write(&info->extent_commit_sem);
4829 +
4830         spin_lock(&info->block_group_cache_lock);
4831         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
4832                 block_group = rb_entry(n, struct btrfs_block_group_cache,
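
The new teardown loop above drains info->caching_block_groups by
unlinking each caching control and dropping the list's reference on it;
put_caching_control() frees the struct once the last reference is gone.
A freestanding sketch of that unlink-and-put pattern:

    #include <stdio.h>
    #include <stdlib.h>

    struct caching_control {
            struct caching_control *next;
            int refs;
    };

    /* drop one reference; free on the last one */
    static void put_caching_control(struct caching_control *ctl)
    {
            if (--ctl->refs == 0)
                    free(ctl);
    }

    static void drain(struct caching_control **head)
    {
            while (*head) {
                    struct caching_control *ctl = *head;

                    *head = ctl->next;          /* unlink first ... */
                    put_caching_control(ctl);   /* ... then drop the ref */
            }
    }

    int main(void)
    {
            struct caching_control *head = NULL;

            for (int i = 0; i < 3; i++) {
                    struct caching_control *c = malloc(sizeof(*c));
                    c->refs = 1;    /* the list's reference */
                    c->next = head;
                    head = c;
            }
            drain(&head);
            printf("list drained: %s\n", head ? "no" : "yes");
            return 0;
    }
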
4833 @@ -7179,8 +7306,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
4834                 up_write(&block_group->space_info->groups_sem);
4835  
4836                 if (block_group->cached == BTRFS_CACHE_STARTED)
4837 -                       wait_event(block_group->caching_q,
4838 -                                  block_group_cache_done(block_group));
4839 +                       wait_block_group_cache_done(block_group);
4840  
4841                 btrfs_remove_free_space_cache(block_group);
4842  
4843 @@ -7250,7 +7376,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4844                 spin_lock_init(&cache->lock);
4845                 spin_lock_init(&cache->tree_lock);
4846                 cache->fs_info = info;
4847 -               init_waitqueue_head(&cache->caching_q);
4848                 INIT_LIST_HEAD(&cache->list);
4849                 INIT_LIST_HEAD(&cache->cluster_list);
4850  
4851 @@ -7272,8 +7397,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4852                 cache->flags = btrfs_block_group_flags(&cache->item);
4853                 cache->sectorsize = root->sectorsize;
4854  
4855 -               remove_sb_from_cache(root, cache);
4856 -
4857                 /*
4858                  * check for two cases, either we are full, and therefore
4859                  * don't need to bother with the caching work since we won't
4860 @@ -7282,13 +7405,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4861                  * time, particularly in the full case.
4862                  */
4863                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
4864 +                       exclude_super_stripes(root, cache);
4865 +                       cache->last_byte_to_unpin = (u64)-1;
4866                         cache->cached = BTRFS_CACHE_FINISHED;
4867 +                       free_excluded_extents(root, cache);
4868                 } else if (btrfs_block_group_used(&cache->item) == 0) {
4869 +                       exclude_super_stripes(root, cache);
4870 +                       cache->last_byte_to_unpin = (u64)-1;
4871                         cache->cached = BTRFS_CACHE_FINISHED;
4872                         add_new_free_space(cache, root->fs_info,
4873                                            found_key.objectid,
4874                                            found_key.objectid +
4875                                            found_key.offset);
4876 +                       free_excluded_extents(root, cache);
4877                 }
4878  
4879                 ret = update_space_info(info, cache->flags, found_key.offset,
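
Both branches above are fast paths that let btrfs skip the background
caching thread entirely: a completely full group has no free space worth
finding, and a completely empty one is free space end to end. Only the
mixed case pays for a scan of the extent tree. The decision by itself:

    #include <stdio.h>

    enum cache_state { CACHE_FINISHED, CACHE_NEEDED };

    static enum cache_state initial_cache_state(unsigned long long used,
                                                unsigned long long size)
    {
            if (used == size || used == 0)
                    return CACHE_FINISHED;  /* nothing to discover */
            return CACHE_NEEDED;            /* partially used: scan */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   initial_cache_state(0, 1024),     /* 0: empty */
                   initial_cache_state(1024, 1024),  /* 0: full  */
                   initial_cache_state(512, 1024));  /* 1: mixed */
            return 0;
    }
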
4880 @@ -7296,6 +7425,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
4881                                         &space_info);
4882                 BUG_ON(ret);
4883                 cache->space_info = space_info;
4884 +               spin_lock(&cache->space_info->lock);
4885 +               cache->space_info->bytes_super += cache->bytes_super;
4886 +               spin_unlock(&cache->space_info->lock);
4887 +
4888                 down_write(&space_info->groups_sem);
4889                 list_add_tail(&cache->list, &space_info->block_groups);
4890                 up_write(&space_info->groups_sem);
4891 @@ -7345,7 +7478,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
4892         atomic_set(&cache->count, 1);
4893         spin_lock_init(&cache->lock);
4894         spin_lock_init(&cache->tree_lock);
4895 -       init_waitqueue_head(&cache->caching_q);
4896         INIT_LIST_HEAD(&cache->list);
4897         INIT_LIST_HEAD(&cache->cluster_list);
4898  
4899 @@ -7354,15 +7486,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
4900         cache->flags = type;
4901         btrfs_set_block_group_flags(&cache->item, type);
4902  
4903 +       cache->last_byte_to_unpin = (u64)-1;
4904         cache->cached = BTRFS_CACHE_FINISHED;
4905 -       remove_sb_from_cache(root, cache);
4906 +       exclude_super_stripes(root, cache);
4907  
4908         add_new_free_space(cache, root->fs_info, chunk_offset,
4909                            chunk_offset + size);
4910  
4911 +       free_excluded_extents(root, cache);
4912 +
4913         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
4914                                 &cache->space_info);
4915         BUG_ON(ret);
4916 +
4917 +       spin_lock(&cache->space_info->lock);
4918 +       cache->space_info->bytes_super += cache->bytes_super;
4919 +       spin_unlock(&cache->space_info->lock);
4920 +
4921         down_write(&cache->space_info->groups_sem);
4922         list_add_tail(&cache->list, &cache->space_info->block_groups);
4923         up_write(&cache->space_info->groups_sem);
4924 @@ -7428,8 +7568,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
4925         up_write(&block_group->space_info->groups_sem);
4926  
4927         if (block_group->cached == BTRFS_CACHE_STARTED)
4928 -               wait_event(block_group->caching_q,
4929 -                          block_group_cache_done(block_group));
4930 +               wait_block_group_cache_done(block_group);
4931  
4932         btrfs_remove_free_space_cache(block_group);
4933  
4934 diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
4935 index 6826018..96577e8 100644
4936 --- a/fs/btrfs/extent_io.c
4937 +++ b/fs/btrfs/extent_io.c
4938 @@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
4939         return NULL;
4940  }
4941  
4942 +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
4943 +                    struct extent_state *other)
4944 +{
4945 +       if (tree->ops && tree->ops->merge_extent_hook)
4946 +               tree->ops->merge_extent_hook(tree->mapping->host, new,
4947 +                                            other);
4948 +}
4949 +
4950  /*
4951   * utility function to look for merge candidates inside a given range.
4952   * Any extents with matching state are merged together into a single
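
merge_cb() above is the usual optional-callback idiom: the tree carries
an ops table, and the hook fires only if the owner registered one, so
trees without an interested owner pay a couple of NULL checks and
nothing more. The shape of it, reduced to standalone C:

    #include <stdio.h>

    struct state { unsigned long long start, end; };

    struct ops {
            void (*merge_hook)(struct state *new, struct state *other);
    };

    static void my_merge(struct state *new, struct state *other)
    {
            printf("merging [%llu,%llu] into [%llu,%llu]\n",
                   other->start, other->end, new->start, new->end);
    }

    /* fire the hook only when the owner registered one */
    static void merge_cb(const struct ops *ops, struct state *new,
                         struct state *other)
    {
            if (ops && ops->merge_hook)
                    ops->merge_hook(new, other);
    }

    int main(void)
    {
            struct ops o = { my_merge };
            struct state a = { 0, 4095 }, b = { 4096, 8191 };

            merge_cb(&o, &a, &b);
            merge_cb(NULL, &a, &b);  /* no ops: silently skipped */
            return 0;
    }
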
4953 @@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
4954                 other = rb_entry(other_node, struct extent_state, rb_node);
4955                 if (other->end == state->start - 1 &&
4956                     other->state == state->state) {
4957 +                       merge_cb(tree, state, other);
4958                         state->start = other->start;
4959                         other->tree = NULL;
4960                         rb_erase(&other->rb_node, &tree->state);
4961 @@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
4962                 other = rb_entry(other_node, struct extent_state, rb_node);
4963                 if (other->start == state->end + 1 &&
4964                     other->state == state->state) {
4965 +                       merge_cb(tree, state, other);
4966                         other->start = state->start;
4967                         state->tree = NULL;
4968                         rb_erase(&state->rb_node, &tree->state);
4969                         free_extent_state(state);
4970 +                       state = NULL;
4971                 }
4972         }
4973 +
4974         return 0;
4975  }
4976  
4977 -static void set_state_cb(struct extent_io_tree *tree,
4978 +static int set_state_cb(struct extent_io_tree *tree,
4979                          struct extent_state *state,
4980                          unsigned long bits)
4981  {
4982         if (tree->ops && tree->ops->set_bit_hook) {
4983 -               tree->ops->set_bit_hook(tree->mapping->host, state->start,
4984 -                                       state->end, state->state, bits);
4985 +               return tree->ops->set_bit_hook(tree->mapping->host,
4986 +                                              state->start, state->end,
4987 +                                              state->state, bits);
4988         }
4989 +
4990 +       return 0;
4991  }
4992  
4993  static void clear_state_cb(struct extent_io_tree *tree,
4994                            struct extent_state *state,
4995                            unsigned long bits)
4996  {
4997 -       if (tree->ops && tree->ops->clear_bit_hook) {
4998 -               tree->ops->clear_bit_hook(tree->mapping->host, state->start,
4999 -                                         state->end, state->state, bits);
5000 -       }
5001 +       if (tree->ops && tree->ops->clear_bit_hook)
5002 +               tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
5003  }
5004  
5005  /*
5006 @@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
5007                         int bits)
5008  {
5009         struct rb_node *node;
5010 +       int ret;
5011  
5012         if (end < start) {
5013                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
5014 @@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
5015                        (unsigned long long)start);
5016                 WARN_ON(1);
5017         }
5018 +       state->start = start;
5019 +       state->end = end;
5020 +       ret = set_state_cb(tree, state, bits);
5021 +       if (ret)
5022 +               return ret;
5023 +
5024         if (bits & EXTENT_DIRTY)
5025                 tree->dirty_bytes += end - start + 1;
5026 -       set_state_cb(tree, state, bits);
5027         state->state |= bits;
5028 -       state->start = start;
5029 -       state->end = end;
5030         node = tree_insert(&tree->state, end, &state->rb_node);
5031         if (node) {
5032                 struct extent_state *found;
5033 @@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
5034         return 0;
5035  }
5036  
5037 +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
5038 +                    u64 split)
5039 +{
5040 +       if (tree->ops && tree->ops->split_extent_hook)
5041 +               return tree->ops->split_extent_hook(tree->mapping->host,
5042 +                                                   orig, split);
5043 +       return 0;
5044 +}
5045 +
5046  /*
5047   * split a given extent state struct in two, inserting the preallocated
5048   * struct 'prealloc' as the newly created second half.  'split' indicates an
5049 @@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
5050                        struct extent_state *prealloc, u64 split)
5051  {
5052         struct rb_node *node;
5053 +
5054 +       split_cb(tree, orig, split);
5055 +
5056         prealloc->start = orig->start;
5057         prealloc->end = split - 1;
5058         prealloc->state = orig->state;
5059 @@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
5060                             struct extent_state *state, int bits, int wake,
5061                             int delete)
5062  {
5063 -       int ret = state->state & bits;
5064 +       int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
5065 +       int ret = state->state & bits_to_clear;
5066  
5067         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
5068                 u64 range = state->end - state->start + 1;
5069 @@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
5070                 tree->dirty_bytes -= range;
5071         }
5072         clear_state_cb(tree, state, bits);
5073 -       state->state &= ~bits;
5074 +       state->state &= ~bits_to_clear;
5075         if (wake)
5076                 wake_up(&state->wq);
5077         if (delete || state->state == 0) {
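
The bits_to_clear dance above protects one sticky bit: whatever mask the
caller passes in, EXTENT_DO_ACCOUNTING is filtered out, so a generic
clear can never wipe accounting state by accident. In isolation (the bit
values here are illustrative, not btrfs's real ones):

    #include <stdio.h>

    #define EXTENT_DIRTY          (1 << 0)
    #define EXTENT_DO_ACCOUNTING  (1 << 1)  /* must survive clears */

    static int clear_state_bits(int *state, int bits)
    {
            int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
            int was_set = *state & bits_to_clear;

            *state &= ~bits_to_clear;
            return was_set;
    }

    int main(void)
    {
            int state = EXTENT_DIRTY | EXTENT_DO_ACCOUNTING;

            clear_state_bits(&state, EXTENT_DIRTY | EXTENT_DO_ACCOUNTING);
            printf("accounting bit kept: %d\n",
                   !!(state & EXTENT_DO_ACCOUNTING));  /* prints 1 */
            return 0;
    }
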
5078 @@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
5079   * bits were already set, or zero if none of the bits were already set.
5080   */
5081  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
5082 -                    int bits, int wake, int delete, gfp_t mask)
5083 +                    int bits, int wake, int delete,
5084 +                    struct extent_state **cached_state,
5085 +                    gfp_t mask)
5086  {
5087         struct extent_state *state;
5088 +       struct extent_state *cached;
5089         struct extent_state *prealloc = NULL;
5090 +       struct rb_node *next_node;
5091         struct rb_node *node;
5092         u64 last_end;
5093         int err;
5094 @@ -488,6 +522,17 @@ again:
5095         }
5096  
5097         spin_lock(&tree->lock);
5098 +       if (cached_state) {
5099 +               cached = *cached_state;
5100 +               *cached_state = NULL;
5101 +               cached_state = NULL;
5102 +               if (cached && cached->tree && cached->start == start) {
5103 +                       atomic_dec(&cached->refs);
5104 +                       state = cached;
5105 +                       goto hit_next;
5106 +               }
5107 +               free_extent_state(cached);
5108 +       }
5109         /*
5110          * this search will find the extents that end after
5111          * our range starts
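
The cached_state block above is a one-entry cache in front of the rbtree:
if the caller hands back the extent_state it got from a previous call,
and that state is still in the tree at the right offset, the walk starts
there and the tree search is skipped entirely. The same idea in
miniature:

    #include <stdio.h>

    /* toy state; the kernel caches a struct extent_state * */
    struct state { unsigned long long start, end; int in_tree; };

    static struct state *slow_tree_search(unsigned long long start)
    {
            static struct state s = { 0, 4095, 1 };

            printf("rbtree search for %llu\n", start);
            return &s;
    }

    /* use the caller's cached state when it still matches */
    static struct state *find_state(struct state **cached,
                                    unsigned long long start)
    {
            struct state *s = cached ? *cached : NULL;

            if (s && s->in_tree && s->start == start)
                    return s;   /* fast path: no tree walk */
            return slow_tree_search(start);
    }

    int main(void)
    {
            struct state *cached = slow_tree_search(0);  /* prime it */
            struct state *s = find_state(&cached, 0);    /* cache hit */

            printf("found [%llu, %llu]\n", s->start, s->end);
            return 0;
    }
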
5112 @@ -496,6 +541,7 @@ again:
5113         if (!node)
5114                 goto out;
5115         state = rb_entry(node, struct extent_state, rb_node);
5116 +hit_next:
5117         if (state->start > end)
5118                 goto out;
5119         WARN_ON(state->end < start);
5120 @@ -526,13 +572,11 @@ again:
5121                 if (err)
5122                         goto out;
5123                 if (state->end <= end) {
5124 -                       set |= clear_state_bit(tree, state, bits,
5125 -                                       wake, delete);
5126 +                       set |= clear_state_bit(tree, state, bits, wake,
5127 +                                              delete);
5128                         if (last_end == (u64)-1)
5129                                 goto out;
5130                         start = last_end + 1;
5131 -               } else {
5132 -                       start = state->start;
5133                 }
5134                 goto search_again;
5135         }
5136 @@ -547,19 +591,30 @@ again:
5137                         prealloc = alloc_extent_state(GFP_ATOMIC);
5138                 err = split_state(tree, state, prealloc, end + 1);
5139                 BUG_ON(err == -EEXIST);
5140 -
5141                 if (wake)
5142                         wake_up(&state->wq);
5143 -               set |= clear_state_bit(tree, prealloc, bits,
5144 -                                      wake, delete);
5145 +
5146 +               set |= clear_state_bit(tree, prealloc, bits, wake, delete);
5147 +
5148                 prealloc = NULL;
5149                 goto out;
5150         }
5151  
5152 +       if (state->end < end && prealloc && !need_resched())
5153 +               next_node = rb_next(&state->rb_node);
5154 +       else
5155 +               next_node = NULL;
5156 +
5157         set |= clear_state_bit(tree, state, bits, wake, delete);
5158         if (last_end == (u64)-1)
5159                 goto out;
5160         start = last_end + 1;
5161 +       if (start <= end && next_node) {
5162 +               state = rb_entry(next_node, struct extent_state,
5163 +                                rb_node);
5164 +               if (state->start == start)
5165 +                       goto hit_next;
5166 +       }
5167         goto search_again;
5168  
5169  out:
5170 @@ -641,40 +696,59 @@ out:
5171         return 0;
5172  }
5173  
5174 -static void set_state_bits(struct extent_io_tree *tree,
5175 +static int set_state_bits(struct extent_io_tree *tree,
5176                            struct extent_state *state,
5177                            int bits)
5178  {
5179 +       int ret;
5180 +
5181 +       ret = set_state_cb(tree, state, bits);
5182 +       if (ret)
5183 +               return ret;
5184 +
5185         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
5186                 u64 range = state->end - state->start + 1;
5187                 tree->dirty_bytes += range;
5188         }
5189 -       set_state_cb(tree, state, bits);
5190         state->state |= bits;
5191 +
5192 +       return 0;
5193 +}
5194 +
5195 +static void cache_state(struct extent_state *state,
5196 +                       struct extent_state **cached_ptr)
5197 +{
5198 +       if (cached_ptr && !(*cached_ptr)) {
5199 +               if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
5200 +                       *cached_ptr = state;
5201 +                       atomic_inc(&state->refs);
5202 +               }
5203 +       }
5204  }
5205  
5206  /*
5207 - * set some bits on a range in the tree.  This may require allocations
5208 - * or sleeping, so the gfp mask is used to indicate what is allowed.
5209 + * set some bits on a range in the tree.  This may require allocations or
5210 + * sleeping, so the gfp mask is used to indicate what is allowed.
5211   *
5212 - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
5213 - * range already has the desired bits set.  The start of the existing
5214 - * range is returned in failed_start in this case.
5215 + * If any of the exclusive bits are set, this will fail with -EEXIST if some
5216 + * part of the range already has the desired bits set.  The start of the
5217 + * existing range is returned in failed_start in this case.
5218   *
5219 - * [start, end] is inclusive
5220 - * This takes the tree lock.
5221 + * [start, end] is inclusive.  This takes the tree lock.
5222   */
5224  static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
5225 -                         int bits, int exclusive, u64 *failed_start,
5226 +                         int bits, int exclusive_bits, u64 *failed_start,
5227 +                         struct extent_state **cached_state,
5228                           gfp_t mask)
5229  {
5230         struct extent_state *state;
5231         struct extent_state *prealloc = NULL;
5232         struct rb_node *node;
5233         int err = 0;
5234 -       int set;
5235         u64 last_start;
5236         u64 last_end;
5237 +
5238  again:
5239         if (!prealloc && (mask & __GFP_WAIT)) {
5240                 prealloc = alloc_extent_state(mask);
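cache_state() above pins at most one extent_state for the caller: the slot is filled once, a reference is taken so the state cannot be freed behind the caller's back, and the caller owns that reference afterwards. A small refcounting sketch of that contract, using made-up names rather than the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct state {
        int refs;
        unsigned long long start, end;
};

static void get_state(struct state *s)
{
        s->refs++;
}

static void put_state(struct state *s)
{
        if (--s->refs == 0)
                free(s);
}

/* fill the cache slot once, pinning the object for the caller */
static void cache_state(struct state *s, struct state **cached)
{
        if (cached && !*cached) {
                *cached = s;
                get_state(s);
        }
}

int main(void)
{
        struct state *s = malloc(sizeof(*s));
        struct state *cached = NULL;

        s->refs = 1;
        s->start = 0;
        s->end = 4095;

        cache_state(s, &cached);        /* pins s: refs == 2 */
        put_state(s);                   /* the tree drops its reference */
        printf("cached [%llu, %llu], refs %d\n",
               cached->start, cached->end, cached->refs);
        put_state(cached);              /* consumer drops the pinned ref */
        return 0;
}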
5241 @@ -683,6 +757,13 @@ again:
5242         }
5243  
5244         spin_lock(&tree->lock);
5245 +       if (cached_state && *cached_state) {
5246 +               state = *cached_state;
5247 +               if (state->start == start && state->tree) {
5248 +                       node = &state->rb_node;
5249 +                       goto hit_next;
5250 +               }
5251 +       }
5252         /*
5253          * this search will find all the extents that end after
5254          * our range starts.
5255 @@ -694,8 +775,8 @@ again:
5256                 BUG_ON(err == -EEXIST);
5257                 goto out;
5258         }
5259 -
5260         state = rb_entry(node, struct extent_state, rb_node);
5261 +hit_next:
5262         last_start = state->start;
5263         last_end = state->end;
5264  
5265 @@ -706,17 +787,32 @@ again:
5266          * Just lock what we found and keep going
5267          */
5268         if (state->start == start && state->end <= end) {
5269 -               set = state->state & bits;
5270 -               if (set && exclusive) {
5271 +               struct rb_node *next_node;
5272 +               if (state->state & exclusive_bits) {
5273                         *failed_start = state->start;
5274                         err = -EEXIST;
5275                         goto out;
5276                 }
5277 -               set_state_bits(tree, state, bits);
5278 +
5279 +               err = set_state_bits(tree, state, bits);
5280 +               if (err)
5281 +                       goto out;
5282 +
5283 +               cache_state(state, cached_state);
5284                 merge_state(tree, state);
5285                 if (last_end == (u64)-1)
5286                         goto out;
5287 +
5288                 start = last_end + 1;
5289 +               if (start < end && prealloc && !need_resched()) {
5290 +                       next_node = rb_next(node);
5291 +                       if (next_node) {
5292 +                               state = rb_entry(next_node, struct extent_state,
5293 +                                                rb_node);
5294 +                               if (state->start == start)
5295 +                                       goto hit_next;
5296 +                       }
5297 +               }
5298                 goto search_again;
5299         }
5300  
5301 @@ -737,8 +833,7 @@ again:
5302          * desired bit on it.
5303          */
5304         if (state->start < start) {
5305 -               set = state->state & bits;
5306 -               if (exclusive && set) {
5307 +               if (state->state & exclusive_bits) {
5308                         *failed_start = start;
5309                         err = -EEXIST;
5310                         goto out;
5311 @@ -749,13 +844,14 @@ again:
5312                 if (err)
5313                         goto out;
5314                 if (state->end <= end) {
5315 -                       set_state_bits(tree, state, bits);
5316 +                       err = set_state_bits(tree, state, bits);
5317 +                       if (err)
5318 +                               goto out;
5319 +                       cache_state(state, cached_state);
5320                         merge_state(tree, state);
5321                         if (last_end == (u64)-1)
5322                                 goto out;
5323                         start = last_end + 1;
5324 -               } else {
5325 -                       start = state->start;
5326                 }
5327                 goto search_again;
5328         }
5329 @@ -774,10 +870,13 @@ again:
5330                         this_end = last_start - 1;
5331                 err = insert_state(tree, prealloc, start, this_end,
5332                                    bits);
5333 -               prealloc = NULL;
5334                 BUG_ON(err == -EEXIST);
5335 -               if (err)
5336 +               if (err) {
5337 +                       prealloc = NULL;
5338                         goto out;
5339 +               }
5340 +               cache_state(prealloc, cached_state);
5341 +               prealloc = NULL;
5342                 start = this_end + 1;
5343                 goto search_again;
5344         }
5345 @@ -788,8 +887,7 @@ again:
5346          * on the first half
5347          */
5348         if (state->start <= end && state->end > end) {
5349 -               set = state->state & bits;
5350 -               if (exclusive && set) {
5351 +               if (state->state & exclusive_bits) {
5352                         *failed_start = start;
5353                         err = -EEXIST;
5354                         goto out;
5355 @@ -797,7 +895,12 @@ again:
5356                 err = split_state(tree, state, prealloc, end + 1);
5357                 BUG_ON(err == -EEXIST);
5358  
5359 -               set_state_bits(tree, prealloc, bits);
5360 +               err = set_state_bits(tree, prealloc, bits);
5361 +               if (err) {
5362 +                       prealloc = NULL;
5363 +                       goto out;
5364 +               }
5365 +               cache_state(prealloc, cached_state);
5366                 merge_state(tree, prealloc);
5367                 prealloc = NULL;
5368                 goto out;
5369 @@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
5370                      gfp_t mask)
5371  {
5372         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
5373 -                             mask);
5374 -}
5375 -
5376 -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
5377 -                      gfp_t mask)
5378 -{
5379 -       return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
5380 +                             NULL, mask);
5381  }
5382  
5383  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
5384                     int bits, gfp_t mask)
5385  {
5386         return set_extent_bit(tree, start, end, bits, 0, NULL,
5387 -                             mask);
5388 +                             NULL, mask);
5389  }
5390  
5391  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
5392                       int bits, gfp_t mask)
5393  {
5394 -       return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
5395 +       return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
5396  }
5397  
5398  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
5399                      gfp_t mask)
5400  {
5401         return set_extent_bit(tree, start, end,
5402 -                             EXTENT_DELALLOC | EXTENT_DIRTY,
5403 -                             0, NULL, mask);
5404 +                             EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
5405 +                             0, NULL, NULL, mask);
5406  }
5407  
5408  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
5409                        gfp_t mask)
5410  {
5411         return clear_extent_bit(tree, start, end,
5412 -                               EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
5413 -}
5414 -
5415 -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
5416 -                        gfp_t mask)
5417 -{
5418 -       return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
5419 +                               EXTENT_DIRTY | EXTENT_DELALLOC |
5420 +                               EXTENT_DO_ACCOUNTING, 0, 0,
5421 +                               NULL, mask);
5422  }
5423  
5424  int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
5425                      gfp_t mask)
5426  {
5427         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
5428 -                             mask);
5429 +                             NULL, mask);
5430  }
5431  
5432  static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
5433                        gfp_t mask)
5434  {
5435 -       return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
5436 +       return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
5437 +                               NULL, mask);
5438  }
5439  
5440  int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
5441                         gfp_t mask)
5442  {
5443         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
5444 -                             mask);
5445 +                             NULL, mask);
5446  }
5447  
5448  static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
5449                                  u64 end, gfp_t mask)
5450  {
5451 -       return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
5452 -}
5453 -
5454 -static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
5455 -                        gfp_t mask)
5456 -{
5457 -       return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
5458 -                             0, NULL, mask);
5459 -}
5460 -
5461 -static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
5462 -                                 u64 end, gfp_t mask)
5463 -{
5464 -       return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
5465 +       return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
5466 +                               NULL, mask);
5467  }
5468  
5469  int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
5470 @@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
5471   * either insert or lock state struct between start and end use mask to tell
5472   * us if waiting is desired.
5473   */
5474 -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
5475 +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
5476 +                    int bits, struct extent_state **cached_state, gfp_t mask)
5477  {
5478         int err;
5479         u64 failed_start;
5480         while (1) {
5481 -               err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
5482 -                                    &failed_start, mask);
5483 +               err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
5484 +                                    EXTENT_LOCKED, &failed_start,
5485 +                                    cached_state, mask);
5486                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
5487                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
5488                         start = failed_start;
5489 @@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
5490         return err;
5491  }
5492  
5493 +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
5494 +{
5495 +       return lock_extent_bits(tree, start, end, 0, NULL, mask);
5496 +}
5497 +
5498  int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
5499                     gfp_t mask)
5500  {
5501         int err;
5502         u64 failed_start;
5503  
5504 -       err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
5505 -                            &failed_start, mask);
5506 +       err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
5507 +                            &failed_start, NULL, mask);
5508         if (err == -EEXIST) {
5509                 if (failed_start > start)
5510                         clear_extent_bit(tree, start, failed_start - 1,
5511 -                                        EXTENT_LOCKED, 1, 0, mask);
5512 +                                        EXTENT_LOCKED, 1, 0, NULL, mask);
5513                 return 0;
5514         }
5515         return 1;
5516  }
5517  
5518 +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
5519 +                        struct extent_state **cached, gfp_t mask)
5520 +{
5521 +       return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
5522 +                               mask);
5523 +}
5524 +
5525  int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
5526                   gfp_t mask)
5527  {
5528 -       return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
5529 +       return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
5530 +                               mask);
5531  }
5532  
5533  /*
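For reference, the caller-side pattern for the new cached-state API, condensed from find_lock_delalloc_range further down in this patch. This is an illustrative fragment, not a standalone program; note that unlock_extent_cached() consumes the cached reference, while the path that keeps the lock must drop it explicitly with free_extent_state():

        struct extent_state *cached_state = NULL;

        lock_extent_bits(tree, delalloc_start, delalloc_end, 0,
                         &cached_state, GFP_NOFS);

        if (!test_range_bit(tree, delalloc_start, delalloc_end,
                            EXTENT_DELALLOC, 1, cached_state)) {
                /* lost a race: unlock consumes the cached reference */
                unlock_extent_cached(tree, delalloc_start, delalloc_end,
                                     &cached_state, GFP_NOFS);
                /* ... back off and retry ... */
        } else {
                /* keep the lock; drop our cached reference by hand */
                free_extent_state(cached_state);
        }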
5534 @@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
5535                 page_cache_release(page);
5536                 index++;
5537         }
5538 -       set_extent_dirty(tree, start, end, GFP_NOFS);
5539         return 0;
5540  }
5541  
5542 @@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
5543                 page_cache_release(page);
5544                 index++;
5545         }
5546 -       set_extent_writeback(tree, start, end, GFP_NOFS);
5547         return 0;
5548  }
5549  
5550 @@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
5551         u64 delalloc_start;
5552         u64 delalloc_end;
5553         u64 found;
5554 +       struct extent_state *cached_state = NULL;
5555         int ret;
5556         int loops = 0;
5557  
5558 @@ -1269,6 +1365,7 @@ again:
5559                 /* some of the pages are gone, lets avoid looping by
5560                  * shortening the size of the delalloc range we're searching
5561                  */
5562 +               free_extent_state(cached_state);
5563                 if (!loops) {
5564                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
5565                         max_bytes = PAGE_CACHE_SIZE - offset;
5566 @@ -1282,18 +1379,21 @@ again:
5567         BUG_ON(ret);
5568  
5569         /* step three, lock the state bits for the whole range */
5570 -       lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
5571 +       lock_extent_bits(tree, delalloc_start, delalloc_end,
5572 +                        0, &cached_state, GFP_NOFS);
5573  
5574         /* then test to make sure it is all still delalloc */
5575         ret = test_range_bit(tree, delalloc_start, delalloc_end,
5576 -                            EXTENT_DELALLOC, 1);
5577 +                            EXTENT_DELALLOC, 1, cached_state);
5578         if (!ret) {
5579 -               unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
5580 +               unlock_extent_cached(tree, delalloc_start, delalloc_end,
5581 +                                    &cached_state, GFP_NOFS);
5582                 __unlock_for_delalloc(inode, locked_page,
5583                               delalloc_start, delalloc_end);
5584                 cond_resched();
5585                 goto again;
5586         }
5587 +       free_extent_state(cached_state);
5588         *start = delalloc_start;
5589         *end = delalloc_end;
5590  out_failed:
5591 @@ -1303,11 +1403,7 @@ out_failed:
5592  int extent_clear_unlock_delalloc(struct inode *inode,
5593                                 struct extent_io_tree *tree,
5594                                 u64 start, u64 end, struct page *locked_page,
5595 -                               int unlock_pages,
5596 -                               int clear_unlock,
5597 -                               int clear_delalloc, int clear_dirty,
5598 -                               int set_writeback,
5599 -                               int end_writeback)
5600 +                               unsigned long op)
5601  {
5602         int ret;
5603         struct page *pages[16];
5604 @@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
5605         int i;
5606         int clear_bits = 0;
5607  
5608 -       if (clear_unlock)
5609 +       if (op & EXTENT_CLEAR_UNLOCK)
5610                 clear_bits |= EXTENT_LOCKED;
5611 -       if (clear_dirty)
5612 +       if (op & EXTENT_CLEAR_DIRTY)
5613                 clear_bits |= EXTENT_DIRTY;
5614  
5615 -       if (clear_delalloc)
5616 +       if (op & EXTENT_CLEAR_DELALLOC)
5617                 clear_bits |= EXTENT_DELALLOC;
5618  
5619 -       clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
5620 -       if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
5621 +       if (op & EXTENT_CLEAR_ACCOUNTING)
5622 +               clear_bits |= EXTENT_DO_ACCOUNTING;
5623 +
5624 +       clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
5625 +       if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
5626 +                   EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
5627 +                   EXTENT_SET_PRIVATE2)))
5628                 return 0;
5629  
5630         while (nr_pages > 0) {
5631 @@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
5632                                      min_t(unsigned long,
5633                                      nr_pages, ARRAY_SIZE(pages)), pages);
5634                 for (i = 0; i < ret; i++) {
5635 +
5636 +                       if (op & EXTENT_SET_PRIVATE2)
5637 +                               SetPagePrivate2(pages[i]);
5638 +
5639                         if (pages[i] == locked_page) {
5640                                 page_cache_release(pages[i]);
5641                                 continue;
5642                         }
5643 -                       if (clear_dirty)
5644 +                       if (op & EXTENT_CLEAR_DIRTY)
5645                                 clear_page_dirty_for_io(pages[i]);
5646 -                       if (set_writeback)
5647 +                       if (op & EXTENT_SET_WRITEBACK)
5648                                 set_page_writeback(pages[i]);
5649 -                       if (end_writeback)
5650 +                       if (op & EXTENT_END_WRITEBACK)
5651                                 end_page_writeback(pages[i]);
5652 -                       if (unlock_pages)
5653 +                       if (op & EXTENT_CLEAR_UNLOCK_PAGE)
5654                                 unlock_page(pages[i]);
5655                         page_cache_release(pages[i]);
5656                 }
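The six boolean parameters collapse into a single op bitmask, so a caller now ORs together exactly the cleanup steps it wants. A hypothetical call, with a flag combination chosen for illustration only (real callers pick the subset matching their error or completion path):

        extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                     start, end, locked_page,
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_UNLOCK |
                                     EXTENT_CLEAR_DELALLOC |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);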
5657 @@ -1476,14 +1581,17 @@ out:
5658   * range is found set.
5659   */
5660  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
5661 -                  int bits, int filled)
5662 +                  int bits, int filled, struct extent_state *cached)
5663  {
5664         struct extent_state *state = NULL;
5665         struct rb_node *node;
5666         int bitset = 0;
5667  
5668         spin_lock(&tree->lock);
5669 -       node = tree_search(tree, start);
5670 +       if (cached && cached->tree && cached->start == start)
5671 +               node = &cached->rb_node;
5672 +       else
5673 +               node = tree_search(tree, start);
5674         while (node && start <= end) {
5675                 state = rb_entry(node, struct extent_state, rb_node);
5676  
5677 @@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
5678                         bitset = 0;
5679                         break;
5680                 }
5681 +
5682 +               if (state->end == (u64)-1)
5683 +                       break;
5684 +
5685                 start = state->end + 1;
5686                 if (start > end)
5687                         break;
5688 @@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
5689  {
5690         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
5691         u64 end = start + PAGE_CACHE_SIZE - 1;
5692 -       if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
5693 +       if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
5694                 SetPageUptodate(page);
5695         return 0;
5696  }
5697 @@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree,
5698  {
5699         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
5700         u64 end = start + PAGE_CACHE_SIZE - 1;
5701 -       if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
5702 +       if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
5703                 unlock_page(page);
5704         return 0;
5705  }
5706 @@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree,
5707  static int check_page_writeback(struct extent_io_tree *tree,
5708                              struct page *page)
5709  {
5710 -       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
5711 -       u64 end = start + PAGE_CACHE_SIZE - 1;
5712 -       if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
5713 -               end_page_writeback(page);
5714 +       end_page_writeback(page);
5715         return 0;
5716  }
5717  
5718 @@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
5719                 }
5720  
5721                 if (!uptodate) {
5722 -                       clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
5723 +                       clear_extent_uptodate(tree, start, end, GFP_NOFS);
5724                         ClearPageUptodate(page);
5725                         SetPageError(page);
5726                 }
5727  
5728 -               clear_extent_writeback(tree, start, end, GFP_ATOMIC);
5729 -
5730                 if (whole_page)
5731                         end_page_writeback(page);
5732                 else
5733 @@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
5734                         continue;
5735                 }
5736                 /* the get_extent function already copied into the page */
5737 -               if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
5738 +               if (test_range_bit(tree, cur, cur_end,
5739 +                                  EXTENT_UPTODATE, 1, NULL)) {
5740                         check_page_uptodate(tree, page);
5741                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
5742                         cur = cur + iosize;
5743 @@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5744         u64 iosize;
5745         u64 unlock_start;
5746         sector_t sector;
5747 +       struct extent_state *cached_state = NULL;
5748         struct extent_map *em;
5749         struct block_device *bdev;
5750         int ret;
5751 @@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5752         delalloc_end = 0;
5753         page_started = 0;
5754         if (!epd->extent_locked) {
5755 +               u64 delalloc_to_write = 0;
5756                 /*
5757                  * make sure the wbc mapping index is at least updated
5758                  * to this page.
5759 @@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5760                         tree->ops->fill_delalloc(inode, page, delalloc_start,
5761                                                  delalloc_end, &page_started,
5762                                                  &nr_written);
5763 +                       /*
5764 +                        * delalloc_end is inclusive, so adding a full
5765 +                        * PAGE_CACHE_SIZE (not PAGE_CACHE_SIZE - 1) folds
5766 +                        * in the +1 byte and rounds up to whole pages
5767 +                        */
5768 +                       delalloc_to_write += (delalloc_end - delalloc_start +
5769 +                                             PAGE_CACHE_SIZE) >>
5770 +                                             PAGE_CACHE_SHIFT;
5771                         delalloc_start = delalloc_end + 1;
5772                 }
5773 +               if (wbc->nr_to_write < delalloc_to_write) {
5774 +                       int thresh = 8192;
5775 +
5776 +                       if (delalloc_to_write < thresh * 2)
5777 +                               thresh = delalloc_to_write;
5778 +                       wbc->nr_to_write = min_t(u64, delalloc_to_write,
5779 +                                                thresh);
5780 +               }
5781  
5782                 /* did the fill delalloc function already unlock and start
5783                  * the IO?
5784 @@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5785                         goto done_unlocked;
5786                 }
5787         }
5788 -       lock_extent(tree, start, page_end, GFP_NOFS);
5789 -
5790 -       unlock_start = start;
5791 -
5792         if (tree->ops && tree->ops->writepage_start_hook) {
5793                 ret = tree->ops->writepage_start_hook(page, start,
5794                                                       page_end);
5795                 if (ret == -EAGAIN) {
5796 -                       unlock_extent(tree, start, page_end, GFP_NOFS);
5797                         redirty_page_for_writepage(wbc, page);
5798                         update_nr_written(page, wbc, nr_written);
5799                         unlock_page(page);
5800 @@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5801         update_nr_written(page, wbc, nr_written + 1);
5802  
5803         end = page_end;
5804 -       if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
5805 -               printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
5806 -
5807         if (last_byte <= start) {
5808 -               clear_extent_dirty(tree, start, page_end, GFP_NOFS);
5809 -               unlock_extent(tree, start, page_end, GFP_NOFS);
5810                 if (tree->ops && tree->ops->writepage_end_io_hook)
5811                         tree->ops->writepage_end_io_hook(page, start,
5812                                                          page_end, NULL, 1);
5813 @@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5814                 goto done;
5815         }
5816  
5817 -       set_extent_uptodate(tree, start, page_end, GFP_NOFS);
5818         blocksize = inode->i_sb->s_blocksize;
5819  
5820         while (cur <= end) {
5821                 if (cur >= last_byte) {
5822 -                       clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
5823 -                       unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
5824                         if (tree->ops && tree->ops->writepage_end_io_hook)
5825                                 tree->ops->writepage_end_io_hook(page, cur,
5826                                                          page_end, NULL, 1);
5827 @@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5828                  */
5829                 if (compressed || block_start == EXTENT_MAP_HOLE ||
5830                     block_start == EXTENT_MAP_INLINE) {
5831 -                       clear_extent_dirty(tree, cur,
5832 -                                          cur + iosize - 1, GFP_NOFS);
5833 -
5834 -                       unlock_extent(tree, unlock_start, cur + iosize - 1,
5835 -                                     GFP_NOFS);
5836 -
5837                         /*
5838                          * end_io notification does not happen here for
5839                          * compressed extents
5840 @@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
5841                 }
5842                 /* leave this out until we have a page_mkwrite call */
5843                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
5844 -                                  EXTENT_DIRTY, 0)) {
5845 +                                  EXTENT_DIRTY, 0, NULL)) {
5846                         cur = cur + iosize;
5847                         pg_offset += iosize;
5848                         continue;
5849                 }
5850  
5851 -               clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
5852                 if (tree->ops && tree->ops->writepage_io_hook) {
5853                         ret = tree->ops->writepage_io_hook(page, cur,
5854                                                 cur + iosize - 1);
5855 @@ -2309,12 +2415,12 @@ done:
5856                 set_page_writeback(page);
5857                 end_page_writeback(page);
5858         }
5859 -       if (unlock_start <= page_end)
5860 -               unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
5861         unlock_page(page);
5862  
5863  done_unlocked:
5864  
5865 +       /* drop our reference on any cached states */
5866 +       free_extent_state(cached_state);
5867         return 0;
5868  }
5869  
5870 @@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
5871                              writepage_t writepage, void *data,
5872                              void (*flush_fn)(void *))
5873  {
5874 -       struct backing_dev_info *bdi = mapping->backing_dev_info;
5875         int ret = 0;
5876         int done = 0;
5877 +       int nr_to_write_done = 0;
5878         struct pagevec pvec;
5879         int nr_pages;
5880         pgoff_t index;
5881 @@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
5882                 scanned = 1;
5883         }
5884  retry:
5885 -       while (!done && (index <= end) &&
5886 +       while (!done && !nr_to_write_done && (index <= end) &&
5887                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
5888                               PAGECACHE_TAG_DIRTY, min(end - index,
5889                                   (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
5890 @@ -2412,12 +2518,15 @@ retry:
5891                                 unlock_page(page);
5892                                 ret = 0;
5893                         }
5894 -                       if (ret || wbc->nr_to_write <= 0)
5895 -                               done = 1;
5896 -                       if (wbc->nonblocking && bdi_write_congested(bdi)) {
5897 -                               wbc->encountered_congestion = 1;
5898 +                       if (ret)
5899                                 done = 1;
5900 -                       }
5901 +
5902 +                       /*
5903 +                        * the filesystem may choose to bump up nr_to_write.
5904 +                        * We have to make sure to honor the new nr_to_write
5905 +                        * at any time
5906 +                        */
5907 +                       nr_to_write_done = wbc->nr_to_write <= 0;
5908                 }
5909                 pagevec_release(&pvec);
5910                 cond_resched();
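The two writeback hunks cooperate: __extent_writepage above converts the inclusive delalloc byte range into a page count and may raise wbc->nr_to_write (capped by an 8192-page threshold), and the loop here re-checks nr_to_write on every page so the bumped budget is honored at any time. A standalone model of that arithmetic, assuming 4K pages:

#include <stdio.h>

#define PAGE_CACHE_SIZE  4096ULL
#define PAGE_CACHE_SHIFT 12

int main(void)
{
        unsigned long long delalloc_start = 0;
        unsigned long long delalloc_end = (128ULL << 20) - 1; /* 128M, inclusive */
        unsigned long long delalloc_to_write;
        long long nr_to_write = 1024;
        long long thresh = 8192;

        /* delalloc_end is inclusive, so adding PAGE_CACHE_SIZE rounds up */
        delalloc_to_write = (delalloc_end - delalloc_start +
                             PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT;

        if (nr_to_write < (long long)delalloc_to_write) {
                if ((long long)delalloc_to_write < thresh * 2)
                        thresh = delalloc_to_write;
                nr_to_write = thresh < (long long)delalloc_to_write ?
                              thresh : (long long)delalloc_to_write;
        }
        printf("pages of delalloc: %llu, nr_to_write: %lld\n",
               delalloc_to_write, nr_to_write);
        return 0;
}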
5911 @@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
5912                 return 0;
5913  
5914         lock_extent(tree, start, end, GFP_NOFS);
5915 -       wait_on_extent_writeback(tree, start, end);
5916 +       wait_on_page_writeback(page);
5917         clear_extent_bit(tree, start, end,
5918 -                        EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
5919 -                        1, 1, GFP_NOFS);
5920 +                        EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5921 +                        EXTENT_DO_ACCOUNTING,
5922 +                        1, 1, NULL, GFP_NOFS);
5923         return 0;
5924  }
5925  
5926 @@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
5927                     !isnew && !PageUptodate(page) &&
5928                     (block_off_end > to || block_off_start < from) &&
5929                     !test_range_bit(tree, block_start, cur_end,
5930 -                                   EXTENT_UPTODATE, 1)) {
5931 +                                   EXTENT_UPTODATE, 1, NULL)) {
5932                         u64 sector;
5933                         u64 extent_offset = block_start - em->start;
5934                         size_t iosize;
5935 @@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
5936                          */
5937                         set_extent_bit(tree, block_start,
5938                                        block_start + iosize - 1,
5939 -                                      EXTENT_LOCKED, 0, NULL, GFP_NOFS);
5940 +                                      EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
5941                         ret = submit_extent_page(READ, tree, page,
5942                                          sector, iosize, page_offset, em->bdev,
5943                                          NULL, 1,
5944 @@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map,
5945         int ret = 1;
5946  
5947         if (test_range_bit(tree, start, end,
5948 -                          EXTENT_IOBITS | EXTENT_ORDERED, 0))
5949 +                          EXTENT_IOBITS, 0, NULL))
5950                 ret = 0;
5951         else {
5952                 if ((mask & GFP_NOFS) == GFP_NOFS)
5953                         mask = GFP_NOFS;
5954 -               clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
5955 -                                1, 1, mask);
5956 +               /*
5957 +                * at this point we can safely clear everything except the
5958 +                * locked bit and the nodatasum bit
5959 +                */
5960 +               clear_extent_bit(tree, start, end,
5961 +                                ~(EXTENT_LOCKED | EXTENT_NODATASUM),
5962 +                                0, 0, NULL, mask);
5963         }
5964         return ret;
5965  }
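Passing ~(EXTENT_LOCKED | EXTENT_NODATASUM) as the bits argument clears every bit except the ones named, since clearing the complement of a mask preserves exactly that mask. The trick in isolation, with bit positions made up for the demo:

#include <stdio.h>

#define BIT_LOCKED    (1u << 3)
#define BIT_NODATASUM (1u << 10)

int main(void)
{
        unsigned int state = 0xffffu;                   /* everything set */
        unsigned int clear = ~(BIT_LOCKED | BIT_NODATASUM);

        state &= ~clear;        /* only LOCKED and NODATASUM survive */
        printf("0x%x\n", state);        /* prints 0x408 */
        return 0;
}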
5966 @@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
5967                 u64 len;
5968                 while (start <= end) {
5969                         len = end - start + 1;
5970 -                       spin_lock(&map->lock);
5971 +                       write_lock(&map->lock);
5972                         em = lookup_extent_mapping(map, start, len);
5973                         if (!em || IS_ERR(em)) {
5974 -                               spin_unlock(&map->lock);
5975 +                               write_unlock(&map->lock);
5976                                 break;
5977                         }
5978                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5979                             em->start != start) {
5980 -                               spin_unlock(&map->lock);
5981 +                               write_unlock(&map->lock);
5982                                 free_extent_map(em);
5983                                 break;
5984                         }
5985                         if (!test_range_bit(tree, em->start,
5986                                             extent_map_end(em) - 1,
5987 -                                           EXTENT_LOCKED | EXTENT_WRITEBACK |
5988 -                                           EXTENT_ORDERED,
5989 -                                           0)) {
5990 +                                           EXTENT_LOCKED | EXTENT_WRITEBACK,
5991 +                                           0, NULL)) {
5992                                 remove_extent_mapping(map, em);
5993                                 /* once for the rb tree */
5994                                 free_extent_map(em);
5995                         }
5996                         start = extent_map_end(em);
5997 -                       spin_unlock(&map->lock);
5998 +                       write_unlock(&map->lock);
5999  
6000                         /* once for us */
6001                         free_extent_map(em);
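Switching the map lock from a spinlock to a rwlock lets concurrent lookups proceed in parallel while insert and remove still serialize. A userspace sketch of the same split, assuming POSIX threads, with a counter standing in for the rbtree of extent_maps:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
static int nr_extents;          /* stand-in for the rbtree contents */

static int lookup_extent(void)
{
        int n;

        pthread_rwlock_rdlock(&map_lock);       /* many readers at once */
        n = nr_extents;
        pthread_rwlock_unlock(&map_lock);
        return n;
}

static void add_extent(void)
{
        pthread_rwlock_wrlock(&map_lock);       /* writers stay exclusive */
        nr_extents++;
        pthread_rwlock_unlock(&map_lock);
}

int main(void)
{
        add_extent();
        printf("extents: %d\n", lookup_extent());
        return 0;
}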
6002 @@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
6003         int uptodate;
6004         unsigned long index;
6005  
6006 -       ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
6007 +       ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
6008         if (ret)
6009                 return 1;
6010         while (start <= end) {
6011 @@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
6012                 return 1;
6013  
6014         ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
6015 -                          EXTENT_UPTODATE, 1);
6016 +                          EXTENT_UPTODATE, 1, NULL);
6017         if (ret)
6018                 return ret;
6019  
6020 @@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
6021                 return 0;
6022  
6023         if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
6024 -                          EXTENT_UPTODATE, 1)) {
6025 +                          EXTENT_UPTODATE, 1, NULL)) {
6026                 return 0;
6027         }
6028  
6029 diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
6030 index 5bc20ab..36de250 100644
6031 --- a/fs/btrfs/extent_io.h
6032 +++ b/fs/btrfs/extent_io.h
6033 @@ -13,10 +13,9 @@
6034  #define EXTENT_DEFRAG (1 << 6)
6035  #define EXTENT_DEFRAG_DONE (1 << 7)
6036  #define EXTENT_BUFFER_FILLED (1 << 8)
6037 -#define EXTENT_ORDERED (1 << 9)
6038 -#define EXTENT_ORDERED_METADATA (1 << 10)
6039 -#define EXTENT_BOUNDARY (1 << 11)
6040 -#define EXTENT_NODATASUM (1 << 12)
6041 +#define EXTENT_BOUNDARY (1 << 9)
6042 +#define EXTENT_NODATASUM (1 << 10)
6043 +#define EXTENT_DO_ACCOUNTING (1 << 11)
6044  #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
6045  
6046  /* flags for bio submission */
6047 @@ -27,6 +26,16 @@
6048  #define EXTENT_BUFFER_BLOCKING 1
6049  #define EXTENT_BUFFER_DIRTY 2
6050  
6051 +/* these are flags for extent_clear_unlock_delalloc */
6052 +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
6053 +#define EXTENT_CLEAR_UNLOCK     0x2
6054 +#define EXTENT_CLEAR_DELALLOC   0x4
6055 +#define EXTENT_CLEAR_DIRTY      0x8
6056 +#define EXTENT_SET_WRITEBACK    0x10
6057 +#define EXTENT_END_WRITEBACK    0x20
6058 +#define EXTENT_SET_PRIVATE2     0x40
6059 +#define EXTENT_CLEAR_ACCOUNTING  0x80
6060 +
6061  /*
6062   * page->private values.  Every page that is controlled by the extent
6063   * map has page->private set to one.
6064 @@ -62,8 +71,13 @@ struct extent_io_ops {
6065                                       struct extent_state *state, int uptodate);
6066         int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
6067                             unsigned long old, unsigned long bits);
6068 -       int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
6069 -                           unsigned long old, unsigned long bits);
6070 +       int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
6071 +                             unsigned long bits);
6072 +       int (*merge_extent_hook)(struct inode *inode,
6073 +                                struct extent_state *new,
6074 +                                struct extent_state *other);
6075 +       int (*split_extent_hook)(struct inode *inode,
6076 +                                struct extent_state *orig, u64 split);
6077         int (*write_cache_pages_lock_hook)(struct page *page);
6078  };
6079  
6080 @@ -81,10 +95,14 @@ struct extent_state {
6081         u64 start;
6082         u64 end; /* inclusive */
6083         struct rb_node rb_node;
6084 +
6085 +       /* ADD NEW ELEMENTS AFTER THIS */
6086         struct extent_io_tree *tree;
6087         wait_queue_head_t wq;
6088         atomic_t refs;
6089         unsigned long state;
6090 +       u64 split_start;
6091 +       u64 split_end;
6092  
6093         /* for use by the FS */
6094         u64 private;
6095 @@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map,
6096                              struct extent_io_tree *tree, struct page *page,
6097                              gfp_t mask);
6098  int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
6099 +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
6100 +                    int bits, struct extent_state **cached, gfp_t mask);
6101  int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
6102  int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
6103                     gfp_t mask);
6104 @@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
6105                      u64 max_bytes, unsigned long bits);
6106  
6107  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
6108 -                  int bits, int filled);
6109 +                  int bits, int filled, struct extent_state *cached_state);
6110  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
6111                       int bits, gfp_t mask);
6112  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
6113 -                    int bits, int wake, int delete, gfp_t mask);
6114 +                    int bits, int wake, int delete, struct extent_state **cached,
6115 +                    gfp_t mask);
6116  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
6117                     int bits, gfp_t mask);
6118  int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
6119 @@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
6120  int extent_clear_unlock_delalloc(struct inode *inode,
6121                                 struct extent_io_tree *tree,
6122                                 u64 start, u64 end, struct page *locked_page,
6123 -                               int unlock_page,
6124 -                               int clear_unlock,
6125 -                               int clear_delalloc, int clear_dirty,
6126 -                               int set_writeback,
6127 -                               int end_writeback);
6128 +                               unsigned long op);
6129  #endif
6130 diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
6131 index 30c9365..2c726b7 100644
6132 --- a/fs/btrfs/extent_map.c
6133 +++ b/fs/btrfs/extent_map.c
6134 @@ -36,7 +36,7 @@ void extent_map_exit(void)
6135  void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
6136  {
6137         tree->map.rb_node = NULL;
6138 -       spin_lock_init(&tree->lock);
6139 +       rwlock_init(&tree->lock);
6140  }
6141  
6142  /**
6143 @@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
6144         return 0;
6145  }
6146  
6147 +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
6148 +{
6149 +       int ret = 0;
6150 +       struct extent_map *merge = NULL;
6151 +       struct rb_node *rb;
6152 +       struct extent_map *em;
6153 +
6154 +       write_lock(&tree->lock);
6155 +       em = lookup_extent_mapping(tree, start, len);
6156 +
6157 +       WARN_ON(!em || em->start != start);
6158 +
6159 +       if (!em)
6160 +               goto out;
6161 +
6162 +       clear_bit(EXTENT_FLAG_PINNED, &em->flags);
6163 +
6164 +       if (em->start != 0) {
6165 +               rb = rb_prev(&em->rb_node);
6166 +               if (rb)
6167 +                       merge = rb_entry(rb, struct extent_map, rb_node);
6168 +               if (rb && mergable_maps(merge, em)) {
6169 +                       em->start = merge->start;
6170 +                       em->len += merge->len;
6171 +                       em->block_len += merge->block_len;
6172 +                       em->block_start = merge->block_start;
6173 +                       merge->in_tree = 0;
6174 +                       rb_erase(&merge->rb_node, &tree->map);
6175 +                       free_extent_map(merge);
6176 +               }
6177 +       }
6178 +
6179 +       rb = rb_next(&em->rb_node);
6180 +       if (rb)
6181 +               merge = rb_entry(rb, struct extent_map, rb_node);
6182 +       if (rb && mergable_maps(em, merge)) {
6183 +               em->len += merge->len;
6184 +               em->block_len += merge->block_len;
6185 +               rb_erase(&merge->rb_node, &tree->map);
6186 +               merge->in_tree = 0;
6187 +               free_extent_map(merge);
6188 +       }
6189 +
6190 +       free_extent_map(em);
6191 +out:
6192 +       write_unlock(&tree->lock);
6193 +       return ret;
6194 +
6196 +
6197  /**
6198   * add_extent_mapping - add new extent map to the extent tree
6199   * @tree:      tree to insert new map in
6200 @@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
6201                 ret = -EEXIST;
6202                 goto out;
6203         }
6204 -       assert_spin_locked(&tree->lock);
6205         rb = tree_insert(&tree->map, em->start, &em->rb_node);
6206         if (rb) {
6207                 ret = -EEXIST;
6208 @@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
6209         struct rb_node *next = NULL;
6210         u64 end = range_end(start, len);
6211  
6212 -       assert_spin_locked(&tree->lock);
6213         rb_node = __tree_search(&tree->map, start, &prev, &next);
6214         if (!rb_node && prev) {
6215                 em = rb_entry(prev, struct extent_map, rb_node);
6216 @@ -319,6 +367,54 @@ out:
6217  }
6218  
6219  /**
6220 + * search_extent_mapping - find a nearby extent map
6221 + * @tree:      tree to lookup in
6222 + * @start:     byte offset to start the search
6223 + * @len:       length of the lookup range
6224 + *
6225 + * Find and return the first extent_map struct in @tree that intersects the
6226 + * [start, start + len) range.
6227 + *
6228 + * If one can't be found, any nearby extent may be returned
6229 + */
6230 +struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
6231 +                                        u64 start, u64 len)
6232 +{
6233 +       struct extent_map *em;
6234 +       struct rb_node *rb_node;
6235 +       struct rb_node *prev = NULL;
6236 +       struct rb_node *next = NULL;
6237 +
6238 +       rb_node = __tree_search(&tree->map, start, &prev, &next);
6239 +       if (!rb_node && prev) {
6240 +               em = rb_entry(prev, struct extent_map, rb_node);
6241 +               goto found;
6242 +       }
6243 +       if (!rb_node && next) {
6244 +               em = rb_entry(next, struct extent_map, rb_node);
6245 +               goto found;
6246 +       }
6247 +       if (!rb_node) {
6248 +               em = NULL;
6249 +               goto out;
6250 +       }
6251 +       if (IS_ERR(rb_node)) {
6252 +               em = ERR_PTR(PTR_ERR(rb_node));
6253 +               goto out;
6254 +       }
6255 +       em = rb_entry(rb_node, struct extent_map, rb_node);
6256 +       goto found;
6257 +
6258 +       em = NULL;
6259 +       goto out;
6260 +
6261 +found:
6262 +       atomic_inc(&em->refs);
6263 +out:
6264 +       return em;
6265 +}
6266 +
6267 +/**
6268   * remove_extent_mapping - removes an extent_map from the extent tree
6269   * @tree:      extent tree to remove from
6270   * @em:                extent map beeing removed
6271 @@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
6272         int ret = 0;
6273  
6274         WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
6275 -       assert_spin_locked(&tree->lock);
6276         rb_erase(&em->rb_node, &tree->map);
6277         em->in_tree = 0;
6278         return ret;
6279 diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
6280 index fb6eeef..ab6d74b 100644
6281 --- a/fs/btrfs/extent_map.h
6282 +++ b/fs/btrfs/extent_map.h
6283 @@ -31,7 +31,7 @@ struct extent_map {
6284  
6285  struct extent_map_tree {
6286         struct rb_root map;
6287 -       spinlock_t lock;
6288 +       rwlock_t lock;
6289  };
6290  
6291  static inline u64 extent_map_end(struct extent_map *em)
6292 @@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
6293  void free_extent_map(struct extent_map *em);
6294  int __init extent_map_init(void);
6295  void extent_map_exit(void);
6296 +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
6297 +struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
6298 +                                        u64 start, u64 len);
6299  #endif
6300 diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
6301 index 4b83397..4599113 100644
6302 --- a/fs/btrfs/file.c
6303 +++ b/fs/btrfs/file.c
6304 @@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
6305         int err = 0;
6306         int i;
6307         struct inode *inode = fdentry(file)->d_inode;
6308 -       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6309 -       u64 hint_byte;
6310         u64 num_bytes;
6311         u64 start_pos;
6312         u64 end_of_last_block;
6313 @@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
6314                     root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
6315  
6316         end_of_last_block = start_pos + num_bytes - 1;
6317 +       err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
6318 +       if (err)
6319 +               return err;
6320  
6321 -       lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
6322 -       trans = btrfs_join_transaction(root, 1);
6323 -       if (!trans) {
6324 -               err = -ENOMEM;
6325 -               goto out_unlock;
6326 -       }
6327 -       btrfs_set_trans_block_group(trans, inode);
6328 -       hint_byte = 0;
6329 -
6330 -       set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
6331 -
6332 -       /* check for reserved extents on each page, we don't want
6333 -        * to reset the delalloc bit on things that already have
6334 -        * extents reserved.
6335 -        */
6336 -       btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
6337         for (i = 0; i < num_pages; i++) {
6338                 struct page *p = pages[i];
6339                 SetPageUptodate(p);
6340 @@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
6341                  * at this time.
6342                  */
6343         }
6344 -       err = btrfs_end_transaction(trans, root);
6345 -out_unlock:
6346 -       unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
6347         return err;
6348  }
6349  
6350 @@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6351                 if (!split2)
6352                         split2 = alloc_extent_map(GFP_NOFS);
6353  
6354 -               spin_lock(&em_tree->lock);
6355 +               write_lock(&em_tree->lock);
6356                 em = lookup_extent_mapping(em_tree, start, len);
6357                 if (!em) {
6358 -                       spin_unlock(&em_tree->lock);
6359 +                       write_unlock(&em_tree->lock);
6360                         break;
6361                 }
6362                 flags = em->flags;
6363                 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
6364 -                       spin_unlock(&em_tree->lock);
6365                         if (em->start <= start &&
6366                             (!testend || em->start + em->len >= start + len)) {
6367                                 free_extent_map(em);
6368 +                               write_unlock(&em_tree->lock);
6369                                 break;
6370                         }
6371                         if (start < em->start) {
6372 @@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6373                                 start = em->start + em->len;
6374                         }
6375                         free_extent_map(em);
6376 +                       write_unlock(&em_tree->lock);
6377                         continue;
6378                 }
6379                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6380 @@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6381                         free_extent_map(split);
6382                         split = NULL;
6383                 }
6384 -               spin_unlock(&em_tree->lock);
6385 +               write_unlock(&em_tree->lock);
6386  
6387                 /* once for us */
6388                 free_extent_map(em);
6389 @@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
6390  noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
6391                        struct btrfs_root *root, struct inode *inode,
6392                        u64 start, u64 end, u64 locked_end,
6393 -                      u64 inline_limit, u64 *hint_byte)
6394 +                      u64 inline_limit, u64 *hint_byte, int drop_cache)
6395  {
6396         u64 extent_end = 0;
6397         u64 search_start = start;
6398 @@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
6399         int ret;
6400  
6401         inline_limit = 0;
6402 -       btrfs_drop_extent_cache(inode, start, end - 1, 0);
6403 +       if (drop_cache)
6404 +               btrfs_drop_extent_cache(inode, start, end - 1, 0);
6405  
6406         path = btrfs_alloc_path();
6407         if (!path)
6408 @@ -894,7 +878,8 @@ again:
6409                         btrfs_put_ordered_extent(ordered);
6410  
6411                 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
6412 -                                 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
6413 +                                 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
6414 +                                 EXTENT_DO_ACCOUNTING,
6415                                   GFP_NOFS);
6416                 unlock_extent(&BTRFS_I(inode)->io_tree,
6417                               start_pos, last_pos - 1, GFP_NOFS);
6418 @@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
6419         start_pos = pos;
6420  
6421         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
6422 +
6423 +       /* do the reserve before the mutex lock in case we have to do some
6424 +        * flushing.  We wouldn't deadlock, but this is more polite.
6425 +        */
6426 +       err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
6427 +       if (err)
6428 +               goto out_nolock;
6429 +
6430 +       mutex_lock(&inode->i_mutex);
6431 +
6432         current->backing_dev_info = inode->i_mapping->backing_dev_info;
6433         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
6434         if (err)
6435 -               goto out_nolock;
6436 +               goto out;
6437 +
6438         if (count == 0)
6439 -               goto out_nolock;
6440 +               goto out;
6441  
6442         err = file_remove_suid(file);
6443         if (err)
6444 -               goto out_nolock;
6445 +               goto out;
6446 +
6447         file_update_time(file);
6448  
6449         pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
6450  
6451 -       mutex_lock(&inode->i_mutex);
6452 +       /* generic_write_checks can change our pos */
6453 +       start_pos = pos;
6454 +
6455         BTRFS_I(inode)->sequence++;
6456         first_index = pos >> PAGE_CACHE_SHIFT;
6457         last_index = (pos + count) >> PAGE_CACHE_SHIFT;
6458 @@ -1047,6 +1046,7 @@ out:
6459         mutex_unlock(&inode->i_mutex);
6460         if (ret)
6461                 err = ret;
6462 +       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
6463  
6464  out_nolock:
6465         kfree(pages);
6466 @@ -1087,8 +1087,10 @@ out_nolock:
6467                                         btrfs_end_transaction(trans, root);
6468                                 else
6469                                         btrfs_commit_transaction(trans, root);
6470 -                       } else {
6471 +                       } else if (ret != BTRFS_NO_LOG_SYNC) {
6472                                 btrfs_commit_transaction(trans, root);
6473 +                       } else {
6474 +                               btrfs_end_transaction(trans, root);
6475                         }
6476                 }
6477                 if (file->f_flags & O_DIRECT) {
6478 @@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6479         int ret = 0;
6480         struct btrfs_trans_handle *trans;
6481  
6482 +
6483 +       /* we wait first, since the writeback may change the inode */
6484 +       root->log_batch++;
6485 +       /* the VFS called filemap_fdatawrite for us */
6486 +       btrfs_wait_ordered_range(inode, 0, (u64)-1);
6487 +       root->log_batch++;
6488 +
6489         /*
6490          * check the transaction that last modified this inode
6491          * and see if its already been committed
6492 @@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6493         if (!BTRFS_I(inode)->last_trans)
6494                 goto out;
6495  
6496 +       /*
6497 +        * if the last transaction that changed this file was before
6498 +        * the current transaction, we can bail out now without any
6499 +        * syncing
6500 +        */
6501         mutex_lock(&root->fs_info->trans_mutex);
6502         if (BTRFS_I(inode)->last_trans <=
6503             root->fs_info->last_trans_committed) {
6504 @@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6505         }
6506         mutex_unlock(&root->fs_info->trans_mutex);
6507  
6508 -       root->log_batch++;
6509 -       filemap_fdatawrite(inode->i_mapping);
6510 -       btrfs_wait_ordered_range(inode, 0, (u64)-1);
6511 -       root->log_batch++;
6512 -
6513 -       if (datasync && !(inode->i_state & I_DIRTY_PAGES))
6514 -               goto out;
6515         /*
6516          * ok we haven't committed the transaction yet, lets do a commit
6517          */
6518 @@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
6519          */
6520         mutex_unlock(&dentry->d_inode->i_mutex);
6521  
6522 -       if (ret > 0) {
6523 -               ret = btrfs_commit_transaction(trans, root);
6524 -       } else {
6525 -               ret = btrfs_sync_log(trans, root);
6526 -               if (ret == 0)
6527 -                       ret = btrfs_end_transaction(trans, root);
6528 -               else
6529 +       if (ret != BTRFS_NO_LOG_SYNC) {
6530 +               if (ret > 0) {
6531                         ret = btrfs_commit_transaction(trans, root);
6532 +               } else {
6533 +                       ret = btrfs_sync_log(trans, root);
6534 +                       if (ret == 0)
6535 +                               ret = btrfs_end_transaction(trans, root);
6536 +                       else
6537 +                               ret = btrfs_commit_transaction(trans, root);
6538 +               }
6539 +       } else {
6540 +               ret = btrfs_end_transaction(trans, root);
6541         }
6542         mutex_lock(&dentry->d_inode->i_mutex);
6543  out:
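
The log-sync decision above has three outcomes: BTRFS_NO_LOG_SYNC means the tree-log is not used for this root and ending the transaction is enough, a positive return means the log cannot be used and a full commit is needed, and otherwise the log itself is synced with a commit as the fallback on failure. A user-space sketch of just that decision; the sentinel value and helper names are stand-ins for the real trans-handle code:

#include <stdio.h>

#define BTRFS_NO_LOG_SYNC 256   /* assumed sentinel value, stand-in */

static int commit_transaction(void) { return 0; }  /* full commit */
static int end_transaction(void)    { return 0; }  /* detach only */
static int sync_log(void)           { return 0; }  /* write the tree-log */

static int finish_fsync(int log_ret)
{
        if (log_ret == BTRFS_NO_LOG_SYNC)       /* logging disabled */
                return end_transaction();
        if (log_ret > 0)                        /* log unusable: commit all */
                return commit_transaction();
        if (sync_log() == 0)                    /* cheap path: log synced */
                return end_transaction();
        return commit_transaction();            /* log failed: fall back */
}

int main(void)
{
        printf("%d\n", finish_fsync(0));
        return 0;
}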
6544 diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
6545 index 5edcee3..5c2caad 100644
6546 --- a/fs/btrfs/free-space-cache.c
6547 +++ b/fs/btrfs/free-space-cache.c
6548 @@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
6549  
6550  static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
6551  {
6552 -       u64 max_bytes, possible_bytes;
6553 +       u64 max_bytes;
6554 +       u64 bitmap_bytes;
6555 +       u64 extent_bytes;
6556  
6557         /*
6558          * The goal is to keep the total amount of memory used per 1gb of space
6559 @@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
6560         max_bytes = MAX_CACHE_BYTES_PER_GIG *
6561                 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
6562  
6563 -       possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
6564 -               (sizeof(struct btrfs_free_space) *
6565 -                block_group->extents_thresh);
6566 +       /*
6567 +        * we want to account for 1 more bitmap than what we have so we can make
6568 +        * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
6569 +        * we add more bitmaps.
6570 +        */
6571 +       bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
6572  
6573 -       if (possible_bytes > max_bytes) {
6574 -               int extent_bytes = max_bytes -
6575 -                       (block_group->total_bitmaps * PAGE_CACHE_SIZE);
6576 +       if (bitmap_bytes >= max_bytes) {
6577 +               block_group->extents_thresh = 0;
6578 +               return;
6579 +       }
6580  
6581 -               if (extent_bytes <= 0) {
6582 -                       block_group->extents_thresh = 0;
6583 -                       return;
6584 -               }
6585 +       /*
6586 +        * we want the extent entry threshold to be at most half of max_bytes,
6587 +        * or whatever is left over after the bitmaps, whichever is less.
6588 +        */
6589 +       extent_bytes = max_bytes - bitmap_bytes;
6590 +       extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
6591  
6592 -               block_group->extents_thresh = extent_bytes /
6593 -                       (sizeof(struct btrfs_free_space));
6594 -       }
6595 +       block_group->extents_thresh =
6596 +               div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
6597  }
6598  
6599  static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
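
The rewritten math above is: the budget max_bytes scales with block group size, one page is pre-charged per existing bitmap plus one future bitmap, and whatever remains, capped at half the budget, is divided by the per-entry size to get extents_thresh. A worked user-space example; the budget, page size, and entry size are assumed values, not necessarily the kernel's:

#include <stdint.h>
#include <stdio.h>

#define MAX_CACHE_BYTES_PER_GIG (32ULL * 1024)  /* assumed budget per GiB */
#define PAGE_SIZE 4096ULL
#define FREE_SPACE_ENTRY_SIZE 32ULL             /* assumed sizeof(entry)  */

static uint64_t extents_thresh(uint64_t group_bytes, uint64_t total_bitmaps)
{
        uint64_t max_bytes, bitmap_bytes, extent_bytes;

        max_bytes = MAX_CACHE_BYTES_PER_GIG *
                (group_bytes / (1024ULL * 1024 * 1024));

        /* charge for one more bitmap than we have now */
        bitmap_bytes = (total_bitmaps + 1) * PAGE_SIZE;
        if (bitmap_bytes >= max_bytes)
                return 0;

        /* cap extent entries at half the budget */
        extent_bytes = max_bytes - bitmap_bytes;
        if (extent_bytes > max_bytes / 2)
                extent_bytes = max_bytes / 2;

        return extent_bytes / FREE_SPACE_ENTRY_SIZE;
}

int main(void)
{
        /* 1GiB group, no bitmaps yet: budget 32K, bitmaps pre-charge 4K,
         * the half-budget cap gives 16K -> 16K / 32 = 512 extent entries */
        printf("%llu\n", (unsigned long long)
               extents_thresh(1024ULL * 1024 * 1024, 0));
        return 0;
}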
6600 @@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
6601         BUG_ON(block_group->total_bitmaps >= max_bitmaps);
6602  
6603         info->offset = offset_to_bitmap(block_group, offset);
6604 +       info->bytes = 0;
6605         link_free_space(block_group, info);
6606         block_group->total_bitmaps++;
6607  
6608 diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
6609 index 6b627c6..72ce3c1 100644
6610 --- a/fs/btrfs/inode-item.c
6611 +++ b/fs/btrfs/inode-item.c
6612 @@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
6613                 ptr = (unsigned long)(ref + 1);
6614                 ret = 0;
6615         } else if (ret < 0) {
6616 +               if (ret == -EOVERFLOW)
6617 +                       ret = -EMLINK;
6618                 goto out;
6619         } else {
6620                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
6621 @@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
6622  
6623         ret = btrfs_insert_empty_item(trans, root, path, &key,
6624                                       sizeof(struct btrfs_inode_item));
6625 -       if (ret == 0 && objectid > root->highest_inode)
6626 -               root->highest_inode = objectid;
6627         return ret;
6628  }
6629  
6630 diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
6631 index 9abbced..c56eb59 100644
6632 --- a/fs/btrfs/inode-map.c
6633 +++ b/fs/btrfs/inode-map.c
6634 @@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
6635                 slot = path->slots[0] - 1;
6636                 l = path->nodes[0];
6637                 btrfs_item_key_to_cpu(l, &found_key, slot);
6638 -               *objectid = found_key.objectid;
6639 +               *objectid = max_t(u64, found_key.objectid,
6640 +                                 BTRFS_FIRST_FREE_OBJECTID - 1);
6641         } else {
6642 -               *objectid = BTRFS_FIRST_FREE_OBJECTID;
6643 +               *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
6644         }
6645         ret = 0;
6646  error:
6647 @@ -53,91 +54,27 @@ error:
6648         return ret;
6649  }
6650  
6651 -/*
6652 - * walks the btree of allocated inodes and find a hole.
6653 - */
6654  int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
6655                              struct btrfs_root *root,
6656                              u64 dirid, u64 *objectid)
6657  {
6658 -       struct btrfs_path *path;
6659 -       struct btrfs_key key;
6660         int ret;
6661 -       int slot = 0;
6662 -       u64 last_ino = 0;
6663 -       int start_found;
6664 -       struct extent_buffer *l;
6665 -       struct btrfs_key search_key;
6666 -       u64 search_start = dirid;
6667 -
6668         mutex_lock(&root->objectid_mutex);
6669 -       if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
6670 -           root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
6671 -               *objectid = ++root->last_inode_alloc;
6672 -               mutex_unlock(&root->objectid_mutex);
6673 -               return 0;
6674 -       }
6675 -       path = btrfs_alloc_path();
6676 -       BUG_ON(!path);
6677 -       search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
6678 -       search_key.objectid = search_start;
6679 -       search_key.type = 0;
6680 -       search_key.offset = 0;
6681 -
6682 -       start_found = 0;
6683 -       ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
6684 -       if (ret < 0)
6685 -               goto error;
6686  
6687 -       while (1) {
6688 -               l = path->nodes[0];
6689 -               slot = path->slots[0];
6690 -               if (slot >= btrfs_header_nritems(l)) {
6691 -                       ret = btrfs_next_leaf(root, path);
6692 -                       if (ret == 0)
6693 -                               continue;
6694 -                       if (ret < 0)
6695 -                               goto error;
6696 -                       if (!start_found) {
6697 -                               *objectid = search_start;
6698 -                               start_found = 1;
6699 -                               goto found;
6700 -                       }
6701 -                       *objectid = last_ino > search_start ?
6702 -                               last_ino : search_start;
6703 -                       goto found;
6704 -               }
6705 -               btrfs_item_key_to_cpu(l, &key, slot);
6706 -               if (key.objectid >= search_start) {
6707 -                       if (start_found) {
6708 -                               if (last_ino < search_start)
6709 -                                       last_ino = search_start;
6710 -                               if (key.objectid > last_ino) {
6711 -                                       *objectid = last_ino;
6712 -                                       goto found;
6713 -                               }
6714 -                       } else if (key.objectid > search_start) {
6715 -                               *objectid = search_start;
6716 -                               goto found;
6717 -                       }
6718 -               }
6719 -               if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
6720 -                       break;
6721 +       if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
6722 +               ret = btrfs_find_highest_inode(root, &root->highest_objectid);
6723 +               if (ret)
6724 +                       goto out;
6725 +       }
6726  
6727 -               start_found = 1;
6728 -               last_ino = key.objectid + 1;
6729 -               path->slots[0]++;
6730 +       if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
6731 +               ret = -ENOSPC;
6732 +               goto out;
6733         }
6734 -       BUG_ON(1);
6735 -found:
6736 -       btrfs_release_path(root, path);
6737 -       btrfs_free_path(path);
6738 -       BUG_ON(*objectid < search_start);
6739 -       mutex_unlock(&root->objectid_mutex);
6740 -       return 0;
6741 -error:
6742 -       btrfs_release_path(root, path);
6743 -       btrfs_free_path(path);
6744 +
6745 +       *objectid = ++root->highest_objectid;
6746 +       ret = 0;
6747 +out:
6748         mutex_unlock(&root->objectid_mutex);
6749         return ret;
6750  }
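
The rewrite above replaces the btree hole-walk with a cached counter: root->highest_objectid is lazily seeded from btrfs_find_highest_inode (whose result is clamped to BTRFS_FIRST_FREE_OBJECTID - 1 so the first increment lands on BTRFS_FIRST_FREE_OBJECTID), and every allocation is then a mutex-protected increment. A minimal user-space sketch of the same allocator, with assumed constant values:

#include <pthread.h>
#include <stdint.h>
#include <errno.h>

#define FIRST_FREE_OBJECTID 256ULL              /* assumed value */
#define LAST_FREE_OBJECTID  ((uint64_t)-256)    /* assumed upper bound */

struct root {
        pthread_mutex_t objectid_mutex;
        uint64_t highest_objectid;              /* 0 until first use */
};

/* stand-in for btrfs_find_highest_inode: scan disk for the largest
 * objectid in use, clamped so the next ++ yields FIRST_FREE_OBJECTID */
static int find_highest_inode(struct root *r, uint64_t *objectid)
{
        *objectid = FIRST_FREE_OBJECTID - 1;
        return 0;
}

static int find_free_objectid(struct root *r, uint64_t *objectid)
{
        int ret = 0;

        pthread_mutex_lock(&r->objectid_mutex);
        if (r->highest_objectid < FIRST_FREE_OBJECTID) {
                ret = find_highest_inode(r, &r->highest_objectid);
                if (ret)
                        goto out;
        }
        if (r->highest_objectid >= LAST_FREE_OBJECTID) {
                ret = -ENOSPC;
                goto out;
        }
        *objectid = ++r->highest_objectid;
out:
        pthread_mutex_unlock(&r->objectid_mutex);
        return ret;
}

int main(void)
{
        struct root r = { PTHREAD_MUTEX_INITIALIZER, 0 };
        uint64_t objectid;

        return find_free_objectid(&r, &objectid);   /* objectid == 256 */
}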
6751 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
6752 index 59cba18..f69e5e0 100644
6753 --- a/fs/btrfs/inode.c
6754 +++ b/fs/btrfs/inode.c
6755 @@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
6756         }
6757  
6758         ret = btrfs_drop_extents(trans, root, inode, start,
6759 -                                aligned_end, aligned_end, start, &hint_byte);
6760 +                                aligned_end, aligned_end, start,
6761 +                                &hint_byte, 1);
6762         BUG_ON(ret);
6763  
6764         if (isize > actual_end)
6765 @@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
6766                                    inline_len, compressed_size,
6767                                    compressed_pages);
6768         BUG_ON(ret);
6769 -       btrfs_drop_extent_cache(inode, start, aligned_end, 0);
6770 +       btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
6771         return 0;
6772  }
6773  
6774 @@ -423,9 +424,12 @@ again:
6775                          * and free up our temp pages.
6776                          */
6777                         extent_clear_unlock_delalloc(inode,
6778 -                                                    &BTRFS_I(inode)->io_tree,
6779 -                                                    start, end, NULL, 1, 0,
6780 -                                                    0, 1, 1, 1);
6781 +                            &BTRFS_I(inode)->io_tree,
6782 +                            start, end, NULL,
6783 +                            EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
6784 +                            EXTENT_CLEAR_DELALLOC |
6785 +                            EXTENT_CLEAR_ACCOUNTING |
6786 +                            EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
6787                         ret = 0;
6788                         goto free_pages_out;
6789                 }
6790 @@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
6791                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6792  
6793                 while (1) {
6794 -                       spin_lock(&em_tree->lock);
6795 +                       write_lock(&em_tree->lock);
6796                         ret = add_extent_mapping(em_tree, em);
6797 -                       spin_unlock(&em_tree->lock);
6798 +                       write_unlock(&em_tree->lock);
6799                         if (ret != -EEXIST) {
6800                                 free_extent_map(em);
6801                                 break;
6802 @@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
6803                  * clear dirty, set writeback and unlock the pages.
6804                  */
6805                 extent_clear_unlock_delalloc(inode,
6806 -                                            &BTRFS_I(inode)->io_tree,
6807 -                                            async_extent->start,
6808 -                                            async_extent->start +
6809 -                                            async_extent->ram_size - 1,
6810 -                                            NULL, 1, 1, 0, 1, 1, 0);
6811 +                               &BTRFS_I(inode)->io_tree,
6812 +                               async_extent->start,
6813 +                               async_extent->start +
6814 +                               async_extent->ram_size - 1,
6815 +                               NULL, EXTENT_CLEAR_UNLOCK_PAGE |
6816 +                               EXTENT_CLEAR_UNLOCK |
6817 +                               EXTENT_CLEAR_DELALLOC |
6818 +                               EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
6819  
6820                 ret = btrfs_submit_compressed_write(inode,
6821                                     async_extent->start,
6822 @@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode,
6823                                             start, end, 0, NULL);
6824                 if (ret == 0) {
6825                         extent_clear_unlock_delalloc(inode,
6826 -                                                    &BTRFS_I(inode)->io_tree,
6827 -                                                    start, end, NULL, 1, 1,
6828 -                                                    1, 1, 1, 1);
6829 +                                    &BTRFS_I(inode)->io_tree,
6830 +                                    start, end, NULL,
6831 +                                    EXTENT_CLEAR_UNLOCK_PAGE |
6832 +                                    EXTENT_CLEAR_UNLOCK |
6833 +                                    EXTENT_CLEAR_DELALLOC |
6834 +                                    EXTENT_CLEAR_ACCOUNTING |
6835 +                                    EXTENT_CLEAR_DIRTY |
6836 +                                    EXTENT_SET_WRITEBACK |
6837 +                                    EXTENT_END_WRITEBACK);
6838                         *nr_written = *nr_written +
6839                              (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
6840                         *page_started = 1;
6841 @@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode,
6842         BUG_ON(disk_num_bytes >
6843                btrfs_super_total_bytes(&root->fs_info->super_copy));
6844  
6845 +
6846 +       read_lock(&BTRFS_I(inode)->extent_tree.lock);
6847 +       em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
6848 +                                  start, num_bytes);
6849 +       if (em) {
6850 +               alloc_hint = em->block_start;
6851 +               free_extent_map(em);
6852 +       }
6853 +       read_unlock(&BTRFS_I(inode)->extent_tree.lock);
6854         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
6855  
6856         while (disk_num_bytes > 0) {
6857 +               unsigned long op;
6858 +
6859                 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
6860                 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
6861                                            root->sectorsize, 0, alloc_hint,
6862 @@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode,
6863                 em = alloc_extent_map(GFP_NOFS);
6864                 em->start = start;
6865                 em->orig_start = em->start;
6866 -
6867                 ram_size = ins.offset;
6868                 em->len = ins.offset;
6869  
6870 @@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode,
6871                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
6872  
6873                 while (1) {
6874 -                       spin_lock(&em_tree->lock);
6875 +                       write_lock(&em_tree->lock);
6876                         ret = add_extent_mapping(em_tree, em);
6877 -                       spin_unlock(&em_tree->lock);
6878 +                       write_unlock(&em_tree->lock);
6879                         if (ret != -EEXIST) {
6880                                 free_extent_map(em);
6881                                 break;
6882 @@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode,
6883                 /* we're not doing compressed IO, don't unlock the first
6884                  * page (which the caller expects to stay locked), don't
6885                  * clear any dirty bits and don't set any writeback bits
6886 +                *
6887 +                * Do set the Private2 bit so we know this page was properly
6888 +                * set up for writepage
6889                  */
6890 +               op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
6891 +               op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
6892 +                       EXTENT_SET_PRIVATE2;
6893 +
6894                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
6895                                              start, start + ram_size - 1,
6896 -                                            locked_page, unlock, 1,
6897 -                                            1, 0, 0, 0);
6898 +                                            locked_page, op);
6899                 disk_num_bytes -= cur_alloc_size;
6900                 num_bytes -= cur_alloc_size;
6901                 alloc_hint = ins.objectid + ins.offset;
6902 @@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
6903         u64 cur_end;
6904         int limit = 10 * 1024 * 1024;
6905  
6906 -       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
6907 -                        EXTENT_DELALLOC, 1, 0, GFP_NOFS);
6908 +       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
6909 +                        1, 0, NULL, GFP_NOFS);
6910         while (start < end) {
6911                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
6912                 async_cow->inode = inode;
6913 @@ -994,6 +1023,7 @@ next_slot:
6914  
6915                 if (found_key.offset > cur_offset) {
6916                         extent_end = found_key.offset;
6917 +                       extent_type = 0;
6918                         goto out_check;
6919                 }
6920  
6921 @@ -1080,9 +1110,9 @@ out_check:
6922                         em->bdev = root->fs_info->fs_devices->latest_bdev;
6923                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
6924                         while (1) {
6925 -                               spin_lock(&em_tree->lock);
6926 +                               write_lock(&em_tree->lock);
6927                                 ret = add_extent_mapping(em_tree, em);
6928 -                               spin_unlock(&em_tree->lock);
6929 +                               write_unlock(&em_tree->lock);
6930                                 if (ret != -EEXIST) {
6931                                         free_extent_map(em);
6932                                         break;
6933 @@ -1100,8 +1130,10 @@ out_check:
6934                 BUG_ON(ret);
6935  
6936                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
6937 -                                       cur_offset, cur_offset + num_bytes - 1,
6938 -                                       locked_page, 1, 1, 1, 0, 0, 0);
6939 +                               cur_offset, cur_offset + num_bytes - 1,
6940 +                               locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
6941 +                               EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
6942 +                               EXTENT_SET_PRIVATE2);
6943                 cur_offset = extent_end;
6944                 if (cur_offset > end)
6945                         break;
6946 @@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
6947         return ret;
6948  }
6949  
6950 +static int btrfs_split_extent_hook(struct inode *inode,
6951 +                                   struct extent_state *orig, u64 split)
6952 +{
6953 +       struct btrfs_root *root = BTRFS_I(inode)->root;
6954 +       u64 size;
6955 +
6956 +       if (!(orig->state & EXTENT_DELALLOC))
6957 +               return 0;
6958 +
6959 +       size = orig->end - orig->start + 1;
6960 +       if (size > root->fs_info->max_extent) {
6961 +               u64 num_extents;
6962 +               u64 new_size;
6963 +
6964 +               new_size = orig->end - split + 1;
6965 +               num_extents = div64_u64(size + root->fs_info->max_extent - 1,
6966 +                                       root->fs_info->max_extent);
6967 +
6968 +               /*
6969 +                * if we break a large extent up then leave outstanding_extents
6970 +                * be, since we've already accounted for the large extent.
6971 +                */
6972 +               if (div64_u64(new_size + root->fs_info->max_extent - 1,
6973 +                             root->fs_info->max_extent) < num_extents)
6974 +                       return 0;
6975 +       }
6976 +
6977 +       spin_lock(&BTRFS_I(inode)->accounting_lock);
6978 +       BTRFS_I(inode)->outstanding_extents++;
6979 +       spin_unlock(&BTRFS_I(inode)->accounting_lock);
6980 +
6981 +       return 0;
6982 +}
6983 +
6984 +/*
6985 + * extent_io.c merge_extent_hook, used to track merged delayed allocation
6986 + * extents so we can keep track of new extents that are just merged onto old
6987 + * extents, such as when we are doing sequential writes, so we can properly
6988 + * account for the metadata space we'll need.
6989 + */
6990 +static int btrfs_merge_extent_hook(struct inode *inode,
6991 +                                  struct extent_state *new,
6992 +                                  struct extent_state *other)
6993 +{
6994 +       struct btrfs_root *root = BTRFS_I(inode)->root;
6995 +       u64 new_size, old_size;
6996 +       u64 num_extents;
6997 +
6998 +       /* not delalloc, ignore it */
6999 +       if (!(other->state & EXTENT_DELALLOC))
7000 +               return 0;
7001 +
7002 +       old_size = other->end - other->start + 1;
7003 +       if (new->start < other->start)
7004 +               new_size = other->end - new->start + 1;
7005 +       else
7006 +               new_size = new->end - other->start + 1;
7007 +
7008 +       /* we're not bigger than the max, unreserve the space and go */
7009 +       if (new_size <= root->fs_info->max_extent) {
7010 +               spin_lock(&BTRFS_I(inode)->accounting_lock);
7011 +               BTRFS_I(inode)->outstanding_extents--;
7012 +               spin_unlock(&BTRFS_I(inode)->accounting_lock);
7013 +               return 0;
7014 +       }
7015 +
7016 +       /*
7017 +        * If we grew by another max_extent, just return; we want to keep that
7018 +        * reserved amount.
7019 +        */
7020 +       num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
7021 +                               root->fs_info->max_extent);
7022 +       if (div64_u64(new_size + root->fs_info->max_extent - 1,
7023 +                     root->fs_info->max_extent) > num_extents)
7024 +               return 0;
7025 +
7026 +       spin_lock(&BTRFS_I(inode)->accounting_lock);
7027 +       BTRFS_I(inode)->outstanding_extents--;
7028 +       spin_unlock(&BTRFS_I(inode)->accounting_lock);
7029 +
7030 +       return 0;
7031 +}
7032 +
7033  /*
7034   * extent_io.c set_bit_hook, used to track delayed allocation
7035   * bytes in this file, and to maintain the list of inodes that
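
Both hooks above reduce to the same ceiling division: a delalloc extent of size bytes costs div64_u64(size + max_extent - 1, max_extent) reserved metadata extents, and a split or merge only touches outstanding_extents when that quotient changes. A worked user-space example (the max_extent value is illustrative):

#include <stdint.h>
#include <stdio.h>

/* same rounding as div64_u64(size + max_extent - 1, max_extent) */
static uint64_t nr_extents(uint64_t size, uint64_t max_extent)
{
        return (size + max_extent - 1) / max_extent;
}

int main(void)
{
        uint64_t max = 128 * 1024;      /* illustrative max_extent */

        /*
         * split hook: a 300K extent reserved ceil(300/128) = 3 units.
         * Splitting a 172K tail off gives ceil(172/128) = 2 < 3, so the
         * pieces are already covered and outstanding_extents is left
         * alone; splitting a 100K (single-unit) extent, by contrast,
         * really does create a second outstanding extent, so the hook
         * increments the count.
         */
        printf("300K = %llu units, 172K tail = %llu units\n",
               (unsigned long long)nr_extents(300 * 1024, max),
               (unsigned long long)nr_extents(172 * 1024, max));

        /*
         * merge hook: 60K + 60K = 120K fits in one unit, so one
         * reservation is released; 100K + 60K = 160K needs
         * ceil(160/128) = 2 units, more than the old piece's 1,
         * so both reservations are kept.
         */
        printf("120K = %llu units, 160K = %llu units\n",
               (unsigned long long)nr_extents(120 * 1024, max),
               (unsigned long long)nr_extents(160 * 1024, max));
        return 0;
}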
7036 @@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
7037  static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
7038                        unsigned long old, unsigned long bits)
7039  {
7040 +
7041         /*
7042          * set_bit and clear bit hooks normally require _irqsave/restore
7043          * but in this case, we are only testing for the DELALLOC
7044 @@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
7045          */
7046         if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
7047                 struct btrfs_root *root = BTRFS_I(inode)->root;
7048 +
7049 +               spin_lock(&BTRFS_I(inode)->accounting_lock);
7050 +               BTRFS_I(inode)->outstanding_extents++;
7051 +               spin_unlock(&BTRFS_I(inode)->accounting_lock);
7052                 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
7053                 spin_lock(&root->fs_info->delalloc_lock);
7054                 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
7055 @@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
7056  /*
7057   * extent_io.c clear_bit_hook, see set_bit_hook for why
7058   */
7059 -static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
7060 -                        unsigned long old, unsigned long bits)
7061 +static int btrfs_clear_bit_hook(struct inode *inode,
7062 +                               struct extent_state *state, unsigned long bits)
7063  {
7064         /*
7065          * set_bit and clear bit hooks normally require _irqsave/restore
7066          * but in this case, we are only testing for the DELALLOC
7067          * bit, which is only set or cleared with irqs on
7068          */
7069 -       if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
7070 +       if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
7071                 struct btrfs_root *root = BTRFS_I(inode)->root;
7072  
7073 +               if (bits & EXTENT_DO_ACCOUNTING) {
7074 +                       spin_lock(&BTRFS_I(inode)->accounting_lock);
7075 +                       BTRFS_I(inode)->outstanding_extents--;
7076 +                       spin_unlock(&BTRFS_I(inode)->accounting_lock);
7077 +                       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
7078 +               }
7079 +
7080                 spin_lock(&root->fs_info->delalloc_lock);
7081 -               if (end - start + 1 > root->fs_info->delalloc_bytes) {
7082 +               if (state->end - state->start + 1 >
7083 +                   root->fs_info->delalloc_bytes) {
7084                         printk(KERN_INFO "btrfs warning: delalloc account "
7085                                "%llu %llu\n",
7086 -                              (unsigned long long)end - start + 1,
7087 +                              (unsigned long long)
7088 +                              state->end - state->start + 1,
7089                                (unsigned long long)
7090                                root->fs_info->delalloc_bytes);
7091                         btrfs_delalloc_free_space(root, inode, (u64)-1);
7092 @@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
7093                         BTRFS_I(inode)->delalloc_bytes = 0;
7094                 } else {
7095                         btrfs_delalloc_free_space(root, inode,
7096 -                                                 end - start + 1);
7097 -                       root->fs_info->delalloc_bytes -= end - start + 1;
7098 -                       BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
7099 +                                                 state->end -
7100 +                                                 state->start + 1);
7101 +                       root->fs_info->delalloc_bytes -= state->end -
7102 +                               state->start + 1;
7103 +                       BTRFS_I(inode)->delalloc_bytes -= state->end -
7104 +                               state->start + 1;
7105                 }
7106                 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
7107                     !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
7108 @@ -1374,10 +1506,8 @@ again:
7109         lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
7110  
7111         /* already ordered? We're done */
7112 -       if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7113 -                            EXTENT_ORDERED, 0)) {
7114 +       if (PagePrivate2(page))
7115                 goto out;
7116 -       }
7117  
7118         ordered = btrfs_lookup_ordered_extent(inode, page_start);
7119         if (ordered) {
7120 @@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
7121         struct inode *inode = page->mapping->host;
7122         struct btrfs_writepage_fixup *fixup;
7123         struct btrfs_root *root = BTRFS_I(inode)->root;
7124 -       int ret;
7125  
7126 -       ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
7127 -                            EXTENT_ORDERED, 0);
7128 -       if (ret)
7129 +       /* this page is properly in the ordered list */
7130 +       if (TestClearPagePrivate2(page))
7131                 return 0;
7132  
7133         if (PageChecked(page))
7134 @@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
7135         BUG_ON(!path);
7136  
7137         path->leave_spinning = 1;
7138 +
7139 +       /*
7140 +        * we may be replacing one extent in the tree with another.
7141 +        * The new extent is pinned in the extent map, and we don't want
7142 +        * to drop it from the cache until it is completely in the btree.
7143 +        *
7144 +        * So, tell btrfs_drop_extents to leave this extent in the cache.
7145 +        * The caller is expected to unpin it and allow it to be merged
7146 +        * with the others.
7147 +        */
7148         ret = btrfs_drop_extents(trans, root, inode, file_pos,
7149                                  file_pos + num_bytes, locked_end,
7150 -                                file_pos, &hint);
7151 +                                file_pos, &hint, 0);
7152         BUG_ON(ret);
7153  
7154         ins.objectid = inode->i_ino;
7155 @@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
7156         btrfs_mark_buffer_dirty(leaf);
7157  
7158         inode_add_bytes(inode, num_bytes);
7159 -       btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
7160  
7161         ins.objectid = disk_bytenr;
7162         ins.offset = disk_num_bytes;
7163 @@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
7164                                                 ordered_extent->len,
7165                                                 compressed, 0, 0,
7166                                                 BTRFS_FILE_EXTENT_REG);
7167 +               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
7168 +                                  ordered_extent->file_offset,
7169 +                                  ordered_extent->len);
7170                 BUG_ON(ret);
7171         }
7172         unlock_extent(io_tree, ordered_extent->file_offset,
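
The two hunks above are a pair: insert_reserved_file_extent leaves the pinned mapping in the extent cache while the btree copy is written (drop_cache passed as 0), and btrfs_finish_ordered_io unpins it once the file extent item is inserted, after which it may merge with its neighbors. A toy user-space sketch of that lifecycle (names and fields are stand-ins):

#include <assert.h>

enum em_state { EM_PINNED, EM_UNPINNED };

struct em_stub {
        enum em_state state;
        int in_btree;   /* file extent item inserted yet? */
};

/* drop_cache == 0: the pinned mapping stays visible while the
 * btree copy is written, so readers never see a hole */
static void insert_reserved(struct em_stub *em)
{
        em->state = EM_PINNED;
        em->in_btree = 1;
}

/* only once the item is in the btree is it safe to unpin and let
 * the mapping merge with its neighbors */
static void finish_ordered(struct em_stub *em)
{
        assert(em->in_btree);
        em->state = EM_UNPINNED;
}

int main(void)
{
        struct em_stub em = { EM_PINNED, 0 };

        insert_reserved(&em);
        finish_ordered(&em);
        return em.state == EM_UNPINNED ? 0 : 1;
}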
7173 @@ -1623,6 +1763,7 @@ nocow:
7174  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
7175                                 struct extent_state *state, int uptodate)
7176  {
7177 +       ClearPagePrivate2(page);
7178         return btrfs_finish_ordered_io(page->mapping->host, start, end);
7179  }
7180  
7181 @@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
7182                 failrec->last_mirror = 0;
7183                 failrec->bio_flags = 0;
7184  
7185 -               spin_lock(&em_tree->lock);
7186 +               read_lock(&em_tree->lock);
7187                 em = lookup_extent_mapping(em_tree, start, failrec->len);
7188                 if (em->start > start || em->start + em->len < start) {
7189                         free_extent_map(em);
7190                         em = NULL;
7191                 }
7192 -               spin_unlock(&em_tree->lock);
7193 +               read_unlock(&em_tree->lock);
7194  
7195                 if (!em || IS_ERR(em)) {
7196                         kfree(failrec);
7197 @@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
7198                 return 0;
7199  
7200         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
7201 -           test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
7202 +           test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
7203                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
7204                                   GFP_NOFS);
7205                 return 0;
7206 @@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
7207         return ret;
7208  }
7209  
7210 +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
7211 +                       struct btrfs_root *root,
7212 +                       struct inode *dir, u64 objectid,
7213 +                       const char *name, int name_len)
7214 +{
7215 +       struct btrfs_path *path;
7216 +       struct extent_buffer *leaf;
7217 +       struct btrfs_dir_item *di;
7218 +       struct btrfs_key key;
7219 +       u64 index;
7220 +       int ret;
7221 +
7222 +       path = btrfs_alloc_path();
7223 +       if (!path)
7224 +               return -ENOMEM;
7225 +
7226 +       di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
7227 +                                  name, name_len, -1);
7228 +       BUG_ON(!di || IS_ERR(di));
7229 +
7230 +       leaf = path->nodes[0];
7231 +       btrfs_dir_item_key_to_cpu(leaf, di, &key);
7232 +       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
7233 +       ret = btrfs_delete_one_dir_name(trans, root, path, di);
7234 +       BUG_ON(ret);
7235 +       btrfs_release_path(root, path);
7236 +
7237 +       ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
7238 +                                objectid, root->root_key.objectid,
7239 +                                dir->i_ino, &index, name, name_len);
7240 +       if (ret < 0) {
7241 +               BUG_ON(ret != -ENOENT);
7242 +               di = btrfs_search_dir_index_item(root, path, dir->i_ino,
7243 +                                                name, name_len);
7244 +               BUG_ON(!di || IS_ERR(di));
7245 +
7246 +               leaf = path->nodes[0];
7247 +               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7248 +               btrfs_release_path(root, path);
7249 +               index = key.offset;
7250 +       }
7251 +
7252 +       di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
7253 +                                        index, name, name_len, -1);
7254 +       BUG_ON(!di || IS_ERR(di));
7255 +
7256 +       leaf = path->nodes[0];
7257 +       btrfs_dir_item_key_to_cpu(leaf, di, &key);
7258 +       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
7259 +       ret = btrfs_delete_one_dir_name(trans, root, path, di);
7260 +       BUG_ON(ret);
7261 +       btrfs_release_path(root, path);
7262 +
7263 +       btrfs_i_size_write(dir, dir->i_size - name_len * 2);
7264 +       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
7265 +       ret = btrfs_update_inode(trans, root, dir);
7266 +       BUG_ON(ret);
7267 +       dir->i_sb->s_dirt = 1;
7268 +
7269 +       btrfs_free_path(path);
7270 +       return 0;
7271 +}
7272 +
7273  static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
7274  {
7275         struct inode *inode = dentry->d_inode;
7276 @@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
7277         struct btrfs_trans_handle *trans;
7278         unsigned long nr = 0;
7279  
7280 -       /*
7281 -        * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
7282 -        * the root of a subvolume or snapshot
7283 -        */
7284         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
7285 -           inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
7286 +           inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
7287                 return -ENOTEMPTY;
7288 -       }
7289  
7290         trans = btrfs_start_transaction(root, 1);
7291         btrfs_set_trans_block_group(trans, dir);
7292  
7293 +       if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
7294 +               err = btrfs_unlink_subvol(trans, root, dir,
7295 +                                         BTRFS_I(inode)->location.objectid,
7296 +                                         dentry->d_name.name,
7297 +                                         dentry->d_name.len);
7298 +               goto out;
7299 +       }
7300 +
7301         err = btrfs_orphan_add(trans, inode);
7302         if (err)
7303 -               goto fail_trans;
7304 +               goto out;
7305  
7306         /* now the directory is empty */
7307         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
7308                                  dentry->d_name.name, dentry->d_name.len);
7309         if (!err)
7310                 btrfs_i_size_write(inode, 0);
7311 -
7312 -fail_trans:
7313 +out:
7314         nr = trans->blocks_used;
7315         ret = btrfs_end_transaction_throttle(trans, root);
7316         btrfs_btree_balance_dirty(root, nr);
7317 @@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
7318  
7319         if ((offset & (blocksize - 1)) == 0)
7320                 goto out;
7321 +       ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
7322 +       if (ret)
7323 +               goto out;
7324 +
7325 +       ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
7326 +       if (ret)
7327 +               goto out;
7328  
7329         ret = -ENOMEM;
7330  again:
7331         page = grab_cache_page(mapping, index);
7332 -       if (!page)
7333 +       if (!page) {
7334 +               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
7335 +               btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
7336                 goto out;
7337 +       }
7338  
7339         page_start = page_offset(page);
7340         page_end = page_start + PAGE_CACHE_SIZE - 1;
7341 @@ -2864,7 +3080,16 @@ again:
7342                 goto again;
7343         }
7344  
7345 -       btrfs_set_extent_delalloc(inode, page_start, page_end);
7346 +       clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
7347 +                         EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
7348 +                         GFP_NOFS);
7349 +
7350 +       ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
7351 +       if (ret) {
7352 +               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
7353 +               goto out_unlock;
7354 +       }
7355 +
7356         ret = 0;
7357         if (offset != PAGE_CACHE_SIZE) {
7358                 kaddr = kmap(page);
7359 @@ -2877,6 +3102,9 @@ again:
7360         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
7361  
7362  out_unlock:
7363 +       if (ret)
7364 +               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
7365 +       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
7366         unlock_page(page);
7367         page_cache_release(page);
7368  out:
7369 @@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
7370         u64 last_byte;
7371         u64 cur_offset;
7372         u64 hole_size;
7373 -       int err;
7374 +       int err = 0;
7375  
7376         if (size <= hole_start)
7377                 return 0;
7378  
7379 -       err = btrfs_check_metadata_free_space(root);
7380 +       err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
7381         if (err)
7382                 return err;
7383  
7384 -       btrfs_truncate_page(inode->i_mapping, inode->i_size);
7385 -
7386         while (1) {
7387                 struct btrfs_ordered_extent *ordered;
7388                 btrfs_wait_ordered_range(inode, hole_start,
7389 @@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
7390                                                  cur_offset,
7391                                                  cur_offset + hole_size,
7392                                                  block_end,
7393 -                                                cur_offset, &hint_byte);
7394 +                                                cur_offset, &hint_byte, 1);
7395                         if (err)
7396                                 break;
7397 +
7398 +                       err = btrfs_reserve_metadata_space(root, 1);
7399 +                       if (err)
7400 +                               break;
7401 +
7402                         err = btrfs_insert_file_extent(trans, root,
7403                                         inode->i_ino, cur_offset, 0,
7404                                         0, hole_size, 0, hole_size,
7405                                         0, 0, 0);
7406                         btrfs_drop_extent_cache(inode, hole_start,
7407                                         last_byte - 1, 0);
7408 +                       btrfs_unreserve_metadata_space(root, 1);
7409                 }
7410                 free_extent_map(em);
7411                 cur_offset = last_byte;
7412 @@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode)
7413         }
7414         btrfs_wait_ordered_range(inode, 0, (u64)-1);
7415  
7416 +       if (inode->i_nlink > 0) {
7417 +               BUG_ON(btrfs_root_refs(&root->root_item) != 0);
7418 +               goto no_delete;
7419 +       }
7420 +
7421         btrfs_i_size_write(inode, 0);
7422         trans = btrfs_join_transaction(root, 1);
7423  
7424 @@ -3070,29 +3307,67 @@ out_err:
7425   * is kind of like crossing a mount point.
7426   */
7427  static int fixup_tree_root_location(struct btrfs_root *root,
7428 -                            struct btrfs_key *location,
7429 -                            struct btrfs_root **sub_root,
7430 -                            struct dentry *dentry)
7431 +                                   struct inode *dir,
7432 +                                   struct dentry *dentry,
7433 +                                   struct btrfs_key *location,
7434 +                                   struct btrfs_root **sub_root)
7435  {
7436 -       struct btrfs_root_item *ri;
7437 +       struct btrfs_path *path;
7438 +       struct btrfs_root *new_root;
7439 +       struct btrfs_root_ref *ref;
7440 +       struct extent_buffer *leaf;
7441 +       int ret;
7442 +       int err = 0;
7443  
7444 -       if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
7445 -               return 0;
7446 -       if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
7447 -               return 0;
7448 +       path = btrfs_alloc_path();
7449 +       if (!path) {
7450 +               err = -ENOMEM;
7451 +               goto out;
7452 +       }
7453  
7454 -       *sub_root = btrfs_read_fs_root(root->fs_info, location,
7455 -                                       dentry->d_name.name,
7456 -                                       dentry->d_name.len);
7457 -       if (IS_ERR(*sub_root))
7458 -               return PTR_ERR(*sub_root);
7459 +       err = -ENOENT;
7460 +       ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
7461 +                                 BTRFS_I(dir)->root->root_key.objectid,
7462 +                                 location->objectid);
7463 +       if (ret) {
7464 +               if (ret < 0)
7465 +                       err = ret;
7466 +               goto out;
7467 +       }
7468  
7469 -       ri = &(*sub_root)->root_item;
7470 -       location->objectid = btrfs_root_dirid(ri);
7471 -       btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
7472 -       location->offset = 0;
7473 +       leaf = path->nodes[0];
7474 +       ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
7475 +       if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
7476 +           btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
7477 +               goto out;
7478  
7479 -       return 0;
7480 +       ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
7481 +                                  (unsigned long)(ref + 1),
7482 +                                  dentry->d_name.len);
7483 +       if (ret)
7484 +               goto out;
7485 +
7486 +       btrfs_release_path(root->fs_info->tree_root, path);
7487 +
7488 +       new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
7489 +       if (IS_ERR(new_root)) {
7490 +               err = PTR_ERR(new_root);
7491 +               goto out;
7492 +       }
7493 +
7494 +       if (btrfs_root_refs(&new_root->root_item) == 0) {
7495 +               err = -ENOENT;
7496 +               goto out;
7497 +       }
7498 +
7499 +       *sub_root = new_root;
7500 +       location->objectid = btrfs_root_dirid(&new_root->root_item);
7501 +       location->type = BTRFS_INODE_ITEM_KEY;
7502 +       location->offset = 0;
7503 +       err = 0;
7504 +out:
7505 +       btrfs_free_path(path);
7506 +       return err;
7507  }
7508  
7509  static void inode_tree_add(struct inode *inode)
7510 @@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode)
7511         struct btrfs_inode *entry;
7512         struct rb_node **p;
7513         struct rb_node *parent;
7514 -
7515  again:
7516         p = &root->inode_tree.rb_node;
7517         parent = NULL;
7518  
7519 +       if (hlist_unhashed(&inode->i_hash))
7520 +               return;
7521 +
7522         spin_lock(&root->inode_lock);
7523         while (*p) {
7524                 parent = *p;
7525 @@ -3132,13 +3409,87 @@ again:
7526  static void inode_tree_del(struct inode *inode)
7527  {
7528         struct btrfs_root *root = BTRFS_I(inode)->root;
7529 +       int empty = 0;
7530  
7531         spin_lock(&root->inode_lock);
7532         if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
7533                 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
7534                 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
7535 +               empty = RB_EMPTY_ROOT(&root->inode_tree);
7536 +       }
7537 +       spin_unlock(&root->inode_lock);
7538 +
7539 +       if (empty && btrfs_root_refs(&root->root_item) == 0) {
7540 +               synchronize_srcu(&root->fs_info->subvol_srcu);
7541 +               spin_lock(&root->inode_lock);
7542 +               empty = RB_EMPTY_ROOT(&root->inode_tree);
7543 +               spin_unlock(&root->inode_lock);
7544 +               if (empty)
7545 +                       btrfs_add_dead_root(root);
7546 +       }
7547 +}
7548 +
7549 +int btrfs_invalidate_inodes(struct btrfs_root *root)
7550 +{
7551 +       struct rb_node *node;
7552 +       struct rb_node *prev;
7553 +       struct btrfs_inode *entry;
7554 +       struct inode *inode;
7555 +       u64 objectid = 0;
7556 +
7557 +       WARN_ON(btrfs_root_refs(&root->root_item) != 0);
7558 +
7559 +       spin_lock(&root->inode_lock);
7560 +again:
7561 +       node = root->inode_tree.rb_node;
7562 +       prev = NULL;
7563 +       while (node) {
7564 +               prev = node;
7565 +               entry = rb_entry(node, struct btrfs_inode, rb_node);
7566 +
7567 +               if (objectid < entry->vfs_inode.i_ino)
7568 +                       node = node->rb_left;
7569 +               else if (objectid > entry->vfs_inode.i_ino)
7570 +                       node = node->rb_right;
7571 +               else
7572 +                       break;
7573 +       }
7574 +       if (!node) {
7575 +               while (prev) {
7576 +                       entry = rb_entry(prev, struct btrfs_inode, rb_node);
7577 +                       if (objectid <= entry->vfs_inode.i_ino) {
7578 +                               node = prev;
7579 +                               break;
7580 +                       }
7581 +                       prev = rb_next(prev);
7582 +               }
7583 +       }
7584 +       while (node) {
7585 +               entry = rb_entry(node, struct btrfs_inode, rb_node);
7586 +               objectid = entry->vfs_inode.i_ino + 1;
7587 +               inode = igrab(&entry->vfs_inode);
7588 +               if (inode) {
7589 +                       spin_unlock(&root->inode_lock);
7590 +                       if (atomic_read(&inode->i_count) > 1)
7591 +                               d_prune_aliases(inode);
7592 +                       /*
7593 +                        * btrfs_drop_inode will remove it from
7594 +                        * the inode cache when its usage count
7595 +                        * hits zero.
7596 +                        */
7597 +                       iput(inode);
7598 +                       cond_resched();
7599 +                       spin_lock(&root->inode_lock);
7600 +                       goto again;
7601 +               }
7602 +
7603 +               if (cond_resched_lock(&root->inode_lock))
7604 +                       goto again;
7605 +
7606 +               node = rb_next(node);
7607         }
7608         spin_unlock(&root->inode_lock);
7609 +       return 0;
7610  }
7611  
7612  static noinline void init_btrfs_i(struct inode *inode)
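
btrfs_invalidate_inodes above walks the per-root inode rb-tree without holding inode_lock across igrab/iput, which can sleep: it records the next objectid, drops the lock, and restarts the search from that key. The same resumable-iteration pattern in a user-space sketch, with a sorted array standing in for the rb-tree:

#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t keys[] = { 256, 257, 300, 512 };
static const size_t nkeys = sizeof(keys) / sizeof(keys[0]);

static void drop_ref(uint64_t key) { (void)key; /* may sleep */ }

static void invalidate_all(void)
{
        uint64_t next = 0;
        size_t i;

again:
        pthread_mutex_lock(&tree_lock);
        /* find the first key >= next; the rb-tree binary-searches here */
        for (i = 0; i < nkeys; i++) {
                if (keys[i] < next)
                        continue;
                next = keys[i] + 1;     /* resume point for the restart */
                pthread_mutex_unlock(&tree_lock);
                drop_ref(keys[i]);      /* lock dropped: safe to sleep */
                goto again;
        }
        pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
        invalidate_all();
        return 0;
}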
7613 @@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode)
7614         bi->generation = 0;
7615         bi->sequence = 0;
7616         bi->last_trans = 0;
7617 +       bi->last_sub_trans = 0;
7618         bi->logged_trans = 0;
7619         bi->delalloc_bytes = 0;
7620         bi->reserved_bytes = 0;
7621 @@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
7622         return inode;
7623  }
7624  
7625 +static struct inode *new_simple_dir(struct super_block *s,
7626 +                                   struct btrfs_key *key,
7627 +                                   struct btrfs_root *root)
7628 +{
7629 +       struct inode *inode = new_inode(s);
7630 +
7631 +       if (!inode)
7632 +               return ERR_PTR(-ENOMEM);
7633 +
7634 +       init_btrfs_i(inode);
7635 +
7636 +       BTRFS_I(inode)->root = root;
7637 +       memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
7638 +       BTRFS_I(inode)->dummy_inode = 1;
7639 +
7640 +       inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
7641 +       inode->i_op = &simple_dir_inode_operations;
7642 +       inode->i_fop = &simple_dir_operations;
7643 +       inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
7644 +       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
7645 +
7646 +       return inode;
7647 +}
7648 +
7649  struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
7650  {
7651         struct inode *inode;
7652 -       struct btrfs_inode *bi = BTRFS_I(dir);
7653 -       struct btrfs_root *root = bi->root;
7654 +       struct btrfs_root *root = BTRFS_I(dir)->root;
7655         struct btrfs_root *sub_root = root;
7656         struct btrfs_key location;
7657 +       int index;
7658         int ret;
7659  
7660 +       dentry->d_op = &btrfs_dentry_operations;
7661 +
7662         if (dentry->d_name.len > BTRFS_NAME_LEN)
7663                 return ERR_PTR(-ENAMETOOLONG);
7664  
7665 @@ -3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
7666         if (ret < 0)
7667                 return ERR_PTR(ret);
7668  
7669 -       inode = NULL;
7670 -       if (location.objectid) {
7671 -               ret = fixup_tree_root_location(root, &location, &sub_root,
7672 -                                               dentry);
7673 -               if (ret < 0)
7674 -                       return ERR_PTR(ret);
7675 -               if (ret > 0)
7676 -                       return ERR_PTR(-ENOENT);
7677 +       if (location.objectid == 0)
7678 +               return NULL;
7679 +
7680 +       if (location.type == BTRFS_INODE_ITEM_KEY) {
7681 +               inode = btrfs_iget(dir->i_sb, &location, root);
7682 +               return inode;
7683 +       }
7684 +
7685 +       BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
7686 +
7687 +       index = srcu_read_lock(&root->fs_info->subvol_srcu);
7688 +       ret = fixup_tree_root_location(root, dir, dentry,
7689 +                                      &location, &sub_root);
7690 +       if (ret < 0) {
7691 +               if (ret != -ENOENT)
7692 +                       inode = ERR_PTR(ret);
7693 +               else
7694 +                       inode = new_simple_dir(dir->i_sb, &location, sub_root);
7695 +       } else {
7696                 inode = btrfs_iget(dir->i_sb, &location, sub_root);
7697 -               if (IS_ERR(inode))
7698 -                       return ERR_CAST(inode);
7699         }
7700 +       srcu_read_unlock(&root->fs_info->subvol_srcu, index);
7701 +
7702         return inode;
7703  }
7704  
7705 +static int btrfs_dentry_delete(struct dentry *dentry)
7706 +{
7707 +       struct btrfs_root *root;
7708 +
7709 +       if (!dentry->d_inode && !IS_ROOT(dentry))
7710 +               dentry = dentry->d_parent;
7711 +
7712 +       if (dentry->d_inode) {
7713 +               root = BTRFS_I(dentry->d_inode)->root;
7714 +               if (btrfs_root_refs(&root->root_item) == 0)
7715 +                       return 1;
7716 +       }
7717 +       return 0;
7718 +}
7719 +
7720  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
7721                                    struct nameidata *nd)
7722  {
7723         struct inode *inode;
7724  
7725 -       if (dentry->d_name.len > BTRFS_NAME_LEN)
7726 -               return ERR_PTR(-ENAMETOOLONG);
7727 -
7728         inode = btrfs_lookup_dentry(dir, dentry);
7729         if (IS_ERR(inode))
7730                 return ERR_CAST(inode);
7731 @@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
7732         if (ret != 0)
7733                 goto fail;
7734  
7735 -       if (objectid > root->highest_inode)
7736 -               root->highest_inode = objectid;
7737 -
7738         inode->i_uid = current_fsuid();
7739  
7740         if (dir && (dir->i_mode & S_ISGID)) {
7741 @@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
7742                    struct inode *parent_inode, struct inode *inode,
7743                    const char *name, int name_len, int add_backref, u64 index)
7744  {
7745 -       int ret;
7746 +       int ret = 0;
7747         struct btrfs_key key;
7748         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
7749  
7750 -       key.objectid = inode->i_ino;
7751 -       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
7752 -       key.offset = 0;
7753 +       if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7754 +               memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
7755 +       } else {
7756 +               key.objectid = inode->i_ino;
7757 +               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
7758 +               key.offset = 0;
7759 +       }
7760 +
7761 +       if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7762 +               ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
7763 +                                        key.objectid, root->root_key.objectid,
7764 +                                        parent_inode->i_ino,
7765 +                                        index, name, name_len);
7766 +       } else if (add_backref) {
7767 +               ret = btrfs_insert_inode_ref(trans, root,
7768 +                                            name, name_len, inode->i_ino,
7769 +                                            parent_inode->i_ino, index);
7770 +       }
7771  
7772 -       ret = btrfs_insert_dir_item(trans, root, name, name_len,
7773 -                                   parent_inode->i_ino,
7774 -                                   &key, btrfs_inode_type(inode),
7775 -                                   index);
7776         if (ret == 0) {
7777 -               if (add_backref) {
7778 -                       ret = btrfs_insert_inode_ref(trans, root,
7779 -                                                    name, name_len,
7780 -                                                    inode->i_ino,
7781 -                                                    parent_inode->i_ino,
7782 -                                                    index);
7783 -               }
7784 +               ret = btrfs_insert_dir_item(trans, root, name, name_len,
7785 +                                           parent_inode->i_ino, &key,
7786 +                                           btrfs_inode_type(inode), index);
7787 +               BUG_ON(ret);
7788 +
7789                 btrfs_i_size_write(parent_inode, parent_inode->i_size +
7790                                    name_len * 2);
7791                 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
7792 @@ -3732,11 +4139,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
7793         if (!new_valid_dev(rdev))
7794                 return -EINVAL;
7795  
7796 -       err = btrfs_check_metadata_free_space(root);
7797 +       /*
7798 +        * 2 for inode item and ref
7799 +        * 2 for dir items
7800 +        * 1 for xattr if selinux is on
7801 +        */
7802 +       err = btrfs_reserve_metadata_space(root, 5);
7803         if (err)
7804 -               goto fail;
7805 +               return err;
7806  
7807         trans = btrfs_start_transaction(root, 1);
7808 +       if (!trans)
7809 +               goto fail;
7810         btrfs_set_trans_block_group(trans, dir);
7811  
7812         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
7813 @@ -3774,6 +4188,7 @@ out_unlock:
7814         nr = trans->blocks_used;
7815         btrfs_end_transaction_throttle(trans, root);
7816  fail:
7817 +       btrfs_unreserve_metadata_space(root, 5);
7818         if (drop_inode) {
7819                 inode_dec_link_count(inode);
7820                 iput(inode);
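/*
 * The reservation pattern in the hunks above repeats at every
 * creation-style entry point in this patch (mknod, create, link, mkdir,
 * symlink).  As a sketch, using only names from the patch itself:
 *
 *	err = btrfs_reserve_metadata_space(root, N);
 *	if (err)
 *		return err;		// nothing reserved yet, bail early
 *	trans = btrfs_start_transaction(root, 1);
 *	...
 * fail:
 *	btrfs_unreserve_metadata_space(root, N);	// every exit path
 *
 * For mknod N = 5: inode item plus inode ref (2), dir item plus dir
 * index item (2), and one xattr item in case selinux is enabled.
 */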
7821 @@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
7822         u64 objectid;
7823         u64 index = 0;
7824  
7825 -       err = btrfs_check_metadata_free_space(root);
7826 +       /*
7827 +        * 2 for inode item and ref
7828 +        * 2 for dir items
7829 +        * 1 for xattr if selinux is on
7830 +        */
7831 +       err = btrfs_reserve_metadata_space(root, 5);
7832         if (err)
7833 -               goto fail;
7834 +               return err;
7835 +
7836         trans = btrfs_start_transaction(root, 1);
7837 +       if (!trans)
7838 +               goto fail;
7839         btrfs_set_trans_block_group(trans, dir);
7840  
7841         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
7842 @@ -3838,6 +4261,7 @@ out_unlock:
7843         nr = trans->blocks_used;
7844         btrfs_end_transaction_throttle(trans, root);
7845  fail:
7846 +       btrfs_unreserve_metadata_space(root, 5);
7847         if (drop_inode) {
7848                 inode_dec_link_count(inode);
7849                 iput(inode);
7850 @@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
7851         if (inode->i_nlink == 0)
7852                 return -ENOENT;
7853  
7854 -       btrfs_inc_nlink(inode);
7855 -       err = btrfs_check_metadata_free_space(root);
7856 +       /*
7857 +        * 1 item for inode ref
7858 +        * 2 items for dir items
7859 +        */
7860 +       err = btrfs_reserve_metadata_space(root, 3);
7861         if (err)
7862 -               goto fail;
7863 +               return err;
7864 +
7865 +       btrfs_inc_nlink(inode);
7866 +
7867         err = btrfs_set_inode_index(dir, &index);
7868         if (err)
7869                 goto fail;
7870 @@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
7871  
7872         err = btrfs_add_nondir(trans, dentry, inode, 1, index);
7873  
7874 -       if (err)
7875 -               drop_inode = 1;
7876 -
7877 -       btrfs_update_inode_block_group(trans, dir);
7878 -       err = btrfs_update_inode(trans, root, inode);
7879 -
7880 -       if (err)
7881 +       if (err) {
7882                 drop_inode = 1;
7883 +       } else {
7884 +               btrfs_update_inode_block_group(trans, dir);
7885 +               err = btrfs_update_inode(trans, root, inode);
7886 +               BUG_ON(err);
7887 +               btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
7888 +       }
7889  
7890         nr = trans->blocks_used;
7891 -
7892 -       btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
7893         btrfs_end_transaction_throttle(trans, root);
7894  fail:
7895 +       btrfs_unreserve_metadata_space(root, 3);
7896         if (drop_inode) {
7897                 inode_dec_link_count(inode);
7898                 iput(inode);
7899 @@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
7900         u64 index = 0;
7901         unsigned long nr = 1;
7902  
7903 -       err = btrfs_check_metadata_free_space(root);
7904 +       /*
7905 +        * 2 items for inode and ref
7906 +        * 2 items for dir items
7907 +        * 1 for xattr if selinux is on
7908 +        */
7909 +       err = btrfs_reserve_metadata_space(root, 5);
7910         if (err)
7911 -               goto out_unlock;
7912 +               return err;
7913  
7914         trans = btrfs_start_transaction(root, 1);
7915 -       btrfs_set_trans_block_group(trans, dir);
7916 -
7917 -       if (IS_ERR(trans)) {
7918 -               err = PTR_ERR(trans);
7919 +       if (!trans) {
7920 +               err = -ENOMEM;
7921                 goto out_unlock;
7922         }
7923 +       btrfs_set_trans_block_group(trans, dir);
7924  
7925         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
7926         if (err) {
7927 @@ -3967,6 +4400,7 @@ out_fail:
7928         btrfs_end_transaction_throttle(trans, root);
7929  
7930  out_unlock:
7931 +       btrfs_unreserve_metadata_space(root, 5);
7932         if (drop_on_err)
7933                 iput(inode);
7934         btrfs_btree_balance_dirty(root, nr);
7935 @@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
7936         int compressed;
7937  
7938  again:
7939 -       spin_lock(&em_tree->lock);
7940 +       read_lock(&em_tree->lock);
7941         em = lookup_extent_mapping(em_tree, start, len);
7942         if (em)
7943                 em->bdev = root->fs_info->fs_devices->latest_bdev;
7944 -       spin_unlock(&em_tree->lock);
7945 +       read_unlock(&em_tree->lock);
7946  
7947         if (em) {
7948                 if (em->start > start || em->start + em->len <= start)
7949 @@ -4215,6 +4649,11 @@ again:
7950                                 map = kmap(page);
7951                                 read_extent_buffer(leaf, map + pg_offset, ptr,
7952                                                    copy_size);
7953 +                               if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
7954 +                                       memset(map + pg_offset + copy_size, 0,
7955 +                                              PAGE_CACHE_SIZE - pg_offset -
7956 +                                              copy_size);
7957 +                               }
7958                                 kunmap(page);
7959                         }
7960                         flush_dcache_page(page);
7961 @@ -4259,7 +4698,7 @@ insert:
7962         }
7963  
7964         err = 0;
7965 -       spin_lock(&em_tree->lock);
7966 +       write_lock(&em_tree->lock);
7967         ret = add_extent_mapping(em_tree, em);
7968         /* it is possible that someone inserted the extent into the tree
7969          * while we had the lock dropped.  It is also possible that
7970 @@ -4299,7 +4738,7 @@ insert:
7971                         err = 0;
7972                 }
7973         }
7974 -       spin_unlock(&em_tree->lock);
7975 +       write_unlock(&em_tree->lock);
7976  out:
7977         if (path)
7978                 btrfs_free_path(path);
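/*
 * Locking note for the btrfs_get_extent hunks above: em_tree->lock is
 * converted from a spinlock to a rwlock.  lookup_extent_mapping() only
 * reads the rbtree, so concurrent lookups can now run under read_lock();
 * add_extent_mapping() rebalances the tree and keeps the exclusive
 * write_lock().  (Inferred from the read_lock/write_lock split here; the
 * rwlock_t declaration itself is presumably in the extent_map changes
 * elsewhere in this patch.)
 */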
7979 @@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
7980         u64 page_start = page_offset(page);
7981         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7982  
7983 +
7984 +       /*
7985 +        * we have the page locked, so new writeback can't start,
7986 +        * and the dirty bit won't be cleared while we are here.
7987 +        *
7988 +        * Wait for IO on this page so that we can safely clear
7989 +        * the PagePrivate2 bit and do ordered accounting
7990 +        */
7991         wait_on_page_writeback(page);
7992 +
7993         tree = &BTRFS_I(page->mapping->host)->io_tree;
7994         if (offset) {
7995                 btrfs_releasepage(page, GFP_NOFS);
7996                 return;
7997         }
7998 -
7999         lock_extent(tree, page_start, page_end, GFP_NOFS);
8000         ordered = btrfs_lookup_ordered_extent(page->mapping->host,
8001                                            page_offset(page));
8002 @@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
8003                  */
8004                 clear_extent_bit(tree, page_start, page_end,
8005                                  EXTENT_DIRTY | EXTENT_DELALLOC |
8006 -                                EXTENT_LOCKED, 1, 0, GFP_NOFS);
8007 -               btrfs_finish_ordered_io(page->mapping->host,
8008 -                                       page_start, page_end);
8009 +                                EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
8010 +                                NULL, GFP_NOFS);
8011 +               /*
8012 +                * whoever cleared the private bit is responsible
8013 +                * for the finish_ordered_io
8014 +                */
8015 +               if (TestClearPagePrivate2(page)) {
8016 +                       btrfs_finish_ordered_io(page->mapping->host,
8017 +                                               page_start, page_end);
8018 +               }
8019                 btrfs_put_ordered_extent(ordered);
8020                 lock_extent(tree, page_start, page_end, GFP_NOFS);
8021         }
8022         clear_extent_bit(tree, page_start, page_end,
8023                  EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
8024 -                EXTENT_ORDERED,
8025 -                1, 1, GFP_NOFS);
8026 +                EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
8027         __btrfs_releasepage(page, GFP_NOFS);
8028  
8029         ClearPageChecked(page);
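/*
 * Ordered-accounting handshake in invalidatepage: PagePrivate2 is set
 * while an ordered extent covers the page, and TestClearPagePrivate2()
 * is an atomic test-and-clear, so exactly one of the racing parties
 * (writeback end_io or this invalidate path) sees the bit set and calls
 * btrfs_finish_ordered_io().  The "whoever cleared the private bit"
 * comment above is the whole protocol.
 */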
8030 @@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8031                 goto out;
8032         }
8033  
8034 +       ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
8035 +       if (ret) {
8036 +               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
8037 +               ret = VM_FAULT_SIGBUS;
8038 +               goto out;
8039 +       }
8040 +
8041         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8042  again:
8043         lock_page(page);
8044 @@ -4504,7 +4964,24 @@ again:
8045                 goto again;
8046         }
8047  
8048 -       btrfs_set_extent_delalloc(inode, page_start, page_end);
8049 +       /*
8050 +        * XXX - page_mkwrite gets called every time the page is dirtied, even
8051 +        * if it was already dirty, so for space accounting reasons we need to
8052 +        * clear any delalloc bits for the range we are about to save.  There
8053 +        * is probably a better way to do this, but for now keep consistent with
8054 +        * prepare_pages in the normal write path.
8055 +        */
8056 +       clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
8057 +                         EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
8058 +                         GFP_NOFS);
8059 +
8060 +       ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
8061 +       if (ret) {
8062 +               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8063 +               ret = VM_FAULT_SIGBUS;
8064 +               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
8065 +               goto out_unlock;
8066 +       }
8067         ret = 0;
8068  
8069         /* page is wholly or partially inside EOF */
8070 @@ -4521,11 +4998,17 @@ again:
8071         }
8072         ClearPageChecked(page);
8073         set_page_dirty(page);
8074 +       SetPageUptodate(page);
8075 +
8076 +       BTRFS_I(inode)->last_trans = root->fs_info->generation;
8077 +       BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
8078  
8079 -       BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
8080         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8081  
8082  out_unlock:
8083 +       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
8084 +       if (!ret)
8085 +               return VM_FAULT_LOCKED;
8086         unlock_page(page);
8087  out:
8088         return ret;
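/*
 * Summary of the page_mkwrite flow above: a data-space reservation taken
 * earlier in the function (its release via btrfs_free_reserved_data_space
 * is visible here) is now paired with a metadata-for-delalloc
 * reservation; on any failure both are unwound and the fault gets
 * VM_FAULT_SIGBUS.  On success the function returns VM_FAULT_LOCKED with
 * the page still locked, the contract newer ->page_mkwrite callers
 * expect, instead of unlocking and forcing the VM to re-fault.
 */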
8089 @@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode)
8090         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
8091                 return;
8092  
8093 -       btrfs_truncate_page(inode->i_mapping, inode->i_size);
8094 +       ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
8095 +       if (ret)
8096 +               return;
8097         btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
8098  
8099         trans = btrfs_start_transaction(root, 1);
8100 @@ -4594,11 +5079,11 @@ out:
8101   * create a new subvolume directory/inode (helper for the ioctl).
8102   */
8103  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8104 -                            struct btrfs_root *new_root, struct dentry *dentry,
8105 +                            struct btrfs_root *new_root,
8106                              u64 new_dirid, u64 alloc_hint)
8107  {
8108         struct inode *inode;
8109 -       int error;
8110 +       int err;
8111         u64 index = 0;
8112  
8113         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
8114 @@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8115         inode->i_nlink = 1;
8116         btrfs_i_size_write(inode, 0);
8117  
8118 -       error = btrfs_update_inode(trans, new_root, inode);
8119 -       if (error)
8120 -               return error;
8121 +       err = btrfs_update_inode(trans, new_root, inode);
8122 +       BUG_ON(err);
8123  
8124 -       d_instantiate(dentry, inode);
8125 +       iput(inode);
8126         return 0;
8127  }
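/*
 * btrfs_create_subvol_root() no longer instantiates the dentry itself;
 * it only writes the ".." inode into the fresh root and drops its
 * reference with iput().  The caller in ioctl.c (see the create_subvol
 * hunk below) now wires the dentry up through btrfs_lookup_dentry()
 * instead.
 */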
8128  
8129 @@ -4640,7 +5124,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8130         if (!ei)
8131                 return NULL;
8132         ei->last_trans = 0;
8133 +       ei->last_sub_trans = 0;
8134         ei->logged_trans = 0;
8135 +       ei->outstanding_extents = 0;
8136 +       ei->reserved_extents = 0;
8137 +       ei->root = NULL;
8138 +       spin_lock_init(&ei->accounting_lock);
8139         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8140         INIT_LIST_HEAD(&ei->i_orphan);
8141         INIT_LIST_HEAD(&ei->ordered_operations);
8142 @@ -4656,6 +5145,14 @@ void btrfs_destroy_inode(struct inode *inode)
8143         WARN_ON(inode->i_data.nrpages);
8144  
8145         /*
8146 +        * This can happen where we create an inode, but somebody else also
8147 +        * This can happen when we create an inode, but somebody else also
8148 +        * created.
8149 +        */
8150 +       if (!root)
8151 +               goto free;
8152 +
8153 +       /*
8154          * Make sure we're properly removed from the ordered operation
8155          * lists.
8156          */
8157 @@ -4690,9 +5187,20 @@ void btrfs_destroy_inode(struct inode *inode)
8158         }
8159         inode_tree_del(inode);
8160         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8161 +free:
8162         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8163  }
8164  
8165 +void btrfs_drop_inode(struct inode *inode)
8166 +{
8167 +       struct btrfs_root *root = BTRFS_I(inode)->root;
8168 +
8169 +       if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
8170 +               generic_delete_inode(inode);
8171 +       else
8172 +               generic_drop_inode(inode);
8173 +}
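/*
 * btrfs_drop_inode() is the new ->drop_inode hook: an inode with
 * i_nlink > 0 would normally just be cached (generic_drop_inode), but if
 * its root has zero refs, meaning the subvolume was deleted, the inode
 * is forced straight to generic_delete_inode() so nothing from a dead
 * subvolume lingers in the inode cache.
 */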
8174 +
8175  static void init_once(void *foo)
8176  {
8177         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8178 @@ -4761,31 +5269,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8179  {
8180         struct btrfs_trans_handle *trans;
8181         struct btrfs_root *root = BTRFS_I(old_dir)->root;
8182 +       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8183         struct inode *new_inode = new_dentry->d_inode;
8184         struct inode *old_inode = old_dentry->d_inode;
8185         struct timespec ctime = CURRENT_TIME;
8186         u64 index = 0;
8187 +       u64 root_objectid;
8188         int ret;
8189  
8190 -       /* we're not allowed to rename between subvolumes */
8191 -       if (BTRFS_I(old_inode)->root->root_key.objectid !=
8192 -           BTRFS_I(new_dir)->root->root_key.objectid)
8193 +       if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8194 +               return -EPERM;
8195 +
8196 +       /* we only allow renaming a subvolume link between subvolumes */
8197 +       if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8198                 return -EXDEV;
8199  
8200 +       if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8201 +           (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
8202 +               return -ENOTEMPTY;
8203 +
8204         if (S_ISDIR(old_inode->i_mode) && new_inode &&
8205 -           new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
8206 +           new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8207                 return -ENOTEMPTY;
8208 -       }
8209  
8210 -       /* to rename a snapshot or subvolume, we need to juggle the
8211 -        * backrefs.  This isn't coded yet
8212 +       /*
8213 +        * 2 items for dir items
8214 +        * 1 item for orphan entry
8215 +        * 1 item for ref
8216          */
8217 -       if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
8218 -               return -EXDEV;
8219 -
8220 -       ret = btrfs_check_metadata_free_space(root);
8221 +       ret = btrfs_reserve_metadata_space(root, 4);
8222         if (ret)
8223 -               goto out_unlock;
8224 +               return ret;
8225  
8226         /*
8227          * we're using rename to replace one file with another.
8228 @@ -4796,8 +5310,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8229             old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8230                 filemap_flush(old_inode->i_mapping);
8231  
8232 +       /* close the racy window with snapshot create/destroy ioctl */
8233 +       if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
8234 +               down_read(&root->fs_info->subvol_sem);
8235 +
8236         trans = btrfs_start_transaction(root, 1);
8237 +       btrfs_set_trans_block_group(trans, new_dir);
8238 +
8239 +       if (dest != root)
8240 +               btrfs_record_root_in_trans(trans, dest);
8241 +
8242 +       ret = btrfs_set_inode_index(new_dir, &index);
8243 +       if (ret)
8244 +               goto out_fail;
8245  
8246 +       if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8247 +               /* force full log commit if subvolume involved. */
8248 +               root->fs_info->last_trans_log_full_commit = trans->transid;
8249 +       } else {
8250 +               ret = btrfs_insert_inode_ref(trans, dest,
8251 +                                            new_dentry->d_name.name,
8252 +                                            new_dentry->d_name.len,
8253 +                                            old_inode->i_ino,
8254 +                                            new_dir->i_ino, index);
8255 +               if (ret)
8256 +                       goto out_fail;
8257 +               /*
8258 +                * this is an ugly little race, but the rename is required
8259 +                * to make sure that if we crash, the inode is either at the
8260 +                * old name or the new one.  pinning the log transaction lets
8261 +                * us make sure we don't allow a log commit to come in after
8262 +                * we unlink the name but before we add the new name back in.
8263 +                */
8264 +               btrfs_pin_log_trans(root);
8265 +       }
8266         /*
8267          * make sure the inode gets flushed if it is replacing
8268          * something.
8269 @@ -4807,18 +5353,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8270                 btrfs_add_ordered_operation(trans, root, old_inode);
8271         }
8272  
8273 -       /*
8274 -        * this is an ugly little race, but the rename is required to make
8275 -        * sure that if we crash, the inode is either at the old name
8276 -        * or the new one.  pinning the log transaction lets us make sure
8277 -        * we don't allow a log commit to come in after we unlink the
8278 -        * name but before we add the new name back in.
8279 -        */
8280 -       btrfs_pin_log_trans(root);
8281 -
8282 -       btrfs_set_trans_block_group(trans, new_dir);
8283 -
8284 -       btrfs_inc_nlink(old_dentry->d_inode);
8285         old_dir->i_ctime = old_dir->i_mtime = ctime;
8286         new_dir->i_ctime = new_dir->i_mtime = ctime;
8287         old_inode->i_ctime = ctime;
8288 @@ -4826,47 +5360,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8289         if (old_dentry->d_parent != new_dentry->d_parent)
8290                 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8291  
8292 -       ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
8293 -                                old_dentry->d_name.name,
8294 -                                old_dentry->d_name.len);
8295 -       if (ret)
8296 -               goto out_fail;
8297 +       if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8298 +               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8299 +               ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8300 +                                       old_dentry->d_name.name,
8301 +                                       old_dentry->d_name.len);
8302 +       } else {
8303 +               btrfs_inc_nlink(old_dentry->d_inode);
8304 +               ret = btrfs_unlink_inode(trans, root, old_dir,
8305 +                                        old_dentry->d_inode,
8306 +                                        old_dentry->d_name.name,
8307 +                                        old_dentry->d_name.len);
8308 +       }
8309 +       BUG_ON(ret);
8310  
8311         if (new_inode) {
8312                 new_inode->i_ctime = CURRENT_TIME;
8313 -               ret = btrfs_unlink_inode(trans, root, new_dir,
8314 -                                        new_dentry->d_inode,
8315 -                                        new_dentry->d_name.name,
8316 -                                        new_dentry->d_name.len);
8317 -               if (ret)
8318 -                       goto out_fail;
8319 +               if (unlikely(new_inode->i_ino ==
8320 +                            BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8321 +                       root_objectid = BTRFS_I(new_inode)->location.objectid;
8322 +                       ret = btrfs_unlink_subvol(trans, dest, new_dir,
8323 +                                               root_objectid,
8324 +                                               new_dentry->d_name.name,
8325 +                                               new_dentry->d_name.len);
8326 +                       BUG_ON(new_inode->i_nlink == 0);
8327 +               } else {
8328 +                       ret = btrfs_unlink_inode(trans, dest, new_dir,
8329 +                                                new_dentry->d_inode,
8330 +                                                new_dentry->d_name.name,
8331 +                                                new_dentry->d_name.len);
8332 +               }
8333 +               BUG_ON(ret);
8334                 if (new_inode->i_nlink == 0) {
8335                         ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8336 -                       if (ret)
8337 -                               goto out_fail;
8338 +                       BUG_ON(ret);
8339                 }
8340 -
8341         }
8342 -       ret = btrfs_set_inode_index(new_dir, &index);
8343 -       if (ret)
8344 -               goto out_fail;
8345  
8346 -       ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
8347 -                            old_inode, new_dentry->d_name.name,
8348 -                            new_dentry->d_name.len, 1, index);
8349 -       if (ret)
8350 -               goto out_fail;
8351 +       ret = btrfs_add_link(trans, new_dir, old_inode,
8352 +                            new_dentry->d_name.name,
8353 +                            new_dentry->d_name.len, 0, index);
8354 +       BUG_ON(ret);
8355  
8356 -       btrfs_log_new_name(trans, old_inode, old_dir,
8357 -                                      new_dentry->d_parent);
8358 +       if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
8359 +               btrfs_log_new_name(trans, old_inode, old_dir,
8360 +                                  new_dentry->d_parent);
8361 +               btrfs_end_log_trans(root);
8362 +       }
8363  out_fail:
8364 -
8365 -       /* this btrfs_end_log_trans just allows the current
8366 -        * log-sub transaction to complete
8367 -        */
8368 -       btrfs_end_log_trans(root);
8369         btrfs_end_transaction_throttle(trans, root);
8370 -out_unlock:
8371 +
8372 +       if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
8373 +               up_read(&root->fs_info->subvol_sem);
8374 +
8375 +       btrfs_unreserve_metadata_space(root, 4);
8376         return ret;
8377  }
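/*
 * The rename rework above, in short: metadata for 4 items is reserved up
 * front (two dir items, an orphan entry, an inode ref); renaming a
 * subvolume link (i_ino == BTRFS_FIRST_FREE_OBJECTID) now works across
 * roots but takes subvol_sem to close the race with the snapshot
 * create/destroy ioctls and forces a full log commit, since the tree log
 * cannot describe a subvolume move; ordinary inodes keep the
 * pin-the-log-transaction trick so a crash leaves the inode reachable
 * under either the old name or the new one.
 */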
8378  
8379 @@ -4938,11 +5485,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8380         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8381                 return -ENAMETOOLONG;
8382  
8383 -       err = btrfs_check_metadata_free_space(root);
8384 +       /*
8385 +        * 2 items for inode item and ref
8386 +        * 2 items for dir items
8387 +        * 1 item for xattr if selinux is on
8388 +        */
8389 +       err = btrfs_reserve_metadata_space(root, 5);
8390         if (err)
8391 -               goto out_fail;
8392 +               return err;
8393  
8394         trans = btrfs_start_transaction(root, 1);
8395 +       if (!trans)
8396 +               goto out_fail;
8397         btrfs_set_trans_block_group(trans, dir);
8398  
8399         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
8400 @@ -5023,6 +5577,7 @@ out_unlock:
8401         nr = trans->blocks_used;
8402         btrfs_end_transaction_throttle(trans, root);
8403  out_fail:
8404 +       btrfs_unreserve_metadata_space(root, 5);
8405         if (drop_inode) {
8406                 inode_dec_link_count(inode);
8407                 iput(inode);
8408 @@ -5044,6 +5599,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
8409  
8410         while (num_bytes > 0) {
8411                 alloc_size = min(num_bytes, root->fs_info->max_extent);
8412 +
8413 +               ret = btrfs_reserve_metadata_space(root, 1);
8414 +               if (ret)
8415 +                       goto out;
8416 +
8417                 ret = btrfs_reserve_extent(trans, root, alloc_size,
8418                                            root->sectorsize, 0, alloc_hint,
8419                                            (u64)-1, &ins, 1);
8420 @@ -5058,9 +5618,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
8421                                                   0, 0, 0,
8422                                                   BTRFS_FILE_EXTENT_PREALLOC);
8423                 BUG_ON(ret);
8424 +               btrfs_drop_extent_cache(inode, cur_offset,
8425 +                                       cur_offset + ins.offset - 1, 0);
8426                 num_bytes -= ins.offset;
8427                 cur_offset += ins.offset;
8428                 alloc_hint = ins.objectid + ins.offset;
8429 +               btrfs_unreserve_metadata_space(root, 1);
8430         }
8431  out:
8432         if (cur_offset > start) {
8433 @@ -5223,6 +5786,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
8434         .lookup         = btrfs_lookup,
8435         .permission     = btrfs_permission,
8436  };
8437 +
8438  static struct file_operations btrfs_dir_file_operations = {
8439         .llseek         = generic_file_llseek,
8440         .read           = generic_read_dir,
8441 @@ -5245,6 +5809,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
8442         .readpage_io_failed_hook = btrfs_io_failed_hook,
8443         .set_bit_hook = btrfs_set_bit_hook,
8444         .clear_bit_hook = btrfs_clear_bit_hook,
8445 +       .merge_extent_hook = btrfs_merge_extent_hook,
8446 +       .split_extent_hook = btrfs_split_extent_hook,
8447  };
8448  
8449  /*
8450 @@ -5309,3 +5875,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
8451         .listxattr      = btrfs_listxattr,
8452         .removexattr    = btrfs_removexattr,
8453  };
8454 +
8455 +const struct dentry_operations btrfs_dentry_operations = {
8456 +       .d_delete       = btrfs_dentry_delete,
8457 +};
8458 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
8459 index bd88f25..cdbb054 100644
8460 --- a/fs/btrfs/ioctl.c
8461 +++ b/fs/btrfs/ioctl.c
8462 @@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
8463         struct btrfs_root_item root_item;
8464         struct btrfs_inode_item *inode_item;
8465         struct extent_buffer *leaf;
8466 -       struct btrfs_root *new_root = root;
8467 -       struct inode *dir;
8468 +       struct btrfs_root *new_root;
8469 +       struct inode *dir = dentry->d_parent->d_inode;
8470         int ret;
8471         int err;
8472         u64 objectid;
8473 @@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
8474         u64 index = 0;
8475         unsigned long nr = 1;
8476  
8477 -       ret = btrfs_check_metadata_free_space(root);
8478 +       /*
8479 +        * 1 - inode item
8480 +        * 2 - refs
8481 +        * 1 - root item
8482 +        * 2 - dir items
8483 +        */
8484 +       ret = btrfs_reserve_metadata_space(root, 6);
8485         if (ret)
8486 -               goto fail_commit;
8487 +               return ret;
8488  
8489         trans = btrfs_start_transaction(root, 1);
8490         BUG_ON(!trans);
8491 @@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
8492         if (ret)
8493                 goto fail;
8494  
8495 +       key.offset = (u64)-1;
8496 +       new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
8497 +       BUG_ON(IS_ERR(new_root));
8498 +
8499 +       btrfs_record_root_in_trans(trans, new_root);
8500 +
8501 +       ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
8502 +                                      BTRFS_I(dir)->block_group);
8503         /*
8504          * insert the directory item
8505          */
8506 -       key.offset = (u64)-1;
8507 -       dir = dentry->d_parent->d_inode;
8508         ret = btrfs_set_inode_index(dir, &index);
8509         BUG_ON(ret);
8510  
8511 @@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
8512         ret = btrfs_update_inode(trans, root, dir);
8513         BUG_ON(ret);
8514  
8515 -       /* add the backref first */
8516         ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
8517 -                                objectid, BTRFS_ROOT_BACKREF_KEY,
8518 -                                root->root_key.objectid,
8519 +                                objectid, root->root_key.objectid,
8520                                  dir->i_ino, index, name, namelen);
8521  
8522         BUG_ON(ret);
8523  
8524 -       /* now add the forward ref */
8525 -       ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
8526 -                                root->root_key.objectid, BTRFS_ROOT_REF_KEY,
8527 -                                objectid,
8528 -                                dir->i_ino, index, name, namelen);
8529 -
8530 -       BUG_ON(ret);
8531 -
8532 -       ret = btrfs_commit_transaction(trans, root);
8533 -       if (ret)
8534 -               goto fail_commit;
8535 -
8536 -       new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
8537 -       BUG_ON(!new_root);
8538 -
8539 -       trans = btrfs_start_transaction(new_root, 1);
8540 -       BUG_ON(!trans);
8541 -
8542 -       ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
8543 -                                      BTRFS_I(dir)->block_group);
8544 -       if (ret)
8545 -               goto fail;
8546 -
8547 +       d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
8548  fail:
8549         nr = trans->blocks_used;
8550 -       err = btrfs_commit_transaction(trans, new_root);
8551 +       err = btrfs_commit_transaction(trans, root);
8552         if (err && !ret)
8553                 ret = err;
8554 -fail_commit:
8555 +
8556 +       btrfs_unreserve_metadata_space(root, 6);
8557         btrfs_btree_balance_dirty(root, nr);
8558         return ret;
8559  }
8560 @@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
8561         if (!root->ref_cows)
8562                 return -EINVAL;
8563  
8564 -       ret = btrfs_check_metadata_free_space(root);
8565 +       /*
8566 +        * 1 - inode item
8567 +        * 2 - refs
8568 +        * 1 - root item
8569 +        * 2 - dir items
8570 +        */
8571 +       ret = btrfs_reserve_metadata_space(root, 6);
8572         if (ret)
8573                 goto fail_unlock;
8574  
8575         pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
8576         if (!pending_snapshot) {
8577                 ret = -ENOMEM;
8578 +               btrfs_unreserve_metadata_space(root, 6);
8579                 goto fail_unlock;
8580         }
8581         pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
8582         if (!pending_snapshot->name) {
8583                 ret = -ENOMEM;
8584                 kfree(pending_snapshot);
8585 +               btrfs_unreserve_metadata_space(root, 6);
8586                 goto fail_unlock;
8587         }
8588         memcpy(pending_snapshot->name, name, namelen);
8589 @@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
8590   * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
8591   * inside this filesystem so it's quite a bit simpler.
8592   */
8593 -static noinline int btrfs_mksubvol(struct path *parent, char *name,
8594 -                                  int mode, int namelen,
8595 +static noinline int btrfs_mksubvol(struct path *parent,
8596 +                                  char *name, int namelen,
8597                                    struct btrfs_root *snap_src)
8598  {
8599 +       struct inode *dir  = parent->dentry->d_inode;
8600         struct dentry *dentry;
8601         int error;
8602  
8603 -       mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
8604 +       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
8605  
8606         dentry = lookup_one_len(name, parent->dentry, namelen);
8607         error = PTR_ERR(dentry);
8608 @@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
8609         if (dentry->d_inode)
8610                 goto out_dput;
8611  
8612 -       if (!IS_POSIXACL(parent->dentry->d_inode))
8613 -               mode &= ~current_umask();
8614 -
8615         error = mnt_want_write(parent->mnt);
8616         if (error)
8617                 goto out_dput;
8618  
8619 -       error = btrfs_may_create(parent->dentry->d_inode, dentry);
8620 +       error = btrfs_may_create(dir, dentry);
8621         if (error)
8622                 goto out_drop_write;
8623  
8624 -       /*
8625 -        * Actually perform the low-level subvolume creation after all
8626 -        * this VFS fuzz.
8627 -        *
8628 -        * Eventually we want to pass in an inode under which we create this
8629 -        * subvolume, but for now all are under the filesystem root.
8630 -        *
8631 -        * Also we should pass on the mode eventually to allow creating new
8632 -        * subvolume with specific mode bits.
8633 -        */
8634 +       down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
8635 +
8636 +       if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
8637 +               goto out_up_read;
8638 +
8639         if (snap_src) {
8640 -               struct dentry *dir = dentry->d_parent;
8641 -               struct dentry *test = dir->d_parent;
8642 -               struct btrfs_path *path = btrfs_alloc_path();
8643 -               int ret;
8644 -               u64 test_oid;
8645 -               u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
8646 -
8647 -               test_oid = snap_src->root_key.objectid;
8648 -
8649 -               ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
8650 -                                         path, parent_oid, test_oid);
8651 -               if (ret == 0)
8652 -                       goto create;
8653 -               btrfs_release_path(snap_src->fs_info->tree_root, path);
8654 -
8655 -               /* we need to make sure we aren't creating a directory loop
8656 -                * by taking a snapshot of something that has our current
8657 -                * subvol in its directory tree.  So, this loops through
8658 -                * the dentries and checks the forward refs for each subvolume
8659 -                * to see if is references the subvolume where we are
8660 -                * placing this new snapshot.
8661 -                */
8662 -               while (1) {
8663 -                       if (!test ||
8664 -                           dir == snap_src->fs_info->sb->s_root ||
8665 -                           test == snap_src->fs_info->sb->s_root ||
8666 -                           test->d_inode->i_sb != snap_src->fs_info->sb) {
8667 -                               break;
8668 -                       }
8669 -                       if (S_ISLNK(test->d_inode->i_mode)) {
8670 -                               printk(KERN_INFO "Btrfs symlink in snapshot "
8671 -                                      "path, failed\n");
8672 -                               error = -EMLINK;
8673 -                               btrfs_free_path(path);
8674 -                               goto out_drop_write;
8675 -                       }
8676 -                       test_oid =
8677 -                               BTRFS_I(test->d_inode)->root->root_key.objectid;
8678 -                       ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
8679 -                                 path, test_oid, parent_oid);
8680 -                       if (ret == 0) {
8681 -                               printk(KERN_INFO "Btrfs snapshot creation "
8682 -                                      "failed, looping\n");
8683 -                               error = -EMLINK;
8684 -                               btrfs_free_path(path);
8685 -                               goto out_drop_write;
8686 -                       }
8687 -                       btrfs_release_path(snap_src->fs_info->tree_root, path);
8688 -                       test = test->d_parent;
8689 -               }
8690 -create:
8691 -               btrfs_free_path(path);
8692 -               error = create_snapshot(snap_src, dentry, name, namelen);
8693 +               error = create_snapshot(snap_src, dentry,
8694 +                                       name, namelen);
8695         } else {
8696 -               error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
8697 -                                     dentry, name, namelen);
8698 +               error = create_subvol(BTRFS_I(dir)->root, dentry,
8699 +                                     name, namelen);
8700         }
8701 -       if (error)
8702 -               goto out_drop_write;
8703 -
8704 -       fsnotify_mkdir(parent->dentry->d_inode, dentry);
8705 +       if (!error)
8706 +               fsnotify_mkdir(dir, dentry);
8707 +out_up_read:
8708 +       up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
8709  out_drop_write:
8710         mnt_drop_write(parent->mnt);
8711  out_dput:
8712         dput(dentry);
8713  out_unlock:
8714 -       mutex_unlock(&parent->dentry->d_inode->i_mutex);
8715 +       mutex_unlock(&dir->i_mutex);
8716         return error;
8717  }
8718  
8719 -
8720  static int btrfs_defrag_file(struct file *file)
8721  {
8722         struct inode *inode = fdentry(file)->d_inode;
8723 @@ -596,9 +534,8 @@ again:
8724                 clear_page_dirty_for_io(page);
8725  
8726                 btrfs_set_extent_delalloc(inode, page_start, page_end);
8727 -
8728 -               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8729                 set_page_dirty(page);
8730 +               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
8731                 unlock_page(page);
8732                 page_cache_release(page);
8733                 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
8734 @@ -609,7 +546,8 @@ out_unlock:
8735         return 0;
8736  }
8737  
8738 -static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
8739 +static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
8740 +                                       void __user *arg)
8741  {
8742         u64 new_size;
8743         u64 old_size;
8744 @@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
8745  {
8746         struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
8747         struct btrfs_ioctl_vol_args *vol_args;
8748 -       struct btrfs_dir_item *di;
8749 -       struct btrfs_path *path;
8750         struct file *src_file;
8751 -       u64 root_dirid;
8752         int namelen;
8753         int ret = 0;
8754  
8755 @@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
8756                 goto out;
8757         }
8758  
8759 -       path = btrfs_alloc_path();
8760 -       if (!path) {
8761 -               ret = -ENOMEM;
8762 -               goto out;
8763 -       }
8764 -
8765 -       root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
8766 -       di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
8767 -                           path, root_dirid,
8768 -                           vol_args->name, namelen, 0);
8769 -       btrfs_free_path(path);
8770 -
8771 -       if (di && !IS_ERR(di)) {
8772 -               ret = -EEXIST;
8773 -               goto out;
8774 -       }
8775 -
8776 -       if (IS_ERR(di)) {
8777 -               ret = PTR_ERR(di);
8778 -               goto out;
8779 -       }
8780 -
8781         if (subvol) {
8782 -               ret = btrfs_mksubvol(&file->f_path, vol_args->name,
8783 -                                    file->f_path.dentry->d_inode->i_mode,
8784 -                                    namelen, NULL);
8785 +               ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
8786 +                                    NULL);
8787         } else {
8788                 struct inode *src_inode;
8789                 src_file = fget(vol_args->fd);
8790 @@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
8791                         fput(src_file);
8792                         goto out;
8793                 }
8794 -               ret = btrfs_mksubvol(&file->f_path, vol_args->name,
8795 -                            file->f_path.dentry->d_inode->i_mode,
8796 -                            namelen, BTRFS_I(src_inode)->root);
8797 +               ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
8798 +                                    BTRFS_I(src_inode)->root);
8799                 fput(src_file);
8800         }
8801 -
8802  out:
8803         kfree(vol_args);
8804         return ret;
8805  }
8806  
8807 +/*
8808 + * helper to check if the subvolume references other subvolumes
8809 + */
8810 +static noinline int may_destroy_subvol(struct btrfs_root *root)
8811 +{
8812 +       struct btrfs_path *path;
8813 +       struct btrfs_key key;
8814 +       int ret;
8815 +
8816 +       path = btrfs_alloc_path();
8817 +       if (!path)
8818 +               return -ENOMEM;
8819 +
8820 +       key.objectid = root->root_key.objectid;
8821 +       key.type = BTRFS_ROOT_REF_KEY;
8822 +       key.offset = (u64)-1;
8823 +
8824 +       ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
8825 +                               &key, path, 0, 0);
8826 +       if (ret < 0)
8827 +               goto out;
8828 +       BUG_ON(ret == 0);
8829 +
8830 +       ret = 0;
8831 +       if (path->slots[0] > 0) {
8832 +               path->slots[0]--;
8833 +               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
8834 +               if (key.objectid == root->root_key.objectid &&
8835 +                   key.type == BTRFS_ROOT_REF_KEY)
8836 +                       ret = -ENOTEMPTY;
8837 +       }
8838 +out:
8839 +       btrfs_free_path(path);
8840 +       return ret;
8841 +}
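/*
 * Search idiom in may_destroy_subvol(): the key (objectid, ROOT_REF_KEY,
 * (u64)-1) is never an actual item, so btrfs_search_slot() must return
 * greater than zero (hence the BUG_ON(ret == 0)) and leaves the path
 * positioned just past where such a key would sort.  Stepping back one
 * slot and re-reading the key then shows whether any ROOT_REF item for
 * this root exists at all; if one does, the subvolume still references
 * child subvolumes and cannot be destroyed (-ENOTEMPTY).
 */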
8842 +
8843 +static noinline int btrfs_ioctl_snap_destroy(struct file *file,
8844 +                                            void __user *arg)
8845 +{
8846 +       struct dentry *parent = fdentry(file);
8847 +       struct dentry *dentry;
8848 +       struct inode *dir = parent->d_inode;
8849 +       struct inode *inode;
8850 +       struct btrfs_root *root = BTRFS_I(dir)->root;
8851 +       struct btrfs_root *dest = NULL;
8852 +       struct btrfs_ioctl_vol_args *vol_args;
8853 +       struct btrfs_trans_handle *trans;
8854 +       int namelen;
8855 +       int ret;
8856 +       int err = 0;
8857 +
8858 +       if (!capable(CAP_SYS_ADMIN))
8859 +               return -EPERM;
8860 +
8861 +       vol_args = memdup_user(arg, sizeof(*vol_args));
8862 +       if (IS_ERR(vol_args))
8863 +               return PTR_ERR(vol_args);
8864 +
8865 +       vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
8866 +       namelen = strlen(vol_args->name);
8867 +       if (strchr(vol_args->name, '/') ||
8868 +           strncmp(vol_args->name, "..", namelen) == 0) {
8869 +               err = -EINVAL;
8870 +               goto out;
8871 +       }
8872 +
8873 +       err = mnt_want_write(file->f_path.mnt);
8874 +       if (err)
8875 +               goto out;
8876 +
8877 +       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
8878 +       dentry = lookup_one_len(vol_args->name, parent, namelen);
8879 +       if (IS_ERR(dentry)) {
8880 +               err = PTR_ERR(dentry);
8881 +               goto out_unlock_dir;
8882 +       }
8883 +
8884 +       if (!dentry->d_inode) {
8885 +               err = -ENOENT;
8886 +               goto out_dput;
8887 +       }
8888 +
8889 +       inode = dentry->d_inode;
8890 +       if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
8891 +               err = -EINVAL;
8892 +               goto out_dput;
8893 +       }
8894 +
8895 +       dest = BTRFS_I(inode)->root;
8896 +
8897 +       mutex_lock(&inode->i_mutex);
8898 +       err = d_invalidate(dentry);
8899 +       if (err)
8900 +               goto out_unlock;
8901 +
8902 +       down_write(&root->fs_info->subvol_sem);
8903 +
8904 +       err = may_destroy_subvol(dest);
8905 +       if (err)
8906 +               goto out_up_write;
8907 +
8908 +       trans = btrfs_start_transaction(root, 1);
8909 +       ret = btrfs_unlink_subvol(trans, root, dir,
8910 +                               dest->root_key.objectid,
8911 +                               dentry->d_name.name,
8912 +                               dentry->d_name.len);
8913 +       BUG_ON(ret);
8914 +
8915 +       btrfs_record_root_in_trans(trans, dest);
8916 +
8917 +       memset(&dest->root_item.drop_progress, 0,
8918 +               sizeof(dest->root_item.drop_progress));
8919 +       dest->root_item.drop_level = 0;
8920 +       btrfs_set_root_refs(&dest->root_item, 0);
8921 +
8922 +       ret = btrfs_insert_orphan_item(trans,
8923 +                               root->fs_info->tree_root,
8924 +                               dest->root_key.objectid);
8925 +       BUG_ON(ret);
8926 +
8927 +       ret = btrfs_commit_transaction(trans, root);
8928 +       BUG_ON(ret);
8929 +       inode->i_flags |= S_DEAD;
8930 +out_up_write:
8931 +       up_write(&root->fs_info->subvol_sem);
8932 +out_unlock:
8933 +       mutex_unlock(&inode->i_mutex);
8934 +       if (!err) {
8935 +               shrink_dcache_sb(root->fs_info->sb);
8936 +               btrfs_invalidate_inodes(dest);
8937 +               d_delete(dentry);
8938 +       }
8939 +out_dput:
8940 +       dput(dentry);
8941 +out_unlock_dir:
8942 +       mutex_unlock(&dir->i_mutex);
8943 +       mnt_drop_write(file->f_path.mnt);
8944 +out:
8945 +       kfree(vol_args);
8946 +       return err;
8947 +}
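/*
 * Deletion itself is deferred: snap_destroy only unlinks the directory
 * entry, zeroes drop_progress, sets root_refs to 0 and records an orphan
 * item for the root in the tree root, all inside one committed
 * transaction.  Actually freeing the tree is left to later orphan
 * cleanup (not shown in this hunk), which can presumably resume from
 * drop_progress after a crash.  S_DEAD plus the dcache shrink and inode
 * invalidation below make the old name unusable immediately.
 */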
8948 +
8949  static int btrfs_ioctl_defrag(struct file *file)
8950  {
8951         struct inode *inode = fdentry(file)->d_inode;
8952 @@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
8953         return ret;
8954  }
8955  
8956 -static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8957 -               u64 off, u64 olen, u64 destoff)
8958 +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8959 +                                      u64 off, u64 olen, u64 destoff)
8960  {
8961         struct inode *inode = fdentry(file)->d_inode;
8962         struct btrfs_root *root = BTRFS_I(inode)->root;
8963 @@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8964  
8965         /* punch hole in destination first */
8966         btrfs_drop_extents(trans, root, inode, off, off + len,
8967 -                          off + len, 0, &hint_byte);
8968 +                          off + len, 0, &hint_byte, 1);
8969  
8970         /* clone data */
8971         key.objectid = src->i_ino;
8972 @@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
8973                                         datao += off - key.offset;
8974                                         datal -= off - key.offset;
8975                                 }
8976 -                               if (key.offset + datao + datal + key.offset >
8977 -                                   off + len)
8978 -                                       datal = off + len - key.offset - datao;
8979 +
8980 +                               if (key.offset + datal > off + len)
8981 +                                       datal = off + len - key.offset;
8982 +
8983                                 /* disko == 0 means it's a hole */
8984                                 if (!disko)
8985                                         datao = 0;
8986 @@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
8987         struct inode *inode = fdentry(file)->d_inode;
8988         struct btrfs_root *root = BTRFS_I(inode)->root;
8989         struct btrfs_trans_handle *trans;
8990 -       int ret = 0;
8991 +       int ret;
8992  
8993 +       ret = -EPERM;
8994         if (!capable(CAP_SYS_ADMIN))
8995 -               return -EPERM;
8996 +               goto out;
8997  
8998 -       if (file->private_data) {
8999 -               ret = -EINPROGRESS;
9000 +       ret = -EINPROGRESS;
9001 +       if (file->private_data)
9002                 goto out;
9003 -       }
9004  
9005         ret = mnt_want_write(file->f_path.mnt);
9006         if (ret)
9007 @@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
9008         root->fs_info->open_ioctl_trans++;
9009         mutex_unlock(&root->fs_info->trans_mutex);
9010  
9011 +       ret = -ENOMEM;
9012         trans = btrfs_start_ioctl_transaction(root, 0);
9013 -       if (trans)
9014 -               file->private_data = trans;
9015 -       else
9016 -               ret = -ENOMEM;
9017 -       /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
9018 +       if (!trans)
9019 +               goto out_drop;
9020 +
9021 +       file->private_data = trans;
9022 +       return 0;
9023 +
9024 +out_drop:
9025 +       mutex_lock(&root->fs_info->trans_mutex);
9026 +       root->fs_info->open_ioctl_trans--;
9027 +       mutex_unlock(&root->fs_info->trans_mutex);
9028 +       mnt_drop_write(file->f_path.mnt);
9029  out:
9030         return ret;
9031  }
9032 @@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
9033         struct inode *inode = fdentry(file)->d_inode;
9034         struct btrfs_root *root = BTRFS_I(inode)->root;
9035         struct btrfs_trans_handle *trans;
9036 -       int ret = 0;
9037  
9038         trans = file->private_data;
9039 -       if (!trans) {
9040 -               ret = -EINVAL;
9041 -               goto out;
9042 -       }
9043 -       btrfs_end_transaction(trans, root);
9044 +       if (!trans)
9045 +               return -EINVAL;
9046         file->private_data = NULL;
9047  
9048 +       btrfs_end_transaction(trans, root);
9049 +
9050         mutex_lock(&root->fs_info->trans_mutex);
9051         root->fs_info->open_ioctl_trans--;
9052         mutex_unlock(&root->fs_info->trans_mutex);
9053  
9054         mnt_drop_write(file->f_path.mnt);
9055 -
9056 -out:
9057 -       return ret;
9058 +       return 0;
9059  }
9060  
9061  long btrfs_ioctl(struct file *file, unsigned int
9062 @@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int
9063                 return btrfs_ioctl_snap_create(file, argp, 0);
9064         case BTRFS_IOC_SUBVOL_CREATE:
9065                 return btrfs_ioctl_snap_create(file, argp, 1);
9066 +       case BTRFS_IOC_SNAP_DESTROY:
9067 +               return btrfs_ioctl_snap_destroy(file, argp);
9068         case BTRFS_IOC_DEFRAG:
9069                 return btrfs_ioctl_defrag(file);
9070         case BTRFS_IOC_RESIZE:
9071 diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
9072 index b320b10..bc49914 100644
9073 --- a/fs/btrfs/ioctl.h
9074 +++ b/fs/btrfs/ioctl.h
9075 @@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
9076  
9077  #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
9078                                    struct btrfs_ioctl_vol_args)
9079 -
9080 +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
9081 +                               struct btrfs_ioctl_vol_args)
9082  #endif
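
The new ioctl is issued on the directory that contains the snapshot, with the vol_args name naming the child subvolume to delete. A minimal userspace sketch, assuming the 0x94 BTRFS_IOCTL_MAGIC and the vol_args layout from fs/btrfs/ioctl.h at this point in the series (verify against your installed headers):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>

	#define BTRFS_IOCTL_MAGIC 0x94		/* assumed, from ioctl.h */
	#define BTRFS_PATH_NAME_MAX 4087
	struct btrfs_ioctl_vol_args {
		__s64 fd;				/* unused by snap_destroy */
		char name[BTRFS_PATH_NAME_MAX + 1];	/* child subvolume name */
	};
	#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
					    struct btrfs_ioctl_vol_args)

	/* delete the snapshot <parent_dir>/<snap_name> */
	static int snap_destroy(const char *parent_dir, const char *snap_name)
	{
		struct btrfs_ioctl_vol_args args;
		int ret, fd = open(parent_dir, O_RDONLY);

		if (fd < 0)
			return -1;
		memset(&args, 0, sizeof(args));
		strncpy(args.name, snap_name, BTRFS_PATH_NAME_MAX);
		ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args);
		if (ret < 0)
			perror("BTRFS_IOC_SNAP_DESTROY");
		close(fd);
		return ret;
	}
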
9083 diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
9084 index d6f0806..ab21c29 100644
9085 --- a/fs/btrfs/ordered-data.c
9086 +++ b/fs/btrfs/ordered-data.c
9087 @@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
9088   *
9089   * len is the length of the extent
9090   *
9091 - * This also sets the EXTENT_ORDERED bit on the range in the inode.
9092 - *
9093   * The tree is given a single reference on the ordered extent that was
9094   * inserted.
9095   */
9096 @@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
9097         entry->start = start;
9098         entry->len = len;
9099         entry->disk_len = disk_len;
9100 +       entry->bytes_left = len;
9101         entry->inode = inode;
9102         if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
9103                 set_bit(type, &entry->flags);
9104 @@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
9105                            &entry->rb_node);
9106         BUG_ON(node);
9107  
9108 -       set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
9109 -                          entry_end(entry) - 1, GFP_NOFS);
9110 -
9111         spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
9112         list_add_tail(&entry->root_extent_list,
9113                       &BTRFS_I(inode)->root->fs_info->ordered_extents);
9114 @@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
9115         struct btrfs_ordered_inode_tree *tree;
9116         struct rb_node *node;
9117         struct btrfs_ordered_extent *entry;
9118 -       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9119         int ret;
9120  
9121         tree = &BTRFS_I(inode)->ordered_tree;
9122         mutex_lock(&tree->mutex);
9123 -       clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
9124 -                            GFP_NOFS);
9125         node = tree_search(tree, file_offset);
9126         if (!node) {
9127                 ret = 1;
9128 @@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
9129                 goto out;
9130         }
9131  
9132 -       ret = test_range_bit(io_tree, entry->file_offset,
9133 -                            entry->file_offset + entry->len - 1,
9134 -                            EXTENT_ORDERED, 0);
9135 -       if (ret == 0)
9136 +       if (io_size > entry->bytes_left) {
9137 +               printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
9138 +                      (unsigned long long)entry->bytes_left,
9139 +                      (unsigned long long)io_size);
9140 +       }
9141 +       entry->bytes_left -= io_size;
9142 +       if (entry->bytes_left == 0)
9143                 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
9144 +       else
9145 +               ret = 1;
9146  out:
9147         mutex_unlock(&tree->mutex);
9148         return ret == 0;
9149 @@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
9150         tree->last = NULL;
9151         set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
9152  
9153 +       spin_lock(&BTRFS_I(inode)->accounting_lock);
9154 +       BTRFS_I(inode)->outstanding_extents--;
9155 +       spin_unlock(&BTRFS_I(inode)->accounting_lock);
9156 +       btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
9157 +                                             inode, 1);
9158 +
9159         spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
9160         list_del_init(&entry->root_extent_list);
9161  
9162 @@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
9163         u64 orig_end;
9164         u64 wait_end;
9165         struct btrfs_ordered_extent *ordered;
9166 +       int found;
9167  
9168         if (start + len < start) {
9169                 orig_end = INT_LIMIT(loff_t);
9170 @@ -502,6 +507,7 @@ again:
9171                                            orig_end >> PAGE_CACHE_SHIFT);
9172  
9173         end = orig_end;
9174 +       found = 0;
9175         while (1) {
9176                 ordered = btrfs_lookup_first_ordered_extent(inode, end);
9177                 if (!ordered)
9178 @@ -514,6 +520,7 @@ again:
9179                         btrfs_put_ordered_extent(ordered);
9180                         break;
9181                 }
9182 +               found++;
9183                 btrfs_start_ordered_extent(inode, ordered, 1);
9184                 end = ordered->file_offset;
9185                 btrfs_put_ordered_extent(ordered);
9186 @@ -521,8 +528,8 @@ again:
9187                         break;
9188                 end--;
9189         }
9190 -       if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
9191 -                          EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
9192 +       if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
9193 +                          EXTENT_DELALLOC, 0, NULL)) {
9194                 schedule_timeout(1);
9195                 goto again;
9196         }
9197 @@ -613,7 +620,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
9198          */
9199         if (test_range_bit(io_tree, disk_i_size,
9200                            ordered->file_offset + ordered->len - 1,
9201 -                          EXTENT_DELALLOC, 0)) {
9202 +                          EXTENT_DELALLOC, 0, NULL)) {
9203                 goto out;
9204         }
9205         /*
9206 @@ -664,7 +671,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
9207          */
9208         if (i_size_test > entry_end(ordered) &&
9209             !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
9210 -                          EXTENT_DELALLOC, 0)) {
9211 +                          EXTENT_DELALLOC, 0, NULL)) {
9212                 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
9213         }
9214         BTRFS_I(inode)->disk_i_size = new_i_size;
9215 diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
9216 index 3d31c88..993a7ea 100644
9217 --- a/fs/btrfs/ordered-data.h
9218 +++ b/fs/btrfs/ordered-data.h
9219 @@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
9220         /* extent length on disk */
9221         u64 disk_len;
9222  
9223 +       /* number of bytes that still need writing */
9224 +       u64 bytes_left;
9225 +
9226         /* flags (described above) */
9227         unsigned long flags;
9228  
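
With the EXTENT_ORDERED bit gone, completion tracking is a plain countdown: every finished chunk of IO decrements bytes_left, and the ordered extent is done exactly when it hits zero. A small userspace model of the new btrfs_dec_test_ordered_pending() flow (names illustrative, locking omitted):

	#include <stdio.h>

	struct ordered_extent {
		unsigned long long bytes_left;	/* bytes not yet written */
		int io_done;			/* models BTRFS_ORDERED_IO_DONE */
	};

	/* returns 1 exactly once: when the last byte of the extent completes */
	static int dec_test_ordered_pending(struct ordered_extent *entry,
					    unsigned long long io_size)
	{
		/* like the kernel, only warn: correct callers never overshoot */
		if (io_size > entry->bytes_left)
			fprintf(stderr, "bad ordered accounting left %llu size %llu\n",
				entry->bytes_left, io_size);
		entry->bytes_left -= io_size;
		if (entry->bytes_left)
			return 0;
		if (entry->io_done)		/* kernel: test_and_set_bit() */
			return 0;
		entry->io_done = 1;
		return 1;
	}

	int main(void)
	{
		struct ordered_extent e = { 8192, 0 };

		printf("%d\n", dec_test_ordered_pending(&e, 4096));	/* 0 */
		printf("%d\n", dec_test_ordered_pending(&e, 4096));	/* 1 */
		return 0;
	}
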
9229 diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
9230 index 3c0d52a..79cba5f 100644
9231 --- a/fs/btrfs/orphan.c
9232 +++ b/fs/btrfs/orphan.c
9233 @@ -65,3 +65,23 @@ out:
9234         btrfs_free_path(path);
9235         return ret;
9236  }
9237 +
9238 +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
9239 +{
9240 +       struct btrfs_path *path;
9241 +       struct btrfs_key key;
9242 +       int ret;
9243 +
9244 +       key.objectid = BTRFS_ORPHAN_OBJECTID;
9245 +       key.type = BTRFS_ORPHAN_ITEM_KEY;
9246 +       key.offset = offset;
9247 +
9248 +       path = btrfs_alloc_path();
9249 +       if (!path)
9250 +               return -ENOMEM;
9251 +
9252 +       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9253 +
9254 +       btrfs_free_path(path);
9255 +       return ret;
9256 +}
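
A read-only btrfs_search_slot() (NULL transaction handle, cow == 0) returns 0 on an exact match, a positive value when the key is absent, and a negative errno on failure, and the new helper passes that result straight through. A hypothetical caller therefore triages three cases (resume_root_deletion() is illustrative, not part of this patch):

	ret = btrfs_find_orphan_item(tree_root, root_objectid);
	if (ret < 0)
		return ret;			/* the lookup itself failed */
	if (ret == 0)
		resume_root_deletion(root);	/* orphan item found */
	/* ret > 0: no orphan item, nothing was interrupted */
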
9257 diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
9258 index c04f7f2..cfcc93c 100644
9259 --- a/fs/btrfs/relocation.c
9260 +++ b/fs/btrfs/relocation.c
9261 @@ -121,6 +121,15 @@ struct inodevec {
9262         int nr;
9263  };
9264  
9265 +#define MAX_EXTENTS 128
9266 +
9267 +struct file_extent_cluster {
9268 +       u64 start;
9269 +       u64 end;
9270 +       u64 boundary[MAX_EXTENTS];
9271 +       unsigned int nr;
9272 +};
9273 +
9274  struct reloc_control {
9275         /* block group to relocate */
9276         struct btrfs_block_group_cache *block_group;
9277 @@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
9278                                 struct reloc_control *rc)
9279  {
9280         if (test_range_bit(&rc->processed_blocks, bytenr,
9281 -                          bytenr + blocksize - 1, EXTENT_DIRTY, 1))
9282 +                          bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
9283                 return 1;
9284         return 0;
9285  }
9286 @@ -2529,56 +2538,94 @@ out:
9287  }
9288  
9289  static noinline_for_stack
9290 -int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
9291 +int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
9292 +                        u64 block_start)
9293 +{
9294 +       struct btrfs_root *root = BTRFS_I(inode)->root;
9295 +       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
9296 +       struct extent_map *em;
9297 +       int ret = 0;
9298 +
9299 +       em = alloc_extent_map(GFP_NOFS);
9300 +       if (!em)
9301 +               return -ENOMEM;
9302 +
9303 +       em->start = start;
9304 +       em->len = end + 1 - start;
9305 +       em->block_len = em->len;
9306 +       em->block_start = block_start;
9307 +       em->bdev = root->fs_info->fs_devices->latest_bdev;
9308 +       set_bit(EXTENT_FLAG_PINNED, &em->flags);
9309 +
9310 +       lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9311 +       while (1) {
9312 +               write_lock(&em_tree->lock);
9313 +               ret = add_extent_mapping(em_tree, em);
9314 +               write_unlock(&em_tree->lock);
9315 +               if (ret != -EEXIST) {
9316 +                       free_extent_map(em);
9317 +                       break;
9318 +               }
9319 +               btrfs_drop_extent_cache(inode, start, end, 0);
9320 +       }
9321 +       unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9322 +       return ret;
9323 +}
9324 +
9325 +static int relocate_file_extent_cluster(struct inode *inode,
9326 +                                       struct file_extent_cluster *cluster)
9327  {
9328         u64 page_start;
9329         u64 page_end;
9330 -       unsigned long i;
9331 -       unsigned long first_index;
9332 +       u64 offset = BTRFS_I(inode)->index_cnt;
9333 +       unsigned long index;
9334         unsigned long last_index;
9335 -       unsigned int total_read = 0;
9336 -       unsigned int total_dirty = 0;
9337 +       unsigned int dirty_page = 0;
9338         struct page *page;
9339         struct file_ra_state *ra;
9340 -       struct btrfs_ordered_extent *ordered;
9341 -       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9342 +       int nr = 0;
9343         int ret = 0;
9344  
9345 +       if (!cluster->nr)
9346 +               return 0;
9347 +
9348         ra = kzalloc(sizeof(*ra), GFP_NOFS);
9349         if (!ra)
9350                 return -ENOMEM;
9351  
9352 +       index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
9353 +       last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
9354 +
9355         mutex_lock(&inode->i_mutex);
9356 -       first_index = start >> PAGE_CACHE_SHIFT;
9357 -       last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
9358  
9359 -       /* make sure the dirty trick played by the caller work */
9360 -       while (1) {
9361 -               ret = invalidate_inode_pages2_range(inode->i_mapping,
9362 -                                                   first_index, last_index);
9363 -               if (ret != -EBUSY)
9364 -                       break;
9365 -               schedule_timeout(HZ/10);
9366 -       }
9367 +       i_size_write(inode, cluster->end + 1 - offset);
9368 +       ret = setup_extent_mapping(inode, cluster->start - offset,
9369 +                                  cluster->end - offset, cluster->start);
9370         if (ret)
9371                 goto out_unlock;
9372  
9373         file_ra_state_init(ra, inode->i_mapping);
9374  
9375 -       for (i = first_index ; i <= last_index; i++) {
9376 -               if (total_read % ra->ra_pages == 0) {
9377 -                       btrfs_force_ra(inode->i_mapping, ra, NULL, i,
9378 -                               min(last_index, ra->ra_pages + i - 1));
9379 -               }
9380 -               total_read++;
9381 -again:
9382 -               if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
9383 -                       BUG_ON(1);
9384 -               page = grab_cache_page(inode->i_mapping, i);
9385 +       WARN_ON(cluster->start != cluster->boundary[0]);
9386 +       while (index <= last_index) {
9387 +               page = find_lock_page(inode->i_mapping, index);
9388                 if (!page) {
9389 -                       ret = -ENOMEM;
9390 -                       goto out_unlock;
9391 +                       page_cache_sync_readahead(inode->i_mapping,
9392 +                                                 ra, NULL, index,
9393 +                                                 last_index + 1 - index);
9394 +                       page = grab_cache_page(inode->i_mapping, index);
9395 +                       if (!page) {
9396 +                               ret = -ENOMEM;
9397 +                               goto out_unlock;
9398 +                       }
9399 +               }
9400 +
9401 +               if (PageReadahead(page)) {
9402 +                       page_cache_async_readahead(inode->i_mapping,
9403 +                                                  ra, NULL, page, index,
9404 +                                                  last_index + 1 - index);
9405                 }
9406 +
9407                 if (!PageUptodate(page)) {
9408                         btrfs_readpage(NULL, page);
9409                         lock_page(page);
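
setup_extent_mapping() pins a single mapping that covers the whole cluster so btrfs_readpage() can service every page from it (the removed code called this "cheating" btrfs_readpage); the insert loop evicts any cached overlap and retries until the pinned mapping goes in, now taking the em_tree rwlock on the write side. The retry shape, modelled with a one-slot stand-in for the map tree:

	#include <errno.h>

	static int slot_used;			/* one-slot stand-in for em_tree */

	static int add_mapping(void)
	{
		if (slot_used)
			return -EEXIST;
		slot_used = 1;
		return 0;
	}

	static void drop_cached_mapping(void)	/* models btrfs_drop_extent_cache() */
	{
		slot_used = 0;
	}

	/* mirrors setup_extent_mapping(): evict whatever overlaps until the
	 * pinned mapping can be inserted */
	static int setup_mapping(void)
	{
		int ret;

		while (1) {
			ret = add_mapping();
			if (ret != -EEXIST)
				break;
			drop_cached_mapping();
		}
		return ret;
	}

	int main(void)
	{
		slot_used = 1;		/* simulate a stale cached mapping */
		return setup_mapping();	/* evicts it, then succeeds: 0 */
	}
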
9410 @@ -2589,75 +2636,79 @@ again:
9411                                 goto out_unlock;
9412                         }
9413                 }
9414 -               wait_on_page_writeback(page);
9415  
9416                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
9417                 page_end = page_start + PAGE_CACHE_SIZE - 1;
9418 -               lock_extent(io_tree, page_start, page_end, GFP_NOFS);
9419 -
9420 -               ordered = btrfs_lookup_ordered_extent(inode, page_start);
9421 -               if (ordered) {
9422 -                       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
9423 -                       unlock_page(page);
9424 -                       page_cache_release(page);
9425 -                       btrfs_start_ordered_extent(inode, ordered, 1);
9426 -                       btrfs_put_ordered_extent(ordered);
9427 -                       goto again;
9428 -               }
9429 +
9430 +               lock_extent(&BTRFS_I(inode)->io_tree,
9431 +                           page_start, page_end, GFP_NOFS);
9432 +
9433                 set_page_extent_mapped(page);
9434  
9435 -               if (i == first_index)
9436 -                       set_extent_bits(io_tree, page_start, page_end,
9437 +               if (nr < cluster->nr &&
9438 +                   page_start + offset == cluster->boundary[nr]) {
9439 +                       set_extent_bits(&BTRFS_I(inode)->io_tree,
9440 +                                       page_start, page_end,
9441                                         EXTENT_BOUNDARY, GFP_NOFS);
9442 +                       nr++;
9443 +               }
9444                 btrfs_set_extent_delalloc(inode, page_start, page_end);
9445  
9446                 set_page_dirty(page);
9447 -               total_dirty++;
9448 +               dirty_page++;
9449  
9450 -               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
9451 +               unlock_extent(&BTRFS_I(inode)->io_tree,
9452 +                             page_start, page_end, GFP_NOFS);
9453                 unlock_page(page);
9454                 page_cache_release(page);
9455 +
9456 +               index++;
9457 +               if (nr < cluster->nr &&
9458 +                   page_end + 1 + offset == cluster->boundary[nr]) {
9459 +                       balance_dirty_pages_ratelimited_nr(inode->i_mapping,
9460 +                                                          dirty_page);
9461 +                       dirty_page = 0;
9462 +               }
9463 +       }
9464 +       if (dirty_page) {
9465 +               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
9466 +                                                  dirty_page);
9467         }
9468 +       WARN_ON(nr != cluster->nr);
9469  out_unlock:
9470         mutex_unlock(&inode->i_mutex);
9471         kfree(ra);
9472 -       balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
9473         return ret;
9474  }
9475  
9476  static noinline_for_stack
9477 -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
9478 +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
9479 +                        struct file_extent_cluster *cluster)
9480  {
9481 -       struct btrfs_root *root = BTRFS_I(inode)->root;
9482 -       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
9483 -       struct extent_map *em;
9484 -       u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
9485 -       u64 end = start + extent_key->offset - 1;
9486 -
9487 -       em = alloc_extent_map(GFP_NOFS);
9488 -       em->start = start;
9489 -       em->len = extent_key->offset;
9490 -       em->block_len = extent_key->offset;
9491 -       em->block_start = extent_key->objectid;
9492 -       em->bdev = root->fs_info->fs_devices->latest_bdev;
9493 -       set_bit(EXTENT_FLAG_PINNED, &em->flags);
9494 +       int ret;
9495  
9496 -       /* setup extent map to cheat btrfs_readpage */
9497 -       lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9498 -       while (1) {
9499 -               int ret;
9500 -               spin_lock(&em_tree->lock);
9501 -               ret = add_extent_mapping(em_tree, em);
9502 -               spin_unlock(&em_tree->lock);
9503 -               if (ret != -EEXIST) {
9504 -                       free_extent_map(em);
9505 -                       break;
9506 -               }
9507 -               btrfs_drop_extent_cache(inode, start, end, 0);
9508 +       if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
9509 +               ret = relocate_file_extent_cluster(inode, cluster);
9510 +               if (ret)
9511 +                       return ret;
9512 +               cluster->nr = 0;
9513         }
9514 -       unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
9515  
9516 -       return relocate_inode_pages(inode, start, extent_key->offset);
9517 +       if (!cluster->nr)
9518 +               cluster->start = extent_key->objectid;
9519 +       else
9520 +               BUG_ON(cluster->nr >= MAX_EXTENTS);
9521 +       cluster->end = extent_key->objectid + extent_key->offset - 1;
9522 +       cluster->boundary[cluster->nr] = extent_key->objectid;
9523 +       cluster->nr++;
9524 +
9525 +       if (cluster->nr >= MAX_EXTENTS) {
9526 +               ret = relocate_file_extent_cluster(inode, cluster);
9527 +               if (ret)
9528 +                       return ret;
9529 +               cluster->nr = 0;
9530 +       }
9531 +       return 0;
9532  }
9533  
9534  #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
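
relocate_data_extent() now batches work instead of relocating one extent at a time: contiguous extents accumulate in a file_extent_cluster, and the cluster is read and re-dirtied in one pass so writeback lands it at the new location, flushing early when a gap shows up or when MAX_EXTENTS boundaries are queued (relocate_block_group() does one final flush at the end). The batching policy as a self-contained model, with flush() standing in for relocate_file_extent_cluster():

	#include <stdio.h>

	#define MAX_EXTENTS 128

	struct cluster {
		unsigned long long start, end;
		unsigned long long boundary[MAX_EXTENTS];
		unsigned int nr;
	};

	static void flush(struct cluster *c)
	{
		if (c->nr)
			printf("relocate [%llu, %llu], %u extents\n",
			       c->start, c->end, c->nr);
		c->nr = 0;
	}

	/* mirrors the flow of relocate_data_extent() */
	static void add_extent(struct cluster *c, unsigned long long objectid,
			       unsigned long long len)
	{
		/* not contiguous with the cluster: flush what we have */
		if (c->nr > 0 && objectid != c->end + 1)
			flush(c);

		if (!c->nr)
			c->start = objectid;
		c->end = objectid + len - 1;
		c->boundary[c->nr++] = objectid;

		if (c->nr >= MAX_EXTENTS)
			flush(c);
	}

	int main(void)
	{
		struct cluster c = { .nr = 0 };

		add_extent(&c, 0, 4096);
		add_extent(&c, 4096, 8192);	/* contiguous: same cluster */
		add_extent(&c, 65536, 4096);	/* gap: flushes the first two */
		flush(&c);			/* final flush, as in relocate_block_group() */
		return 0;
	}
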
9535 @@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
9536         return 0;
9537  }
9538  
9539 +
9540  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9541  {
9542         struct rb_root blocks = RB_ROOT;
9543         struct btrfs_key key;
9544 +       struct file_extent_cluster *cluster;
9545         struct btrfs_trans_handle *trans = NULL;
9546         struct btrfs_path *path;
9547         struct btrfs_extent_item *ei;
9548 @@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9549         int ret;
9550         int err = 0;
9551  
9552 +       cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
9553 +       if (!cluster)
9554 +               return -ENOMEM;
9555 +
9556         path = btrfs_alloc_path();
9557         if (!path)
9558                 return -ENOMEM;
9559  
9560 +       rc->extents_found = 0;
9561 +       rc->extents_skipped = 0;
9562 +
9563         rc->search_start = rc->block_group->key.objectid;
9564         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
9565                           GFP_NOFS);
9566 @@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9567                 }
9568  
9569                 nr = trans->blocks_used;
9570 -               btrfs_end_transaction_throttle(trans, rc->extent_root);
9571 +               btrfs_end_transaction(trans, rc->extent_root);
9572                 trans = NULL;
9573                 btrfs_btree_balance_dirty(rc->extent_root, nr);
9574  
9575                 if (rc->stage == MOVE_DATA_EXTENTS &&
9576                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
9577                         rc->found_file_extent = 1;
9578 -                       ret = relocate_data_extent(rc->data_inode, &key);
9579 +                       ret = relocate_data_extent(rc->data_inode,
9580 +                                                  &key, cluster);
9581                         if (ret < 0) {
9582                                 err = ret;
9583                                 break;
9584 @@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9585                 btrfs_btree_balance_dirty(rc->extent_root, nr);
9586         }
9587  
9588 +       if (!err) {
9589 +               ret = relocate_file_extent_cluster(rc->data_inode, cluster);
9590 +               if (ret < 0)
9591 +                       err = ret;
9592 +       }
9593 +
9594 +       kfree(cluster);
9595 +
9596         rc->create_reloc_root = 0;
9597         smp_mb();
9598  
9599 @@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
9600  }
9601  
9602  static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
9603 -                                struct btrfs_root *root,
9604 -                                u64 objectid, u64 size)
9605 +                                struct btrfs_root *root, u64 objectid)
9606  {
9607         struct btrfs_path *path;
9608         struct btrfs_inode_item *item;
9609 @@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
9610         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
9611         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
9612         btrfs_set_inode_generation(leaf, item, 1);
9613 -       btrfs_set_inode_size(leaf, item, size);
9614 +       btrfs_set_inode_size(leaf, item, 0);
9615         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
9616         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
9617         btrfs_mark_buffer_dirty(leaf);
9618 @@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
9619         if (err)
9620                 goto out;
9621  
9622 -       err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
9623 -       BUG_ON(err);
9624 -
9625 -       err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
9626 -                                      group->key.offset, 0, group->key.offset,
9627 -                                      0, 0, 0);
9628 +       err = __insert_orphan_inode(trans, root, objectid);
9629         BUG_ON(err);
9630  
9631         key.objectid = objectid;
9632 @@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
9633         BUG_ON(!rc->block_group);
9634  
9635         btrfs_init_workers(&rc->workers, "relocate",
9636 -                          fs_info->thread_pool_size);
9637 +                          fs_info->thread_pool_size, NULL);
9638  
9639         rc->extent_root = extent_root;
9640         btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
9641 @@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
9642         btrfs_wait_ordered_extents(fs_info->tree_root, 0);
9643  
9644         while (1) {
9645 -               mutex_lock(&fs_info->cleaner_mutex);
9646 -               btrfs_clean_old_snapshots(fs_info->tree_root);
9647 -               mutex_unlock(&fs_info->cleaner_mutex);
9648 -
9649                 rc->extents_found = 0;
9650                 rc->extents_skipped = 0;
9651  
9652 +               mutex_lock(&fs_info->cleaner_mutex);
9653 +
9654 +               btrfs_clean_old_snapshots(fs_info->tree_root);
9655                 ret = relocate_block_group(rc);
9656 +
9657 +               mutex_unlock(&fs_info->cleaner_mutex);
9658                 if (ret < 0) {
9659                         err = ret;
9660                         break;
9661 @@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
9662                 }
9663         }
9664  
9665 -       filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
9666 -                                rc->block_group->key.objectid,
9667 -                                rc->block_group->key.objectid +
9668 -                                rc->block_group->key.offset - 1);
9669 +       filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
9670 +                                    rc->block_group->key.objectid,
9671 +                                    rc->block_group->key.objectid +
9672 +                                    rc->block_group->key.offset - 1);
9673  
9674         WARN_ON(rc->block_group->pinned > 0);
9675         WARN_ON(rc->block_group->reserved > 0);
9676 @@ -3530,6 +3594,26 @@ out:
9677         return err;
9678  }
9679  
9680 +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
9681 +{
9682 +       struct btrfs_trans_handle *trans;
9683 +       int ret;
9684 +
9685 +       trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
9686 +
9687 +       memset(&root->root_item.drop_progress, 0,
9688 +               sizeof(root->root_item.drop_progress));
9689 +       root->root_item.drop_level = 0;
9690 +       btrfs_set_root_refs(&root->root_item, 0);
9691 +       ret = btrfs_update_root(trans, root->fs_info->tree_root,
9692 +                               &root->root_key, &root->root_item);
9693 +       BUG_ON(ret);
9694 +
9695 +       ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
9696 +       BUG_ON(ret);
9697 +       return 0;
9698 +}
9699 +
9700  /*
9701   * recover relocation interrupted by system crash.
9702   *
9703 @@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
9704                         fs_root = read_fs_root(root->fs_info,
9705                                                reloc_root->root_key.offset);
9706                         if (IS_ERR(fs_root)) {
9707 -                               err = PTR_ERR(fs_root);
9708 -                               goto out;
9709 +                               ret = PTR_ERR(fs_root);
9710 +                               if (ret != -ENOENT) {
9711 +                                       err = ret;
9712 +                                       goto out;
9713 +                               }
9714 +                               mark_garbage_root(reloc_root);
9715                         }
9716                 }
9717  
9718 @@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
9719         mapping_tree_init(&rc->reloc_root_tree);
9720         INIT_LIST_HEAD(&rc->reloc_roots);
9721         btrfs_init_workers(&rc->workers, "relocate",
9722 -                          root->fs_info->thread_pool_size);
9723 +                          root->fs_info->thread_pool_size, NULL);
9724         rc->extent_root = root->fs_info->extent_root;
9725  
9726         set_reloc_control(rc);
9727 diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
9728 index 0ddc6d6..9351428 100644
9729 --- a/fs/btrfs/root-tree.c
9730 +++ b/fs/btrfs/root-tree.c
9731 @@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
9732                 goto out;
9733  
9734         BUG_ON(ret == 0);
9735 +       if (path->slots[0] == 0) {
9736 +               ret = 1;
9737 +               goto out;
9738 +       }
9739         l = path->nodes[0];
9740 -       BUG_ON(path->slots[0] == 0);
9741         slot = path->slots[0] - 1;
9742         btrfs_item_key_to_cpu(l, &found_key, slot);
9743 -       if (found_key.objectid != objectid) {
9744 +       if (found_key.objectid != objectid ||
9745 +           found_key.type != BTRFS_ROOT_ITEM_KEY) {
9746                 ret = 1;
9747                 goto out;
9748         }
9749 -       read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
9750 -                          sizeof(*item));
9751 -       memcpy(key, &found_key, sizeof(found_key));
9752 +       if (item)
9753 +               read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
9754 +                                  sizeof(*item));
9755 +       if (key)
9756 +               memcpy(key, &found_key, sizeof(found_key));
9757         ret = 0;
9758  out:
9759         btrfs_free_path(path);
9760 @@ -249,6 +255,59 @@ err:
9761         return ret;
9762  }
9763  
9764 +int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
9765 +{
9766 +       struct extent_buffer *leaf;
9767 +       struct btrfs_path *path;
9768 +       struct btrfs_key key;
9769 +       int err = 0;
9770 +       int ret;
9771 +
9772 +       path = btrfs_alloc_path();
9773 +       if (!path)
9774 +               return -ENOMEM;
9775 +
9776 +       key.objectid = BTRFS_ORPHAN_OBJECTID;
9777 +       key.type = BTRFS_ORPHAN_ITEM_KEY;
9778 +       key.offset = 0;
9779 +
9780 +       while (1) {
9781 +               ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
9782 +               if (ret < 0) {
9783 +                       err = ret;
9784 +                       break;
9785 +               }
9786 +
9787 +               leaf = path->nodes[0];
9788 +               if (path->slots[0] >= btrfs_header_nritems(leaf)) {
9789 +                       ret = btrfs_next_leaf(tree_root, path);
9790 +                       if (ret < 0)
9791 +                               err = ret;
9792 +                       if (ret != 0)
9793 +                               break;
9794 +                       leaf = path->nodes[0];
9795 +               }
9796 +
9797 +               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
9798 +               btrfs_release_path(tree_root, path);
9799 +
9800 +               if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
9801 +                   key.type != BTRFS_ORPHAN_ITEM_KEY)
9802 +                       break;
9803 +
9804 +               ret = btrfs_find_dead_roots(tree_root, key.offset);
9805 +               if (ret) {
9806 +                       err = ret;
9807 +                       break;
9808 +               }
9809 +
9810 +               key.offset++;
9811 +       }
9812 +
9813 +       btrfs_free_path(path);
9814 +       return err;
9815 +}
9816 +
9817  /* drop the root item for 'key' from 'root' */
9818  int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
9819                    struct btrfs_key *key)
9820 @@ -278,31 +337,57 @@ out:
9821         return ret;
9822  }
9823  
9824 -#if 0 /* this will get used when snapshot deletion is implemented */
9825  int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
9826                        struct btrfs_root *tree_root,
9827 -                      u64 root_id, u8 type, u64 ref_id)
9828 +                      u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
9829 +                      const char *name, int name_len)
9830 +
9831  {
9832 +       struct btrfs_path *path;
9833 +       struct btrfs_root_ref *ref;
9834 +       struct extent_buffer *leaf;
9835         struct btrfs_key key;
9836 +       unsigned long ptr;
9837 +       int err = 0;
9838         int ret;
9839 -       struct btrfs_path *path;
9840  
9841         path = btrfs_alloc_path();
9842 +       if (!path)
9843 +               return -ENOMEM;
9844  
9845         key.objectid = root_id;
9846 -       key.type = type;
9847 +       key.type = BTRFS_ROOT_BACKREF_KEY;
9848         key.offset = ref_id;
9849 -
9850 +again:
9851         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9852 -       BUG_ON(ret);
9853 -
9854 -       ret = btrfs_del_item(trans, tree_root, path);
9855 -       BUG_ON(ret);
9856 +       BUG_ON(ret < 0);
9857 +       if (ret == 0) {
9858 +               leaf = path->nodes[0];
9859 +               ref = btrfs_item_ptr(leaf, path->slots[0],
9860 +                                    struct btrfs_root_ref);
9861 +
9862 +               WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
9863 +               WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
9864 +               ptr = (unsigned long)(ref + 1);
9865 +               WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
9866 +               *sequence = btrfs_root_ref_sequence(leaf, ref);
9867 +
9868 +               ret = btrfs_del_item(trans, tree_root, path);
9869 +               BUG_ON(ret);
9870 +       } else
9871 +               err = -ENOENT;
9872 +
9873 +       if (key.type == BTRFS_ROOT_BACKREF_KEY) {
9874 +               btrfs_release_path(tree_root, path);
9875 +               key.objectid = ref_id;
9876 +               key.type = BTRFS_ROOT_REF_KEY;
9877 +               key.offset = root_id;
9878 +               goto again;
9879 +       }
9880  
9881         btrfs_free_path(path);
9882 -       return ret;
9883 +       return err;
9884  }
9885 -#endif
9886  
9887  int btrfs_find_root_ref(struct btrfs_root *tree_root,
9888                    struct btrfs_path *path,
9889 @@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
9890         return ret;
9891  }
9892  
9893 -
9894  /*
9895   * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
9896   * or BTRFS_ROOT_BACKREF_KEY.
9897 @@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
9898   */
9899  int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
9900                        struct btrfs_root *tree_root,
9901 -                      u64 root_id, u8 type, u64 ref_id,
9902 -                      u64 dirid, u64 sequence,
9903 +                      u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
9904                        const char *name, int name_len)
9905  {
9906         struct btrfs_key key;
9907 @@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
9908         struct extent_buffer *leaf;
9909         unsigned long ptr;
9910  
9911 -
9912         path = btrfs_alloc_path();
9913 +       if (!path)
9914 +               return -ENOMEM;
9915  
9916         key.objectid = root_id;
9917 -       key.type = type;
9918 +       key.type = BTRFS_ROOT_BACKREF_KEY;
9919         key.offset = ref_id;
9920 -
9921 +again:
9922         ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
9923                                       sizeof(*ref) + name_len);
9924         BUG_ON(ret);
9925 @@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
9926         write_extent_buffer(leaf, name, ptr, name_len);
9927         btrfs_mark_buffer_dirty(leaf);
9928  
9929 +       if (key.type == BTRFS_ROOT_BACKREF_KEY) {
9930 +               btrfs_release_path(tree_root, path);
9931 +               key.objectid = ref_id;
9932 +               key.type = BTRFS_ROOT_REF_KEY;
9933 +               key.offset = root_id;
9934 +               goto again;
9935 +       }
9936 +
9937         btrfs_free_path(path);
9938 -       return ret;
9939 +       return 0;
9940  }
9941 diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
9942 index 6d6d06c..939b68f 100644
9943 --- a/fs/btrfs/super.c
9944 +++ b/fs/btrfs/super.c
9945 @@ -66,7 +66,7 @@ enum {
9946         Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
9947         Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
9948         Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
9949 -       Opt_tag, Opt_notag, Opt_tagid, Opt_err,
9950 +       Opt_tag, Opt_notag, Opt_tagid, Opt_discard, Opt_err,
9951  };
9952  
9953  static match_table_t tokens = {
9954 @@ -88,6 +89,7 @@ static match_table_t tokens = {
9955         {Opt_notreelog, "notreelog"},
9956         {Opt_flushoncommit, "flushoncommit"},
9957         {Opt_ratio, "metadata_ratio=%d"},
9958 +       {Opt_discard, "discard"},
9959         {Opt_tag, "tag"},
9960         {Opt_notag, "notag"},
9961         {Opt_tagid, "tagid=%u"},
9962 @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
9963                                        info->metadata_ratio);
9964                         }
9965                         break;
9966 +               case Opt_discard:
9967 +                       btrfs_set_opt(info->mount_opt, DISCARD);
9968 +                       break;
9969  #ifndef CONFIG_TAGGING_NONE
9970                 case Opt_tag:
9971                         printk(KERN_INFO "btrfs: use tagging\n");
9972 @@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb,
9973         sb->s_export_op = &btrfs_export_ops;
9974         sb->s_xattr = btrfs_xattr_handlers;
9975         sb->s_time_gran = 1;
9976 +#ifdef CONFIG_BTRFS_FS_POSIX_ACL
9977         sb->s_flags |= MS_POSIXACL;
9978 +#endif
9979  
9980         tree_root = open_ctree(sb, fs_devices, (char *)data);
9981  
9982 @@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb)
9983  }
9984  
9985  static struct super_operations btrfs_super_ops = {
9986 +       .drop_inode     = btrfs_drop_inode,
9987         .delete_inode   = btrfs_delete_inode,
9988         .put_super      = btrfs_put_super,
9989         .sync_fs        = btrfs_sync_fs,
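
With the token in place, discard becomes opt-in at mount time:

	mount -t btrfs -o discard /dev/sdX /mnt

Setting the DISCARD bit in mount_opt only matters where the allocator checks it; presumably the extent-tree.c portion of this patch gates its block-device discard calls on btrfs_test_opt(root, DISCARD), so TRIM traffic is generated only when the admin asked for it.
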
9990 diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
9991 index cdbb502..bca82a4 100644
9992 --- a/fs/btrfs/transaction.c
9993 +++ b/fs/btrfs/transaction.c
9994 @@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
9995  {
9996         if (root->ref_cows && root->last_trans < trans->transid) {
9997                 WARN_ON(root == root->fs_info->extent_root);
9998 -               WARN_ON(root->root_item.refs == 0);
9999                 WARN_ON(root->commit_root != root->node);
10000  
10001                 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
10002 @@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
10003         h->alloc_exclude_start = 0;
10004         h->delayed_ref_updates = 0;
10005  
10006 +       if (!current->journal_info)
10007 +               current->journal_info = h;
10008 +
10009         root->fs_info->running_transaction->use_count++;
10010         record_root_in_trans(h, root);
10011         mutex_unlock(&root->fs_info->trans_mutex);
10012 @@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
10013                 wake_up(&cur_trans->writer_wait);
10014         put_transaction(cur_trans);
10015         mutex_unlock(&info->trans_mutex);
10016 +
10017 +       if (current->journal_info == trans)
10018 +               current->journal_info = NULL;
10019         memset(trans, 0, sizeof(*trans));
10020         kmem_cache_free(btrfs_trans_handle_cachep, trans);
10021  
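
Stashing the handle in current->journal_info (a task_struct field btrfs otherwise leaves unused) marks the task as being inside a transaction; __btrfs_end_transaction() above and the commit path below clear it only when it still points at this handle, so with nested handles the outermost one keeps ownership. A sketch of the convention this enables, not code from the patch:

	/* hypothetical helper: is this task already inside a transaction? */
	static inline int btrfs_task_in_transaction(void)
	{
		return current->journal_info != NULL;
	}
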
10022 @@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
10023  /*
10024   * when btree blocks are allocated, they have some corresponding bits set for
10025   * them in one of two extent_io trees.  This is used to make sure all of
10026 - * those extents are on disk for transaction or log commit
10027 + * those extents are sent to disk but does not wait on them
10028   */
10029 -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10030 -                                       struct extent_io_tree *dirty_pages)
10031 +int btrfs_write_marked_extents(struct btrfs_root *root,
10032 +                              struct extent_io_tree *dirty_pages)
10033  {
10034         int ret;
10035         int err = 0;
10036 @@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10037                         page_cache_release(page);
10038                 }
10039         }
10040 +       if (err)
10041 +               werr = err;
10042 +       return werr;
10043 +}
10044 +
10045 +/*
10046 + * when btree blocks are allocated, they have some corresponding bits set for
10047 + * them in one of two extent_io trees.  This is used to make sure all of
10048 + * those extents are on disk for transaction or log commit.  We wait
10049 + * on all the pages and clear them from the dirty pages state tree
10050 + */
10051 +int btrfs_wait_marked_extents(struct btrfs_root *root,
10052 +                             struct extent_io_tree *dirty_pages)
10053 +{
10054 +       int ret;
10055 +       int err = 0;
10056 +       int werr = 0;
10057 +       struct page *page;
10058 +       struct inode *btree_inode = root->fs_info->btree_inode;
10059 +       u64 start = 0;
10060 +       u64 end;
10061 +       unsigned long index;
10062 +
10063         while (1) {
10064                 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
10065                                             EXTENT_DIRTY);
10066 @@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10067         return werr;
10068  }
10069  
10070 +/*
10071 + * when btree blocks are allocated, they have some corresponding bits set for
10072 + * them in one of two extent_io trees.  This is used to make sure all of
10073 + * those extents are on disk for transaction or log commit
10074 + */
10075 +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10076 +                                       struct extent_io_tree *dirty_pages)
10077 +{
10078 +       int ret;
10079 +       int ret2;
10080 +
10081 +       ret = btrfs_write_marked_extents(root, dirty_pages);
10082 +       ret2 = btrfs_wait_marked_extents(root, dirty_pages);
10083 +       return ret || ret2;
10084 +}
10085 +
10086  int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
10087                                      struct btrfs_root *root)
10088  {
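
The split exists so the log commit can overlap writeback with its own bookkeeping: btrfs_sync_log(), changed later in this patch, starts IO with btrfs_write_marked_extents(), updates the log root while that IO is in flight, and calls btrfs_wait_marked_extents() only on the paths that lead to the super block write. Callers with nothing to overlap keep using the combined btrfs_write_and_wait_marked_extents() wrapper added above.
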
10089 @@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
10090         memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
10091  
10092         key.objectid = objectid;
10093 -       key.offset = 0;
10094 +       /* record when the snapshot was created in key.offset */
10095 +       key.offset = trans->transid;
10096         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
10097  
10098         old = btrfs_lock_root_node(root);
10099 @@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
10100         memcpy(&pending->root_key, &key, sizeof(key));
10101  fail:
10102         kfree(new_root_item);
10103 +       btrfs_unreserve_metadata_space(root, 6);
10104         return ret;
10105  }
10106  
10107 @@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
10108         ret = btrfs_update_inode(trans, parent_root, parent_inode);
10109         BUG_ON(ret);
10110  
10111 -       /* add the backref first */
10112         ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
10113                                  pending->root_key.objectid,
10114 -                                BTRFS_ROOT_BACKREF_KEY,
10115                                  parent_root->root_key.objectid,
10116                                  parent_inode->i_ino, index, pending->name,
10117                                  namelen);
10118  
10119         BUG_ON(ret);
10120  
10121 -       /* now add the forward ref */
10122 -       ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
10123 -                                parent_root->root_key.objectid,
10124 -                                BTRFS_ROOT_REF_KEY,
10125 -                                pending->root_key.objectid,
10126 -                                parent_inode->i_ino, index, pending->name,
10127 -                                namelen);
10128 -
10129         inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
10130         d_instantiate(pending->dentry, inode);
10131  fail:
10132 @@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10133         unsigned long timeout = 1;
10134         struct btrfs_transaction *cur_trans;
10135         struct btrfs_transaction *prev_trans = NULL;
10136 -       struct extent_io_tree *pinned_copy;
10137         DEFINE_WAIT(wait);
10138         int ret;
10139         int should_grow = 0;
10140 @@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10141                 return 0;
10142         }
10143  
10144 -       pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
10145 -       if (!pinned_copy)
10146 -               return -ENOMEM;
10147 -
10148 -       extent_io_tree_init(pinned_copy,
10149 -                            root->fs_info->btree_inode->i_mapping, GFP_NOFS);
10150 -
10151         trans->transaction->in_commit = 1;
10152         trans->transaction->blocked = 1;
10153         if (cur_trans->list.prev != &root->fs_info->trans_list) {
10154 @@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10155         ret = commit_cowonly_roots(trans, root);
10156         BUG_ON(ret);
10157  
10158 +       btrfs_prepare_extent_commit(trans, root);
10159 +
10160         cur_trans = root->fs_info->running_transaction;
10161         spin_lock(&root->fs_info->new_trans_lock);
10162         root->fs_info->running_transaction = NULL;
10163 @@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10164         memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
10165                sizeof(root->fs_info->super_copy));
10166  
10167 -       btrfs_copy_pinned(root, pinned_copy);
10168 -
10169         trans->transaction->blocked = 0;
10170  
10171         wake_up(&root->fs_info->transaction_wait);
10172 @@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10173          */
10174         mutex_unlock(&root->fs_info->tree_log_mutex);
10175  
10176 -       btrfs_finish_extent_commit(trans, root, pinned_copy);
10177 -       kfree(pinned_copy);
10178 +       btrfs_finish_extent_commit(trans, root);
10179  
10180         /* do the directory inserts of any pending snapshot creations */
10181         finish_pending_snapshots(trans, root->fs_info);
10182 @@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10183  
10184         mutex_unlock(&root->fs_info->trans_mutex);
10185  
10186 +       if (current->journal_info == trans)
10187 +               current->journal_info = NULL;
10188 +
10189         kmem_cache_free(btrfs_trans_handle_cachep, trans);
10190         return ret;
10191  }
10192 @@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
10193  
10194         while (!list_empty(&list)) {
10195                 root = list_entry(list.next, struct btrfs_root, root_list);
10196 -               list_del_init(&root->root_list);
10197 -               btrfs_drop_snapshot(root, 0);
10198 +               list_del(&root->root_list);
10199 +
10200 +               if (btrfs_header_backref_rev(root->node) <
10201 +                   BTRFS_MIXED_BACKREF_REV)
10202 +                       btrfs_drop_snapshot(root, 0);
10203 +               else
10204 +                       btrfs_drop_snapshot(root, 1);
10205         }
10206         return 0;
10207  }
10208 diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
10209 index 663c674..d4e3e7a 100644
10210 --- a/fs/btrfs/transaction.h
10211 +++ b/fs/btrfs/transaction.h
10212 @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
10213                                               struct inode *inode)
10214  {
10215         BTRFS_I(inode)->last_trans = trans->transaction->transid;
10216 +       BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
10217  }
10218  
10219  int btrfs_end_transaction(struct btrfs_trans_handle *trans,
10220 @@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
10221                                 struct btrfs_root *root);
10222  int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
10223                                         struct extent_io_tree *dirty_pages);
10224 +int btrfs_write_marked_extents(struct btrfs_root *root,
10225 +                                       struct extent_io_tree *dirty_pages);
10226 +int btrfs_wait_marked_extents(struct btrfs_root *root,
10227 +                                       struct extent_io_tree *dirty_pages);
10228  int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
10229  #endif
10230 diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
10231 index d91b0de..f51bf13 100644
10232 --- a/fs/btrfs/tree-log.c
10233 +++ b/fs/btrfs/tree-log.c
10234 @@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
10235  
10236         mutex_lock(&root->log_mutex);
10237         if (root->log_root) {
10238 +               if (!root->log_start_pid) {
10239 +                       root->log_start_pid = current->pid;
10240 +                       root->log_multiple_pids = false;
10241 +               } else if (root->log_start_pid != current->pid) {
10242 +                       root->log_multiple_pids = true;
10243 +               }
10244 +
10245                 root->log_batch++;
10246                 atomic_inc(&root->log_writers);
10247                 mutex_unlock(&root->log_mutex);
10248                 return 0;
10249         }
10250 +       root->log_multiple_pids = false;
10251 +       root->log_start_pid = current->pid;
10252         mutex_lock(&root->fs_info->tree_log_mutex);
10253         if (!root->fs_info->log_root_tree) {
10254                 ret = btrfs_init_log_root_tree(trans, root->fs_info);
10255 @@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log,
10256                               struct walk_control *wc, u64 gen)
10257  {
10258         if (wc->pin)
10259 -               btrfs_update_pinned_extents(log->fs_info->extent_root,
10260 -                                           eb->start, eb->len, 1);
10261 +               btrfs_pin_extent(log->fs_info->extent_root,
10262 +                                eb->start, eb->len, 0);
10263  
10264         if (btrfs_buffer_uptodate(eb, gen)) {
10265                 if (wc->write)
10266 @@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
10267         saved_nbytes = inode_get_bytes(inode);
10268         /* drop any overlapping extents */
10269         ret = btrfs_drop_extents(trans, root, inode,
10270 -                        start, extent_end, extent_end, start, &alloc_hint);
10271 +                        start, extent_end, extent_end, start, &alloc_hint, 1);
10272         BUG_ON(ret);
10273  
10274         if (found_type == BTRFS_FILE_EXTENT_REG ||
10275 @@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10276         int ret;
10277         struct btrfs_root *log = root->log_root;
10278         struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
10279 +       u64 log_transid = 0;
10280  
10281         mutex_lock(&root->log_mutex);
10282         index1 = root->log_transid % 2;
10283 @@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10284  
10285         while (1) {
10286                 unsigned long batch = root->log_batch;
10287 -               mutex_unlock(&root->log_mutex);
10288 -               schedule_timeout_uninterruptible(1);
10289 -               mutex_lock(&root->log_mutex);
10290 -
10291 +               if (root->log_multiple_pids) {
10292 +                       mutex_unlock(&root->log_mutex);
10293 +                       schedule_timeout_uninterruptible(1);
10294 +                       mutex_lock(&root->log_mutex);
10295 +               }
10296                 wait_for_writer(trans, root);
10297                 if (batch == root->log_batch)
10298                         break;
10299 @@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10300                 goto out;
10301         }
10302  
10303 -       ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
10304 +       /* we start IO on all the marked extents here, but we don't actually
10305 +        * wait for them until later.
10306 +        */
10307 +       ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
10308         BUG_ON(ret);
10309  
10310         btrfs_set_root_node(&log->root_item, log->node);
10311  
10312         root->log_batch = 0;
10313 +       log_transid = root->log_transid;
10314         root->log_transid++;
10315         log->log_transid = root->log_transid;
10316 +       root->log_start_pid = 0;
10317         smp_mb();
10318         /*
10319          * log tree has been flushed to disk, new modifications of
10320 @@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10321  
10322         index2 = log_root_tree->log_transid % 2;
10323         if (atomic_read(&log_root_tree->log_commit[index2])) {
10324 +               btrfs_wait_marked_extents(log, &log->dirty_log_pages);
10325                 wait_log_commit(trans, log_root_tree,
10326                                 log_root_tree->log_transid);
10327                 mutex_unlock(&log_root_tree->log_mutex);
10328 @@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10329          * check the full commit flag again
10330          */
10331         if (root->fs_info->last_trans_log_full_commit == trans->transid) {
10332 +               btrfs_wait_marked_extents(log, &log->dirty_log_pages);
10333                 mutex_unlock(&log_root_tree->log_mutex);
10334                 ret = -EAGAIN;
10335                 goto out_wake_log_root;
10336 @@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10337         ret = btrfs_write_and_wait_marked_extents(log_root_tree,
10338                                 &log_root_tree->dirty_log_pages);
10339         BUG_ON(ret);
10340 +       btrfs_wait_marked_extents(log, &log->dirty_log_pages);
10341  
10342         btrfs_set_super_log_root(&root->fs_info->super_for_commit,
10343                                 log_root_tree->node->start);
10344 @@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
10345          * the running transaction open, so a full commit can't hop
10346          * in and cause problems either.
10347          */
10348 -       write_ctree_super(trans, root->fs_info->tree_root, 2);
10349 +       write_ctree_super(trans, root->fs_info->tree_root, 1);
10350         ret = 0;
10351  
10352 +       mutex_lock(&root->log_mutex);
10353 +       if (root->last_log_commit < log_transid)
10354 +               root->last_log_commit = log_transid;
10355 +       mutex_unlock(&root->log_mutex);
10356 +
10357  out_wake_log_root:
10358         atomic_set(&log_root_tree->log_commit[index2], 0);
10359         smp_mb();
10360 @@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
10361                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
10362                         break;
10363  
10364 -               if (parent == sb->s_root)
10365 +               if (IS_ROOT(parent))
10366                         break;
10367  
10368                 parent = parent->d_parent;
10369 @@ -2852,6 +2876,21 @@ out:
10370         return ret;
10371  }
10372  
10373 +static int inode_in_log(struct btrfs_trans_handle *trans,
10374 +                struct inode *inode)
10375 +{
10376 +       struct btrfs_root *root = BTRFS_I(inode)->root;
10377 +       int ret = 0;
10378 +
10379 +       mutex_lock(&root->log_mutex);
10380 +       if (BTRFS_I(inode)->logged_trans == trans->transid &&
10381 +           BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
10382 +               ret = 1;
10383 +       mutex_unlock(&root->log_mutex);
10384 +       return ret;
10385 +}
10386 +
10387 +
10388  /*
10389   * helper function around btrfs_log_inode to make sure newly created
10390   * parent directories also end up in the log.  A minimal inode and backref
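
inode_in_log() is the fsync fast-path test: if the inode was logged in this transaction and nothing has been written to it since the log commit recorded above (last_sub_trans <= last_log_commit), there is nothing new to sync. The inode-side half of that handshake is not in this excerpt; it presumably looks something like the following, with the counter bumped wherever a modification reaches the inode (helper name and placement inferred from the check above, not shown in the patch):

	/* assumed: run whenever a modification reaches the inode */
	static inline void btrfs_set_inode_last_sub_trans(struct inode *inode)
	{
		struct btrfs_root *root = BTRFS_I(inode)->root;

		mutex_lock(&root->log_mutex);
		BTRFS_I(inode)->last_sub_trans = root->log_transid;
		mutex_unlock(&root->log_mutex);
	}
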
10391 @@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
10392                 goto end_no_trans;
10393         }
10394  
10395 +       if (root != BTRFS_I(inode)->root ||
10396 +           btrfs_root_refs(&root->root_item) == 0) {
10397 +               ret = 1;
10398 +               goto end_no_trans;
10399 +       }
10400 +
10401         ret = check_parent_dirs_for_sync(trans, inode, parent,
10402                                          sb, last_committed);
10403         if (ret)
10404                 goto end_no_trans;
10405  
10406 +       if (inode_in_log(trans, inode)) {
10407 +               ret = BTRFS_NO_LOG_SYNC;
10408 +               goto end_no_trans;
10409 +       }
10410 +
10411         start_log_trans(trans, root);
10412  
10413         ret = btrfs_log_inode(trans, root, inode, inode_only);
10414 @@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
10415                         break;
10416  
10417                 inode = parent->d_inode;
10418 +               if (root != BTRFS_I(inode)->root)
10419 +                       break;
10420 +
10421                 if (BTRFS_I(inode)->generation >
10422                     root->fs_info->last_trans_committed) {
10423                         ret = btrfs_log_inode(trans, root, inode, inode_only);
10424                         BUG_ON(ret);
10425                 }
10426 -               if (parent == sb->s_root)
10427 +               if (IS_ROOT(parent))
10428                         break;
10429  
10430                 parent = parent->d_parent;
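
Both IS_ROOT() substitutions in this function matter for subvolumes: sb->s_root is the root dentry of the mount, while a dentry chain that starts inside another subvolume tops out at a dentry that is its own parent without ever being sb->s_root, so the old test could walk past the top. IS_ROOT() is the generic dcache self-parent test:

	/* from include/linux/dcache.h */
	#define IS_ROOT(x) ((x) == (x)->d_parent)

The new root comparisons serve the same purpose from the other side: if the inode or one of its parents belongs to a different root than the one being logged, the walk stops, or the whole call falls back to a full transaction commit (ret = 1) rather than trying to log across a subvolume boundary.
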
10431 @@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
10432         struct btrfs_key tmp_key;
10433         struct btrfs_root *log;
10434         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
10435 -       u64 highest_inode;
10436         struct walk_control wc = {
10437                 .process_func = process_one_buffer,
10438                 .stage = 0,
10439 @@ -3010,11 +3062,6 @@ again:
10440                                                       path);
10441                         BUG_ON(ret);
10442                 }
10443 -               ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
10444 -               if (ret == 0) {
10445 -                       wc.replay_dest->highest_inode = highest_inode;
10446 -                       wc.replay_dest->last_inode_alloc = highest_inode;
10447 -               }
10448  
10449                 key.offset = found_key.offset - 1;
10450                 wc.replay_dest->log_root = NULL;
10451 diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
10452 index d09c760..0776eac 100644
10453 --- a/fs/btrfs/tree-log.h
10454 +++ b/fs/btrfs/tree-log.h
10455 @@ -19,6 +19,9 @@
10456  #ifndef __TREE_LOG_
10457  #define __TREE_LOG_
10458  
10459 +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
10460 +#define BTRFS_NO_LOG_SYNC 256
10461 +
10462  int btrfs_sync_log(struct btrfs_trans_handle *trans,
10463                    struct btrfs_root *root);
10464  int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
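
BTRFS_NO_LOG_SYNC is chosen well outside the 0 ("logged, please sync") and 1 ("can't log, do a full commit") values this code otherwise returns. The matching consumer in the fsync path is not part of this excerpt; a sketch of how btrfs_sync_file() would presumably branch on it:

	ret = btrfs_log_dentry_safe(trans, root, dentry);

	if (ret != BTRFS_NO_LOG_SYNC) {
		if (ret > 0) {
			/* logging refused: fall back to a full commit */
			ret = btrfs_commit_transaction(trans, root);
		} else {
			ret = btrfs_sync_log(trans, root);
			if (ret == 0)
				ret = btrfs_end_transaction(trans, root);
			else
				ret = btrfs_commit_transaction(trans, root);
		}
	} else {
		/* everything already safe in the log: no IO at all */
		ret = btrfs_end_transaction(trans, root);
	}
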
10465 diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
10466 index 5dbefd1..20cbd2e 100644
10467 --- a/fs/btrfs/volumes.c
10468 +++ b/fs/btrfs/volumes.c
10469 @@ -276,7 +276,7 @@ loop_lock:
10470                  * is now congested.  Back off and let other work structs
10471                  * run instead
10472                  */
10473 -               if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
10474 +               if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
10475                     fs_info->fs_devices->open_devices > 1) {
10476                         struct io_context *ioc;
10477  
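
Lowering the batch threshold from 32 to 8 just makes the submission loop yield sooner once the block device reports write congestion; with several open devices, a worker that keeps hammering one congested queue would otherwise starve the bios queued for the others. The surrounding check, reconstructed from the hunk:

	if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
	    fs_info->fs_devices->open_devices > 1) {
		/* back off: requeue the remaining bios and reschedule,
		 * so work structs for the other devices get to run */
	}
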
10478 @@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
10479                         goto error;
10480  
10481                 device->name = kstrdup(orig_dev->name, GFP_NOFS);
10482 -               if (!device->name)
10483 +               if (!device->name) {
10484 +                       kfree(device);
10485                         goto error;
10486 +               }
10487  
10488                 device->devid = orig_dev->devid;
10489                 device->work.func = pending_bios_fn;
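
The clone_fs_devices() hunk is a straight memory-leak fix: when kstrdup() of the name fails, the freshly allocated btrfs_device used to be abandoned on the way to the error label. With the allocation that sits just above this hunk filled back in (reconstructed from context), the fixed pattern reads:

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device)
		goto error;

	device->name = kstrdup(orig_dev->name, GFP_NOFS);
	if (!device->name) {
		kfree(device);	/* this was the leak */
		goto error;
	}
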
10490 @@ -719,10 +721,9 @@ error:
10491   * called very infrequently and that a given device has a small number
10492   * of extents
10493   */
10494 -static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
10495 -                                        struct btrfs_device *device,
10496 -                                        u64 num_bytes, u64 *start,
10497 -                                        u64 *max_avail)
10498 +int find_free_dev_extent(struct btrfs_trans_handle *trans,
10499 +                        struct btrfs_device *device, u64 num_bytes,
10500 +                        u64 *start, u64 *max_avail)
10501  {
10502         struct btrfs_key key;
10503         struct btrfs_root *root = device->dev_root;
10504 @@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
10505         extent_root = root->fs_info->extent_root;
10506         em_tree = &root->fs_info->mapping_tree.map_tree;
10507  
10508 +       ret = btrfs_can_relocate(extent_root, chunk_offset);
10509 +       if (ret)
10510 +               return -ENOSPC;
10511 +
10512         /* step one, relocate all the extents inside this chunk */
10513         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
10514         BUG_ON(ret);
10515 @@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
10516          * step two, delete the device extents and the
10517          * chunk tree entries
10518          */
10519 -       spin_lock(&em_tree->lock);
10520 +       read_lock(&em_tree->lock);
10521         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10522 -       spin_unlock(&em_tree->lock);
10523 +       read_unlock(&em_tree->lock);
10524  
10525         BUG_ON(em->start > chunk_offset ||
10526                em->start + em->len < chunk_offset);
10527 @@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
10528         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
10529         BUG_ON(ret);
10530  
10531 -       spin_lock(&em_tree->lock);
10532 +       write_lock(&em_tree->lock);
10533         remove_extent_mapping(em_tree, em);
10534 -       spin_unlock(&em_tree->lock);
10535 +       write_unlock(&em_tree->lock);
10536  
10537         kfree(map);
10538         em->bdev = NULL;
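
From here down, every hunk touching em_tree->lock or map_tree.lock is the same mechanical conversion: the extent map tree's lock becomes a reader/writer lock (the rwlock_t itself lands in the extent_map portion of this patch), so the very frequent lookups can run in parallel and only insertions and removals serialize:

	read_lock(&em_tree->lock);		/* shared: lookups only */
	em = lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	write_lock(&em_tree->lock);		/* exclusive: tree changes */
	ret = add_extent_mapping(em_tree, em);	/* or remove_extent_mapping() */
	write_unlock(&em_tree->lock);

Chunk mappings are consulted on nearly every IO but change only when chunks are allocated or relocated, so this is almost pure reader concurrency.
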
10539 @@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
10540         struct btrfs_key found_key;
10541         u64 chunk_tree = chunk_root->root_key.objectid;
10542         u64 chunk_type;
10543 +       bool retried = false;
10544 +       int failed = 0;
10545         int ret;
10546  
10547         path = btrfs_alloc_path();
10548         if (!path)
10549                 return -ENOMEM;
10550  
10551 +again:
10552         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
10553         key.offset = (u64)-1;
10554         key.type = BTRFS_CHUNK_ITEM_KEY;
10555 @@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
10556                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
10557                                                    found_key.objectid,
10558                                                    found_key.offset);
10559 -                       BUG_ON(ret);
10560 +                       if (ret == -ENOSPC)
10561 +                               failed++;
10562 +                       else if (ret)
10563 +                               BUG();
10564                 }
10565  
10566                 if (found_key.offset == 0)
10567 @@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
10568                 key.offset = found_key.offset - 1;
10569         }
10570         ret = 0;
10571 +       if (failed && !retried) {
10572 +               failed = 0;
10573 +               retried = true;
10574 +               goto again;
10575 +       } else if (failed && retried) {
10576 +               WARN_ON(1);
10577 +               ret = -ENOSPC;
10578 +       }
10579  error:
10580         btrfs_free_path(path);
10581         return ret;
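
btrfs_relocate_sys_chunks() picks up the failed/retried pattern that btrfs_shrink_device() below uses as well: a chunk whose relocation fails with -ENOSPC (via the btrfs_can_relocate() gate added above) is counted instead of being fatal, and after the scan completes one full retry pass is made, on the theory that the chunks that did move freed up room for the stragglers. Only a second-pass failure is reported. The skeleton:

	bool retried = false;
	int failed = 0;
again:
	/* walk every chunk; -ENOSPC is soft, anything else is a bug */
	if (ret == -ENOSPC)
		failed++;
	else if (ret)
		BUG();
	/* ... */
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;	/* earlier relocations may have made space */
	} else if (failed && retried) {
		WARN_ON(1);	/* genuinely stuck */
		ret = -ENOSPC;
	}
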
10582 @@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
10583                         continue;
10584  
10585                 ret = btrfs_shrink_device(device, old_size - size_to_free);
10586 +               if (ret == -ENOSPC)
10587 +                       break;
10588                 BUG_ON(ret);
10589  
10590                 trans = btrfs_start_transaction(dev_root, 1);
10591 @@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
10592                 chunk = btrfs_item_ptr(path->nodes[0],
10593                                        path->slots[0],
10594                                        struct btrfs_chunk);
10595 -               key.offset = found_key.offset;
10596                 /* chunk zero is special */
10597 -               if (key.offset == 0)
10598 +               if (found_key.offset == 0)
10599                         break;
10600  
10601                 btrfs_release_path(chunk_root, path);
10602 @@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
10603                                            chunk_root->root_key.objectid,
10604                                            found_key.objectid,
10605                                            found_key.offset);
10606 -               BUG_ON(ret);
10607 +               BUG_ON(ret && ret != -ENOSPC);
10608 +               key.offset = found_key.offset - 1;
10609         }
10610         ret = 0;
10611  error:
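
Besides tolerating -ENOSPC from both btrfs_shrink_device() and btrfs_relocate_chunk(), the btrfs_balance() hunks fix a progress bug: the search key used to be rewound to found_key.offset itself, which was harmless while relocation always succeeded, but once a chunk may legitimately stay put, the next search would find that same chunk again, forever. Stepping past it guarantees the walk terminates:

	ret = btrfs_relocate_chunk(chunk_root, ...);
	BUG_ON(ret && ret != -ENOSPC);		/* ENOSPC: skip this chunk */
	key.offset = found_key.offset - 1;	/* always make progress */
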
10612 @@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10613         u64 chunk_offset;
10614         int ret;
10615         int slot;
10616 +       int failed = 0;
10617 +       bool retried = false;
10618         struct extent_buffer *l;
10619         struct btrfs_key key;
10620         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
10621         u64 old_total = btrfs_super_total_bytes(super_copy);
10622 +       u64 old_size = device->total_bytes;
10623         u64 diff = device->total_bytes - new_size;
10624  
10625         if (new_size >= device->total_bytes)
10626 @@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10627         if (!path)
10628                 return -ENOMEM;
10629  
10630 -       trans = btrfs_start_transaction(root, 1);
10631 -       if (!trans) {
10632 -               ret = -ENOMEM;
10633 -               goto done;
10634 -       }
10635 -
10636         path->reada = 2;
10637  
10638         lock_chunks(root);
10639 @@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10640         if (device->writeable)
10641                 device->fs_devices->total_rw_bytes -= diff;
10642         unlock_chunks(root);
10643 -       btrfs_end_transaction(trans, root);
10644  
10645 +again:
10646         key.objectid = device->devid;
10647         key.offset = (u64)-1;
10648         key.type = BTRFS_DEV_EXTENT_KEY;
10649 @@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10650                         goto done;
10651                 if (ret) {
10652                         ret = 0;
10653 +                       btrfs_release_path(root, path);
10654                         break;
10655                 }
10656  
10657 @@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10658                 slot = path->slots[0];
10659                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
10660  
10661 -               if (key.objectid != device->devid)
10662 +               if (key.objectid != device->devid) {
10663 +                       btrfs_release_path(root, path);
10664                         break;
10665 +               }
10666  
10667                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
10668                 length = btrfs_dev_extent_length(l, dev_extent);
10669  
10670 -               if (key.offset + length <= new_size)
10671 +               if (key.offset + length <= new_size) {
10672 +                       btrfs_release_path(root, path);
10673                         break;
10674 +               }
10675  
10676                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
10677                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
10678 @@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
10679  
10680                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
10681                                            chunk_offset);
10682 -               if (ret)
10683 +               if (ret && ret != -ENOSPC)
10684                         goto done;
10685 +               if (ret == -ENOSPC)
10686 +                       failed++;
10687 +               key.offset -= 1;
10688         }
10689 +
10690 +       if (failed && !retried) {
10691 +               failed = 0;
10692 +               retried = true;
10693 +               goto again;
10694 +       } else if (failed && retried) {
10695 +               ret = -ENOSPC;
10696 +               lock_chunks(root);
10697 +
10698 +               device->total_bytes = old_size;
10699 +               if (device->writeable)
10700 +                       device->fs_devices->total_rw_bytes += diff;
10701 +               unlock_chunks(root);
10702 +               goto done;
10703         }
10704  
10705         /* Shrinking succeeded, else we would be at "done". */
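
Three related changes in btrfs_shrink_device(): the transaction that used to bracket only the in-memory size update is gone (lock_chunks() is the protection that matters there, and the on-disk device item is presumably updated later, under the transaction started after the "Shrinking succeeded" comment); the search path is released before every break so no tree locks are held across chunk relocation; and a shrink that still cannot place every extent after the retry pass rolls the in-memory accounting back instead of leaving the device half-shrunk. The rollback, annotated:

	} else if (failed && retried) {
		ret = -ENOSPC;
		lock_chunks(root);
		/* undo the optimistic shrink recorded before the scan */
		device->total_bytes = old_size;
		if (device->writeable)
			device->fs_devices->total_rw_bytes += diff;
		unlock_chunks(root);
		goto done;
	}
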
10706 @@ -2294,9 +2335,9 @@ again:
10707         em->block_len = em->len;
10708  
10709         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
10710 -       spin_lock(&em_tree->lock);
10711 +       write_lock(&em_tree->lock);
10712         ret = add_extent_mapping(em_tree, em);
10713 -       spin_unlock(&em_tree->lock);
10714 +       write_unlock(&em_tree->lock);
10715         BUG_ON(ret);
10716         free_extent_map(em);
10717  
10718 @@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
10719         int readonly = 0;
10720         int i;
10721  
10722 -       spin_lock(&map_tree->map_tree.lock);
10723 +       read_lock(&map_tree->map_tree.lock);
10724         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
10725 -       spin_unlock(&map_tree->map_tree.lock);
10726 +       read_unlock(&map_tree->map_tree.lock);
10727         if (!em)
10728                 return 1;
10729  
10730 @@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
10731         struct extent_map *em;
10732  
10733         while (1) {
10734 -               spin_lock(&tree->map_tree.lock);
10735 +               write_lock(&tree->map_tree.lock);
10736                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
10737                 if (em)
10738                         remove_extent_mapping(&tree->map_tree, em);
10739 -               spin_unlock(&tree->map_tree.lock);
10740 +               write_unlock(&tree->map_tree.lock);
10741                 if (!em)
10742                         break;
10743                 kfree(em->bdev);
10744 @@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
10745         struct extent_map_tree *em_tree = &map_tree->map_tree;
10746         int ret;
10747  
10748 -       spin_lock(&em_tree->lock);
10749 +       read_lock(&em_tree->lock);
10750         em = lookup_extent_mapping(em_tree, logical, len);
10751 -       spin_unlock(&em_tree->lock);
10752 +       read_unlock(&em_tree->lock);
10753         BUG_ON(!em);
10754  
10755         BUG_ON(em->start > logical || em->start + em->len < logical);
10756 @@ -2604,9 +2645,9 @@ again:
10757                 atomic_set(&multi->error, 0);
10758         }
10759  
10760 -       spin_lock(&em_tree->lock);
10761 +       read_lock(&em_tree->lock);
10762         em = lookup_extent_mapping(em_tree, logical, *length);
10763 -       spin_unlock(&em_tree->lock);
10764 +       read_unlock(&em_tree->lock);
10765  
10766         if (!em && unplug_page)
10767                 return 0;
10768 @@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
10769         u64 stripe_nr;
10770         int i, j, nr = 0;
10771  
10772 -       spin_lock(&em_tree->lock);
10773 +       read_lock(&em_tree->lock);
10774         em = lookup_extent_mapping(em_tree, chunk_start, 1);
10775 -       spin_unlock(&em_tree->lock);
10776 +       read_unlock(&em_tree->lock);
10777  
10778         BUG_ON(!em || em->start != chunk_start);
10779         map = (struct map_lookup *)em->bdev;
10780 @@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
10781         logical = key->offset;
10782         length = btrfs_chunk_length(leaf, chunk);
10783  
10784 -       spin_lock(&map_tree->map_tree.lock);
10785 +       read_lock(&map_tree->map_tree.lock);
10786         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
10787 -       spin_unlock(&map_tree->map_tree.lock);
10788 +       read_unlock(&map_tree->map_tree.lock);
10789  
10790         /* already mapped? */
10791         if (em && em->start <= logical && em->start + em->len > logical) {
10792 @@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
10793                 map->stripes[i].dev->in_fs_metadata = 1;
10794         }
10795  
10796 -       spin_lock(&map_tree->map_tree.lock);
10797 +       write_lock(&map_tree->map_tree.lock);
10798         ret = add_extent_mapping(&map_tree->map_tree, em);
10799 -       spin_unlock(&map_tree->map_tree.lock);
10800 +       write_unlock(&map_tree->map_tree.lock);
10801         BUG_ON(ret);
10802         free_extent_map(em);
10803  
10804 diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
10805 index 5139a83..31b0fab 100644
10806 --- a/fs/btrfs/volumes.h
10807 +++ b/fs/btrfs/volumes.h
10808 @@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
10809  void btrfs_unlock_volumes(void);
10810  void btrfs_lock_volumes(void);
10811  int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
10812 +int find_free_dev_extent(struct btrfs_trans_handle *trans,
10813 +                        struct btrfs_device *device, u64 num_bytes,
10814 +                        u64 *start, u64 *max_avail);
10815  #endif
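
find_free_dev_extent() loses its static (and the noinline) and gains this prototype so callers outside volumes.c, presumably the allocator and balancing changes elsewhere in this patch, can ask how much room a device could contribute before committing to anything. Call shape (the semantics of max_avail are inferred from the name and the old signature):

	u64 dev_offset;
	u64 max_avail = 0;
	int ret;

	ret = find_free_dev_extent(trans, device, num_bytes,
				   &dev_offset, &max_avail);
	if (ret == 0) {
		/* a hole of at least num_bytes starts at dev_offset */
	} else {
		/* no fit; max_avail reports the largest hole seen */
	}
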
10816 diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
10817 index a9d3bf4..b6dd596 100644
10818 --- a/fs/btrfs/xattr.c
10819 +++ b/fs/btrfs/xattr.c
10820 @@ -260,7 +260,7 @@ err:
10821   * attributes are handled directly.
10822   */
10823  struct xattr_handler *btrfs_xattr_handlers[] = {
10824 -#ifdef CONFIG_FS_POSIX_ACL
10825 +#ifdef CONFIG_BTRFS_FS_POSIX_ACL
10826         &btrfs_xattr_acl_access_handler,
10827         &btrfs_xattr_acl_default_handler,
10828  #endif
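
The CONFIG_FS_POSIX_ACL -> CONFIG_BTRFS_FS_POSIX_ACL switch here only works if the new symbol exists, so the patch presumably also adds a Kconfig entry along these lines (a sketch; the exact prompt and help text are not shown in this excerpt):

	config BTRFS_FS_POSIX_ACL
		bool "Btrfs POSIX Access Control Lists"
		depends on BTRFS_FS
		select FS_POSIX_ACL

Selecting FS_POSIX_ACL keeps the generic posix_acl code compiled in, while letting btrfs ACL support be switched off independently of other filesystems.
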