From bc1f73059fa29dbdd87d683265e765b323a34994 Mon Sep 17 00:00:00 2001 From: Thierry Parmentelat Date: Fri, 11 Dec 2009 14:21:29 +0000 Subject: [PATCH] first draft for a plain f12 kernel --- Makefile | 95 + kernel.spec | 3897 ++++++ linux-2.6-btrfs-upstream.patch | 10828 +++++++++++++++ linux-2.6-debug-vm-would-have-oomkilled.patch | 65 + linux-2.6-execshield.patch | 1013 ++ linux-2.6-utrace.patch | 4102 ++++++ original/kernel.spec | 3886 ++++++ original/linux-2.6-btrfs-upstream.patch | 10829 ++++++++++++++++ ...ux-2.6-debug-vm-would-have-oomkilled.patch | 65 + original/linux-2.6-execshield.patch | 1013 ++ original/linux-2.6-utrace.patch | 4102 ++++++ rpmmacros.in | 7 + rpmmacros.sh | 1 + sources | 2 + 14 files changed, 39905 insertions(+) create mode 100644 Makefile create mode 100644 kernel.spec create mode 100644 linux-2.6-btrfs-upstream.patch create mode 100644 linux-2.6-debug-vm-would-have-oomkilled.patch create mode 100644 linux-2.6-execshield.patch create mode 100644 linux-2.6-utrace.patch create mode 100644 original/kernel.spec create mode 100644 original/linux-2.6-btrfs-upstream.patch create mode 100644 original/linux-2.6-debug-vm-would-have-oomkilled.patch create mode 100644 original/linux-2.6-execshield.patch create mode 100644 original/linux-2.6-utrace.patch create mode 100644 rpmmacros.in create mode 100755 rpmmacros.sh create mode 100644 sources diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..545e454a9 --- /dev/null +++ b/Makefile @@ -0,0 +1,95 @@ +CURL ?= $(shell if test -f /usr/bin/curl ; then echo "curl -H Pragma: -O -R -S --fail --show-error" ; fi) +WGET ?= $(shell if test -f /usr/bin/wget ; then echo "wget -nd -m" ; fi) +CLIENT ?= $(if $(CURL),$(CURL),$(if $(WGET),$(WGET))) +AWK = awk +SHA1SUM = sha1sum +SED = sed + +SPECFILE = kernel.spec + +# Thierry - when called from within the build, PWD is /build +PWD=$(shell pwd) + +# get nevr from specfile. +ifndef NAME +NAME := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{NAME}\n" --specfile $(SPECFILE) | head -1) +endif +ifndef EPOCH +EPOCH := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{EPOCH}\n" --specfile $(SPECFILE) | head -1 | sed 's/(none)//') +endif +ifeq ($(EPOCH),(none)) +override EPOCH := "" +endif +ifndef VERSION +VERSION := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{VERSION}\n" --specfile $(SPECFILE)| head -1) +endif +ifndef RELEASE +RELEASE := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{RELEASE}\n" --specfile $(SPECFILE)| head -1) +endif + +define get_sources_sha1 +$(shell cat sources 2>/dev/null | awk 'gensub("^.*/", "", 1, $$2) == "$@" { print $$1; exit; }') +endef +define get_sources_url +$(shell cat sources 2>/dev/null | awk 'gensub("^.*/", "", 1, $$2) == "$@" { print $$2; exit; }') +endef +SOURCEFILES := $(shell cat sources 2>/dev/null | awk '{ print gensub("^.*/", "", 1, $$2) }') +SOURCE_RPM := $(firstword $(SOURCEFILES)) + +sources: $(SOURCEFILES) $(TARGETS) + +$(SOURCEFILES): #FORCE + @if [ ! -e "$@" ] ; then echo "$(CLIENT) $(get_sources_url)" ; $(CLIENT) $(get_sources_url) ; fi + @if [ ! -e "$@" ] ; then echo "Could not download source file: $@ does not exist" ; exit 1 ; fi + @if test "$$(sha1sum $@ | awk '{print $$1}')" != "$(get_sources_sha1)" ; then \ + echo "sha1sum of the downloaded $@ does not match the one from 'sources' file" ; \ + echo "Local copy: $$(sha1sum $@)" ; \ + echo "In sources: $$(grep $@ sources)" ; \ + exit 1 ; \ + else \ + ls -l $@ ; \ + fi + +download-sources: + @for i in $(SOURCES); do \ + if [ ! 
-e "$${i##*/}" ]; then \ + echo "$(CLIENT) $$i"; \ + $(CLIENT) $$i; \ + fi; \ + done + +replace-sources: + rm -f sources + @$(MAKE) new-sources + +new-sources: download-sources + @for i in $(SOURCES); do \ + echo "$(SHA1SUM) $$i >> sources"; \ + $(SHA1SUM) $${i##*/} | $(AWK) '{ printf "%s %s\n", $$1, "'"$$i"'" }' >> sources; \ + done + +PREPARCH ?= noarch +RPMDIRDEFS = --define "_sourcedir $(PWD)" --define "_builddir $(PWD)" --define "_srcrpmdir $(PWD)" --define "_rpmdir $(PWD)" +trees: sources + rpmbuild $(RPMDIRDEFS) $(RPMDEFS) --nodeps -bp --target $(PREPARCH) $(SPECFILE) + +# use the stock source rpm, unwrap it, +# install our own specfile and patched patches +# and patch configs for IPV6 +# then rewrap with rpm +srpm: sources + mkdir SOURCES SRPMS + (cd SOURCES; rpm2cpio ../$(SOURCE_RPM) | cpio -diu; \ + cp ../$(SPECFILE) . ; cp ../linux*.patch . ; \ + sed -i -e s,CONFIG_IPV6=m,CONFIG_IPV6=y, config-generic) + ./rpmmacros.sh + export HOME=$(shell pwd) ; rpmbuild $(RPMDIRDEFS) $(RPMDEFS) --nodeps -bs SOURCES/$(SPECFILE) + cp $(SOURCE_RPM) $(EXPECTED_SRPM) + +TARGET ?= $(shell uname -m) +rpm: sources + rpmbuild $(RPMDIRDEFS) $(RPMDEFS) --nodeps --target $(TARGET) -bb $(SPECFILE) + +clean: + rm -f *.rpm + diff --git a/kernel.spec b/kernel.spec new file mode 100644 index 000000000..983e1743d --- /dev/null +++ b/kernel.spec @@ -0,0 +1,3897 @@ +# We have to override the new %%install behavior because, well... the kernel is special. +%global __spec_install_pre %{___build_pre} + +Summary: The Linux kernel + +# For a stable, released kernel, released_kernel should be 1. For rawhide +# and/or a kernel built from an rc or git snapshot, released_kernel should +# be 0. +%global released_kernel 1 + +# Versions of various parts + +# Polite request for people who spin their own kernel rpms: +# please modify the "buildid" define in a way that identifies +# that the kernel isn't the stock distribution kernel, for example, +# by setting the define to ".local" or ".bz123456" +# +###-vs- +%define buildid .vs2.3.0.36.27 + +# fedora_build defines which build revision of this kernel version we're +# building. Rather than incrementing forever, as with the prior versioning +# setup, we set fedora_cvs_origin to the current cvs revision s/1.// of the +# kernel spec when the kernel is rebased, so fedora_build automatically +# works out to the offset from the rebase, so it doesn't get too ginormous. +# +# If you're building on a branch, the RCS revision will be something like +# 1.1205.1.1. In this case we drop the initial 1, subtract fedora_cvs_origin +# from the second number, and then append the rest of the RCS string as is. +# Don't stare at the awk too long, you'll go blind. +%define fedora_cvs_origin 1786 +%define fedora_cvs_revision() %2 +%global fedora_build %(echo %{fedora_cvs_origin}.%{fedora_cvs_revision $Revision: 1.1948 $} | awk -F . '{ OFS = "."; ORS = ""; print $3 - $1 ; i = 4 ; OFS = ""; while (i <= NF) { print ".", $i ; i++} }') + +# base_sublevel is the kernel version we're starting with and patching +# on top of -- for example, 2.6.22-rc7-git1 starts with a 2.6.21 base, +# which yields a base_sublevel of 21. +%define base_sublevel 31 + +## If this is a released kernel ## +%if 0%{?released_kernel} + +# Do we have a -stable update to apply? +%define stable_update 6 +# Is it a -stable RC? 
+%define stable_rc 0 +# Set rpm version accordingly +%if 0%{?stable_update} +%define stablerev .%{stable_update} +%define stable_base %{stable_update} +%if 0%{?stable_rc} +# stable RCs are incremental patches, so we need the previous stable patch +%define stable_base %(echo $((%{stable_update} - 1))) +%endif +%endif +%define rpmversion 2.6.%{base_sublevel}%{?stablerev} + +## The not-released-kernel case ## +%else +# The next upstream release sublevel (base_sublevel+1) +%define upstream_sublevel %(echo $((%{base_sublevel} + 1))) +# The rc snapshot level +%define rcrev 9 +# The git snapshot level +%define gitrev 2 +# Set rpm version accordingly +%define rpmversion 2.6.%{upstream_sublevel} +%endif +# Nb: The above rcrev and gitrev values automagically define Patch00 and Patch01 below. + +# What parts do we want to build? We must build at least one kernel. +# These are the kernels that are built IF the architecture allows it. +# All should default to 1 (enabled) and be flipped to 0 (disabled) +# by later arch-specific checks. + +# The following build options are enabled by default. +# Use either --without in your rpmbuild command or force values +# to 0 in here to disable them. +# +# standard kernel +%define with_up %{?_without_up: 0} %{?!_without_up: 1} +# kernel-smp (only valid for ppc 32-bit) +%define with_smp %{?_without_smp: 0} %{?!_without_smp: 1} +# kernel-kdump +%define with_kdump %{?_without_kdump: 0} %{?!_without_kdump: 1} +# kernel-debug +%define with_debug %{?_without_debug: 0} %{?!_without_debug: 1} +# kernel-doc +%define with_doc %{?_without_doc: 0} %{?!_without_doc: 1} +# kernel-headers +%define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} +# kernel-firmware +%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 0} +# tools/perf +%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} +# perf noarch subpkg +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} +# kernel-debuginfo +%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1} +# kernel-bootwrapper (for creating zImages from kernel + initrd) +%define with_bootwrapper %{?_without_bootwrapper: 0} %{?!_without_bootwrapper: 1} +# Want to build a the vsdo directories installed +%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 1} +# Use dracut instead of mkinitrd for initrd image generation +%define with_dracut %{?_without_dracut: 0} %{?!_without_dracut: 1} + +# Build the kernel-doc package, but don't fail the build if it botches. +# Here "true" means "continue" and "false" means "fail the build". +%if 0%{?released_kernel} +%define doc_build_fail false +%else +%define doc_build_fail true +%endif + +%define rawhide_skip_docs 0 +%if 0%{?rawhide_skip_docs} +%define with_doc 0 +%endif + +# Additional options for user-friendly one-off kernel building: +# +# Only build the base kernel (--with baseonly): +%define with_baseonly %{?_with_baseonly: 1} %{?!_with_baseonly: 0} +# Only build the smp kernel (--with smponly): +%define with_smponly %{?_with_smponly: 1} %{?!_with_smponly: 0} +# Only build the debug kernel (--with dbgonly): +%define with_dbgonly %{?_with_dbgonly: 1} %{?!_with_dbgonly: 0} + +# should we do C=1 builds with sparse +%define with_sparse %{?_with_sparse: 1} %{?!_with_sparse: 0} + +# Set debugbuildsenabled to 1 for production (build separate debug kernels) +# and 0 for rawhide (all kernels are debug kernels). +# See also 'make debug' and 'make release'. 
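+# Editor's note (illustrative): the with_* options above follow the usual
+# rpmbuild convention, e.g. "rpmbuild --without debug ..." defines
+# %%{_without_debug} and flips with_debug to 0, and "--with baseonly" defines
+# %%{_with_baseonly}; debugbuildsenabled below is a plain in-spec switch with
+# no corresponding command-line flag.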
+%define debugbuildsenabled 1 + +# Want to build a vanilla kernel build without any non-upstream patches? +# (well, almost none, we need nonintconfig for build purposes). Default to 0 (off). +%define with_vanilla %{?_with_vanilla: 1} %{?!_with_vanilla: 0} + +# pkg_release is what we'll fill in for the rpm Release: field +%if 0%{?released_kernel} + +%if 0%{?stable_rc} +%define stable_rctag .rc%{stable_rc} +%endif +%define pkg_release %{fedora_build}%{?stable_rctag}%{?buildid}%{?dist} + +%else + +# non-released_kernel +%if 0%{?rcrev} +%define rctag .rc%rcrev +%endif +%if 0%{?gitrev} +%define gittag .git%gitrev +%if !0%{?rcrev} +%define rctag .rc0 +%endif +%endif +%define pkg_release 0.%{fedora_build}%{?rctag}%{?gittag}%{?buildid}%{?dist} + +%endif + +# The kernel tarball/base version +%define kversion 2.6.%{base_sublevel} + +%define make_target bzImage + +%define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE}.%{_target_cpu} +%define hdrarch %_target_cpu +%define asmarch %_target_cpu + +%if 0%{!?nopatches:1} +%define nopatches 0 +%endif + +%if %{with_vanilla} +%define nopatches 1 +%endif + +%if %{nopatches} +%define with_bootwrapper 0 +%define variant -vanilla +%else +%define variant_fedora -fedora +%endif + +%define using_upstream_branch 0 +%if 0%{?upstream_branch:1} +%define stable_update 0 +%define using_upstream_branch 1 +%define variant -%{upstream_branch}%{?variant_fedora} +%define pkg_release 0.%{fedora_build}%{upstream_branch_tag}%{?buildid}%{?dist} +%endif + +%if !%{debugbuildsenabled} +%define with_debug 0 +%endif + +%if !%{with_debuginfo} +%define _enable_debug_packages 0 +%endif +%define debuginfodir /usr/lib/debug + +# kernel-PAE is only built on i686. +%ifarch i686 +%define with_pae 1 +%else +%define with_pae 0 +%endif + +# if requested, only build base kernel +%if %{with_baseonly} +%define with_smp 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build smp kernel +%if %{with_smponly} +%define with_up 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build debug kernel +%if %{with_dbgonly} +%if %{debugbuildsenabled} +%define with_up 0 +%endif +%define with_smp 0 +%define with_pae 0 +%define with_xen 0 +%define with_kdump 0 +%define with_perftool 0 +%endif + +%define all_x86 i386 i686 + +%if %{with_vdso_install} +# These arches install vdso/ directories. +%define vdso_arches %{all_x86} x86_64 ppc ppc64 +%endif + +# Overrides for generic default options + +# only ppc and alphav56 need separate smp kernels +%ifnarch ppc alphaev56 +%define with_smp 0 +%endif + +# only build kernel-kdump on ppc64 +# (no relocatable kernel support upstream yet) +#FIXME: Temporarily disabled to speed up builds. 
+#ifnarch ppc64 +%define with_kdump 0 +#endif + +# don't do debug builds on anything but i686 and x86_64 +%ifnarch i686 x86_64 +%define with_debug 0 +%endif + +# only package docs noarch +%ifnarch noarch +%define with_doc 0 +%define with_perf 0 +%endif + +# don't build noarch kernels or headers (duh) +%ifarch noarch +%define with_up 0 +%define with_headers 0 +%define all_arch_configs kernel-%{version}-*.config +%define with_firmware %{?_without_firmware: 0} %{?!_without_firmware: 1} +%endif + +# bootwrapper is only on ppc +%ifnarch ppc ppc64 +%define with_bootwrapper 0 +%endif + +# sparse blows up on ppc64 alpha and sparc64 +%ifarch ppc64 ppc alpha sparc64 +%define with_sparse 0 +%endif + +# Per-arch tweaks + +%ifarch %{all_x86} +%define asmarch x86 +%define hdrarch i386 +%define all_arch_configs kernel-%{version}-i?86*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch x86_64 +%define asmarch x86 +%define all_arch_configs kernel-%{version}-x86_64*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch ppc64 +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc64*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch s390x +%define asmarch s390 +%define hdrarch s390 +%define all_arch_configs kernel-%{version}-s390x.config +%define image_install_path boot +%define make_target image +%define kernel_image arch/s390/boot/image +%endif + +%ifarch sparc +# We only build sparc headers since we dont support sparc32 hardware +%endif + +%ifarch sparc64 +%define asmarch sparc +%define all_arch_configs kernel-%{version}-sparc64*.config +%define make_target image +%define kernel_image arch/sparc/boot/image +%define image_install_path boot +%define with_perftool 0 +%endif + +%ifarch ppc +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc{-,.}*config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch ia64 +%define all_arch_configs kernel-%{version}-ia64*.config +%define image_install_path boot/efi/EFI/redhat +%define make_target compressed +%define kernel_image vmlinux.gz +%endif + +%ifarch alpha alphaev56 +%define all_arch_configs kernel-%{version}-alpha*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%ifarch %{arm} +%define all_arch_configs kernel-%{version}-arm*.config +%define image_install_path boot +%define hdrarch arm +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%if %{nopatches} +# XXX temporary until last vdso patches are upstream +%define vdso_arches ppc ppc64 +%endif + +%if %{nopatches}%{using_upstream_branch} +# Ignore unknown options in our config-* files. +# Some options go with patches we're not applying. +%define oldconfig_target loose_nonint_oldconfig +%else +%define oldconfig_target nonint_oldconfig +%endif + +# To temporarily exclude an architecture from being built, add it to +# %nobuildarches. Do _NOT_ use the ExclusiveArch: line, because if we +# don't build kernel-headers then the new build system will no longer let +# us use the previous build of that package -- it'll just be completely AWOL. +# Which is a BadThing(tm). + +# We don't build a kernel on i386; we only do kernel-headers there, +# and we no longer build for 31bit S390. 
Same for 32bit sparc and arm. +%define nobuildarches i386 s390 sparc %{arm} + +%ifarch %nobuildarches +%define with_up 0 +%define with_smp 0 +%define with_pae 0 +%define with_kdump 0 +%define with_debuginfo 0 +%define with_perftool 0 +%define _enable_debug_packages 0 +%endif + +%define with_pae_debug 0 +%if %{with_pae} +%define with_pae_debug %{with_debug} +%endif + +# +# Three sets of minimum package version requirements in the form of Conflicts: +# to versions below the minimum +# + +# +# First the general kernel 2.6 required versions as per +# Documentation/Changes +# +%define kernel_dot_org_conflicts ppp < 2.4.3-3, isdn4k-utils < 3.2-32, nfs-utils < 1.0.7-12, e2fsprogs < 1.37-4, util-linux < 2.12, jfsutils < 1.1.7-2, reiserfs-utils < 3.6.19-2, xfsprogs < 2.6.13-4, procps < 3.2.5-6.3, oprofile < 0.9.1-2 + +# +# Then a series of requirements that are distribution specific, either +# because we add patches for something, or the older versions have +# problems with the newer kernel or lack certain things that make +# integration in the distro harder than needed. +# +%define package_conflicts initscripts < 7.23, udev < 063-6, iptables < 1.3.2-1, ipw2200-firmware < 2.4, iwl4965-firmware < 228.57.2, selinux-policy-targeted < 1.25.3-14, squashfs-tools < 4.0, wireless-tools < 29-3 + +# +# The ld.so.conf.d file we install uses syntax older ldconfig's don't grok. +# +%define kernel_xen_conflicts glibc < 2.3.5-1, xen < 3.0.1 + +%define kernel_PAE_obsoletes kernel-smp < 2.6.17, kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_PAE_provides kernel-xen = %{rpmversion}-%{pkg_release} + +%ifarch x86_64 +%define kernel_obsoletes kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_provides kernel-xen = %{rpmversion}-%{pkg_release} +%endif + +# We moved the drm include files into kernel-headers, make sure there's +# a recent enough libdrm-devel on the system that doesn't have those. +%define kernel_headers_conflicts libdrm-devel < 2.4.0-0.15 + +# +# Packages that need to be installed before the kernel is, because the %post +# scripts use them. +# +%define kernel_prereq fileutils, module-init-tools, initscripts >= 8.11.1-1, kernel-firmware >= %{rpmversion}-%{fedora_build}, grubby >= 7.0.4-1 +%if %{with_dracut} +%define initrd_prereq dracut >= 001-7 +%else +%define initrd_prereq mkinitrd >= 6.0.61-1 +%endif + +# +# This macro does requires, provides, conflicts, obsoletes for a kernel package. +# %%kernel_reqprovconf +# It uses any kernel__conflicts and kernel__obsoletes +# macros defined above. 
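+# Editor's note (sketch, not authoritative): the %%{expand:...} lines in the
+# macro below look up kernel%%{?1:_%%{1}}_conflicts / _obsoletes / _provides,
+# so a bare %%kernel_reqprovconf picks up the plain kernel_obsoletes and
+# kernel_provides (e.g. the x86_64 ones above), while a variant argument such
+# as PAE would make it look for kernel_PAE_obsoletes and kernel_PAE_provides
+# instead.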
+# +%define kernel_reqprovconf \ +Provides: kernel = %{rpmversion}-%{pkg_release}\ +Provides: kernel-%{_target_cpu} = %{rpmversion}-%{pkg_release}%{?1:.%{1}}\ +Provides: kernel-drm = 4.3.0\ +Provides: kernel-drm-nouveau = 15\ +Provides: kernel-modeset = 1\ +Provides: kernel-uname-r = %{KVERREL}%{?1:.%{1}}\ +Requires(pre): %{kernel_prereq}\ +Requires(pre): %{initrd_prereq}\ +Requires(post): /sbin/new-kernel-pkg\ +Requires(preun): /sbin/new-kernel-pkg\ +Conflicts: %{kernel_dot_org_conflicts}\ +Conflicts: %{package_conflicts}\ +%{expand:%%{?kernel%{?1:_%{1}}_conflicts:Conflicts: %%{kernel%{?1:_%{1}}_conflicts}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_obsoletes:Obsoletes: %%{kernel%{?1:_%{1}}_obsoletes}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_provides:Provides: %%{kernel%{?1:_%{1}}_provides}}}\ +# We can't let RPM do the dependencies automatic because it'll then pick up\ +# a correct but undesirable perl dependency from the module headers which\ +# isn't required for the kernel proper to function\ +AutoReq: no\ +AutoProv: yes\ +%{nil} + +Name: kernel%{?variant} +Group: System Environment/Kernel +License: GPLv2 +URL: http://www.kernel.org/ +Version: %{rpmversion} +Release: %{pkg_release} +# DO NOT CHANGE THE 'ExclusiveArch' LINE TO TEMPORARILY EXCLUDE AN ARCHITECTURE BUILD. +# SET %%nobuildarches (ABOVE) INSTEAD +ExclusiveArch: noarch %{all_x86} x86_64 ppc ppc64 ia64 sparc sparc64 s390x alpha alphaev56 %{arm} +ExclusiveOS: Linux + +%kernel_reqprovconf +%ifarch x86_64 sparc64 +Obsoletes: kernel-smp +%endif + + +# +# List the packages used during the kernel build +# +BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, sh-utils, tar +BuildRequires: bzip2, findutils, gzip, m4, perl, make >= 3.78, diffutils, gawk +BuildRequires: gcc >= 3.4.2, binutils >= 2.12, redhat-rpm-config +BuildRequires: net-tools +BuildRequires: xmlto, asciidoc +%if %{with_sparse} +BuildRequires: sparse >= 0.4.1 +%endif +%if %{with_perftool} +BuildRequires: elfutils-libelf-devel zlib-devel binutils-devel +%endif +BuildConflicts: rhbuildsys(DiskFree) < 500Mb + +%define fancy_debuginfo 0 +%if %{with_debuginfo} +%if 0%{?fedora} >= 8 || 0%{?rhel} >= 6 +%define fancy_debuginfo 1 +%endif +%endif + +%if %{fancy_debuginfo} +# Fancy new debuginfo generation introduced in Fedora 8. +BuildRequires: rpm-build >= 4.4.2.1-4 +%define debuginfo_args --strict-build-id +%endif + +Source0: ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-%{kversion}.tar.bz2 + +Source11: genkey +Source14: find-provides +Source15: merge.pl + +Source20: Makefile.config +Source21: config-debug +Source22: config-nodebug +Source23: config-generic +Source24: config-rhel-generic + +Source30: config-x86-generic +Source31: config-i686-PAE + +Source40: config-x86_64-generic + +Source50: config-powerpc-generic +Source51: config-powerpc32-generic +Source52: config-powerpc32-smp +Source53: config-powerpc64 + +Source60: config-ia64-generic + +Source70: config-s390x + +Source90: config-sparc64-generic + +Source100: config-arm + +Source200: perf + +# Here should be only the patches up to the upstream canonical Linus tree. 
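+# Editor's note, worked example: with the values used in this spec
+# (base_sublevel=31, stable_update=6, stable_rc=0) the macros above yield
+# stable_base=6 and rpmversion=2.6.31.6, so Patch00 below resolves to
+# patch-2.6.31.6.bz2 and no -rc Patch01 is generated.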
+ +# For a stable release kernel +%if 0%{?stable_update} +%if 0%{?stable_base} +%define stable_patch_00 patch-2.6.%{base_sublevel}.%{stable_base}.bz2 +Patch00: %{stable_patch_00} +%endif +%if 0%{?stable_rc} +%define stable_patch_01 patch-2.6.%{base_sublevel}.%{stable_update}-rc%{stable_rc}.bz2 +Patch01: %{stable_patch_01} +%endif + +# non-released_kernel case +# These are automagically defined by the rcrev and gitrev values set up +# near the top of this spec file. +%else +%if 0%{?rcrev} +Patch00: patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} +Patch01: patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +Patch00: patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif +%endif + +%if %{using_upstream_branch} +### BRANCH PATCH ### +%endif + +Patch02: git-linus.diff + +# we always need nonintconfig, even for -vanilla kernels +Patch03: linux-2.6-build-nonintconfig.patch + +# we also need compile fixes for -vanilla +Patch04: linux-2.6-compile-fixes.patch + +# build tweak for build ID magic, even for -vanilla +Patch05: linux-2.6-makefile-after_link.patch + +###-vs- http://vserver.13thfloor.at/ExperimentalT/patch-2.6.31.6-vs2.3.0.36.27.diff +Patch06: patch-2.6.31.6-vs2.3.0.36.27.diff + +%if !%{nopatches} + +# revert upstream patches we get via other methods +Patch09: linux-2.6-upstream-reverts.patch +# Git trees. +Patch10: git-cpufreq.patch +Patch11: git-bluetooth.patch + +# Standalone patches +Patch20: linux-2.6-hotfixes.patch + +Patch21: linux-2.6-tracehook.patch +Patch22: linux-2.6-utrace.patch + +Patch30: sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +Patch31: disable-stackprotector-all.patch + +# Intel IOMMU fixes/workarounds +Patch100: linux-2.6-die-closed-source-bios-muppets-die.patch +Patch101: linux-2.6-intel-iommu-updates.patch +Patch102: linux-2.6-iommu-at-zero.patch +Patch103: linux-2.6-iommu-dmar-all-1s.patch +Patch104: linux-2.6-iommu-another-hp-screwup.patch +Patch105: linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +Patch106: linux-2.6-iommu-hp-cantiga-resume.patch + +Patch141: linux-2.6-ps3-storage-alias.patch +Patch143: linux-2.6-g5-therm-shutdown.patch +Patch144: linux-2.6-vio-modalias.patch +Patch147: linux-2.6-imac-transparent-bridge.patch + +Patch150: linux-2.6.29-sparc-IOC_TYPECHECK.patch + +Patch160: linux-2.6-execshield.patch + +Patch250: linux-2.6-debug-sizeof-structs.patch +Patch260: linux-2.6-debug-nmi-timeout.patch +Patch270: linux-2.6-debug-taint-vm.patch +Patch280: linux-2.6-debug-spinlock-taint.patch +Patch300: linux-2.6-driver-level-usb-autosuspend.diff +Patch302: linux-2.6-qcserial-autosuspend.diff +Patch303: linux-2.6-bluetooth-autosuspend.diff +Patch304: linux-2.6-usb-uvc-autosuspend.diff +Patch340: linux-2.6-debug-vm-would-have-oomkilled.patch +Patch360: linux-2.6-debug-always-inline-kzalloc.patch +Patch380: linux-2.6-defaults-pci_no_msi.patch +Patch381: linux-2.6-pciehp-update.patch +Patch382: linux-2.6-defaults-pciehp.patch +Patch383: linux-2.6-defaults-aspm.patch +Patch390: linux-2.6-defaults-acpi-video.patch +Patch391: linux-2.6-acpi-video-dos.patch +Patch450: linux-2.6-input-kill-stupid-messages.patch +Patch451: linux-2.6-input-fix-toshiba-hotkeys.patch +Patch452: linux-2.6.30-no-pcspkr-modalias.patch + +Patch460: linux-2.6-serial-460800.patch + +Patch470: die-floppy-die.patch + +Patch500: linux-2.6.31-copy_from_user-bounds.patch + +Patch510: linux-2.6-silence-noise.patch +Patch520: linux-2.6.30-hush-rom-warning.patch +Patch530: 
linux-2.6-silence-fbcon-logo.patch +Patch570: linux-2.6-selinux-mprotect-checks.patch +Patch580: linux-2.6-sparc-selinux-mprotect-checks.patch + +Patch600: linux-2.6-defaults-alsa-hda-beep-off.patch +Patch601: linux-2.6-alsa-improve-hda-powerdown.patch +Patch610: hda_intel-prealloc-4mb-dmabuffer.patch +Patch611: alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +Patch670: linux-2.6-ata-quirk.patch +Patch671: linux-2.6-ahci-export-capabilities.patch + +Patch680: prism54-remove-pci-dev-table.patch +Patch681: linux-2.6-ath9k-fixes.patch + +Patch800: linux-2.6-crash-driver.patch + +Patch900: linux-2.6-pci-cacheline-sizing.patch + +# ACPI +Patch1100: linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +Patch1110: acpi-ec-merge-irq-and-poll-modes.patch +Patch1120: acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +Patch1130: acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +Patch1515: lirc-2.6.31.patch +Patch1517: hdpvr-ir-enable.patch +Patch1518: hid-ignore-all-recent-imon-devices.patch + +# virt + ksm patches +Patch1550: linux-2.6-ksm.patch +Patch1551: linux-2.6-ksm-kvm.patch +Patch1552: linux-2.6-ksm-updates.patch +Patch1553: linux-2.6-ksm-fix-munlock.patch +Patch1554: linux-2.6-ksm-updates-from-32.patch +Patch1579: linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +Patch1583: linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +Patch1584: linux-2.6-xen-improvement-to-wait_for_devices.patch +Patch1585: linux-2.6-xen-increase-device-connection-timeout.patch +Patch1586: linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# nouveau + drm fixes +Patch1810: kms-offb-handoff.patch +Patch1812: drm-next-b390f944.patch +Patch1813: drm-radeon-pm.patch +Patch1814: drm-nouveau.patch +Patch1818: drm-i915-resume-force-mode.patch +# intel drm is all merged upstream +Patch1824: drm-intel-next.patch +Patch1825: drm-intel-pm.patch +Patch1826: drm-intel-no-tv-hotplug.patch +Patch1827: drm-i915-fix-tvmode-oops.patch +Patch1831: drm-conservative-fallback-modes.patch +Patch1832: drm-edid-retry.patch +Patch1834: drm-edid-header-fixup.patch +Patch1835: drm-default-mode.patch +Patch1837: drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +Patch1839: drm-radeon-misc-fixes.patch +Patch1840: drm-radeon-rv410-test-fix.patch + +# vga arb +Patch1900: linux-2.6-vga-arb.patch +Patch1901: drm-vga-arb.patch +Patch1902: drm-radeon-kms-arbiter-return-ignore.patch + +# make harmless fbcon debug less loud +Patch1903: fbcon-lower-debug.patch + +# kludge to make ich9 e1000 work +Patch2000: linux-2.6-e1000-ich9.patch + +# linux1394 git patches +Patch2200: linux-2.6-firewire-git-update.patch +Patch2201: linux-2.6-firewire-git-pending.patch + +# Quiet boot fixes +# silence the ACPI blacklist code +Patch2802: linux-2.6-silence-acpi-blacklist.patch + +Patch2899: linux-2.6-v4l-dvb-fixes.patch +Patch2900: linux-2.6-v4l-dvb-update.patch +Patch2901: linux-2.6-v4l-dvb-experimental.patch +Patch2904: v4l-dvb-fix-cx25840-firmware-loading.patch + +# fs fixes + +#btrfs +Patch3000: linux-2.6-btrfs-upstream.patch + +# NFSv4 +Patch3050: linux-2.6-nfsd4-proots.patch +Patch3060: linux-2.6-nfs4-ver4opt.patch +Patch3061: linux-2.6-nfs4-callback-hidden.patch + +# VIA Nano / VX8xx updates +Patch11010: via-hwmon-temp-sensor.patch + +# patches headed upstream +Patch12010: linux-2.6-dell-laptop-rfkill-fix.patch +Patch12011: linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch +Patch12012: linux-2.6-rtc-show-hctosys.patch +Patch12013: 
linux-2.6-rfkill-all.patch +Patch12014: linux-2.6-selinux-module-load-perms.patch + +# sched fixes cherry-picked from 2.6.32 +Patch13100: sched-deal-with-low-load-in-wake-affine.patch +Patch13101: sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +Patch13102: sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +Patch13110: sched-retune-scheduler-latency-defaults.patch +# Fix huge wakeup latencies +Patch13120: sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +Patch14010: perf-make-perf-counter-h-available-to-userspace.patch + +# fix resource counter issues on *big* machines +Patch14101: improve-resource-counter-scalability.patch + +# fix perf for sysprof +Patch14420: perf-events-fix-swevent-hrtimer-sampling.patch +Patch14421: perf-events-dont-generate-events-for-the-idle-task.patch + +Patch14430: crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +Patch14451: tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +Patch14452: tg3-02-fix-tso-test-against-wrong-flags-var.patch +Patch14453: tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +Patch14454: tg3-04-prevent-tx-bd-corruption.patch +Patch14455: tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +Patch14456: tg3-06-fix-5906-transmit-hangs.patch + +Patch14460: highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +Patch14461: highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +Patch14462: highmem-fix-arm-powerpc-kmap_types.patch + +Patch14463: dlm-fix-connection-close-handling.patch + +# rhbz#544144 [bbf31bf18d34caa87dd01f08bf713635593697f2] +Patch14464: ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +%endif + +BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root + +%description +The kernel package contains the Linux kernel (vmlinuz), the core of any +Linux operating system. The kernel handles the basic functions +of the operating system: memory allocation, process allocation, device +input and output, etc. + + +%package doc +Summary: Various documentation bits found in the kernel source +Group: Documentation +%description doc +This package contains documentation files from the kernel +source. Various bits of information about the Linux kernel and the +device drivers shipped with it are documented in these files. + +You'll want to install this package if you need a reference to the +options that can be passed to Linux kernel modules at load time. + + +%package headers +Summary: Header files for the Linux kernel for use by glibc +Group: Development/System +Obsoletes: glibc-kernheaders +Provides: glibc-kernheaders = 3.0-46 +%description headers +Kernel-headers includes the C header files that specify the interface +between the Linux kernel and userspace libraries and programs. The +header files define structures and constants that are needed for +building most standard programs and are also needed for rebuilding the +glibc package. + +%package firmware +Summary: Firmware files used by the Linux kernel +Group: Development/System +# This is... complicated. +# Look at the WHENCE file. +License: GPL+ and GPLv2+ and MIT and Redistributable, no modification permitted +%if "x%{?variant}" != "x" +Provides: kernel-firmware = %{rpmversion}-%{pkg_release} +%endif +%description firmware +Kernel-firmware includes firmware files required for some devices to +operate. 
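+# Editor's note (illustrative): since Name: is kernel%%{?variant}, a variant
+# build (e.g. %%{variant} = -vanilla) ships this subpackage as
+# kernel-vanilla-firmware; the conditional Provides above appears intended to
+# keep the plain "kernel-firmware >= ..." requirement from %%{kernel_prereq}
+# satisfiable, which can be checked on an installed system with e.g.
+#   rpm -q --provides kernel-vanilla-firmware | grep '^kernel-firmware'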
+ +%package bootwrapper +Summary: Boot wrapper files for generating combined kernel + initrd images +Group: Development/System +Requires: gzip +%description bootwrapper +Kernel-bootwrapper contains the wrapper code which makes bootable "zImage" +files combining both kernel and initial ramdisk. + +%package debuginfo-common-%{_target_cpu} +Summary: Kernel source files used by %{name}-debuginfo packages +Group: Development/Debug +%description debuginfo-common-%{_target_cpu} +This package is required by %{name}-debuginfo subpackages. +It provides the kernel source files common to all builds. + +%package -n perf +Summary: Performance monitoring for the Linux kernel +Group: Development/System +License: GPLv2 +%description -n perf +This package provides the supporting documentation for the perf tool +shipped in each kernel image subpackage. + +# +# This macro creates a kernel--debuginfo package. +# %%kernel_debuginfo_package +# +%define kernel_debuginfo_package() \ +%package %{?1:%{1}-}debuginfo\ +Summary: Debug information for package %{name}%{?1:-%{1}}\ +Group: Development/Debug\ +Requires: %{name}-debuginfo-common-%{_target_cpu} = %{version}-%{release}\ +Provides: %{name}%{?1:-%{1}}-debuginfo-%{_target_cpu} = %{version}-%{release}\ +AutoReqProv: no\ +%description -n %{name}%{?1:-%{1}}-debuginfo\ +This package provides debug information for package %{name}%{?1:-%{1}}.\ +This is required to use SystemTap with %{name}%{?1:-%{1}}-%{KVERREL}.\ +%{expand:%%global debuginfo_args %{?debuginfo_args} -p '/.*/%%{KVERREL}%{?1:\.%{1}}/.*|/.*%%{KVERREL}%{?1:\.%{1}}(\.debug)?' -o debuginfo%{?1}.list}\ +%{nil} + +# +# This macro creates a kernel--devel package. +# %%kernel_devel_package +# +%define kernel_devel_package() \ +%package %{?1:%{1}-}devel\ +Summary: Development package for building kernel modules to match the %{?2:%{2} }kernel\ +Group: System Environment/Kernel\ +Provides: kernel%{?1:-%{1}}-devel-%{_target_cpu} = %{version}-%{release}\ +Provides: kernel-devel-%{_target_cpu} = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel-uname-r = %{KVERREL}%{?1:.%{1}}\ +AutoReqProv: no\ +Requires(pre): /usr/bin/find\ +%description -n kernel%{?variant}%{?1:-%{1}}-devel\ +This package provides kernel headers and makefiles sufficient to build modules\ +against the %{?2:%{2} }kernel package.\ +%{nil} + +# +# This macro creates a kernel- and its -devel and -debuginfo too. +# %%define variant_summary The Linux kernel compiled for +# %%kernel_variant_package [-n ] +# +%define kernel_variant_package(n:) \ +%package %1\ +Summary: %{variant_summary}\ +Group: System Environment/Kernel\ +%kernel_reqprovconf\ +%{expand:%%kernel_devel_package %1 %{!?-n:%1}%{?-n:%{-n*}}}\ +%{expand:%%kernel_debuginfo_package %1}\ +%{nil} + + +# First the auxiliary packages of the main kernel package. +%kernel_devel_package +%kernel_debuginfo_package + + +# Now, each variant package. + +%define variant_summary The Linux kernel compiled for SMP machines +%kernel_variant_package -n SMP smp +%description smp +This package includes a SMP version of the Linux kernel. It is +required only on machines with two or more CPUs as well as machines with +hyperthreading technology. + +Install the kernel-smp package if your machine uses two or more CPUs. + + +%define variant_summary The Linux kernel compiled for PAE capable machines +%kernel_variant_package PAE +%description PAE +This package includes a version of the Linux kernel with support for up to +64GB of high memory. 
It requires a CPU with Physical Address Extensions (PAE). +The non-PAE kernel can only address up to 4GB of memory. +Install the kernel-PAE package if your machine has more than 4GB of memory. + + +%define variant_summary The Linux kernel compiled with extra debugging enabled for PAE capable machines +%kernel_variant_package PAEdebug +Obsoletes: kernel-PAE-debug +%description PAEdebug +This package includes a version of the Linux kernel with support for up to +64GB of high memory. It requires a CPU with Physical Address Extensions (PAE). +The non-PAE kernel can only address up to 4GB of memory. +Install the kernel-PAE package if your machine has more than 4GB of memory. + +This variant of the kernel has numerous debugging options enabled. +It should only be installed when trying to gather additional information +on kernel bugs, as some of these options impact performance noticably. + + +%define variant_summary The Linux kernel compiled with extra debugging enabled +%kernel_variant_package debug +%description debug +The kernel package contains the Linux kernel (vmlinuz), the core of any +Linux operating system. The kernel handles the basic functions +of the operating system: memory allocation, process allocation, device +input and output, etc. + +This variant of the kernel has numerous debugging options enabled. +It should only be installed when trying to gather additional information +on kernel bugs, as some of these options impact performance noticably. + + +%define variant_summary A minimal Linux kernel compiled for crash dumps +%kernel_variant_package kdump +%description kdump +This package includes a kdump version of the Linux kernel. It is +required only on machines which will use the kexec-based kernel crash dump +mechanism. + + +%prep +# do a few sanity-checks for --with *only builds +%if %{with_baseonly} +%if !%{with_up}%{with_pae} +echo "Cannot build --with baseonly, up build is disabled" +exit 1 +%endif +%endif + +%if %{with_smponly} +%if !%{with_smp} +echo "Cannot build --with smponly, smp build is disabled" +exit 1 +%endif +%endif + +# more sanity checking; do it quietly +if [ "%{patches}" != "%%{patches}" ] ; then + for patch in %{patches} ; do + if [ ! -f $patch ] ; then + echo "ERROR: Patch ${patch##/*/} listed in specfile but is missing" + exit 1 + fi + done +fi 2>/dev/null + +patch_command='patch -p1 -F1 -s' +ApplyPatch() +{ + local patch=$1 + shift + if [ ! -f $RPM_SOURCE_DIR/$patch ]; then + exit 1 + fi + if ! egrep "^Patch[0-9]+: $patch\$" %{_specdir}/${RPM_PACKAGE_NAME%%%%%{?variant}}.spec ; then + if [ "${patch:0:10}" != "patch-2.6." ] ; then + echo "ERROR: Patch $patch not listed as a source patch in specfile" + exit 1 + fi + fi 2>/dev/null + case "$patch" in + *.bz2) bunzip2 < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;; + *.gz) gunzip < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;; + *) $patch_command ${1+"$@"} < "$RPM_SOURCE_DIR/$patch" ;; + esac +} + +# don't apply patch if it's empty +ApplyOptionalPatch() +{ + local patch=$1 + shift + if [ ! -f $RPM_SOURCE_DIR/$patch ]; then + exit 1 + fi + local C=$(wc -l $RPM_SOURCE_DIR/$patch | awk '{print $1}') + if [ "$C" -gt 9 ]; then + ApplyPatch $patch ${1+"$@"} + fi +} + +# we don't want a .config file when building firmware: it just confuses the build system +%define build_firmware \ + mv .config .config.firmware_save \ + make INSTALL_FW_PATH=$RPM_BUILD_ROOT/lib/firmware firmware_install \ + mv .config.firmware_save .config + +# First we unpack the kernel tarball. 
+# If this isn't the first make prep, we use links to the existing clean tarball +# which speeds things up quite a bit. + +# Update to latest upstream. +%if 0%{?released_kernel} +%define vanillaversion 2.6.%{base_sublevel} +# non-released_kernel case +%else +%if 0%{?rcrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev} +%if 0%{?gitrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev} +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +%define vanillaversion 2.6.%{base_sublevel}-git%{gitrev} +%endif +%endif +%endif + +# We can share hardlinked source trees by putting a list of +# directory names of the CVS checkouts that we want to share +# with in .shared-srctree. (Full pathnames are required.) +[ -f .shared-srctree ] && sharedirs=$(cat .shared-srctree) + +if [ ! -d kernel-%{kversion}/vanilla-%{vanillaversion} ]; then + + if [ -d kernel-%{kversion}/vanilla-%{kversion} ]; then + + cd kernel-%{kversion} + + # Any vanilla-* directories other than the base one are stale. + for dir in vanilla-*; do + [ "$dir" = vanilla-%{kversion} ] || rm -rf $dir & + done + + else + + # Ok, first time we do a make prep. + rm -f pax_global_header + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then +%setup -q -n kernel-%{kversion} -c -T + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{kversion} . + else +%setup -q -n kernel-%{kversion} -c + mv linux-%{kversion} vanilla-%{kversion} + fi + + fi + +%if "%{kversion}" != "%{vanillaversion}" + + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} . + + else + + cp -rl vanilla-%{kversion} vanilla-%{vanillaversion} + cd vanilla-%{vanillaversion} + +# Update vanilla to the latest upstream. +# (non-released_kernel case only) +%if 0%{?rcrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif + + cd .. + + fi + +%endif + +else + # We already have a vanilla dir. + cd kernel-%{kversion} +fi + +if [ -d linux-%{kversion}.%{_target_cpu} ]; then + # Just in case we ctrl-c'd a prep already + rm -rf deleteme.%{_target_cpu} + # Move away the stale away, and delete in background. + mv linux-%{kversion}.%{_target_cpu} deleteme.%{_target_cpu} + rm -rf deleteme.%{_target_cpu} & +fi + +cp -rl vanilla-%{vanillaversion} linux-%{kversion}.%{_target_cpu} + +cd linux-%{kversion}.%{_target_cpu} + +# released_kernel with possible stable updates +%if 0%{?stable_base} +ApplyPatch %{stable_patch_00} +%endif +%if 0%{?stable_rc} +ApplyPatch %{stable_patch_01} +%endif + +%if %{using_upstream_branch} +### BRANCH APPLY ### +%endif + +# Drop some necessary files from the source dir into the buildroot +cp $RPM_SOURCE_DIR/config-* . +cp %{SOURCE15} . 
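+# Editor's note: %%{SOURCE15} is merge.pl (Source15 above); the RHEL branch
+# below uses it to overlay config-rhel-generic onto each arch config, roughly
+#   ./merge.pl config-rhel-generic <base-config> > <merged-config>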
+ +# Dynamically generate kernel .config files from config-* files +make -f %{SOURCE20} VERSION=%{version} configs + +#if a rhel kernel, apply the rhel config options +%if 0%{?rhel} + for i in %{all_arch_configs} + do + mv $i $i.tmp + ./merge.pl config-rhel-generic $i.tmp > $i + rm $i.tmp + done +%endif + +#ApplyOptionalPatch git-linus.diff + +# This patch adds a "make nonint_oldconfig" which is non-interactive and +# also gives a list of missing options at the end. Useful for automated +# builds (as used in the buildsystem). +ApplyPatch linux-2.6-build-nonintconfig.patch + +ApplyPatch linux-2.6-makefile-after_link.patch + +###-vs- +ApplyPatch patch-2.6.31.6-vs2.3.0.36.27.diff + +# +# misc small stuff to make things compile +# +ApplyOptionalPatch linux-2.6-compile-fixes.patch + +%if !%{nopatches} + +# revert patches from upstream that conflict or that we get via other means +ApplyOptionalPatch linux-2.6-upstream-reverts.patch -R + +ApplyOptionalPatch git-cpufreq.patch +#ApplyOptionalPatch git-bluetooth.patch + +ApplyPatch linux-2.6-hotfixes.patch + +# Roland's utrace ptrace replacement. +ApplyPatch linux-2.6-tracehook.patch +###-vs- +ApplyPatch linux-2.6-utrace.patch -F3 + +ApplyPatch sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +ApplyPatch disable-stackprotector-all.patch + +# Architecture patches +# x86(-64) +ApplyPatch via-hwmon-temp-sensor.patch +ApplyPatch linux-2.6-dell-laptop-rfkill-fix.patch + +# +# Intel IOMMU +# +# Quiesce USB host controllers before setting up the IOMMU +ApplyPatch linux-2.6-die-closed-source-bios-muppets-die.patch +# Some performance fixes, unify hardware/software passthrough support, and +# most importantly: notice when the BIOS points us to a region that returns +# all 0xFF, and claims that there's an IOMMU there. +ApplyPatch linux-2.6-intel-iommu-updates.patch +ApplyPatch linux-2.6-iommu-at-zero.patch +ApplyPatch linux-2.6-iommu-dmar-all-1s.patch +# Check for RMRRs which end before they start +ApplyPatch linux-2.6-iommu-another-hp-screwup.patch +# Apply the 'at zero' and 'all 0xFF' sanity checks for intr_remap too +ApplyPatch linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +# Fix up MMIO BAR for integrated graphics on HP laptops on resume (#536675) +ApplyPatch linux-2.6-iommu-hp-cantiga-resume.patch + +# +# PowerPC +# +### NOT (YET) UPSTREAM: +# The storage alias patch is Fedora-local, and allows the old 'ps3_storage' +# module name to work on upgrades. 
Otherwise, I believe mkinitrd will fail +# to pull the module in, +ApplyPatch linux-2.6-ps3-storage-alias.patch +# Alleviate G5 thermal shutdown problems +ApplyPatch linux-2.6-g5-therm-shutdown.patch +# Provide modalias in sysfs for vio devices +ApplyPatch linux-2.6-vio-modalias.patch +# Work around PCIe bridge setup on iSight +ApplyPatch linux-2.6-imac-transparent-bridge.patch + +# +# SPARC64 +# +ApplyPatch linux-2.6.29-sparc-IOC_TYPECHECK.patch + +# +# Exec shield +# +###-vs- +ApplyPatch linux-2.6-execshield.patch -F3 + +# +# bugfixes to drivers and filesystems +# + +# ext4 + +# xfs + +# btrfs +###-vs- +ApplyPatch linux-2.6-btrfs-upstream.patch + +# eCryptfs + +# NFSv4 +ApplyPatch linux-2.6-nfsd4-proots.patch +ApplyPatch linux-2.6-nfs4-ver4opt.patch +ApplyPatch linux-2.6-nfs4-callback-hidden.patch + +# USB +ApplyPatch linux-2.6-driver-level-usb-autosuspend.diff +ApplyPatch linux-2.6-qcserial-autosuspend.diff +ApplyPatch linux-2.6-bluetooth-autosuspend.diff +ApplyPatch linux-2.6-usb-uvc-autosuspend.diff + +# ACPI +ApplyPatch linux-2.6-defaults-acpi-video.patch +ApplyPatch linux-2.6-acpi-video-dos.patch +# cpuidle: Fix the menu governor to boost IO performance +ApplyPatch linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +ApplyPatch acpi-ec-merge-irq-and-poll-modes.patch +ApplyPatch acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +ApplyPatch acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +# Various low-impact patches to aid debugging. +ApplyPatch linux-2.6-debug-sizeof-structs.patch +ApplyPatch linux-2.6-debug-nmi-timeout.patch +ApplyPatch linux-2.6-debug-taint-vm.patch +ApplyPatch linux-2.6-debug-spinlock-taint.patch +###-vs- +ApplyPatch linux-2.6-debug-vm-would-have-oomkilled.patch +ApplyPatch linux-2.6-debug-always-inline-kzalloc.patch + +# +# PCI +# +# disable message signaled interrupts +ApplyPatch linux-2.6-defaults-pci_no_msi.patch +# update the pciehp driver +#ApplyPatch linux-2.6-pciehp-update.patch +# default to enabling passively listening for hotplug events +#ApplyPatch linux-2.6-defaults-pciehp.patch +# enable ASPM by default on hardware we expect to work +ApplyPatch linux-2.6-defaults-aspm.patch + +# +# SCSI Bits. +# + +# ALSA +# squelch hda_beep by default +ApplyPatch linux-2.6-defaults-alsa-hda-beep-off.patch +ApplyPatch linux-2.6-alsa-improve-hda-powerdown.patch +ApplyPatch hda_intel-prealloc-4mb-dmabuffer.patch +ApplyPatch alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +# Networking + +# Misc fixes +# The input layer spews crap no-one cares about. +ApplyPatch linux-2.6-input-kill-stupid-messages.patch + +# stop floppy.ko from autoloading during udev... 
+ApplyPatch die-floppy-die.patch + +# make copy_from_user to a stack slot provable right +# hosed stuff, just drop this close to beta +#ApplyPatch linux-2.6.31-copy_from_user-bounds.patch + +# Get away from having to poll Toshibas +#ApplyPatch linux-2.6-input-fix-toshiba-hotkeys.patch + +ApplyPatch linux-2.6.30-no-pcspkr-modalias.patch + +# Allow to use 480600 baud on 16C950 UARTs +ApplyPatch linux-2.6-serial-460800.patch + +# Silence some useless messages that still get printed with 'quiet' +ApplyPatch linux-2.6-silence-noise.patch +ApplyPatch linux-2.6.30-hush-rom-warning.patch + +# Make fbcon not show the penguins with 'quiet' +ApplyPatch linux-2.6-silence-fbcon-logo.patch + +# Fix the SELinux mprotect checks on executable mappings +#ApplyPatch linux-2.6-selinux-mprotect-checks.patch +# Fix SELinux for sparc +#ApplyPatch linux-2.6-sparc-selinux-mprotect-checks.patch + +# Changes to upstream defaults. + + +# ia64 ata quirk +ApplyPatch linux-2.6-ata-quirk.patch + +# Make it possible to identify non-hotplug SATA ports +ApplyPatch linux-2.6-ahci-export-capabilities.patch + +# prism54: remove pci modinfo device table +ApplyPatch prism54-remove-pci-dev-table.patch + +# ath9k: add fixes suggested by upstream maintainer +ApplyPatch linux-2.6-ath9k-fixes.patch + +# /dev/crash driver. +ApplyPatch linux-2.6-crash-driver.patch + +# Determine cacheline sizes in a generic manner. +ApplyPatch linux-2.6-pci-cacheline-sizing.patch + +# http://www.lirc.org/ +ApplyPatch lirc-2.6.31.patch +# enable IR receiver on Hauppauge HD PVR (v4l-dvb merge pending) +ApplyPatch hdpvr-ir-enable.patch +# tell usbhid to ignore all imon devices (sent upstream 2009.07.31) +ApplyPatch hid-ignore-all-recent-imon-devices.patch + +# Add kernel KSM support +ApplyPatch linux-2.6-ksm.patch +ApplyPatch linux-2.6-ksm-updates.patch +ApplyPatch linux-2.6-ksm-fix-munlock.patch +ApplyPatch linux-2.6-ksm-updates-from-32.patch +# Optimize KVM for KSM support +ApplyPatch linux-2.6-ksm-kvm.patch + +# Assorted Virt Fixes +ApplyPatch linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +ApplyPatch linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +ApplyPatch linux-2.6-xen-improvement-to-wait_for_devices.patch +ApplyPatch linux-2.6-xen-increase-device-connection-timeout.patch +ApplyPatch linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# Fix block I/O errors in KVM +ApplyPatch linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch + +ApplyPatch linux-2.6-e1000-ich9.patch + +# Nouveau DRM + drm fixes +ApplyPatch kms-offb-handoff.patch +ApplyPatch drm-next-b390f944.patch +ApplyPatch drm-radeon-misc-fixes.patch +ApplyPatch drm-radeon-rv410-test-fix.patch +ApplyPatch drm-conservative-fallback-modes.patch +ApplyPatch drm-edid-retry.patch +ApplyPatch drm-edid-header-fixup.patch +ApplyPatch drm-default-mode.patch + +ApplyPatch drm-nouveau.patch +# pm broken on my thinkpad t60p - airlied +#ApplyPatch drm-radeon-pm.patch +ApplyPatch drm-i915-resume-force-mode.patch +ApplyOptionalPatch drm-intel-next.patch +#this appears to be upstream - mjg59? 
+#ApplyPatch drm-intel-pm.patch +ApplyPatch drm-intel-no-tv-hotplug.patch +ApplyPatch drm-i915-fix-tvmode-oops.patch +ApplyPatch drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +#ApplyPatch drm-disable-r600-aspm.patch + +# VGA arb + drm +ApplyPatch linux-2.6-vga-arb.patch +ApplyPatch drm-vga-arb.patch +ApplyPatch drm-radeon-kms-arbiter-return-ignore.patch + +# Lower debug level of fbcon handover messages (rh#538526) +ApplyPatch fbcon-lower-debug.patch + +# linux1394 git patches +# apply if non-empty +ApplyOptionalPatch linux-2.6-firewire-git-update.patch +ApplyOptionalPatch linux-2.6-firewire-git-pending.patch + +# silence the ACPI blacklist code +ApplyPatch linux-2.6-silence-acpi-blacklist.patch + +# V4L/DVB updates/fixes/experimental drivers +# apply if non-empty +ApplyOptionalPatch linux-2.6-v4l-dvb-fixes.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-update.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-experimental.patch + +ApplyPatch v4l-dvb-fix-cx25840-firmware-loading.patch + +# Patches headed upstream +ApplyPatch linux-2.6-rtc-show-hctosys.patch +ApplyPatch linux-2.6-rfkill-all.patch +ApplyPatch linux-2.6-selinux-module-load-perms.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +ApplyPatch perf-make-perf-counter-h-available-to-userspace.patch + +ApplyPatch improve-resource-counter-scalability.patch + +# fix perf for sysprof +ApplyPatch perf-events-fix-swevent-hrtimer-sampling.patch +ApplyPatch perf-events-dont-generate-events-for-the-idle-task.patch + +# Fix oops in padlock +ApplyPatch crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +ApplyPatch tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +ApplyPatch tg3-02-fix-tso-test-against-wrong-flags-var.patch +ApplyPatch tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +ApplyPatch tg3-04-prevent-tx-bd-corruption.patch +ApplyPatch tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +ApplyPatch tg3-06-fix-5906-transmit-hangs.patch + +# sched fixes cherry-picked from 2.6.32 +ApplyPatch sched-deal-with-low-load-in-wake-affine.patch +ApplyPatch sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +ApplyPatch sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +ApplyPatch sched-retune-scheduler-latency-defaults.patch +# fix wakeup latency +ApplyPatch sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +ApplyPatch highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +ApplyPatch highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +ApplyPatch highmem-fix-arm-powerpc-kmap_types.patch + +ApplyPatch dlm-fix-connection-close-handling.patch + +# rhbz#544144 +ApplyPatch ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +# END OF PATCH APPLICATIONS + +%endif + +# Any further pre-build tree manipulations happen here. 
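+# Editor's note on the config handling below: each kernel-%%{version}-*.config
+# file carries its target arch in its first line (e.g. "# x86_64"), which the
+# loop extracts with `head -1 .config | cut -b 3-` before running the
+# non-interactive %%{oldconfig_target} and saving the refreshed file under
+# configs/ for use in %%build.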
+ +chmod +x scripts/checkpatch.pl + +# only deal with configs if we are going to build for the arch +%ifnarch %nobuildarches + +mkdir configs + +# Remove configs not for the buildarch +for cfg in kernel-%{version}-*.config; do + if [ `echo %{all_arch_configs} | grep -c $cfg` -eq 0 ]; then + rm -f $cfg + fi +done + +%if !%{debugbuildsenabled} +rm -f kernel-%{version}-*debug.config +%endif + +# now run oldconfig over all the config files +for i in *.config +do + mv $i .config + Arch=`head -1 .config | cut -b 3-` + make ARCH=$Arch %{oldconfig_target} + echo "# $Arch" > configs/$i + cat .config >> configs/$i +done +# end of kernel config +%endif + +# get rid of unwanted files resulting from patch fuzz +find . \( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null + +cd .. + +### +### build +### +%build + +%if %{with_sparse} +%define sparse_mflags C=1 +%endif + +%if %{fancy_debuginfo} +# This override tweaks the kernel makefiles so that we run debugedit on an +# object before embedding it. When we later run find-debuginfo.sh, it will +# run debugedit again. The edits it does change the build ID bits embedded +# in the stripped object, but repeating debugedit is a no-op. We do it +# beforehand to get the proper final build ID bits into the embedded image. +# This affects the vDSO images in vmlinux, and the vmlinux image in bzImage. +export AFTER_LINK=\ +'sh -xc "/usr/lib/rpm/debugedit -b $$RPM_BUILD_DIR -d /usr/src/debug -i $@"' +%endif + +cp_vmlinux() +{ + eu-strip --remove-comment -o "$2" "$1" +} + +BuildKernel() { + MakeTarget=$1 + KernelImage=$2 + Flavour=$3 + InstallName=${4:-vmlinuz} + + # Pick the right config file for the kernel we're building + Config=kernel-%{version}-%{_target_cpu}${Flavour:+-${Flavour}}.config + DevelDir=/usr/src/kernels/%{KVERREL}${Flavour:+.${Flavour}} + + # When the bootable image is just the ELF kernel, strip it. + # We already copy the unstripped file into the debuginfo package. + if [ "$KernelImage" = vmlinux ]; then + CopyKernel=cp_vmlinux + else + CopyKernel=cp + fi + + KernelVer=%{version}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}} + echo BUILDING A KERNEL FOR ${Flavour} %{_target_cpu}... + + # make sure EXTRAVERSION says what we want it to say + perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{?stablerev}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}}/" Makefile + + # if pre-rc1 devel kernel, must fix up SUBLEVEL for our versioning scheme + %if !0%{?rcrev} + %if 0%{?gitrev} + perl -p -i -e 's/^SUBLEVEL.*/SUBLEVEL = %{upstream_sublevel}/' Makefile + %endif + %endif + + # and now to start the build process + + make -s mrproper + cp configs/$Config .config + + Arch=`head -1 .config | cut -b 3-` + echo USING ARCH=$Arch + + make -s ARCH=$Arch %{oldconfig_target} > /dev/null + make -s ARCH=$Arch V=1 %{?_smp_mflags} $MakeTarget %{?sparse_mflags} + make -s ARCH=$Arch V=1 %{?_smp_mflags} modules %{?sparse_mflags} || exit 1 + +%if %{with_perftool} + pushd tools/perf +# make sure the scripts are executable... 
won't be in tarball until 2.6.31 :/ + chmod +x util/generate-cmdlist.sh util/PERF-VERSION-GEN + make -s V=1 %{?_smp_mflags} perf + mkdir -p $RPM_BUILD_ROOT/usr/libexec/ + install -m 755 perf $RPM_BUILD_ROOT/usr/libexec/perf.$KernelVer + popd +%endif + + # Start installing the results +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/boot + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/%{image_install_path} +%endif + mkdir -p $RPM_BUILD_ROOT/%{image_install_path} + install -m 644 .config $RPM_BUILD_ROOT/boot/config-$KernelVer + install -m 644 System.map $RPM_BUILD_ROOT/boot/System.map-$KernelVer +%if %{with_dracut} + # We estimate the size of the initramfs because rpm needs to take this size + # into consideration when performing disk space calculations. (See bz #530778) + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initramfs-$KernelVer.img bs=1M count=20 +%else + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initrd-$KernelVer.img bs=1M count=5 +%endif + if [ -f arch/$Arch/boot/zImage.stub ]; then + cp arch/$Arch/boot/zImage.stub $RPM_BUILD_ROOT/%{image_install_path}/zImage.stub-$KernelVer || : + fi + $CopyKernel $KernelImage \ + $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + chmod 755 $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer + # Override $(mod-fw) because we don't want it to install any firmware + # We'll do that ourselves with 'make firmware_install' + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT modules_install KERNELRELEASE=$KernelVer mod-fw= +%ifarch %{vdso_arches} + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT vdso_install KERNELRELEASE=$KernelVer + if grep '^CONFIG_XEN=y$' .config >/dev/null; then + echo > ldconfig-kernel.conf "\ +# This directive teaches ldconfig to search in nosegneg subdirectories +# and cache the DSOs there with extra bit 0 set in their hwcap match +# fields. In Xen guest kernels, the vDSO tells the dynamic linker to +# search in nosegneg subdirectories and to match this extra hwcap bit +# in the ld.so.cache file. +hwcap 0 nosegneg" + fi + if [ ! -s ldconfig-kernel.conf ]; then + echo > ldconfig-kernel.conf "\ +# Placeholder file, no vDSO hwcap entries used in this kernel." 
+ fi + %{__install} -D -m 444 ldconfig-kernel.conf \ + $RPM_BUILD_ROOT/etc/ld.so.conf.d/kernel-$KernelVer.conf +%endif + + # And save the headers/makefiles etc for building modules against + # + # This all looks scary, but the end result is supposed to be: + # * all arch relevant include/ files + # * all Makefile/Kconfig files + # * all script/ files + + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/source + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + (cd $RPM_BUILD_ROOT/lib/modules/$KernelVer ; ln -s build source) + # dirs for additional modules per module-init-tools, kbuild/modules.txt + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/extra + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/updates + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/weak-updates + # first copy everything + cp --parents `find -type f -name "Makefile*" -o -name "Kconfig*"` $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp Module.symvers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp System.map $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -s Module.markers ]; then + cp Module.markers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + fi + # then drop all but the needed Makefiles/Kconfig files + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Documentation + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cp .config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp -a scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -d arch/$Arch/scripts ]; then + cp -a arch/$Arch/scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch} || : + fi + if [ -f arch/$Arch/*lds ]; then + cp -a arch/$Arch/*lds $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch}/ || : + fi + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*.o + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*/*.o +%ifarch ppc + cp -a --parents arch/powerpc/lib/crtsavres.[So] $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ +%endif + if [ -d arch/%{asmarch}/include ]; then + cp -a --parents arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ + fi + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cd include + cp -a acpi config crypto keys linux math-emu media mtd net pcmcia rdma rxrpc scsi sound trace video drm asm-generic $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + asmdir=$(readlink asm) + cp -a $asmdir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/ + pushd $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + ln -s $asmdir asm + popd + # Make sure the Makefile and version.h have a matching timestamp so that + # external modules can be built + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Makefile $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/version.h + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/autoconf.h + # Copy .config to include/config/auto.conf so "make prepare" is unnecessary. + cp $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/config/auto.conf + cd .. 
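+
+  # For reference only, nothing in this comment is executed: the build/ tree
+  # assembled above is what out-of-tree modules compile against once this
+  # kernel is installed, via the standard kbuild invocation (path shown
+  # illustratively with the KernelVer of this flavour):
+  #
+  #   make -C /lib/modules/$KernelVer/build M=$PWD modules
+  #
+  # which is why the Makefiles, Kconfig files, scripts/ and headers are copied
+  # in, and why the Makefile and version.h timestamps are synchronized above.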
+ + # + # save the vmlinux file for kernel debugging into the kernel-debuginfo rpm + # +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer + cp vmlinux $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer +%endif + + find $RPM_BUILD_ROOT/lib/modules/$KernelVer -name "*.ko" -type f >modnames + + # mark modules executable so that strip-to-file can strip them + xargs --no-run-if-empty chmod u+x < modnames + + # Generate a list of modules for block and networking. + + fgrep /drivers/ modnames | xargs --no-run-if-empty nm -upA | + sed -n 's,^.*/\([^/]*\.ko\): *U \(.*\)$,\1 \2,p' > drivers.undef + + collect_modules_list() + { + sed -r -n -e "s/^([^ ]+) \\.?($2)\$/\\1/p" drivers.undef | + LC_ALL=C sort -u > $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$1 + } + + collect_modules_list networking \ + 'register_netdev|ieee80211_register_hw|usbnet_probe' + collect_modules_list block \ + 'ata_scsi_ioctl|scsi_add_host|blk_init_queue|register_mtd_blktrans|scsi_esp_register|scsi_register_device_handler' + collect_modules_list drm \ + 'drm_open|drm_init' + collect_modules_list modesetting \ + 'drm_crtc_init' + + # detect missing or incorrect license tags + rm -f modinfo + while read i + do + echo -n "${i#$RPM_BUILD_ROOT/lib/modules/$KernelVer/} " >> modinfo + /sbin/modinfo -l $i >> modinfo + done < modnames + + egrep -v \ + 'GPL( v2)?$|Dual BSD/GPL$|Dual MPL/GPL$|GPL and additional rights$' \ + modinfo && exit 1 + + rm -f modinfo modnames + + # remove files that will be auto generated by depmod at rpm -i time + for i in alias alias.bin ccwmap dep dep.bin ieee1394map inputmap isapnpmap ofmap pcimap seriomap symbols symbols.bin usbmap + do + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$i + done + + # Move the devel headers out of the root file system + mkdir -p $RPM_BUILD_ROOT/usr/src/kernels + mv $RPM_BUILD_ROOT/lib/modules/$KernelVer/build $RPM_BUILD_ROOT/$DevelDir + ln -sf ../../..$DevelDir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build +} + +### +# DO it... +### + +# prepare directories +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT/boot + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_debug} +BuildKernel %make_target %kernel_image debug +%endif + +%if %{with_pae_debug} +BuildKernel %make_target %kernel_image PAEdebug +%endif + +%if %{with_pae} +BuildKernel %make_target %kernel_image PAE +%endif + +%if %{with_up} +BuildKernel %make_target %kernel_image +%endif + +%if %{with_smp} +BuildKernel %make_target %kernel_image smp +%endif + +%if %{with_kdump} +BuildKernel vmlinux vmlinux kdump vmlinux +%endif + +%if %{with_doc} +# Make the HTML and man pages. +# XXX nix %{?_smp_mflags} here, buggy Documentation/*/Makefile! +make htmldocs mandocs || %{doc_build_fail} + +# sometimes non-world-readable files sneak into the kernel source tree +chmod -R a=rX Documentation +find Documentation -type d | xargs chmod u+w +%endif + +%if %{with_perf} +pushd tools/perf +make %{?_smp_mflags} man || %{doc_build_fail} +popd +%endif + +### +### Special hacks for debuginfo subpackages. +### + +# This macro is used by %%install, so we must redefine it before that. 
+%define debug_package %{nil} + +%if %{fancy_debuginfo} +%define __debug_install_post \ + /usr/lib/rpm/find-debuginfo.sh %{debuginfo_args} %{_builddir}/%{?buildsubdir}\ +%{nil} +%endif + +%if %{with_debuginfo} +%ifnarch noarch +%global __debug_package 1 +%files -f debugfiles.list debuginfo-common-%{_target_cpu} +%defattr(-,root,root) +%endif +%endif + +### +### install +### + +%install + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_doc} +docdir=$RPM_BUILD_ROOT%{_datadir}/doc/kernel-doc-%{rpmversion} +man9dir=$RPM_BUILD_ROOT%{_datadir}/man/man9 + +# copy the source over +mkdir -p $docdir +tar -f - --exclude=man --exclude='.*' -c Documentation | tar xf - -C $docdir + +# Install man pages for the kernel API. +mkdir -p $man9dir +find Documentation/DocBook/man -name '*.9.gz' -print0 | +xargs -0 --no-run-if-empty %{__install} -m 444 -t $man9dir $m +ls $man9dir | grep -q '' || > $man9dir/BROKEN +%endif # with_doc + +# perf docs +%if %{with_perf} +mandir=$RPM_BUILD_ROOT%{_datadir}/man +man1dir=$mandir/man1 +pushd tools/perf/Documentation +make install-man mandir=$mandir +popd + +pushd $man1dir +for d in *.1; do + gzip $d; +done +popd +%endif # with_perf + +# perf shell wrapper +%if %{with_perf} +mkdir -p $RPM_BUILD_ROOT/usr/sbin/ +cp $RPM_SOURCE_DIR/perf $RPM_BUILD_ROOT/usr/sbin/perf +chmod 0755 $RPM_BUILD_ROOT/usr/sbin/perf +mkdir -p $RPM_BUILD_ROOT%{_datadir}/doc/perf +%endif + +%if %{with_headers} +# Install kernel headers +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_install + +# Do headers_check but don't die if it fails. +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_check \ + > hdrwarnings.txt || : +if grep -q exist hdrwarnings.txt; then + sed s:^$RPM_BUILD_ROOT/usr/include/:: hdrwarnings.txt + # Temporarily cause a build failure if header inconsistencies. + # exit 1 +fi + +find $RPM_BUILD_ROOT/usr/include \ + \( -name .install -o -name .check -o \ + -name ..install.cmd -o -name ..check.cmd \) | xargs rm -f + +# glibc provides scsi headers for itself, for now +rm -rf $RPM_BUILD_ROOT/usr/include/scsi +rm -f $RPM_BUILD_ROOT/usr/include/asm*/atomic.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/io.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/irq.h +%endif + +%if %{with_firmware} +%{build_firmware} +%endif + +%if %{with_bootwrapper} +make DESTDIR=$RPM_BUILD_ROOT bootwrapper_install WRAPPER_OBJDIR=%{_libdir}/kernel-wrapper WRAPPER_DTSDIR=%{_libdir}/kernel-wrapper/dts +%endif + + +### +### clean +### + +%clean +rm -rf $RPM_BUILD_ROOT + +### +### scripts +### + +# +# This macro defines a %%post script for a kernel*-devel package. +# %%kernel_devel_post [] +# +%define kernel_devel_post() \ +%{expand:%%post %{?1:%{1}-}devel}\ +if [ -f /etc/sysconfig/kernel ]\ +then\ + . /etc/sysconfig/kernel || exit $?\ +fi\ +if [ "$HARDLINK" != "no" -a -x /usr/sbin/hardlink ]\ +then\ + (cd /usr/src/kernels/%{KVERREL}%{?1:.%{1}} &&\ + /usr/bin/find . -type f | while read f; do\ + hardlink -c /usr/src/kernels/*.fc*.*/$f $f\ + done)\ +fi\ +%{nil} + +# This macro defines a %%posttrans script for a kernel package. +# %%kernel_variant_posttrans [] +# More text can follow to go at the end of this variant's %%post. +# +%define kernel_variant_posttrans() \ +%{expand:%%posttrans %{?1}}\ +/sbin/new-kernel-pkg --package kernel%{?1:-%{1}} --rpmposttrans %{KVERREL}%{?1:.%{1}} || exit $?\ +%{nil} + +# +# This macro defines a %%post script for a kernel package and its devel package. +# %%kernel_variant_post [-v ] [-r ] +# More text can follow to go at the end of this variant's %%post. 
+# +%define kernel_variant_post(v:r:) \ +%{expand:%%kernel_devel_post %{?-v*}}\ +%{expand:%%kernel_variant_posttrans %{?-v*}}\ +%{expand:%%post %{?-v*}}\ +%{-r:\ +if [ `uname -i` == "x86_64" -o `uname -i` == "i386" ] &&\ + [ -f /etc/sysconfig/kernel ]; then\ + /bin/sed -r -i -e 's/^DEFAULTKERNEL=%{-r*}$/DEFAULTKERNEL=kernel%{?-v:-%{-v*}}/' /etc/sysconfig/kernel || exit $?\ +fi}\ +%{expand:\ +%if %{with_dracut}\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --dracut --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%else\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%endif}\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --add-kernel %{KVERREL}%{?-v*} || exit $?\ +#fi\ +%{nil} + +# +# This macro defines a %%preun script for a kernel package. +# %%kernel_variant_preun +# +%define kernel_variant_preun() \ +%{expand:%%preun %{?1}}\ +/sbin/new-kernel-pkg --rminitrd --rmmoddep --remove %{KVERREL}%{?1:.%{1}} || exit $?\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --remove-kernel %{KVERREL}%{?1} || exit $?\ +#fi\ +%{nil} + +%kernel_variant_preun +%ifarch x86_64 +%kernel_variant_post -r (kernel-smp|kernel-xen) +%else +%kernel_variant_post -r kernel-smp +%endif + +%kernel_variant_preun smp +%kernel_variant_post -v smp + +%kernel_variant_preun PAE +%kernel_variant_post -v PAE -r (kernel|kernel-smp|kernel-xen) + +%kernel_variant_preun debug +%kernel_variant_post -v debug + +%kernel_variant_post -v PAEdebug -r (kernel|kernel-smp|kernel-xen) +%kernel_variant_preun PAEdebug + +if [ -x /sbin/ldconfig ] +then + /sbin/ldconfig -X || exit $? +fi + +### +### file lists +### + +%if %{with_headers} +%files headers +%defattr(-,root,root) +/usr/include/* +%endif + +%if %{with_firmware} +%files firmware +%defattr(-,root,root) +/lib/firmware/* +%doc linux-%{kversion}.%{_target_cpu}/firmware/WHENCE +%endif + +%if %{with_bootwrapper} +%files bootwrapper +%defattr(-,root,root) +/usr/sbin/* +%{_libdir}/kernel-wrapper +%endif + +# only some architecture builds need kernel-doc +%if %{with_doc} +%files doc +%defattr(-,root,root) +%{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation/* +%dir %{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation +%dir %{_datadir}/doc/kernel-doc-%{rpmversion} +%{_datadir}/man/man9/* +%endif + +%if %{with_perf} +%files -n perf +%defattr(-,root,root) +%{_datadir}/doc/perf +/usr/sbin/perf +%{_datadir}/man/man1/* +%endif + +# This is %{image_install_path} on an arch where that includes ELF files, +# or empty otherwise. +%define elf_image_install_path %{?kernel_image_elf:%{image_install_path}} + +# +# This macro defines the %%files sections for a kernel package +# and its devel and debuginfo packages. 
+# %%kernel_variant_files [-k vmlinux] +# +%define kernel_variant_files(k:) \ +%if %{1}\ +%{expand:%%files %{?2}}\ +%defattr(-,root,root)\ +/%{image_install_path}/%{?-k:%{-k*}}%{!?-k:vmlinuz}-%{KVERREL}%{?2:.%{2}}\ +/boot/System.map-%{KVERREL}%{?2:.%{2}}\ +%if %{with_perftool}\ +/usr/libexec/perf.%{KVERREL}%{?2:.%{2}}\ +%endif\ +#/boot/symvers-%{KVERREL}%{?2:.%{2}}.gz\ +/boot/config-%{KVERREL}%{?2:.%{2}}\ +%dir /lib/modules/%{KVERREL}%{?2:.%{2}}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/kernel\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/build\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/source\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/extra\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/updates\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/weak-updates\ +%ifarch %{vdso_arches}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/vdso\ +/etc/ld.so.conf.d/kernel-%{KVERREL}%{?2:.%{2}}.conf\ +%endif\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/modules.*\ +%if %{with_dracut}\ +/boot/initramfs-%{KVERREL}%{?2:.%{2}}.img\ +%else\ +/boot/initrd-%{KVERREL}%{?2:.%{2}}.img\ +%endif\ +%{expand:%%files %{?2:%{2}-}devel}\ +%defattr(-,root,root)\ +%dir /usr/src/kernels\ +%verify(not mtime) /usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%if %{with_debuginfo}\ +%ifnarch noarch\ +%if %{fancy_debuginfo}\ +%{expand:%%files -f debuginfo%{?2}.list %{?2:%{2}-}debuginfo}\ +%else\ +%{expand:%%files %{?2:%{2}-}debuginfo}\ +%endif\ +%defattr(-,root,root)\ +%if !%{fancy_debuginfo}\ +%if "%{elf_image_install_path}" != ""\ +%{debuginfodir}/%{elf_image_install_path}/*-%{KVERREL}%{?2:.%{2}}.debug\ +%endif\ +%{debuginfodir}/lib/modules/%{KVERREL}%{?2:.%{2}}\ +%{debuginfodir}/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%endif\ +%endif\ +%endif\ +%endif\ +%{nil} + + +%kernel_variant_files %{with_up} +%kernel_variant_files %{with_smp} smp +%kernel_variant_files %{with_debug} debug +%kernel_variant_files %{with_pae} PAE +%kernel_variant_files %{with_pae_debug} PAEdebug +%kernel_variant_files -k vmlinux %{with_kdump} kdump + +# plz don't put in a version string unless you're going to tag +# and build. + +%changelog +* Thu Dec 03 2009 Kyle McMartin 2.6.31.6-162 +- ipv4-fix-null-ptr-deref-in-ip_fragment.patch: null ptr deref + bug fix. + +* Thu Dec 03 2009 Dave Airlie 2.6.31.6-161 +- rv410 LVDS on resume test fix from AMD (#541562) + +* Wed Dec 02 2009 John W. Linville 2.6.31.6-160 +- ath9k: add fixes suggested by upstream maintainer + +* Wed Dec 02 2009 Dave Airlie 2.6.31.6-159 +- drm-radeon-misc-fixes.patch: r400 LVDS, r600 digital dpms, cursor fix, tv property + +* Wed Dec 02 2009 Ben Skeggs 2.6.31.6-158 +- nouveau: more complete lvds script selection on >=G80 (rh#522690, rh#529859) +- nouveau: more complete tmds script selection on >=G80 (rh#537853) +- nouveau: TV detection fixes + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-157 +- div/0 fix harder (#540593) - also ignore unposted GPUs with no BIOS + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-156 +- drm-next: fixes LVDS resume on r4xx, div/0 on no bios (#540593) + lockup on tv-out only startup. + +* Mon Nov 30 2009 Kyle McMartin +- drm-i915-fix-sync-to-vbl-when-vga-is-off.patch: add (rhbz#541670) + +* Sun Nov 29 2009 Kyle McMartin +- Drop linux-2.6-sysrq-c.patch, made consistent upstream. 
+ +* Fri Nov 27 2009 Jarod Wilson 2.6.31.6-153 +- add device name to lirc_zilog, fixes issues w/multiple target devices +- add lirc_imon pure input mode support for onboard decode devices + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-152 +- Fix intel_tv_mode_set oops (#540218) + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-151 +- VT-d: Work around yet more HP BIOS brokenness (#536675) + +* Wed Nov 25 2009 Kyle McMartin +- dlm: fix connection close handling. + Fix by lmb, requested by fabio. + +* Wed Nov 25 2009 David Woodhouse 2.6.31.6-149 +- VT-d: Work around more HP BIOS brokenness. + +* Tue Nov 24 2009 Dave Airlie 2.6.31.6-148 +- radeon: flush HDP cache on rendering wait - fixes r600 rendercheck failure + +* Mon Nov 23 2009 Adam Jackson +- drm-default-mode.patch: Default to 1024x768 to match UMS. (#538761) + +* Mon Nov 23 2009 Roland McGrath 2.6.31.6-146 +- Fix oops in x86-32 kernel's iret handling for bogus user %cs. (#540580) + +* Fri Nov 21 2009 Kyle McMartin +- Fix up ssp' highmem fixes with fixes for arm & ppc. + +* Thu Nov 20 2009 Chris Wright 2.6.31.6-144 +- VT-d: another fallback for another BIOS bug (#524808) + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-142 +- Oops, add new patch to spec file + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-141 +- Lower debug level of fbcon handover messages (rh#538526) + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-140 +- drm-next-44c83571.patch: oops pulled the wrong tree into my f12 tree + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-139 +- nouveau: s/r fixes on chipsets using bios opcode 0x87 +- nouveau: fixes to bios opcode 0x8e +- nouveau: hopefully fix nv1x context switching issues (rh#526577) +- nouveau: support for NVA5 (GeForce G220) +- nouveau: fixes for NVAA support + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-138 +- drm-next-d56672a9.patch: fix some rn50 cloning issues + +* Wed Nov 18 2009 David Woodhouse 2.6.31.6-137 +- Actually force the IOMMU not to be used when we detect the HP/Acer bug. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-136 +- ACPI embedded controller fixes from Fedora 11. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-135 +- Scheduler fixes and latency tuning patches from F-11. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-134 +- glad to see edid retry patch was compiled. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-133 +- drm-next-984d1f3c.patch: rebase with upstream fixes - drop all merged + +* Thu Nov 12 2009 Adam Jackson +- Actually apply the EDID retry patch +- drm-edid-header-fixup.patch: Fix up some broken EDID headers (#534120) + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-130 +- Use ApplyOptionalPatch for v4l and firewire updates. +- Drop unused v4l ABI fix. + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-129 +- Linux 2.6.31.6 +- Drop merged patches: + linux-2.6-iwlwifi-reduce-noise-when-skb-allocation-fails.patch + linux-2.6-libertas-crash.patch + pci-increase-alignment-to-make-more-space.patch + acpi-revert-attach-device-to-handle-early.patch + ahci-revert-restore-sb600-sata-controller-64-bit-dma.patch + acpi-pci-fix-null-pointer-dereference-in-acpi-get-pci-dev.patch + af_unix-fix-deadlock-connecting-to-shutdown-socket.patch + keys-get_instantiation_keyring-should-inc-the-keyring-refcount.patch + netlink-fix-typo-in-initialization.patch + fs-pipe-null-ptr-deref-fix.patch + +* Wed Nov 11 2009 Justin M. Forbes 2.6.31.5-128 +- Fix KSM for i686 users. (#532215) +- Add KSM fixes from 2.6.32 + +* Sun Nov 08 2009 David Woodhouse 2.6.31.5-127 +- Apply fix for fallback when HP/Acer BIOS bug detected (#524808) +- Re-enable DMAR. 
+- Fix libertas crash due to skb pointer bug + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-126 +- Re-enable linux-2.6-die-closed-source-bios-muppets-die.patch, DMAR + still defaulting to off. + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-125 +- Disable linux-2.6-die-closed-source-bios-muppets-die.patch and + default DMAR to off (can be re-enabled with intel_iommu=on on the + command line due to last minute issues and reversion upstream.) + +* Thu Nov 05 2009 Jarod Wilson +- Add --with dbgonly rpmbuild option to build only debug kernels + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-122 +- comment out kmap atomic for now, it breaks ppc build + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-121 +- drm-radeon-fix-agp-resume.patch (#531825) + +* Thu Nov 05 2009 Kyle McMartin +- Add two patches from Soren from mingo/linux-2.6-x86.git to fix + debug_kmap_atomic prints. + +* Thu Nov 05 2009 Ben Skeggs +- nouveau: fix rh#532924 + +* Wed Nov 04 2009 Kyle McMartin +- Make JBD2_DEBUG a toggleable debug setting. Leave it the way it was. + (Double checked resulting configs, don't fret.) + +* Wed Nov 04 2009 Adam Jackson 2.6.31.5-117 +- drm-edid-retry.patch: Try DDC up to four times, like X. (#532957) + +* Wed Nov 04 2009 Chuck Ebbert 2.6.31.5-116 +- tg3 bug fixes (#527209) + +* Wed Nov 04 2009 Kyle McMartin 2.6.31.5-115 +- fs/pipe.c: fix null pointer dereference (CVE-2009-3547) + +* Wed Nov 04 2009 Ben Skeggs 2.6.31.5-114 +- nouveau: provide info userspace needs to handle low memory situations +- nouveau: fix for rh#532711 +- nouveau: add option to provide more debug info for rh#532579 +- patch only so large because of included register rename + +* Tue Nov 03 2009 Adam Jackson 2.6.31.5-113 +- drm-conservative-fallback-modes.patch: When an output is connected but + fails EDID, only add modes with refresh rates <= 60 (#514600) + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-112 +- drm-r600-lenovo-w500-fix.patch: add second patch from upstream fix + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-111 +- drm-r600-lenovo-w500-fix.patch: fix lenovo w500 acpi video kill laptop dead +- drop aspm r600 patch as correct fix should be in 110 + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-110 +- r600: fix for ring setup RMW issue. + +* Mon Nov 02 2009 John W. Linville 2.6.31.5-109 +- prism54: remove pci modinfo device table (#447047) + +* Mon Nov 02 2009 Chuck Ebbert 2.6.31.5-108 +- Enable acerhdf driver for fan speed control on Acer Aspire One notebook (#532463) + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-107 +- r600: back that out, thanks to yaneti for testing. + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-106 +- r600: ring size guesswork fix. + +* Fri Oct 30 2009 Dave Airlie 2.6.31.5-105 +- drm-radeon-agp-font-fix.patch: hopefully fix AGP coherency issue + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-104 +- drm-next-ea1495a6.patch: fix rs400 resume on my test box + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-103 +- drm-next-fc7f7119.patch: fix oops in SS code, fix multi-card, dvo. +- drm-radeon-kms-arbiter-return-ignore.patch: fix arbiter for non-VGA display + +* Tue Oct 27 2009 Chuck Ebbert +- Fix oops in VIA padlock-aes code. + +* Tue Oct 27 2009 Dave Airlie +- kms: add offb handoff patch for ppc to work + +* Tue Oct 27 2009 Ben Skeggs +- drm-nouveau.patch: misc fixes, very initial NVA8 work + +* Tue Oct 27 2009 Dave Airlie +- fix dd command lines + +* Mon Oct 26 2009 Dave Jones +- Make a 20MB initramfs file so rpm gets its diskspace calculations right. 
(#530778) + +* Mon Oct 26 2009 Dave Airlie 2.6.31.5-97 +- drm: rebase to drm-next, drop palette fix, merged upstream +- drm-intel-big-hammer.patch: drop, proper fix in 2.6.31.5 +- drm-disable-r600-aspm.patch: test patch to disable aspm on r600/r700 for now + +* Fri Oct 23 2009 Kyle McMartin 2.6.31.5-96 +- Bump NR_CPUS to 256 on x86_64. +- Add two backports (ugh, just had to go renaming perf counters to events...) + for fixing sysprof with perf. + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-95 +- re enable MSI + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-94 +- disable debug + stackprotector + +* Fri Oct 23 2009 Chuck Ebbert +- Linux 2.6.31.5 + +* Thu Oct 22 2009 Chuck Ebbert +- Fix exploitable OOPS in keyring code. (CVE-2009-3624) +- Fix kernel memory leak to userspace. (CVE-2009-3612) + +* Thu Oct 22 2009 Dave Airlie 2.6.31.5-91.rc1 +- kms: fix palette + +* Wed Oct 21 2009 Chuck Ebbert +- Disable powersave by default for AC97 audio devices. (#524414) + +* Wed Oct 21 2009 Chuck Ebbert +- Linux 2.6.31.5-rc1 +- Remove the merged HP DC7900 workaround from iommu-updates patch. +- Drop merged patch: + linux-2.6-raidlockdep.patch + +* Mon Oct 19 2009 Kyle McMartin +- af_unix-fix-deadlock-connecting-to-shutdown-socket.patch: fix for + rhbz#529626. + +* Sat Oct 17 2009 Chuck Ebbert +- Replace linux-2.6-bluetooth-autosuspend.diff with upstream version. + +* Fri Oct 16 2009 Josef Bacik +- Update btrfs to latest upstream + +* Fri Oct 16 2009 Chuck Ebbert 2.6.31.4-85 +- Fix another ACPI boot hang (#513680) + +* Fri Oct 16 2009 Ben Skeggs 2.6.31.4-84 +- nouveau: more vbios opcodes, minor fixes, hopeful fix for rh#529292 + +* Wed Oct 14 2009 Roland McGrath 2.6.31.4-83 +- Remove work-around for gcc bug #521991, now fixed. +- Build *docs non-parallel, working around kernel's makefile bugs. + +* Wed Oct 14 2009 Peter Jones +- Add scsi_register_device_handler to modules.block's symbol list so + we'll have scsi device handlers in installer images. + +* Tue Oct 13 2009 Steve Dickson 2.6.31.4-81 +- Fixed hang during NFS installs (bz 528537) + +* Tue Oct 13 2009 Chuck Ebbert 2.6.31.4-80 +- Disable 64-bit DMA on SB600 SATA controllers. + +* Tue Oct 13 2009 Kyle McMartin +- Always build perf docs, regardless of whether we build kernel-doc. + Seems rather unfair to not ship the manpages half the time. + Also, drop BuildRequires %if when not with_doc, the rules about %if + there are f*!&^ing complicated. + +* Mon Oct 12 2009 Kyle McMartin +- Build the perf manpages properly. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-77 +- Fix boot hang with ACPI on some systems. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-76 +- Linux 2.6.31.4 + +* Mon Oct 12 2009 Kyle McMartin 2.6.31.4-75.rc2 +- improve-resource-counter-scalability.patch: Fix scalability issues + on big machines, requested by prarit. + +* Mon Oct 12 2009 Jarod Wilson +- Fix irq status check bugs in lirc_ene0100 + +* Mon Oct 12 2009 Chuck Ebbert +- Fix 2.6.31 regression that caused device failures with ACPI enabled. + +* Sun Oct 11 2009 Chuck Ebbert +- Linux 2.6.31.4-rc2 +- Drop merged patch: linux-2.6-frace-fixes.patch + +* Sat Oct 10 2009 Chuck Ebbert +- Make performance counter API available to userspace programs (#527264) + +* Sat Oct 10 2009 Dave Jones +- Drop the NX kernel data patch for now. Causes no-boot on some systems. + +* Fri Oct 09 2009 Dave Jones +- Backport two critical ftrace fixes. 
+ ftrace: check for failure for all conversions + tracing: correct module boundaries for ftrace_release + +* Fri Oct 09 2009 Jarod Wilson +- Build docs sub-package again + +* Thu Oct 08 2009 Kyle McMartin 2.6.31.3-67 +- Linux 2.6.31.3 +- rebase drm-next trivially. +- dropped merged upstream patches, + - linux-2.6-fix-usb-serial-autosuspend.diff + - linux-2.6-iwlagn-modify-digital-SVR-for-1000.patch + - linux-2.6-iwlwifi-Handle-new-firmware-file-with-ucode-build-number-in-header.patch + - linux-2.6-iwlwifi-fix-debugfs-buffer-handling.patch + - linux-2.6-iwlwifi-fix-unloading-driver-while-scanning.patch + - linux-2.6-iwlwifi-remove-deprecated-6000-series-adapters.patch + - linux-2.6-iwlwifi-traverse-linklist-to-find-the-valid-OTP-block.patch + - linux-2.6-iwlwifi-update-1000-series-API-version-to-match-firmware.patch + - linux-2.6-xen-check-efer-fix.patch + - linux-2.6-xen-spinlock-enable-interrupts-only-when-blocking.patch + - linux-2.6-xen-spinlock-stronger-barrier.patch + - linux-2.6-xen-stack-protector-fix.patch + - linux-2.6.31-cpufreq-powernow-k8-oops.patch + +* Thu Oct 08 2009 Ben Skeggs +- ppc: compile nvidiafb as a module only, nvidiafb+nouveau = bang! (rh#491308) + +* Thu Oct 08 2009 Ben Skeggs 2.6.31.1-65 +- nouveau: {drm-next,context,fbcon,misc} fixes, connector forcing + +* Thu Oct 08 2009 Dave Airlie 2.6.31.1-64 +- rebase latest drm-next, fixes many s/r and r600 problems + +* Wed Oct 07 2009 Dave Jones +- Don't mark the initramfs file as a ghost. + +* Wed Oct 07 2009 Dave Jones +- Enable FUNCTION_GRAPH_TRACER on x86-64. + +* Wed Oct 07 2009 Dave Jones +- Disable CONFIG_IRQSOFF_TRACER on srostedt's recommendation. + (Adds unwanted overhead when not in use). + +* Tue Oct 6 2009 Justin M. Forbes +- virtio_blk: add support for cache flush (#526869) + +* Fri Oct 2 2009 John W. Linville +- Backport "iwlwifi: reduce noise when skb allocation fails" + +* Wed Sep 30 2009 David Woodhouse +- Update IOMMU code; mostly a bunch more workarounds for broken BIOSes. + +* Wed Sep 30 2009 Dave Airlie 2.6.31.1-56 +- revert all the arjan patches until someone tests them. + +* Tue Sep 29 2009 Steve Dickson 2.6.31.1-55 +- Updated the NFS4 pseudo root code with a fix from upstream + +* Tue Sep 29 2009 Dave Airlie 2.6.31.1-54 +- Fix broken capabilties that stopped dbus working due to copy from user + fixups. + +* Tue Sep 29 2009 Dave Airlie 2.6.31.1-53 +- drm-next-4c57edba4.patch: fix r600 dri1 memory leak and r600 bugs + +* Mon Sep 28 2009 Dave Jones 2.6.31.1-52 +- Use __builtin_object_size to validate the buffer size for copy_from_user + + associated fixes to various copy_from_user invocations. + +* Mon Sep 28 2009 Justin M. Forbes 2.6.31.1-50 +- Increase timeout for xen frontend devices to connect. + +* Sat Sep 26 2009 Chuck Ebbert 2.6.31.1-49 +- Add Xen spinlock patches to improve scalability. + +* Sat Sep 26 2009 Dave Airlie 2.6.31.1-48 +- drm-next-8ef8678c8.patch: fix intel/nouveau kms + +* Fri Sep 25 2009 Justin M. Forbes 2.6.31.1-47 +- Fix xen guest booting when NX is disabled (#525290) + +* Fri Sep 25 2009 Ben Skeggs 2.6.31.1-46 +- drm-nouveau.patch: cleanups, fixes, pre-G80 s/r fixes, init rework + +* Fri Sep 25 2009 Dave Airlie 2.6.31.1-45 +- drm-next-adea4796c.patch: fix r600 glxgears + +* Fri Sep 25 2009 Dave Airlie 2.6.31.1-44 +- bump a extra one because I accidentially CVS. 
+ +* Thu Sep 24 2009 Dave Airlie 2.6.31.1-42 +- drm-next update - fix r600 s/r, and command line mode picking and r600 tv + +* Thu Sep 24 2009 Chuck Ebbert 2.6.31.1-41 +- Linux 2.6.31.1 +- Drop patches merged upstream: + linux-2.6-kvm-vmx-check-cpl-before-emulating-debug-register-access.patch + linux-2.6-use-__pa_symbol-to-calculate-address-of-C-symbol.patch + linux-2.6-kvm-pvmmu-do-not-batch-pte-updates-from-interrupt-context.patch + linux-2.6-scsi-sd-fix-oops-during-scanning.patch + linux-2.6-scsi-sg-fix-oops-in-error-path.patch + +* Thu Sep 24 2009 Chuck Ebbert 2.6.31-40 +- Drop the modules-ro-nx patch: it's causing ftrace to be unable + to NOP out module function call tracking. (#524042) + +* Wed Sep 23 2009 Kyle McMartin 2.6.31-39 +- touch initramfs-$foo not dracut-$foo. + +* Wed Sep 23 2009 Adam Jackson 2.6.31-37 +- drm: Fix various buglets in EDID parsing. + +* Mon Sep 21 2009 Ben Skeggs +- nouveau: more on rh#522649, added some useful info to debugfs +- lots of coding style cleanups, which is the reason for the huge commit + +* Fri Sep 18 2009 Dave Jones +- %ghost the dracut initramfs file. + +* Thu Sep 17 2009 Hans de Goede +- Now that we have %%post generation of dracut images we do not need to + Require dracut-kernel anymore + +* Thu Sep 17 2009 Kyle McMartin 2.6.31-33 +- Turn off CONFIG_CC_OPTIMIZE_FOR_SIZE on ppc64 until ld decides to play nice + and generate the save/restore stubs. + +* Thu Sep 17 2009 Kristian Høgsberg +- Drop drm page-flip patch for F12. + +* Thu Sep 17 2009 Dave Jones +- cpuidle: Fix the menu governor to boost IO performance. + +* Wed Sep 16 2009 John W. Linville +- Add a few more iwl1000 support patches. +- Remove support for deprecated iwl6000 parts. + +* Wed Sep 16 2009 Eric Paris +- Do not check CAP_SYS_MODULE when networking tres to autoload a module + +* Wed Sep 16 2009 John W. Linville +- Add iwl1000 support patches. + +* Wed Sep 16 2009 Adam Jackson +- Disable hotplug interrupts on TV connectors on i915. + +* Wed Sep 16 2009 Dave Jones +- Fix NULL deref in powernow-k8 driver. 
(korg #13780) + +* Wed Sep 16 2009 Hans de Goede +- Fix lockdep warning (and potential real deadlock) in mdraid10 code, + requested for -stable, rh#515471 + +* Wed Sep 16 2009 Ben Skeggs 2.6.31-17 +- nouveau: potential fix for rh#522649 + misc other fixes + +* Tue Sep 15 2009 Chuck Ebbert +- Add unused-kernel-patches Make target, change some patches to + use ApplyOptionalPatch + +* Tue Sep 15 2009 Ben Skeggs +- nouveau: misc fixes to context-related issues, fixes some severe nv4x bugs + +* Tue Sep 15 2009 Ben Skeggs +- nouveau: temporarily disable fbcon accel, it's racing with ttm + +* Mon Sep 14 2009 Steve Dickson +- Added support for -o v4 mount parsing + +* Mon Sep 14 2009 Ben Skeggs +- nouveau: avoid PFIFO IRQ hardlock, misc LVDS mode fixes, nv5x RAMFC cleanup + +* Sun Sep 13 2009 Chuck Ebbert +- SCSI oops fixes requested for -stable + +* Fri Sep 11 2009 Dave Jones +- Apply NX/RO to modules + +* Fri Sep 11 2009 Dave Jones +- Mark kernel data section as NX + +* Fri Sep 11 2009 Ben Skeggs +- nouveau: bring in Matthew Garret's initial switchable graphics support + +* Fri Sep 11 2009 Ben Skeggs +- nouveau: fixed use of strap-based panel mode when required (rh#522649) +- nouveau: temporarily block accel on NVAC chipsets (rh#522361, rh#522575) + +* Thu Sep 10 2009 Matthew Garrett +- linux-2.6-ahci-export-capabilities.patch: Backport from upstream +- linux-2.6-rtc-show-hctosys.patch: Export the hctosys state of an rtc +- linux-2.6-rfkill-all.patch: Support for keys that toggle all rfkill state + +* Thu Sep 10 2009 Ben Skeggs +- drm-nouveau.patch: add some scaler-only modes for LVDS, GEM/TTM fixes + +* Wed Sep 09 2009 Dennis Gilmore 2.6.31-2 +- touch the dracut initrd file when using %%{with_dracut} + +* Wed Sep 09 2009 Chuck Ebbert 2.6.31-1 +- Linux 2.6.31 + +* Wed Sep 09 2009 Chuck Ebbert +- Enable VXpocket and PDaudioCF PCMCIA sound drivers. + +* Wed Sep 09 2009 Hans de Goede +- Move to %%post generation of dracut initrd, because of GPL issues surrounding + shipping a prebuild initrd +- Require grubby >= 7.0.4-1, for %%post generation + +* Wed Sep 9 2009 Steve Dickson +- Updated the NFS4 pseudo root code to the latest release. + +* Wed Sep 09 2009 Justin M. Forbes +- Revert virtio_blk to rotational mode. 
(#509383) + +* Wed Sep 09 2009 Dave Airlie 2.6.31-0.219.rc9.git +- uggh lost nouveau bits in page flip + +* Wed Sep 09 2009 Dave Airlie 2.6.31-0.218.rc9.git2 +- fix r600 oops with page flip patch (#520766) + +* Wed Sep 09 2009 Ben Skeggs +- drm-nouveau.patch: fix display resume on pre-G8x chips + +* Wed Sep 09 2009 Ben Skeggs +- drm-nouveau.patch: add getparam to know using tile_flags is ok for scanout + +* Wed Sep 09 2009 Chuck Ebbert +- 2.6.31-rc9-git2 + +* Wed Sep 9 2009 Roland McGrath 2.6.31-0.214.rc9.git1 +- compile with -fno-var-tracking-assignments, work around gcc bug #521991 + +* Wed Sep 09 2009 Dave Airlie 2.6.31-0.213.rc9.git1 +- fix two bugs in r600 kms, fencing + mobile lvds + +* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.212.rc9.git1 +- drm-nouveau.patch: fix ppc build + +* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.211.rc9.git1 +- drm-nouveau.patch: more misc fixes + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.210.rc9.git1 +- drm-page-flip.patch: rebase again + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.209.rc9.git1 +- drm-next.patch: fix r600 signal interruption return value + +* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.208.rc9.git1 +- drm-nouveau.patch: latest upstream + rebase onto drm-next + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.207.rc9.git1 +- drm-vga-arb.patch: update to avoid lockdep + add r600 support + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.206.rc9.git1 +- drm: rebase to drm-next - r600 accel + kms should start working now + +* Mon Sep 07 2009 Chuck Ebbert 2.6.31-0.205.rc9.git1 +- 2.6.31-rc9-git1 +- Temporarily hack the drm-next patch so it still applies; the result + should still be safe to build. + +* Sat Sep 05 2009 Chuck Ebbert 2.6.31-0.204.rc9 +- 2.6.31-rc9 + +* Fri Sep 04 2009 Chuck Ebbert 2.6.31-0.203.rc8.git2 +- Fix kernel build errors when building firmware by removing the + .config file before that step and restoring it afterward. + +* Thu Sep 03 2009 Adam Jackson +- drm-ddc-caching-bug.patch: Empty the connector's mode list when it's + disconnected. + +* Thu Sep 03 2009 Jarod Wilson +- Update hdpvr and lirc_zilog drivers for 2.6.31 i2c + +* Thu Sep 03 2009 Justin M.Forbes +- Fix xen guest with stack protector. (#508120) +- Small kvm fixes. + +* Wed Sep 02 2009 Adam Jackson 2.6.31-0.199.rc8.git2 +- drm-intel-pm.patch: Disable by default, too flickery on too many machines. + Enable with i915.powersave=1. + +* Wed Sep 02 2009 Dave Jones +- Add missing scriptlet dependancy. (#520788) + +* Tue Sep 01 2009 Adam Jackson +- Make DRM less chatty about EDID failures. No one cares. + +* Tue Sep 01 2009 Chuck Ebbert +- 2.6.31-rc8-git2 +- Blank out drm-intel-next: entire contents are now upstream. + +* Tue Sep 01 2009 Dave Jones +- Make firmware buildarch noarch. (Suggested by drago01 on irc) + +* Tue Sep 01 2009 Jarod Wilson +- Fix up lirc_zilog to enable functional IR transmit and receive + on the Hauppauge HD PVR +- Fix audio on PVR-500 when used in same system as HVR-1800 (#480728) + +* Sun Aug 30 2009 Chuck Ebbert +- 2.6.31-rc8-git1 +- Drop linux-2.6-inotify-accounting.patch, merged upstream. 
+ +* Sun Aug 30 2009 Jarod Wilson +- fix lirc_imon oops on older devices w/o tx ctrl ep (#520008) + +* Fri Aug 28 2009 Eric Paris 2.6.31-0.190.rc8 +- fix inotify length accounting and send inotify events + +* Fri Aug 28 2009 David Woodhouse +- Enable Solos DSL driver + +* Fri Aug 28 2009 Chuck Ebbert +- 2.6.31-rc8 + +* Thu Aug 27 2009 Chuck Ebbert 2.6.31-0.185.rc7.git6 +- 2.6.31-rc7-git6 +- Drop patch merged upstream: + xen-fb-probe-fix.patch + +* Thu Aug 27 2009 Adam Jackson +- drm-rv710-ucode-fix.patch: Treat successful microcode load on RV710 as, + you know, success. (#519718) + +* Thu Aug 27 2009 Chuck Ebbert +- 2.6.31-rc7-git5 +- Drop patch linux-2.6-ima-leak.patch, now merged upstream. + +* Wed Aug 26 2009 Jarod Wilson +- Fix up hdpvr ir enable patch for use w/modular i2c (David Engel) + +* Wed Aug 26 2009 Eric Paris +- fix iint_cache leak in IMA code + drop the ima=0 patch + +* Wed Aug 26 2009 Justin M. Forbes +- Fix munlock with KSM (#516909) +- Re-enable KSM + +* Wed Aug 26 2009 Chuck Ebbert +- 2.6.31-rc7-git4 +- Drop patches merged upstream: + xen-x86-fix-stackprotect.patch + xen-x86-no-stackprotect.patch + +* Wed Aug 26 2009 Adam Jackson +- drm-intel-next.patch: Update, various output setup fixes. + +* Wed Aug 26 2009 David Woodhouse +- Make WiMAX modular (#512070) + +* Tue Aug 25 2009 Kyle McMartin +- allow-disabling-ima.diff: debugging patch... adds ima=0 kernel + param to disable initialization of IMA. + +* Tue Aug 25 2009 Ben Skeggs 2.6.31-0.174.rc7.git2 +- drm-nouveau.patch: upstream update, pre-nv50 tv-out + misc fixes + +* Tue Aug 25 2009 Chuck Ebbert 2.6.31-0.173.rc7.git2 +- Fix Xen boot (#508120) + +* Tue Aug 25 2009 Dave Airlie +- pull in drm-next tree + rebase around it + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git2 + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git1 + +* Sat Aug 22 2009 Chuck Ebbert +- 2.6.31-rc7 + +* Thu Aug 20 2009 Mark McLoughlin +- Disable LZMA for xen (#515831) + +* Thu Aug 20 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Fix up drm-r600-kms.patch +- Drop fix-perf-make-man-failure.patch + +* Wed Aug 19 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Revert linux-2.6-debug-vm-would-have-oomkilled.patch to v1.2 + because upstream changes to oom-kill.c were all reverted. + +* Tue Aug 18 2009 Kyle McMartin +- Fix up perf so that it builds docs now that they are fixed. +- with_docs disables perf docs too. be warned. (logic is that the + build deps are (mostly) the same, so if you don't want one, odds are...) + +* Tue Aug 18 2009 Dave Jones +- 2.6.31-rc6-git3 + +* Mon Aug 17 2009 Dave Jones 2.6.31-0.161.rc6.git2 +- 2.6.31-rc6-git2 + +* Mon Aug 17 2009 Chuck Ebbert +- Stop generating the (unused) ppc64-kdump.config file. + +* Mon Aug 17 2009 Jarod Wilson +- Add new lirc driver for built-in ENE0100 device on some laptops + +* Sun Aug 16 2009 Kyle McMartin 2.6.31-0.158.rc6 +- Improve the perf script so it prints something helpful if the + perf binary doesn't exist. + +* Sat Aug 15 2009 Dave Jones 2.6.31-0.157.rc6 +- Disable KSM patches on a hunch. Chasing the "encrypted VGs don't work" bug. + +* Fri Aug 14 2009 Dave Jones 2.6.31-0.155.rc6 +- 2.6.31-rc6 + +* Wed Aug 12 2009 Kyle McMartin +- fix perf. +- move perf to perf.$ver instead of perf-$ver... + +* Wed Aug 12 2009 Dennis Gilmore +- Obsolete kernel-smp on sparc64 +- Require grubby >= 7.0.2-1 since thats what introduces the dracut options we use + +* Wed Aug 12 2009 Kristian Høgsberg +- Fix drm-page-flip.patch to not break radeon kms and to not reset + crtc offset into fb on flip. 
+ +* Wed Aug 12 2009 Adam Jackson +- Update drm-intel-next patch + +* Tue Aug 11 2009 Dennis Gilmore - 2.6.31-0.149.rc5.git3 +- disable building the -smp kernel on sparc64 +- disable building kernel-perf on sparc64 syscalls not supported + +* Tue Aug 11 2009 Eric Paris +- Enable config IMA + +* Tue Aug 11 2009 Ben Skeggs +- nouveau: various cleanups and fixes + more sanity checking in dma paths + +* Mon Aug 10 2009 Jarod Wilson +- Add new device ID to lirc_mceusb (#512483) +- Fix some lockdep false positives +- Add support for setting and enabling iMON clock via sysfs +- Add tunable pad threshold support to lirc_imon +- Add new pseudo-IR protocl to lirc_imon for universals w/o a pad +- Fix mouse device support on older iMON devices + +* Mon Aug 10 2009 David Woodhouse 2.6.31-0.145.rc5.git3 +- Merge latest Intel IOMMU fixes and BIOS workarounds, re-enable by default. + +* Sun Aug 09 2009 Kyle McMartin +- btusb autosuspend: fix build on !CONFIG_PM by stubbing out + suspend/resume methods. + +* Sat Aug 08 2009 Dennis Gilmore 2.6.31-0.141.rc5.git3 +- disable kgdb on sparc64 uni-processor kernel +- set max cpus to 256 on sparc64 +- enable AT keyboard on sparc64 + +* Fri Aug 07 2009 Justin M. Forbes +- Apply KSM updates from upstream + +* Fri Aug 07 2009 Hans de Goede +- When building a dracut generic initrd tell new-kernel-pkg to use that + instead of running mkinitrd + +* Fri Aug 07 2009 Dave Airlie 2.6.31-0.139.rc5.git3 +- drm-r600-kms.patch - update r600 KMS +- drm-radeon-fixes.patch - patches for queue to Linus + +* Thu Aug 06 2009 Justin M. Forbes 2.6.31-0.138.rc5.git3 +- Fix kvm virtio_blk errors (#514901) + +* Thu Aug 06 2009 Adam Jackson +- Hush DRM vblank warnings, they're constant (and harmless) under DRI2. + +* Thu Aug 06 2009 Dave Airlie 2.6.31.0.134.rc5.git3 +- fixup vga arb warning at startup and handover between gpus + +* Thu Aug 06 2009 Kyle McMartin 2.6.31.0.133.rc5.git3 +- die-floppy-die.patch: it's the 21st century, let's not rely on + steam powered technology. + +* Wed Aug 05 2009 Dave Airlie 2.6.31.0.132.rc5.git3 +- revert-ftrace-powerpc-snafu.patch - fix ppc build + +* Wed Aug 05 2009 Ben Skeggs +- nouveau: respect nomodeset + +* Wed Aug 05 2009 Chuck Ebbert +- Fix /usr/sbin/perf script. (#515494) + +* Wed Aug 05 2009 Dave Jones +- Fix shift in pci cacheline size printk. + +* Wed Aug 05 2009 Dave Airlie 2.6.31.0.128.rc5.git3 +- 2.6.31-rc5-git3 +- drop cpufreq + set memory fixes + +* Wed Aug 05 2009 Dave Airlie +- Add Jeromes initial r600 kms work. +- rebase arb patch + +* Tue Aug 04 2009 Kyle McMartin +- alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch: apply patch + destined for 2.6.32, requested by Lennart. + +* Tue Aug 04 2009 Ben Skeggs +- nouveau: more code share between nv50/ +- update VGA arb patches again + +* Mon Aug 03 2009 Adam Jackson +- Update intel drm from anholt's tree +- Rebase drm-intel-pm.patch to match +- Drop gen3 fb hack, merged +- Drop previous watermark setup change + +* Mon Aug 03 2009 Dave Jones 2.6.31-0.122.rc5.git2 +- 2.6.31-rc5-git2 + +* Mon Aug 03 2009 Adam Jackson +- (Attempt to) fix watermark setup on Intel 9xx parts. 
+ +* Mon Aug 03 2009 Jarod Wilson +- make usbhid driver ignore all recent SoundGraph iMON devices, so the + lirc_imon driver can grab them instead + +* Mon Aug 03 2009 Dave Airlie +- update VGA arb patches + +* Sat Aug 01 2009 David Woodhouse 2.6.31-0.118.rc5 +- Fix boot failures on ppc32 (#514010, #505071) + +* Fri Jul 31 2009 Kyle McMartin 2.6.31-0.117.rc5 +- Linux 2.6.31-rc5 + +* Fri Jul 31 2009 Matthew Garrett +- linux-2.6-dell-laptop-rfkill-fix.patch: Fix up Dell rfkill + +* Fri Jul 31 2009 Ben Skeggs +- nouveau: build against 2.6.31-rc4-git6, fix script parsing on some G8x chips + +* Thu Jul 30 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git6 + New config item: CONFIG_BATTERY_DS2782 is not set +- Add last-minute set_memory_wc() fix from LKML. + +* Thu Jul 30 2009 Matthew Garrett +- drm-intel-pm.patch: Don't reclock external outputs. Increase the reduced + clock slightly to avoid upsetting some hardware. Disable renderclock + adjustment for the moment - it's breaking on some hardware. + +* Thu Jul 30 2009 Ben Skeggs +- nouveau: another DCB 1.5 entry, G80 corruption fixes, small +- fix VGA ARB + kms + +* Wed Jul 29 2009 Dave Jones +- Add support for dracut. (Harald Hoyer) + +* Wed Jul 29 2009 Ben Skeggs +- drm-nouveau.patch: nv50/nva0 tiled scanout fixes, nv40 kms fixes + +* Wed Jul 29 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git3 +- Drop linux-2.6-ecryptfs-overflow-fixes.patch, merged upstream now. + +* Wed Jul 29 2009 Dave Airlie +- update VGA arb patches + +* Tue Jul 28 2009 Adam Jackson +- Remove the pcspkr modalias. If you're still living in 1994, load it + by hand. + +* Tue Jul 28 2009 Eric Sandeen 2.6.31-0.102.rc4.git2 +- Fix eCryptfs overflow issues (CVE-2009-2406, CVE-2009-2407) + +* Tue Jul 28 2009 Kyle McMartin 2.6.31-0.101.rc4.git2 +- 2.6.31-rc4-git2 +- rebase linux-2.6-fix-usb-serial-autosuspend.diff +- config changes: + - USB_GSPCA_SN9C20X=m (_EVDEV=y) + +* Tue Jul 28 2009 Ben Skeggs +- drm-nouveau.patch: cleanup userspace API, various bugfixes. + Looks worse than it is, register macros got cleaned up, which + touches pretty much everywhere.. + +* Mon Jul 27 2009 Adam Jackson +- Warn quieter about not finding PCI bus parents for ROM BARs, they're + not usually needed and there's nothing you can do about it anyway. + +* Mon Jul 27 2009 Matthew Garrett +- linux-2.6-alsa-improve-hda-powerdown.patch - attempt to reduce audio glitches + caused by HDA powerdown +- disable CONFIG_DEBUG_KOBJECT again for now, since it produces huge dmesg spew + +* Mon Jul 27 2009 Dave Airlie +- update vga arb code + +* Mon Jul 27 2009 Matthew Garrett +- drm-intel-pm.patch - Add runtime PM for Intel graphics + +* Fri Jul 24 2009 Kristian Høgsberg +- Add drm-page-flip.patch to support vsynced page flipping on intel + chipsets. +- Really add patch. +- Fix patch to not break nouveau. + +* Fri Jul 24 2009 Chuck Ebbert +- Enable CONFIG_DEBUG_KOBJECT in debug kernels. (#513606) + +* Thu Jul 23 2009 Kyle McMartin +- perf BuildRequires binutils-devel now. + +* Thu Jul 23 2009 Justin M. Forbes +- Add KSM support + +* Thu Jul 23 2009 Kyle McMartin 2.6.31-0.87.rc4 +- Linux 2.6.31-rc4 +- config changes: + - USB_CDC_PHONET=m [all] + - EVENT_PROFILE=y [i386, x86_64, powerpc, s390] + +* Wed Jul 22 2009 Tom "spot" Callaway +- We have to override the new %%install behavior because, well... the kernel is special. 
+ +* Wed Jul 22 2009 Dave Jones +- 2.6.31-rc3-git5 + +* Wed Jul 22 2009 Ben Skeggs 2.6.31-0.82.rc3.git4 +- Enable KMS for nouveau + +* Wed Jul 22 2009 Ben Skeggs +- Update nouveau from upstream (initial suspend/resume + misc bugfixes) + +* Mon Jul 20 2009 Adam Jackson +- Disable VGA arbiter patches for a moment + +* Mon Jul 20 2009 Adam Jackson +- Revive 4k framebuffers for intel gen3 + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.78.rc3.git4 +- Enable CONFIG_RTC_HCTOSYS (#489494) + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.77.rc3.git4 +- Don't build 586 kernels any more. + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.75.rc3.git4 +- build a 'full' package on i686 (Bill Nottingham) + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.74.rc3.git4 +- 2.6.31-rc3-git4 + +* Sat Jul 18 2009 Matthew Garrett +- linux-2.6-driver-level-usb-autosuspend.diff - allow drivers to enable autopm +- linux-2.6-fix-usb-serial-autosuspend.diff - fix generic usb-serial autopm +- linux-2.6-qcserial-autosuspend.diff - enable autopm by default on qcserial +- linux-2.6-bluetooth-autosuspend.diff - enable autopm by default on btusb +- linux-2.6-usb-uvc-autosuspend.diff - enable autopm by default on uvc + +* Thu Jul 16 2009 Chuck Ebbert +- 2.6.31-rc3-git3 + +* Thu Jul 16 2009 Matthew Garrett +- linux-2.6-defaults-aspm.patch - default ASPM to on for PCIe >= 1.1 hardware + +* Thu Jul 16 2009 Dave Airlie 2.6.31-0.69.rc3 +- linux-2.6-vga-arb.patch - add VGA arbiter. +- drm-vga-arb.patch - add VGA arbiter support to drm + +* Tue Jul 14 2009 Kyle McMartin 2.6.31-0.68-rc3 +- 2.6.31-rc3 +- config changes: + - RTL8192SU is not set, (staging) + +* Mon Jul 13 2009 Kyle McMartin 2.6.31-0.67.rc2.git9 +- 2.6.31-rc2-git9 +- config changes: + - BLK_DEV_OSD=m + +* Mon Jul 13 2009 Ben Skeggs +- drm-nouveau.patch: update from upstream + +* Fri Jul 10 2009 Chuck Ebbert +- 2.6.31-rc2-git6 +- Drop dmadebug-spinlock patch -- merged upstream. + +* Fri Jul 10 2009 Dave Jones 2.6.31-0.64.rc2.git5 +- Don't jump through hoops that ppc powerbooks have to on sensible systems + in cpufreq_suspend. + +* Fri Jul 10 2009 Dave Jones +- 2.6.31-rc2-git5 + +* Thu Jul 09 2009 Dave Jones 2.6.31-0.62.rc2.git4 +- Use correct spinlock initialization in dma-debug + +* Thu Jul 09 2009 Chuck Ebbert 2.6.31-0.61.rc2.git4 +- 2.6.31-rc2-git4 + +* Thu Jul 09 2009 Jarod Wilson +- Enable IR receiver on the Hauppauge HD PVR +- Trim the changelog, axing everything before 2.6.29 (see cvs + if you still really want to see that far back) + +* Wed Jul 08 2009 Dave Jones +- Enable a bunch of debugging options that were missed somehow. + +* Wed Jul 08 2009 Kyle McMartin +- Bump NR_CPUS on x86_64 to 512. + +* Wed Jul 08 2009 Adam Jackson +- drm-no-gem-on-i8xx.patch: Drop, intel 2D driver requires GEM now. This + should be entertaining. + +* Wed Jul 08 2009 Kyle McMartin +- First cut of /usr/sbin/perf wrapper script and 'perf' + subpackage. + +* Wed Jul 08 2009 Kyle McMartin 2.6.31-0.54.rc2.git2 +- Rebase and re-apply all the Fedora-specific linux-2.6-debug-* + patches. +- Cull a bunch of upstreamed patches from the spec. + +* Wed Jul 08 2009 Steve Dickson +- Added NFSD v4 dynamic pseudo root patch which allows + NFS v3 exports to be mounted by v4 clients. + +* Tue Jul 07 2009 Jarod Wilson +- See if we can't make lirc_streamzap behave better... (#508952) + +* Tue Jul 07 2009 Chuck Ebbert 2.6.31-0.47.rc2.git2 +- 2.6.31-rc2-git2 + +* Tue Jul 07 2009 Jarod Wilson +- Make lirc_i2c actually work with 2.6.31 i2c + +* Mon Jul 06 2009 Chuck Ebbert +- Use LZMA for kernel compression on X86. 
+ +* Mon Jul 06 2009 Jarod Wilson +- Hack up lirc_i2c and lirc_zilog to compile with 2.6.31 i2c + changes. The drivers might not actually be functional now, but + at least they compile again. Will fix later, if need be... + +* Sat Jul 04 2009 Dave Jones 2.6.31-0.42.rc2 +- 2.6.31-rc2 + +* Sat Jul 04 2009 Chuck Ebbert +- 2.6.31-rc1-git11 + +* Fri Jul 03 2009 Hans de Goede +- Disable v4l1 ov511 and quickcam_messenger drivers (obsoleted by + v4l2 gspca subdrivers) + +* Thu Jul 02 2009 Kyle McMartin 2.6.31-0.39.rc1.git9 +- 2.6.31-rc1-git9 +- linux-2.6-dm-fix-exstore-search.patch: similar patch merged upstream. + +* Tue Jun 30 2009 Chuck Ebbert 2.6.31-0.38.rc1.git7 +- 2.6.31-rc1-git7 + +* Tue Jun 30 2009 Dave Jones 2.6.31-0.37.rc1.git5 +- Disable kmemleak. Way too noisy, and not finding any real bugs. + +* Tue Jun 30 2009 Ben Skeggs +- drm-nouveau.patch: match upstream + +* Mon Jun 29 2009 Chuck Ebbert 2.6.31-0.35.rc1.git5 +- 2.6.31-rc1-git5 +- CONFIG_LEDS_LP3944=m + +* Mon Jun 29 2009 Chuck Ebbert +- Try to fix the dm overlay bug for real (#505121) + +* Sat Jun 27 2009 Ben Skeggs 2.6.31-0.33.rc1.git2 +- drm-nouveau.patch: fix conflicts from 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.31.rc1.git2 +- Further improvements to kmemleak + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.30.rc1.git2 +- 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Ben Skeggs +- drm-nouveau.patch: latest upstream + reenable + +* Thu Jun 25 2009 Dave Jones 2.6.31-0.29.rc1 +- Make kmemleak scan process stacks by default. + Should reduce false positives (which does also increase false negatives, + but that's at least less noisy) + +* Wed Jun 24 2009 Kyle McMartin 2.6.31-0.28.rc1 +- 2.6.31-rc1 +- linux-2.6-utrace.patch: rebase on kernel/Makefile changes +- config changes: + - generic: + - CONFIG_DM_LOG_USERSPACE=m + - CONFIG_DM_MULTIPATH_QL=m + - CONFIG_DM_MULTIPATH_ST=m + - CONFIG_BATTERY_MAX17040=m + - CONFIG_I2C_DESIGNWARE is off (depends on clk.h) + +* Wed Jun 24 2009 Kyle McMartin +- Move perf to /usr/libexec/perf-$KernelVer. + +* Wed Jun 24 2009 Kyle McMartin +- config changes: + - generic: + - CONFIG_SCSI_DEBUG=m (was off, requested by davidz) + +* Wed Jun 24 2009 Dave Jones 2.6.31-0.22.rc0.git22 +- 2.6.30-git22 + +* Tue Jun 23 2009 Dave Jones 2.6.31-0.22.rc0.git20 +- 2.6.30-git20 + +* Mon Jun 22 2009 Kyle McMartin 2.6.31-0.24.rc0.git18 +- Enable tools/perf, installed as /bin/perf-$KernelVer. Docs and a /bin/perf + wrapper come next if this builds ok. + +* Mon Jun 22 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: pull in + two fixes from Mike Galbraith from tip.git + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.21.rc0.git18 +- Add patch to possibly fix the pktlen problem on via-velocity. + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.20.rc0.git18 +- 2.6.30-git18 + VIA crypto & mmc patches now upstream. + +* Sun Jun 21 2009 Dave Jones +- Determine cacheline sizes in a generic manner. + +* Sun Jun 21 2009 Chuck Ebbert 2.6.31-0.18.rc0.git17 +- 2.6.30-git17 +- Config changes: + - powerpc32-generic + CONFIG_PERF_COUNTERS=y + - generic + CONFIG_KEYBOARD_LM8323 is not set + CONFIG_MOUSE_SYNAPTICS_I2C=m + CONFIG_TOUCHSCREEN_EETI=m + CONFIG_TOUCHSCREEN_W90X900=m +- Dropped agp-set_memory_ucwb.patch, all fixed upstream now. + +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.17.rc0.git15 +- config changes: + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR=y (switched... chrp fails otherwise, stack + frame size.) 
+ +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.16.rc0.git15 +- 2.6.30-git15 +- config changes: + - generic: + - CONFIG_LBDAF=y + - staging: + - CONFIG_USB_SERIAL_QUATECH2 is not set + - CONFIG_VT6655 is not set + - CONFIG_USB_CPC is not set + - CONFIG_RDC_17F3101X is not set + - CONFIG_FB_UDL is not set + - ppc32: + - CONFIG_KMETER1=y + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR is not set +- lirc disabled due to i2c detach_client removal. + +* Sat Jun 20 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: add, + queued in tip/sched/core (ca94c442535a44d508c99a77e54f21a59f4fc462) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31.0.15.rc0.git14 +- Fix up ptrace, hopefully. Builds on x86_64 at least. + +* Fri Jun 19 2009 Chuck Ebbert +- linux-2.6-tip.git-203abd67b75f7714ce98ab0cdbd6cfd7ad79dec4.patch + Fixes oops on boot with qemu (#507007) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31-0.13.rc0.git14 +- 2.6.30-git14 + +* Fri Jun 19 2009 Chuck Ebbert +- Fix up the via-sdmmc and via-hwmon-temp-sensor patches. +- Drop VIA Padlock patches merged upstream: + via-rng-enable-64bit.patch + via-padlock-10-enable-64bit.patch + via-padlock-20-add-x86-dependency.patch + +* Thu Jun 18 2009 Kyle McMartin 2.6.31-0.11.rc0.git13 +- 2.6.30-git13 +- config changes: + - arm: + - CONFIG_UACCESS_WITH_MEMCPY is not set + - i686-PAE: + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - ia64: + - CONFIG_RCU_FANOUT=64 + - nodebug: + - CONFIG_DEBUG_KMEMLEAK is not set + - CONFIG_DEBUG_KMEMLEAK_TEST=m + - powerpc: + - CONFIG_CAN_SJA1000_OF_PLATFORM=m + - CONFIG_PPC_EMULATED_STATS=y + - CONFIG_SWIOTLB=y + - CONFIG_RDS is not set (broken on ppc32) + - powerpc32: + - CONFIG_RCU_FANOUT=32 + - powerpc64: + - CONFIG_RCU_FANOUT=64 + - CONFIG_PERF_COUNTERS=y + - s390x: + - CONFIG_RCU_FANOUT=64 + - CONFIG_SECCOMP=y + - CONFIG_PM=y + - CONFIG_HIBERNATION=y + - CONFIG_PM_STD_PARTITION="/dev/jokes" + - sparc64: + - CONFIG_RCU_FANOUT=64 + - x86: + - CONFIG_RCU_FANOUT=32 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_OLD_MCE is not set + - CONFIG_X86_MCE_INTEL=y + - CONFIG_X86_MCE_AMD=y + - CONFIG_X86_ANCIENT_MCE is not set + - CONFIG_X86_MCE_INJECT is not set + - x86_64: + - CONFIG_EDAC_AMD64=m + - CONFIG_EDAC_AMD64_ERROR_INJECTION is not set + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - CONFIG_RCU_FANOUT=64 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_MCE_INJECT is not set + - generic: + - CONFIG_RCU_FANOUT=32 + - CONFIG_MMC_SDHCI_PLTFM=m + - CONFIG_MMC_CB710=m + - CONFIG_CB710_CORE=m + - CONFIG_CB710_DEBUG is not set + - CONFIG_SCSI_MVSAS_DEBUG is not set + - CONFIG_SCSI_BNX2_ISCSI=m + - CONFIG_NETFILTER_XT_MATCH_OSF=m + - CONFIG_RFKILL_INPUT=y (used to be =m, which was invalid) + - CONFIG_DE2104X_DSL=0 + - CONFIG_KS8842 is not set + - CONFIG_CFG80211_DEBUGFS=y + - CONFIG_MAC80211_DEFAULT_PS=y + - CONFIG_IWM=m + - CONFIG_IWM_DEBUG is not set + - CONFIG_RT2800USB=m + - CONFIG_CAN_DEV=m + - CONFIG_CAN_CALC_BITTIMING=y + - CONFIG_CAN_SJA1000=m + - CONFIG_CAN_SJA1000_PLATFORM=m + - CONFIG_CAN_EMS_PCI=m + - CONFIG_CAN_KVASER_PCI=m + - CONFIG_EEPROM_MAX6875=m + - CONFIG_SENSORS_TMP401=m + - CONFIG_MEDIA_SUPPORT=m + - CONFIG_SND_CTXFI=m + - CONFIG_SND_LX6464ES=m + - CONFIG_SND_HDA_CODEC_CA0110=y + - CONFIG_USB_XHCI_HCD=m + - CONFIG_USB_XHCI_HCD_DEBUGGING is not set + - CONFIG_DRAGONRISE_FF=y (used to be =m) + - CONFIG_GREENASIA_FF=y (used to be =m) + - CONFIG_SMARTJOYPLUS_FF=y (used to be =m) + - CONFIG_USB_NET_INT51X1=m + - 
CONFIG_CUSE=m + - CONFIG_FUNCTION_PROFILER=y + - CONFIG_RING_BUFFER_BENCHMARK=m + - CONFIG_REGULATOR_USERSPACE_CONSUMER=m + - CONFIG_REGULATOR_MAX1586=m + - CONFIG_REGULATOR_LP3971=m + - CONFIG_RCU_FANOUT_EXACT is not set + - CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 + - CONFIG_FSNOTIFY=y + - CONFIG_IEEE802154=m + - CONFIG_IEEE802154_DRIVERS=m + - CONFIG_IEEE802154_FAKEHARD=m + - CONFIG_CNIC=m + +* Wed Jun 17 2009 Jarod Wilson +- New lirc_imon hotness, update 2: + * support dual-interface devices with a single lirc device + * directional pad functions as an input device mouse + * touchscreen devices finally properly supported + * support for using MCE/RC-6 protocol remotes + * fix oops in RF remote association code (F10 bug #475496) + * fix re-enabling case/panel buttons and/or knobs +- Add some misc additional lirc_mceusb2 transceiver IDs +- Add missing unregister_chrdev_region() call to lirc_dev exit +- Add it8720 support to lirc_it87 + +* Tue Jun 16 2009 Chuck Ebbert +- Update via-sdmmc driver + +* Mon Jun 15 2009 Jarod Wilson +- Update lirc patches w/new imon hotness + +* Fri Jun 12 2009 Chuck Ebbert +- Update VIA temp sensor and mmc drivers. + +* Fri Jun 12 2009 John W. Linville 2.6.30-6 +- neigh: fix state transition INCOMPLETE->FAILED via Netlink request +- enable CONFIG_ARPD (used by OpenNHRP) + +* Wed Jun 10 2009 Chuck Ebbert +- VIA Nano updates: + Enable Padlock AES encryption and random number generator on x86-64 + Add via-sdmmc and via-cputemp drivers + +* Wed Jun 10 2009 Kyle McMartin 2.6.30-1 +- Linux 2.6.30 rebase. + +* Tue Jun 09 2009 John W. Linville +- Clean-up some wireless bits in config-generic + +* Tue Jun 09 2009 Chuck Ebbert +- Add support for ACPI P-states on VIA processors. +- Disable the e_powersaver driver. + +* Tue Jun 09 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git6 + +* Fri Jun 05 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git1 + +* Wed Jun 03 2009 Kyle McMartin +- Linux 2.6.30-rc8 + +* Tue Jun 2 2009 Roland McGrath +- utrace update (fixes stap PR10185) + +* Tue Jun 02 2009 Dave Jones +- For reasons unknown, RT2X00 driver was being built-in. + Make it modular. + +* Tue Jun 02 2009 Dave Jones +- 2.6.30-rc7-git5 + +* Sat May 30 2009 Dave Jones +- 2.6.30-rc7-git4 + +* Thu May 28 2009 Dave Jones +- 2.6.30-rc7-git2 + +* Tue May 26 2009 Dave Jones +- Various cpufreq patches from git. + +* Tue May 26 2009 Dave Jones +- 2.6.30-rc7-git1 + +* Mon May 25 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: drop patch, issue is fixed upstream. + +* Sat May 23 2009 Dave Jones +- 2.6.30-rc7 + +* Thu May 21 2009 Dave Jones +- 2.6.30-rc6-git6 + +* Wed May 20 2009 Chuck Ebbert +- Enable Divas (formerly Eicon) ISDN drivers on x86_64. (#480837) + +* Wed May 20 2009 Dave Jones +- 2.6.30-rc6-git5 + +* Mon May 18 2009 Dave Jones +- 2.6.30-rc6-git3 + +* Sun May 17 2009 Dave Jones +- 2.6.30-rc6-git2 + +* Sat May 16 2009 Dave Jones +- 2.6.30-rc6 + +* Mon May 11 2009 Kyle McMartin +- Linux 2.6.30-rc5-git1 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc5 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc4-git4 + +* Wed May 06 2009 Kyle McMartin +- Linux 2.6.30-rc4-git3 +- linux-2.6-cdrom-door-status.patch: merged upstream. +- linux-2.6-iwl3945-remove-useless-exports.patch: merged upstream. 
+- linux-2.6-utrace.patch: rebase against changes to fs/proc/array.c +- USB_NET_CDC_EEM=m + +* Fri May 01 2009 Eric Sandeen +- Fix ext4 corruption on partial write into prealloc block + +* Thu Apr 30 2009 Kyle McMartin +- 2.6.30-rc4 + +* Wed Apr 29 2009 Dave Jones +- 2.6.30-rc3-git6 + +* Tue Apr 28 2009 Dave Jones +- 2.6.30-rc3-git4 + +* Tue Apr 28 2009 Chuck Ebbert +- Make the kernel-vanilla package buildable again. +- Allow building with older versions of RPM. + +* Tue Apr 28 2009 Neil Horman +- Backport missing snmp stats (bz 492391) + +* Tue Apr 28 2009 Chuck Ebbert 2.6.30-0.72.rc3.git3 +- Drop unused exports from the iwl3945 driver. + +* Tue Apr 28 2009 Chuck Ebbert +- Linux 2.6.30-rc3-git3 + +* Mon Apr 27 2009 Dave Jones +- 2.6.30-rc3-git2 + +* Sun Apr 26 2009 Chuck Ebbert 2.6.30-0.68.rc3.git1 +- Linux 2.6.30-rc3-git1 + +* Wed Apr 22 2009 Dave Jones 2.6.30-0.67.rc3 +- Disable SYSFS_DEPRECATED on ia64 + +* Wed Apr 22 2009 Kyle McMartin +- Linux 2.6.30-rc3 +- PROC_VMCORE=y: Exports the dump image of crashed + kernel in ELF format + +* Wed Apr 22 2009 Neil Horman +- Enable RELOCATABLE and CRASH_DUMP for powerpc64 +- With this we can remove the -kdump build variant +- for the ppc64 arch + +* Tue Apr 21 2009 Chuck Ebbert +- Don't include the modules.*.bin files in the RPM package. + +* Tue Apr 21 2009 Dave Jones +- 2.6.30-rc2-git7 + +* Mon Apr 20 2009 Dave Jones +- Various s390x config tweaks. (#496596, #496601, #496605, #496607) + +* Mon Apr 20 2009 Dave Jones +- 2.6.30-rc2-git6 + +* Sat Apr 18 2009 Chuck Ebbert +- Set CONFIG_UEVENT_HELPER_PATH to the empty string (#496296) + +* Fri Apr 17 2009 Dave Jones +- 2.6.30-rc2-git3 + +* Thu Apr 16 2009 Kyle McMartin 2.6.30-0.58.rc2.git1 +- 2.6.30-rc2-git1 + +* Wed Apr 15 2009 Kyle McMartin 2.6.30-0.57.rc2 +- 2.6.30-rc2 + +* Tue Apr 14 2009 Kyle McMartin +- 2.6.30-rc1-git7 +- CONFIG_TOUCHSCREEN_AD7879_I2C=m +- CONFIG_STRIP_ASM_SYMS=y, off for -debug + +* Mon Apr 13 2009 Kyle McMartin +- ppc-fix-parport_pc.patch: add from linuxppc-dev@ + +* Mon Apr 13 2009 Kyle McMartin +- execshield: fix build (load_user_cs_desc is 32-bit only in tlb.c) + +* Sun Apr 12 2009 Kyle McMartin +- 2.6.30-rc1-git5 +- revert-fix-modules_install-via-nfs.patch: reverted upstream + +* Thu Apr 09 2009 Kyle McMartin +- actually drop utrace-ftrace from srpm. + +* Thu Apr 09 2009 Kyle McMartin +- 2.6.30-rc1-git2 +- CONFIG_IGBVF=m +- CONFIG_NETFILTER_XT_TARGET_LED=m + +* Thu Apr 09 2009 Dave Jones +- Bring back the /dev/crash driver. (#492803) + +* Wed Apr 08 2009 Dave Jones +- disable MMIOTRACE in non-debug builds (#494584) + +* Wed Apr 08 2009 Kyle McMartin 2.6.30-0.44.rc1 +- 2.6.30-rc1 +- linux-2.6-hwmon-atk0110.patch: drop +- CONFIG_DETECT_HUNG_TASK=y +- # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set + +* Tue Apr 7 2009 Roland McGrath +- utrace update, drop unfinished utrace-ftrace + +* Tue Apr 07 2009 Kyle McMartin +- Linux 2.6.29-git15 +- EXT3_DEFAULTS_TO_ORDERED on for now. +- X86_X2APIC enabled. +- LEDS_LP5521, LEDS_BD2802 off... look not generally relevant. +- LIBFCOE on. 
+ +* Tue Apr 07 2009 Dave Jones +- Enable CONFIG_CIFS_STATS (#494545) + +* Mon Apr 06 2009 Kyle McMartin +- linux-2.6-execshield.patch: rebase for 2.6.30 + +* Mon Apr 06 2009 Kyle McMartin +- Linux 2.6.29-git13 +- drop patches merged upstream: + - fix-ppc-debug_kmap_atomic.patch + - fix-staging-at76.patch + - linux-2.6-acpi-video-didl-intel-outputs.patch + - linux-2.6-acpi-strict-resources.patch + - linux-2.6-sony-laptop-rfkill.patch + - linux-2.6-btrfs-fix-umount-hang.patch + - linux-2.6-fiemap-header-install.patch + - linux-2.6-debug-dma-api.patch + - dma-api-debug-fixes.patch + - linux-2.6-ext4-flush-on-close.patch + - linux-2.6-relatime-by-default.patch + - linux-2.6-pci-sysfs-remove-id.patch + - linux-2.6-scsi-cpqarray-set-master.patch + - alsa-rewrite-hw_ptr-updaters.patch + - alsa-pcm-always-reset-invalid-position.patch + - alsa-pcm-fix-delta-calc-at-overlap.patch + - alsa-pcm-safer-boundary-checks.patch + - linux-2.6-input-hid-extra-gamepad.patch + - linux-2.6-ipw2x00-age-scan-results-on-resume.patch + - linux-2.6-dropwatch-protocol.patch + - linux-2.6-net-fix-gro-bug.patch + - linux-2.6-net-fix-another-gro-bug.patch + - linux-2.6-net-xfrm-fix-spin-unlock.patch + - linux-2.6.29-pat-change-is_linear_pfn_mapping-to-not-use-vm_pgoff.patch + - linux-2.6.29-pat-pci-change-prot-for-inherit.patch + +* Thu Apr 02 2009 Josef Bacik +- linux-2.6-btrfs-fix-umount-hang.patch: fix umount hang on btrfs + +* Thu Apr 02 2009 Kyle McMartin +- fix-ppc-debug_kmap_atomic.patch: fix build failures on ppc. + +* Thu Apr 02 2009 Kyle McMartin +- Linux 2.6.29-git9 + +* Tue Mar 31 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: add +- at76-netdev_ops.patch: add + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git8 +- linux-2.6-net-fix-another-gro-bug.patch: upstream. + +* Tue Mar 31 2009 Eric Sandeen +- add fiemap.h to kernel-headers +- build ext4 (and jbd2 and crc16) into the kernel + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git7 +- fix-staging-at76.patch: pull patch from linux-wireless to fix... + +* Mon Mar 30 2009 Kyle McMartin 2.6.30-0.28.rc0.git6 +- Linux 2.6.29-git6 +- Bunch of stuff disabled, most merged, some needs rebasing. + +* Mon Mar 30 2009 Chuck Ebbert +- Make the .shared-srctree file a list so more than two checkouts + can share source files. + +* Mon Mar 30 2009 Chuck Ebbert +- Separate PAT fixes that are headed for -stable from our out-of-tree ones. + +* Mon Mar 30 2009 Dave Jones +- Make io schedulers selectable at boot time again. (#492817) + +* Mon Mar 30 2009 Dave Jones +- Add a strict-devmem=0 boot argument (#492803) + +* Mon Mar 30 2009 Adam Jackson +- linux-2.6.29-pat-fixes.patch: Fix PAT/GTT interaction + +* Mon Mar 30 2009 Mauro Carvalho Chehab +- some fixes of troubles caused by v4l2 subdev conversion + +* Mon Mar 30 2009 Mark McLoughlin 2.6.29-21 +- Fix guest->remote network stall with virtio/GSO (#490266) + +* Mon Mar 30 2009 Ben Skeggs +- drm-nouveau.patch + - rewrite nouveau PCI(E) GART functions, should fix rh#492492 + - kms: kernel option to allow dual-link dvi + - modinfo descriptions for module parameters + +* Sun Mar 29 2009 Mauro Carvalho Chehab +- more v4l/dvb updates: v4l subdev conversion and some driver improvements + +* Sun Mar 29 2009 Chuck Ebbert +- More fixes for ALSA hardware pointer updating. 
+ +* Sat Mar 28 2009 Mauro Carvalho Chehab +- linux-2.6-revert-dvb-net-kabi-change.patch: attempt to fix dvb net breakage +- update v4l fixes patch to reflect what's ready for 2.6.30 +- update v4l devel patch to reflect what will be kept on linux-next for a while + +* Fri Mar 27 2009 Chuck Ebbert 2.6.29-16 +- Fix 2.6.29 networking lockups. +- Fix locking in net/xfrm/xfrm_state.c (#489764) + +* Fri Mar 27 2009 Ben Skeggs +- drm-nouveau.patch: do nothing for dac_{prepare,commit}, it's useless + and breaks some things in strange ways. + +* Fri Mar 27 2009 Ben Skeggs +- nv50: clear 0x1900/8 on init, possible fix for rh#492240 +- forcibly disable GEM also if KMS requested where not supported +- inform the user if we disable KMS because of it not being supported + +* Thu Mar 26 2009 Matthew Garrett +- linux-2.6-relatime-by-default.patch: Backport relatime code from 2.6.30 + +* Thu Mar 26 2009 Dave Jones +- Check for modesetting enabled before forcing mode on 915. (#490336) + +* Thu Mar 26 2009 Dave Jones +- Set kernel-PAE as default in grub. (#487578) + +* Thu Mar 26 2009 Dave Jones +- Enable CONFIG_MOUSE_PS2_ELANTECH (#492163) + +* Thu Mar 26 2009 Kyle McMartin +- linux-2.6-v4l-pvrusb2-fixes.patch: fix build for uncle steve. + +* Thu Mar 26 2009 Mauro Carvalho Chehab +- Move all 2.6.30 stuff into linux-2.6-v4l-dvb-fixes.patch, in + preparation for upstream pull; +- Added two new drivers: gspca sq905c and DVB Intel ce6230 +- Updated to the latest v4l-dvb drivers. + +* Wed Mar 25 2009 Mauro Carvalho Chehab +- remove duplicated Cinergy T2 entry at config-generic + +* Wed Mar 25 2009 Neil Horman +- Add dropmonitor/dropwatch protocol from 2.6.30 + +* Wed Mar 25 2009 Kyle McMartin +- alsa-rewrite-hw_ptr-updaters.patch: snd_pcm_update_hw_ptr() tries to + detect the unexpected hwptr jumps more strictly to avoid the position + mess-up, which often results in the bad quality I/O with pulseaudio. + +* Wed Mar 25 2009 Ben Skeggs +- drm-nouveau.patch: idle channels better before destroying them + +* Tue Mar 24 2009 Kyle McMartin +- Disable DMAR by default until suspend & resume is fixed. + +* Tue Mar 24 2009 Josef Bacik +- fsync replay fixes for btrfs + +* Mon Mar 23 2009 Dave Jones +- 2.6.29 + +### +# The following Emacs magic makes C-c C-e use UTC dates. 
+# Local Variables: +# rpm-change-log-uses-utc: t +# End: +### diff --git a/linux-2.6-btrfs-upstream.patch b/linux-2.6-btrfs-upstream.patch new file mode 100644 index 000000000..d309773f2 --- /dev/null +++ b/linux-2.6-btrfs-upstream.patch @@ -0,0 +1,10828 @@ +diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c +index f128427..3616042 100644 +--- a/fs/btrfs/acl.c ++++ b/fs/btrfs/acl.c +@@ -27,7 +27,7 @@ + #include "btrfs_inode.h" + #include "xattr.h" + +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + + static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) + { +@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { + .set = btrfs_xattr_acl_access_set, + }; + +-#else /* CONFIG_FS_POSIX_ACL */ ++#else /* CONFIG_BTRFS_FS_POSIX_ACL */ + + int btrfs_acl_chmod(struct inode *inode) + { +@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) + return 0; + } + +-#endif /* CONFIG_FS_POSIX_ACL */ ++#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ +diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c +index 019e8af..c0861e7 100644 +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -48,6 +48,9 @@ struct btrfs_worker_thread { + /* number of things on the pending list */ + atomic_t num_pending; + ++ /* reference counter for this struct */ ++ atomic_t refs; ++ + unsigned long sequence; + + /* protects the pending list. */ +@@ -61,6 +64,51 @@ struct btrfs_worker_thread { + }; + + /* ++ * btrfs_start_workers uses kthread_run, which can block waiting for memory ++ * for a very long time. It will actually throttle on page writeback, ++ * and so it may not make progress until after our btrfs worker threads ++ * process all of the pending work structs in their queue ++ * ++ * This means we can't use btrfs_start_workers from inside a btrfs worker ++ * thread that is used as part of cleaning dirty memory, which pretty much ++ * involves all of the worker threads. ++ * ++ * Instead we have a helper queue who never has more than one thread ++ * where we scheduler thread start operations. This worker_start struct ++ * is used to contain the work and hold a pointer to the queue that needs ++ * another worker. ++ */ ++struct worker_start { ++ struct btrfs_work work; ++ struct btrfs_workers *queue; ++}; ++ ++static void start_new_worker_func(struct btrfs_work *work) ++{ ++ struct worker_start *start; ++ start = container_of(work, struct worker_start, work); ++ btrfs_start_workers(start->queue, 1); ++ kfree(start); ++} ++ ++static int start_new_worker(struct btrfs_workers *queue) ++{ ++ struct worker_start *start; ++ int ret; ++ ++ start = kzalloc(sizeof(*start), GFP_NOFS); ++ if (!start) ++ return -ENOMEM; ++ ++ start->work.func = start_new_worker_func; ++ start->queue = queue; ++ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); ++ if (ret) ++ kfree(start); ++ return ret; ++} ++ ++/* + * helper function to move a thread onto the idle list after it + * has finished some requests. 
+ */ +@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; +- list_move(&worker->worker_list, &worker->workers->idle_list); ++ ++ /* the list may be empty if the worker is just starting */ ++ if (!list_empty(&worker->worker_list)) { ++ list_move(&worker->worker_list, ++ &worker->workers->idle_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } +@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; +- list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ ++ if (!list_empty(&worker->worker_list)) { ++ list_move_tail(&worker->worker_list, ++ &worker->workers->worker_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } + +-static noinline int run_ordered_completions(struct btrfs_workers *workers, +- struct btrfs_work *work) ++static void check_pending_worker_creates(struct btrfs_worker_thread *worker) + { ++ struct btrfs_workers *workers = worker->workers; + unsigned long flags; + ++ rmb(); ++ if (!workers->atomic_start_pending) ++ return; ++ ++ spin_lock_irqsave(&workers->lock, flags); ++ if (!workers->atomic_start_pending) ++ goto out; ++ ++ workers->atomic_start_pending = 0; ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) ++ goto out; ++ ++ workers->num_workers_starting += 1; ++ spin_unlock_irqrestore(&workers->lock, flags); ++ start_new_worker(workers); ++ return; ++ ++out: ++ spin_unlock_irqrestore(&workers->lock, flags); ++} ++ ++static noinline int run_ordered_completions(struct btrfs_workers *workers, ++ struct btrfs_work *work) ++{ + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + + while (1) { + if (!list_empty(&workers->prio_order_list)) { +@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + list_del(&work->order_list); + work->ordered_free(work); + } + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + return 0; + } + ++static void put_worker(struct btrfs_worker_thread *worker) ++{ ++ if (atomic_dec_and_test(&worker->refs)) ++ kfree(worker); ++} ++ ++static int try_worker_shutdown(struct btrfs_worker_thread *worker) ++{ ++ int freeit = 0; ++ ++ spin_lock_irq(&worker->lock); ++ spin_lock(&worker->workers->lock); ++ if (worker->workers->num_workers > 1 && ++ worker->idle && ++ !worker->working && ++ !list_empty(&worker->worker_list) && ++ list_empty(&worker->prio_pending) && ++ list_empty(&worker->pending) && ++ atomic_read(&worker->num_pending) == 0) { ++ freeit = 1; ++ list_del_init(&worker->worker_list); ++ worker->workers->num_workers--; ++ } ++ spin_unlock(&worker->workers->lock); ++ spin_unlock_irq(&worker->lock); ++ ++ if (freeit) ++ put_worker(worker); ++ return freeit; ++} ++ ++static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, ++ struct list_head *prio_head, ++ struct 
list_head *head) ++{ ++ struct btrfs_work *work = NULL; ++ struct list_head *cur = NULL; ++ ++ if(!list_empty(prio_head)) ++ cur = prio_head->next; ++ ++ smp_mb(); ++ if (!list_empty(&worker->prio_pending)) ++ goto refill; ++ ++ if (!list_empty(head)) ++ cur = head->next; ++ ++ if (cur) ++ goto out; ++ ++refill: ++ spin_lock_irq(&worker->lock); ++ list_splice_tail_init(&worker->prio_pending, prio_head); ++ list_splice_tail_init(&worker->pending, head); ++ ++ if (!list_empty(prio_head)) ++ cur = prio_head->next; ++ else if (!list_empty(head)) ++ cur = head->next; ++ spin_unlock_irq(&worker->lock); ++ ++ if (!cur) ++ goto out_fail; ++ ++out: ++ work = list_entry(cur, struct btrfs_work, list); ++ ++out_fail: ++ return work; ++} ++ + /* + * main loop for servicing work items + */ + static int worker_loop(void *arg) + { + struct btrfs_worker_thread *worker = arg; +- struct list_head *cur; ++ struct list_head head; ++ struct list_head prio_head; + struct btrfs_work *work; ++ ++ INIT_LIST_HEAD(&head); ++ INIT_LIST_HEAD(&prio_head); ++ + do { +- spin_lock_irq(&worker->lock); +-again_locked: ++again: + while (1) { +- if (!list_empty(&worker->prio_pending)) +- cur = worker->prio_pending.next; +- else if (!list_empty(&worker->pending)) +- cur = worker->pending.next; +- else ++ ++ ++ work = get_next_work(worker, &prio_head, &head); ++ if (!work) + break; + +- work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; +- spin_unlock_irq(&worker->lock); + + work->func(work); + +@@ -175,9 +329,13 @@ again_locked: + */ + run_ordered_completions(worker->workers, work); + +- spin_lock_irq(&worker->lock); +- check_idle_worker(worker); ++ check_pending_worker_creates(worker); ++ + } ++ ++ spin_lock_irq(&worker->lock); ++ check_idle_worker(worker); ++ + if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); +@@ -216,8 +374,10 @@ again_locked: + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending) || +- !list_empty(&worker->prio_pending)) +- goto again_locked; ++ !list_empty(&worker->prio_pending)) { ++ spin_unlock_irq(&worker->lock); ++ goto again; ++ } + + /* + * this makes sure we get a wakeup when someone +@@ -226,8 +386,13 @@ again_locked: + worker->working = 0; + spin_unlock_irq(&worker->lock); + +- if (!kthread_should_stop()) +- schedule(); ++ if (!kthread_should_stop()) { ++ schedule_timeout(HZ * 120); ++ if (!worker->working && ++ try_worker_shutdown(worker)) { ++ return 0; ++ } ++ } + } + __set_current_state(TASK_RUNNING); + } +@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers) + { + struct list_head *cur; + struct btrfs_worker_thread *worker; ++ int can_stop; + ++ spin_lock_irq(&workers->lock); + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); +- kthread_stop(worker->task); +- list_del(&worker->worker_list); +- kfree(worker); ++ ++ atomic_inc(&worker->refs); ++ workers->num_workers -= 1; ++ if (!list_empty(&worker->worker_list)) { ++ list_del_init(&worker->worker_list); ++ put_worker(worker); ++ can_stop = 1; ++ } else ++ can_stop = 0; ++ spin_unlock_irq(&workers->lock); ++ if (can_stop) ++ kthread_stop(worker->task); ++ spin_lock_irq(&workers->lock); ++ put_worker(worker); + } ++ spin_unlock_irq(&workers->lock); + return 0; + } + + /* + * simple 
init on struct btrfs_workers + */ +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_helper) + { + workers->num_workers = 0; ++ workers->num_workers_starting = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); + spin_lock_init(&workers->lock); ++ spin_lock_init(&workers->order_lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; ++ workers->atomic_start_pending = 0; ++ workers->atomic_worker_start = async_helper; + } + + /* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++static int __btrfs_start_workers(struct btrfs_workers *workers, ++ int num_workers) + { + struct btrfs_worker_thread *worker; + int ret = 0; +@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); ++ + atomic_set(&worker->num_pending, 0); ++ atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, +@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + kfree(worker); + goto fail; + } +- + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; ++ workers->num_workers_starting--; ++ WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + } + return 0; +@@ -316,6 +504,14 @@ fail: + return ret; + } + ++int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++{ ++ spin_lock_irq(&workers->lock); ++ workers->num_workers_starting += num_workers; ++ spin_unlock_irq(&workers->lock); ++ return __btrfs_start_workers(workers, num_workers); ++} ++ + /* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. 
This can return null if we aren't yet at the thread +@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + struct list_head *next; +- int enforce_min = workers->num_workers < workers->max_workers; ++ int enforce_min; ++ ++ enforce_min = (workers->num_workers + workers->num_workers_starting) < ++ workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the +@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); +- atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) +@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + unsigned long flags; ++ struct list_head *fallback; + + again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); +- spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { +- spin_lock_irqsave(&workers->lock, flags); +- if (workers->num_workers >= workers->max_workers) { +- struct list_head *fallback = NULL; +- /* +- * we have failed to find any workers, just +- * return the force one +- */ +- if (!list_empty(&workers->worker_list)) +- fallback = workers->worker_list.next; +- if (!list_empty(&workers->idle_list)) +- fallback = workers->idle_list.next; +- BUG_ON(!fallback); +- worker = list_entry(fallback, +- struct btrfs_worker_thread, worker_list); +- spin_unlock_irqrestore(&workers->lock, flags); ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) { ++ goto fallback; ++ } else if (workers->atomic_worker_start) { ++ workers->atomic_start_pending = 1; ++ goto fallback; + } else { ++ workers->num_workers_starting++; + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ +- btrfs_start_workers(workers, 1); ++ __btrfs_start_workers(workers, 1); + goto again; + } + } ++ goto found; ++ ++fallback: ++ fallback = NULL; ++ /* ++ * we have failed to find any workers, just ++ * return the first one we can find. 
++ */ ++ if (!list_empty(&workers->worker_list)) ++ fallback = workers->worker_list.next; ++ if (!list_empty(&workers->idle_list)) ++ fallback = workers->idle_list.next; ++ BUG_ON(!fallback); ++ worker = list_entry(fallback, ++ struct btrfs_worker_thread, worker_list); ++found: ++ /* ++ * this makes sure the worker doesn't exit before it is placed ++ * onto a busy/idle list ++ */ ++ atomic_inc(&worker->num_pending); ++ spin_unlock_irqrestore(&workers->lock, flags); + return worker; + } + +@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work) + spin_lock(&worker->workers->lock); + worker->idle = 0; + list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ &worker->workers->worker_list); + spin_unlock(&worker->workers->lock); + } + if (!worker->working) { +@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work) + worker->working = 1; + } + +- spin_unlock_irqrestore(&worker->lock, flags); + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); + out: + + return 0; +@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + + worker = find_worker(workers); + if (workers->ordered) { +- spin_lock_irqsave(&workers->lock, flags); ++ /* ++ * you're not allowed to do ordered queues from an ++ * interrupt handler ++ */ ++ spin_lock(&workers->order_lock); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + } else { + INIT_LIST_HEAD(&work->order_list); + } +@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); +- atomic_inc(&worker->num_pending); + check_busy_worker(worker); + + /* +@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + wake = 1; + worker->working = 1; + +- spin_unlock_irqrestore(&worker->lock, flags); +- + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); ++ + out: + return 0; + } +diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h +index 1b511c1..5077746 100644 +--- a/fs/btrfs/async-thread.h ++++ b/fs/btrfs/async-thread.h +@@ -64,6 +64,8 @@ struct btrfs_workers { + /* current number of running workers */ + int num_workers; + ++ int num_workers_starting; ++ + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + +@@ -73,6 +75,16 @@ struct btrfs_workers { + /* force completions in the order they were queued */ + int ordered; + ++ /* more workers required, but in an interrupt handler */ ++ int atomic_start_pending; ++ ++ /* ++ * are we allowed to sleep while starting workers or are we required ++ * to start them at a later time? If we can't sleep, this indicates ++ * which queue we need to use to schedule thread creation. ++ */ ++ struct btrfs_workers *atomic_worker_start; ++ + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. 
+@@ -90,6 +102,9 @@ struct btrfs_workers { + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + ++ /* lock for the ordered lists */ ++ spinlock_t order_lock; ++ + /* extra name for this worker, used for current->name */ + char *name; + }; +@@ -97,7 +112,8 @@ struct btrfs_workers { + int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); + int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); + int btrfs_stop_workers(struct btrfs_workers *workers); +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_starter); + int btrfs_requeue_work(struct btrfs_work *work); + void btrfs_set_work_high_prio(struct btrfs_work *work); + #endif +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index ea1ea0a..f6783a4 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -86,6 +86,12 @@ struct btrfs_inode { + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; ++ ++ /* ++ * log transid when this inode was last modified ++ */ ++ u64 last_sub_trans; ++ + /* + * transid that last logged this inode + */ +@@ -128,6 +134,16 @@ struct btrfs_inode { + u64 last_unlink_trans; + + /* ++ * Counters to keep track of the number of extent item's we may use due ++ * to delalloc and such. outstanding_extents is the number of extent ++ * items we think we'll end up using, and reserved_extents is the number ++ * of extent items we've reserved metadata for. ++ */ ++ spinlock_t accounting_lock; ++ int reserved_extents; ++ int outstanding_extents; ++ ++ /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the +@@ -138,6 +154,7 @@ struct btrfs_inode { + * of these. 
+ */ + unsigned ordered_data_close:1; ++ unsigned dummy_inode:1; + + struct inode vfs_inode; + }; +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index 9d8ba4d..a11a320 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || +@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 3fdcc05..ec96f3a 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, + int split; + int num_doubles = 0; + ++ l = path->nodes[0]; ++ slot = path->slots[0]; ++ if (extend && data_size + btrfs_item_size_nr(l, slot) + ++ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) ++ return -EOVERFLOW; ++ + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 837435c..e5dd628 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -114,6 +114,10 @@ struct btrfs_ordered_sum; + */ + #define BTRFS_DEV_ITEMS_OBJECTID 1ULL + ++#define BTRFS_BTREE_INODE_OBJECTID 1 ++ ++#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 ++ + /* + * we can actually store much bigger names, but lets not confuse the rest + * of linux +@@ -670,21 +674,29 @@ struct btrfs_space_info { + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ +- +- /* delalloc accounting */ +- u64 bytes_delalloc; /* number of bytes reserved for allocation, +- this space is not necessarily reserved yet +- by the allocator */ ++ u64 bytes_super; /* total bytes reserved for the super blocks */ ++ u64 bytes_root; /* the number of bytes needed to commit a ++ transaction */ + u64 bytes_may_use; /* number of bytes that may be used for +- delalloc */ ++ delalloc/allocations */ ++ u64 bytes_delalloc; /* number of bytes currently reserved for ++ delayed allocation */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ ++ int force_delalloc; /* make people start doing filemap_flush until ++ we're under a threshold */ + + struct list_head list; + ++ /* for controlling how we free up space for allocations */ ++ wait_queue_head_t allocate_wait; ++ wait_queue_head_t flush_wait; ++ int allocating_chunk; ++ int flushing; ++ + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t 
lock; +@@ -726,6 +738,15 @@ enum btrfs_caching_type { + BTRFS_CACHE_FINISHED = 2, + }; + ++struct btrfs_caching_control { ++ struct list_head list; ++ struct mutex mutex; ++ wait_queue_head_t wait; ++ struct btrfs_block_group_cache *block_group; ++ u64 progress; ++ atomic_t count; ++}; ++ + struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; +@@ -733,6 +754,7 @@ struct btrfs_block_group_cache { + spinlock_t lock; + u64 pinned; + u64 reserved; ++ u64 bytes_super; + u64 flags; + u64 sectorsize; + int extents_thresh; +@@ -742,8 +764,9 @@ struct btrfs_block_group_cache { + int dirty; + + /* cache tracking stuff */ +- wait_queue_head_t caching_q; + int cached; ++ struct btrfs_caching_control *caching_ctl; ++ u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + +@@ -782,13 +805,16 @@ struct btrfs_fs_info { + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; ++ ++ spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + +- struct extent_io_tree pinned_extents; ++ struct extent_io_tree freed_extents[2]; ++ struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; +@@ -822,11 +848,7 @@ struct btrfs_fs_info { + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; +- struct mutex drop_mutex; + struct mutex volume_mutex; +- struct mutex tree_reloc_mutex; +- struct rw_semaphore extent_commit_sem; +- + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make +@@ -835,10 +857,16 @@ struct btrfs_fs_info { + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; ++ struct rw_semaphore extent_commit_sem; ++ ++ struct rw_semaphore subvol_sem; ++ ++ struct srcu_struct subvol_srcu; + + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; ++ struct list_head caching_block_groups; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; +@@ -882,6 +910,7 @@ struct btrfs_fs_info { + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ ++ struct btrfs_workers generic_worker; + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; +@@ -889,6 +918,7 @@ struct btrfs_fs_info { + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; ++ struct btrfs_workers enospc_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. 
It happens +@@ -979,7 +1009,10 @@ struct btrfs_root { + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; ++ unsigned long last_log_commit; + unsigned long log_batch; ++ pid_t log_start_pid; ++ bool log_multiple_pids; + + u64 objectid; + u64 last_trans; +@@ -996,10 +1029,12 @@ struct btrfs_root { + u32 stripesize; + + u32 type; +- u64 highest_inode; +- u64 last_inode_alloc; ++ ++ u64 highest_objectid; + int ref_cows; + int track_dirty; ++ int in_radix; ++ + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; +@@ -1118,6 +1153,7 @@ struct btrfs_root { + #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) + #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) + #define BTRFS_MOUNT_NOSSD (1 << 9) ++#define BTRFS_MOUNT_DISCARD (1 << 10) + + #define BTRFS_MOUNT_TAGGED (1 << 24) + +@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); + int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); + int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin); ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num, int reserved); + int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); + int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, +@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset); + + int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root); + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin); ++ struct btrfs_root *root); + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); + int btrfs_free_block_groups(struct btrfs_fs_info *info); + int btrfs_read_block_groups(struct btrfs_root *root); ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); + int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, +@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); + void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); + void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +-int btrfs_check_metadata_free_space(struct btrfs_root *root); ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); + int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); + void btrfs_free_reserved_data_space(struct btrfs_root *root, +@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); + void btrfs_delalloc_free_space(struct btrfs_root *root, 
struct inode *inode, + u64 bytes); +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info); + /* ctree.c */ + int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct extent_buffer *parent); + /* root-item.c */ + int btrfs_find_root_ref(struct btrfs_root *tree_root, +- struct btrfs_path *path, +- u64 root_id, u64 ref_id); ++ struct btrfs_path *path, ++ u64 root_id, u64 ref_id); + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, ++ const char *name, int name_len); ++int btrfs_del_root_ref(struct btrfs_trans_handle *trans, ++ struct btrfs_root *tree_root, ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); + int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root); + int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); + /* dir-item.c */ +@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len); + struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + + /* inode-map.c */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, +@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len); + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, +@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); + int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint); + int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); +@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait); + void btrfs_dirty_inode(struct inode *inode); + struct inode *btrfs_alloc_inode(struct super_block *sb); + void 
btrfs_destroy_inode(struct inode *inode); ++void btrfs_drop_inode(struct inode *inode); + int btrfs_init_cachep(void); + void btrfs_destroy_cachep(void); + long btrfs_ioctl_trans_end(struct file *file); +@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); + int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); + void btrfs_orphan_cleanup(struct btrfs_root *root); + int btrfs_cont_expand(struct inode *inode, loff_t size); ++int btrfs_invalidate_inodes(struct btrfs_root *root); ++extern const struct dentry_operations btrfs_dentry_operations; + + /* ioctl.c */ + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +@@ -2290,7 +2348,7 @@ extern struct file_operations btrfs_file_operations; + int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_block); ++ u64 inline_limit, u64 *hint_block, int drop_cache); + int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); + int btrfs_sync_fs(struct super_block *sb, int wait); + + /* acl.c */ +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + int btrfs_check_acl(struct inode *inode, int mask); + #else + #define btrfs_check_acl NULL +diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c +index 1d70236..f3a6075 100644 +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + return btrfs_match_dir_item_name(root, path, name, name_len); + } + ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u32 nritems; ++ int ret; ++ ++ key.objectid = dirid; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = 0; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ ++ while (1) { ++ if (path->slots[0] >= nritems) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ if (ret > 0) ++ break; ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ continue; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) ++ break; ++ ++ di = btrfs_match_dir_item_name(root, path, name, name_len); ++ if (di) ++ return di; ++ ++ path->slots[0]++; ++ } ++ return NULL; ++} ++ + struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index e83be2e..d4132aa 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -41,6 +41,7 @@ + + static struct extent_io_ops btree_extent_io_ops; + static void end_workqueue_fn(struct btrfs_work *work); ++static void free_fs_root(struct btrfs_root *root); + + static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); + +@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode, + struct extent_map *em; + int ret; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, 
start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + goto out; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { +@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; +@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +@@ -828,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf) + int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) + { + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, +- buf->start, buf->start + buf->len - 1); ++ buf->start >> PAGE_CACHE_SHIFT, ++ (buf->start + buf->len - 1) >> ++ PAGE_CACHE_SHIFT); + } + + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, +@@ -895,8 +898,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; +- root->highest_inode = 0; +- root->last_inode_alloc = 0; ++ root->highest_objectid = 0; + root->name = NULL; + root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; +@@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; ++ root->last_log_commit = 0; + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + +@@ -952,14 +955,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); ++ if (ret > 0) ++ return -ENOENT; + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); +- root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); ++ root->commit_root = btrfs_root_node(root); + return 0; + } + +@@ -1085,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; ++ root->last_log_commit = 0; + return 0; + } + +@@ -1095,7 +1101,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; +- u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; +@@ -1110,7 +1115,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + kfree(root); + return ERR_PTR(ret); + } +- goto insert; ++ goto out; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, +@@ -1120,39 +1125,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); +- if (ret != 0) { +- if (ret > 0) +- ret = -ENOENT; +- goto out; ++ if (ret == 0) { 
++ l = path->nodes[0]; ++ read_extent_buffer(l, &root->root_item, ++ btrfs_item_ptr_offset(l, path->slots[0]), ++ sizeof(root->root_item)); ++ memcpy(&root->root_key, location, sizeof(*location)); + } +- l = path->nodes[0]; +- read_extent_buffer(l, &root->root_item, +- btrfs_item_ptr_offset(l, path->slots[0]), +- sizeof(root->root_item)); +- memcpy(&root->root_key, location, sizeof(*location)); +- ret = 0; +-out: +- btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { +- kfree(root); ++ if (ret > 0) ++ ret = -ENOENT; + return ERR_PTR(ret); + } ++ + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); +-insert: +- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { ++out: ++ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + root->ref_cows = 1; +- ret = btrfs_find_highest_inode(root, &highest_inode); +- if (ret == 0) { +- root->highest_inode = highest_inode; +- root->last_inode_alloc = highest_inode; +- } +- } ++ + return root; + } + +@@ -1187,39 +1183,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; +- ++again: ++ spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); + if (root) + return root; + ++ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); ++ if (ret == 0) ++ ret = -ENOENT; ++ if (ret < 0) ++ return ERR_PTR(ret); ++ + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + ++ WARN_ON(btrfs_root_refs(&root->root_item) == 0); + set_anon_super(&root->anon_super, NULL); + ++ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); ++ if (ret) ++ goto fail; ++ ++ spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); ++ if (ret == 0) ++ root->in_radix = 1; ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ radix_tree_preload_end(); + if (ret) { +- free_extent_buffer(root->node); +- kfree(root); +- return ERR_PTR(ret); ++ if (ret == -EEXIST) { ++ free_fs_root(root); ++ goto again; ++ } ++ goto fail; + } +- if (!(fs_info->sb->s_flags & MS_RDONLY)) { +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root->root_key.objectid); +- BUG_ON(ret); ++ ++ ret = btrfs_find_dead_roots(fs_info->tree_root, ++ root->root_key.objectid); ++ WARN_ON(ret); ++ ++ if (!(fs_info->sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(root); +- } ++ + return root; ++fail: ++ free_fs_root(root); ++ return ERR_PTR(ret); + } + + struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) + { ++ return btrfs_read_fs_root_no_name(fs_info, location); ++#if 0 + struct btrfs_root *root; + int ret; + +@@ -1236,7 +1259,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#if 0 ++ + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); +@@ -1244,9 +1267,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#endif + 
root->in_sysfs = 1; + return root; ++#endif + } + + static int btrfs_congested_fn(void *congested_data, int bdi_bits) +@@ -1325,9 +1348,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; +@@ -1359,8 +1382,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) + + err = bdi_register(bdi, NULL, "btrfs-%d", + atomic_inc_return(&btrfs_bdi_num)); +- if (err) ++ if (err) { ++ bdi_destroy(bdi); + return err; ++ } + + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->unplug_io_fn = btrfs_unplug_io_fn; +@@ -1450,9 +1475,12 @@ static int cleaner_kthread(void *arg) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(root); +- mutex_unlock(&root->fs_info->cleaner_mutex); ++ ++ if (!(root->fs_info->sb->s_flags & MS_RDONLY) && ++ mutex_trylock(&root->fs_info->cleaner_mutex)) { ++ btrfs_clean_old_snapshots(root); ++ mutex_unlock(&root->fs_info->cleaner_mutex); ++ } + + if (freezing(current)) { + refrigerator(); +@@ -1557,15 +1585,36 @@ struct btrfs_root *open_ctree(struct super_block *sb, + err = -ENOMEM; + goto fail; + } +- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); ++ ++ ret = init_srcu_struct(&fs_info->subvol_srcu); ++ if (ret) { ++ err = ret; ++ goto fail; ++ } ++ ++ ret = setup_bdi(fs_info, &fs_info->bdi); ++ if (ret) { ++ err = ret; ++ goto fail_srcu; ++ } ++ ++ fs_info->btree_inode = new_inode(sb); ++ if (!fs_info->btree_inode) { ++ err = -ENOMEM; ++ goto fail_bdi; ++ } ++ ++ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); ++ INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); ++ spin_lock_init(&fs_info->fs_roots_radix_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; +@@ -1584,12 +1633,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; +- if (setup_bdi(fs_info, &fs_info->bdi)) +- goto fail_bdi; +- fs_info->btree_inode = new_inode(sb); +- fs_info->btree_inode->i_ino = 1; +- fs_info->btree_inode->i_nlink = 1; +- fs_info->metadata_ratio = 8; ++ fs_info->metadata_ratio = 0; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); +@@ -1600,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + ++ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; ++ fs_info->btree_inode->i_nlink = 1; + /* + * we set the i_size on the btree inode to the max possible int. 
+ * the real end of the address space is determined by all of +@@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + ++ BTRFS_I(fs_info->btree_inode)->root = tree_root; ++ memset(&BTRFS_I(fs_info->btree_inode)->location, 0, ++ sizeof(struct btrfs_key)); ++ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; ++ insert_inode_hash(fs_info->btree_inode); ++ + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + +- extent_io_tree_init(&fs_info->pinned_extents, ++ extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping, GFP_NOFS); ++ extent_io_tree_init(&fs_info->freed_extents[1], ++ fs_info->btree_inode->i_mapping, GFP_NOFS); ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + +- BTRFS_I(fs_info->btree_inode)->root = tree_root; +- memset(&BTRFS_I(fs_info->btree_inode)->location, 0, +- sizeof(struct btrfs_key)); +- insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); +- mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); +- mutex_init(&fs_info->tree_reloc_mutex); + init_rwsem(&fs_info->extent_commit_sem); ++ init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); +@@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb, + goto fail_iput; + } + +- /* +- * we need to start all the end_io workers up front because the +- * queue work function gets called at interrupt time, and so it +- * cannot dynamically grow. 
+- */ ++ btrfs_init_workers(&fs_info->generic_worker, ++ "genwork", 1, NULL); ++ + btrfs_init_workers(&fs_info->workers, "worker", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, +- fs_info->thread_pool_size)); ++ fs_info->thread_pool_size), ++ &fs_info->generic_worker); ++ btrfs_init_workers(&fs_info->enospc_workers, "enospc", ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the +@@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + +- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); ++ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_workers, "endio", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_write_workers, +- "endio-meta-write", fs_info->thread_pool_size); ++ "endio-meta-write", fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very +@@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + +- fs_info->endio_write_workers.idle_thresh = 64; +- fs_info->endio_meta_write_workers.idle_thresh = 64; ++ fs_info->endio_write_workers.idle_thresh = 2; ++ fs_info->endio_meta_write_workers.idle_thresh = 2; + + btrfs_start_workers(&fs_info->workers, 1); ++ btrfs_start_workers(&fs_info->generic_worker, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); +- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_write_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_write_workers, +- fs_info->thread_pool_size); ++ btrfs_start_workers(&fs_info->endio_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); ++ btrfs_start_workers(&fs_info->endio_write_workers, 1); ++ btrfs_start_workers(&fs_info->enospc_workers, 1); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, +@@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, + } + } + ++ ret = btrfs_find_orphan_roots(tree_root); ++ BUG_ON(ret); ++ + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_recover_relocation(tree_root); + BUG_ON(ret); +@@ -1959,6 +2020,7 @@ fail_chunk_root: + free_extent_buffer(chunk_root->node); + 
free_extent_buffer(chunk_root->commit_root); + fail_sb_buffer: ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -1967,6 +2029,7 @@ fail_sb_buffer: + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ btrfs_stop_workers(&fs_info->enospc_workers); + fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +@@ -1975,6 +2038,8 @@ fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + fail_bdi: + bdi_destroy(&fs_info->bdi); ++fail_srcu: ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + fail: + kfree(extent_root); + kfree(tree_root); +@@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans, + + int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) + { +- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); ++ spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ ++ if (btrfs_root_refs(&root->root_item) == 0) ++ synchronize_srcu(&fs_info->subvol_srcu); ++ ++ free_fs_root(root); ++ return 0; ++} ++ ++static void free_fs_root(struct btrfs_root *root) ++{ ++ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } +- if (root->node) +- free_extent_buffer(root->node); +- if (root->commit_root) +- free_extent_buffer(root->commit_root); ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); +- return 0; + } + + static int del_fs_roots(struct btrfs_fs_info *fs_info) +@@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) + struct btrfs_root *gang[8]; + int i; + ++ while (!list_empty(&fs_info->dead_roots)) { ++ gang[0] = list_entry(fs_info->dead_roots.next, ++ struct btrfs_root, root_list); ++ list_del(&gang[0]->root_list); ++ ++ if (gang[0]->in_radix) { ++ btrfs_free_fs_root(fs_info, gang[0]); ++ } else { ++ free_extent_buffer(gang[0]->node); ++ free_extent_buffer(gang[0]->commit_root); ++ kfree(gang[0]); ++ } ++ } ++ + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, +@@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) + root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root_objectid); +- BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + } + root_objectid++; +@@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root) + free_extent_buffer(root->fs_info->csum_root->commit_root); + + btrfs_free_block_groups(root->fs_info); +- btrfs_free_pinned_extents(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root) + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ 
btrfs_stop_workers(&fs_info->enospc_workers); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); +diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c +index 9596b40..ba5c3fd 100644 +--- a/fs/btrfs/export.c ++++ b/fs/btrfs/export.c +@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + +- fid->objectid = BTRFS_I(inode)->location.objectid; ++ fid->objectid = inode->i_ino; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + +@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + } + + static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, +- u64 root_objectid, u32 generation) ++ u64 root_objectid, u32 generation, ++ int check_generation) + { ++ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_root *root; ++ struct dentry *dentry; + struct inode *inode; + struct btrfs_key key; ++ int index; ++ int err = 0; ++ ++ if (objectid < BTRFS_FIRST_FREE_OBJECTID) ++ return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + +- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ index = srcu_read_lock(&fs_info->subvol_srcu); ++ ++ root = btrfs_read_fs_root_no_name(fs_info, &key); ++ if (IS_ERR(root)) { ++ err = PTR_ERR(root); ++ goto fail; ++ } ++ ++ if (btrfs_root_refs(&root->root_item) == 0) { ++ err = -ENOENT; ++ goto fail; ++ } + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root); +- if (IS_ERR(inode)) +- return (void *)inode; ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto fail; ++ } ++ ++ srcu_read_unlock(&fs_info->subvol_srcu, index); + +- if (generation != inode->i_generation) { ++ if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + +- return d_obtain_alias(inode); ++ dentry = d_obtain_alias(inode); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ srcu_read_unlock(&fs_info->subvol_srcu, index); ++ return ERR_PTR(err); + } + + static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, +@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + objectid = fid->parent_objectid; + generation = fid->parent_gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, +@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + root_objectid = fid->root_objectid; + generation = fid->gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_get_parent(struct dentry *child) + { + struct inode *dir = child->d_inode; ++ static struct dentry *dentry; + struct btrfs_root *root = BTRFS_I(dir)->root; +- struct btrfs_key key; + struct btrfs_path *path; + struct 
extent_buffer *leaf; +- int slot; +- u64 objectid; ++ struct btrfs_root_ref *ref; ++ struct btrfs_key key; ++ struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + +- key.objectid = dir->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); +- key.offset = (u64)-1; ++ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_BACKREF_KEY; ++ key.offset = (u64)-1; ++ root = root->fs_info->tree_root; ++ } else { ++ key.objectid = dir->i_ino; ++ key.type = BTRFS_INODE_REF_KEY; ++ key.offset = (u64)-1; ++ } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) { +- /* Error */ +- btrfs_free_path(path); +- return ERR_PTR(ret); ++ if (ret < 0) ++ goto fail; ++ ++ BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = -ENOENT; ++ goto fail; + } ++ ++ path->slots[0]--; + leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (ret) { +- /* btrfs_search_slot() returns the slot where we'd want to +- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. +- The _real_ backref, telling us what the parent inode +- _actually_ is, will be in the slot _before_ the one +- that btrfs_search_slot() returns. */ +- if (!slot) { +- /* Unless there is _no_ key in the tree before... */ +- btrfs_free_path(path); +- return ERR_PTR(-EIO); +- } +- slot--; ++ ++ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); ++ if (found_key.objectid != key.objectid || found_key.type != key.type) { ++ ret = -ENOENT; ++ goto fail; + } + +- btrfs_item_key_to_cpu(leaf, &key, slot); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ key.objectid = btrfs_root_ref_dirid(leaf, ref); ++ } else { ++ key.objectid = found_key.offset; ++ } + btrfs_free_path(path); + +- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) +- return ERR_PTR(-EINVAL); +- +- objectid = key.offset; +- +- /* If we are already at the root of a subvol, return the real root */ +- if (objectid == dir->i_ino) +- return dget(dir->i_sb->s_root); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ return btrfs_get_dentry(root->fs_info->sb, key.objectid, ++ found_key.offset, 0, 0); ++ } + +- /* Build a new key for the inode item */ +- key.objectid = objectid; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; +- +- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ btrfs_free_path(path); ++ return ERR_PTR(ret); + } + + const struct export_operations btrfs_export_ops = { +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 72a2b9c..c56f916 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -32,12 +32,12 @@ + #include "locking.h" + #include "free-space-cache.h" + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve); + static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct 
btrfs_trans_handle *trans, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); +- + static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); ++static int pin_down_bytes(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, ++ struct extent_buffer **must_clean); ++static int find_next_key(struct btrfs_path *path, int level, ++ struct btrfs_key *key); ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups); + + static noinline int + block_group_cache_done(struct btrfs_block_group_cache *cache) +@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + return ret; + } + +-/* +- * We always set EXTENT_LOCKED for the super mirror extents so we don't +- * overwrite them, so those bits need to be unset. Also, if we are unmounting +- * with pinned extents still sitting there because we had a block group caching, +- * we need to clear those now, since we are done. +- */ +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info) ++static int add_excluded_extent(struct btrfs_root *root, ++ u64 start, u64 num_bytes) + { +- u64 start, end, last = 0; +- int ret; ++ u64 end = start + num_bytes - 1; ++ set_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ set_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ return 0; ++} + +- while (1) { +- ret = find_first_extent_bit(&info->pinned_extents, last, +- &start, &end, +- EXTENT_LOCKED|EXTENT_DIRTY); +- if (ret) +- break; ++static void free_excluded_extents(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) ++{ ++ u64 start, end; + +- clear_extent_bits(&info->pinned_extents, start, end, +- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); +- last = end+1; +- } ++ start = cache->key.objectid; ++ end = start + cache->key.offset - 1; ++ ++ clear_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ clear_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); + } + +-static int remove_sb_from_cache(struct btrfs_root *root, +- struct btrfs_block_group_cache *cache) ++static int exclude_super_stripes(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) + { +- struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; +@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); ++ + while (nr--) { +- try_lock_extent(&fs_info->pinned_extents, +- logical[nr], +- logical[nr] + stripe_len - 1, GFP_NOFS); ++ cache->bytes_super += stripe_len; ++ ret = add_excluded_extent(root, logical[nr], ++ stripe_len); ++ BUG_ON(ret); + } ++ + kfree(logical); + } +- + return 0; + } + ++static struct btrfs_caching_control * ++get_caching_control(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *ctl; ++ ++ spin_lock(&cache->lock); ++ if (cache->cached != BTRFS_CACHE_STARTED) { ++ spin_unlock(&cache->lock); ++ return NULL; ++ } ++ ++ ctl = cache->caching_ctl; ++ atomic_inc(&ctl->count); ++ spin_unlock(&cache->lock); ++ return ctl; ++} ++ ++static void put_caching_control(struct btrfs_caching_control *ctl) ++{ ++ if (atomic_dec_and_test(&ctl->count)) ++ 
kfree(ctl); ++} ++ + /* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet +@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + int ret; + + while (start < end) { +- ret = find_first_extent_bit(&info->pinned_extents, start, ++ ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, +- EXTENT_DIRTY|EXTENT_LOCKED); ++ EXTENT_DIRTY | EXTENT_UPTODATE); + if (ret) + break; + +@@ -249,22 +283,27 @@ static int caching_kthread(void *data) + { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; +- u64 last = 0; ++ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; ++ struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; +- int ret = 0; +- struct btrfs_key key; + struct extent_buffer *leaf; +- int slot; ++ struct btrfs_key key; + u64 total_found = 0; +- +- BUG_ON(!fs_info); ++ u64 last = 0; ++ u32 nritems; ++ int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +- atomic_inc(&block_group->space_info->caching_threads); ++ exclude_super_stripes(extent_root, block_group); ++ spin_lock(&block_group->space_info->lock); ++ block_group->space_info->bytes_super += block_group->bytes_super; ++ spin_unlock(&block_group->space_info->lock); ++ + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); ++ + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent +@@ -277,74 +316,64 @@ static int caching_kthread(void *data) + + key.objectid = last; + key.offset = 0; +- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); ++ key.type = BTRFS_EXTENT_ITEM_KEY; + again: ++ mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + +- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); ++ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ + while (1) { + smp_mb(); +- if (block_group->fs_info->closing > 1) { ++ if (fs_info->closing > 1) { + last = (u64)-1; + break; + } + +- leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(leaf)) { +- ret = btrfs_next_leaf(fs_info->extent_root, path); +- if (ret < 0) +- goto err; +- else if (ret) ++ if (path->slots[0] < nritems) { ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ } else { ++ ret = find_next_key(path, 0, &key); ++ if (ret) + break; + +- if (need_resched() || +- btrfs_transaction_in_commit(fs_info)) { +- leaf = path->nodes[0]; +- +- /* this shouldn't happen, but if the +- * leaf is empty just move on. +- */ +- if (btrfs_header_nritems(leaf) == 0) +- break; +- /* +- * we need to copy the key out so that +- * we are sure the next search advances +- * us forward in the btree. 
+- */ +- btrfs_item_key_to_cpu(leaf, &key, 0); +- btrfs_release_path(fs_info->extent_root, path); +- up_read(&fs_info->extent_commit_sem); ++ caching_ctl->progress = last; ++ btrfs_release_path(extent_root, path); ++ up_read(&fs_info->extent_commit_sem); ++ mutex_unlock(&caching_ctl->mutex); ++ if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); +- goto again; +- } ++ else ++ cond_resched(); ++ goto again; ++ } + ++ if (key.objectid < block_group->key.objectid) { ++ path->slots[0]++; + continue; + } +- btrfs_item_key_to_cpu(leaf, &key, slot); +- if (key.objectid < block_group->key.objectid) +- goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + +- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { ++ if (key.type == BTRFS_EXTENT_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + last = key.objectid + key.offset; +- } + +- if (total_found > (1024 * 1024 * 2)) { +- total_found = 0; +- wake_up(&block_group->caching_q); ++ if (total_found > (1024 * 1024 * 2)) { ++ total_found = 0; ++ wake_up(&caching_ctl->wait); ++ } + } +-next: + path->slots[0]++; + } + ret = 0; +@@ -352,33 +381,65 @@ next: + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); ++ caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); ++ block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + + err: + btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); +- atomic_dec(&block_group->space_info->caching_threads); +- wake_up(&block_group->caching_q); + ++ free_excluded_extents(extent_root, block_group); ++ ++ mutex_unlock(&caching_ctl->mutex); ++ wake_up(&caching_ctl->wait); ++ ++ put_caching_control(caching_ctl); ++ atomic_dec(&block_group->space_info->caching_threads); + return 0; + } + + static int cache_block_group(struct btrfs_block_group_cache *cache) + { ++ struct btrfs_fs_info *fs_info = cache->fs_info; ++ struct btrfs_caching_control *caching_ctl; + struct task_struct *tsk; + int ret = 0; + ++ smp_mb(); ++ if (cache->cached != BTRFS_CACHE_NO) ++ return 0; ++ ++ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); ++ BUG_ON(!caching_ctl); ++ ++ INIT_LIST_HEAD(&caching_ctl->list); ++ mutex_init(&caching_ctl->mutex); ++ init_waitqueue_head(&caching_ctl->wait); ++ caching_ctl->block_group = cache; ++ caching_ctl->progress = cache->key.objectid; ++ /* one for caching kthread, one for caching block group list */ ++ atomic_set(&caching_ctl->count, 2); ++ + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); +- return ret; ++ kfree(caching_ctl); ++ return 0; + } ++ cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + ++ down_write(&fs_info->extent_commit_sem); ++ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); ++ up_write(&fs_info->extent_commit_sem); ++ ++ atomic_inc(&cache->space_info->caching_threads); ++ + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { +@@ -1507,22 +1568,22 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, + return ret; + } + +-#ifdef BIO_RW_DISCARD + static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) + { + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + } +-#endif + + static int btrfs_discard_extent(struct 
btrfs_root *root, u64 bytenr, + u64 num_bytes) + { +-#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + ++ if (!btrfs_test_opt(root, DISCARD)) ++ return 0; ++ + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); +@@ -1542,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + } + + return ret; +-#else +- return 0; +-#endif + } + + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, +@@ -1656,7 +1714,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, +@@ -1782,7 +1839,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, +@@ -1817,16 +1873,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { ++ int mark_free = 0; ++ struct extent_buffer *must_clean = NULL; ++ ++ ret = pin_down_bytes(trans, root, NULL, ++ node->bytenr, node->num_bytes, ++ head->is_data, 1, &must_clean); ++ if (ret > 0) ++ mark_free = 1; ++ ++ if (must_clean) { ++ clean_tree_block(NULL, root, must_clean); ++ btrfs_tree_unlock(must_clean); ++ free_extent_buffer(must_clean); ++ } + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } +- btrfs_update_pinned_extents(root, node->bytenr, +- node->num_bytes, 1); +- update_reserved_extents(root, node->bytenr, +- node->num_bytes, 0); ++ if (mark_free) { ++ ret = btrfs_free_reserved_extent(root, ++ node->bytenr, ++ node->num_bytes); ++ BUG_ON(ret); ++ } + } + mutex_unlock(&head->mutex); + return 0; +@@ -2691,60 +2763,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) + alloc_target); + } + ++static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) ++{ ++ u64 num_bytes; ++ int level; ++ ++ level = BTRFS_MAX_LEVEL - 2; ++ /* ++ * NOTE: these calculations are absolutely the worst possible case. ++ * This assumes that _every_ item we insert will require a new leaf, and ++ * that the tree has grown to its maximum level size. ++ */ ++ ++ /* ++ * for every item we insert we could insert both an extent item and a ++ * extent ref item. Then for ever item we insert, we will need to cow ++ * both the original leaf, plus the leaf to the left and right of it. ++ * ++ * Unless we are talking about the extent root, then we just want the ++ * number of items * 2, since we just need the extent item plus its ref. ++ */ ++ if (root == root->fs_info->extent_root) ++ num_bytes = num_items * 2; ++ else ++ num_bytes = (num_items + (2 * num_items)) * 3; ++ ++ /* ++ * num_bytes is total number of leaves we could need times the leaf ++ * size, and then for every leaf we could end up cow'ing 2 nodes per ++ * level, down to the leaf level. 
++ */ ++ num_bytes = (num_bytes * root->leafsize) + ++ (num_bytes * (level * 2)) * root->nodesize; ++ ++ return num_bytes; ++} ++ + /* +- * for now this just makes sure we have at least 5% of our metadata space free +- * for use. ++ * Unreserve metadata space for delalloc. If we have less reserved credits than ++ * we have extents, this function does nothing. + */ +-int btrfs_check_metadata_free_space(struct btrfs_root *root) ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) + { + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; +- u64 alloc_target, thresh; +- int committed = 0, ret; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + +-again: ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++ + spin_lock(&meta_sinfo->lock); +- if (!meta_sinfo->full) +- thresh = meta_sinfo->total_bytes * 80; +- else +- thresh = meta_sinfo->total_bytes * 95; ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ if (BTRFS_I(inode)->reserved_extents <= ++ BTRFS_I(inode)->outstanding_extents) { ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ spin_unlock(&meta_sinfo->lock); ++ return 0; ++ } ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ BTRFS_I(inode)->reserved_extents--; ++ BUG_ON(BTRFS_I(inode)->reserved_extents < 0); ++ ++ if (meta_sinfo->bytes_delalloc < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_delalloc = 0; ++ } else { ++ meta_sinfo->bytes_delalloc -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) ++{ ++ u64 thresh; ++ ++ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use; + ++ thresh = meta_sinfo->total_bytes - thresh; ++ thresh *= 80; + do_div(thresh, 100); ++ if (thresh <= meta_sinfo->bytes_delalloc) ++ meta_sinfo->force_delalloc = 1; ++ else ++ meta_sinfo->force_delalloc = 0; ++} + +- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + +- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { +- struct btrfs_trans_handle *trans; +- if (!meta_sinfo->full) { +- meta_sinfo->force_alloc = 1; +- spin_unlock(&meta_sinfo->lock); ++struct async_flush { ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; ++ struct btrfs_work work; ++}; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) +- return -ENOMEM; ++static noinline void flush_delalloc_async(struct btrfs_work *work) ++{ ++ struct async_flush *async; ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; + +- ret = do_chunk_alloc(trans, root->fs_info->extent_root, +- 2 * 1024 * 1024, alloc_target, 0); +- btrfs_end_transaction(trans, root); ++ async = container_of(work, struct async_flush, work); ++ root = async->root; ++ info = async->info; ++ ++ btrfs_start_delalloc_inodes(root); ++ wake_up(&info->flush_wait); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++ ++ kfree(async); ++} ++ ++static void wait_on_flush(struct btrfs_space_info *info) ++{ ++ DEFINE_WAIT(wait); ++ u64 used; ++ ++ while (1) { ++ 
prepare_to_wait(&info->flush_wait, &wait, ++ TASK_UNINTERRUPTIBLE); ++ spin_lock(&info->lock); ++ if (!info->flushing) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ ++ used = info->bytes_used + info->bytes_reserved + ++ info->bytes_pinned + info->bytes_readonly + ++ info->bytes_super + info->bytes_root + ++ info->bytes_may_use + info->bytes_delalloc; ++ if (used < info->total_bytes) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ spin_unlock(&info->lock); ++ schedule(); ++ } ++ finish_wait(&info->flush_wait, &wait); ++} ++ ++static void flush_delalloc(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct async_flush *async; ++ bool wait = false; ++ ++ spin_lock(&info->lock); ++ ++ if (!info->flushing) { ++ info->flushing = 1; ++ init_waitqueue_head(&info->flush_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_on_flush(info); ++ return; ++ } ++ ++ async = kzalloc(sizeof(*async), GFP_NOFS); ++ if (!async) ++ goto flush; ++ ++ async->root = root; ++ async->info = info; ++ async->work.func = flush_delalloc_async; ++ ++ btrfs_queue_worker(&root->fs_info->enospc_workers, ++ &async->work); ++ wait_on_flush(info); ++ return; ++ ++flush: ++ btrfs_start_delalloc_inodes(root); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++} ++ ++static int maybe_allocate_chunk(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct btrfs_super_block *disk_super = &root->fs_info->super_copy; ++ struct btrfs_trans_handle *trans; ++ bool wait = false; ++ int ret = 0; ++ u64 min_metadata; ++ u64 free_space; ++ ++ free_space = btrfs_super_total_bytes(disk_super); ++ /* ++ * we allow the metadata to grow to a max of either 5gb or 5% of the ++ * space in the volume. ++ */ ++ min_metadata = min((u64)5 * 1024 * 1024 * 1024, ++ div64_u64(free_space * 5, 100)); ++ if (info->total_bytes >= min_metadata) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (info->full) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (!info->allocating_chunk) { ++ info->force_alloc = 1; ++ info->allocating_chunk = 1; ++ init_waitqueue_head(&info->allocate_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_event(info->allocate_wait, ++ !info->allocating_chunk); ++ return 1; ++ } ++ ++ trans = btrfs_start_transaction(root, 1); ++ if (!trans) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = do_chunk_alloc(trans, root->fs_info->extent_root, ++ 4096 + 2 * 1024 * 1024, ++ info->flags, 0); ++ btrfs_end_transaction(trans, root); ++ if (ret) ++ goto out; ++out: ++ spin_lock(&info->lock); ++ info->allocating_chunk = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->allocate_wait); ++ ++ if (ret) ++ return 0; ++ return 1; ++} ++ ++/* ++ * Reserve metadata space for delalloc. 
++ */ ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int flushed = 0; ++ int force_delalloc; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ force_delalloc = meta_sinfo->force_delalloc; ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!flushed) ++ meta_sinfo->bytes_delalloc += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ flushed++; ++ ++ if (flushed == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ flushed++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (flushed == 2) { ++ filemap_flush(inode->i_mapping); ++ goto again; ++ } else if (flushed == 3) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_delalloc -= num_bytes; + spin_unlock(&meta_sinfo->lock); ++ printk(KERN_ERR "enospc, has %d, reserved %d\n", ++ BTRFS_I(inode)->outstanding_extents, ++ BTRFS_I(inode)->reserved_extents); ++ dump_space_info(meta_sinfo, 0, 0); ++ return -ENOSPC; ++ } + +- if (!committed) { +- committed = 1; +- trans = btrfs_join_transaction(root, 1); +- if (!trans) +- return -ENOMEM; +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- return ret; ++ BTRFS_I(inode)->reserved_extents++; ++ check_force_delalloc(meta_sinfo); ++ spin_unlock(&meta_sinfo->lock); ++ ++ if (!flushed && force_delalloc) ++ filemap_flush(inode->i_mapping); ++ ++ return 0; ++} ++ ++/* ++ * unreserve num_items number of items worth of metadata space. This needs to ++ * be paired with btrfs_reserve_metadata_space. ++ * ++ * NOTE: if you have the option, run this _AFTER_ you do a ++ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref ++ * oprations which will result in more used metadata, so we want to make sure we ++ * can do that without issue. ++ */ ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++ ++ spin_lock(&meta_sinfo->lock); ++ if (meta_sinfo->bytes_may_use < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_may_use = 0; ++ } else { ++ meta_sinfo->bytes_may_use -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++/* ++ * Reserve some metadata space for use. We'll calculate the worste case number ++ * of bytes that would be needed to modify num_items number of items. If we ++ * have space, fantastic, if not, you get -ENOSPC. 
Please call ++ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of ++ * items you reserved, since whatever metadata you needed should have already ++ * been allocated. ++ * ++ * This will commit the transaction to make more space if we don't have enough ++ * metadata space. THe only time we don't do this is if we're reserving space ++ * inside of a transaction, then we will just return -ENOSPC and it is the ++ * callers responsibility to handle it properly. ++ */ ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int retries = 0; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!retries) ++ meta_sinfo->bytes_may_use += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ retries++; ++ if (retries == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ retries++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (retries == 2) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_may_use -= num_bytes; ++ spin_unlock(&meta_sinfo->lock); ++ ++ dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } ++ ++ check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + return 0; +@@ -2764,13 +3224,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; ++ if (!data_sinfo) ++ goto alloc; ++ + again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - +- data_sinfo->bytes_may_use < bytes) { ++ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { + struct btrfs_trans_handle *trans; + + /* +@@ -2782,7 +3245,7 @@ again: + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); +- ++alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) +@@ -2794,12 +3257,17 @@ again: + btrfs_end_transaction(trans, root); + if (ret) + return ret; ++ ++ if (!data_sinfo) { ++ btrfs_set_inode_space_info(root, inode); ++ data_sinfo = BTRFS_I(inode)->space_info; ++ } + goto again; + } + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +- if (!committed) { ++ if (!committed && !root->fs_info->open_ioctl_trans) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) +@@ -2827,7 +3295,7 @@ again: + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + +- return btrfs_check_metadata_free_space(root); ++ return 0; + } + + /* +@@ -2926,17 +3394,15 @@ static 
int do_chunk_alloc(struct btrfs_trans_handle *trans, + BUG_ON(!space_info); + + spin_lock(&space_info->lock); +- if (space_info->force_alloc) { ++ if (space_info->force_alloc) + force = 1; +- space_info->force_alloc = 0; +- } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = space_info->total_bytes - space_info->bytes_readonly; +- thresh = div_factor(thresh, 6); ++ thresh = div_factor(thresh, 8); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { +@@ -2950,7 +3416,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ +- if (flags & BTRFS_BLOCK_GROUP_DATA) { ++ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) +@@ -2958,8 +3424,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + } + + ret = btrfs_alloc_chunk(trans, extent_root, flags); ++ spin_lock(&space_info->lock); + if (ret) + space_info->full = 1; ++ space_info->force_alloc = 0; ++ spin_unlock(&space_info->lock); + out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +@@ -3008,10 +3477,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; ++ btrfs_set_block_group_used(&cache->item, old_val); ++ cache->reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; +- btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { +@@ -3056,127 +3527,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) + return bytenr; + } + +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin) ++/* ++ * this function must be called within transaction ++ */ ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num_bytes, int reserved) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache; + +- if (pin) +- set_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + num - 1, GFP_NOFS); +- +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); +- if (pin) { +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- cache->pinned += len; +- cache->space_info->bytes_pinned += len; +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- fs_info->total_pinned += len; +- } else { +- int unpin = 0; ++ cache = btrfs_lookup_block_group(fs_info, bytenr); ++ BUG_ON(!cache); + +- /* +- * in order to not race with the block group caching, we +- * only want to unpin the extent if we are cached. If +- * we aren't cached, we want to start async caching this +- * block group so we can free the extent the next time +- * around. 
+- */ +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- unpin = (cache->cached == BTRFS_CACHE_FINISHED); +- if (likely(unpin)) { +- cache->pinned -= len; +- cache->space_info->bytes_pinned -= len; +- fs_info->total_pinned -= len; +- } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned += num_bytes; ++ cache->space_info->bytes_pinned += num_bytes; ++ if (reserved) { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; ++ } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + +- if (likely(unpin)) +- clear_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + len -1, +- GFP_NOFS); +- else +- cache_block_group(cache); ++ btrfs_put_block_group(cache); + +- if (unpin) +- btrfs_add_free_space(cache, bytenr, len); +- } +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; ++ set_extent_dirty(fs_info->pinned_extents, ++ bytenr, bytenr + num_bytes - 1, GFP_NOFS); ++ return 0; ++} ++ ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve) ++{ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ if (reserve) { ++ cache->reserved += num_bytes; ++ cache->space_info->bytes_reserved += num_bytes; ++ } else { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + return 0; + } + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve) ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_caching_control *next; ++ struct btrfs_caching_control *caching_ctl; ++ struct btrfs_block_group_cache *cache; + +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); ++ down_write(&fs_info->extent_commit_sem); + +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- if (reserve) { +- cache->reserved += len; +- cache->space_info->bytes_reserved += len; ++ list_for_each_entry_safe(caching_ctl, next, ++ &fs_info->caching_block_groups, list) { ++ cache = caching_ctl->block_group; ++ if (block_group_cache_done(cache)) { ++ cache->last_byte_to_unpin = (u64)-1; ++ list_del_init(&caching_ctl->list); ++ put_caching_control(caching_ctl); + } else { +- cache->reserved -= len; +- cache->space_info->bytes_reserved -= len; ++ cache->last_byte_to_unpin = caching_ctl->progress; + } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; + } ++ ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ fs_info->pinned_extents = &fs_info->freed_extents[1]; ++ else ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; ++ ++ up_write(&fs_info->extent_commit_sem); + return 0; + } + +-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) ++static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) + { +- u64 last = 0; +- u64 start; +- u64 end; +- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; +- int ret; ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache = 
NULL; ++ u64 len; + +- while (1) { +- ret = find_first_extent_bit(pinned_extents, last, +- &start, &end, EXTENT_DIRTY); +- if (ret) +- break; ++ while (start <= end) { ++ if (!cache || ++ start >= cache->key.objectid + cache->key.offset) { ++ if (cache) ++ btrfs_put_block_group(cache); ++ cache = btrfs_lookup_block_group(fs_info, start); ++ BUG_ON(!cache); ++ } + +- set_extent_dirty(copy, start, end, GFP_NOFS); +- last = end + 1; ++ len = cache->key.objectid + cache->key.offset - start; ++ len = min(len, end + 1 - start); ++ ++ if (start < cache->last_byte_to_unpin) { ++ len = min(len, cache->last_byte_to_unpin - start); ++ btrfs_add_free_space(cache, start, len); ++ } ++ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned -= len; ++ cache->space_info->bytes_pinned -= len; ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); ++ ++ start += len; + } ++ ++ if (cache) ++ btrfs_put_block_group(cache); + return 0; + } + + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin) ++ struct btrfs_root *root) + { ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ unpin = &fs_info->freed_extents[1]; ++ else ++ unpin = &fs_info->freed_extents[0]; ++ + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); +@@ -3185,10 +3665,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + + ret = btrfs_discard_extent(root, start, end + 1 - start); + +- /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); +- ++ unpin_extent_range(root, start, end); + cond_resched(); + } + +@@ -3198,7 +3676,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- u64 bytenr, u64 num_bytes, int is_data, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, + struct extent_buffer **must_clean) + { + int err = 0; +@@ -3207,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + if (is_data) + goto pinit; + ++ /* ++ * discard is sloooow, and so triggering discards on ++ * individual btree blocks isn't a good plan. Just ++ * pin everything in discard mode. 
++ */ ++ if (btrfs_test_opt(root, DISCARD)) ++ goto pinit; ++ + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; +@@ -3230,15 +3717,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + } + free_extent_buffer(buf); + pinit: +- btrfs_set_path_blocking(path); ++ if (path) ++ btrfs_set_path_blocking(path); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); ++ btrfs_pin_extent(root, bytenr, num_bytes, reserved); + + BUG_ON(err < 0); + return 0; + } + +- + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -3412,7 +3899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + } + + ret = pin_down_bytes(trans, root, path, bytenr, +- num_bytes, is_data, &must_clean); ++ num_bytes, is_data, 0, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); +@@ -3543,8 +4030,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); +- update_reserved_extents(root, bytenr, num_bytes, 0); ++ btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, +@@ -3584,19 +4070,33 @@ static noinline int + wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) + { ++ struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + +- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); +- +- if (block_group_cache_done(cache)) { +- finish_wait(&cache->caching_q, &wait); ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) + return 0; +- } +- schedule(); +- finish_wait(&cache->caching_q, &wait); + +- wait_event(cache->caching_q, block_group_cache_done(cache) || ++ wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); ++ ++ put_caching_control(caching_ctl); ++ return 0; ++} ++ ++static noinline int ++wait_block_group_cache_done(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *caching_ctl; ++ DEFINE_WAIT(wait); ++ ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) ++ return 0; ++ ++ wait_event(caching_ctl->wait, block_group_cache_done(cache)); ++ ++ put_caching_control(caching_ctl); + return 0; + } + +@@ -3634,6 +4134,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, + int last_ptr_loop = 0; + int loop = 0; + bool found_uncached_bg = false; ++ bool failed_cluster_refill = false; ++ bool failed_alloc = false; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); +@@ -3731,7 +4233,16 @@ have_block_group: + if (unlikely(block_group->ro)) + goto loop; + +- if (last_ptr) { ++ /* ++ * Ok we want to try and use the cluster allocator, so lets look ++ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will ++ * have tried the cluster allocator plenty of times at this ++ * point and not have found anything, so we are likely way too ++ * fragmented for the clustering stuff to find anything, so lets ++ * just skip it and let the allocator find whatever block it can ++ * find ++ */ ++ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + /* + * the refill lock keeps out other + * people trying to start a new cluster +@@ -3806,9 +4317,11 @@ 
refill_cluster: + spin_unlock(&last_ptr->refill_lock); + goto checks; + } +- } else if (!cached && loop > LOOP_CACHING_NOWAIT) { ++ } else if (!cached && loop > LOOP_CACHING_NOWAIT ++ && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + ++ failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; +@@ -3820,25 +4333,30 @@ refill_cluster: + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ +- if (loop < LOOP_NO_EMPTY_SIZE) { +- btrfs_return_cluster_to_free_space(NULL, +- last_ptr); +- spin_unlock(&last_ptr->refill_lock); +- goto loop; +- } ++ btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); ++ goto loop; + } + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); +- if (!offset && (cached || (!cached && +- loop == LOOP_CACHING_NOWAIT))) { +- goto loop; +- } else if (!offset && (!cached && +- loop > LOOP_CACHING_NOWAIT)) { ++ /* ++ * If we didn't find a chunk, and we haven't failed on this ++ * block group before, and this block group is in the middle of ++ * caching and we are ok with waiting, then go ahead and wait ++ * for progress to be made, and set failed_alloc to true. ++ * ++ * If failed_alloc is true then we've already waited on this ++ * block group once and should move on to the next block group. ++ */ ++ if (!offset && !failed_alloc && !cached && ++ loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, +- num_bytes + empty_size); ++ num_bytes + empty_size); ++ failed_alloc = true; + goto have_block_group; ++ } else if (!offset) { ++ goto loop; + } + checks: + search_start = stripe_align(root, offset); +@@ -3880,9 +4398,13 @@ checks: + search_start - offset); + BUG_ON(offset > search_start); + ++ update_reserved_extents(block_group, num_bytes, 1); ++ + /* we are all good, lets return */ + break; + loop: ++ failed_cluster_refill = false; ++ failed_alloc = false; + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); +@@ -3940,21 +4462,32 @@ loop: + return ret; + } + +-static void dump_space_info(struct btrfs_space_info *info, u64 bytes) ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups) + { + struct btrfs_block_group_cache *cache; + ++ spin_lock(&info->lock); + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - +- info->bytes_pinned - info->bytes_reserved), ++ info->bytes_pinned - info->bytes_reserved - ++ info->bytes_super), + (info->full) ? 
"" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," +- " may_use=%llu, used=%llu\n", ++ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" ++ "\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, +- (unsigned long long)info->bytes_used); ++ (unsigned long long)info->bytes_used, ++ (unsigned long long)info->bytes_root, ++ (unsigned long long)info->bytes_super, ++ (unsigned long long)info->bytes_reserved); ++ spin_unlock(&info->lock); ++ ++ if (!dump_block_groups) ++ return; + + down_read(&info->groups_sem); + list_for_each_entry(cache, &info->block_groups, list) { +@@ -3972,12 +4505,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) + up_read(&info->groups_sem); + } + +-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) ++int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ u64 num_bytes, u64 min_alloc_size, ++ u64 empty_size, u64 hint_byte, ++ u64 search_end, struct btrfs_key *ins, ++ u64 data) + { + int ret; + u64 search_start = 0; +@@ -4022,7 +4555,7 @@ again: + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); +- dump_space_info(sinfo, num_bytes); ++ dump_space_info(sinfo, num_bytes, 1); + } + + return ret; +@@ -4043,25 +4576,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); ++ update_reserved_extents(cache, len, 0); + btrfs_put_block_group(cache); +- update_reserved_extents(root, start, len, 0); +- +- return ret; +-} +- +-int btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) +-{ +- int ret; +- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, +- empty_size, hint_byte, search_end, ins, +- data); +- if (!ret) +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + + return ret; + } +@@ -4222,15 +4738,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + { + int ret; + struct btrfs_block_group_cache *block_group; ++ struct btrfs_caching_control *caching_ctl; ++ u64 start = ins->objectid; ++ u64 num_bytes = ins->offset; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(block_group); +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ caching_ctl = get_caching_control(block_group); + +- ret = btrfs_remove_free_space(block_group, ins->objectid, +- ins->offset); +- BUG_ON(ret); ++ if (!caching_ctl) { ++ BUG_ON(!block_group_cache_done(block_group)); ++ ret = btrfs_remove_free_space(block_group, start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ mutex_lock(&caching_ctl->mutex); ++ ++ if (start >= caching_ctl->progress) { ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } else if (start + num_bytes <= caching_ctl->progress) { ++ ret = btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ num_bytes = caching_ctl->progress - start; ++ ret = 
btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ ++ start = caching_ctl->progress; ++ num_bytes = ins->objectid + ins->offset - ++ caching_ctl->progress; ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } ++ ++ mutex_unlock(&caching_ctl->mutex); ++ put_caching_control(caching_ctl); ++ } ++ ++ update_reserved_extents(block_group, ins->offset, 1); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); +@@ -4254,9 +4801,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + int ret; + u64 flags = 0; + +- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, +- empty_size, hint_byte, search_end, +- ins, 0); ++ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, ++ empty_size, hint_byte, search_end, ++ ins, 0); + if (ret) + return ret; + +@@ -4267,7 +4814,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + } else + BUG_ON(parent > 0); + +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); +@@ -4346,452 +4892,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + return buf; + } + +-#if 0 +-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, struct extent_buffer *leaf) +-{ +- u64 disk_bytenr; +- u64 num_bytes; +- struct btrfs_key key; +- struct btrfs_file_extent_item *fi; +- u32 nritems; +- int i; +- int ret; +- +- BUG_ON(!btrfs_is_leaf(leaf)); +- nritems = btrfs_header_nritems(leaf); +- +- for (i = 0; i < nritems; i++) { +- cond_resched(); +- btrfs_item_key_to_cpu(leaf, &key, i); +- +- /* only extents have references, skip everything else */ +- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) +- continue; +- +- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); +- +- /* inline extents live in the btree, they don't have refs */ +- if (btrfs_file_extent_type(leaf, fi) == +- BTRFS_FILE_EXTENT_INLINE) +- continue; +- +- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); +- +- /* holes don't have refs */ +- if (disk_bytenr == 0) +- continue; +- +- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); +- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, +- leaf->start, 0, key.objectid, 0); +- BUG_ON(ret); +- } +- return 0; +-} +- +-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_leaf_ref *ref) +-{ +- int i; +- int ret; +- struct btrfs_extent_info *info; +- struct refsort *sorted; +- +- if (ref->nritems == 0) +- return 0; +- +- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); +- for (i = 0; i < ref->nritems; i++) { +- sorted[i].bytenr = ref->extents[i].bytenr; +- sorted[i].slot = i; +- } +- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); +- +- /* +- * the items in the ref were sorted when the ref was inserted +- * into the ref cache, so this is already in order +- */ +- for (i = 0; i < ref->nritems; i++) { +- info = ref->extents + sorted[i].slot; +- ret = btrfs_free_extent(trans, root, info->bytenr, +- info->num_bytes, ref->bytenr, +- ref->owner, ref->generation, +- info->objectid, 0); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- BUG_ON(ret); +- info++; +- } +- +- kfree(sorted); +- return 0; 
+-} +- +- +-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, u64 start, +- u64 len, u32 *refs) +-{ +- int ret; +- +- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); +- BUG_ON(ret); +- +-#if 0 /* some debugging code in case we see problems here */ +- /* if the refs count is one, it won't get increased again. But +- * if the ref count is > 1, someone may be decreasing it at +- * the same time we are. +- */ +- if (*refs != 1) { +- struct extent_buffer *eb = NULL; +- eb = btrfs_find_create_tree_block(root, start, len); +- if (eb) +- btrfs_tree_lock(eb); +- +- mutex_lock(&root->fs_info->alloc_mutex); +- ret = lookup_extent_ref(NULL, root, start, len, refs); +- BUG_ON(ret); +- mutex_unlock(&root->fs_info->alloc_mutex); +- +- if (eb) { +- btrfs_tree_unlock(eb); +- free_extent_buffer(eb); +- } +- if (*refs == 1) { +- printk(KERN_ERR "btrfs block %llu went down to one " +- "during drop_snap\n", (unsigned long long)start); +- } +- +- } +-#endif +- +- cond_resched(); +- return ret; +-} ++struct walk_control { ++ u64 refs[BTRFS_MAX_LEVEL]; ++ u64 flags[BTRFS_MAX_LEVEL]; ++ struct btrfs_key update_progress; ++ int stage; ++ int level; ++ int shared_level; ++ int update_ref; ++ int keep_locks; ++ int reada_slot; ++ int reada_count; ++}; + ++#define DROP_REFERENCE 1 ++#define UPDATE_BACKREF 2 + +-/* +- * this is used while deleting old snapshots, and it drops the refs +- * on a whole subtree starting from a level 1 node. +- * +- * The idea is to sort all the leaf pointers, and then drop the +- * ref on all the leaves in order. Most of the time the leaves +- * will have ref cache entries, so no leaf IOs will be required to +- * find the extents they have references on. +- * +- * For each leaf, any references it has are also dropped in order +- * +- * This ends up dropping the references in something close to optimal +- * order for reading and modifying the extent allocation tree. 
+- */ +-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path) ++static noinline void reada_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct walk_control *wc, ++ struct btrfs_path *path) + { + u64 bytenr; +- u64 root_owner; +- u64 root_gen; +- struct extent_buffer *eb = path->nodes[1]; +- struct extent_buffer *leaf; +- struct btrfs_leaf_ref *ref; +- struct refsort *sorted = NULL; +- int nritems = btrfs_header_nritems(eb); ++ u64 generation; ++ u64 refs; ++ u64 flags; ++ u64 last = 0; ++ u32 nritems; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *eb; + int ret; +- int i; +- int refi = 0; +- int slot = path->slots[1]; +- u32 blocksize = btrfs_level_size(root, 0); +- u32 refs; +- +- if (nritems == 0) +- goto out; +- +- root_owner = btrfs_header_owner(eb); +- root_gen = btrfs_header_generation(eb); +- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); ++ int slot; ++ int nread = 0; + +- /* +- * step one, sort all the leaf pointers so we don't scribble +- * randomly into the extent allocation tree +- */ +- for (i = slot; i < nritems; i++) { +- sorted[refi].bytenr = btrfs_node_blockptr(eb, i); +- sorted[refi].slot = i; +- refi++; ++ if (path->slots[wc->level] < wc->reada_slot) { ++ wc->reada_count = wc->reada_count * 2 / 3; ++ wc->reada_count = max(wc->reada_count, 2); ++ } else { ++ wc->reada_count = wc->reada_count * 3 / 2; ++ wc->reada_count = min_t(int, wc->reada_count, ++ BTRFS_NODEPTRS_PER_BLOCK(root)); + } + +- /* +- * nritems won't be zero, but if we're picking up drop_snapshot +- * after a crash, slot might be > 0, so double check things +- * just in case. +- */ +- if (refi == 0) +- goto out; ++ eb = path->nodes[wc->level]; ++ nritems = btrfs_header_nritems(eb); ++ blocksize = btrfs_level_size(root, wc->level - 1); + +- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); ++ for (slot = path->slots[wc->level]; slot < nritems; slot++) { ++ if (nread >= wc->reada_count) ++ break; + +- /* +- * the first loop frees everything the leaves point to +- */ +- for (i = 0; i < refi; i++) { +- u64 ptr_gen; ++ cond_resched(); ++ bytenr = btrfs_node_blockptr(eb, slot); ++ generation = btrfs_node_ptr_generation(eb, slot); + +- bytenr = sorted[i].bytenr; ++ if (slot == path->slots[wc->level]) ++ goto reada; + +- /* +- * check the reference count on this leaf. If it is > 1 +- * we just decrement it below and don't update any +- * of the refs the leaf points to. +- */ +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- if (refs != 1) ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) + continue; + +- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); +- +- /* +- * the leaf only had one reference, which means the +- * only thing pointing to this leaf is the snapshot +- * we're deleting. It isn't possible for the reference +- * count to increase again later +- * +- * The reference cache is checked for the leaf, +- * and if found we'll be able to drop any refs held by +- * the leaf without needing to read it in. 
+- */ +- ref = btrfs_lookup_leaf_ref(root, bytenr); +- if (ref && ref->generation != ptr_gen) { +- btrfs_free_leaf_ref(root, ref); +- ref = NULL; +- } +- if (ref) { +- ret = cache_drop_leaf_ref(trans, root, ref); +- BUG_ON(ret); +- btrfs_remove_leaf_ref(root, ref); +- btrfs_free_leaf_ref(root, ref); +- } else { +- /* +- * the leaf wasn't in the reference cache, so +- * we have to read it. +- */ +- leaf = read_tree_block(root, bytenr, blocksize, +- ptr_gen); +- ret = btrfs_drop_leaf_ref(trans, root, leaf); +- BUG_ON(ret); +- free_extent_buffer(leaf); +- } +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +- +- /* +- * run through the loop again to free the refs on the leaves. +- * This is faster than doing it in the loop above because +- * the leaves are likely to be clustered together. We end up +- * working in nice chunks on the extent allocation tree. +- */ +- for (i = 0; i < refi; i++) { +- bytenr = sorted[i].bytenr; +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, eb->start, +- root_owner, root_gen, 0, 1); ++ /* We don't lock the tree block, it's OK to be racy here */ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &refs, &flags); + BUG_ON(ret); ++ BUG_ON(refs == 0); + +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +-out: +- kfree(sorted); +- +- /* +- * update the path to show we've processed the entire level 1 +- * node. This will get saved into the root's drop_snapshot_progress +- * field so these drops are not repeated again if this transaction +- * commits. +- */ +- path->slots[1] = nritems; +- return 0; +-} +- +-/* +- * helper function for drop_snapshot, this walks down the tree dropping ref +- * counts as it goes. +- */ +-static noinline int walk_down_tree(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path, int *level) +-{ +- u64 root_owner; +- u64 root_gen; +- u64 bytenr; +- u64 ptr_gen; +- struct extent_buffer *next; +- struct extent_buffer *cur; +- struct extent_buffer *parent; +- u32 blocksize; +- int ret; +- u32 refs; +- +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, +- path->nodes[*level]->len, &refs); +- BUG_ON(ret); +- if (refs > 1) +- goto out; +- +- /* +- * walk down to the last node level and free all the leaves +- */ +- while (*level >= 0) { +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- cur = path->nodes[*level]; +- +- if (btrfs_header_level(cur) != *level) +- WARN_ON(1); +- +- if (path->slots[*level] >= +- btrfs_header_nritems(cur)) +- break; ++ if (wc->stage == DROP_REFERENCE) { ++ if (refs == 1) ++ goto reada; + +- /* the new code goes down to level 1 and does all the +- * leaves pointed to that node in bulk. So, this check +- * for level 0 will always be false. +- * +- * But, the disk format allows the drop_snapshot_progress +- * field in the root to leave things in a state where +- * a leaf will need cleaning up here. If someone crashes +- * with the old code and then boots with the new code, +- * we might find a leaf here. 
+- */ +- if (*level == 0) { +- ret = btrfs_drop_leaf_ref(trans, root, cur); +- BUG_ON(ret); +- break; ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ continue; ++ btrfs_node_key_to_cpu(eb, &key, slot); ++ ret = btrfs_comp_cpu_keys(&key, ++ &wc->update_progress); ++ if (ret < 0) ++ continue; ++ } else { ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; + } +- +- /* +- * once we get to level one, process the whole node +- * at once, including everything below it. +- */ +- if (*level == 1) { +- ret = drop_level_one_refs(trans, root, path); +- BUG_ON(ret); ++reada: ++ ret = readahead_tree_block(root, bytenr, blocksize, ++ generation); ++ if (ret) + break; +- } +- +- bytenr = btrfs_node_blockptr(cur, path->slots[*level]); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); +- blocksize = btrfs_level_size(root, *level - 1); +- +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- +- /* +- * if there is more than one reference, we don't need +- * to read that node to drop any references it has. We +- * just drop the ref we hold on that node and move on to the +- * next slot in this level. +- */ +- if (refs != 1) { +- parent = path->nodes[*level]; +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- path->slots[*level]++; +- +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, parent->start, +- root_owner, root_gen, +- *level - 1, 1); +- BUG_ON(ret); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- continue; +- } +- +- /* +- * we need to keep freeing things in the next level down. +- * read the block and loop around to process it +- */ +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- WARN_ON(*level <= 0); +- if (path->nodes[*level-1]) +- free_extent_buffer(path->nodes[*level-1]); +- path->nodes[*level-1] = next; +- *level = btrfs_header_level(next); +- path->slots[*level] = 0; +- cond_resched(); ++ last = bytenr + blocksize; ++ nread++; + } +-out: +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- +- if (path->nodes[*level] == root->node) { +- parent = path->nodes[*level]; +- bytenr = path->nodes[*level]->start; +- } else { +- parent = path->nodes[*level + 1]; +- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); +- } +- +- blocksize = btrfs_level_size(root, *level); +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- +- /* +- * cleanup and free the reference on the last node +- * we processed +- */ +- ret = btrfs_free_extent(trans, root, bytenr, blocksize, +- parent->start, root_owner, root_gen, +- *level, 1); +- free_extent_buffer(path->nodes[*level]); +- path->nodes[*level] = NULL; +- +- *level += 1; +- BUG_ON(ret); +- +- cond_resched(); +- return 0; ++ wc->reada_slot = slot; + } +-#endif +- +-struct walk_control { +- u64 refs[BTRFS_MAX_LEVEL]; +- u64 flags[BTRFS_MAX_LEVEL]; +- struct btrfs_key update_progress; +- int stage; +- int level; +- int shared_level; +- int update_ref; +- int keep_locks; +-}; +- +-#define DROP_REFERENCE 1 +-#define UPDATE_BACKREF 2 + + /* + * hepler to process tree block while walking down the tree. + * +- * when wc->stage == DROP_REFERENCE, this function checks +- * reference count of the block. 
if the block is shared and +- * we need update back refs for the subtree rooted at the +- * block, this function changes wc->stage to UPDATE_BACKREF +- * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * +@@ -4800,11 +5002,10 @@ struct walk_control { + static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- struct walk_control *wc) ++ struct walk_control *wc, int lookup_info) + { + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; +- struct btrfs_key key; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + +@@ -4816,8 +5017,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ +- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || +- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { ++ if (lookup_info && ++ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || ++ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, +@@ -4827,21 +5029,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + BUG_ON(wc->refs[level] == 0); + } + +- if (wc->stage == DROP_REFERENCE && +- wc->update_ref && wc->refs[level] > 1) { +- BUG_ON(eb == root->node); +- BUG_ON(path->slots[level] > 0); +- if (level == 0) +- btrfs_item_key_to_cpu(eb, &key, path->slots[level]); +- else +- btrfs_node_key_to_cpu(eb, &key, path->slots[level]); +- if (btrfs_header_owner(eb) == root->root_key.objectid && +- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { +- wc->stage = UPDATE_BACKREF; +- wc->shared_level = level; +- } +- } +- + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; +@@ -4878,6 +5065,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + } + + /* ++ * hepler to process tree block pointer. ++ * ++ * when wc->stage == DROP_REFERENCE, this function checks ++ * reference count of the block pointed to. if the block ++ * is shared and we need update back refs for the subtree ++ * rooted at the block, this function changes wc->stage to ++ * UPDATE_BACKREF. if the block is shared and there is no ++ * need to update back, this function drops the reference ++ * to the block. ++ * ++ * NOTE: return value 1 means we should stop walking down. 
++ */ ++static noinline int do_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct walk_control *wc, int *lookup_info) ++{ ++ u64 bytenr; ++ u64 generation; ++ u64 parent; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *next; ++ int level = wc->level; ++ int reada = 0; ++ int ret = 0; ++ ++ generation = btrfs_node_ptr_generation(path->nodes[level], ++ path->slots[level]); ++ /* ++ * if the lower level block was created before the snapshot ++ * was created, we know there is no need to update back refs ++ * for the subtree ++ */ ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) { ++ *lookup_info = 1; ++ return 1; ++ } ++ ++ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); ++ blocksize = btrfs_level_size(root, level - 1); ++ ++ next = btrfs_find_tree_block(root, bytenr, blocksize); ++ if (!next) { ++ next = btrfs_find_create_tree_block(root, bytenr, blocksize); ++ reada = 1; ++ } ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &wc->refs[level - 1], ++ &wc->flags[level - 1]); ++ BUG_ON(ret); ++ BUG_ON(wc->refs[level - 1] == 0); ++ *lookup_info = 0; ++ ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->refs[level - 1] > 1) { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ goto skip; ++ ++ btrfs_node_key_to_cpu(path->nodes[level], &key, ++ path->slots[level]); ++ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); ++ if (ret < 0) ++ goto skip; ++ ++ wc->stage = UPDATE_BACKREF; ++ wc->shared_level = level - 1; ++ } ++ } else { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ } ++ ++ if (!btrfs_buffer_uptodate(next, generation)) { ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ next = NULL; ++ *lookup_info = 1; ++ } ++ ++ if (!next) { ++ if (reada && level == 1) ++ reada_walk_down(trans, root, wc, path); ++ next = read_tree_block(root, bytenr, blocksize, generation); ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ } ++ ++ level--; ++ BUG_ON(level != btrfs_header_level(next)); ++ path->nodes[level] = next; ++ path->slots[level] = 0; ++ path->locks[level] = 1; ++ wc->level = level; ++ if (wc->level == 1) ++ wc->reada_slot = 0; ++ return 0; ++skip: ++ wc->refs[level - 1] = 0; ++ wc->flags[level - 1] = 0; ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ++ parent = path->nodes[level]->start; ++ } else { ++ BUG_ON(root->root_key.objectid != ++ btrfs_header_owner(path->nodes[level])); ++ parent = 0; ++ } ++ ++ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, ++ root->root_key.objectid, level - 1, 0); ++ BUG_ON(ret); ++ } ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ *lookup_info = 1; ++ return 1; ++} ++ ++/* + * hepler to process tree block while walking up the tree. 
+ * + * when wc->stage == DROP_REFERENCE, this function drops +@@ -4904,7 +5221,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + if (level < wc->shared_level) + goto out; + +- BUG_ON(wc->refs[level] <= 1); + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; +@@ -4935,8 +5251,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + path->locks[level] = 0; + return 1; + } +- } else { +- BUG_ON(level != 0); + } + } + +@@ -4989,39 +5303,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct walk_control *wc) + { +- struct extent_buffer *next; +- struct extent_buffer *cur; +- u64 bytenr; +- u64 ptr_gen; +- u32 blocksize; + int level = wc->level; ++ int lookup_info = 1; + int ret; + + while (level >= 0) { +- cur = path->nodes[level]; +- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); ++ if (path->slots[level] >= ++ btrfs_header_nritems(path->nodes[level])) ++ break; + +- ret = walk_down_proc(trans, root, path, wc); ++ ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + +- bytenr = btrfs_node_blockptr(cur, path->slots[level]); +- blocksize = btrfs_level_size(root, level - 1); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); +- +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- btrfs_tree_lock(next); +- btrfs_set_lock_blocking(next); +- +- level--; +- BUG_ON(level != btrfs_header_level(next)); +- path->nodes[level] = next; +- path->slots[level] = 0; +- path->locks[level] = 1; +- wc->level = level; ++ ret = do_walk_down(trans, root, path, wc, &lookup_info); ++ if (ret > 0) { ++ path->slots[level]++; ++ continue; ++ } ++ level = wc->level; + } + return 0; + } +@@ -5111,9 +5414,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + err = ret; + goto out; + } +- btrfs_node_key_to_cpu(path->nodes[level], &key, +- path->slots[level]); +- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); ++ WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this +@@ -5148,6 +5449,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + ret = walk_down_tree(trans, root, path, wc); +@@ -5200,9 +5502,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + +- free_extent_buffer(root->node); +- free_extent_buffer(root->commit_root); +- kfree(root); ++ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { ++ ret = btrfs_find_last_root(tree_root, root->root_key.objectid, ++ NULL, NULL); ++ BUG_ON(ret < 0); ++ if (ret > 0) { ++ ret = btrfs_del_orphan_item(trans, tree_root, ++ root->root_key.objectid); ++ BUG_ON(ret); ++ } ++ } ++ ++ if (root->in_radix) { ++ btrfs_free_fs_root(tree_root->fs_info, root); ++ } else { ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); ++ kfree(root); ++ } + out: + btrfs_end_transaction(trans, tree_root); + kfree(wc); +@@ -5254,6 +5571,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); +@@ -5396,9 +5714,9 @@ static noinline int 
relocate_data_extent(struct inode *reloc_inode, + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -6841,287 +7159,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + return 0; + } + +-#if 0 +-static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) +-{ +- struct btrfs_path *path; +- struct btrfs_inode_item *item; +- struct extent_buffer *leaf; +- int ret; +- +- path = btrfs_alloc_path(); +- if (!path) +- return -ENOMEM; +- +- path->leave_spinning = 1; +- ret = btrfs_insert_empty_inode(trans, root, path, objectid); +- if (ret) +- goto out; +- +- leaf = path->nodes[0]; +- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); +- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); +- btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); +- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); +- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); +- btrfs_mark_buffer_dirty(leaf); +- btrfs_release_path(root, path); +-out: +- btrfs_free_path(path); +- return ret; +-} +- +-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, +- struct btrfs_block_group_cache *group) ++/* ++ * checks to see if its even possible to relocate this block group. ++ * ++ * @return - -1 if it's not a good idea to relocate this block group, 0 if its ++ * ok to go ahead and try. ++ */ ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) + { +- struct inode *inode = NULL; +- struct btrfs_trans_handle *trans; +- struct btrfs_root *root; +- struct btrfs_key root_key; +- u64 objectid = BTRFS_FIRST_FREE_OBJECTID; +- int err = 0; ++ struct btrfs_block_group_cache *block_group; ++ struct btrfs_space_info *space_info; ++ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; ++ struct btrfs_device *device; ++ int full = 0; ++ int ret = 0; + +- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; +- root_key.type = BTRFS_ROOT_ITEM_KEY; +- root_key.offset = (u64)-1; +- root = btrfs_read_fs_root_no_name(fs_info, &root_key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + +- trans = btrfs_start_transaction(root, 1); +- BUG_ON(!trans); ++ /* odd, couldn't find the block group, leave it alone */ ++ if (!block_group) ++ return -1; + +- err = btrfs_find_free_objectid(trans, root, objectid, &objectid); +- if (err) ++ /* no bytes used, we're good */ ++ if (!btrfs_block_group_used(&block_group->item)) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); +- BUG_ON(err); +- +- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); +- if (inode->i_state & I_NEW) { +- BTRFS_I(inode)->root = root; +- BTRFS_I(inode)->location.objectid = objectid; +- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; +- BTRFS_I(inode)->location.offset = 0; +- btrfs_read_locked_inode(inode); +- unlock_new_inode(inode); +- BUG_ON(is_bad_inode(inode)); +- } else { +- BUG_ON(1); +- } +- BTRFS_I(inode)->index_cnt = group->key.objectid; +- +- err = 
btrfs_orphan_add(trans, inode); +-out: +- btrfs_end_transaction(trans, root); +- if (err) { +- if (inode) +- iput(inode); +- inode = ERR_PTR(err); +- } +- return inode; +-} +- +-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +-{ +- +- struct btrfs_ordered_sum *sums; +- struct btrfs_sector_sum *sector_sum; +- struct btrfs_ordered_extent *ordered; +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct list_head list; +- size_t offset; +- int ret; +- u64 disk_bytenr; +- +- INIT_LIST_HEAD(&list); +- +- ordered = btrfs_lookup_ordered_extent(inode, file_pos); +- BUG_ON(ordered->file_offset != file_pos || ordered->len != len); +- +- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; +- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, +- disk_bytenr + len - 1, &list); +- +- while (!list_empty(&list)) { +- sums = list_entry(list.next, struct btrfs_ordered_sum, list); +- list_del_init(&sums->list); +- +- sector_sum = sums->sums; +- sums->bytenr = ordered->start; ++ space_info = block_group->space_info; ++ spin_lock(&space_info->lock); + +- offset = 0; +- while (offset < sums->len) { +- sector_sum->bytenr += ordered->start - disk_bytenr; +- sector_sum++; +- offset += root->sectorsize; +- } ++ full = space_info->full; + +- btrfs_add_ordered_sum(inode, ordered, sums); ++ /* ++ * if this is the last block group we have in this space, we can't ++ * relocate it unless we're able to allocate a new chunk below. ++ * ++ * Otherwise, we need to make sure we have room in the space to handle ++ * all of the extents from this block group. If we can, we're good ++ */ ++ if ((space_info->total_bytes != block_group->key.offset) && ++ (space_info->bytes_used + space_info->bytes_reserved + ++ space_info->bytes_pinned + space_info->bytes_readonly + ++ btrfs_block_group_used(&block_group->item) < ++ space_info->total_bytes)) { ++ spin_unlock(&space_info->lock); ++ goto out; + } +- btrfs_put_ordered_extent(ordered); +- return 0; +-} +- +-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +-{ +- struct btrfs_trans_handle *trans; +- struct btrfs_path *path; +- struct btrfs_fs_info *info = root->fs_info; +- struct extent_buffer *leaf; +- struct inode *reloc_inode; +- struct btrfs_block_group_cache *block_group; +- struct btrfs_key key; +- u64 skipped; +- u64 cur_byte; +- u64 total_found; +- u32 nritems; +- int ret; +- int progress; +- int pass = 0; +- +- root = root->fs_info->extent_root; +- +- block_group = btrfs_lookup_block_group(info, group_start); +- BUG_ON(!block_group); +- +- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", +- (unsigned long long)block_group->key.objectid, +- (unsigned long long)block_group->flags); +- +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- +- reloc_inode = create_reloc_inode(info, block_group); +- BUG_ON(IS_ERR(reloc_inode)); +- +- __alloc_chunk_for_shrink(root, block_group, 1); +- set_block_group_readonly(block_group); +- +- btrfs_start_delalloc_inodes(info->tree_root); +- btrfs_wait_ordered_extents(info->tree_root, 0); +-again: +- skipped = 0; +- total_found = 0; +- progress = 0; +- key.objectid = block_group->key.objectid; +- key.offset = 0; +- key.type = 0; +- cur_byte = key.objectid; +- +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ spin_unlock(&space_info->lock); + +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(info->tree_root); +- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); +- 
mutex_unlock(&root->fs_info->cleaner_mutex); ++ /* ++ * ok we don't have enough space, but maybe we have free space on our ++ * devices to allocate new chunks for relocation, so loop through our ++ * alloc devices and guess if we have enough space. However, if we ++ * were marked as full, then we know there aren't enough chunks, and we ++ * can just return. ++ */ ++ ret = -1; ++ if (full) ++ goto out; + +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ mutex_lock(&root->fs_info->chunk_mutex); ++ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { ++ u64 min_free = btrfs_block_group_used(&block_group->item); ++ u64 dev_offset, max_avail; + +- while (1) { +- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) +- goto out; +-next: +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- if (path->slots[0] >= nritems) { +- ret = btrfs_next_leaf(root, path); +- if (ret < 0) +- goto out; +- if (ret == 1) { +- ret = 0; ++ /* ++ * check to make sure we can actually find a chunk with enough ++ * space to fit our block group in. ++ */ ++ if (device->total_bytes > device->bytes_used + min_free) { ++ ret = find_free_dev_extent(NULL, device, min_free, ++ &dev_offset, &max_avail); ++ if (!ret) + break; +- } +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- } +- +- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +- +- if (key.objectid >= block_group->key.objectid + +- block_group->key.offset) +- break; +- +- if (progress && need_resched()) { +- btrfs_release_path(root, path); +- cond_resched(); +- progress = 0; +- continue; +- } +- progress = 1; +- +- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || +- key.objectid + key.offset <= cur_byte) { +- path->slots[0]++; +- goto next; ++ ret = -1; + } +- +- total_found++; +- cur_byte = key.objectid + key.offset; +- btrfs_release_path(root, path); +- +- __alloc_chunk_for_shrink(root, block_group, 0); +- ret = relocate_one_extent(root, path, &key, block_group, +- reloc_inode, pass); +- BUG_ON(ret < 0); +- if (ret > 0) +- skipped++; +- +- key.objectid = cur_byte; +- key.type = 0; +- key.offset = 0; + } +- +- btrfs_release_path(root, path); +- +- if (pass == 0) { +- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); +- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); +- } +- +- if (total_found > 0) { +- printk(KERN_INFO "btrfs found %llu extents in pass %d\n", +- (unsigned long long)total_found, pass); +- pass++; +- if (total_found == skipped && pass > 2) { +- iput(reloc_inode); +- reloc_inode = create_reloc_inode(info, block_group); +- pass = 0; +- } +- goto again; +- } +- +- /* delete reloc_inode */ +- iput(reloc_inode); +- +- /* unpin extents in this range */ +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); +- +- spin_lock(&block_group->lock); +- WARN_ON(block_group->pinned > 0); +- WARN_ON(block_group->reserved > 0); +- WARN_ON(btrfs_block_group_used(&block_group->item) > 0); +- spin_unlock(&block_group->lock); +- btrfs_put_block_group(block_group); +- ret = 0; ++ mutex_unlock(&root->fs_info->chunk_mutex); + out: +- btrfs_free_path(path); ++ btrfs_put_block_group(block_group); + return ret; + } +-#endif + + static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +@@ -7164,8 +7281,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + { + struct btrfs_block_group_cache *block_group; + struct 
btrfs_space_info *space_info; ++ struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + ++ down_write(&info->extent_commit_sem); ++ while (!list_empty(&info->caching_block_groups)) { ++ caching_ctl = list_entry(info->caching_block_groups.next, ++ struct btrfs_caching_control, list); ++ list_del(&caching_ctl->list); ++ put_caching_control(caching_ctl); ++ } ++ up_write(&info->extent_commit_sem); ++ + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, +@@ -7179,8 +7306,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +@@ -7250,7 +7376,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); + cache->fs_info = info; +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7272,8 +7397,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + +- remove_sb_from_cache(root, cache); +- + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't +@@ -7282,13 +7405,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; ++ free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); ++ free_excluded_extents(root, cache); + } + + ret = update_space_info(info, cache->flags, found_key.offset, +@@ -7296,6 +7425,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) + &space_info); + BUG_ON(ret); + cache->space_info = space_info; ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups); + up_write(&space_info->groups_sem); +@@ -7345,7 +7478,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7354,15 +7486,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; +- remove_sb_from_cache(root, cache); ++ exclude_super_stripes(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ++ free_excluded_extents(root, cache); ++ + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + 
&cache->space_info); + BUG_ON(ret); ++ ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); +@@ -7428,8 +7568,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 6826018..96577e8 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + return NULL; + } + ++static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, ++ struct extent_state *other) ++{ ++ if (tree->ops && tree->ops->merge_extent_hook) ++ tree->ops->merge_extent_hook(tree->mapping->host, new, ++ other); ++} ++ + /* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single +@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); +@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); ++ state = NULL; + } + } ++ + return 0; + } + +-static void set_state_cb(struct extent_io_tree *tree, ++static int set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { + if (tree->ops && tree->ops->set_bit_hook) { +- tree->ops->set_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); ++ return tree->ops->set_bit_hook(tree->mapping->host, ++ state->start, state->end, ++ state->state, bits); + } ++ ++ return 0; + } + + static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { +- if (tree->ops && tree->ops->clear_bit_hook) { +- tree->ops->clear_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); +- } ++ if (tree->ops && tree->ops->clear_bit_hook) ++ tree->ops->clear_bit_hook(tree->mapping->host, state, bits); + } + + /* +@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, + int bits) + { + struct rb_node *node; ++ int ret; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", +@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree, + (unsigned long long)start); + WARN_ON(1); + } ++ state->start = start; ++ state->end = end; ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; +- set_state_cb(tree, state, bits); + state->state |= bits; +- state->start = start; +- state->end = end; + node = 
tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; +@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, + return 0; + } + ++static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, ++ u64 split) ++{ ++ if (tree->ops && tree->ops->split_extent_hook) ++ return tree->ops->split_extent_hook(tree->mapping->host, ++ orig, split); ++ return 0; ++} ++ + /* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an +@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) + { + struct rb_node *node; ++ ++ split_cb(tree, orig, split); ++ + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; +@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) + { +- int ret = state->state & bits; ++ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; ++ int ret = state->state & bits_to_clear; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; +@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); +- state->state &= ~bits; ++ state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { +@@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree, + * bits were already set, or zero if none of the bits were already set. + */ + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask) ++ int bits, int wake, int delete, ++ struct extent_state **cached_state, ++ gfp_t mask) + { + struct extent_state *state; ++ struct extent_state *cached; + struct extent_state *prealloc = NULL; ++ struct rb_node *next_node; + struct rb_node *node; + u64 last_end; + int err; +@@ -488,6 +522,17 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state) { ++ cached = *cached_state; ++ *cached_state = NULL; ++ cached_state = NULL; ++ if (cached && cached->tree && cached->start == start) { ++ atomic_dec(&cached->refs); ++ state = cached; ++ goto hit_next; ++ } ++ free_extent_state(cached); ++ } + /* + * this search will find the extents that end after + * our range starts +@@ -496,6 +541,7 @@ again: + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); +@@ -526,13 +572,11 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set |= clear_state_bit(tree, state, bits, +- wake, delete); ++ set |= clear_state_bit(tree, state, bits, wake, ++ delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -547,19 +591,30 @@ again: + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); +- + if (wake) + wake_up(&state->wq); +- set |= clear_state_bit(tree, prealloc, bits, +- wake, delete); ++ ++ set |= clear_state_bit(tree, prealloc, bits, wake, delete); ++ + prealloc = NULL; + goto out; + } + ++ if (state->end < end && prealloc && !need_resched()) ++ next_node = rb_next(&state->rb_node); ++ else ++ next_node = NULL; ++ + set |= 
clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; ++ if (start <= end && next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } + goto search_again; + + out: +@@ -641,40 +696,59 @@ out: + return 0; + } + +-static void set_state_bits(struct extent_io_tree *tree, ++static int set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) + { ++ int ret; ++ ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } +- set_state_cb(tree, state, bits); + state->state |= bits; ++ ++ return 0; ++} ++ ++static void cache_state(struct extent_state *state, ++ struct extent_state **cached_ptr) ++{ ++ if (cached_ptr && !(*cached_ptr)) { ++ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { ++ *cached_ptr = state; ++ atomic_inc(&state->refs); ++ } ++ } + } + + /* +- * set some bits on a range in the tree. This may require allocations +- * or sleeping, so the gfp mask is used to indicate what is allowed. ++ * set some bits on a range in the tree. This may require allocations or ++ * sleeping, so the gfp mask is used to indicate what is allowed. + * +- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the +- * range already has the desired bits set. The start of the existing +- * range is returned in failed_start in this case. ++ * If any of the exclusive bits are set, this will fail with -EEXIST if some ++ * part of the range already has the desired bits set. The start of the ++ * existing range is returned in failed_start in this case. + * +- * [start, end] is inclusive +- * This takes the tree lock. ++ * [start, end] is inclusive This takes the tree lock. + */ ++ + static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int exclusive, u64 *failed_start, ++ int bits, int exclusive_bits, u64 *failed_start, ++ struct extent_state **cached_state, + gfp_t mask) + { + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; +- int set; + u64 last_start; + u64 last_end; ++ + again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); +@@ -683,6 +757,13 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state && *cached_state) { ++ state = *cached_state; ++ if (state->start == start && state->tree) { ++ node = &state->rb_node; ++ goto hit_next; ++ } ++ } + /* + * this search will find all the extents that end after + * our range starts. 
+@@ -694,8 +775,8 @@ again: + BUG_ON(err == -EEXIST); + goto out; + } +- + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + last_start = state->start; + last_end = state->end; + +@@ -706,17 +787,32 @@ again: + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { +- set = state->state & bits; +- if (set && exclusive) { ++ struct rb_node *next_node; ++ if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } +- set_state_bits(tree, state, bits); ++ ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; ++ + start = last_end + 1; ++ if (start < end && prealloc && !need_resched()) { ++ next_node = rb_next(node); ++ if (next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } ++ } + goto search_again; + } + +@@ -737,8 +833,7 @@ again: + * desired bit on it. + */ + if (state->start < start) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -749,13 +844,14 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set_state_bits(tree, state, bits); ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -774,10 +870,13 @@ again: + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); +- prealloc = NULL; + BUG_ON(err == -EEXIST); +- if (err) ++ if (err) { ++ prealloc = NULL; + goto out; ++ } ++ cache_state(prealloc, cached_state); ++ prealloc = NULL; + start = this_end + 1; + goto search_again; + } +@@ -788,8 +887,7 @@ again: + * on the first half + */ + if (state->start <= end && state->end > end) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -797,7 +895,12 @@ again: + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + +- set_state_bits(tree, prealloc, bits); ++ err = set_state_bits(tree, prealloc, bits); ++ if (err) { ++ prealloc = NULL; ++ goto out; ++ } ++ cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; +@@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, +- mask); +-} +- +-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); ++ NULL, mask); + } + + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { + return set_extent_bit(tree, start, end, bits, 0, NULL, +- mask); ++ NULL, mask); + } + + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, bits, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + } + + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, +- EXTENT_DELALLOC | 
EXTENT_DIRTY, +- 0, NULL, mask); ++ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, ++ 0, NULL, NULL, mask); + } + + int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return clear_extent_bit(tree, start, end, +- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +-} +- +-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); ++ EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, 0, 0, ++ NULL, mask); + } + + int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, ++ NULL, mask); + } + + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +-} +- +-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, +- 0, NULL, mask); +-} +- +-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, +- u64 end, gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, ++ NULL, mask); + } + + int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +@@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. 
+ */ +-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached_state, gfp_t mask) + { + int err; + u64 failed_start; + while (1) { +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, ++ EXTENT_LOCKED, &failed_start, ++ cached_state, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; +@@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) + return err; + } + ++int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++{ ++ return lock_extent_bits(tree, start, end, 0, NULL, mask); ++} ++ + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + int err; + u64 failed_start; + +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, ++ &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, +- EXTENT_LOCKED, 1, 0, mask); ++ EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; + } + ++int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, ++ struct extent_state **cached, gfp_t mask) ++{ ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, ++ mask); ++} ++ + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, ++ mask); + } + + /* +@@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, + u64 delalloc_start; + u64 delalloc_end; + u64 found; ++ struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +@@ -1269,6 +1365,7 @@ again: + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ ++ free_extent_state(cached_state); + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; +@@ -1282,18 +1379,21 @@ again: + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ +- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ lock_extent_bits(tree, delalloc_start, delalloc_end, ++ 0, &cached_state, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, +- EXTENT_DELALLOC, 1); ++ EXTENT_DELALLOC, 1, cached_state); + if (!ret) { +- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ unlock_extent_cached(tree, delalloc_start, delalloc_end, ++ &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } ++ free_extent_state(cached_state); + 
*start = delalloc_start; + *end = delalloc_end; + out_failed: +@@ -1303,11 +1403,7 @@ out_failed: + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_pages, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback) ++ unsigned long op) + { + int ret; + struct page *pages[16]; +@@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + int i; + int clear_bits = 0; + +- if (clear_unlock) ++ if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_bits |= EXTENT_DIRTY; + +- if (clear_delalloc) ++ if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + +- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); +- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) ++ if (op & EXTENT_CLEAR_ACCOUNTING) ++ clear_bits |= EXTENT_DO_ACCOUNTING; ++ ++ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); ++ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | ++ EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { +@@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { ++ ++ if (op & EXTENT_SET_PRIVATE2) ++ SetPagePrivate2(pages[i]); ++ + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); +- if (set_writeback) ++ if (op & EXTENT_SET_WRITEBACK) + set_page_writeback(pages[i]); +- if (end_writeback) ++ if (op & EXTENT_END_WRITEBACK) + end_page_writeback(pages[i]); +- if (unlock_pages) ++ if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } +@@ -1476,14 +1581,17 @@ out: + * range is found set. 
+ */ + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled) ++ int bits, int filled, struct extent_state *cached) + { + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); +- node = tree_search(tree, start); ++ if (cached && cached->tree && cached->start == start) ++ node = &cached->rb_node; ++ else ++ node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + +@@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + bitset = 0; + break; + } ++ ++ if (state->end == (u64)-1) ++ break; ++ + start = state->end + 1; + if (start > end) + break; +@@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) ++ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); + return 0; + } +@@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) ++ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) + unlock_page(page); + return 0; + } +@@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree, + static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) + { +- u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +- u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) +- end_page_writeback(page); ++ end_page_writeback(page); + return 0; + } + +@@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) + } + + if (!uptodate) { +- clear_extent_uptodate(tree, start, end, GFP_ATOMIC); ++ clear_extent_uptodate(tree, start, end, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + +- clear_extent_writeback(tree, start, end, GFP_ATOMIC); +- + if (whole_page) + end_page_writeback(page); + else +@@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, + continue; + } + /* the get_extent function already copied into the page */ +- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { ++ if (test_range_bit(tree, cur, cur_end, ++ EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; +@@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + u64 iosize; + u64 unlock_start; + sector_t sector; ++ struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + int ret; +@@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { ++ u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. 
+@@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); ++ /* ++ * delalloc_end is already one less than the total ++ * length, so we don't subtract one from ++ * PAGE_CACHE_SIZE ++ */ ++ delalloc_to_write += (delalloc_end - delalloc_start + ++ PAGE_CACHE_SIZE) >> ++ PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } ++ if (wbc->nr_to_write < delalloc_to_write) { ++ int thresh = 8192; ++ ++ if (delalloc_to_write < thresh * 2) ++ thresh = delalloc_to_write; ++ wbc->nr_to_write = min_t(u64, delalloc_to_write, ++ thresh); ++ } + + /* did the fill delalloc function already unlock and start + * the IO? +@@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done_unlocked; + } + } +- lock_extent(tree, start, page_end, GFP_NOFS); +- +- unlock_start = start; +- + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { +- unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); + unlock_page(page); +@@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; +- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) +- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); +- + if (last_byte <= start) { +- clear_extent_dirty(tree, start, page_end, GFP_NOFS); +- unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); +@@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done; + } + +- set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { +- clear_extent_dirty(tree, cur, page_end, GFP_NOFS); +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); +@@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { +- clear_extent_dirty(tree, cur, +- cur + iosize - 1, GFP_NOFS); +- +- unlock_extent(tree, unlock_start, cur + iosize - 1, +- GFP_NOFS); +- + /* + * end_io notification does not happen here for + * compressed extents +@@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, +- EXTENT_DIRTY, 0)) { ++ EXTENT_DIRTY, 0, NULL)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + +- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); +@@ -2309,12 +2415,12 @@ done: + set_page_writeback(page); + end_page_writeback(page); + } +- if (unlock_start <= page_end) +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + + done_unlocked: + ++ /* drop our reference on any cached states 
*/ ++ free_extent_state(cached_state); + return 0; + } + +@@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) + { +- struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; ++ int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; +@@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + scanned = 1; + } + retry: +- while (!done && (index <= end) && ++ while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { +@@ -2412,12 +2518,15 @@ retry: + unlock_page(page); + ret = 0; + } +- if (ret || wbc->nr_to_write <= 0) +- done = 1; +- if (wbc->nonblocking && bdi_write_congested(bdi)) { +- wbc->encountered_congestion = 1; ++ if (ret) + done = 1; +- } ++ ++ /* ++ * the filesystem may choose to bump up nr_to_write. ++ * We have to make sure to honor the new nr_to_write ++ * at any time ++ */ ++ nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); +@@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, + return 0; + + lock_extent(tree, start, end, GFP_NOFS); +- wait_on_extent_writeback(tree, start, end); ++ wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, +- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, +- 1, 1, GFP_NOFS); ++ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, ++ 1, 1, NULL, GFP_NOFS); + return 0; + } + +@@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; +@@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, +- EXTENT_LOCKED, 0, NULL, GFP_NOFS); ++ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, +@@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map, + int ret = 1; + + if (test_range_bit(tree, start, end, +- EXTENT_IOBITS | EXTENT_ORDERED, 0)) ++ EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; +- clear_extent_bit(tree, start, end, EXTENT_UPTODATE, +- 1, 1, mask); ++ /* ++ * at this point we can safely clear everything except the ++ * locked bit and the nodatasum bit ++ */ ++ clear_extent_bit(tree, start, end, ++ ~(EXTENT_LOCKED | EXTENT_NODATASUM), ++ 0, 0, NULL, mask); + } + return ret; + } +@@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map, + u64 len; + while (start <= end) { + len = end - start + 1; +- spin_lock(&map->lock); ++ write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, +- EXTENT_LOCKED | EXTENT_WRITEBACK | +- EXTENT_ORDERED, +- 0)) { ++ 
EXTENT_LOCKED | EXTENT_WRITEBACK, ++ 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); +@@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int uptodate; + unsigned long index; + +- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); ++ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + while (start <= end) { +@@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1); ++ EXTENT_UPTODATE, 1, NULL); + if (ret) + return ret; + +@@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + +diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h +index 5bc20ab..36de250 100644 +--- a/fs/btrfs/extent_io.h ++++ b/fs/btrfs/extent_io.h +@@ -13,10 +13,9 @@ + #define EXTENT_DEFRAG (1 << 6) + #define EXTENT_DEFRAG_DONE (1 << 7) + #define EXTENT_BUFFER_FILLED (1 << 8) +-#define EXTENT_ORDERED (1 << 9) +-#define EXTENT_ORDERED_METADATA (1 << 10) +-#define EXTENT_BOUNDARY (1 << 11) +-#define EXTENT_NODATASUM (1 << 12) ++#define EXTENT_BOUNDARY (1 << 9) ++#define EXTENT_NODATASUM (1 << 10) ++#define EXTENT_DO_ACCOUNTING (1 << 11) + #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + + /* flags for bio submission */ +@@ -27,6 +26,16 @@ + #define EXTENT_BUFFER_BLOCKING 1 + #define EXTENT_BUFFER_DIRTY 2 + ++/* these are flags for extent_clear_unlock_delalloc */ ++#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 ++#define EXTENT_CLEAR_UNLOCK 0x2 ++#define EXTENT_CLEAR_DELALLOC 0x4 ++#define EXTENT_CLEAR_DIRTY 0x8 ++#define EXTENT_SET_WRITEBACK 0x10 ++#define EXTENT_END_WRITEBACK 0x20 ++#define EXTENT_SET_PRIVATE2 0x40 ++#define EXTENT_CLEAR_ACCOUNTING 0x80 ++ + /* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. 
+@@ -62,8 +71,13 @@ struct extent_io_ops { + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); +- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits); ++ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, ++ unsigned long bits); ++ int (*merge_extent_hook)(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other); ++ int (*split_extent_hook)(struct inode *inode, ++ struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page); + }; + +@@ -81,10 +95,14 @@ struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; ++ ++ /* ADD NEW ELEMENTS AFTER THIS */ + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; ++ u64 split_start; ++ u64 split_end; + + /* for use by the FS */ + u64 private; +@@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); + int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached, gfp_t mask); + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +@@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree, + u64 max_bytes, unsigned long bits); + + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled); ++ int bits, int filled, struct extent_state *cached_state); + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask); ++ int bits, int wake, int delete, struct extent_state **cached, ++ gfp_t mask); + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, +@@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_page, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback); ++ unsigned long op); + #endif +diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c +index 30c9365..2c726b7 100644 +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -36,7 +36,7 @@ void extent_map_exit(void) + void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) + { + tree->map.rb_node = NULL; +- spin_lock_init(&tree->lock); ++ rwlock_init(&tree->lock); + } + + /** +@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) + return 0; + } + ++int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) ++{ ++ int ret = 0; ++ struct extent_map *merge = NULL; ++ struct rb_node *rb; ++ struct extent_map *em; ++ ++ write_lock(&tree->lock); ++ em = lookup_extent_mapping(tree, start, len); ++ ++ WARN_ON(em->start != start || !em); ++ ++ if (!em) ++ goto out; ++ ++ clear_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ if (em->start != 0) { ++ rb = rb_prev(&em->rb_node); ++ if (rb) ++ 
merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(merge, em)) { ++ em->start = merge->start; ++ em->len += merge->len; ++ em->block_len += merge->block_len; ++ em->block_start = merge->block_start; ++ merge->in_tree = 0; ++ rb_erase(&merge->rb_node, &tree->map); ++ free_extent_map(merge); ++ } ++ } ++ ++ rb = rb_next(&em->rb_node); ++ if (rb) ++ merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(em, merge)) { ++ em->len += merge->len; ++ em->block_len += merge->len; ++ rb_erase(&merge->rb_node, &tree->map); ++ merge->in_tree = 0; ++ free_extent_map(merge); ++ } ++ ++ free_extent_map(em); ++out: ++ write_unlock(&tree->lock); ++ return ret; ++ ++} ++ + /** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in +@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree, + ret = -EEXIST; + goto out; + } +- assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; +@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + struct rb_node *next = NULL; + u64 end = range_end(start, len); + +- assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); +@@ -319,6 +367,54 @@ out: + } + + /** ++ * search_extent_mapping - find a nearby extent map ++ * @tree: tree to lookup in ++ * @start: byte offset to start the search ++ * @len: length of the lookup range ++ * ++ * Find and return the first extent_map struct in @tree that intersects the ++ * [start, len] range. ++ * ++ * If one can't be found, any nearby extent may be returned ++ */ ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len) ++{ ++ struct extent_map *em; ++ struct rb_node *rb_node; ++ struct rb_node *prev = NULL; ++ struct rb_node *next = NULL; ++ ++ rb_node = __tree_search(&tree->map, start, &prev, &next); ++ if (!rb_node && prev) { ++ em = rb_entry(prev, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node && next) { ++ em = rb_entry(next, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node) { ++ em = NULL; ++ goto out; ++ } ++ if (IS_ERR(rb_node)) { ++ em = ERR_PTR(PTR_ERR(rb_node)); ++ goto out; ++ } ++ em = rb_entry(rb_node, struct extent_map, rb_node); ++ goto found; ++ ++ em = NULL; ++ goto out; ++ ++found: ++ atomic_inc(&em->refs); ++out: ++ return em; ++} ++ ++/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed +@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); +- assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h +index fb6eeef..ab6d74b 100644 +--- a/fs/btrfs/extent_map.h ++++ b/fs/btrfs/extent_map.h +@@ -31,7 +31,7 @@ struct extent_map { + + struct extent_map_tree { + struct rb_root map; +- spinlock_t lock; ++ rwlock_t lock; + }; + + static inline u64 extent_map_end(struct extent_map *em) +@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask); + void free_extent_map(struct extent_map *em); + int __init extent_map_init(void); + void extent_map_exit(void); ++int unpin_extent_cache(struct extent_map_tree *tree, u64 
start, u64 len); ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len); + #endif +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 4b83397..4599113 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +- u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; +@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; ++ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); ++ if (err) ++ return err; + +- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- trans = btrfs_join_transaction(root, 1); +- if (!trans) { +- err = -ENOMEM; +- goto out_unlock; +- } +- btrfs_set_trans_block_group(trans, inode); +- hint_byte = 0; +- +- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- +- /* check for reserved extents on each page, we don't want +- * to reset the delalloc bit on things that already have +- * extents reserved. +- */ +- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); +@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + * at this time. + */ + } +- err = btrfs_end_transaction(trans, root); +-out_unlock: +- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; + } + +@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { +- spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); ++ write_unlock(&em_tree->lock); + break; + } + if (start < em->start) { +@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + start = em->start + em->len; + } + free_extent_map(em); ++ write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + free_extent_map(split); + split = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); +@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_byte) ++ u64 inline_limit, u64 *hint_byte, int drop_cache) + { + u64 extent_end = 0; + u64 search_start = start; +@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + int ret; + + inline_limit = 0; +- btrfs_drop_extent_cache(inode, start, end - 1, 0); ++ if (drop_cache) ++ btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = 
btrfs_alloc_path(); + if (!path) +@@ -894,7 +878,8 @@ again: + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, +- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, ++ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); +@@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); ++ ++ /* do the reserve before the mutex lock in case we have to do some ++ * flushing. We wouldn't deadlock, but this is more polite. ++ */ ++ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (err) ++ goto out_nolock; ++ ++ mutex_lock(&inode->i_mutex); ++ + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) +- goto out_nolock; ++ goto out; ++ + if (count == 0) +- goto out_nolock; ++ goto out; + + err = file_remove_suid(file); + if (err) +- goto out_nolock; ++ goto out; ++ + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + +- mutex_lock(&inode->i_mutex); ++ /* generic_write_checks can change our pos */ ++ start_pos = pos; ++ + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; +@@ -1047,6 +1046,7 @@ out: + mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + + out_nolock: + kfree(pages); +@@ -1087,8 +1087,10 @@ out_nolock: + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); +- } else { ++ } else if (ret != BTRFS_NO_LOG_SYNC) { + btrfs_commit_transaction(trans, root); ++ } else { ++ btrfs_end_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { +@@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + int ret = 0; + struct btrfs_trans_handle *trans; + ++ ++ /* we wait first, since the writeback may change the inode */ ++ root->log_batch++; ++ /* the VFS called filemap_fdatawrite for us */ ++ btrfs_wait_ordered_range(inode, 0, (u64)-1); ++ root->log_batch++; ++ + /* + * check the transaction that last modified this inode + * and see if its already been committed +@@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + if (!BTRFS_I(inode)->last_trans) + goto out; + ++ /* ++ * if the last transaction that changed this file was before ++ * the current transaction, we can bail out now without any ++ * syncing ++ */ + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { +@@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + } + mutex_unlock(&root->fs_info->trans_mutex); + +- root->log_batch++; +- filemap_fdatawrite(inode->i_mapping); +- btrfs_wait_ordered_range(inode, 0, (u64)-1); +- root->log_batch++; +- +- if (datasync && !(inode->i_state & I_DIRTY_PAGES)) +- goto out; + /* + * ok we haven't committed the transaction yet, lets do a commit + */ +@@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + */ + mutex_unlock(&dentry->d_inode->i_mutex); + +- if (ret > 0) { +- ret = btrfs_commit_transaction(trans, root); +- } else { +- ret = btrfs_sync_log(trans, root); +- if (ret == 0) +- ret = 
btrfs_end_transaction(trans, root); +- else ++ if (ret != BTRFS_NO_LOG_SYNC) { ++ if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); ++ } else { ++ ret = btrfs_sync_log(trans, root); ++ if (ret == 0) ++ ret = btrfs_end_transaction(trans, root); ++ else ++ ret = btrfs_commit_transaction(trans, root); ++ } ++ } else { ++ ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&dentry->d_inode->i_mutex); + out: +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 5edcee3..5c2caad 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, + + static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + { +- u64 max_bytes, possible_bytes; ++ u64 max_bytes; ++ u64 bitmap_bytes; ++ u64 extent_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space +@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + +- possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + +- (sizeof(struct btrfs_free_space) * +- block_group->extents_thresh); ++ /* ++ * we want to account for 1 more bitmap than what we have so we can make ++ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as ++ * we add more bitmaps. ++ */ ++ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; + +- if (possible_bytes > max_bytes) { +- int extent_bytes = max_bytes - +- (block_group->total_bitmaps * PAGE_CACHE_SIZE); ++ if (bitmap_bytes >= max_bytes) { ++ block_group->extents_thresh = 0; ++ return; ++ } + +- if (extent_bytes <= 0) { +- block_group->extents_thresh = 0; +- return; +- } ++ /* ++ * we want the extent entry threshold to always be at most 1/2 the maxw ++ * bytes we can have, or whatever is less than that. 
++ */ ++ extent_bytes = max_bytes - bitmap_bytes; ++ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + +- block_group->extents_thresh = extent_bytes / +- (sizeof(struct btrfs_free_space)); +- } ++ block_group->extents_thresh = ++ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + } + + static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, +@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); ++ info->bytes = 0; + link_free_space(block_group, info); + block_group->total_bitmaps++; + +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index 6b627c6..72ce3c1 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { ++ if (ret == -EOVERFLOW) ++ ret = -EMLINK; + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], +@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); +- if (ret == 0 && objectid > root->highest_inode) +- root->highest_inode = objectid; + return ret; + } + +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 9abbced..c56eb59 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); +- *objectid = found_key.objectid; ++ *objectid = max_t(u64, found_key.objectid, ++ BTRFS_FIRST_FREE_OBJECTID - 1); + } else { +- *objectid = BTRFS_FIRST_FREE_OBJECTID; ++ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; + error: +@@ -53,91 +54,27 @@ error: + return ret; + } + +-/* +- * walks the btree of allocated inodes and find a hole. +- */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) + { +- struct btrfs_path *path; +- struct btrfs_key key; + int ret; +- int slot = 0; +- u64 last_ino = 0; +- int start_found; +- struct extent_buffer *l; +- struct btrfs_key search_key; +- u64 search_start = dirid; +- + mutex_lock(&root->objectid_mutex); +- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && +- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { +- *objectid = ++root->last_inode_alloc; +- mutex_unlock(&root->objectid_mutex); +- return 0; +- } +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); +- search_key.objectid = search_start; +- search_key.type = 0; +- search_key.offset = 0; +- +- start_found = 0; +- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); +- if (ret < 0) +- goto error; + +- while (1) { +- l = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(l)) { +- ret = btrfs_next_leaf(root, path); +- if (ret == 0) +- continue; +- if (ret < 0) +- goto error; +- if (!start_found) { +- *objectid = search_start; +- start_found = 1; +- goto found; +- } +- *objectid = last_ino > search_start ? 
+- last_ino : search_start; +- goto found; +- } +- btrfs_item_key_to_cpu(l, &key, slot); +- if (key.objectid >= search_start) { +- if (start_found) { +- if (last_ino < search_start) +- last_ino = search_start; +- if (key.objectid > last_ino) { +- *objectid = last_ino; +- goto found; +- } +- } else if (key.objectid > search_start) { +- *objectid = search_start; +- goto found; +- } +- } +- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) +- break; ++ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_find_highest_inode(root, &root->highest_objectid); ++ if (ret) ++ goto out; ++ } + +- start_found = 1; +- last_ino = key.objectid + 1; +- path->slots[0]++; ++ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ++ ret = -ENOSPC; ++ goto out; + } +- BUG_ON(1); +-found: +- btrfs_release_path(root, path); +- btrfs_free_path(path); +- BUG_ON(*objectid < search_start); +- mutex_unlock(&root->objectid_mutex); +- return 0; +-error: +- btrfs_release_path(root, path); +- btrfs_free_path(path); ++ ++ *objectid = ++root->highest_objectid; ++ ret = 0; ++out: + mutex_unlock(&root->objectid_mutex); + return ret; + } +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 59cba18..f69e5e0 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + } + + ret = btrfs_drop_extents(trans, root, inode, start, +- aligned_end, aligned_end, start, &hint_byte); ++ aligned_end, aligned_end, start, ++ &hint_byte, 1); + BUG_ON(ret); + + if (isize > actual_end) +@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); +- btrfs_drop_extent_cache(inode, start, aligned_end, 0); ++ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + return 0; + } + +@@ -423,9 +424,12 @@ again: + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 0, +- 0, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + ret = 0; + goto free_pages_out; + } +@@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode, + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode, + * clear dirty, set writeback and unlock the pages. 
+ */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- async_extent->start, +- async_extent->start + +- async_extent->ram_size - 1, +- NULL, 1, 1, 0, 1, 1, 0); ++ &BTRFS_I(inode)->io_tree, ++ async_extent->start, ++ async_extent->start + ++ async_extent->ram_size - 1, ++ NULL, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, +@@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 1, +- 1, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | ++ EXTENT_END_WRITEBACK); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; +@@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode, + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + ++ ++ read_lock(&BTRFS_I(inode)->extent_tree.lock); ++ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, ++ start, num_bytes); ++ if (em) { ++ alloc_hint = em->block_start; ++ free_extent_map(em); ++ } ++ read_unlock(&BTRFS_I(inode)->extent_tree.lock); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { ++ unsigned long op; ++ + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, +@@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode, + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; +- + ram_size = ins.offset; + em->len = ins.offset; + +@@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode, + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode, + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits ++ * ++ * Do set the Private2 bit so we know this page was properly ++ * setup for writepage + */ ++ op = unlock ? 
EXTENT_CLEAR_UNLOCK_PAGE : 0; ++ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2; ++ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, +- locked_page, unlock, 1, +- 1, 0, 0, 0); ++ locked_page, op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; +@@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 cur_end; + int limit = 10 * 1024 * 1042; + +- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | +- EXTENT_DELALLOC, 1, 0, GFP_NOFS); ++ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, ++ 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; +@@ -994,6 +1023,7 @@ next_slot: + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; ++ extent_type = 0; + goto out_check; + } + +@@ -1080,9 +1110,9 @@ out_check: + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -1100,8 +1130,10 @@ out_check: + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, +- cur_offset, cur_offset + num_bytes - 1, +- locked_page, 1, 1, 1, 0, 0, 0); ++ cur_offset, cur_offset + num_bytes - 1, ++ locked_page, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2); + cur_offset = extent_end; + if (cur_offset > end) + break; +@@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + return ret; + } + ++static int btrfs_split_extent_hook(struct inode *inode, ++ struct extent_state *orig, u64 split) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 size; ++ ++ if (!(orig->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ size = orig->end - orig->start + 1; ++ if (size > root->fs_info->max_extent) { ++ u64 num_extents; ++ u64 new_size; ++ ++ new_size = orig->end - split + 1; ++ num_extents = div64_u64(size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ ++ /* ++ * if we break a large extent up then leave oustanding_extents ++ * be, since we've already accounted for the large extent. ++ */ ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) < num_extents) ++ return 0; ++ } ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ ++/* ++ * extent_io.c merge_extent_hook, used to track merged delayed allocation ++ * extents so we can keep track of new extents that are just merged onto old ++ * extents, such as when we are doing sequential writes, so we can properly ++ * account for the metadata space we'll need. 
++ */ ++static int btrfs_merge_extent_hook(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 new_size, old_size; ++ u64 num_extents; ++ ++ /* not delalloc, ignore it */ ++ if (!(other->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ old_size = other->end - other->start + 1; ++ if (new->start < other->start) ++ new_size = other->end - new->start + 1; ++ else ++ new_size = new->end - other->start + 1; ++ ++ /* we're not bigger than the max, unreserve the space and go */ ++ if (new_size <= root->fs_info->max_extent) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ return 0; ++ } ++ ++ /* ++ * If we grew by another max_extent, just return, we want to keep that ++ * reserved amount. ++ */ ++ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) > num_extents) ++ return 0; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ + /* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that +@@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) + { ++ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC +@@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_delalloc_reserve_space(root, inode, end - start + 1); + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; +@@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + /* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits) ++static int btrfs_clear_bit_hook(struct inode *inode, ++ struct extent_state *state, unsigned long bits) + { + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ +- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { ++ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + ++ if (bits & EXTENT_DO_ACCOUNTING) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ } ++ + spin_lock(&root->fs_info->delalloc_lock); +- if (end - start + 1 > root->fs_info->delalloc_bytes) { ++ if (state->end - state->start + 1 > ++ root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", +- (unsigned long long)end - start + 1, 
++ (unsigned long long) ++ state->end - state->start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); +@@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + btrfs_delalloc_free_space(root, inode, +- end - start + 1); +- root->fs_info->delalloc_bytes -= end - start + 1; +- BTRFS_I(inode)->delalloc_bytes -= end - start + 1; ++ state->end - ++ state->start + 1); ++ root->fs_info->delalloc_bytes -= state->end - ++ state->start + 1; ++ BTRFS_I(inode)->delalloc_bytes -= state->end - ++ state->start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +@@ -1374,10 +1506,8 @@ again: + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? We're done */ +- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, +- EXTENT_ORDERED, 0)) { ++ if (PagePrivate2(page)) + goto out; +- } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { +@@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; +- int ret; + +- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, +- EXTENT_ORDERED, 0); +- if (ret) ++ /* this page is properly in the ordered list */ ++ if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) +@@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + BUG_ON(!path); + + path->leave_spinning = 1; ++ ++ /* ++ * we may be replacing one extent in the tree with another. ++ * The new extent is pinned in the extent map, and we don't want ++ * to drop it from the cache until it is completely in the btree. ++ * ++ * So, tell btrfs_drop_extents to leave this extent in the cache. ++ * the caller is expected to unpin it and allow it to be merged ++ * with the others. 
++ */ + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, locked_end, +- file_pos, &hint); ++ file_pos, &hint, 0); + BUG_ON(ret); + + ins.objectid = inode->i_ino; +@@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); +- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; +@@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); ++ unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ++ ordered_extent->file_offset, ++ ordered_extent->len); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, +@@ -1623,6 +1763,7 @@ nocow: + static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) + { ++ ClearPagePrivate2(page); + return btrfs_finish_ordered_io(page->mapping->host, start, end); + } + +@@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, + failrec->last_mirror = 0; + failrec->bio_flags = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); +@@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && +- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { ++ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; +@@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) + return ret; + } + ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len) ++{ ++ struct btrfs_path *path; ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u64 index; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, ++ name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, ++ objectid, root->root_key.objectid, ++ dir->i_ino, &index, name, name_len); ++ if (ret < 0) { ++ BUG_ON(ret != -ENOENT); ++ di = btrfs_search_dir_index_item(root, path, dir->i_ino, ++ name, name_len); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(root, path); ++ index = key.offset; ++ } ++ ++ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, ++ index, name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = 
btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ btrfs_i_size_write(dir, dir->i_size - name_len * 2); ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ret = btrfs_update_inode(trans, root, dir); ++ BUG_ON(ret); ++ dir->i_sb->s_dirt = 1; ++ ++ btrfs_free_path(path); ++ return 0; ++} ++ + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + { + struct inode *inode = dentry->d_inode; +@@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + +- /* +- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir +- * the root of a subvolume or snapshot +- */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || +- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -ENOTEMPTY; +- } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + ++ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ err = btrfs_unlink_subvol(trans, root, dir, ++ BTRFS_I(inode)->location.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ goto out; ++ } ++ + err = btrfs_orphan_add(trans, inode); + if (err) +- goto fail_trans; ++ goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +- +-fail_trans: ++out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); +@@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) + + if ((offset & (blocksize - 1)) == 0) + goto out; ++ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); ++ if (ret) ++ goto out; ++ ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) ++ goto out; + + ret = -ENOMEM; + again: + page = grab_cache_page(mapping, index); +- if (!page) ++ if (!page) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + goto out; ++ } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; +@@ -2864,7 +3080,16 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ goto out_unlock; ++ } ++ + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); +@@ -2877,6 +3102,9 @@ again: + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ if (ret) ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + unlock_page(page); + page_cache_release(page); + out: +@@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + u64 last_byte; + u64 cur_offset; + u64 hole_size; +- int err; ++ int err = 0; + + if (size <= hole_start) + return 0; + +- err = btrfs_check_metadata_free_space(root); ++ err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); +- + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, 
hole_start, +@@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + cur_offset, + cur_offset + hole_size, + block_end, +- cur_offset, &hint_byte); ++ cur_offset, &hint_byte, 1); + if (err) + break; ++ ++ err = btrfs_reserve_metadata_space(root, 1); ++ if (err) ++ break; ++ + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); ++ btrfs_unreserve_metadata_space(root, 1); + } + free_extent_map(em); + cur_offset = last_byte; +@@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode) + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + ++ if (inode->i_nlink > 0) { ++ BUG_ON(btrfs_root_refs(&root->root_item) != 0); ++ goto no_delete; ++ } ++ + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + +@@ -3070,29 +3307,67 @@ out_err: + * is kind of like crossing a mount point. + */ + static int fixup_tree_root_location(struct btrfs_root *root, +- struct btrfs_key *location, +- struct btrfs_root **sub_root, +- struct dentry *dentry) ++ struct inode *dir, ++ struct dentry *dentry, ++ struct btrfs_key *location, ++ struct btrfs_root **sub_root) + { +- struct btrfs_root_item *ri; ++ struct btrfs_path *path; ++ struct btrfs_root *new_root; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; ++ int ret; ++ int err = 0; + +- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) +- return 0; +- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) +- return 0; ++ path = btrfs_alloc_path(); ++ if (!path) { ++ err = -ENOMEM; ++ goto out; ++ } + +- *sub_root = btrfs_read_fs_root(root->fs_info, location, +- dentry->d_name.name, +- dentry->d_name.len); +- if (IS_ERR(*sub_root)) +- return PTR_ERR(*sub_root); ++ err = -ENOENT; ++ ret = btrfs_find_root_ref(root->fs_info->tree_root, path, ++ BTRFS_I(dir)->root->root_key.objectid, ++ location->objectid); ++ if (ret) { ++ if (ret < 0) ++ err = ret; ++ goto out; ++ } + +- ri = &(*sub_root)->root_item; +- location->objectid = btrfs_root_dirid(ri); +- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); +- location->offset = 0; ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); ++ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || ++ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) ++ goto out; + +- return 0; ++ ret = memcmp_extent_buffer(leaf, dentry->d_name.name, ++ (unsigned long)(ref + 1), ++ dentry->d_name.len); ++ if (ret) ++ goto out; ++ ++ btrfs_release_path(root->fs_info->tree_root, path); ++ ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, location); ++ if (IS_ERR(new_root)) { ++ err = PTR_ERR(new_root); ++ goto out; ++ } ++ ++ if (btrfs_root_refs(&new_root->root_item) == 0) { ++ err = -ENOENT; ++ goto out; ++ } ++ ++ *sub_root = new_root; ++ location->objectid = btrfs_root_dirid(&new_root->root_item); ++ location->type = BTRFS_INODE_ITEM_KEY; ++ location->offset = 0; ++ err = 0; ++out: ++ btrfs_free_path(path); ++ return err; + } + + static void inode_tree_add(struct inode *inode) +@@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode) + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; +- + again: + p = &root->inode_tree.rb_node; + parent = NULL; + ++ if (hlist_unhashed(&inode->i_hash)) ++ return; ++ + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; +@@ -3132,13 +3409,87 @@ again: + static void inode_tree_del(struct inode *inode) + { + struct btrfs_root 
*root = BTRFS_I(inode)->root; ++ int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ } ++ spin_unlock(&root->inode_lock); ++ ++ if (empty && btrfs_root_refs(&root->root_item) == 0) { ++ synchronize_srcu(&root->fs_info->subvol_srcu); ++ spin_lock(&root->inode_lock); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ spin_unlock(&root->inode_lock); ++ if (empty) ++ btrfs_add_dead_root(root); ++ } ++} ++ ++int btrfs_invalidate_inodes(struct btrfs_root *root) ++{ ++ struct rb_node *node; ++ struct rb_node *prev; ++ struct btrfs_inode *entry; ++ struct inode *inode; ++ u64 objectid = 0; ++ ++ WARN_ON(btrfs_root_refs(&root->root_item) != 0); ++ ++ spin_lock(&root->inode_lock); ++again: ++ node = root->inode_tree.rb_node; ++ prev = NULL; ++ while (node) { ++ prev = node; ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ ++ if (objectid < entry->vfs_inode.i_ino) ++ node = node->rb_left; ++ else if (objectid > entry->vfs_inode.i_ino) ++ node = node->rb_right; ++ else ++ break; ++ } ++ if (!node) { ++ while (prev) { ++ entry = rb_entry(prev, struct btrfs_inode, rb_node); ++ if (objectid <= entry->vfs_inode.i_ino) { ++ node = prev; ++ break; ++ } ++ prev = rb_next(prev); ++ } ++ } ++ while (node) { ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ objectid = entry->vfs_inode.i_ino + 1; ++ inode = igrab(&entry->vfs_inode); ++ if (inode) { ++ spin_unlock(&root->inode_lock); ++ if (atomic_read(&inode->i_count) > 1) ++ d_prune_aliases(inode); ++ /* ++ * btrfs_drop_inode will remove it from ++ * the inode cache when its usage count ++ * hits zero. ++ */ ++ iput(inode); ++ cond_resched(); ++ spin_lock(&root->inode_lock); ++ goto again; ++ } ++ ++ if (cond_resched_lock(&root->inode_lock)) ++ goto again; ++ ++ node = rb_next(node); + } + spin_unlock(&root->inode_lock); ++ return 0; + } + + static noinline void init_btrfs_i(struct inode *inode) +@@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode) + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; ++ bi->last_sub_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; +@@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + return inode; + } + ++static struct inode *new_simple_dir(struct super_block *s, ++ struct btrfs_key *key, ++ struct btrfs_root *root) ++{ ++ struct inode *inode = new_inode(s); ++ ++ if (!inode) ++ return ERR_PTR(-ENOMEM); ++ ++ init_btrfs_i(inode); ++ ++ BTRFS_I(inode)->root = root; ++ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); ++ BTRFS_I(inode)->dummy_inode = 1; ++ ++ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; ++ inode->i_op = &simple_dir_inode_operations; ++ inode->i_fop = &simple_dir_operations; ++ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ ++ return inode; ++} ++ + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + { + struct inode *inode; +- struct btrfs_inode *bi = BTRFS_I(dir); +- struct btrfs_root *root = bi->root; ++ struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; ++ int index; + int ret; + ++ dentry->d_op = &btrfs_dentry_operations; ++ + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +@@ 
-3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + if (ret < 0) + return ERR_PTR(ret); + +- inode = NULL; +- if (location.objectid) { +- ret = fixup_tree_root_location(root, &location, &sub_root, +- dentry); +- if (ret < 0) +- return ERR_PTR(ret); +- if (ret > 0) +- return ERR_PTR(-ENOENT); ++ if (location.objectid == 0) ++ return NULL; ++ ++ if (location.type == BTRFS_INODE_ITEM_KEY) { ++ inode = btrfs_iget(dir->i_sb, &location, root); ++ return inode; ++ } ++ ++ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); ++ ++ index = srcu_read_lock(&root->fs_info->subvol_srcu); ++ ret = fixup_tree_root_location(root, dir, dentry, ++ &location, &sub_root); ++ if (ret < 0) { ++ if (ret != -ENOENT) ++ inode = ERR_PTR(ret); ++ else ++ inode = new_simple_dir(dir->i_sb, &location, sub_root); ++ } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root); +- if (IS_ERR(inode)) +- return ERR_CAST(inode); + } ++ srcu_read_unlock(&root->fs_info->subvol_srcu, index); ++ + return inode; + } + ++static int btrfs_dentry_delete(struct dentry *dentry) ++{ ++ struct btrfs_root *root; ++ ++ if (!dentry->d_inode && !IS_ROOT(dentry)) ++ dentry = dentry->d_parent; ++ ++ if (dentry->d_inode) { ++ root = BTRFS_I(dentry->d_inode)->root; ++ if (btrfs_root_refs(&root->root_item) == 0) ++ return 1; ++ } ++ return 0; ++} ++ + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) + { + struct inode *inode; + +- if (dentry->d_name.len > BTRFS_NAME_LEN) +- return ERR_PTR(-ENAMETOOLONG); +- + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); +@@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + if (ret != 0) + goto fail; + +- if (objectid > root->highest_inode) +- root->highest_inode = objectid; +- + inode->i_uid = current_fsuid(); + + if (dir && (dir->i_mode & S_ISGID)) { +@@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) + { +- int ret; ++ int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + +- key.objectid = inode->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); +- key.offset = 0; ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); ++ } else { ++ key.objectid = inode->i_ino; ++ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.offset = 0; ++ } ++ ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, ++ key.objectid, root->root_key.objectid, ++ parent_inode->i_ino, ++ index, name, name_len); ++ } else if (add_backref) { ++ ret = btrfs_insert_inode_ref(trans, root, ++ name, name_len, inode->i_ino, ++ parent_inode->i_ino, index); ++ } + +- ret = btrfs_insert_dir_item(trans, root, name, name_len, +- parent_inode->i_ino, +- &key, btrfs_inode_type(inode), +- index); + if (ret == 0) { +- if (add_backref) { +- ret = btrfs_insert_inode_ref(trans, root, +- name, name_len, +- inode->i_ino, +- parent_inode->i_ino, +- index); +- } ++ ret = btrfs_insert_dir_item(trans, root, name, name_len, ++ parent_inode->i_ino, &key, ++ btrfs_inode_type(inode), index); ++ BUG_ON(ret); ++ + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; +@@ -3732,11 +4139,18 @@ 
static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + if (!new_valid_dev(rdev)) + return -EINVAL; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3774,6 +4188,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, + u64 objectid; + u64 index = 0; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; ++ + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3838,6 +4261,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + if (inode->i_nlink == 0) + return -ENOENT; + +- btrfs_inc_nlink(inode); +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 item for inode ref ++ * 2 items for dir items ++ */ ++ err = btrfs_reserve_metadata_space(root, 3); + if (err) +- goto fail; ++ return err; ++ ++ btrfs_inc_nlink(inode); ++ + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; +@@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + +- if (err) +- drop_inode = 1; +- +- btrfs_update_inode_block_group(trans, dir); +- err = btrfs_update_inode(trans, root, inode); +- +- if (err) ++ if (err) { + drop_inode = 1; ++ } else { ++ btrfs_update_inode_block_group(trans, dir); ++ err = btrfs_update_inode(trans, root, inode); ++ BUG_ON(err); ++ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); ++ } + + nr = trans->blocks_used; +- +- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 3); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) + u64 index = 0; + unsigned long nr = 1; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode and ref ++ * 2 items for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_unlock; ++ return err; + + trans = btrfs_start_transaction(root, 1); +- btrfs_set_trans_block_group(trans, dir); +- +- if (IS_ERR(trans)) { +- err = PTR_ERR(trans); ++ if (!trans) { ++ err = -ENOMEM; + goto out_unlock; + } ++ btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { +@@ -3967,6 +4400,7 @@ out_fail: + 
btrfs_end_transaction_throttle(trans, root); + + out_unlock: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); +@@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + int compressed; + + again: +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) +@@ -4215,6 +4649,11 @@ again: + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); ++ if (pg_offset + copy_size < PAGE_CACHE_SIZE) { ++ memset(map + pg_offset + copy_size, 0, ++ PAGE_CACHE_SIZE - pg_offset - ++ copy_size); ++ } + kunmap(page); + } + flush_dcache_page(page); +@@ -4259,7 +4698,7 @@ insert: + } + + err = 0; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that +@@ -4299,7 +4738,7 @@ insert: + err = 0; + } + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + out: + if (path) + btrfs_free_path(path); +@@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + ++ ++ /* ++ * we have the page locked, so new writeback can't start, ++ * and the dirty bit won't be cleared while we are here. ++ * ++ * Wait for IO on this page so that we can safely clear ++ * the PagePrivate2 bit and do ordered accounting ++ */ + wait_on_page_writeback(page); ++ + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } +- + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); +@@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_LOCKED, 1, 0, GFP_NOFS); +- btrfs_finish_ordered_io(page->mapping->host, +- page_start, page_end); ++ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, ++ NULL, GFP_NOFS); ++ /* ++ * whoever cleared the private bit is responsible ++ * for the finish_ordered_io ++ */ ++ if (TestClearPagePrivate2(page)) { ++ btrfs_finish_ordered_io(page->mapping->host, ++ page_start, page_end); ++ } + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_ORDERED, +- 1, 1, GFP_NOFS); ++ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); +@@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + goto out; + } + ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ + again: + lock_page(page); +@@ -4504,7 +4964,24 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ /* ++ * XXX - page_mkwrite gets called 
every time the page is dirtied, even ++ * if it was already dirty, so for space accounting reasons we need to ++ * clear any delalloc bits for the range we are fixing to save. There ++ * is probably a better way to do this, but for now keep consistent with ++ * prepare_pages in the normal write path. ++ */ ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ ret = VM_FAULT_SIGBUS; ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ goto out_unlock; ++ } + ret = 0; + + /* page is wholly or partially inside EOF */ +@@ -4521,11 +4998,17 @@ again: + } + ClearPageChecked(page); + set_page_dirty(page); ++ SetPageUptodate(page); ++ ++ BTRFS_I(inode)->last_trans = root->fs_info->generation; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + +- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ if (!ret) ++ return VM_FAULT_LOCKED; + unlock_page(page); + out: + return ret; +@@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ if (ret) ++ return; + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); +@@ -4594,11 +5079,11 @@ out: + * create a new subvolume directory/inode (helper for the ioctl). + */ + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint) + { + struct inode *inode; +- int error; ++ int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, +@@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + +- error = btrfs_update_inode(trans, new_root, inode); +- if (error) +- return error; ++ err = btrfs_update_inode(trans, new_root, inode); ++ BUG_ON(err); + +- d_instantiate(dentry, inode); ++ iput(inode); + return 0; + } + +@@ -4640,7 +5124,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + if (!ei) + return NULL; + ei->last_trans = 0; ++ ei->last_sub_trans = 0; + ei->logged_trans = 0; ++ ei->outstanding_extents = 0; ++ ei->reserved_extents = 0; ++ ei->root = NULL; ++ spin_lock_init(&ei->accounting_lock); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); +@@ -4656,6 +5145,14 @@ void btrfs_destroy_inode(struct inode *inode) + WARN_ON(inode->i_data.nrpages); + + /* ++ * This can happen where we create an inode, but somebody else also ++ * created the same inode and we need to destroy the one we already ++ * created. ++ */ ++ if (!root) ++ goto free; ++ ++ /* + * Make sure we're properly removed from the ordered operation + * lists. 
+ */ +@@ -4690,9 +5187,20 @@ void btrfs_destroy_inode(struct inode *inode) + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); ++free: + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + } + ++void btrfs_drop_inode(struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) ++ generic_delete_inode(inode); ++ else ++ generic_drop_inode(inode); ++} ++ + static void init_once(void *foo) + { + struct btrfs_inode *ei = (struct btrfs_inode *) foo; +@@ -4761,31 +5269,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; ++ struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; ++ u64 root_objectid; + int ret; + +- /* we're not allowed to rename between subvolumes */ +- if (BTRFS_I(old_inode)->root->root_key.objectid != +- BTRFS_I(new_dir)->root->root_key.objectid) ++ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) ++ return -EPERM; ++ ++ /* we only allow rename subvolume link between subvolumes */ ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + ++ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || ++ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) ++ return -ENOTEMPTY; ++ + if (S_ISDIR(old_inode->i_mode) && new_inode && +- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { ++ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; +- } + +- /* to rename a snapshot or subvolume, we need to juggle the +- * backrefs. This isn't coded yet ++ /* ++ * 2 items for dir items ++ * 1 item for orphan entry ++ * 1 item for ref + */ +- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) +- return -EXDEV; +- +- ret = btrfs_check_metadata_free_space(root); ++ ret = btrfs_reserve_metadata_space(root, 4); + if (ret) +- goto out_unlock; ++ return ret; + + /* + * we're using rename to replace one file with another. +@@ -4796,8 +5310,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + ++ /* close the racy window with snapshot create/destroy ioctl */ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ down_read(&root->fs_info->subvol_sem); ++ + trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, new_dir); ++ ++ if (dest != root) ++ btrfs_record_root_in_trans(trans, dest); ++ ++ ret = btrfs_set_inode_index(new_dir, &index); ++ if (ret) ++ goto out_fail; + ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ /* force full log commit if subvolume involved. */ ++ root->fs_info->last_trans_log_full_commit = trans->transid; ++ } else { ++ ret = btrfs_insert_inode_ref(trans, dest, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, ++ old_inode->i_ino, ++ new_dir->i_ino, index); ++ if (ret) ++ goto out_fail; ++ /* ++ * this is an ugly little race, but the rename is required ++ * to make sure that if we crash, the inode is either at the ++ * old name or the new one. pinning the log transaction lets ++ * us make sure we don't allow a log commit to come in after ++ * we unlink the name but before we add the new name back in. 
++ */ ++ btrfs_pin_log_trans(root); ++ } + /* + * make sure the inode gets flushed if it is replacing + * something. +@@ -4807,18 +5353,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + btrfs_add_ordered_operation(trans, root, old_inode); + } + +- /* +- * this is an ugly little race, but the rename is required to make +- * sure that if we crash, the inode is either at the old name +- * or the new one. pinning the log transaction lets us make sure +- * we don't allow a log commit to come in after we unlink the +- * name but before we add the new name back in. +- */ +- btrfs_pin_log_trans(root); +- +- btrfs_set_trans_block_group(trans, new_dir); +- +- btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; +@@ -4826,47 +5360,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + +- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, +- old_dentry->d_name.name, +- old_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; ++ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } else { ++ btrfs_inc_nlink(old_dentry->d_inode); ++ ret = btrfs_unlink_inode(trans, root, old_dir, ++ old_dentry->d_inode, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } ++ BUG_ON(ret); + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; +- ret = btrfs_unlink_inode(trans, root, new_dir, +- new_dentry->d_inode, +- new_dentry->d_name.name, +- new_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(new_inode->i_ino == ++ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ root_objectid = BTRFS_I(new_inode)->location.objectid; ++ ret = btrfs_unlink_subvol(trans, dest, new_dir, ++ root_objectid, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ BUG_ON(new_inode->i_nlink == 0); ++ } else { ++ ret = btrfs_unlink_inode(trans, dest, new_dir, ++ new_dentry->d_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ } ++ BUG_ON(ret); + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); +- if (ret) +- goto out_fail; ++ BUG_ON(ret); + } +- + } +- ret = btrfs_set_inode_index(new_dir, &index); +- if (ret) +- goto out_fail; + +- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, +- old_inode, new_dentry->d_name.name, +- new_dentry->d_name.len, 1, index); +- if (ret) +- goto out_fail; ++ ret = btrfs_add_link(trans, new_dir, old_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, 0, index); ++ BUG_ON(ret); + +- btrfs_log_new_name(trans, old_inode, old_dir, +- new_dentry->d_parent); ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ btrfs_log_new_name(trans, old_inode, old_dir, ++ new_dentry->d_parent); ++ btrfs_end_log_trans(root); ++ } + out_fail: +- +- /* this btrfs_end_log_trans just allows the current +- * log-sub transaction to complete +- */ +- btrfs_end_log_trans(root); + btrfs_end_transaction_throttle(trans, root); +-out_unlock: ++ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ up_read(&root->fs_info->subvol_sem); ++ ++ btrfs_unreserve_metadata_space(root, 4); + return ret; + } + +@@ -4938,11 +5485,18 @@ static int btrfs_symlink(struct inode 
*dir, struct dentry *dentry, + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode item and ref ++ * 2 items for dir items ++ * 1 item for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto out_fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -5023,6 +5577,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + out_fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -5044,6 +5599,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); ++ ++ ret = btrfs_reserve_metadata_space(root, 1); ++ if (ret) ++ goto out; ++ + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); +@@ -5058,9 +5618,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); ++ btrfs_drop_extent_cache(inode, cur_offset, ++ cur_offset + ins.offset -1, 0); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; ++ btrfs_unreserve_metadata_space(root, 1); + } + out: + if (cur_offset > start) { +@@ -5223,6 +5786,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, + }; ++ + static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, +@@ -5245,6 +5809,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, ++ .merge_extent_hook = btrfs_merge_extent_hook, ++ .split_extent_hook = btrfs_split_extent_hook, + }; + + /* +@@ -5309,3 +5875,7 @@ static struct inode_operations btrfs_symlink_inode_operations = { + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + }; ++ ++const struct dentry_operations btrfs_dentry_operations = { ++ .d_delete = btrfs_dentry_delete, ++}; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index bd88f25..cdbb054 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root, + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; +- struct btrfs_root *new_root = root; +- struct inode *dir; ++ struct btrfs_root *new_root; ++ struct inode *dir = dentry->d_parent->d_inode; + int ret; + int err; + u64 objectid; +@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root, + u64 index = 0; + unsigned long nr = 1; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) +- goto fail_commit; ++ return ret; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); +@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root, + if (ret) + goto fail; + ++ key.offset = (u64)-1; ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); ++ BUG_ON(IS_ERR(new_root)); 
++ ++ btrfs_record_root_in_trans(trans, new_root); ++ ++ ret = btrfs_create_subvol_root(trans, new_root, new_dirid, ++ BTRFS_I(dir)->block_group); + /* + * insert the directory item + */ +- key.offset = (u64)-1; +- dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + +@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root, + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- objectid, BTRFS_ROOT_BACKREF_KEY, +- root->root_key.objectid, ++ objectid, root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- root->root_key.objectid, BTRFS_ROOT_REF_KEY, +- objectid, +- dir->i_ino, index, name, namelen); +- +- BUG_ON(ret); +- +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- goto fail_commit; +- +- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); +- BUG_ON(!new_root); +- +- trans = btrfs_start_transaction(new_root, 1); +- BUG_ON(!trans); +- +- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, +- BTRFS_I(dir)->block_group); +- if (ret) +- goto fail; +- ++ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + fail: + nr = trans->blocks_used; +- err = btrfs_commit_transaction(trans, new_root); ++ err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; +-fail_commit: ++ ++ btrfs_unreserve_metadata_space(root, 6); + btrfs_btree_balance_dirty(root, nr); + return ret; + } +@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + if (!root->ref_cows) + return -EINVAL; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); +@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. 
+ */ +-static noinline int btrfs_mksubvol(struct path *parent, char *name, +- int mode, int namelen, ++static noinline int btrfs_mksubvol(struct path *parent, ++ char *name, int namelen, + struct btrfs_root *snap_src) + { ++ struct inode *dir = parent->dentry->d_inode; + struct dentry *dentry; + int error; + +- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); +@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, + if (dentry->d_inode) + goto out_dput; + +- if (!IS_POSIXACL(parent->dentry->d_inode)) +- mode &= ~current_umask(); +- + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + +- error = btrfs_may_create(parent->dentry->d_inode, dentry); ++ error = btrfs_may_create(dir, dentry); + if (error) + goto out_drop_write; + +- /* +- * Actually perform the low-level subvolume creation after all +- * this VFS fuzz. +- * +- * Eventually we want to pass in an inode under which we create this +- * subvolume, but for now all are under the filesystem root. +- * +- * Also we should pass on the mode eventually to allow creating new +- * subvolume with specific mode bits. +- */ ++ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); ++ ++ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) ++ goto out_up_read; ++ + if (snap_src) { +- struct dentry *dir = dentry->d_parent; +- struct dentry *test = dir->d_parent; +- struct btrfs_path *path = btrfs_alloc_path(); +- int ret; +- u64 test_oid; +- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; +- +- test_oid = snap_src->root_key.objectid; +- +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, parent_oid, test_oid); +- if (ret == 0) +- goto create; +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- +- /* we need to make sure we aren't creating a directory loop +- * by taking a snapshot of something that has our current +- * subvol in its directory tree. So, this loops through +- * the dentries and checks the forward refs for each subvolume +- * to see if is references the subvolume where we are +- * placing this new snapshot. 
+- */ +- while (1) { +- if (!test || +- dir == snap_src->fs_info->sb->s_root || +- test == snap_src->fs_info->sb->s_root || +- test->d_inode->i_sb != snap_src->fs_info->sb) { +- break; +- } +- if (S_ISLNK(test->d_inode->i_mode)) { +- printk(KERN_INFO "Btrfs symlink in snapshot " +- "path, failed\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- test_oid = +- BTRFS_I(test->d_inode)->root->root_key.objectid; +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, test_oid, parent_oid); +- if (ret == 0) { +- printk(KERN_INFO "Btrfs snapshot creation " +- "failed, looping\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- test = test->d_parent; +- } +-create: +- btrfs_free_path(path); +- error = create_snapshot(snap_src, dentry, name, namelen); ++ error = create_snapshot(snap_src, dentry, ++ name, namelen); + } else { +- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, +- dentry, name, namelen); ++ error = create_subvol(BTRFS_I(dir)->root, dentry, ++ name, namelen); + } +- if (error) +- goto out_drop_write; +- +- fsnotify_mkdir(parent->dentry->d_inode, dentry); ++ if (!error) ++ fsnotify_mkdir(dir, dentry); ++out_up_read: ++ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + out_drop_write: + mnt_drop_write(parent->mnt); + out_dput: + dput(dentry); + out_unlock: +- mutex_unlock(&parent->dentry->d_inode->i_mutex); ++ mutex_unlock(&dir->i_mutex); + return error; + } + +- + static int btrfs_defrag_file(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -596,9 +534,8 @@ again: + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); +- +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); +@@ -609,7 +546,8 @@ out_unlock: + return 0; + } + +-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) ++static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ++ void __user *arg) + { + u64 new_size; + u64 old_size; +@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; +- struct btrfs_dir_item *di; +- struct btrfs_path *path; + struct file *src_file; +- u64 root_dirid; + int namelen; + int ret = 0; + +@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + goto out; + } + +- path = btrfs_alloc_path(); +- if (!path) { +- ret = -ENOMEM; +- goto out; +- } +- +- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, +- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, +- path, root_dirid, +- vol_args->name, namelen, 0); +- btrfs_free_path(path); +- +- if (di && !IS_ERR(di)) { +- ret = -EEXIST; +- goto out; +- } +- +- if (IS_ERR(di)) { +- ret = PTR_ERR(di); +- goto out; +- } +- + if (subvol) { +- ret = btrfs_mksubvol(&file->f_path, vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, NULL); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); +@@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + fput(src_file); + goto out; + } +- ret = btrfs_mksubvol(&file->f_path, 
vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, BTRFS_I(src_inode)->root); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ BTRFS_I(src_inode)->root); + fput(src_file); + } +- + out: + kfree(vol_args); + return ret; + } + ++/* ++ * helper to check if the subvolume references other subvolumes ++ */ ++static noinline int may_destroy_subvol(struct btrfs_root *root) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = (u64)-1; ++ ++ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, ++ &key, path, 0, 0); ++ if (ret < 0) ++ goto out; ++ BUG_ON(ret == 0); ++ ++ ret = 0; ++ if (path->slots[0] > 0) { ++ path->slots[0]--; ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid == root->root_key.objectid && ++ key.type == BTRFS_ROOT_REF_KEY) ++ ret = -ENOTEMPTY; ++ } ++out: ++ btrfs_free_path(path); ++ return ret; ++} ++ ++static noinline int btrfs_ioctl_snap_destroy(struct file *file, ++ void __user *arg) ++{ ++ struct dentry *parent = fdentry(file); ++ struct dentry *dentry; ++ struct inode *dir = parent->d_inode; ++ struct inode *inode; ++ struct btrfs_root *root = BTRFS_I(dir)->root; ++ struct btrfs_root *dest = NULL; ++ struct btrfs_ioctl_vol_args *vol_args; ++ struct btrfs_trans_handle *trans; ++ int namelen; ++ int ret; ++ int err = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ vol_args = memdup_user(arg, sizeof(*vol_args)); ++ if (IS_ERR(vol_args)) ++ return PTR_ERR(vol_args); ++ ++ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ++ namelen = strlen(vol_args->name); ++ if (strchr(vol_args->name, '/') || ++ strncmp(vol_args->name, "..", namelen) == 0) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = mnt_want_write(file->f_path.mnt); ++ if (err) ++ goto out; ++ ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); ++ dentry = lookup_one_len(vol_args->name, parent, namelen); ++ if (IS_ERR(dentry)) { ++ err = PTR_ERR(dentry); ++ goto out_unlock_dir; ++ } ++ ++ if (!dentry->d_inode) { ++ err = -ENOENT; ++ goto out_dput; ++ } ++ ++ inode = dentry->d_inode; ++ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ err = -EINVAL; ++ goto out_dput; ++ } ++ ++ dest = BTRFS_I(inode)->root; ++ ++ mutex_lock(&inode->i_mutex); ++ err = d_invalidate(dentry); ++ if (err) ++ goto out_unlock; ++ ++ down_write(&root->fs_info->subvol_sem); ++ ++ err = may_destroy_subvol(dest); ++ if (err) ++ goto out_up_write; ++ ++ trans = btrfs_start_transaction(root, 1); ++ ret = btrfs_unlink_subvol(trans, root, dir, ++ dest->root_key.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ BUG_ON(ret); ++ ++ btrfs_record_root_in_trans(trans, dest); ++ ++ memset(&dest->root_item.drop_progress, 0, ++ sizeof(dest->root_item.drop_progress)); ++ dest->root_item.drop_level = 0; ++ btrfs_set_root_refs(&dest->root_item, 0); ++ ++ ret = btrfs_insert_orphan_item(trans, ++ root->fs_info->tree_root, ++ dest->root_key.objectid); ++ BUG_ON(ret); ++ ++ ret = btrfs_commit_transaction(trans, root); ++ BUG_ON(ret); ++ inode->i_flags |= S_DEAD; ++out_up_write: ++ up_write(&root->fs_info->subvol_sem); ++out_unlock: ++ mutex_unlock(&inode->i_mutex); ++ if (!err) { ++ shrink_dcache_sb(root->fs_info->sb); ++ btrfs_invalidate_inodes(dest); ++ d_delete(dentry); ++ } ++out_dput: ++ dput(dentry); ++out_unlock_dir: ++ mutex_unlock(&dir->i_mutex); ++ mnt_drop_write(file->f_path.mnt); ++out: ++ 
kfree(vol_args); ++ return err; ++} ++ + static int btrfs_ioctl_defrag(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) + return ret; + } + +-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, +- u64 off, u64 olen, u64 destoff) ++static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ++ u64 off, u64 olen, u64 destoff) + { + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; +@@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off + len, +- off + len, 0, &hint_byte); ++ off + len, 0, &hint_byte, 1); + + /* clone data */ + key.objectid = src->i_ino; +@@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + datao += off - key.offset; + datal -= off - key.offset; + } +- if (key.offset + datao + datal + key.offset > +- off + len) +- datal = off + len - key.offset - datao; ++ ++ if (key.offset + datal > off + len) ++ datal = off + len - key.offset; ++ + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; +@@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; ++ int ret; + ++ ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; ++ goto out; + +- if (file->private_data) { +- ret = -EINPROGRESS; ++ ret = -EINPROGRESS; ++ if (file->private_data) + goto out; +- } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) +@@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file) + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + ++ ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root, 0); +- if (trans) +- file->private_data = trans; +- else +- ret = -ENOMEM; +- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ ++ if (!trans) ++ goto out_drop; ++ ++ file->private_data = trans; ++ return 0; ++ ++out_drop: ++ mutex_lock(&root->fs_info->trans_mutex); ++ root->fs_info->open_ioctl_trans--; ++ mutex_unlock(&root->fs_info->trans_mutex); ++ mnt_drop_write(file->f_path.mnt); + out: + return ret; + } +@@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; + + trans = file->private_data; +- if (!trans) { +- ret = -EINVAL; +- goto out; +- } +- btrfs_end_transaction(trans, root); ++ if (!trans) ++ return -EINVAL; + file->private_data = NULL; + ++ btrfs_end_transaction(trans, root); ++ + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); +- +-out: +- return ret; ++ return 0; + } + + long btrfs_ioctl(struct file *file, unsigned int +@@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); ++ case BTRFS_IOC_SNAP_DESTROY: ++ return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: +diff --git a/fs/btrfs/ioctl.h 
b/fs/btrfs/ioctl.h +index b320b10..bc49914 100644 +--- a/fs/btrfs/ioctl.h ++++ b/fs/btrfs/ioctl.h +@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args { + + #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +- ++#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ ++ struct btrfs_ioctl_vol_args) + #endif +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index d6f0806..ab21c29 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * + * len is the length of the extent + * +- * This also sets the EXTENT_ORDERED bit on the range in the inode. +- * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; ++ entry->bytes_left = len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); +@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + &entry->rb_node); + BUG_ON(node); + +- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, +- entry_end(entry) - 1, GFP_NOFS); +- + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); +@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); +- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, +- GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; +@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + goto out; + } + +- ret = test_range_bit(io_tree, entry->file_offset, +- entry->file_offset + entry->len - 1, +- EXTENT_ORDERED, 0); +- if (ret == 0) ++ if (io_size > entry->bytes_left) { ++ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", ++ (unsigned long long)entry->bytes_left, ++ (unsigned long long)io_size); ++ } ++ entry->bytes_left -= io_size; ++ if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); ++ else ++ ret = 1; + out: + mutex_unlock(&tree->mutex); + return ret == 0; +@@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, ++ inode, 1); ++ + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + +@@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; ++ int found; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); +@@ -502,6 +507,7 @@ again: + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; ++ found = 0; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) +@@ -514,6 +520,7 @@ 
again: + btrfs_put_ordered_extent(ordered); + break; + } ++ found++; + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); +@@ -521,8 +528,8 @@ again: + break; + end--; + } +- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, +- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { ++ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, ++ EXTENT_DELALLOC, 0, NULL)) { + schedule_timeout(1); + goto again; + } +@@ -613,7 +620,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + goto out; + } + /* +@@ -664,7 +671,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index 3d31c88..993a7ea 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -85,6 +85,9 @@ struct btrfs_ordered_extent { + /* extent length on disk */ + u64 disk_len; + ++ /* number of bytes that still need writing */ ++ u64 bytes_left; ++ + /* flags (described above) */ + unsigned long flags; + +diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c +index 3c0d52a..79cba5f 100644 +--- a/fs/btrfs/orphan.c ++++ b/fs/btrfs/orphan.c +@@ -65,3 +65,23 @@ out: + btrfs_free_path(path); + return ret; + } ++ ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = offset; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ ++ btrfs_free_path(path); ++ return ret; ++} +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index c04f7f2..cfcc93c 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -121,6 +121,15 @@ struct inodevec { + int nr; + }; + ++#define MAX_EXTENTS 128 ++ ++struct file_extent_cluster { ++ u64 start; ++ u64 end; ++ u64 boundary[MAX_EXTENTS]; ++ unsigned int nr; ++}; ++ + struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; +@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) + { + if (test_range_bit(&rc->processed_blocks, bytenr, +- bytenr + blocksize - 1, EXTENT_DIRTY, 1)) ++ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; + } +@@ -2529,56 +2538,94 @@ out: + } + + static noinline_for_stack +-int relocate_inode_pages(struct inode *inode, u64 start, u64 len) ++int setup_extent_mapping(struct inode *inode, u64 start, u64 end, ++ u64 block_start) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; ++ struct extent_map *em; ++ int ret = 0; ++ ++ em = alloc_extent_map(GFP_NOFS); ++ if (!em) ++ return -ENOMEM; ++ ++ em->start = start; ++ em->len = end + 1 - start; ++ em->block_len = em->len; ++ em->block_start = block_start; ++ em->bdev = root->fs_info->fs_devices->latest_bdev; ++ set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, start, end, 
GFP_NOFS); ++ while (1) { ++ write_lock(&em_tree->lock); ++ ret = add_extent_mapping(em_tree, em); ++ write_unlock(&em_tree->lock); ++ if (ret != -EEXIST) { ++ free_extent_map(em); ++ break; ++ } ++ btrfs_drop_extent_cache(inode, start, end, 0); ++ } ++ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); ++ return ret; ++} ++ ++static int relocate_file_extent_cluster(struct inode *inode, ++ struct file_extent_cluster *cluster) + { + u64 page_start; + u64 page_end; +- unsigned long i; +- unsigned long first_index; ++ u64 offset = BTRFS_I(inode)->index_cnt; ++ unsigned long index; + unsigned long last_index; +- unsigned int total_read = 0; +- unsigned int total_dirty = 0; ++ unsigned int dirty_page = 0; + struct page *page; + struct file_ra_state *ra; +- struct btrfs_ordered_extent *ordered; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; ++ int nr = 0; + int ret = 0; + ++ if (!cluster->nr) ++ return 0; ++ + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + ++ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; ++ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; ++ + mutex_lock(&inode->i_mutex); +- first_index = start >> PAGE_CACHE_SHIFT; +- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + +- /* make sure the dirty trick played by the caller work */ +- while (1) { +- ret = invalidate_inode_pages2_range(inode->i_mapping, +- first_index, last_index); +- if (ret != -EBUSY) +- break; +- schedule_timeout(HZ/10); +- } ++ i_size_write(inode, cluster->end + 1 - offset); ++ ret = setup_extent_mapping(inode, cluster->start - offset, ++ cluster->end - offset, cluster->start); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + +- for (i = first_index ; i <= last_index; i++) { +- if (total_read % ra->ra_pages == 0) { +- btrfs_force_ra(inode->i_mapping, ra, NULL, i, +- min(last_index, ra->ra_pages + i - 1)); +- } +- total_read++; +-again: +- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) +- BUG_ON(1); +- page = grab_cache_page(inode->i_mapping, i); ++ WARN_ON(cluster->start != cluster->boundary[0]); ++ while (index <= last_index) { ++ page = find_lock_page(inode->i_mapping, index); + if (!page) { +- ret = -ENOMEM; +- goto out_unlock; ++ page_cache_sync_readahead(inode->i_mapping, ++ ra, NULL, index, ++ last_index + 1 - index); ++ page = grab_cache_page(inode->i_mapping, index); ++ if (!page) { ++ ret = -ENOMEM; ++ goto out_unlock; ++ } ++ } ++ ++ if (PageReadahead(page)) { ++ page_cache_async_readahead(inode->i_mapping, ++ ra, NULL, page, index, ++ last_index + 1 - index); + } ++ + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); +@@ -2589,75 +2636,79 @@ again: + goto out_unlock; + } + } +- wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; +- lock_extent(io_tree, page_start, page_end, GFP_NOFS); +- +- ordered = btrfs_lookup_ordered_extent(inode, page_start); +- if (ordered) { +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); +- unlock_page(page); +- page_cache_release(page); +- btrfs_start_ordered_extent(inode, ordered, 1); +- btrfs_put_ordered_extent(ordered); +- goto again; +- } ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); ++ + set_page_extent_mapped(page); + +- if (i == first_index) +- set_extent_bits(io_tree, page_start, page_end, ++ if (nr < cluster->nr && ++ page_start + offset == cluster->boundary[nr]) { ++ set_extent_bits(&BTRFS_I(inode)->io_tree, ++ 
page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); ++ nr++; ++ } + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); +- total_dirty++; ++ dirty_page++; + +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ unlock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); ++ ++ index++; ++ if (nr < cluster->nr && ++ page_end + 1 + offset == cluster->boundary[nr]) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); ++ dirty_page = 0; ++ } ++ } ++ if (dirty_page) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); + } ++ WARN_ON(nr != cluster->nr); + out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); +- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; + } + + static noinline_for_stack +-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) ++int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, ++ struct file_extent_cluster *cluster) + { +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +- struct extent_map *em; +- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; +- u64 end = start + extent_key->offset - 1; +- +- em = alloc_extent_map(GFP_NOFS); +- em->start = start; +- em->len = extent_key->offset; +- em->block_len = extent_key->offset; +- em->block_start = extent_key->objectid; +- em->bdev = root->fs_info->fs_devices->latest_bdev; +- set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ int ret; + +- /* setup extent map to cheat btrfs_readpage */ +- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); +- while (1) { +- int ret; +- spin_lock(&em_tree->lock); +- ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); +- if (ret != -EEXIST) { +- free_extent_map(em); +- break; +- } +- btrfs_drop_extent_cache(inode, start, end, 0); ++ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; + } +- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + +- return relocate_inode_pages(inode, start, extent_key->offset); ++ if (!cluster->nr) ++ cluster->start = extent_key->objectid; ++ else ++ BUG_ON(cluster->nr >= MAX_EXTENTS); ++ cluster->end = extent_key->objectid + extent_key->offset - 1; ++ cluster->boundary[cluster->nr] = extent_key->objectid; ++ cluster->nr++; ++ ++ if (cluster->nr >= MAX_EXTENTS) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; ++ } ++ return 0; + } + + #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags) + return 0; + } + ++ + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + { + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; ++ struct file_extent_cluster *cluster; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; +@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + int ret; + int err = 0; + ++ cluster = kzalloc(sizeof(*cluster), GFP_NOFS); ++ if (!cluster) ++ return -ENOMEM; ++ + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++ rc->extents_found = 0; ++ rc->extents_skipped = 0; ++ + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, 
(u64)-1, EXTENT_DIRTY, + GFP_NOFS); +@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + nr = trans->blocks_used; +- btrfs_end_transaction_throttle(trans, rc->extent_root); ++ btrfs_end_transaction(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; +- ret = relocate_data_extent(rc->data_inode, &key); ++ ret = relocate_data_extent(rc->data_inode, ++ &key, cluster); + if (ret < 0) { + err = ret; + break; +@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + ++ if (!err) { ++ ret = relocate_file_extent_cluster(rc->data_inode, cluster); ++ if (ret < 0) ++ err = ret; ++ } ++ ++ kfree(cluster); ++ + rc->create_reloc_root = 0; + smp_mb(); + +@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) ++ struct btrfs_root *root, u64 objectid) + { + struct btrfs_path *path; + struct btrfs_inode_item *item; +@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); ++ btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); +@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + if (err) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); ++ err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; +@@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, NULL); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); +@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + btrfs_wait_ordered_extents(fs_info->tree_root, 0); + + while (1) { +- mutex_lock(&fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(fs_info->tree_root); +- mutex_unlock(&fs_info->cleaner_mutex); +- + rc->extents_found = 0; + rc->extents_skipped = 0; + ++ mutex_lock(&fs_info->cleaner_mutex); ++ ++ btrfs_clean_old_snapshots(fs_info->tree_root); + ret = relocate_block_group(rc); ++ ++ mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + break; +@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + } + } + +- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, +- rc->block_group->key.objectid, +- rc->block_group->key.objectid + +- rc->block_group->key.offset - 1); ++ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, ++ rc->block_group->key.objectid, ++ 
rc->block_group->key.objectid + ++ rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); +@@ -3530,6 +3594,26 @@ out: + return err; + } + ++static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) ++{ ++ struct btrfs_trans_handle *trans; ++ int ret; ++ ++ trans = btrfs_start_transaction(root->fs_info->tree_root, 1); ++ ++ memset(&root->root_item.drop_progress, 0, ++ sizeof(root->root_item.drop_progress)); ++ root->root_item.drop_level = 0; ++ btrfs_set_root_refs(&root->root_item, 0); ++ ret = btrfs_update_root(trans, root->fs_info->tree_root, ++ &root->root_key, &root->root_item); ++ BUG_ON(ret); ++ ++ ret = btrfs_end_transaction(trans, root->fs_info->tree_root); ++ BUG_ON(ret); ++ return 0; ++} ++ + /* + * recover relocation interrupted by system crash. + * +@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { +- err = PTR_ERR(fs_root); +- goto out; ++ ret = PTR_ERR(fs_root); ++ if (ret != -ENOENT) { ++ err = ret; ++ goto out; ++ } ++ mark_garbage_root(reloc_root); + } + } + +@@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", +- root->fs_info->thread_pool_size); ++ root->fs_info->thread_pool_size, NULL); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); +diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c +index 0ddc6d6..9351428 100644 +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + goto out; + + BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = 1; ++ goto out; ++ } + l = path->nodes[0]; +- BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); +- if (found_key.objectid != objectid) { ++ if (found_key.objectid != objectid || ++ found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } +- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), +- sizeof(*item)); +- memcpy(key, &found_key, sizeof(found_key)); ++ if (item) ++ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), ++ sizeof(*item)); ++ if (key) ++ memcpy(key, &found_key, sizeof(found_key)); + ret = 0; + out: + btrfs_free_path(path); +@@ -249,6 +255,59 @@ err: + return ret; + } + ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int err = 0; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = 0; ++ ++ while (1) { ++ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); ++ if (ret < 0) { ++ err = ret; ++ break; ++ } ++ ++ leaf = path->nodes[0]; ++ if (path->slots[0] >= btrfs_header_nritems(leaf)) { ++ ret = btrfs_next_leaf(tree_root, path); ++ if (ret < 0) ++ err = ret; ++ if (ret != 0) ++ break; ++ leaf = path->nodes[0]; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(tree_root, path); ++ ++ if (key.objectid != BTRFS_ORPHAN_OBJECTID || ++ key.type != BTRFS_ORPHAN_ITEM_KEY) ++ break; ++ ++ ret = btrfs_find_dead_roots(tree_root, key.offset); ++ if (ret) { ++ err = ret; ++ break; ++ } ++ ++ key.offset++; ++ } ++ ++ 
btrfs_free_path(path); ++ return err; ++} ++ + /* drop the root item for 'key' from 'root' */ + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +@@ -278,31 +337,57 @@ out: + return ret; + } + +-#if 0 /* this will get used when snapshot deletion is implemented */ + int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id) ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, ++ const char *name, int name_len) ++ + { ++ struct btrfs_path *path; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; + struct btrfs_key key; ++ unsigned long ptr; ++ int err = 0; + int ret; +- struct btrfs_path *path; + + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); +- BUG_ON(ret); +- +- ret = btrfs_del_item(trans, tree_root, path); +- BUG_ON(ret); ++ BUG_ON(ret < 0); ++ if (ret == 0) { ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ ++ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); ++ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ++ ptr = (unsigned long)(ref + 1); ++ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); ++ *sequence = btrfs_root_ref_sequence(leaf, ref); ++ ++ ret = btrfs_del_item(trans, tree_root, path); ++ BUG_ON(ret); ++ } else ++ err = -ENOENT; ++ ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } + + btrfs_free_path(path); +- return ret; ++ return err; + } +-#endif + + int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, +@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + return ret; + } + +- + /* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. 
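
As an aside on the hunks around here: btrfs_del_root_ref() above (and btrfs_add_root_ref() just below) now treat each subvolume reference as a pair of items, a BTRFS_ROOT_BACKREF_KEY entry keyed by the referenced root and, via the again: label, a BTRFS_ROOT_REF_KEY entry with objectid and offset swapped. A minimal standalone sketch of that key pairing, using illustrative struct and type values rather than the real on-disk definitions:

    #include <stdio.h>
    #include <stdint.h>

    /* illustrative stand-ins, not the real btrfs on-disk constants */
    #define DEMO_ROOT_BACKREF_KEY 1
    #define DEMO_ROOT_REF_KEY     2

    struct demo_key {
            uint64_t objectid;
            uint8_t  type;
            uint64_t offset;
    };

    int main(void)
    {
            uint64_t root_id = 256, ref_id = 257;

            /* first item: backref, keyed by the root being referenced */
            struct demo_key backref = { root_id, DEMO_ROOT_BACKREF_KEY, ref_id };
            /* second item: forward ref, objectid and offset swapped */
            struct demo_key ref     = { ref_id,  DEMO_ROOT_REF_KEY,     root_id };

            printf("backref (%llu %u %llu)  ref (%llu %u %llu)\n",
                   (unsigned long long)backref.objectid, (unsigned)backref.type,
                   (unsigned long long)backref.offset,
                   (unsigned long long)ref.objectid, (unsigned)ref.type,
                   (unsigned long long)ref.offset);
            return 0;
    }

Deletion walks the same two keys in the same order, which is why the del path reuses the same goto pattern as the insert path.
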
+@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + */ + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) + { + struct btrfs_key key; +@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf; + unsigned long ptr; + +- + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); +@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } ++ + btrfs_free_path(path); +- return ret; ++ return 0; + } +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 6d6d06c..939b68f 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -66,7 +66,7 @@ enum { + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, +- Opt_tag, Opt_notag, Opt_tagid, Opt_err, ++ Opt_tag, Opt_notag, Opt_tagid, Opt_discard, Opt_err, + }; + + static match_table_t tokens = { +@@ -88,6 +89,7 @@ static match_table_t tokens = { + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, ++ {Opt_discard, "discard"}, + {Opt_tag, "tag"}, + {Opt_notag, "notag"}, + {Opt_tagid, "tagid=%u"}, +@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) + info->metadata_ratio); + } + break; ++ case Opt_discard: ++ btrfs_set_opt(info->mount_opt, DISCARD); ++ break; + #ifndef CONFIG_TAGGING_NONE + case Opt_tag: + printk(KERN_INFO "btrfs: use tagging\n"); +@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb, + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; ++#endif + + tree_root = open_ctree(sb, fs_devices, (char *)data); + +@@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb) + } + + static struct super_operations btrfs_super_ops = { ++ .drop_inode = btrfs_drop_inode, + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index cdbb502..bca82a4 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + { + if (root->ref_cows && root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); +- WARN_ON(root->root_item.refs == 0); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, +@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; + ++ if (!current->journal_info) ++ current->journal_info = h; ++ + 
root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, root); + mutex_unlock(&root->fs_info->trans_mutex); +@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); ++ ++ if (current->journal_info == trans) ++ current->journal_info = NULL; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + +@@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + /* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of +- * those extents are on disk for transaction or log commit ++ * those extents are sent to disk but does not wait on them + */ +-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages) ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) + { + int ret; + int err = 0; +@@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + page_cache_release(page); + } + } ++ if (err) ++ werr = err; ++ return werr; ++} ++ ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. This is used to make sure all of ++ * those extents are on disk for transaction or log commit. We wait ++ * on all the pages and clear them from the dirty pages state tree ++ */ ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int err = 0; ++ int werr = 0; ++ struct page *page; ++ struct inode *btree_inode = root->fs_info->btree_inode; ++ u64 start = 0; ++ u64 end; ++ unsigned long index; ++ + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); +@@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + return werr; + } + ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. 
This is used to make sure all of ++ * those extents are on disk for transaction or log commit ++ */ ++int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int ret2; ++ ++ ret = btrfs_write_marked_extents(root, dirty_pages); ++ ret2 = btrfs_wait_marked_extents(root, dirty_pages); ++ return ret || ret2; ++} ++ + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) + { +@@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; +- key.offset = 0; ++ /* record when the snapshot was created in key.offset */ ++ key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); +@@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(&pending->root_key, &key, sizeof(key)); + fail: + kfree(new_root_item); ++ btrfs_unreserve_metadata_space(root, 6); + return ret; + } + +@@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, +- BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, +- parent_root->root_key.objectid, +- BTRFS_ROOT_REF_KEY, +- pending->root_key.objectid, +- parent_inode->i_ino, index, pending->name, +- namelen); +- + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); + fail: +@@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; +- struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + int should_grow = 0; +@@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + return 0; + } + +- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); +- if (!pinned_copy) +- return -ENOMEM; +- +- extent_io_tree_init(pinned_copy, +- root->fs_info->btree_inode->i_mapping, GFP_NOFS); +- + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + if (cur_trans->list.prev != &root->fs_info->trans_list) { +@@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + ret = commit_cowonly_roots(trans, root); + BUG_ON(ret); + ++ btrfs_prepare_extent_commit(trans, root); ++ + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; +@@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + +- btrfs_copy_pinned(root, pinned_copy); +- + trans->transaction->blocked = 0; + + wake_up(&root->fs_info->transaction_wait); +@@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + +- btrfs_finish_extent_commit(trans, root, pinned_copy); +- kfree(pinned_copy); ++ 
btrfs_finish_extent_commit(trans, root); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); +@@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + + mutex_unlock(&root->fs_info->trans_mutex); + ++ if (current->journal_info == trans) ++ current->journal_info = NULL; ++ + kmem_cache_free(btrfs_trans_handle_cachep, trans); + return ret; + } +@@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) + + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); +- list_del_init(&root->root_list); +- btrfs_drop_snapshot(root, 0); ++ list_del(&root->root_list); ++ ++ if (btrfs_header_backref_rev(root->node) < ++ BTRFS_MIXED_BACKREF_REV) ++ btrfs_drop_snapshot(root, 0); ++ else ++ btrfs_drop_snapshot(root, 1); + } + return 0; + } +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index 663c674..d4e3e7a 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) + { + BTRFS_I(inode)->last_trans = trans->transaction->transid; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + } + + int btrfs_end_transaction(struct btrfs_trans_handle *trans, +@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); + int btrfs_transaction_in_commit(struct btrfs_fs_info *info); + #endif +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index d91b0de..f51bf13 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, + + mutex_lock(&root->log_mutex); + if (root->log_root) { ++ if (!root->log_start_pid) { ++ root->log_start_pid = current->pid; ++ root->log_multiple_pids = false; ++ } else if (root->log_start_pid != current->pid) { ++ root->log_multiple_pids = true; ++ } ++ + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } ++ root->log_multiple_pids = false; ++ root->log_start_pid = current->pid; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); +@@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log, + struct walk_control *wc, u64 gen) + { + if (wc->pin) +- btrfs_update_pinned_extents(log->fs_info->extent_root, +- eb->start, eb->len, 1); ++ btrfs_pin_extent(log->fs_info->extent_root, ++ eb->start, eb->len, 0); + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) +@@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, +- start, extent_end, extent_end, start, &alloc_hint); ++ start, extent_end, extent_end, start, &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || +@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root 
*log_root_tree = root->fs_info->log_root_tree; ++ u64 log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; +@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + while (1) { + unsigned long batch = root->log_batch; +- mutex_unlock(&root->log_mutex); +- schedule_timeout_uninterruptible(1); +- mutex_lock(&root->log_mutex); +- ++ if (root->log_multiple_pids) { ++ mutex_unlock(&root->log_mutex); ++ schedule_timeout_uninterruptible(1); ++ mutex_lock(&root->log_mutex); ++ } + wait_for_writer(trans, root); + if (batch == root->log_batch) + break; +@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + goto out; + } + +- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); ++ /* we start IO on all the marked extents here, but we don't actually ++ * wait for them until later. ++ */ ++ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; ++ log_transid = root->log_transid; + root->log_transid++; + log->log_transid = root->log_transid; ++ root->log_start_pid = 0; + smp_mb(); + /* + * log tree has been flushed to disk, new modifications of +@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); +@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; +@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages); + BUG_ON(ret); ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); +@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * the running transaction open, so a full commit can't hop + * in and cause problems either. 
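
The btrfs_sync_log() changes above start IO on the dirty log extents with btrfs_write_marked_extents() and only call btrfs_wait_marked_extents() at the points where the pages must be on disk, while the combined helper simply chains the two passes. A rough userspace sketch of that composition, with stub pass functions standing in for the kernel routines:

    #include <stdio.h>

    /* stubs standing in for the write pass (submit IO) and the wait pass */
    static int write_pass(void) { return 0; }
    static int wait_pass(void)  { return 0; }

    /* the combined helper runs both passes and reports a failure from either */
    static int write_and_wait(void)
    {
            int ret  = write_pass();
            int ret2 = wait_pass();
            return ret || ret2;
    }

    int main(void)
    {
            printf("write_and_wait() -> %d\n", write_and_wait());
            return 0;
    }
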
+ */ +- write_ctree_super(trans, root->fs_info->tree_root, 2); ++ write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = 0; + ++ mutex_lock(&root->log_mutex); ++ if (root->last_log_commit < log_transid) ++ root->last_log_commit = log_transid; ++ mutex_unlock(&root->log_mutex); ++ + out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); +@@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2852,6 +2876,21 @@ out: + return ret; + } + ++static int inode_in_log(struct btrfs_trans_handle *trans, ++ struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ int ret = 0; ++ ++ mutex_lock(&root->log_mutex); ++ if (BTRFS_I(inode)->logged_trans == trans->transid && ++ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) ++ ret = 1; ++ mutex_unlock(&root->log_mutex); ++ return ret; ++} ++ ++ + /* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref +@@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + goto end_no_trans; + } + ++ if (root != BTRFS_I(inode)->root || ++ btrfs_root_refs(&root->root_item) == 0) { ++ ret = 1; ++ goto end_no_trans; ++ } ++ + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + ++ if (inode_in_log(trans, inode)) { ++ ret = BTRFS_NO_LOG_SYNC; ++ goto end_no_trans; ++ } ++ + start_log_trans(trans, root); + + ret = btrfs_log_inode(trans, root, inode, inode_only); +@@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + break; + + inode = parent->d_inode; ++ if (root != BTRFS_I(inode)->root) ++ break; ++ + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; +- u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, +@@ -3010,11 +3062,6 @@ again: + path); + BUG_ON(ret); + } +- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); +- if (ret == 0) { +- wc.replay_dest->highest_inode = highest_inode; +- wc.replay_dest->last_inode_alloc = highest_inode; +- } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index d09c760..0776eac 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -19,6 +19,9 @@ + #ifndef __TREE_LOG_ + #define __TREE_LOG_ + ++/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ ++#define BTRFS_NO_LOG_SYNC 256 ++ + int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 5dbefd1..20cbd2e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -276,7 +276,7 @@ loop_lock: + * is now congested. 
Back off and let other work structs + * run instead + */ +- if (pending && bdi_write_congested(bdi) && batch_run > 32 && ++ if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + +@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); +- if (!device->name) ++ if (!device->name) { ++ kfree(device); + goto error; ++ } + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; +@@ -719,10 +721,9 @@ error: + * called very infrequently and that a given device has a small number + * of extents + */ +-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, +- struct btrfs_device *device, +- u64 num_bytes, u64 *start, +- u64 *max_avail) ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail) + { + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; +@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + ++ ret = btrfs_can_relocate(extent_root, chunk_offset); ++ if (ret) ++ return -ENOSPC; ++ + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); +@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + * step two, delete the device extents and the + * chunk tree entries + */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); +@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; +@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; ++ bool retried = false; ++ int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; +@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ if (ret == -ENOSPC) ++ failed++; ++ else if (ret) ++ BUG(); + } + + if (found_key.offset == 0) +@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + key.offset = found_key.offset - 1; + } + ret = 0; ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ WARN_ON(1); ++ ret = -ENOSPC; ++ } + error: + btrfs_free_path(path); + return ret; +@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); ++ if (ret == -ENOSPC) ++ break; + BUG_ON(ret); + + trans = 
btrfs_start_transaction(dev_root, 1); +@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); +- key.offset = found_key.offset; + /* chunk zero is special */ +- if (key.offset == 0) ++ if (found_key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); +@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ BUG_ON(ret && ret != -ENOSPC); ++ key.offset = found_key.offset - 1; + } + ret = 0; + error: +@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + u64 chunk_offset; + int ret; + int slot; ++ int failed = 0; ++ bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); ++ u64 old_size = device->total_bytes; + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) +@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (!path) + return -ENOMEM; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) { +- ret = -ENOMEM; +- goto done; +- } +- + path->reada = 2; + + lock_chunks(root); +@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + unlock_chunks(root); +- btrfs_end_transaction(trans, root); + ++again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; +@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + goto done; + if (ret) { + ret = 0; ++ btrfs_release_path(root, path); + break; + } + +@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + +- if (key.objectid != device->devid) ++ if (key.objectid != device->devid) { ++ btrfs_release_path(root, path); + break; ++ } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + +- if (key.offset + length <= new_size) ++ if (key.offset + length <= new_size) { ++ btrfs_release_path(root, path); + break; ++ } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); +@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); +- if (ret) ++ if (ret && ret != -ENOSPC) + goto done; ++ if (ret == -ENOSPC) ++ failed++; ++ key.offset -= 1; ++ } ++ ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ ret = -ENOSPC; ++ lock_chunks(root); ++ ++ device->total_bytes = old_size; ++ if (device->writeable) ++ device->fs_devices->total_rw_bytes += diff; ++ unlock_chunks(root); ++ goto done; + } + + /* Shrinking succeeded, else we would be at "done". 
*/ +@@ -2294,9 +2335,9 @@ again: + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + +@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) + int readonly = 0; + int i; + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + +@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) + struct extent_map *em; + + while (1) { +- spin_lock(&tree->map_tree.lock); ++ write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); +- spin_unlock(&tree->map_tree.lock); ++ write_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); +@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); +@@ -2604,9 +2645,9 @@ again: + atomic_set(&multi->error, 0); + } + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; +@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 stripe_nr; + int i, j, nr = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; +@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + + /* already mapped? 
*/ + if (em && em->start <= logical && em->start + em->len > logical) { +@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + map->stripes[i].dev->in_fs_metadata = 1; + } + +- spin_lock(&map_tree->map_tree.lock); ++ write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); +- spin_unlock(&map_tree->map_tree.lock); ++ write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 5139a83..31b0fab 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root); + void btrfs_unlock_volumes(void); + void btrfs_lock_volumes(void); + int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail); + #endif +diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c +index a9d3bf4..b6dd596 100644 +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -260,7 +260,7 @@ err: + * attributes are handled directly. + */ + struct xattr_handler *btrfs_xattr_handlers[] = { +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, + #endif diff --git a/linux-2.6-debug-vm-would-have-oomkilled.patch b/linux-2.6-debug-vm-would-have-oomkilled.patch new file mode 100644 index 000000000..bcad97e35 --- /dev/null +++ b/linux-2.6-debug-vm-would-have-oomkilled.patch @@ -0,0 +1,65 @@ +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b2a2d68..3b132ee 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -67,6 +67,7 @@ extern int sysctl_overcommit_ratio; + extern int sysctl_panic_on_oom; + extern int sysctl_oom_kill_allocating_task; + extern int sysctl_oom_dump_tasks; ++extern int sysctl_would_have_oomkilled; + extern int max_threads; + extern int core_uses_pid; + extern int suid_dumpable; +@@ -861,6 +862,14 @@ static struct ctl_table vm_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "would_have_oomkilled", ++ .data = &sysctl_would_have_oomkilled, ++ .maxlen = sizeof(sysctl_would_have_oomkilled), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = VM_OVERCOMMIT_RATIO, + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index f255eda..3335a94 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -31,6 +31,7 @@ + int sysctl_panic_on_oom; + int sysctl_oom_kill_allocating_task; + int sysctl_oom_dump_tasks; ++int sysctl_would_have_oomkilled; + static DEFINE_SPINLOCK(zone_scan_lock); + /* #define DEBUG */ + +@@ -321,6 +322,12 @@ static void __oom_kill_task(struct task_struct *p, int verbose) + return; + } + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return; ++ } ++ + if (verbose) + printk(KERN_ERR "Killed process %s(%d:#%u)\n", + p->comm, task_pid_nr(p), p->xid); +@@ -363,6 +370,12 @@ static int oom_kill_task(struct task_struct *p) + return 1; + } while_each_thread(g, q); + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). 
But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return 1; ++ } ++ + __oom_kill_task(p, 1); + + /* diff --git a/linux-2.6-execshield.patch b/linux-2.6-execshield.patch new file mode 100644 index 000000000..36ee866aa --- /dev/null +++ b/linux-2.6-execshield.patch @@ -0,0 +1,1013 @@ +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index c45f415..3a6dbad 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +@@ -94,6 +95,9 @@ static inline int desc_empty(const void *ptr) + + #define load_TLS(t, cpu) native_load_tls(t, cpu) + #define set_ldt native_set_ldt ++#ifdef CONFIG_X86_32 ++#define load_user_cs_desc native_load_user_cs_desc ++#endif /*CONFIG_X86_32*/ + + #define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +@@ -380,4 +384,25 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); + } + ++#ifdef CONFIG_X86_32 ++static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) ++{ ++ limit = (limit - 1) / PAGE_SIZE; ++ desc->a = limit & 0xffff; ++ desc->b = (limit & 0xf0000) | 0x00c0fb00; ++} ++ ++static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; ++} ++ ++#define arch_add_exec_range arch_add_exec_range ++#define arch_remove_exec_range arch_remove_exec_range ++#define arch_flush_exec_range arch_flush_exec_range ++extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_flush_exec_range(struct mm_struct *mm); ++#endif /* CONFIG_X86_32 */ ++ + #endif /* _ASM_X86_DESC_H */ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 80a1dee..8314c66 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -7,12 +7,19 @@ + /* + * The x86 doesn't have a mmu context, but + * we put the segment information here. ++ * ++ * exec_limit is used to track the range PROT_EXEC ++ * mappings span. 
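
set_user_cs() above encodes the byte limit as a page-granular segment limit split across the two descriptor words. A small standalone sketch of the same arithmetic, assuming a 4096-byte page; the printed values are just the two 32-bit words the patch computes:

    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_PAGE_SIZE 4096UL

    /* same math as set_user_cs(): page-granular limit, low 16 bits in word a,
     * bits 16-19 in word b together with the fixed access/flag bits */
    static void encode_user_cs(unsigned long byte_limit, uint32_t *a, uint32_t *b)
    {
            unsigned long limit = (byte_limit - 1) / DEMO_PAGE_SIZE;

            *a = limit & 0xffff;
            *b = (limit & 0xf0000) | 0x00c0fb00;
    }

    int main(void)
    {
            uint32_t a, b;

            encode_user_cs(0x08048000UL, &a, &b);   /* arbitrary example limit */
            printf("a=%08x b=%08x\n", (unsigned)a, (unsigned)b);
            return 0;
    }

For the example limit this prints a=00008047 b=00c0fb00, i.e. a limit expressed in pages with the granularity and access bits folded into the high word.
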
+ */ + typedef struct { + void *ldt; + int size; + struct mutex lock; + void *vdso; ++#ifdef CONFIG_X86_32 ++ struct desc_struct user_cs; ++ unsigned long exec_limit; ++#endif + } mm_context_t; + + #ifdef CONFIG_SMP +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 4fb37c8..d5cc31c 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -139,6 +139,9 @@ struct pv_cpu_ops { + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); ++#ifdef CONFIG_X86_32 ++ void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); ++#endif /*CONFIG_X86_32*/ + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); + #ifdef CONFIG_X86_64 +@@ -955,6 +958,12 @@ static inline void set_ldt(const void *addr, unsigned entries) + { + PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); + } ++#ifdef CONFIG_X86_32 ++static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) ++{ ++ PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); ++} ++#endif /*CONFIG_X86_32*/ + static inline void store_gdt(struct desc_ptr *dtr) + { + PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index c776826..fb6b579 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -160,6 +160,9 @@ static inline int hlt_works(int cpu) + + #define cache_line_size() (boot_cpu_data.x86_cache_alignment) + ++#define __HAVE_ARCH_ALIGN_STACK ++extern unsigned long arch_align_stack(unsigned long sp); ++ + extern void cpu_detect(struct cpuinfo_x86 *c); + + extern struct pt_regs *idle_regs(struct pt_regs *); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 3ffdcfa..62cba96 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -804,6 +804,20 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + ++ /* ++ * emulation of NX with segment limits unfortunately means ++ * we have to disable the fast system calls, due to the way that ++ * sysexit clears the segment limits on return. ++ * If we have either disabled exec-shield on the boot command line, ++ * or we have NX, then we don't need to do this. ++ */ ++ if (exec_shield != 0) { ++#ifdef CONFIG_X86_PAE ++ if (!test_cpu_cap(c, X86_FEATURE_NX)) ++#endif ++ clear_cpu_cap(c, X86_FEATURE_SEP); ++ } ++ + /* If the model name is still unset, do table lookup. 
*/ + if (!c->x86_model_id[0]) { + const char *p; +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index 70ec9b9..d956b8c 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -369,6 +369,9 @@ struct pv_cpu_ops pv_cpu_ops = { + .read_tscp = native_read_tscp, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = native_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 59f4524..068e286 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -299,7 +299,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, + void + start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + { ++ int cpu; ++ + set_user_gs(regs, 0); ++ + regs->fs = 0; + set_fs(USER_DS); + regs->ds = __USER_DS; +@@ -308,6 +311,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, current->mm); ++ put_cpu(); ++ + /* + * Free the old FP and other extended state + */ +@@ -354,7 +362,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + __unlazy_fpu(prev_p); +- ++ if (next_p->mm) ++ load_user_cs_desc(cpu, next_p->mm); + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) +@@ -495,3 +504,40 @@ unsigned long get_wchan(struct task_struct *p) + return 0; + } + ++static void modify_cs(struct mm_struct *mm, unsigned long limit) ++{ ++ mm->context.exec_limit = limit; ++ set_user_cs(&mm->context.user_cs, limit); ++ if (mm == current->mm) { ++ int cpu; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, mm); ++ put_cpu(); ++ } ++} ++ ++void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) ++{ ++ if (limit > mm->context.exec_limit) ++ modify_cs(mm, limit); ++} ++ ++void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) ++{ ++ struct vm_area_struct *vma; ++ unsigned long limit = PAGE_SIZE; ++ ++ if (old_end == mm->context.exec_limit) { ++ for (vma = mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ modify_cs(mm, limit); ++ } ++} ++ ++void arch_flush_exec_range(struct mm_struct *mm) ++{ ++ mm->context.exec_limit = 0; ++ set_user_cs(&mm->context.user_cs, 0); ++} +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 07d60c8..41e9129 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -118,6 +118,76 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) + if (!user_mode_vm(regs)) + die(str, regs, err); + } ++ ++static inline int ++__compare_user_cs_desc(const struct desc_struct *desc1, ++ const struct desc_struct *desc2) ++{ ++ return ((desc1->limit0 != desc2->limit0) || ++ (desc1->limit != desc2->limit) || ++ (desc1->base0 != desc2->base0) || ++ (desc1->base1 != desc2->base1) || ++ (desc1->base2 != desc2->base2)); ++} ++ ++/* ++ * lazy-check for CS validity on exec-shield binaries: ++ * ++ * the original non-exec stack patch was written by ++ * Solar Designer . Thanks! 
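
arch_remove_exec_range() above and check_lazy_exec_limit() below both recompute the limit the same way: walk the mapping list for the highest end address of an executable VMA, starting from PAGE_SIZE when none is found. A standalone sketch of that scan over a plain array of ranges (the vma struct and flag value are simplified stand-ins):

    #include <stdio.h>

    #define DEMO_PAGE_SIZE 4096UL
    #define DEMO_VM_EXEC   0x4UL    /* illustrative flag value */

    struct demo_vma {
            unsigned long start, end, flags;
    };

    /* highest end of any executable range, or PAGE_SIZE if there is none */
    static unsigned long exec_limit(const struct demo_vma *vmas, int n)
    {
            unsigned long limit = DEMO_PAGE_SIZE;

            for (int i = 0; i < n; i++)
                    if ((vmas[i].flags & DEMO_VM_EXEC) && vmas[i].end > limit)
                            limit = vmas[i].end;
            return limit;
    }

    int main(void)
    {
            struct demo_vma vmas[] = {
                    { 0x08048000, 0x08100000, DEMO_VM_EXEC },  /* text mapping */
                    { 0x40000000, 0x40020000, 0            },  /* data, no exec */
            };

            printf("exec limit: %#lx\n", exec_limit(vmas, 2));
            return 0;
    }
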
++ */ ++static int ++check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) ++{ ++ struct desc_struct *desc1, *desc2; ++ struct vm_area_struct *vma; ++ unsigned long limit; ++ ++ if (current->mm == NULL) ++ return 0; ++ ++ limit = -1UL; ++ if (current->mm->context.exec_limit != -1UL) { ++ limit = PAGE_SIZE; ++ spin_lock(¤t->mm->page_table_lock); ++ for (vma = current->mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ vma = get_gate_vma(current); ++ if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ spin_unlock(¤t->mm->page_table_lock); ++ if (limit >= TASK_SIZE) ++ limit = -1UL; ++ current->mm->context.exec_limit = limit; ++ } ++ set_user_cs(¤t->mm->context.user_cs, limit); ++ ++ desc1 = ¤t->mm->context.user_cs; ++ desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; ++ ++ if (__compare_user_cs_desc(desc1, desc2)) { ++ /* ++ * The CS was not in sync - reload it and retry the ++ * instruction. If the instruction still faults then ++ * we won't hit this branch next time around. ++ */ ++ if (print_fatal_signals >= 2) { ++ printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, ++ smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ desc1->a, desc1->b, desc2->a, desc2->b); ++ } ++ ++ load_user_cs_desc(cpu, current->mm); ++ ++ return 1; ++ } ++ ++ return 0; ++} + #endif + + static void __kprobes +@@ -276,6 +346,29 @@ do_general_protection(struct pt_regs *regs, long error_code) + if (!user_mode(regs)) + goto gp_in_kernel; + ++#ifdef CONFIG_X86_32 ++{ ++ int cpu; ++ int ok; ++ ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (ok) ++ return; ++ ++ if (print_fatal_signals) { ++ printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ current->mm->context.user_cs.a, ++ current->mm->context.user_cs.b); ++ } ++} ++#endif /*CONFIG_X86_32*/ ++ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + +@@ -885,19 +978,37 @@ do_device_not_available(struct pt_regs *regs, long error_code) + } + + #ifdef CONFIG_X86_32 ++/* ++ * The fixup code for errors in iret jumps to here (iret_exc). It loses ++ * the original trap number and erorr code. The bogus trap 32 and error ++ * code 0 are what the vanilla kernel delivers via: ++ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) ++ * ++ * NOTE: Because of the final "1" in the macro we need to enable interrupts. ++ * ++ * In case of a general protection fault in the iret instruction, we ++ * need to check for a lazy CS update for exec-shield. 
++ */ + dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) + { +- siginfo_t info; ++ int ok; ++ int cpu; ++ + local_irq_enable(); + +- info.si_signo = SIGILL; +- info.si_errno = 0; +- info.si_code = ILL_BADSTK; +- info.si_addr = NULL; +- if (notify_die(DIE_TRAP, "iret exception", +- regs, error_code, 32, SIGILL) == NOTIFY_STOP) +- return; +- do_trap(32, SIGILL, "iret exception", regs, error_code, &info); ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (!ok && notify_die(DIE_TRAP, "iret exception", regs, ++ error_code, 32, SIGSEGV) != NOTIFY_STOP) { ++ siginfo_t info; ++ info.si_signo = SIGSEGV; ++ info.si_errno = 0; ++ info.si_code = ILL_BADSTK; ++ info.si_addr = 0; ++ do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info); ++ } + } + #endif + +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 34c1bfb..32c3d8d 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -228,6 +228,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); ++#ifdef CONFIG_X86_32 ++ else ++ if (exec_shield) ++ printk(KERN_INFO "Using x86 segment limits to approximate " ++ "NX protection\n"); ++#endif + + /* Enable PSE if available */ + if (cpu_has_pse) +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 949708d..c1373b6 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -587,6 +587,54 @@ void zap_low_mappings(void) + pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); + EXPORT_SYMBOL_GPL(__supported_pte_mask); + ++#ifdef CONFIG_X86_PAE ++ ++static int disable_nx __initdata; ++ ++/* ++ * noexec = on|off ++ * ++ * Control non executable mappings. 
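
The noexec handling documented just below accepts only "on" (or a missing argument) and "off", and "off" also clears exec_shield. A minimal sketch of that string handling, using plain flags in place of the kernel's CPU-capability checks, MSR writes and globals:

    #include <stdio.h>
    #include <string.h>

    static int demo_nx_enabled  = 1;   /* stand-ins for the kernel flags */
    static int demo_exec_shield = 1;

    /* mirrors the accepted values of the boot parameter: on | off */
    static int parse_noexec(const char *str)
    {
            if (!str || !strcmp(str, "on")) {
                    demo_nx_enabled = 1;
            } else if (!strcmp(str, "off")) {
                    demo_nx_enabled  = 0;
                    demo_exec_shield = 0;   /* noexec=off disables exec-shield too */
            } else {
                    return -1;              /* stands in for -EINVAL */
            }
            return 0;
    }

    int main(void)
    {
            parse_noexec("off");
            printf("nx=%d exec_shield=%d\n", demo_nx_enabled, demo_exec_shield);
            return 0;
    }
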
++ * ++ * on Enable ++ * off Disable (disables exec-shield too) ++ */ ++static int __init noexec_setup(char *str) ++{ ++ if (!str || !strcmp(str, "on")) { ++ if (cpu_has_nx) { ++ __supported_pte_mask |= _PAGE_NX; ++ disable_nx = 0; ++ } ++ } else if (!strcmp(str, "off")) { ++ disable_nx = 1; ++ __supported_pte_mask &= ~_PAGE_NX; ++ exec_shield = 0; ++ } else ++ return -EINVAL; ++ ++ return 0; ++} ++early_param("noexec", noexec_setup); ++ ++void __init set_nx(void) ++{ ++ unsigned int v[4], l, h; ++ ++ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { ++ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); ++ ++ if ((v[3] & (1 << 20)) && !disable_nx) { ++ rdmsr(MSR_EFER, l, h); ++ l |= EFER_NX; ++ wrmsr(MSR_EFER, l, h); ++ nx_enabled = 1; ++ __supported_pte_mask |= _PAGE_NX; ++ } ++ } ++} ++#endif ++ + /* user-defined highmem size */ + static unsigned int highmem_pages = -1; + +diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c +index 1658296..72056cf 100644 +--- a/arch/x86/mm/mmap.c ++++ b/arch/x86/mm/mmap.c +@@ -111,13 +111,16 @@ static unsigned long mmap_legacy_base(void) + */ + void arch_pick_mmap_layout(struct mm_struct *mm) + { +- if (mmap_is_legacy()) { ++ if (!(2 & exec_shield) && mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; ++ if (!(current->personality & READ_IMPLIES_EXEC) ++ && mmap_is_ia32()) ++ mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; + mm->unmap_area = arch_unmap_area_topdown; + } + } +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 821e970..ea5a4c3 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,6 +6,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -129,6 +130,12 @@ void smp_invalidate_interrupt(struct pt_regs *regs) + union smp_flush_state *f; + + cpu = smp_processor_id(); ++ ++#ifdef CONFIG_X86_32 ++ if (current->active_mm) ++ load_user_cs_desc(cpu, current->active_mm); ++#endif ++ + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. 
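The hunks above are where the exec-shield policy bits actually take effect on 32-bit: set_nx() only enables hardware NX when CPUID leaf 0x80000001 advertises it (EDX bit 20), and arch_pick_mmap_layout() drops the legacy bottom-up layout whenever bit 1 of the exec_shield bitmask is set. As an illustration only, not part of the patch, the small userspace program below reads the kernel.exec-shield sysctl that this patch registers in kernel/sysctl.c and decodes those bits; the /proc path exists only on a kernel carrying this patch.

/*
 * Illustrative sketch, not part of the patch: decode the exec-shield
 * bitmask exposed by a patched kernel as /proc/sys/kernel/exec-shield.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/exec-shield", "r");
	int val;

	if (!f) {
		perror("/proc/sys/kernel/exec-shield (kernel not patched?)");
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1) {
		fprintf(stderr, "unexpected sysctl contents\n");
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("exec-shield = %d\n", val);
	printf("  enabled (CS-limit approximation of NX): %s\n",
	       val != 0 ? "yes" : "no");
	printf("  bit 1 (force non-exec stack, always top-down mmap): %s\n",
	       (val & 2) ? "set" : "clear");
	return 0;
}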
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c +index 58bc00f..1fdafb5 100644 +--- a/arch/x86/vdso/vdso32-setup.c ++++ b/arch/x86/vdso/vdso32-setup.c +@@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) + if (compat) + addr = VDSO_HIGH_BASE; + else { +- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); ++ addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index 0a1700a..37b8744 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -321,6 +321,24 @@ static void xen_set_ldt(const void *addr, unsigned entries) + xen_mc_issue(PARAVIRT_LAZY_CPU); + } + ++#ifdef CONFIG_X86_32 ++static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ void *gdt; ++ xmaddr_t mgdt; ++ u64 descriptor; ++ struct desc_struct user_cs; ++ ++ gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]; ++ mgdt = virt_to_machine(gdt); ++ ++ user_cs = mm->context.user_cs; ++ descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32; ++ ++ HYPERVISOR_update_descriptor(mgdt.maddr, descriptor); ++} ++#endif /*CONFIG_X86_32*/ ++ + static void xen_load_gdt(const struct desc_ptr *dtr) + { + unsigned long va = dtr->address; +@@ -886,6 +904,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = xen_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, +diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c +index 40381df..f856fab 100644 +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -73,7 +73,7 @@ static struct linux_binfmt elf_format = { + .hasvdso = 1 + }; + +-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) ++#define BAD_ADDR(x) IS_ERR_VALUE(x) + + static int set_brk(unsigned long start, unsigned long end) + { +@@ -721,6 +721,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + break; + } + ++ if (current->personality == PER_LINUX && (exec_shield & 2)) { ++ executable_stack = EXSTACK_DISABLE_X; ++ current->flags |= PF_RANDOMIZE; ++ } ++ + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + retval = -ELIBBAD; +@@ -740,6 +745,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + if (retval) + goto out_free_dentry; + ++#ifdef CONFIG_X86_32 ++ /* ++ * Turn off the CS limit completely if exec-shield disabled or ++ * NX active: ++ */ ++ if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled) ++ arch_add_exec_range(current->mm, -1); ++#endif ++ + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->mm->def_flags = def_flags; +@@ -747,7 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. 
*/ + SET_PERSONALITY(loc->elf_ex); +- if (elf_read_implies_exec(loc->elf_ex, executable_stack)) ++ if (!(exec_shield & 2) && ++ elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) +@@ -912,7 +927,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + interpreter, + &interp_map_addr, + load_bias); +- if (!IS_ERR((void *)elf_entry)) { ++ if (!BAD_ADDR(elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment +diff --git a/include/linux/mm.h b/include/linux/mm.h +index ad613ed..08f08d0 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1135,7 +1135,13 @@ extern int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long flags, struct page **pages); + +-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); ++extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int); ++ ++static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, ++ unsigned long len, unsigned long pgoff, unsigned long flags) ++{ ++ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); ++} + + extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 0e80e26..af904ea 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -198,6 +198,9 @@ struct mm_struct { + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); ++ unsigned long (*get_unmapped_exec_area) (struct file *filp, ++ unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags); + void (*unmap_area) (struct mm_struct *mm, unsigned long addr); + unsigned long mmap_base; /* base of mmap area */ + unsigned long task_size; /* size of task vm space */ +diff --git a/include/linux/resource.h b/include/linux/resource.h +index 40fc7e6..68c2549 100644 +--- a/include/linux/resource.h ++++ b/include/linux/resource.h +@@ -55,8 +55,11 @@ struct rlimit { + /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. ++ * ++ * (2MB more to cover randomization effects.) + */ +-#define _STK_LIM (8*1024*1024) ++#define _STK_LIM (10*1024*1024) ++#define EXEC_STACK_BIAS (2*1024*1024) + + /* + * GPG2 wants 64kB of mlocked memory, to make sure pass phrases +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4896fdf..3513e03 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -101,6 +101,9 @@ struct fs_struct; + struct bts_context; + struct perf_counter_context; + ++extern int exec_shield; ++extern int print_fatal_signals; ++ + /* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. 
+@@ -359,6 +362,10 @@ extern int sysctl_max_map_count; + extern unsigned long + arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); ++ ++extern unsigned long ++arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, ++ unsigned long, unsigned long); + extern unsigned long + arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index ce664f9..1905e22 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -87,6 +87,26 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; + #ifndef CONFIG_MMU + extern int sysctl_nr_trim_pages; + #endif ++ ++int exec_shield = (1<<0); ++/* exec_shield is a bitmask: ++ * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE ++ * (1<<0) 1: on [also on if !=0] ++ * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK ++ * The old settings ++ * (1<<2) 4: vdso just below .text of main (unless too low) ++ * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low) ++ * are ignored because the vdso is placed completely randomly ++ */ ++ ++static int __init setup_exec_shield(char *str) ++{ ++ get_option(&str, &exec_shield); ++ ++ return 1; ++} ++__setup("exec-shield=", setup_exec_shield); ++ + #ifdef CONFIG_RCU_TORTURE_TEST + extern int rcutorture_runnable; + #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +@@ -382,6 +402,14 @@ static struct ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "exec-shield", ++ .data = &exec_shield, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, +diff --git a/mm/mmap.c b/mm/mmap.c +index 34579b2..260bb3c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -45,6 +46,18 @@ + #define arch_rebalance_pgtables(addr, len) (addr) + #endif + ++/* No sane architecture will #define these to anything else */ ++#ifndef arch_add_exec_range ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#endif ++#ifndef arch_flush_exec_range ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#endif ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ ++ + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); +@@ -392,6 +405,8 @@ static inline void + __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) + { ++ if (vma->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, vma->vm_end); + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; +@@ -494,6 +509,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, vma->vm_end); + } + + /* +@@ -803,6 +820,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ if (prev->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, prev->vm_end); + return prev; + } + +@@ -957,7 +976,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + /* Obtain the address to map to. 
we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ +- addr = get_unmapped_area(file, addr, len, pgoff, flags); ++ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, ++ prot & PROT_EXEC); + if (addr & ~PAGE_MASK) + return addr; + +@@ -1442,13 +1462,17 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) + } + + unsigned long +-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, +- unsigned long pgoff, unsigned long flags) ++get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags, int exec) + { + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + +- get_area = current->mm->get_unmapped_area; ++ if (exec && current->mm->get_unmapped_exec_area) ++ get_area = current->mm->get_unmapped_exec_area; ++ else ++ get_area = current->mm->get_unmapped_area; ++ + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); +@@ -1462,8 +1486,76 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + + return arch_rebalance_pgtables(addr, len); + } ++EXPORT_SYMBOL(get_unmapped_area_prot); ++ ++#define SHLIB_BASE 0x00110000 ++ ++unsigned long ++arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0, ++ unsigned long len0, unsigned long pgoff, unsigned long flags) ++{ ++ unsigned long addr = addr0, len = len0; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long tmp; ++ ++ if (len > TASK_SIZE) ++ return -ENOMEM; ++ ++ if (flags & MAP_FIXED) ++ return addr; ++ ++ if (!addr) ++ addr = randomize_range(SHLIB_BASE, 0x01000000, len); ++ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (TASK_SIZE - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) ++ return addr; ++ } ++ ++ addr = SHLIB_BASE; ++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { ++ /* At this point: (!vma || addr < vma->vm_end). */ ++ if (TASK_SIZE - len < addr) ++ return -ENOMEM; ++ ++ if (!vma || addr + len <= vma->vm_start) { ++ /* ++ * Must not let a PROT_EXEC mapping get into the ++ * brk area: ++ */ ++ if (addr + len > mm->brk) ++ goto failed; ++ ++ /* ++ * Up until the brk area we randomize addresses ++ * as much as possible: ++ */ ++ if (addr >= 0x01000000) { ++ tmp = randomize_range(0x01000000, ++ PAGE_ALIGN(max(mm->start_brk, ++ (unsigned long)0x08000000)), len); ++ vma = find_vma(mm, tmp); ++ if (TASK_SIZE - len >= tmp && ++ (!vma || tmp + len <= vma->vm_start)) ++ return tmp; ++ } ++ /* ++ * Ok, randomization didnt work out - return ++ * the result of the linear search: ++ */ ++ return addr; ++ } ++ addr = vma->vm_end; ++ } ++ ++failed: ++ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags); ++} + +-EXPORT_SYMBOL(get_unmapped_area); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +@@ -1538,6 +1630,14 @@ out: + return prev ? prev->vm_next : vma; + } + ++static int over_stack_limit(unsigned long sz) ++{ ++ if (sz < EXEC_STACK_BIAS) ++ return 0; ++ return (sz - EXEC_STACK_BIAS) > ++ current->signal->rlim[RLIMIT_STACK].rlim_cur; ++} ++ + /* + * Verify that the stack growth is acceptable and + * update accounting. 
This is shared with both the +@@ -1554,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns + return -ENOMEM; + + /* Stack limit test */ +- if (size > rlim[RLIMIT_STACK].rlim_cur) ++ if (over_stack_limit(size)) + return -ENOMEM; + + /* mlock limit tests */ +@@ -1864,10 +1964,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- if (new_below) ++ if (new_below) { ++ unsigned long old_end = vma->vm_end; ++ + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); +- else ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, old_end); ++ } else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +@@ -2116,6 +2220,7 @@ void exit_mmap(struct mm_struct *mm) + vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(tlb, 0, end); ++ arch_flush_exec_range(mm); + } + + /* +diff --git a/mm/mprotect.c b/mm/mprotect.c +index d80311b..032423d 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -26,9 +26,14 @@ + #include + #include + #include ++#include + #include + #include + ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ + #ifndef pgprot_modify + static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) + { +@@ -139,7 +144,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; +- unsigned long charged = 0; ++ unsigned long charged = 0, old_end = vma->vm_end; + pgoff_t pgoff; + int error; + int dirty_accountable = 0; +@@ -204,6 +209,9 @@ success: + dirty_accountable = 1; + } + ++ if (oldflags & VM_EXEC) ++ arch_remove_exec_range(current->mm, old_end); ++ + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); +diff --git a/mm/mremap.c b/mm/mremap.c +index a39b7b9..6bebfde 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -400,8 +400,8 @@ unsigned long do_mremap(unsigned long addr, + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + +- new_addr = get_unmapped_area(vma->vm_file, 0, new_len, +- vma->vm_pgoff, map_flags); ++ new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, ++ vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; diff --git a/linux-2.6-utrace.patch b/linux-2.6-utrace.patch new file mode 100644 index 000000000..ebb318bf6 --- /dev/null +++ b/linux-2.6-utrace.patch @@ -0,0 +1,4102 @@ +diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile +index 9632444..bf4b9e8 100644 +--- a/Documentation/DocBook/Makefile ++++ b/Documentation/DocBook/Makefile +@@ -9,7 +9,7 @@ + DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ + kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ + procfs-guide.xml writing_usb_driver.xml networking.xml \ +- kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ ++ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml utrace.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ + genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ + mac80211.xml debugobjects.xml sh.xml regulator.xml \ +diff --git a/Documentation/DocBook/utrace.tmpl b/Documentation/DocBook/utrace.tmpl +new file mode 
100644 +index 0000000..6cc58a1 +--- /dev/null ++++ b/Documentation/DocBook/utrace.tmpl +@@ -0,0 +1,590 @@ ++ ++ ++ ++ ++ ++ The utrace User Debugging Infrastructure ++ ++ ++ ++ ++ utrace concepts ++ ++ Introduction ++ ++ ++ utrace is infrastructure code for tracing ++ and controlling user threads. This is the foundation for writing ++ tracing engines, which can be loadable kernel modules. ++ ++ ++ ++ The basic actors in utrace are the thread ++ and the tracing engine. A tracing engine is some body of code that ++ calls into the <linux/utrace.h> ++ interfaces, represented by a struct ++ utrace_engine_ops. (Usually it's a kernel module, ++ though the legacy ptrace support is a tracing ++ engine that is not in a kernel module.) The interface operates on ++ individual threads (struct task_struct). ++ If an engine wants to treat several threads as a group, that is up ++ to its higher-level code. ++ ++ ++ ++ Tracing begins by attaching an engine to a thread, using ++ utrace_attach_task or ++ utrace_attach_pid. If successful, it returns a ++ pointer that is the handle used in all other calls. ++ ++ ++ ++ ++ Events and Callbacks ++ ++ ++ An attached engine does nothing by default. An engine makes something ++ happen by requesting callbacks via utrace_set_events ++ and poking the thread with utrace_control. ++ The synchronization issues related to these two calls ++ are discussed further below in . ++ ++ ++ ++ Events are specified using the macro ++ UTRACE_EVENT(type). ++ Each event type is associated with a callback in struct ++ utrace_engine_ops. A tracing engine can leave unused ++ callbacks NULL. The only callbacks required ++ are those used by the event flags it sets. ++ ++ ++ ++ Many engines can be attached to each thread. When a thread has an ++ event, each engine gets a callback if it has set the event flag for ++ that event type. For most events, engines are called in the order they ++ attached. Engines that attach after the event has occurred do not get ++ callbacks for that event. This includes any new engines just attached ++ by an existing engine's callback function. Once the sequence of ++ callbacks for that one event has completed, such new engines are then ++ eligible in the next sequence that starts when there is another event. ++ ++ ++ ++ Event reporting callbacks have details particular to the event type, ++ but are all called in similar environments and have the same ++ constraints. Callbacks are made from safe points, where no locks ++ are held, no special resources are pinned (usually), and the ++ user-mode state of the thread is accessible. So, callback code has ++ a pretty free hand. But to be a good citizen, callback code should ++ never block for long periods. It is fine to block in ++ kmalloc and the like, but never wait for i/o or ++ for user mode to do something. If you need the thread to wait, use ++ UTRACE_STOP and return from the callback ++ quickly. When your i/o finishes or whatever, you can use ++ utrace_control to resume the thread. ++ ++ ++ ++ The UTRACE_EVENT(SYSCALL_ENTRY) event is a special ++ case. While other events happen in the kernel when it will return to ++ user mode soon, this event happens when entering the kernel before it ++ will proceed with the work requested from user mode. Because of this ++ difference, the report_syscall_entry callback is ++ special in two ways. For this event, engines are called in reverse of ++ the normal order (this includes the report_quiesce ++ call that precedes a report_syscall_entry call). 
++ This preserves the semantics that the last engine to attach is called ++ "closest to user mode"--the engine that is first to see a thread's user ++ state when it enters the kernel is also the last to see that state when ++ the thread returns to user mode. For the same reason, if these ++ callbacks use UTRACE_STOP (see the next section), ++ the thread stops immediately after callbacks rather than only when it's ++ ready to return to user mode; when allowed to resume, it will actually ++ attempt the system call indicated by the register values at that time. ++ ++ ++ ++ ++ Stopping Safely ++ ++ Writing well-behaved callbacks ++ ++ ++ Well-behaved callbacks are important to maintain two essential ++ properties of the interface. The first of these is that unrelated ++ tracing engines should not interfere with each other. If your engine's ++ event callback does not return quickly, then another engine won't get ++ the event notification in a timely manner. The second important ++ property is that tracing should be as noninvasive as possible to the ++ normal operation of the system overall and of the traced thread in ++ particular. That is, attached tracing engines should not perturb a ++ thread's behavior, except to the extent that changing its user-visible ++ state is explicitly what you want to do. (Obviously some perturbation ++ is unavoidable, primarily timing changes, ranging from small delays due ++ to the overhead of tracing, to arbitrary pauses in user code execution ++ when a user stops a thread with a debugger for examination.) Even when ++ you explicitly want the perturbation of making the traced thread block, ++ just blocking directly in your callback has more unwanted effects. For ++ example, the CLONE event callbacks are called when ++ the new child thread has been created but not yet started running; the ++ child can never be scheduled until the CLONE ++ tracing callbacks return. (This allows engines tracing the parent to ++ attach to the child.) If a CLONE event callback ++ blocks the parent thread, it also prevents the child thread from ++ running (even to process a SIGKILL). If what you ++ want is to make both the parent and child block, then use ++ utrace_attach_task on the child and then use ++ UTRACE_STOP on both threads. A more crucial ++ problem with blocking in callbacks is that it can prevent ++ SIGKILL from working. A thread that is blocking ++ due to UTRACE_STOP will still wake up and die ++ immediately when sent a SIGKILL, as all threads ++ should. Relying on the utrace ++ infrastructure rather than on private synchronization calls in event ++ callbacks is an important way to help keep tracing robustly ++ noninvasive. ++ ++ ++ ++ ++ Using <constant>UTRACE_STOP</constant> ++ ++ ++ To control another thread and access its state, it must be stopped ++ with UTRACE_STOP. This means that it is ++ stopped and won't start running again while we access it. When a ++ thread is not already stopped, utrace_control ++ returns -EINPROGRESS and an engine must wait ++ for an event callback when the thread is ready to stop. The thread ++ may be running on another CPU or may be blocked. When it is ready ++ to be examined, it will make callbacks to engines that set the ++ UTRACE_EVENT(QUIESCE) event bit. To wake up an ++ interruptible wait, use UTRACE_INTERRUPT. ++ ++ ++ ++ As long as some engine has used UTRACE_STOP and ++ not called utrace_control to resume the thread, ++ then the thread will remain stopped. SIGKILL ++ will wake it up, but it will not run user code. 
When the stop is ++ cleared with utrace_control or a callback ++ return value, the thread starts running again. ++ (See also .) ++ ++ ++ ++ ++ ++ ++ Tear-down Races ++ ++ Primacy of <constant>SIGKILL</constant> ++ ++ Ordinarily synchronization issues for tracing engines are kept fairly ++ straightforward by using UTRACE_STOP. You ask a ++ thread to stop, and then once it makes the ++ report_quiesce callback it cannot do anything else ++ that would result in another callback, until you let it with a ++ utrace_control call. This simple arrangement ++ avoids complex and error-prone code in each one of a tracing engine's ++ event callbacks to keep them serialized with the engine's other ++ operations done on that thread from another thread of control. ++ However, giving tracing engines complete power to keep a traced thread ++ stuck in place runs afoul of a more important kind of simplicity that ++ the kernel overall guarantees: nothing can prevent or delay ++ SIGKILL from making a thread die and release its ++ resources. To preserve this important property of ++ SIGKILL, it as a special case can break ++ UTRACE_STOP like nothing else normally can. This ++ includes both explicit SIGKILL signals and the ++ implicit SIGKILL sent to each other thread in the ++ same thread group by a thread doing an exec, or processing a fatal ++ signal, or making an exit_group system call. A ++ tracing engine can prevent a thread from beginning the exit or exec or ++ dying by signal (other than SIGKILL) if it is ++ attached to that thread, but once the operation begins, no tracing ++ engine can prevent or delay all other threads in the same thread group ++ dying. ++ ++ ++ ++ Final callbacks ++ ++ The report_reap callback is always the final event ++ in the life cycle of a traced thread. Tracing engines can use this as ++ the trigger to clean up their own data structures. The ++ report_death callback is always the penultimate ++ event a tracing engine might see; it's seen unless the thread was ++ already in the midst of dying when the engine attached. Many tracing ++ engines will have no interest in when a parent reaps a dead process, ++ and nothing they want to do with a zombie thread once it dies; for ++ them, the report_death callback is the natural ++ place to clean up data structures and detach. To facilitate writing ++ such engines robustly, given the asynchrony of ++ SIGKILL, and without error-prone manual ++ implementation of synchronization schemes, the ++ utrace infrastructure provides some special ++ guarantees about the report_death and ++ report_reap callbacks. It still takes some care ++ to be sure your tracing engine is robust to tear-down races, but these ++ rules make it reasonably straightforward and concise to handle a lot of ++ corner cases correctly. ++ ++ ++ ++ Engine and task pointers ++ ++ The first sort of guarantee concerns the core data structures ++ themselves. struct utrace_engine is ++ a reference-counted data structure. While you hold a reference, an ++ engine pointer will always stay valid so that you can safely pass it to ++ any utrace call. Each call to ++ utrace_attach_task or ++ utrace_attach_pid returns an engine pointer with a ++ reference belonging to the caller. You own that reference until you ++ drop it using utrace_engine_put. There is an ++ implicit reference on the engine while it is attached. 
So if you drop ++ your only reference, and then use ++ utrace_attach_task without ++ UTRACE_ATTACH_CREATE to look up that same engine, ++ you will get the same pointer with a new reference to replace the one ++ you dropped, just like calling utrace_engine_get. ++ When an engine has been detached, either explicitly with ++ UTRACE_DETACH or implicitly after ++ report_reap, then any references you hold are all ++ that keep the old engine pointer alive. ++ ++ ++ ++ There is nothing a kernel module can do to keep a struct ++ task_struct alive outside of ++ rcu_read_lock. When the task dies and is reaped ++ by its parent (or itself), that structure can be freed so that any ++ dangling pointers you have stored become invalid. ++ utrace will not prevent this, but it can ++ help you detect it safely. By definition, a task that has been reaped ++ has had all its engines detached. All ++ utrace calls can be safely called on a ++ detached engine if the caller holds a reference on that engine pointer, ++ even if the task pointer passed in the call is invalid. All calls ++ return -ESRCH for a detached engine, which tells ++ you that the task pointer you passed could be invalid now. Since ++ utrace_control and ++ utrace_set_events do not block, you can call those ++ inside a rcu_read_lock section and be sure after ++ they don't return -ESRCH that the task pointer is ++ still valid until rcu_read_unlock. The ++ infrastructure never holds task references of its own. Though neither ++ rcu_read_lock nor any other lock is held while ++ making a callback, it's always guaranteed that the struct ++ task_struct and the struct ++ utrace_engine passed as arguments remain valid ++ until the callback function returns. ++ ++ ++ ++ The common means for safely holding task pointers that is available to ++ kernel modules is to use struct pid, which ++ permits put_pid from kernel modules. When using ++ that, the calls utrace_attach_pid, ++ utrace_control_pid, ++ utrace_set_events_pid, and ++ utrace_barrier_pid are available. ++ ++ ++ ++ ++ ++ Serialization of <constant>DEATH</constant> and <constant>REAP</constant> ++ ++ ++ The second guarantee is the serialization of ++ DEATH and REAP event ++ callbacks for a given thread. The actual reaping by the parent ++ (release_task call) can occur simultaneously ++ while the thread is still doing the final steps of dying, including ++ the report_death callback. If a tracing engine ++ has requested both DEATH and ++ REAP event reports, it's guaranteed that the ++ report_reap callback will not be made until ++ after the report_death callback has returned. ++ If the report_death callback itself detaches ++ from the thread, then the report_reap callback ++ will never be made. Thus it is safe for a ++ report_death callback to clean up data ++ structures and detach. ++ ++ ++ ++ Interlock with final callbacks ++ ++ The final sort of guarantee is that a tracing engine will know for sure ++ whether or not the report_death and/or ++ report_reap callbacks will be made for a certain ++ thread. These tear-down races are disambiguated by the error return ++ values of utrace_set_events and ++ utrace_control. Normally ++ utrace_control called with ++ UTRACE_DETACH returns zero, and this means that no ++ more callbacks will be made. 
If the thread is in the midst of dying, ++ it returns -EALREADY to indicate that the ++ report_death callback may already be in progress; ++ when you get this error, you know that any cleanup your ++ report_death callback does is about to happen or ++ has just happened--note that if the report_death ++ callback does not detach, the engine remains attached until the thread ++ gets reaped. If the thread is in the midst of being reaped, ++ utrace_control returns -ESRCH ++ to indicate that the report_reap callback may ++ already be in progress; this means the engine is implicitly detached ++ when the callback completes. This makes it possible for a tracing ++ engine that has decided asynchronously to detach from a thread to ++ safely clean up its data structures, knowing that no ++ report_death or report_reap ++ callback will try to do the same. utrace_detach ++ returns -ESRCH when the struct ++ utrace_engine has already been detached, but is ++ still a valid pointer because of its reference count. A tracing engine ++ can use this to safely synchronize its own independent multiple threads ++ of control with each other and with its event callbacks that detach. ++ ++ ++ ++ In the same vein, utrace_set_events normally ++ returns zero; if the target thread was stopped before the call, then ++ after a successful call, no event callbacks not requested in the new ++ flags will be made. It fails with -EALREADY if ++ you try to clear UTRACE_EVENT(DEATH) when the ++ report_death callback may already have begun, if ++ you try to clear UTRACE_EVENT(REAP) when the ++ report_reap callback may already have begun, or if ++ you try to newly set UTRACE_EVENT(DEATH) or ++ UTRACE_EVENT(QUIESCE) when the target is already ++ dead or dying. Like utrace_control, it returns ++ -ESRCH when the thread has already been detached ++ (including forcible detach on reaping). This lets the tracing engine ++ know for sure which event callbacks it will or won't see after ++ utrace_set_events has returned. By checking for ++ errors, it can know whether to clean up its data structures immediately ++ or to let its callbacks do the work. ++ ++ ++ ++ Using <function>utrace_barrier</function> ++ ++ When a thread is safely stopped, calling ++ utrace_control with UTRACE_DETACH ++ or calling utrace_set_events to disable some events ++ ensures synchronously that your engine won't get any more of the callbacks ++ that have been disabled (none at all when detaching). But these can also ++ be used while the thread is not stopped, when it might be simultaneously ++ making a callback to your engine. For this situation, these calls return ++ -EINPROGRESS when it's possible a callback is in ++ progress. If you are not prepared to have your old callbacks still run, ++ then you can synchronize to be sure all the old callbacks are finished, ++ using utrace_barrier. This is necessary if the ++ kernel module containing your callback code is going to be unloaded. ++ ++ ++ After using UTRACE_DETACH once, further calls to ++ utrace_control with the same engine pointer will ++ return -ESRCH. In contrast, after getting ++ -EINPROGRESS from ++ utrace_set_events, you can call ++ utrace_set_events again later and if it returns zero ++ then know the old callbacks have finished. ++ ++ ++ Unlike all other calls, utrace_barrier (and ++ utrace_barrier_pid) will accept any engine pointer you ++ hold a reference on, even if UTRACE_DETACH has already ++ been used. 
After any utrace_control or ++ utrace_set_events call (these do not block), you can ++ call utrace_barrier to block until callbacks have ++ finished. This returns -ESRCH only if the engine is ++ completely detached (finished all callbacks). Otherwise it waits ++ until the thread is definitely not in the midst of a callback to this ++ engine and then returns zero, but can return ++ -ERESTARTSYS if its wait is interrupted. ++ ++ ++ ++ ++ ++ ++ ++utrace core API ++ ++ ++ The utrace API is declared in <linux/utrace.h>. ++ ++ ++!Iinclude/linux/utrace.h ++!Ekernel/utrace.c ++ ++ ++ ++Machine State ++ ++ ++ The task_current_syscall function can be used on any ++ valid struct task_struct at any time, and does ++ not even require that utrace_attach_task was used at all. ++ ++ ++ ++ The other ways to access the registers and other machine-dependent state of ++ a task can only be used on a task that is at a known safe point. The safe ++ points are all the places where utrace_set_events can ++ request callbacks (except for the DEATH and ++ REAP events). So at any event callback, it is safe to ++ examine current. ++ ++ ++ ++ One task can examine another only after a callback in the target task that ++ returns UTRACE_STOP so that task will not return to user ++ mode after the safe point. This guarantees that the task will not resume ++ until the same engine uses utrace_control, unless the ++ task dies suddenly. To examine safely, one must use a pair of calls to ++ utrace_prepare_examine and ++ utrace_finish_examine surrounding the calls to ++ struct user_regset functions or direct examination ++ of task data structures. utrace_prepare_examine returns ++ an error if the task is not properly stopped and not dead. After a ++ successful examination, the paired utrace_finish_examine ++ call returns an error if the task ever woke up during the examination. If ++ so, any data gathered may be scrambled and should be discarded. This means ++ there was a spurious wake-up (which should not happen), or a sudden death. ++ ++ ++<structname>struct user_regset</structname> ++ ++ ++ The struct user_regset API ++ is declared in <linux/regset.h>. ++ ++ ++!Finclude/linux/regset.h ++ ++ ++ ++ ++ <filename>System Call Information</filename> ++ ++ ++ This function is declared in <linux/ptrace.h>. ++ ++ ++!Elib/syscall.c ++ ++ ++ ++<filename>System Call Tracing</filename> ++ ++ ++ The arch API for system call information is declared in ++ <asm/syscall.h>. ++ Each of these calls can be used only at system call entry tracing, ++ or can be used only at system call exit and the subsequent safe points ++ before returning to user mode. ++ At system call entry tracing means either during a ++ report_syscall_entry callback, ++ or any time after that callback has returned UTRACE_STOP. ++ ++ ++!Finclude/asm-generic/syscall.h ++ ++ ++ ++ ++ ++Kernel Internals ++ ++ ++ This chapter covers the interface to the tracing infrastructure ++ from the core of the kernel and the architecture-specific code. ++ This is for maintainers of the kernel and arch code, and not relevant ++ to using the tracing facilities described in preceding chapters. ++ ++ ++Core Calls In ++ ++ ++ These calls are declared in <linux/tracehook.h>. ++ The core kernel calls these functions at various important places. ++ ++ ++!Finclude/linux/tracehook.h ++ ++ ++ ++Architecture Calls Out ++ ++ ++ An arch that has done all these things sets ++ CONFIG_HAVE_ARCH_TRACEHOOK. ++ This is required to enable the utrace code. 
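The engine-facing calls documented in the chapters above (utrace_attach_task(), utrace_set_events(), utrace_engine_put() and the report_* callbacks) are easiest to see together in a small example. The sketch below is illustrative only and is not part of this patch; the callback and function signatures, and the ERR_PTR() error convention for utrace_attach_task(), are inferred from the kerneldoc this patch adds to include/linux/utrace.h, so treat them as assumptions and defer to the header itself.

/*
 * Illustrative sketch, not part of the patch: a minimal tracing engine
 * that logs every successful exec of one task and otherwise leaves it
 * alone.  Signatures are inferred from the utrace kerneldoc above.
 */
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/binfmts.h>
#include <linux/utrace.h>

static u32 demo_report_exec(enum utrace_resume_action action,
			    struct utrace_engine *engine,
			    struct task_struct *task,
			    const struct linux_binfmt *fmt,
			    const struct linux_binprm *bprm,
			    struct pt_regs *regs)
{
	pr_info("utrace demo: pid %d exec'd %s\n",
		task_pid_nr(task), bprm->filename);
	return UTRACE_RESUME;		/* do not perturb the thread */
}

static const struct utrace_engine_ops demo_ops = {
	.report_exec	= demo_report_exec,
};

/* Attach a new engine to @task and request only EXEC reports. */
static int demo_attach(struct task_struct *task)
{
	struct utrace_engine *engine;
	int ret;

	engine = utrace_attach_task(task, UTRACE_ATTACH_CREATE,
				    &demo_ops, NULL);
	if (IS_ERR(engine))
		return PTR_ERR(engine);

	ret = utrace_set_events(task, engine, UTRACE_EVENT(EXEC));
	utrace_engine_put(engine);	/* attachment holds its own reference */
	return ret;
}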
++ ++ ++<filename><asm/ptrace.h></filename> ++ ++ ++ An arch defines these in <asm/ptrace.h> ++ if it supports hardware single-step or block-step features. ++ ++ ++!Finclude/linux/ptrace.h arch_has_single_step arch_has_block_step ++!Finclude/linux/ptrace.h user_enable_single_step user_enable_block_step ++!Finclude/linux/ptrace.h user_disable_single_step ++ ++ ++ ++ ++ <filename><asm/syscall.h></filename> ++ ++ ++ An arch provides <asm/syscall.h> that ++ defines these as inlines, or declares them as exported functions. ++ These interfaces are described in . ++ ++ ++ ++ ++ ++ <filename><linux/tracehook.h></filename> ++ ++ ++ An arch must define TIF_NOTIFY_RESUME ++ and TIF_SYSCALL_TRACE ++ in its <asm/thread_info.h>. ++ The arch code must call the following functions, all declared ++ in <linux/tracehook.h> and ++ described in : ++ ++ ++ ++ tracehook_notify_resume ++ ++ ++ tracehook_report_syscall_entry ++ ++ ++ tracehook_report_syscall_exit ++ ++ ++ tracehook_signal_handler ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 725a650..e299a63 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -82,6 +82,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -188,6 +189,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + ++ task_utrace_proc_status(m, p); ++ + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 5368fbd..aecd24e 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -167,6 +167,7 @@ extern struct cred init_cred; + [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ + }, \ + .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ ++ INIT_UTRACE(tsk) \ + INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ + INIT_TRACE_IRQFLAGS \ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4d07542..2060aa1 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -59,6 +59,7 @@ struct sched_param { + #include + #include + #include ++#include + + #include + #include +@@ -1314,6 +1315,11 @@ struct task_struct { + #endif + seccomp_t seccomp; + ++#ifdef CONFIG_UTRACE ++ struct utrace utrace; ++ unsigned long utrace_flags; ++#endif ++ + /* vserver context data */ + struct vx_info *vx_info; + struct nx_info *nx_info; +diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h +index 7c2bfd9..a91d9a4 100644 +--- a/include/linux/tracehook.h ++++ b/include/linux/tracehook.h +@@ -49,6 +49,7 @@ + #include + #include + #include ++#include + struct linux_binprm; + + /** +@@ -63,6 +64,8 @@ struct linux_binprm; + */ + static inline int tracehook_expect_breakpoints(struct task_struct *task) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_CORE))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -111,6 +114,9 @@ static inline void ptrace_report_syscall(struct pt_regs *regs) + static inline __must_check int tracehook_report_syscall_entry( + struct pt_regs *regs) + { ++ if ((task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_ENTRY)) && ++ utrace_report_syscall_entry(regs)) ++ return 1; + ptrace_report_syscall(regs); + return 0; + } +@@ -134,6 +140,8 @@ static inline __must_check int tracehook_report_syscall_entry( + */ + static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_EXIT)) 
++ utrace_report_syscall_exit(regs); + ptrace_report_syscall(regs); + } + +@@ -194,6 +202,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + struct linux_binprm *bprm, + struct pt_regs *regs) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXEC))) ++ utrace_report_exec(fmt, bprm, regs); + if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) && + unlikely(task_ptrace(current) & PT_PTRACED)) + send_sig(SIGTRAP, current, 0); +@@ -211,6 +221,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + */ + static inline void tracehook_report_exit(long *exit_code) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXIT))) ++ utrace_report_exit(exit_code); + ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code); + } + +@@ -254,6 +266,7 @@ static inline int tracehook_prepare_clone(unsigned clone_flags) + static inline void tracehook_finish_clone(struct task_struct *child, + unsigned long clone_flags, int trace) + { ++ utrace_init_task(child); + ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace); + } + +@@ -278,6 +291,8 @@ static inline void tracehook_report_clone(struct pt_regs *regs, + unsigned long clone_flags, + pid_t pid, struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE))) ++ utrace_report_clone(clone_flags, child); + if (unlikely(task_ptrace(child))) { + /* + * It doesn't matter who attached/attaching to this +@@ -310,6 +325,9 @@ static inline void tracehook_report_clone_complete(int trace, + pid_t pid, + struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE)) && ++ (clone_flags & CLONE_VFORK)) ++ utrace_finish_vfork(current); + if (unlikely(trace)) + ptrace_event(0, trace, pid); + } +@@ -344,6 +362,7 @@ static inline void tracehook_report_vfork_done(struct task_struct *child, + */ + static inline void tracehook_prepare_release_task(struct task_struct *task) + { ++ utrace_release_task(task); + } + + /** +@@ -358,6 +377,7 @@ static inline void tracehook_prepare_release_task(struct task_struct *task) + static inline void tracehook_finish_release_task(struct task_struct *task) + { + ptrace_release_task(task); ++ BUG_ON(task->exit_state != EXIT_DEAD); + } + + /** +@@ -379,6 +399,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + const struct k_sigaction *ka, + struct pt_regs *regs, int stepping) + { ++ if (task_utrace_flags(current)) ++ utrace_signal_handler(current, stepping); + if (stepping) + ptrace_notify(SIGTRAP); + } +@@ -396,6 +418,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + static inline int tracehook_consider_ignored_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_IGN))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -415,6 +439,9 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, + static inline int tracehook_consider_fatal_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & (UTRACE_EVENT(SIGNAL_TERM) | ++ UTRACE_EVENT(SIGNAL_CORE)))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -429,6 +456,8 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task, + */ + static inline int tracehook_force_sigpending(void) + { ++ if (unlikely(task_utrace_flags(current))) ++ return utrace_interrupt_pending(); + return 0; + } + +@@ -458,6 +487,8 @@ static inline int 
tracehook_get_signal(struct task_struct *task, + siginfo_t *info, + struct k_sigaction *return_ka) + { ++ if (unlikely(task_utrace_flags(task))) ++ return utrace_get_signal(task, regs, info, return_ka); + return 0; + } + +@@ -485,6 +516,8 @@ static inline int tracehook_get_signal(struct task_struct *task, + */ + static inline int tracehook_notify_jctl(int notify, int why) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(JCTL)) ++ utrace_report_jctl(notify, why); + return notify ?: (current->ptrace & PT_PTRACED) ? why : 0; + } + +@@ -508,6 +541,8 @@ static inline int tracehook_notify_jctl(int notify, int why) + static inline int tracehook_notify_death(struct task_struct *task, + void **death_cookie, int group_dead) + { ++ *death_cookie = task_utrace_struct(task); ++ + if (task_detached(task)) + return task->ptrace ? SIGCHLD : DEATH_REAP; + +@@ -544,6 +579,20 @@ static inline void tracehook_report_death(struct task_struct *task, + int signal, void *death_cookie, + int group_dead) + { ++ /* ++ * This barrier ensures that our caller's setting of ++ * @task->exit_state precedes checking @task->utrace_flags here. ++ * If utrace_set_events() was just called to enable ++ * UTRACE_EVENT(DEATH), then we are obliged to call ++ * utrace_report_death() and not miss it. utrace_set_events() ++ * uses tasklist_lock to synchronize enabling the bit with the ++ * actual change to @task->exit_state, but we need this barrier ++ * to be sure we see a flags change made just before our caller ++ * took the tasklist_lock. ++ */ ++ smp_mb(); ++ if (task_utrace_flags(task) & _UTRACE_DEATH_EVENTS) ++ utrace_report_death(task, death_cookie, group_dead, signal); + } + + #ifdef TIF_NOTIFY_RESUME +@@ -573,10 +622,20 @@ static inline void set_notify_resume(struct task_struct *task) + * asynchronously, this will be called again before we return to + * user mode. + * +- * Called without locks. ++ * Called without locks. However, on some machines this may be ++ * called with interrupts disabled. + */ + static inline void tracehook_notify_resume(struct pt_regs *regs) + { ++ struct task_struct *task = current; ++ /* ++ * This pairs with the barrier implicit in set_notify_resume(). ++ * It ensures that we read the nonzero utrace_flags set before ++ * set_notify_resume() was called by utrace setup. ++ */ ++ smp_rmb(); ++ if (task_utrace_flags(task)) ++ utrace_resume(task, regs); + } + #endif /* TIF_NOTIFY_RESUME */ + +diff --git a/include/linux/utrace.h b/include/linux/utrace.h +new file mode 100644 +index 0000000..f877ec6 +--- /dev/null ++++ b/include/linux/utrace.h +@@ -0,0 +1,692 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ * ++ * This interface allows for notification of interesting events in a ++ * thread. It also mediates access to thread state such as registers. ++ * Multiple unrelated users can be associated with a single thread. ++ * We call each of these a tracing engine. ++ * ++ * A tracing engine starts by calling utrace_attach_task() or ++ * utrace_attach_pid() on the chosen thread, passing in a set of hooks ++ * (&struct utrace_engine_ops), and some associated data. This produces a ++ * &struct utrace_engine, which is the handle used for all other ++ * operations. 
An attached engine has its ops vector, its data, and an ++ * event mask controlled by utrace_set_events(). ++ * ++ * For each event bit that is set, that engine will get the ++ * appropriate ops->report_*() callback when the event occurs. The ++ * &struct utrace_engine_ops need not provide callbacks for an event ++ * unless the engine sets one of the associated event bits. ++ */ ++ ++#ifndef _LINUX_UTRACE_H ++#define _LINUX_UTRACE_H 1 ++ ++#include ++#include ++#include ++#include ++ ++struct linux_binprm; ++struct pt_regs; ++struct utrace; ++struct user_regset; ++struct user_regset_view; ++ ++/* ++ * Event bits passed to utrace_set_events(). ++ * These appear in &struct task_struct.@utrace_flags ++ * and &struct utrace_engine.@flags. ++ */ ++enum utrace_events { ++ _UTRACE_EVENT_QUIESCE, /* Thread is available for examination. */ ++ _UTRACE_EVENT_REAP, /* Zombie reaped, no more tracing possible. */ ++ _UTRACE_EVENT_CLONE, /* Successful clone/fork/vfork just done. */ ++ _UTRACE_EVENT_EXEC, /* Successful execve just completed. */ ++ _UTRACE_EVENT_EXIT, /* Thread exit in progress. */ ++ _UTRACE_EVENT_DEATH, /* Thread has died. */ ++ _UTRACE_EVENT_SYSCALL_ENTRY, /* User entered kernel for system call. */ ++ _UTRACE_EVENT_SYSCALL_EXIT, /* Returning to user after system call. */ ++ _UTRACE_EVENT_SIGNAL, /* Signal delivery will run a user handler. */ ++ _UTRACE_EVENT_SIGNAL_IGN, /* No-op signal to be delivered. */ ++ _UTRACE_EVENT_SIGNAL_STOP, /* Signal delivery will suspend. */ ++ _UTRACE_EVENT_SIGNAL_TERM, /* Signal delivery will terminate. */ ++ _UTRACE_EVENT_SIGNAL_CORE, /* Signal delivery will dump core. */ ++ _UTRACE_EVENT_JCTL, /* Job control stop or continue completed. */ ++ _UTRACE_NEVENTS ++}; ++#define UTRACE_EVENT(type) (1UL << _UTRACE_EVENT_##type) ++ ++/* ++ * All the kinds of signal events. ++ * These all use the @report_signal() callback. ++ */ ++#define UTRACE_EVENT_SIGNAL_ALL (UTRACE_EVENT(SIGNAL) \ ++ | UTRACE_EVENT(SIGNAL_IGN) \ ++ | UTRACE_EVENT(SIGNAL_STOP) \ ++ | UTRACE_EVENT(SIGNAL_TERM) \ ++ | UTRACE_EVENT(SIGNAL_CORE)) ++/* ++ * Both kinds of syscall events; these call the @report_syscall_entry() ++ * and @report_syscall_exit() callbacks, respectively. ++ */ ++#define UTRACE_EVENT_SYSCALL \ ++ (UTRACE_EVENT(SYSCALL_ENTRY) | UTRACE_EVENT(SYSCALL_EXIT)) ++ ++/* ++ * The event reports triggered synchronously by task death. ++ */ ++#define _UTRACE_DEATH_EVENTS (UTRACE_EVENT(DEATH) | UTRACE_EVENT(QUIESCE)) ++ ++/* ++ * Hooks in call these entry points to the ++ * utrace dispatch. They are weak references here only so ++ * tracehook.h doesn't need to #ifndef CONFIG_UTRACE them to ++ * avoid external references in case of unoptimized compilation. 
++ */ ++bool utrace_interrupt_pending(void) ++ __attribute__((weak)); ++void utrace_resume(struct task_struct *, struct pt_regs *) ++ __attribute__((weak)); ++int utrace_get_signal(struct task_struct *, struct pt_regs *, ++ siginfo_t *, struct k_sigaction *) ++ __attribute__((weak)); ++void utrace_report_clone(unsigned long, struct task_struct *) ++ __attribute__((weak)); ++void utrace_finish_vfork(struct task_struct *) ++ __attribute__((weak)); ++void utrace_report_exit(long *exit_code) ++ __attribute__((weak)); ++void utrace_report_death(struct task_struct *, struct utrace *, bool, int) ++ __attribute__((weak)); ++void utrace_report_jctl(int notify, int type) ++ __attribute__((weak)); ++void utrace_report_exec(struct linux_binfmt *, struct linux_binprm *, ++ struct pt_regs *regs) ++ __attribute__((weak)); ++bool utrace_report_syscall_entry(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_report_syscall_exit(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_signal_handler(struct task_struct *, int) ++ __attribute__((weak)); ++ ++#ifndef CONFIG_UTRACE ++ ++/* ++ * uses these accessors to avoid #ifdef CONFIG_UTRACE. ++ */ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return 0; ++} ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return NULL; ++} ++static inline void utrace_init_task(struct task_struct *child) ++{ ++} ++static inline void utrace_release_task(struct task_struct *task) ++{ ++} ++ ++static inline void task_utrace_proc_status(struct seq_file *m, ++ struct task_struct *p) ++{ ++} ++ ++#else /* CONFIG_UTRACE */ ++ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return task->utrace_flags; ++} ++ ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return &task->utrace; ++} ++ ++static inline void utrace_init_task(struct task_struct *task) ++{ ++ task->utrace_flags = 0; ++ memset(&task->utrace, 0, sizeof(task->utrace)); ++ INIT_LIST_HEAD(&task->utrace.attached); ++ INIT_LIST_HEAD(&task->utrace.attaching); ++ spin_lock_init(&task->utrace.lock); ++} ++ ++void utrace_release_task(struct task_struct *); ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p); ++ ++ ++/* ++ * Version number of the API defined in this file. This will change ++ * whenever a tracing engine's code would need some updates to keep ++ * working. We maintain this here for the benefit of tracing engine code ++ * that is developed concurrently with utrace API improvements before they ++ * are merged into the kernel, making LINUX_VERSION_CODE checks unwieldy. ++ */ ++#define UTRACE_API_VERSION 20090416 ++ ++/** ++ * enum utrace_resume_action - engine's choice of action for a traced task ++ * @UTRACE_STOP: Stay quiescent after callbacks. ++ * @UTRACE_REPORT: Make some callback soon. ++ * @UTRACE_INTERRUPT: Make @report_signal() callback soon. ++ * @UTRACE_SINGLESTEP: Resume in user mode for one instruction. ++ * @UTRACE_BLOCKSTEP: Resume in user mode until next branch. ++ * @UTRACE_RESUME: Resume normally in user mode. ++ * @UTRACE_DETACH: Detach my engine (implies %UTRACE_RESUME). ++ * ++ * See utrace_control() for detailed descriptions of each action. This is ++ * encoded in the @action argument and the return value for every callback ++ * with a &u32 return value. ++ * ++ * The order of these is important. When there is more than one engine, ++ * each supplies its choice and the smallest value prevails. 
++ */ ++enum utrace_resume_action { ++ UTRACE_STOP, ++ UTRACE_REPORT, ++ UTRACE_INTERRUPT, ++ UTRACE_SINGLESTEP, ++ UTRACE_BLOCKSTEP, ++ UTRACE_RESUME, ++ UTRACE_DETACH ++}; ++#define UTRACE_RESUME_MASK 0x0f ++ ++/** ++ * utrace_resume_action - &enum utrace_resume_action from callback action ++ * @action: &u32 callback @action argument or return value ++ * ++ * This extracts the &enum utrace_resume_action from @action, ++ * which is the @action argument to a &struct utrace_engine_ops ++ * callback or the return value from one. ++ */ ++static inline enum utrace_resume_action utrace_resume_action(u32 action) ++{ ++ return action & UTRACE_RESUME_MASK; ++} ++ ++/** ++ * enum utrace_signal_action - disposition of signal ++ * @UTRACE_SIGNAL_DELIVER: Deliver according to sigaction. ++ * @UTRACE_SIGNAL_IGN: Ignore the signal. ++ * @UTRACE_SIGNAL_TERM: Terminate the process. ++ * @UTRACE_SIGNAL_CORE: Terminate with core dump. ++ * @UTRACE_SIGNAL_STOP: Deliver as absolute stop. ++ * @UTRACE_SIGNAL_TSTP: Deliver as job control stop. ++ * @UTRACE_SIGNAL_REPORT: Reporting before pending signals. ++ * @UTRACE_SIGNAL_HANDLER: Reporting after signal handler setup. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_signal() callback. It says what will happen to the ++ * signal described by the &siginfo_t parameter to the callback. ++ * ++ * The %UTRACE_SIGNAL_REPORT value is used in an @action argument when ++ * a tracing report is being made before dequeuing any pending signal. ++ * If this is immediately after a signal handler has been set up, then ++ * %UTRACE_SIGNAL_HANDLER is used instead. A @report_signal callback ++ * that uses %UTRACE_SIGNAL_DELIVER|%UTRACE_SINGLESTEP will ensure ++ * it sees a %UTRACE_SIGNAL_HANDLER report. ++ */ ++enum utrace_signal_action { ++ UTRACE_SIGNAL_DELIVER = 0x00, ++ UTRACE_SIGNAL_IGN = 0x10, ++ UTRACE_SIGNAL_TERM = 0x20, ++ UTRACE_SIGNAL_CORE = 0x30, ++ UTRACE_SIGNAL_STOP = 0x40, ++ UTRACE_SIGNAL_TSTP = 0x50, ++ UTRACE_SIGNAL_REPORT = 0x60, ++ UTRACE_SIGNAL_HANDLER = 0x70 ++}; ++#define UTRACE_SIGNAL_MASK 0xf0 ++#define UTRACE_SIGNAL_HOLD 0x100 /* Flag, push signal back on queue. */ ++ ++/** ++ * utrace_signal_action - &enum utrace_signal_action from callback action ++ * @action: @report_signal callback @action argument or return value ++ * ++ * This extracts the &enum utrace_signal_action from @action, which ++ * is the @action argument to a @report_signal callback or the ++ * return value from one. ++ */ ++static inline enum utrace_signal_action utrace_signal_action(u32 action) ++{ ++ return action & UTRACE_SIGNAL_MASK; ++} ++ ++/** ++ * enum utrace_syscall_action - disposition of system call attempt ++ * @UTRACE_SYSCALL_RUN: Run the system call. ++ * @UTRACE_SYSCALL_ABORT: Don't run the system call. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_syscall_entry callback. ++ */ ++enum utrace_syscall_action { ++ UTRACE_SYSCALL_RUN = 0x00, ++ UTRACE_SYSCALL_ABORT = 0x10 ++}; ++#define UTRACE_SYSCALL_MASK 0xf0 ++ ++/** ++ * utrace_syscall_action - &enum utrace_syscall_action from callback action ++ * @action: @report_syscall_entry callback @action or return value ++ * ++ * This extracts the &enum utrace_syscall_action from @action, which ++ * is the @action argument to a @report_syscall_entry callback or the ++ * return value from one. 
++ */ ++static inline enum utrace_syscall_action utrace_syscall_action(u32 action) ++{ ++ return action & UTRACE_SYSCALL_MASK; ++} ++ ++/* ++ * Flags for utrace_attach_task() and utrace_attach_pid(). ++ */ ++#define UTRACE_ATTACH_CREATE 0x0010 /* Attach a new engine. */ ++#define UTRACE_ATTACH_EXCLUSIVE 0x0020 /* Refuse if existing match. */ ++#define UTRACE_ATTACH_MATCH_OPS 0x0001 /* Match engines on ops. */ ++#define UTRACE_ATTACH_MATCH_DATA 0x0002 /* Match engines on data. */ ++#define UTRACE_ATTACH_MATCH_MASK 0x000f ++ ++/** ++ * struct utrace_engine - per-engine structure ++ * @ops: &struct utrace_engine_ops pointer passed to utrace_attach_task() ++ * @data: engine-private &void * passed to utrace_attach_task() ++ * @flags: event mask set by utrace_set_events() plus internal flag bits ++ * ++ * The task itself never has to worry about engines detaching while ++ * it's doing event callbacks. These structures are removed from the ++ * task's active list only when it's stopped, or by the task itself. ++ * ++ * utrace_engine_get() and utrace_engine_put() maintain a reference count. ++ * When it drops to zero, the structure is freed. One reference is held ++ * implicitly while the engine is attached to its task. ++ */ ++struct utrace_engine { ++/* private: */ ++ struct kref kref; ++ struct list_head entry; ++ ++/* public: */ ++ const struct utrace_engine_ops *ops; ++ void *data; ++ ++ unsigned long flags; ++}; ++ ++/** ++ * utrace_engine_get - acquire a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you get another. ++ */ ++static inline void utrace_engine_get(struct utrace_engine *engine) ++{ ++ kref_get(&engine->kref); ++} ++ ++void __utrace_engine_release(struct kref *); ++ ++/** ++ * utrace_engine_put - release a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you lose that reference. ++ * If it was the last one, @engine becomes an invalid pointer. ++ */ ++static inline void utrace_engine_put(struct utrace_engine *engine) ++{ ++ kref_put(&engine->kref, __utrace_engine_release); ++} ++ ++/** ++ * struct utrace_engine_ops - tracing engine callbacks ++ * ++ * Each @report_*() callback corresponds to an %UTRACE_EVENT(*) bit. ++ * utrace_set_events() calls on @engine choose which callbacks will be made ++ * to @engine from @task. ++ * ++ * Most callbacks take an @action argument, giving the resume action ++ * chosen by other tracing engines. All callbacks take an @engine ++ * argument, and a @task argument, which is always equal to @current. ++ * For some calls, @action also includes bits specific to that event ++ * and utrace_resume_action() is used to extract the resume action. ++ * This shows what would happen if @engine wasn't there, or will if ++ * the callback's return value uses %UTRACE_RESUME. This always ++ * starts as %UTRACE_RESUME when no other tracing is being done on ++ * this task. ++ * ++ * All return values contain &enum utrace_resume_action bits. For ++ * some calls, other bits specific to that kind of event are added to ++ * the resume action bits with OR. These are the same bits used in ++ * the @action argument. The resume action returned by a callback ++ * does not override previous engines' choices, it only says what ++ * @engine wants done. What @task actually does is the action that's ++ * most constrained among the choices made by all attached engines. 
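++ *
++ * For illustration (a sketch, not a requirement): a @report_signal
++ * callback that returns
++ *
++ *	UTRACE_STOP | utrace_signal_action(action)
++ *
++ * keeps whatever signal disposition earlier engines chose while asking
++ * that @task stay quiescent; %UTRACE_STOP then prevails because it is
++ * the most constrained choice.
++ *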
++ * See utrace_control() for more information on the actions.
++ *
++ * When %UTRACE_STOP is used in @report_syscall_entry, then @task
++ * stops before attempting the system call.  In other cases, the
++ * resume action does not take effect until @task is ready to check
++ * for signals and return to user mode.  If there are more callbacks
++ * to be made, the last round of calls determines the final action.
++ * A @report_quiesce callback with @event zero, or a @report_signal
++ * callback, will always be the last one made before @task resumes.
++ * Only %UTRACE_STOP is "sticky"--if @engine returned %UTRACE_STOP
++ * then @task stays stopped unless @engine returns a different action
++ * from a following callback.
++ *
++ * The report_death() and report_reap() callbacks do not take @action
++ * arguments, and only %UTRACE_DETACH is meaningful in the return value
++ * from a report_death() callback.  None of the resume actions applies
++ * to a dead thread.
++ *
++ * All @report_*() hooks are called with no locks held, in a generally
++ * safe environment when we will be returning to user mode soon (or just
++ * entered the kernel).  It is fine to block for memory allocation and
++ * the like, but all hooks are asynchronous and must not block on
++ * external events!  If you want the thread to block, use %UTRACE_STOP
++ * in your hook's return value; then later wake it up with utrace_control().
++ *
++ * @report_quiesce:
++ *	Requested by %UTRACE_EVENT(%QUIESCE).
++ *	This does not indicate any event, but just that @task (the current
++ *	thread) is in a safe place for examination.  This call is made
++ *	before each specific event callback, except for @report_reap.
++ *	The @event argument gives the %UTRACE_EVENT(@which) value for
++ *	the event occurring.  This callback might be made for events @engine
++ *	has not requested, if some other engine is tracing the event;
++ *	a utrace_set_events() call here can request the immediate
++ *	callback for this occurrence of @event.  @event is zero when there
++ *	is no other event, @task is now ready to check for signals and
++ *	return to user mode, and some engine has used %UTRACE_REPORT or
++ *	%UTRACE_INTERRUPT to request this callback.  For this case,
++ *	if @report_signal is not %NULL, the @report_quiesce callback
++ *	may be replaced with a @report_signal callback passing
++ *	%UTRACE_SIGNAL_REPORT in its @action argument, whenever @task is
++ *	entering the signal-check path anyway.
++ *
++ * @report_signal:
++ *	Requested by %UTRACE_EVENT(%SIGNAL_*) or %UTRACE_EVENT(%QUIESCE).
++ *	Use utrace_signal_action() and utrace_resume_action() on @action.
++ *	The signal action is %UTRACE_SIGNAL_REPORT when some engine has
++ *	used %UTRACE_REPORT or %UTRACE_INTERRUPT; the callback can choose
++ *	to stop or to deliver an artificial signal, before pending signals.
++ *	It's %UTRACE_SIGNAL_HANDLER instead when signal handler setup just
++ *	finished (after a previous %UTRACE_SIGNAL_DELIVER return); this
++ *	serves in lieu of any %UTRACE_SIGNAL_REPORT callback requested by
++ *	%UTRACE_REPORT or %UTRACE_INTERRUPT, and is also implicitly
++ *	requested by %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP into the
++ *	signal delivery.  The other signal actions indicate a signal about
++ *	to be delivered; the previous engine's return value sets the signal
++ *	action seen by the following engine's callback.  The @info data
++ *	can be changed at will, including @info->si_signo.  The settings in
++ *	@return_ka determine what %UTRACE_SIGNAL_DELIVER does.  @orig_ka
++ *	is what was in force before other tracing engines intervened, and
++ *	it's %NULL when this report began as %UTRACE_SIGNAL_REPORT or
++ *	%UTRACE_SIGNAL_HANDLER.  For a report without a new signal, @info
++ *	is left uninitialized and must be set completely by an engine that
++ *	chooses to deliver a signal; if there was a previous @report_signal
++ *	callback ending in %UTRACE_STOP and it was just resumed using
++ *	%UTRACE_REPORT or %UTRACE_INTERRUPT, then @info is left unchanged
++ *	from the previous callback.  In this way, the original signal can
++ *	be left in @info while returning %UTRACE_STOP|%UTRACE_SIGNAL_IGN
++ *	and then found again when resuming @task with %UTRACE_INTERRUPT.
++ *	The %UTRACE_SIGNAL_HOLD flag bit can be OR'd into the return value,
++ *	and might be in @action if the previous engine returned it.  This
++ *	flag asks that the signal in @info be pushed back on @task's queue
++ *	so that it will be seen again after whatever action is taken now.
++ *
++ * @report_clone:
++ *	Requested by %UTRACE_EVENT(%CLONE).
++ *	Event reported for parent, before the new task @child might run.
++ *	@clone_flags gives the flags used in the clone system call,
++ *	or equivalent flags for a fork() or vfork() system call.
++ *	This function can use utrace_attach_task() on @child.  It's guaranteed
++ *	that asynchronous utrace_attach_task() calls will be ordered after
++ *	any calls in @report_clone callbacks for the parent.  Thus
++ *	when using %UTRACE_ATTACH_EXCLUSIVE in the asynchronous calls,
++ *	you can be sure that the parent's @report_clone callback has
++ *	already attached to @child or chosen not to.  Passing %UTRACE_STOP
++ *	to utrace_control() on @child here keeps the child stopped before
++ *	it ever runs in user mode; %UTRACE_REPORT or %UTRACE_INTERRUPT
++ *	ensures a callback from @child before it starts in user mode.
++ *
++ * @report_jctl:
++ *	Requested by %UTRACE_EVENT(%JCTL).
++ *	Job control event; @type is %CLD_STOPPED or %CLD_CONTINUED,
++ *	indicating whether we are stopping or resuming now.  If @notify
++ *	is nonzero, @task is the last thread to stop and so will send
++ *	%SIGCHLD to its parent after this callback; @notify reflects
++ *	what the parent's %SIGCHLD has in @si_code, which can sometimes
++ *	be %CLD_STOPPED even when @type is %CLD_CONTINUED.
++ *
++ * @report_exec:
++ *	Requested by %UTRACE_EVENT(%EXEC).
++ *	An execve system call has succeeded and the new program is about to
++ *	start running.  The initial user register state can be tweaked
++ *	directly in @regs.  @fmt and @bprm give the details of this exec.
++ *
++ * @report_syscall_entry:
++ *	Requested by %UTRACE_EVENT(%SYSCALL_ENTRY).
++ *	Thread has entered the kernel to request a system call.
++ *	The user register state can be tweaked directly in @regs.
++ *	The @action argument contains an &enum utrace_syscall_action;
++ *	use utrace_syscall_action() to extract it.  The return value
++ *	overrides the last engine's action for the system call.
++ *	If the final action is %UTRACE_SYSCALL_ABORT, no system call
++ *	is made.  The details of the system call being attempted can
++ *	be fetched here with syscall_get_nr() and syscall_get_arguments().
++ *	The parameter registers can be changed with syscall_set_arguments().
++ *
++ * @report_syscall_exit:
++ *	Requested by %UTRACE_EVENT(%SYSCALL_EXIT).
++ *	Thread is about to leave the kernel after a system call request.
++ *	The user register state can be tweaked directly in @regs.
++ * The results of the system call attempt can be examined here using ++ * syscall_get_error() and syscall_get_return_value(). It is safe ++ * here to call syscall_set_return_value() or syscall_rollback(). ++ * ++ * @report_exit: ++ * Requested by %UTRACE_EVENT(%EXIT). ++ * Thread is exiting and cannot be prevented from doing so, ++ * but all its state is still live. The @code value will be ++ * the wait result seen by the parent, and can be changed by ++ * this engine or others. The @orig_code value is the real ++ * status, not changed by any tracing engine. Returning %UTRACE_STOP ++ * here keeps @task stopped before it cleans up its state and dies, ++ * so it can be examined by other processes. When @task is allowed ++ * to run, it will die and get to the @report_death callback. ++ * ++ * @report_death: ++ * Requested by %UTRACE_EVENT(%DEATH). ++ * Thread is really dead now. It might be reaped by its parent at ++ * any time, or self-reap immediately. Though the actual reaping ++ * may happen in parallel, a report_reap() callback will always be ++ * ordered after a report_death() callback. ++ * ++ * @report_reap: ++ * Requested by %UTRACE_EVENT(%REAP). ++ * Called when someone reaps the dead task (parent, init, or self). ++ * This means the parent called wait, or else this was a detached ++ * thread or a process whose parent ignores SIGCHLD. ++ * No more callbacks are made after this one. ++ * The engine is always detached. ++ * There is nothing more a tracing engine can do about this thread. ++ * After this callback, the @engine pointer will become invalid. ++ * The @task pointer may become invalid if get_task_struct() hasn't ++ * been used to keep it alive. ++ * An engine should always request this callback if it stores the ++ * @engine pointer or stores any pointer in @engine->data, so it ++ * can clean up its data structures. ++ * Unlike other callbacks, this can be called from the parent's context ++ * rather than from the traced thread itself--it must not delay the ++ * parent by blocking. 
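++ *
++ * As a sketch (the callback names are hypothetical, shown only for
++ * illustration), an engine normally fills in just the callbacks it
++ * needs and leaves the rest %NULL:
++ *
++ *	static const struct utrace_engine_ops my_engine_ops = {
++ *		.report_syscall_entry	= my_syscall_entry,
++ *		.report_exit		= my_exit,
++ *		.report_reap		= my_reap,
++ *	};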
++ */ ++struct utrace_engine_ops { ++ u32 (*report_quiesce)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event); ++ u32 (*report_signal)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs, ++ siginfo_t *info, ++ const struct k_sigaction *orig_ka, ++ struct k_sigaction *return_ka); ++ u32 (*report_clone)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *parent, ++ unsigned long clone_flags, ++ struct task_struct *child); ++ u32 (*report_jctl)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ int type, int notify); ++ u32 (*report_exec)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ const struct linux_binfmt *fmt, ++ const struct linux_binprm *bprm, ++ struct pt_regs *regs); ++ u32 (*report_syscall_entry)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_syscall_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ long orig_code, long *code); ++ u32 (*report_death)(struct utrace_engine *engine, ++ struct task_struct *task, ++ bool group_dead, int signal); ++ void (*report_reap)(struct utrace_engine *engine, ++ struct task_struct *task); ++}; ++ ++/** ++ * struct utrace_examiner - private state for using utrace_prepare_examine() ++ * ++ * The members of &struct utrace_examiner are private to the implementation. ++ * This data type holds the state from a call to utrace_prepare_examine() ++ * to be used by a call to utrace_finish_examine(). ++ */ ++struct utrace_examiner { ++/* private: */ ++ long state; ++ unsigned long ncsw; ++}; ++ ++/* ++ * These are the exported entry points for tracing engines to use. ++ * See kernel/utrace.c for their kerneldoc comments with interface details. ++ */ ++struct utrace_engine *utrace_attach_task(struct task_struct *, int, ++ const struct utrace_engine_ops *, ++ void *); ++struct utrace_engine *utrace_attach_pid(struct pid *, int, ++ const struct utrace_engine_ops *, ++ void *); ++int __must_check utrace_control(struct task_struct *, ++ struct utrace_engine *, ++ enum utrace_resume_action); ++int __must_check utrace_set_events(struct task_struct *, ++ struct utrace_engine *, ++ unsigned long eventmask); ++int __must_check utrace_barrier(struct task_struct *, ++ struct utrace_engine *); ++int __must_check utrace_prepare_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++int __must_check utrace_finish_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++ ++/** ++ * utrace_control_pid - control a thread being traced by a tracing engine ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is the same as utrace_control(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. 
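++ *
++ * A sketch of typical use (hypothetical caller; my_engine_ops is the
++ * kind of ops table shown above, and error handling is abbreviated):
++ *
++ *	engine = utrace_attach_pid(pid, UTRACE_ATTACH_CREATE,
++ *				   &my_engine_ops, NULL);
++ *	if (!IS_ERR(engine))
++ *		ret = utrace_control_pid(pid, engine, UTRACE_REPORT);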
++ */ ++static inline __must_check int utrace_control_pid( ++ struct pid *pid, struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ /* ++ * We don't bother with rcu_read_lock() here to protect the ++ * task_struct pointer, because utrace_control will return ++ * -ESRCH without looking at that pointer if the engine is ++ * already detached. A task_struct pointer can't die before ++ * all the engines are detached in release_task() first. ++ */ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_control(task, engine, action); ++} ++ ++/** ++ * utrace_set_events_pid - choose which event reports a tracing engine gets ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @eventmask: new event mask ++ * ++ * This is the same as utrace_set_events(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_set_events_pid( ++ struct pid *pid, struct utrace_engine *engine, unsigned long eventmask) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : ++ utrace_set_events(task, engine, eventmask); ++} ++ ++/** ++ * utrace_barrier_pid - synchronize with simultaneous tracing callbacks ++ * @pid: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This is the same as utrace_barrier(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_barrier_pid(struct pid *pid, ++ struct utrace_engine *engine) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_barrier(task, engine); ++} ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace.h */ +diff --git a/include/linux/utrace_struct.h b/include/linux/utrace_struct.h +new file mode 100644 +index 0000000..aba7e09 +--- /dev/null ++++ b/include/linux/utrace_struct.h +@@ -0,0 +1,58 @@ ++/* ++ * 'struct utrace' data structure for kernel/utrace.c private use. ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ */ ++ ++#ifndef _LINUX_UTRACE_STRUCT_H ++#define _LINUX_UTRACE_STRUCT_H 1 ++ ++#ifdef CONFIG_UTRACE ++ ++#include ++#include ++ ++/* ++ * Per-thread structure private to utrace implementation. This properly ++ * belongs in kernel/utrace.c and its use is entirely private to the code ++ * there. It is only defined in a header file so that it can be embedded ++ * in the struct task_struct layout. It is here rather than in utrace.h ++ * to avoid header nesting order issues getting too complex. 
++ * ++ */ ++struct utrace { ++ struct task_struct *cloning; ++ ++ struct list_head attached, attaching; ++ spinlock_t lock; ++ ++ struct utrace_engine *reporting; ++ ++ unsigned int stopped:1; ++ unsigned int report:1; ++ unsigned int interrupt:1; ++ unsigned int signal_handler:1; ++ unsigned int vfork_stop:1; /* need utrace_stop() before vfork wait */ ++ unsigned int death:1; /* in utrace_report_death() now */ ++ unsigned int reap:1; /* release_task() has run */ ++}; ++ ++# define INIT_UTRACE(tsk) \ ++ .utrace_flags = 0, \ ++ .utrace = { \ ++ .lock = __SPIN_LOCK_UNLOCKED(tsk.utrace.lock), \ ++ .attached = LIST_HEAD_INIT(tsk.utrace.attached), \ ++ .attaching = LIST_HEAD_INIT(tsk.utrace.attaching), \ ++ }, ++ ++#else ++ ++# define INIT_UTRACE(tsk) /* Nothing. */ ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace_struct.h */ +diff --git a/init/Kconfig b/init/Kconfig +index 1ce05a4..f720929 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1191,6 +1191,15 @@ config STOP_MACHINE + help + Need stop_machine() primitive. + ++menuconfig UTRACE ++ bool "Infrastructure for tracing and debugging user processes" ++ depends on EXPERIMENTAL ++ depends on HAVE_ARCH_TRACEHOOK ++ help ++ Enable the utrace process tracing interface. This is an internal ++ kernel interface exported to kernel modules, to track events in ++ user threads, extract and change user thread state. ++ + source "block/Kconfig" + + config PREEMPT_NOTIFIERS +diff --git a/kernel/Makefile b/kernel/Makefile +index 780c8dc..cd16d49 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -69,6 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o + obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o + obj-$(CONFIG_STOP_MACHINE) += stop_machine.o + obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o ++obj-$(CONFIG_UTRACE) += utrace.o + obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o + obj-$(CONFIG_AUDITSYSCALL) += auditsc.o + obj-$(CONFIG_GCOV_KERNEL) += gcov/ +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 61c78b2..935eeee 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -164,6 +165,14 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) + return !err; + } + ++/* ++ * For experimental use of utrace, exclude ptrace on the same task. ++ */ ++static inline bool exclude_ptrace(struct task_struct *task) ++{ ++ return unlikely(!!task_utrace_flags(task)); ++} ++ + int ptrace_attach(struct task_struct *task) + { + int retval; +@@ -186,6 +195,13 @@ int ptrace_attach(struct task_struct *task) + goto out; + + task_lock(task); ++ ++ if (exclude_ptrace(task)) { ++ retval = -EBUSY; ++ task_unlock(task); ++ goto unlock_creds; ++ } ++ + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + task_unlock(task); + if (retval) +@@ -226,7 +242,9 @@ int ptrace_traceme(void) + + write_lock_irq(&tasklist_lock); + /* Are we already being traced? 
*/ +- if (!current->ptrace) { ++ if (exclude_ptrace(current)) { ++ ret = -EBUSY; ++ } else if (!current->ptrace) { + ret = security_ptrace_traceme(current->parent); + /* + * Check PF_EXITING to ensure ->real_parent has not passed +@@ -577,7 +595,17 @@ int ptrace_request(struct task_struct *child, long request, + return ret; + } + +-static struct task_struct *ptrace_get_task_struct(pid_t pid) ++/** ++ * ptrace_get_task_struct -- grab a task struct reference for ptrace ++ * @pid: process id to grab a task_struct reference of ++ * ++ * This function is a helper for ptrace implementations. It checks ++ * permissions and then grabs a task struct for use of the actual ++ * ptrace implementation. ++ * ++ * Returns the task_struct for @pid or an ERR_PTR() on failure. ++ */ ++struct task_struct *ptrace_get_task_struct(pid_t pid) + { + struct task_struct *child; + +diff --git a/kernel/utrace.c b/kernel/utrace.c +new file mode 100644 +index 0000000..74b5fc5 +--- /dev/null ++++ b/kernel/utrace.c +@@ -0,0 +1,2357 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Rules for 'struct utrace', defined in ++ * but used entirely privately in this file. ++ * ++ * The common event reporting loops are done by the task making the ++ * report without ever taking any locks. To facilitate this, the two ++ * lists @attached and @attaching work together for smooth asynchronous ++ * attaching with low overhead. Modifying either list requires @lock. ++ * The @attaching list can be modified any time while holding @lock. ++ * New engines being attached always go on this list. ++ * ++ * The @attached list is what the task itself uses for its reporting ++ * loops. When the task itself is not quiescent, it can use the ++ * @attached list without taking any lock. Nobody may modify the list ++ * when the task is not quiescent. When it is quiescent, that means ++ * that it won't run again without taking @lock itself before using ++ * the list. ++ * ++ * At each place where we know the task is quiescent (or it's current), ++ * while holding @lock, we call splice_attaching(), below. This moves ++ * the @attaching list members on to the end of the @attached list. ++ * Since this happens at the start of any reporting pass, any new ++ * engines attached asynchronously go on the stable @attached list ++ * in time to have their callbacks seen. ++ */ ++ ++static struct kmem_cache *utrace_engine_cachep; ++static const struct utrace_engine_ops utrace_detached_ops; /* forward decl */ ++ ++static int __init utrace_init(void) ++{ ++ utrace_engine_cachep = KMEM_CACHE(utrace_engine, SLAB_PANIC); ++ return 0; ++} ++module_init(utrace_init); ++ ++/* ++ * This is called with @utrace->lock held when the task is safely ++ * quiescent, i.e. it won't consult utrace->attached without the lock. ++ * Move any engines attached asynchronously from @utrace->attaching ++ * onto the @utrace->attached list. 
++ */ ++static void splice_attaching(struct utrace *utrace) ++{ ++ list_splice_tail_init(&utrace->attaching, &utrace->attached); ++} ++ ++/* ++ * This is the exported function used by the utrace_engine_put() inline. ++ */ ++void __utrace_engine_release(struct kref *kref) ++{ ++ struct utrace_engine *engine = container_of(kref, struct utrace_engine, ++ kref); ++ BUG_ON(!list_empty(&engine->entry)); ++ kmem_cache_free(utrace_engine_cachep, engine); ++} ++EXPORT_SYMBOL_GPL(__utrace_engine_release); ++ ++static bool engine_matches(struct utrace_engine *engine, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ if ((flags & UTRACE_ATTACH_MATCH_OPS) && engine->ops != ops) ++ return false; ++ if ((flags & UTRACE_ATTACH_MATCH_DATA) && engine->data != data) ++ return false; ++ return engine->ops && engine->ops != &utrace_detached_ops; ++} ++ ++static struct utrace_engine *matching_engine( ++ struct utrace *utrace, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine; ++ list_for_each_entry(engine, &utrace->attached, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ list_for_each_entry(engine, &utrace->attaching, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ return NULL; ++} ++ ++/* ++ * For experimental use, utrace attach is mutually exclusive with ptrace. ++ */ ++static inline bool exclude_utrace(struct task_struct *task) ++{ ++ return unlikely(!!task->ptrace); ++} ++ ++/* ++ * Called without locks, when we might be the first utrace engine to attach. ++ * If this is a newborn thread and we are not the creator, we have to wait ++ * for it. The creator gets the first chance to attach. The PF_STARTING ++ * flag is cleared after its report_clone hook has had a chance to run. ++ */ ++static inline int utrace_attach_delay(struct task_struct *target) ++{ ++ if ((target->flags & PF_STARTING) && ++ current->utrace.cloning != target) ++ do { ++ schedule_timeout_interruptible(1); ++ if (signal_pending(current)) ++ return -ERESTARTNOINTR; ++ } while (target->flags & PF_STARTING); ++ ++ return 0; ++} ++ ++/* ++ * Enqueue @engine, or maybe don't if UTRACE_ATTACH_EXCLUSIVE. ++ */ ++static int utrace_add_engine(struct task_struct *target, ++ struct utrace *utrace, ++ struct utrace_engine *engine, ++ int flags, ++ const struct utrace_engine_ops *ops, ++ void *data) ++{ ++ int ret; ++ ++ spin_lock(&utrace->lock); ++ ++ if (utrace->reap) { ++ /* ++ * Already entered utrace_release_task(), cannot attach now. ++ */ ++ ret = -ESRCH; ++ } else if ((flags & UTRACE_ATTACH_EXCLUSIVE) && ++ unlikely(matching_engine(utrace, flags, ops, data))) { ++ ret = -EEXIST; ++ } else { ++ /* ++ * Put the new engine on the pending ->attaching list. ++ * Make sure it gets onto the ->attached list by the next ++ * time it's examined. ++ * ++ * When target == current, it would be safe just to call ++ * splice_attaching() right here. But if we're inside a ++ * callback, that would mean the new engine also gets ++ * notified about the event that precipitated its own ++ * creation. This is not what the user wants. ++ * ++ * Setting ->report ensures that start_report() takes the ++ * lock and does it next time. Whenever setting ->report, ++ * we must maintain the invariant that TIF_NOTIFY_RESUME is ++ * also set. Otherwise utrace_control() or utrace_do_stop() ++ * might skip setting TIF_NOTIFY_RESUME upon seeing ->report ++ * already set, and we'd miss a necessary callback. 
++ * ++ * In case we had no engines before, make sure that ++ * utrace_flags is not zero when tracehook_notify_resume() ++ * checks. That would bypass utrace reporting clearing ++ * TIF_NOTIFY_RESUME, and thus violate the same invariant. ++ */ ++ target->utrace_flags |= UTRACE_EVENT(REAP); ++ list_add_tail(&engine->entry, &utrace->attaching); ++ utrace->report = 1; ++ set_notify_resume(target); ++ ++ ret = 0; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++ ++/** ++ * utrace_attach_task - attach new engine, or look up an attached engine ++ * @target: thread to attach to ++ * @flags: flag bits combined with OR, see below ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * The caller must ensure that the @target thread does not get freed, ++ * i.e. hold a ref or be its parent. It is always safe to call this ++ * on @current, or on the @child pointer in a @report_clone callback. ++ * For most other cases, it's easier to use utrace_attach_pid() instead. ++ * ++ * UTRACE_ATTACH_CREATE: ++ * Create a new engine. If %UTRACE_ATTACH_CREATE is not specified, you ++ * only look up an existing engine already attached to the thread. ++ * ++ * UTRACE_ATTACH_EXCLUSIVE: ++ * Attempting to attach a second (matching) engine fails with -%EEXIST. ++ * ++ * UTRACE_ATTACH_MATCH_OPS: Only consider engines matching @ops. ++ * UTRACE_ATTACH_MATCH_DATA: Only consider engines matching @data. ++ * ++ * Calls with neither %UTRACE_ATTACH_MATCH_OPS nor %UTRACE_ATTACH_MATCH_DATA ++ * match the first among any engines attached to @target. That means that ++ * %UTRACE_ATTACH_EXCLUSIVE in such a call fails with -%EEXIST if there ++ * are any engines on @target at all. ++ */ ++struct utrace_engine *utrace_attach_task( ++ struct task_struct *target, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace *utrace; ++ struct utrace_engine *engine; ++ int ret; ++ ++ utrace = &target->utrace; ++ ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * The target has already been reaped. ++ * Check this early, though it's not synchronized. ++ * utrace_add_engine() will do the final check. ++ */ ++ if (!(flags & UTRACE_ATTACH_CREATE)) ++ return ERR_PTR(-ENOENT); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (!(flags & UTRACE_ATTACH_CREATE)) { ++ spin_lock(&utrace->lock); ++ engine = matching_engine(utrace, flags, ops, data); ++ if (engine) ++ utrace_engine_get(engine); ++ spin_unlock(&utrace->lock); ++ return engine ?: ERR_PTR(-ENOENT); ++ } ++ ++ if (unlikely(!ops) || unlikely(ops == &utrace_detached_ops)) ++ return ERR_PTR(-EINVAL); ++ ++ if (unlikely(target->flags & PF_KTHREAD)) ++ /* ++ * Silly kernel, utrace is for users! ++ */ ++ return ERR_PTR(-EPERM); ++ ++ engine = kmem_cache_alloc(utrace_engine_cachep, GFP_KERNEL); ++ if (unlikely(!engine)) ++ return ERR_PTR(-ENOMEM); ++ ++ /* ++ * Initialize the new engine structure. It starts out with two ++ * refs: one ref to return, and one ref for being attached. 
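++ * (For clarity: the caller is expected to drop its ref later with
++ * utrace_engine_put(); the attachment ref is dropped when a detached
++ * engine is finally removed from the list.)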
++ */ ++ kref_set(&engine->kref, 2); ++ engine->flags = 0; ++ engine->ops = ops; ++ engine->data = data; ++ ++ ret = utrace_attach_delay(target); ++ if (likely(!ret)) ++ ret = utrace_add_engine(target, utrace, engine, ++ flags, ops, data); ++ ++ if (unlikely(ret)) { ++ kmem_cache_free(utrace_engine_cachep, engine); ++ engine = ERR_PTR(ret); ++ } ++ ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_task); ++ ++/** ++ * utrace_attach_pid - attach new engine, or look up an attached engine ++ * @pid: &struct pid pointer representing thread to attach to ++ * @flags: flag bits combined with OR, see utrace_attach_task() ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * This is the same as utrace_attach_task(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++struct utrace_engine *utrace_attach_pid( ++ struct pid *pid, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine = ERR_PTR(-ESRCH); ++ struct task_struct *task = get_pid_task(pid, PIDTYPE_PID); ++ if (task) { ++ engine = utrace_attach_task(task, flags, ops, data); ++ put_task_struct(task); ++ } ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_pid); ++ ++/* ++ * When an engine is detached, the target thread may still see it and ++ * make callbacks until it quiesces. We install a special ops vector ++ * with these two callbacks. When the target thread quiesces, it can ++ * safely free the engine itself. For any event we will always get ++ * the report_quiesce() callback first, so we only need this one ++ * pointer to be set. The only exception is report_reap(), so we ++ * supply that callback too. ++ */ ++static u32 utrace_detached_quiesce(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event) ++{ ++ return UTRACE_DETACH; ++} ++ ++static void utrace_detached_reap(struct utrace_engine *engine, ++ struct task_struct *task) ++{ ++} ++ ++static const struct utrace_engine_ops utrace_detached_ops = { ++ .report_quiesce = &utrace_detached_quiesce, ++ .report_reap = &utrace_detached_reap ++}; ++ ++/* ++ * After waking up from TASK_TRACED, clear bookkeeping in @utrace. ++ * Returns true if we were woken up prematurely by SIGKILL. ++ */ ++static inline bool finish_utrace_stop(struct task_struct *task, ++ struct utrace *utrace) ++{ ++ bool killed = false; ++ ++ /* ++ * utrace_wakeup() clears @utrace->stopped before waking us up. ++ * We're officially awake if it's clear. ++ */ ++ spin_lock(&utrace->lock); ++ if (unlikely(utrace->stopped)) { ++ /* ++ * If we're here with it still set, it must have been ++ * signal_wake_up() instead, waking us up for a SIGKILL. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ WARN_ON(!sigismember(&task->pending.signal, SIGKILL)); ++ spin_unlock_irq(&task->sighand->siglock); ++ utrace->stopped = 0; ++ killed = true; ++ } ++ spin_unlock(&utrace->lock); ++ ++ return killed; ++} ++ ++/* ++ * Perform %UTRACE_STOP, i.e. block in TASK_TRACED until woken up. ++ * @task == current, @utrace == current->utrace, which is not locked. ++ * Return true if we were woken up by SIGKILL even though some utrace ++ * engine may still want us to stay stopped. 
++ */ ++static bool utrace_stop(struct task_struct *task, struct utrace *utrace, ++ bool report) ++{ ++ bool killed; ++ ++ /* ++ * @utrace->stopped is the flag that says we are safely ++ * inside this function. It should never be set on entry. ++ */ ++ BUG_ON(utrace->stopped); ++ ++ /* ++ * The siglock protects us against signals. As well as SIGKILL ++ * waking us up, we must synchronize with the signal bookkeeping ++ * for stop signals and SIGCONT. ++ */ ++ spin_lock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (unlikely(sigismember(&task->pending.signal, SIGKILL))) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ return true; ++ } ++ ++ if (report) { ++ /* ++ * Ensure a reporting pass when we're resumed. ++ */ ++ utrace->report = 1; ++ set_thread_flag(TIF_NOTIFY_RESUME); ++ } ++ ++ utrace->stopped = 1; ++ __set_current_state(TASK_TRACED); ++ ++ /* ++ * If there is a group stop in progress, ++ * we must participate in the bookkeeping. ++ */ ++ if (task->signal->group_stop_count > 0) ++ --task->signal->group_stop_count; ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ ++ schedule(); ++ ++ /* ++ * While in TASK_TRACED, we were considered "frozen enough". ++ * Now that we woke up, it's crucial if we're supposed to be ++ * frozen that we freeze now before running anything substantial. ++ */ ++ try_to_freeze(); ++ ++ killed = finish_utrace_stop(task, utrace); ++ ++ /* ++ * While we were in TASK_TRACED, complete_signal() considered ++ * us "uninterested" in signal wakeups. Now make sure our ++ * TIF_SIGPENDING state is correct for normal running. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ return killed; ++} ++ ++/* ++ * The caller has to hold a ref on the engine. If the attached flag is ++ * true (all but utrace_barrier() calls), the engine is supposed to be ++ * attached. If the attached flag is false (utrace_barrier() only), ++ * then return -ERESTARTSYS for an engine marked for detach but not yet ++ * fully detached. The task pointer can be invalid if the engine is ++ * detached. ++ * ++ * Get the utrace lock for the target task. ++ * Returns the struct if locked, or ERR_PTR(-errno). ++ * ++ * This has to be robust against races with: ++ * utrace_control(target, UTRACE_DETACH) calls ++ * UTRACE_DETACH after reports ++ * utrace_report_death ++ * utrace_release_task ++ */ ++static struct utrace *get_utrace_lock(struct task_struct *target, ++ struct utrace_engine *engine, ++ bool attached) ++ __acquires(utrace->lock) ++{ ++ struct utrace *utrace; ++ ++ rcu_read_lock(); ++ ++ /* ++ * If this engine was already detached, bail out before we look at ++ * the task_struct pointer at all. If it's detached after this ++ * check, then RCU is still keeping this task_struct pointer valid. ++ * ++ * The ops pointer is NULL when the engine is fully detached. ++ * It's &utrace_detached_ops when it's marked detached but still ++ * on the list. In the latter case, utrace_barrier() still works, ++ * since the target might be in the middle of an old callback. ++ */ ++ if (unlikely(!engine->ops)) { ++ rcu_read_unlock(); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (unlikely(engine->ops == &utrace_detached_ops)) { ++ rcu_read_unlock(); ++ return attached ? ERR_PTR(-ESRCH) : ERR_PTR(-ERESTARTSYS); ++ } ++ ++ utrace = &target->utrace; ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * If all engines detached already, utrace is clear. 
++ * Otherwise, we're called after utrace_release_task might ++ * have started. A call to this engine's report_reap ++ * callback might already be in progress. ++ */ ++ utrace = ERR_PTR(-ESRCH); ++ } else { ++ spin_lock(&utrace->lock); ++ if (unlikely(!engine->ops) || ++ unlikely(engine->ops == &utrace_detached_ops)) { ++ /* ++ * By the time we got the utrace lock, ++ * it had been reaped or detached already. ++ */ ++ spin_unlock(&utrace->lock); ++ utrace = ERR_PTR(-ESRCH); ++ if (!attached && engine->ops == &utrace_detached_ops) ++ utrace = ERR_PTR(-ERESTARTSYS); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return utrace; ++} ++ ++/* ++ * Now that we don't hold any locks, run through any ++ * detached engines and free their references. Each ++ * engine had one implicit ref while it was attached. ++ */ ++static void put_detached_list(struct list_head *list) ++{ ++ struct utrace_engine *engine, *next; ++ list_for_each_entry_safe(engine, next, list, entry) { ++ list_del_init(&engine->entry); ++ utrace_engine_put(engine); ++ } ++} ++ ++/* ++ * Called with utrace->lock held. ++ * Notify and clean up all engines, then free utrace. ++ */ ++static void utrace_reap(struct task_struct *target, struct utrace *utrace) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ const struct utrace_engine_ops *ops; ++ LIST_HEAD(detached); ++ ++restart: ++ splice_attaching(utrace); ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ ops = engine->ops; ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ ++ /* ++ * If it didn't need a callback, we don't need to drop ++ * the lock. Now nothing else refers to this engine. ++ */ ++ if (!(engine->flags & UTRACE_EVENT(REAP))) ++ continue; ++ ++ /* ++ * This synchronizes with utrace_barrier(). Since we ++ * need the utrace->lock here anyway (unlike the other ++ * reporting loops), we don't need any memory barrier ++ * as utrace_barrier() holds the lock. ++ */ ++ utrace->reporting = engine; ++ spin_unlock(&utrace->lock); ++ ++ (*ops->report_reap)(engine, target); ++ ++ utrace->reporting = NULL; ++ ++ put_detached_list(&detached); ++ ++ spin_lock(&utrace->lock); ++ goto restart; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * Called by release_task. After this, target->utrace must be cleared. ++ */ ++void utrace_release_task(struct task_struct *target) ++{ ++ struct utrace *utrace; ++ ++ utrace = &target->utrace; ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->reap = 1; ++ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) { ++ utrace_reap(target, utrace); /* Unlocks and frees. */ ++ return; ++ } ++ ++ /* ++ * The target will do some final callbacks but hasn't ++ * finished them yet. We know because it clears these ++ * event bits after it's done. Instead of cleaning up here ++ * and requiring utrace_report_death to cope with it, we ++ * delay the REAP report and the teardown until after the ++ * target finishes its death reports. ++ */ ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/* ++ * We use an extra bit in utrace_engine.flags past the event bits, ++ * to record whether the engine is keeping the target thread stopped. 
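++ * (For illustration: the low bits are the UTRACE_EVENT() values an
++ * engine passes to utrace_set_events(), e.g. a mask like
++ * UTRACE_EVENT(SYSCALL_ENTRY) | UTRACE_EVENT(EXIT), while ENGINE_STOP
++ * sits just above them at bit _UTRACE_NEVENTS.)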
++ */ ++#define ENGINE_STOP (1UL << _UTRACE_NEVENTS) ++ ++static void mark_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags |= ENGINE_STOP; ++} ++ ++static void clear_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags &= ~ENGINE_STOP; ++} ++ ++static bool engine_wants_stop(struct utrace_engine *engine) ++{ ++ return (engine->flags & ENGINE_STOP) != 0; ++} ++ ++/** ++ * utrace_set_events - choose which event reports a tracing engine gets ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @events: new event mask ++ * ++ * This changes the set of events for which @engine wants callbacks made. ++ * ++ * This fails with -%EALREADY and does nothing if you try to clear ++ * %UTRACE_EVENT(%DEATH) when the @report_death callback may already have ++ * begun, if you try to clear %UTRACE_EVENT(%REAP) when the @report_reap ++ * callback may already have begun, or if you try to newly set ++ * %UTRACE_EVENT(%DEATH) or %UTRACE_EVENT(%QUIESCE) when @target is ++ * already dead or dying. ++ * ++ * This can fail with -%ESRCH when @target has already been detached, ++ * including forcible detach on reaping. ++ * ++ * If @target was stopped before the call, then after a successful call, ++ * no event callbacks not requested in @events will be made; if ++ * %UTRACE_EVENT(%QUIESCE) is included in @events, then a @report_quiesce ++ * callback will be made when @target resumes. If @target was not stopped, ++ * and was about to make a callback to @engine, this returns -%EINPROGRESS. ++ * In this case, the callback in progress might be one excluded from the ++ * new @events setting. When this returns zero, you can be sure that no ++ * event callbacks you've disabled in @events can be made. ++ * ++ * To synchronize after an -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * When @target is @current, -%EINPROGRESS is not returned. But ++ * note that a newly-created engine will not receive any callbacks ++ * related to an event notification already in progress. This call ++ * enables @events callbacks to be made as soon as @engine becomes ++ * eligible for any callbacks, see utrace_attach_task(). ++ * ++ * These rules provide for coherent synchronization based on %UTRACE_STOP, ++ * even when %SIGKILL is breaking its normal simple rules. ++ */ ++int utrace_set_events(struct task_struct *target, ++ struct utrace_engine *engine, ++ unsigned long events) ++{ ++ struct utrace *utrace; ++ unsigned long old_flags, old_utrace_flags, set_utrace_flags; ++ int ret; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ old_utrace_flags = target->utrace_flags; ++ set_utrace_flags = events; ++ old_flags = engine->flags; ++ ++ if (target->exit_state && ++ (((events & ~old_flags) & _UTRACE_DEATH_EVENTS) || ++ (utrace->death && ++ ((old_flags & ~events) & _UTRACE_DEATH_EVENTS)) || ++ (utrace->reap && ((old_flags & ~events) & UTRACE_EVENT(REAP))))) { ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ ++ /* ++ * When setting these flags, it's essential that we really ++ * synchronize with exit_notify(). They cannot be set after ++ * exit_notify() takes the tasklist_lock. By holding the read ++ * lock here while setting the flags, we ensure that the calls ++ * to tracehook_notify_death() and tracehook_report_death() will ++ * see the new flags. This ensures that utrace_release_task() ++ * knows positively that utrace_report_death() will be called or ++ * that it won't. 
++ */ ++ if ((set_utrace_flags & ~old_utrace_flags) & _UTRACE_DEATH_EVENTS) { ++ read_lock(&tasklist_lock); ++ if (unlikely(target->exit_state)) { ++ read_unlock(&tasklist_lock); ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ target->utrace_flags |= set_utrace_flags; ++ read_unlock(&tasklist_lock); ++ } ++ ++ engine->flags = events | (engine->flags & ENGINE_STOP); ++ target->utrace_flags |= set_utrace_flags; ++ ++ if ((set_utrace_flags & UTRACE_EVENT_SYSCALL) && ++ !(old_utrace_flags & UTRACE_EVENT_SYSCALL)) ++ set_tsk_thread_flag(target, TIF_SYSCALL_TRACE); ++ ++ ret = 0; ++ if (!utrace->stopped && target != current) { ++ /* ++ * This barrier ensures that our engine->flags changes ++ * have hit before we examine utrace->reporting, ++ * pairing with the barrier in start_callback(). If ++ * @target has not yet hit finish_callback() to clear ++ * utrace->reporting, we might be in the middle of a ++ * callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_set_events); ++ ++/* ++ * Asynchronously mark an engine as being detached. ++ * ++ * This must work while the target thread races with us doing ++ * start_callback(), defined below. It uses smp_rmb() between checking ++ * @engine->flags and using @engine->ops. Here we change @engine->ops ++ * first, then use smp_wmb() before changing @engine->flags. This ensures ++ * it can check the old flags before using the old ops, or check the old ++ * flags before using the new ops, or check the new flags before using the ++ * new ops, but can never check the new flags before using the old ops. ++ * Hence, utrace_detached_ops might be used with any old flags in place. ++ * It has report_quiesce() and report_reap() callbacks to handle all cases. ++ */ ++static void mark_engine_detached(struct utrace_engine *engine) ++{ ++ engine->ops = &utrace_detached_ops; ++ smp_wmb(); ++ engine->flags = UTRACE_EVENT(QUIESCE); ++} ++ ++/* ++ * Get @target to stop and return true if it is already stopped now. ++ * If we return false, it will make some event callback soonish. ++ * Called with @utrace locked. ++ */ ++static bool utrace_do_stop(struct task_struct *target, struct utrace *utrace) ++{ ++ bool stopped = false; ++ ++ spin_lock_irq(&target->sighand->siglock); ++ if (unlikely(target->exit_state)) { ++ /* ++ * On the exit path, it's only truly quiescent ++ * if it has already been through ++ * utrace_report_death(), or never will. ++ */ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) ++ utrace->stopped = stopped = true; ++ } else if (task_is_stopped(target)) { ++ /* ++ * Stopped is considered quiescent; when it wakes up, it will ++ * go through utrace_get_signal() before doing anything else. ++ */ ++ utrace->stopped = stopped = true; ++ } else if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ spin_unlock_irq(&target->sighand->siglock); ++ ++ return stopped; ++} ++ ++/* ++ * If the target is not dead it should not be in tracing ++ * stop any more. Wake it unless it's in job control stop. ++ * ++ * Called with @utrace->lock held and @utrace->stopped set. 
++ */ ++static void utrace_wakeup(struct task_struct *target, struct utrace *utrace) ++{ ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ ++ utrace->stopped = 0; ++ ++ sighand = lock_task_sighand(target, &irqflags); ++ if (unlikely(!sighand)) ++ return; ++ ++ if (likely(task_is_stopped_or_traced(target))) { ++ if (target->signal->flags & SIGNAL_STOP_STOPPED) ++ target->state = TASK_STOPPED; ++ else ++ wake_up_state(target, __TASK_STOPPED | __TASK_TRACED); ++ } ++ ++ unlock_task_sighand(target, &irqflags); ++} ++ ++/* ++ * This is called when there might be some detached engines on the list or ++ * some stale bits in @task->utrace_flags. Clean them up and recompute the ++ * flags. ++ * ++ * @action is NULL when @task is stopped and @utrace->stopped is set; wake ++ * it up if it should not be. @action is set when @task is current; if ++ * we're fully detached, reset *@action to UTRACE_RESUME. ++ * ++ * Called with @utrace->lock held, returns with it released. ++ * After this returns, @utrace might be freed if everything detached. ++ */ ++static void utrace_reset(struct task_struct *task, struct utrace *utrace, ++ enum utrace_resume_action *action) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ unsigned long flags = 0; ++ LIST_HEAD(detached); ++ bool wake = !action; ++ BUG_ON(wake != (task != current)); ++ ++ splice_attaching(utrace); ++ ++ /* ++ * Update the set of events of interest from the union ++ * of the interests of the remaining tracing engines. ++ * For any engine marked detached, remove it from the list. ++ * We'll collect them on the detached list. ++ */ ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ if (engine->ops == &utrace_detached_ops) { ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ } else { ++ flags |= engine->flags | UTRACE_EVENT(REAP); ++ wake = wake && !engine_wants_stop(engine); ++ } ++ } ++ ++ if (task->exit_state) { ++ /* ++ * Once it's already dead, we never install any flags ++ * except REAP. When ->exit_state is set and events ++ * like DEATH are not set, then they never can be set. ++ * This ensures that utrace_release_task() knows ++ * positively that utrace_report_death() can never run. ++ */ ++ BUG_ON(utrace->death); ++ flags &= UTRACE_EVENT(REAP); ++ wake = false; ++ } else if (!(flags & UTRACE_EVENT_SYSCALL) && ++ test_tsk_thread_flag(task, TIF_SYSCALL_TRACE)) { ++ clear_tsk_thread_flag(task, TIF_SYSCALL_TRACE); ++ } ++ ++ task->utrace_flags = flags; ++ ++ if (wake) ++ utrace_wakeup(task, utrace); ++ ++ /* ++ * If any engines are left, we're done. ++ */ ++ spin_unlock(&utrace->lock); ++ if (!flags) { ++ /* ++ * No more engines, cleared out the utrace. ++ */ ++ ++ if (action) ++ *action = UTRACE_RESUME; ++ } ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * You can't do anything to a dead task but detach it. ++ * If release_task() has been called, you can't do that. ++ * ++ * On the exit path, DEATH and QUIESCE event bits are set only ++ * before utrace_report_death() has taken the lock. At that point, ++ * the death report will come soon, so disallow detach until it's ++ * done. This prevents us from racing with it detaching itself. ++ * ++ * Called with utrace->lock held, when @target->exit_state is nonzero. 
++ */ ++static inline int utrace_control_dead(struct task_struct *target, ++ struct utrace *utrace, ++ enum utrace_resume_action action) ++{ ++ if (action != UTRACE_DETACH || unlikely(utrace->reap)) ++ return -ESRCH; ++ ++ if (unlikely(utrace->death)) ++ /* ++ * We have already started the death report. We can't ++ * prevent the report_death and report_reap callbacks, ++ * so tell the caller they will happen. ++ */ ++ return -EALREADY; ++ ++ return 0; ++} ++ ++/** ++ * utrace_control - control a thread being traced by a tracing engine ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is how a tracing engine asks a traced thread to do something. ++ * This call is controlled by the @action argument, which has the ++ * same meaning as the &enum utrace_resume_action value returned by ++ * event reporting callbacks. ++ * ++ * If @target is already dead (@target->exit_state nonzero), ++ * all actions except %UTRACE_DETACH fail with -%ESRCH. ++ * ++ * The following sections describe each option for the @action argument. ++ * ++ * UTRACE_DETACH: ++ * ++ * After this, the @engine data structure is no longer accessible, ++ * and the thread might be reaped. The thread will start running ++ * again if it was stopped and no longer has any attached engines ++ * that want it stopped. ++ * ++ * If the @report_reap callback may already have begun, this fails ++ * with -%ESRCH. If the @report_death callback may already have ++ * begun, this fails with -%EALREADY. ++ * ++ * If @target is not already stopped, then a callback to this engine ++ * might be in progress or about to start on another CPU. If so, ++ * then this returns -%EINPROGRESS; the detach happens as soon as ++ * the pending callback is finished. To synchronize after an ++ * -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * If @target is properly stopped before utrace_control() is called, ++ * then after successful return it's guaranteed that no more callbacks ++ * to the @engine->ops vector will be made. ++ * ++ * The only exception is %SIGKILL (and exec or group-exit by another ++ * thread in the group), which can cause asynchronous @report_death ++ * and/or @report_reap callbacks even when %UTRACE_STOP was used. ++ * (In that event, this fails with -%ESRCH or -%EALREADY, see above.) ++ * ++ * UTRACE_STOP: ++ * This asks that @target stop running. This returns 0 only if ++ * @target is already stopped, either for tracing or for job ++ * control. Then @target will remain stopped until another ++ * utrace_control() call is made on @engine; @target can be woken ++ * only by %SIGKILL (or equivalent, such as exec or termination by ++ * another thread in the same thread group). ++ * ++ * This returns -%EINPROGRESS if @target is not already stopped. ++ * Then the effect is like %UTRACE_REPORT. A @report_quiesce or ++ * @report_signal callback will be made soon. Your callback can ++ * then return %UTRACE_STOP to keep @target stopped. ++ * ++ * This does not interrupt system calls in progress, including ones ++ * that sleep for a long time. For that, use %UTRACE_INTERRUPT. ++ * To interrupt system calls and then keep @target stopped, your ++ * @report_signal callback can return %UTRACE_STOP. ++ * ++ * UTRACE_RESUME: ++ * ++ * Just let @target continue running normally, reversing the effect ++ * of a previous %UTRACE_STOP. If another engine is keeping @target ++ * stopped, then it remains stopped until all engines let it resume. 
++ * If @target was not stopped, this has no effect. ++ * ++ * UTRACE_REPORT: ++ * ++ * This is like %UTRACE_RESUME, but also ensures that there will be ++ * a @report_quiesce or @report_signal callback made soon. If ++ * @target had been stopped, then there will be a callback before it ++ * resumes running normally. If another engine is keeping @target ++ * stopped, then there might be no callbacks until all engines let ++ * it resume. ++ * ++ * UTRACE_INTERRUPT: ++ * ++ * This is like %UTRACE_REPORT, but ensures that @target will make a ++ * @report_signal callback before it resumes or delivers signals. ++ * If @target was in a system call or about to enter one, work in ++ * progress will be interrupted as if by %SIGSTOP. If another ++ * engine is keeping @target stopped, then there might be no ++ * callbacks until all engines let it resume. ++ * ++ * This gives @engine an opportunity to introduce a forced signal ++ * disposition via its @report_signal callback. ++ * ++ * UTRACE_SINGLESTEP: ++ * ++ * It's invalid to use this unless arch_has_single_step() returned true. ++ * This is like %UTRACE_RESUME, but resumes for one user instruction ++ * only. It's invalid to use this in utrace_control() unless @target ++ * had been stopped by @engine previously. ++ * ++ * Note that passing %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP to ++ * utrace_control() or returning it from an event callback alone does ++ * not necessarily ensure that stepping will be enabled. If there are ++ * more callbacks made to any engine before returning to user mode, ++ * then the resume action is chosen only by the last set of callbacks. ++ * To be sure, enable %UTRACE_EVENT(%QUIESCE) and look for the ++ * @report_quiesce callback with a zero event mask, or the ++ * @report_signal callback with %UTRACE_SIGNAL_REPORT. ++ * ++ * UTRACE_BLOCKSTEP: ++ * ++ * It's invalid to use this unless arch_has_block_step() returned true. ++ * This is like %UTRACE_SINGLESTEP, but resumes for one whole basic ++ * block of user instructions. ++ * ++ * %UTRACE_BLOCKSTEP devolves to %UTRACE_SINGLESTEP when another ++ * tracing engine is using %UTRACE_SINGLESTEP at the same time. ++ */ ++int utrace_control(struct task_struct *target, ++ struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ struct utrace *utrace; ++ bool resume; ++ int ret; ++ ++ if (unlikely(action > UTRACE_DETACH)) ++ return -EINVAL; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ if (target->exit_state) { ++ ret = utrace_control_dead(target, utrace, action); ++ if (ret) { ++ spin_unlock(&utrace->lock); ++ return ret; ++ } ++ } ++ ++ resume = utrace->stopped; ++ ret = 0; ++ ++ clear_engine_wants_stop(engine); ++ switch (action) { ++ case UTRACE_STOP: ++ mark_engine_wants_stop(engine); ++ if (!resume && !utrace_do_stop(target, utrace)) ++ ret = -EINPROGRESS; ++ resume = false; ++ break; ++ ++ case UTRACE_DETACH: ++ mark_engine_detached(engine); ++ resume = resume || utrace_do_stop(target, utrace); ++ if (!resume) { ++ /* ++ * As in utrace_set_events(), this barrier ensures ++ * that our engine->flags changes have hit before we ++ * examine utrace->reporting, pairing with the barrier ++ * in start_callback(). If @target has not yet hit ++ * finish_callback() to clear utrace->reporting, we ++ * might be in the middle of a callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ break; ++ } ++ /* Fall through. 
*/ ++ ++ case UTRACE_RESUME: ++ /* ++ * This and all other cases imply resuming if stopped. ++ * There might not be another report before it just ++ * resumes, so make sure single-step is not left set. ++ */ ++ if (likely(resume)) ++ user_disable_single_step(target); ++ break; ++ ++ case UTRACE_REPORT: ++ /* ++ * Make the thread call tracehook_notify_resume() soon. ++ * But don't bother if it's already been interrupted. ++ * In that case, utrace_get_signal() will be reporting soon. ++ */ ++ if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ break; ++ ++ case UTRACE_INTERRUPT: ++ /* ++ * Make the thread call tracehook_get_signal() soon. ++ */ ++ if (utrace->interrupt) ++ break; ++ utrace->interrupt = 1; ++ ++ /* ++ * If it's not already stopped, interrupt it now. ++ * We need the siglock here in case it calls ++ * recalc_sigpending() and clears its own ++ * TIF_SIGPENDING. By taking the lock, we've ++ * serialized any later recalc_sigpending() after ++ * our setting of utrace->interrupt to force it on. ++ */ ++ if (resume) { ++ /* ++ * This is really just to keep the invariant ++ * that TIF_SIGPENDING is set with utrace->interrupt. ++ * When it's stopped, we know it's always going ++ * through utrace_get_signal and will recalculate. ++ */ ++ set_tsk_thread_flag(target, TIF_SIGPENDING); ++ } else { ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ sighand = lock_task_sighand(target, &irqflags); ++ if (likely(sighand)) { ++ signal_wake_up(target, 0); ++ unlock_task_sighand(target, &irqflags); ++ } ++ } ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ /* ++ * Resume from stopped, step one block. ++ */ ++ if (unlikely(!arch_has_block_step())) { ++ WARN_ON(1); ++ /* Fall through to treat it as SINGLESTEP. */ ++ } else if (likely(resume)) { ++ user_enable_block_step(target); ++ break; ++ } ++ ++ case UTRACE_SINGLESTEP: ++ /* ++ * Resume from stopped, step one instruction. ++ */ ++ if (unlikely(!arch_has_single_step())) { ++ WARN_ON(1); ++ resume = false; ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (likely(resume)) ++ user_enable_single_step(target); ++ else ++ /* ++ * You were supposed to stop it before asking ++ * it to step. ++ */ ++ ret = -EAGAIN; ++ break; ++ } ++ ++ /* ++ * Let the thread resume running. If it's not stopped now, ++ * there is nothing more we need to do. ++ */ ++ if (resume) ++ utrace_reset(target, utrace, NULL); ++ else ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_control); ++ ++/** ++ * utrace_barrier - synchronize with simultaneous tracing callbacks ++ * @target: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This blocks while @target might be in the midst of making a callback to ++ * @engine. It can be interrupted by signals and will return -%ERESTARTSYS. ++ * A return value of zero means no callback from @target to @engine was ++ * in progress. Any effect of its return value (such as %UTRACE_STOP) has ++ * already been applied to @engine. ++ * ++ * It's not necessary to keep the @target pointer alive for this call. ++ * It's only necessary to hold a ref on @engine. This will return ++ * safely even if @target has been reaped and has no task refs. ++ * ++ * A successful return from utrace_barrier() guarantees its ordering ++ * with respect to utrace_set_events() and utrace_control() calls. 
If ++ * @target was not properly stopped, event callbacks just disabled might ++ * still be in progress; utrace_barrier() waits until there is no chance ++ * an unwanted callback can be in progress. ++ */ ++int utrace_barrier(struct task_struct *target, struct utrace_engine *engine) ++{ ++ struct utrace *utrace; ++ int ret = -ERESTARTSYS; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ do { ++ utrace = get_utrace_lock(target, engine, false); ++ if (unlikely(IS_ERR(utrace))) { ++ ret = PTR_ERR(utrace); ++ if (ret != -ERESTARTSYS) ++ break; ++ } else { ++ /* ++ * All engine state changes are done while ++ * holding the lock, i.e. before we get here. ++ * Since we have the lock, we only need to ++ * worry about @target making a callback. ++ * When it has entered start_callback() but ++ * not yet gotten to finish_callback(), we ++ * will see utrace->reporting == @engine. ++ * When @target doesn't take the lock, it uses ++ * barriers to order setting utrace->reporting ++ * before it examines the engine state. ++ */ ++ if (utrace->reporting != engine) ++ ret = 0; ++ spin_unlock(&utrace->lock); ++ if (!ret) ++ break; ++ } ++ schedule_timeout_interruptible(1); ++ } while (!signal_pending(current)); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_barrier); ++ ++/* ++ * This is local state used for reporting loops, perhaps optimized away. ++ */ ++struct utrace_report { ++ enum utrace_resume_action action; ++ u32 result; ++ bool detaches; ++ bool reports; ++ bool takers; ++ bool killed; ++}; ++ ++#define INIT_REPORT(var) \ ++ struct utrace_report var = { UTRACE_RESUME, 0, \ ++ false, false, false, false } ++ ++/* ++ * We are now making the report, so clear the flag saying we need one. ++ */ ++static void start_report(struct utrace *utrace) ++{ ++ BUG_ON(utrace->stopped); ++ if (utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * Complete a normal reporting pass, pairing with a start_report() call. ++ * This handles any UTRACE_DETACH or UTRACE_REPORT or UTRACE_INTERRUPT ++ * returns from engine callbacks. If any engine's last callback used ++ * UTRACE_STOP, we do UTRACE_REPORT here to ensure we stop before user ++ * mode. If there were no callbacks made, it will recompute ++ * @task->utrace_flags to avoid another false-positive. ++ */ ++static void finish_report(struct utrace_report *report, ++ struct task_struct *task, struct utrace *utrace) ++{ ++ bool clean = (report->takers && !report->detaches); ++ ++ if (report->action <= UTRACE_REPORT && !utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 1; ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } else if (report->action == UTRACE_INTERRUPT && !utrace->interrupt) { ++ spin_lock(&utrace->lock); ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else if (clean) { ++ return; ++ } else { ++ spin_lock(&utrace->lock); ++ } ++ ++ if (clean) ++ spin_unlock(&utrace->lock); ++ else ++ utrace_reset(task, utrace, &report->action); ++} ++ ++/* ++ * Apply the return value of one engine callback to @report. ++ * Returns true if @engine detached and should not get any more callbacks. ++ */ ++static bool finish_callback(struct utrace *utrace, ++ struct utrace_report *report, ++ struct utrace_engine *engine, ++ u32 ret) ++{ ++ enum utrace_resume_action action = utrace_resume_action(ret); ++ ++ report->result = ret & ~UTRACE_RESUME_MASK; ++ ++ /* ++ * If utrace_control() was used, treat that like UTRACE_DETACH here. 
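For illustration only (this helper is not part of the patch), the detach pattern spelled out in the utrace_control() and utrace_barrier() kernel-doc above looks like the sketch below from a tracing engine's point of view. It uses only the calls documented above; the engine pointer is assumed to have come from utrace_attach_task(), the include path assumes the utrace header that accompanies this code, and the example_ name is invented.

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/utrace.h>

/* Hypothetical engine-side helper, illustrative only. */
static int example_detach(struct task_struct *target,
                          struct utrace_engine *engine)
{
        int ret = utrace_control(target, engine, UTRACE_DETACH);

        switch (ret) {
        case -EINPROGRESS:
                /*
                 * A callback to this engine may be in progress on another
                 * CPU; the detach happens as soon as it finishes.  Wait
                 * for that here (utrace_barrier() can return -ERESTARTSYS
                 * if we are interrupted by a signal).
                 */
                ret = utrace_barrier(target, engine);
                break;
        case -ESRCH:            /* report_reap may already have begun */
        case -EALREADY:         /* report_death may already have begun */
                /* Those callbacks will still be delivered. */
                break;
        }

        return ret;
}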
++ */ ++ if (action == UTRACE_DETACH || engine->ops == &utrace_detached_ops) { ++ engine->ops = &utrace_detached_ops; ++ report->detaches = true; ++ } else { ++ if (action < report->action) ++ report->action = action; ++ ++ if (action == UTRACE_STOP) { ++ if (!engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ mark_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } else { ++ if (action == UTRACE_REPORT) ++ report->reports = true; ++ ++ if (engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ clear_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } ++ } ++ ++ /* ++ * Now that we have applied the effect of the return value, ++ * clear this so that utrace_barrier() can stop waiting. ++ * A subsequent utrace_control() can stop or resume @engine ++ * and know this was ordered after its callback's action. ++ * ++ * We don't need any barriers here because utrace_barrier() ++ * takes utrace->lock. If we touched engine->flags above, ++ * the lock guaranteed this change was before utrace_barrier() ++ * examined utrace->reporting. ++ */ ++ utrace->reporting = NULL; ++ ++ /* ++ * This is a good place to make sure tracing engines don't ++ * introduce too much latency under voluntary preemption. ++ */ ++ if (need_resched()) ++ cond_resched(); ++ ++ return engine->ops == &utrace_detached_ops; ++} ++ ++/* ++ * Start the callbacks for @engine to consider @event (a bit mask). ++ * This makes the report_quiesce() callback first. If @engine wants ++ * a specific callback for @event, we return the ops vector to use. ++ * If not, we return NULL. The return value from the ops->callback ++ * function called should be passed to finish_callback(). ++ */ ++static const struct utrace_engine_ops *start_callback( ++ struct utrace *utrace, struct utrace_report *report, ++ struct utrace_engine *engine, struct task_struct *task, ++ unsigned long event) ++{ ++ const struct utrace_engine_ops *ops; ++ unsigned long want; ++ ++ /* ++ * This barrier ensures that we've set utrace->reporting before ++ * we examine engine->flags or engine->ops. utrace_barrier() ++ * relies on this ordering to indicate that the effect of any ++ * utrace_control() and utrace_set_events() calls is in place ++ * by the time utrace->reporting can be seen to be NULL. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(). ++ * It makes sure that we never see the old ops vector with ++ * the new flags, in case the original vector had no report_quiesce. ++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if (want & UTRACE_EVENT(QUIESCE)) { ++ if (finish_callback(utrace, report, engine, ++ (*ops->report_quiesce)(report->action, ++ engine, task, ++ event))) ++ return NULL; ++ ++ /* ++ * finish_callback() reset utrace->reporting after the ++ * quiesce callback. Now we set it again (as above) ++ * before re-examining engine->flags, which could have ++ * been changed synchronously by ->report_quiesce or ++ * asynchronously by utrace_control() or utrace_set_events(). ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ want = engine->flags; ++ } ++ ++ if (want & ENGINE_STOP) ++ report->action = UTRACE_STOP; ++ ++ if (want & event) { ++ report->takers = true; ++ return ops; ++ } ++ ++ utrace->reporting = NULL; ++ return NULL; ++} ++ ++/* ++ * Do a normal reporting pass for engines interested in @event. 
++ * @callback is the name of the member in the ops vector, and remaining ++ * args are the extras it takes after the standard three args. ++ */ ++#define REPORT(task, utrace, report, event, callback, ...) \ ++ do { \ ++ start_report(utrace); \ ++ REPORT_CALLBACKS(, task, utrace, report, event, callback, \ ++ (report)->action, engine, current, \ ++ ## __VA_ARGS__); \ ++ finish_report(report, task, utrace); \ ++ } while (0) ++#define REPORT_CALLBACKS(rev, task, utrace, report, event, callback, ...) \ ++ do { \ ++ struct utrace_engine *engine; \ ++ const struct utrace_engine_ops *ops; \ ++ list_for_each_entry##rev(engine, &utrace->attached, entry) { \ ++ ops = start_callback(utrace, report, engine, task, \ ++ event); \ ++ if (!ops) \ ++ continue; \ ++ finish_callback(utrace, report, engine, \ ++ (*ops->callback)(__VA_ARGS__)); \ ++ } \ ++ } while (0) ++ ++/* ++ * Called iff UTRACE_EVENT(EXEC) flag is set. ++ */ ++void utrace_report_exec(struct linux_binfmt *fmt, struct linux_binprm *bprm, ++ struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXEC), ++ report_exec, fmt, bprm, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_ENTRY) flag is set. ++ * Return true to prevent the system call. ++ */ ++bool utrace_report_syscall_entry(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ start_report(utrace); ++ REPORT_CALLBACKS(_reverse, task, utrace, &report, ++ UTRACE_EVENT(SYSCALL_ENTRY), report_syscall_entry, ++ report.result | report.action, engine, current, regs); ++ finish_report(&report, task, utrace); ++ ++ if (report.action == UTRACE_STOP && ++ unlikely(utrace_stop(task, utrace, false))) ++ /* ++ * We are continuing despite UTRACE_STOP because of a ++ * SIGKILL. Don't let the system call actually proceed. ++ */ ++ return true; ++ ++ return report.result == UTRACE_SYSCALL_ABORT; ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_EXIT) flag is set. ++ */ ++void utrace_report_syscall_exit(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(SYSCALL_EXIT), ++ report_syscall_exit, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(CLONE) flag is set. ++ * This notification call blocks the wake_up_new_task call on the child. ++ * So we must not quiesce here. tracehook_report_clone_complete will do ++ * a quiescence check momentarily. ++ */ ++void utrace_report_clone(unsigned long clone_flags, struct task_struct *child) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ /* ++ * We don't use the REPORT() macro here, because we need ++ * to clear utrace->cloning before finish_report(). ++ * After finish_report(), utrace can be a stale pointer ++ * in cases when report.action is still UTRACE_RESUME. ++ */ ++ start_report(utrace); ++ utrace->cloning = child; ++ ++ REPORT_CALLBACKS(, task, utrace, &report, ++ UTRACE_EVENT(CLONE), report_clone, ++ report.action, engine, task, clone_flags, child); ++ ++ utrace->cloning = NULL; ++ finish_report(&report, task, utrace); ++ ++ /* ++ * For a vfork, we will go into an uninterruptible block waiting ++ * for the child. We need UTRACE_STOP to happen before this, not ++ * after. 
For CLONE_VFORK, utrace_finish_vfork() will be called. ++ */ ++ if (report.action == UTRACE_STOP && (clone_flags & CLONE_VFORK)) { ++ spin_lock(&utrace->lock); ++ utrace->vfork_stop = 1; ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * We're called after utrace_report_clone() for a CLONE_VFORK. ++ * If UTRACE_STOP was left from the clone report, we stop here. ++ * After this, we'll enter the uninterruptible wait_for_completion() ++ * waiting for the child. ++ */ ++void utrace_finish_vfork(struct task_struct *task) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ if (!utrace->vfork_stop) ++ spin_unlock(&utrace->lock); ++ else { ++ utrace->vfork_stop = 0; ++ spin_unlock(&utrace->lock); ++ utrace_stop(task, utrace, false); ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(JCTL) flag is set. ++ * ++ * Called with siglock held. ++ */ ++void utrace_report_jctl(int notify, int what) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ bool stop = task_is_stopped(task); ++ ++ /* ++ * We have to come out of TASK_STOPPED in case the event report ++ * hooks might block. Since we held the siglock throughout, it's ++ * as if we were never in TASK_STOPPED yet at all. ++ */ ++ if (stop) { ++ __set_current_state(TASK_RUNNING); ++ task->signal->flags &= ~SIGNAL_STOP_STOPPED; ++ ++task->signal->group_stop_count; ++ } ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ /* ++ * We get here with CLD_STOPPED when we've just entered ++ * TASK_STOPPED, or with CLD_CONTINUED when we've just come ++ * out but not yet been through utrace_get_signal() again. ++ * ++ * While in TASK_STOPPED, we can be considered safely ++ * stopped by utrace_do_stop() and detached asynchronously. ++ * If we woke up and checked task->utrace_flags before that ++ * was finished, we might be here with utrace already ++ * removed or in the middle of being removed. ++ * ++ * If we are indeed attached, then make sure we are no ++ * longer considered stopped while we run callbacks. ++ */ ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ /* ++ * Do start_report()'s work too since we already have the lock anyway. ++ */ ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(JCTL), ++ report_jctl, what, notify); ++ ++ /* ++ * Retake the lock, and go back into TASK_STOPPED ++ * unless the stop was just cleared. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ if (stop && task->signal->group_stop_count > 0) { ++ __set_current_state(TASK_STOPPED); ++ if (--task->signal->group_stop_count == 0) ++ task->signal->flags |= SIGNAL_STOP_STOPPED; ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(EXIT) flag is set. ++ */ ++void utrace_report_exit(long *exit_code) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ long orig_code = *exit_code; ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXIT), ++ report_exit, orig_code, exit_code); ++ ++ if (report.action == UTRACE_STOP) ++ utrace_stop(task, utrace, false); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(DEATH) or UTRACE_EVENT(QUIESCE) flag is set. ++ * ++ * It is always possible that we are racing with utrace_release_task here. ++ * For this reason, utrace_release_task checks for the event bits that get ++ * us here, and delays its cleanup for us to do. 
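The CLONE_VFORK handling above is driven entirely by an engine returning UTRACE_STOP from its clone report. As a hypothetical illustration (not part of the patch), such a callback could look like the sketch below; the report_clone member name and argument order follow the REPORT_CALLBACKS() invocation in utrace_report_clone() above, and everything prefixed example_ is invented.

#include <linux/sched.h>
#include <linux/utrace.h>

/*
 * Illustrative only: ask to stop the parent after a vfork-style clone,
 * so it stops in utrace_finish_vfork() before blocking on the child.
 */
static u32 example_report_clone(enum utrace_resume_action action,
                                struct utrace_engine *engine,
                                struct task_struct *parent,
                                unsigned long clone_flags,
                                struct task_struct *child)
{
        if (clone_flags & CLONE_VFORK)
                return UTRACE_STOP;

        return UTRACE_RESUME;
}

static const struct utrace_engine_ops example_clone_ops = {
        .report_clone = example_report_clone,
};

To receive this callback at all, the engine must have UTRACE_EVENT(CLONE) set in its event mask via utrace_set_events(), since utrace_report_clone() is called only when that flag is set.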
++ */ ++void utrace_report_death(struct task_struct *task, struct utrace *utrace, ++ bool group_dead, int signal) ++{ ++ INIT_REPORT(report); ++ ++ BUG_ON(!task->exit_state); ++ ++ /* ++ * We are presently considered "quiescent"--which is accurate ++ * inasmuch as we won't run any more user instructions ever again. ++ * But for utrace_control and utrace_set_events to be robust, they ++ * must be sure whether or not we will run any more callbacks. If ++ * a call comes in before we do, taking the lock here synchronizes ++ * us so we don't run any callbacks just disabled. Calls that come ++ * in while we're running the callbacks will see the exit.death ++ * flag and know that we are not yet fully quiescent for purposes ++ * of detach bookkeeping. ++ */ ++ spin_lock(&utrace->lock); ++ BUG_ON(utrace->death); ++ utrace->death = 1; ++ utrace->report = 0; ++ utrace->interrupt = 0; ++ spin_unlock(&utrace->lock); ++ ++ REPORT_CALLBACKS(, task, utrace, &report, UTRACE_EVENT(DEATH), ++ report_death, engine, task, group_dead, signal); ++ ++ spin_lock(&utrace->lock); ++ ++ /* ++ * After we unlock (possibly inside utrace_reap for callbacks) with ++ * this flag clear, competing utrace_control/utrace_set_events calls ++ * know that we've finished our callbacks and any detach bookkeeping. ++ */ ++ utrace->death = 0; ++ ++ if (utrace->reap) ++ /* ++ * utrace_release_task() was already called in parallel. ++ * We must complete its work now. ++ */ ++ utrace_reap(task, utrace); ++ else ++ utrace_reset(task, utrace, &report.action); ++} ++ ++/* ++ * Finish the last reporting pass before returning to user mode. ++ */ ++static void finish_resume_report(struct utrace_report *report, ++ struct task_struct *task, ++ struct utrace *utrace) ++{ ++ if (report->detaches || !report->takers) { ++ spin_lock(&utrace->lock); ++ utrace_reset(task, utrace, &report->action); ++ } ++ ++ switch (report->action) { ++ case UTRACE_STOP: ++ report->killed = utrace_stop(task, utrace, report->reports); ++ break; ++ ++ case UTRACE_INTERRUPT: ++ if (!signal_pending(task)) ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ if (likely(arch_has_block_step())) { ++ user_enable_block_step(task); ++ break; ++ } ++ ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_block_step() itself. Warn and ++ * then fall through to treat it as SINGLESTEP. ++ */ ++ WARN_ON(1); ++ ++ case UTRACE_SINGLESTEP: ++ if (likely(arch_has_single_step())) ++ user_enable_single_step(task); ++ else ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_single_step() itself. Spew ++ * about it so the loser will fix his module. ++ */ ++ WARN_ON(1); ++ break; ++ ++ case UTRACE_REPORT: ++ case UTRACE_RESUME: ++ default: ++ user_disable_single_step(task); ++ break; ++ } ++} ++ ++/* ++ * This is called when TIF_NOTIFY_RESUME had been set (and is now clear). ++ * We are close to user mode, and this is the place to report or stop. ++ * When we return, we're going to user mode or into the signals code. ++ */ ++void utrace_resume(struct task_struct *task, struct pt_regs *regs) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ ++ /* ++ * Some machines get here with interrupts disabled. The same arch ++ * code path leads to calling into get_signal_to_deliver(), which ++ * implicitly reenables them by virtue of spin_unlock_irq. 
++ */ ++ local_irq_enable(); ++ ++ /* ++ * If this flag is still set it's because there was a signal ++ * handler setup done but no report_signal following it. Clear ++ * the flag before we get to user so it doesn't confuse us later. ++ */ ++ if (unlikely(utrace->signal_handler)) { ++ int skip; ++ spin_lock(&utrace->lock); ++ utrace->signal_handler = 0; ++ skip = !utrace->report; ++ spin_unlock(&utrace->lock); ++ if (skip) ++ return; ++ } ++ ++ /* ++ * If UTRACE_INTERRUPT was just used, we don't bother with a report ++ * here. We will report and stop in utrace_get_signal(). In case ++ * of a race with utrace_control(), make sure we don't momentarily ++ * return to user mode because TIF_SIGPENDING was not set yet. ++ */ ++ if (unlikely(utrace->interrupt)) { ++ set_thread_flag(TIF_SIGPENDING); ++ return; ++ } ++ ++ /* ++ * Do a simple reporting pass, with no callback after report_quiesce. ++ */ ++ start_report(utrace); ++ ++ list_for_each_entry(engine, &utrace->attached, entry) ++ start_callback(utrace, &report, engine, task, 0); ++ ++ /* ++ * Finish the report and either stop or get ready to resume. ++ */ ++ finish_resume_report(&report, task, utrace); ++} ++ ++/* ++ * Return true if current has forced signal_pending(). ++ * ++ * This is called only when current->utrace_flags is nonzero, so we know ++ * that current->utrace must be set. It's not inlined in tracehook.h ++ * just so that struct utrace can stay opaque outside this file. ++ */ ++bool utrace_interrupt_pending(void) ++{ ++ return task_utrace_struct(current)->interrupt; ++} ++ ++/* ++ * Take the siglock and push @info back on our queue. ++ * Returns with @task->sighand->siglock held. ++ */ ++static void push_back_signal(struct task_struct *task, siginfo_t *info) ++ __acquires(task->sighand->siglock) ++{ ++ struct sigqueue *q; ++ ++ if (unlikely(!info->si_signo)) { /* Oh, a wise guy! */ ++ spin_lock_irq(&task->sighand->siglock); ++ return; ++ } ++ ++ q = sigqueue_alloc(); ++ if (likely(q)) { ++ q->flags = 0; ++ copy_siginfo(&q->info, info); ++ } ++ ++ spin_lock_irq(&task->sighand->siglock); ++ ++ sigaddset(&task->pending.signal, info->si_signo); ++ if (likely(q)) ++ list_add(&q->list, &task->pending.list); ++ ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++} ++ ++/* ++ * This is the hook from the signals code, called with the siglock held. ++ * Here is the ideal place to stop. We also dequeue and intercept signals. ++ */ ++int utrace_get_signal(struct task_struct *task, struct pt_regs *regs, ++ siginfo_t *info, struct k_sigaction *return_ka) ++ __releases(task->sighand->siglock) ++ __acquires(task->sighand->siglock) ++{ ++ struct utrace *utrace; ++ struct k_sigaction *ka; ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ const struct utrace_engine_ops *ops; ++ unsigned long event, want; ++ u32 ret; ++ int signr; ++ ++ utrace = &task->utrace; ++ if (utrace->interrupt || utrace->report || utrace->signal_handler) { ++ /* ++ * We've been asked for an explicit report before we ++ * even check for pending signals. ++ */ ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ spin_lock(&utrace->lock); ++ ++ splice_attaching(utrace); ++ ++ if (unlikely(!utrace->interrupt) && unlikely(!utrace->report)) ++ report.result = UTRACE_SIGNAL_IGN; ++ else if (utrace->signal_handler) ++ report.result = UTRACE_SIGNAL_HANDLER; ++ else ++ report.result = UTRACE_SIGNAL_REPORT; ++ ++ /* ++ * We are now making the report and it's on the ++ * interrupt path, so clear the flags asking for those. 
++ */ ++ utrace->interrupt = utrace->report = utrace->signal_handler = 0; ++ utrace->stopped = 0; ++ ++ /* ++ * Make sure signal_pending() only returns true ++ * if there are real signals pending. ++ */ ++ if (signal_pending(task)) { ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ if (unlikely(report.result == UTRACE_SIGNAL_IGN)) ++ /* ++ * We only got here to clear utrace->signal_handler. ++ */ ++ return -1; ++ ++ /* ++ * Do a reporting pass for no signal, just for EVENT(QUIESCE). ++ * The engine callbacks can fill in *info and *return_ka. ++ * We'll pass NULL for the @orig_ka argument to indicate ++ * that there was no original signal. ++ */ ++ event = 0; ++ ka = NULL; ++ memset(return_ka, 0, sizeof *return_ka); ++ } else if ((task->utrace_flags & UTRACE_EVENT_SIGNAL_ALL) == 0 && ++ !utrace->stopped) { ++ /* ++ * If no engine is interested in intercepting signals, ++ * let the caller just dequeue them normally. ++ */ ++ return 0; ++ } else { ++ if (unlikely(utrace->stopped)) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ spin_unlock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * Steal the next signal so we can let tracing engines ++ * examine it. From the signal number and sigaction, ++ * determine what normal delivery would do. If no ++ * engine perturbs it, we'll do that by returning the ++ * signal number after setting *return_ka. ++ */ ++ signr = dequeue_signal(task, &task->blocked, info); ++ if (signr == 0) ++ return signr; ++ BUG_ON(signr != info->si_signo); ++ ++ ka = &task->sighand->action[signr - 1]; ++ *return_ka = *ka; ++ ++ /* ++ * We are never allowed to interfere with SIGKILL. ++ * Just punt after filling in *return_ka for our caller. ++ */ ++ if (signr == SIGKILL) ++ return signr; ++ ++ if (ka->sa.sa_handler == SIG_IGN) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (ka->sa.sa_handler != SIG_DFL) { ++ event = UTRACE_EVENT(SIGNAL); ++ report.result = UTRACE_SIGNAL_DELIVER; ++ } else if (sig_kernel_coredump(signr)) { ++ event = UTRACE_EVENT(SIGNAL_CORE); ++ report.result = UTRACE_SIGNAL_CORE; ++ } else if (sig_kernel_ignore(signr)) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (signr == SIGSTOP) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_STOP; ++ } else if (sig_kernel_stop(signr)) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_TSTP; ++ } else { ++ event = UTRACE_EVENT(SIGNAL_TERM); ++ report.result = UTRACE_SIGNAL_TERM; ++ } ++ ++ /* ++ * Now that we know what event type this signal is, we ++ * can short-circuit if no engines care about those. ++ */ ++ if ((task->utrace_flags & (event | UTRACE_EVENT(QUIESCE))) == 0) ++ return signr; ++ ++ /* ++ * We have some interested engines, so tell them about ++ * the signal and let them change its disposition. ++ */ ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * This reporting pass chooses what signal disposition we'll act on. ++ */ ++ list_for_each_entry(engine, &utrace->attached, entry) { ++ /* ++ * See start_callback() comment about this barrier. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(), ++ * see start_callback() comments. 
++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if ((want & (event | UTRACE_EVENT(QUIESCE))) == 0) { ++ utrace->reporting = NULL; ++ continue; ++ } ++ ++ if (ops->report_signal) ++ ret = (*ops->report_signal)( ++ report.result | report.action, engine, task, ++ regs, info, ka, return_ka); ++ else ++ ret = (report.result | (*ops->report_quiesce)( ++ report.action, engine, task, event)); ++ ++ /* ++ * Avoid a tight loop reporting again and again if some ++ * engine is too stupid. ++ */ ++ switch (utrace_resume_action(ret)) { ++ default: ++ break; ++ case UTRACE_INTERRUPT: ++ case UTRACE_REPORT: ++ ret = (ret & ~UTRACE_RESUME_MASK) | UTRACE_RESUME; ++ break; ++ } ++ ++ finish_callback(utrace, &report, engine, ret); ++ } ++ ++ /* ++ * We express the chosen action to the signals code in terms ++ * of a representative signal whose default action does it. ++ * Our caller uses our return value (signr) to decide what to ++ * do, but uses info->si_signo as the signal number to report. ++ */ ++ switch (utrace_signal_action(report.result)) { ++ case UTRACE_SIGNAL_TERM: ++ signr = SIGTERM; ++ break; ++ ++ case UTRACE_SIGNAL_CORE: ++ signr = SIGQUIT; ++ break; ++ ++ case UTRACE_SIGNAL_STOP: ++ signr = SIGSTOP; ++ break; ++ ++ case UTRACE_SIGNAL_TSTP: ++ signr = SIGTSTP; ++ break; ++ ++ case UTRACE_SIGNAL_DELIVER: ++ signr = info->si_signo; ++ ++ if (return_ka->sa.sa_handler == SIG_DFL) { ++ /* ++ * We'll do signr's normal default action. ++ * For ignore, we'll fall through below. ++ * For stop/death, break locks and returns it. ++ */ ++ if (likely(signr) && !sig_kernel_ignore(signr)) ++ break; ++ } else if (return_ka->sa.sa_handler != SIG_IGN && ++ likely(signr)) { ++ /* ++ * Complete the bookkeeping after the report. ++ * The handler will run. If an engine wanted to ++ * stop or step, then make sure we do another ++ * report after signal handler setup. ++ */ ++ if (report.action != UTRACE_RESUME) ++ report.action = UTRACE_INTERRUPT; ++ finish_report(&report, task, utrace); ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ /* ++ * We do the SA_ONESHOT work here since the ++ * normal path will only touch *return_ka now. ++ */ ++ if (unlikely(return_ka->sa.sa_flags & SA_ONESHOT)) { ++ return_ka->sa.sa_flags &= ~SA_ONESHOT; ++ if (likely(valid_signal(signr))) { ++ ka = &task->sighand->action[signr - 1]; ++ ka->sa.sa_handler = SIG_DFL; ++ } ++ } ++ ++ return signr; ++ } ++ ++ /* Fall through for an ignored signal. */ ++ ++ case UTRACE_SIGNAL_IGN: ++ case UTRACE_SIGNAL_REPORT: ++ default: ++ /* ++ * If the signal is being ignored, then we are on the way ++ * directly back to user mode. We can stop here, or step, ++ * as in utrace_resume(), above. After we've dealt with that, ++ * our caller will relock and come back through here. ++ */ ++ finish_resume_report(&report, task, utrace); ++ ++ if (unlikely(report.killed)) { ++ /* ++ * The only reason we woke up now was because of a ++ * SIGKILL. Don't do normal dequeuing in case it ++ * might get a signal other than SIGKILL. That would ++ * perturb the death state so it might differ from ++ * what the debugger would have allowed to happen. ++ * Instead, pluck out just the SIGKILL to be sure ++ * we'll die immediately with nothing else different ++ * from the quiescent state the debugger wanted us in. 
++ */ ++ sigset_t sigkill_only; ++ siginitsetinv(&sigkill_only, sigmask(SIGKILL)); ++ spin_lock_irq(&task->sighand->siglock); ++ signr = dequeue_signal(task, &sigkill_only, info); ++ BUG_ON(signr != SIGKILL); ++ *return_ka = task->sighand->action[SIGKILL - 1]; ++ return signr; ++ } ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) { ++ push_back_signal(task, info); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ return -1; ++ } ++ ++ /* ++ * Complete the bookkeeping after the report. ++ * This sets utrace->report if UTRACE_STOP was used. ++ */ ++ finish_report(&report, task, utrace); ++ ++ return_ka->sa.sa_handler = SIG_DFL; ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (sig_kernel_stop(signr)) ++ task->signal->flags |= SIGNAL_STOP_DEQUEUED; ++ ++ return signr; ++} ++ ++/* ++ * This gets called after a signal handler has been set up. ++ * We set a flag so the next report knows it happened. ++ * If we're already stepping, make sure we do a report_signal. ++ * If not, make sure we get into utrace_resume() where we can ++ * clear the signal_handler flag before resuming. ++ */ ++void utrace_signal_handler(struct task_struct *task, int stepping) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->signal_handler = 1; ++ if (stepping) { ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else { ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/** ++ * utrace_prepare_examine - prepare to examine thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: temporary state, a &struct utrace_examiner pointer ++ * ++ * This call prepares to safely examine the thread @target using ++ * &struct user_regset calls, or direct access to thread-synchronous fields. ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, it must held stopped via %UTRACE_STOP by @engine. ++ * ++ * This call may block the caller until @target stays stopped, so it must ++ * be called only after the caller is sure @target is about to unschedule. ++ * This means a zero return from a utrace_control() call on @engine giving ++ * %UTRACE_STOP, or a report_quiesce() or report_signal() callback to ++ * @engine that used %UTRACE_STOP in its return value. ++ * ++ * Returns -%ESRCH if @target is dead or -%EINVAL if %UTRACE_STOP was ++ * not used. If @target has started running again despite %UTRACE_STOP ++ * (for %SIGKILL or a spurious wakeup), this call returns -%EAGAIN. ++ * ++ * When this call returns zero, it's safe to use &struct user_regset ++ * calls and task_user_regset_view() on @target and to examine some of ++ * its fields directly. When the examination is complete, a ++ * utrace_finish_examine() call must follow to check whether it was ++ * completed safely. 
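Taken together with utrace_finish_examine() below, the intended call sequence is short. A minimal sketch, assuming the engine already holds the target stopped via UTRACE_STOP as required above (the example_ name is hypothetical, not part of the patch):

#include <linux/sched.h>
#include <linux/utrace.h>

static int example_read_stopped_thread(struct task_struct *target,
                                       struct utrace_engine *engine)
{
        struct utrace_examiner exam;
        int ret;

        ret = utrace_prepare_examine(target, engine, &exam);
        if (ret)
                return ret;     /* -ESRCH, -EINVAL or -EAGAIN, see above */

        /*
         * Between the two calls it is safe to use task_user_regset_view()
         * and struct user_regset calls on @target, or to read its
         * thread-synchronous fields directly.
         */

        ret = utrace_finish_examine(target, engine, &exam);
        /*
         * On -EAGAIN the target ran in the meantime; discard whatever was
         * read above and retry the whole sequence.
         */
        return ret;
}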
++ */ ++int utrace_prepare_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->exit_state)) ++ ret = -ESRCH; ++ else { ++ exam->state = target->state; ++ if (unlikely(exam->state == TASK_RUNNING)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ } ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ exam->ncsw = wait_task_inactive(target, exam->state); ++ put_task_struct(target); ++ if (unlikely(!exam->ncsw)) ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_prepare_examine); ++ ++/** ++ * utrace_finish_examine - complete an examination of thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: pointer passed to utrace_prepare_examine() call ++ * ++ * This call completes an examination on the thread @target begun by a ++ * paired utrace_prepare_examine() call with the same arguments that ++ * returned success (zero). ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, this returns zero if @target has remained unscheduled ++ * since the paired utrace_prepare_examine() call returned zero. ++ * ++ * When this returns an error, any examination done since the paired ++ * utrace_prepare_examine() call is unreliable and the data extracted ++ * should be discarded. The error is -%EINVAL if @engine is not ++ * keeping @target stopped, or -%EAGAIN if @target woke up unexpectedly. ++ */ ++int utrace_finish_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->state != exam->state)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ unsigned long ncsw = wait_task_inactive(target, exam->state); ++ if (unlikely(ncsw != exam->ncsw)) ++ ret = -EAGAIN; ++ put_task_struct(target); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_finish_examine); ++ ++/* ++ * This is declared in linux/regset.h and defined in machine-dependent ++ * code. We put the export here to ensure no machine forgets it. ++ */ ++EXPORT_SYMBOL_GPL(task_user_regset_view); ++ ++/* ++ * Called with rcu_read_lock() held. ++ */ ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p) ++{ ++ struct utrace *utrace = &p->utrace; ++ seq_printf(m, "Utrace:\t%lx%s%s%s\n", ++ p->utrace_flags, ++ utrace->stopped ? " (stopped)" : "", ++ utrace->report ? " (report)" : "", ++ utrace->interrupt ? " (interrupt)" : ""); ++} diff --git a/original/kernel.spec b/original/kernel.spec new file mode 100644 index 000000000..5c1605474 --- /dev/null +++ b/original/kernel.spec @@ -0,0 +1,3886 @@ +# We have to override the new %%install behavior because, well... the kernel is special. +%global __spec_install_pre %{___build_pre} + +Summary: The Linux kernel + +# For a stable, released kernel, released_kernel should be 1. For rawhide +# and/or a kernel built from an rc or git snapshot, released_kernel should +# be 0. 
+%global released_kernel 1 + +# Versions of various parts + +# Polite request for people who spin their own kernel rpms: +# please modify the "buildid" define in a way that identifies +# that the kernel isn't the stock distribution kernel, for example, +# by setting the define to ".local" or ".bz123456" +# +# % define buildid .local + +# fedora_build defines which build revision of this kernel version we're +# building. Rather than incrementing forever, as with the prior versioning +# setup, we set fedora_cvs_origin to the current cvs revision s/1.// of the +# kernel spec when the kernel is rebased, so fedora_build automatically +# works out to the offset from the rebase, so it doesn't get too ginormous. +# +# If you're building on a branch, the RCS revision will be something like +# 1.1205.1.1. In this case we drop the initial 1, subtract fedora_cvs_origin +# from the second number, and then append the rest of the RCS string as is. +# Don't stare at the awk too long, you'll go blind. +%define fedora_cvs_origin 1786 +%define fedora_cvs_revision() %2 +%global fedora_build %(echo %{fedora_cvs_origin}.%{fedora_cvs_revision $Revision: 1.1948 $} | awk -F . '{ OFS = "."; ORS = ""; print $3 - $1 ; i = 4 ; OFS = ""; while (i <= NF) { print ".", $i ; i++} }') + +# base_sublevel is the kernel version we're starting with and patching +# on top of -- for example, 2.6.22-rc7-git1 starts with a 2.6.21 base, +# which yields a base_sublevel of 21. +%define base_sublevel 31 + +## If this is a released kernel ## +%if 0%{?released_kernel} + +# Do we have a -stable update to apply? +%define stable_update 6 +# Is it a -stable RC? +%define stable_rc 0 +# Set rpm version accordingly +%if 0%{?stable_update} +%define stablerev .%{stable_update} +%define stable_base %{stable_update} +%if 0%{?stable_rc} +# stable RCs are incremental patches, so we need the previous stable patch +%define stable_base %(echo $((%{stable_update} - 1))) +%endif +%endif +%define rpmversion 2.6.%{base_sublevel}%{?stablerev} + +## The not-released-kernel case ## +%else +# The next upstream release sublevel (base_sublevel+1) +%define upstream_sublevel %(echo $((%{base_sublevel} + 1))) +# The rc snapshot level +%define rcrev 9 +# The git snapshot level +%define gitrev 2 +# Set rpm version accordingly +%define rpmversion 2.6.%{upstream_sublevel} +%endif +# Nb: The above rcrev and gitrev values automagically define Patch00 and Patch01 below. + +# What parts do we want to build? We must build at least one kernel. +# These are the kernels that are built IF the architecture allows it. +# All should default to 1 (enabled) and be flipped to 0 (disabled) +# by later arch-specific checks. + +# The following build options are enabled by default. +# Use either --without in your rpmbuild command or force values +# to 0 in here to disable them. 
+# +# standard kernel +%define with_up %{?_without_up: 0} %{?!_without_up: 1} +# kernel-smp (only valid for ppc 32-bit) +%define with_smp %{?_without_smp: 0} %{?!_without_smp: 1} +# kernel-kdump +%define with_kdump %{?_without_kdump: 0} %{?!_without_kdump: 1} +# kernel-debug +%define with_debug %{?_without_debug: 0} %{?!_without_debug: 1} +# kernel-doc +%define with_doc %{?_without_doc: 0} %{?!_without_doc: 1} +# kernel-headers +%define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} +# kernel-firmware +%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 0} +# tools/perf +%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} +# perf noarch subpkg +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} +# kernel-debuginfo +%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1} +# kernel-bootwrapper (for creating zImages from kernel + initrd) +%define with_bootwrapper %{?_without_bootwrapper: 0} %{?!_without_bootwrapper: 1} +# Want to build a the vsdo directories installed +%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 1} +# Use dracut instead of mkinitrd for initrd image generation +%define with_dracut %{?_without_dracut: 0} %{?!_without_dracut: 1} + +# Build the kernel-doc package, but don't fail the build if it botches. +# Here "true" means "continue" and "false" means "fail the build". +%if 0%{?released_kernel} +%define doc_build_fail false +%else +%define doc_build_fail true +%endif + +%define rawhide_skip_docs 0 +%if 0%{?rawhide_skip_docs} +%define with_doc 0 +%endif + +# Additional options for user-friendly one-off kernel building: +# +# Only build the base kernel (--with baseonly): +%define with_baseonly %{?_with_baseonly: 1} %{?!_with_baseonly: 0} +# Only build the smp kernel (--with smponly): +%define with_smponly %{?_with_smponly: 1} %{?!_with_smponly: 0} +# Only build the debug kernel (--with dbgonly): +%define with_dbgonly %{?_with_dbgonly: 1} %{?!_with_dbgonly: 0} + +# should we do C=1 builds with sparse +%define with_sparse %{?_with_sparse: 1} %{?!_with_sparse: 0} + +# Set debugbuildsenabled to 1 for production (build separate debug kernels) +# and 0 for rawhide (all kernels are debug kernels). +# See also 'make debug' and 'make release'. +%define debugbuildsenabled 1 + +# Want to build a vanilla kernel build without any non-upstream patches? +# (well, almost none, we need nonintconfig for build purposes). Default to 0 (off). 
+%define with_vanilla %{?_with_vanilla: 1} %{?!_with_vanilla: 0} + +# pkg_release is what we'll fill in for the rpm Release: field +%if 0%{?released_kernel} + +%if 0%{?stable_rc} +%define stable_rctag .rc%{stable_rc} +%endif +%define pkg_release %{fedora_build}%{?stable_rctag}%{?buildid}%{?dist} + +%else + +# non-released_kernel +%if 0%{?rcrev} +%define rctag .rc%rcrev +%endif +%if 0%{?gitrev} +%define gittag .git%gitrev +%if !0%{?rcrev} +%define rctag .rc0 +%endif +%endif +%define pkg_release 0.%{fedora_build}%{?rctag}%{?gittag}%{?buildid}%{?dist} + +%endif + +# The kernel tarball/base version +%define kversion 2.6.%{base_sublevel} + +%define make_target bzImage + +%define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE}.%{_target_cpu} +%define hdrarch %_target_cpu +%define asmarch %_target_cpu + +%if 0%{!?nopatches:1} +%define nopatches 0 +%endif + +%if %{with_vanilla} +%define nopatches 1 +%endif + +%if %{nopatches} +%define with_bootwrapper 0 +%define variant -vanilla +%else +%define variant_fedora -fedora +%endif + +%define using_upstream_branch 0 +%if 0%{?upstream_branch:1} +%define stable_update 0 +%define using_upstream_branch 1 +%define variant -%{upstream_branch}%{?variant_fedora} +%define pkg_release 0.%{fedora_build}%{upstream_branch_tag}%{?buildid}%{?dist} +%endif + +%if !%{debugbuildsenabled} +%define with_debug 0 +%endif + +%if !%{with_debuginfo} +%define _enable_debug_packages 0 +%endif +%define debuginfodir /usr/lib/debug + +# kernel-PAE is only built on i686. +%ifarch i686 +%define with_pae 1 +%else +%define with_pae 0 +%endif + +# if requested, only build base kernel +%if %{with_baseonly} +%define with_smp 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build smp kernel +%if %{with_smponly} +%define with_up 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build debug kernel +%if %{with_dbgonly} +%if %{debugbuildsenabled} +%define with_up 0 +%endif +%define with_smp 0 +%define with_pae 0 +%define with_xen 0 +%define with_kdump 0 +%define with_perftool 0 +%endif + +%define all_x86 i386 i686 + +%if %{with_vdso_install} +# These arches install vdso/ directories. +%define vdso_arches %{all_x86} x86_64 ppc ppc64 +%endif + +# Overrides for generic default options + +# only ppc and alphav56 need separate smp kernels +%ifnarch ppc alphaev56 +%define with_smp 0 +%endif + +# only build kernel-kdump on ppc64 +# (no relocatable kernel support upstream yet) +#FIXME: Temporarily disabled to speed up builds. 
+#ifnarch ppc64 +%define with_kdump 0 +#endif + +# don't do debug builds on anything but i686 and x86_64 +%ifnarch i686 x86_64 +%define with_debug 0 +%endif + +# only package docs noarch +%ifnarch noarch +%define with_doc 0 +%define with_perf 0 +%endif + +# don't build noarch kernels or headers (duh) +%ifarch noarch +%define with_up 0 +%define with_headers 0 +%define all_arch_configs kernel-%{version}-*.config +%define with_firmware %{?_without_firmware: 0} %{?!_without_firmware: 1} +%endif + +# bootwrapper is only on ppc +%ifnarch ppc ppc64 +%define with_bootwrapper 0 +%endif + +# sparse blows up on ppc64 alpha and sparc64 +%ifarch ppc64 ppc alpha sparc64 +%define with_sparse 0 +%endif + +# Per-arch tweaks + +%ifarch %{all_x86} +%define asmarch x86 +%define hdrarch i386 +%define all_arch_configs kernel-%{version}-i?86*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch x86_64 +%define asmarch x86 +%define all_arch_configs kernel-%{version}-x86_64*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch ppc64 +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc64*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch s390x +%define asmarch s390 +%define hdrarch s390 +%define all_arch_configs kernel-%{version}-s390x.config +%define image_install_path boot +%define make_target image +%define kernel_image arch/s390/boot/image +%endif + +%ifarch sparc +# We only build sparc headers since we dont support sparc32 hardware +%endif + +%ifarch sparc64 +%define asmarch sparc +%define all_arch_configs kernel-%{version}-sparc64*.config +%define make_target image +%define kernel_image arch/sparc/boot/image +%define image_install_path boot +%define with_perftool 0 +%endif + +%ifarch ppc +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc{-,.}*config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch ia64 +%define all_arch_configs kernel-%{version}-ia64*.config +%define image_install_path boot/efi/EFI/redhat +%define make_target compressed +%define kernel_image vmlinux.gz +%endif + +%ifarch alpha alphaev56 +%define all_arch_configs kernel-%{version}-alpha*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%ifarch %{arm} +%define all_arch_configs kernel-%{version}-arm*.config +%define image_install_path boot +%define hdrarch arm +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%if %{nopatches} +# XXX temporary until last vdso patches are upstream +%define vdso_arches ppc ppc64 +%endif + +%if %{nopatches}%{using_upstream_branch} +# Ignore unknown options in our config-* files. +# Some options go with patches we're not applying. +%define oldconfig_target loose_nonint_oldconfig +%else +%define oldconfig_target nonint_oldconfig +%endif + +# To temporarily exclude an architecture from being built, add it to +# %nobuildarches. Do _NOT_ use the ExclusiveArch: line, because if we +# don't build kernel-headers then the new build system will no longer let +# us use the previous build of that package -- it'll just be completely AWOL. +# Which is a BadThing(tm). + +# We don't build a kernel on i386; we only do kernel-headers there, +# and we no longer build for 31bit S390. 
Same for 32bit sparc and arm. +%define nobuildarches i386 s390 sparc %{arm} + +%ifarch %nobuildarches +%define with_up 0 +%define with_smp 0 +%define with_pae 0 +%define with_kdump 0 +%define with_debuginfo 0 +%define with_perftool 0 +%define _enable_debug_packages 0 +%endif + +%define with_pae_debug 0 +%if %{with_pae} +%define with_pae_debug %{with_debug} +%endif + +# +# Three sets of minimum package version requirements in the form of Conflicts: +# to versions below the minimum +# + +# +# First the general kernel 2.6 required versions as per +# Documentation/Changes +# +%define kernel_dot_org_conflicts ppp < 2.4.3-3, isdn4k-utils < 3.2-32, nfs-utils < 1.0.7-12, e2fsprogs < 1.37-4, util-linux < 2.12, jfsutils < 1.1.7-2, reiserfs-utils < 3.6.19-2, xfsprogs < 2.6.13-4, procps < 3.2.5-6.3, oprofile < 0.9.1-2 + +# +# Then a series of requirements that are distribution specific, either +# because we add patches for something, or the older versions have +# problems with the newer kernel or lack certain things that make +# integration in the distro harder than needed. +# +%define package_conflicts initscripts < 7.23, udev < 063-6, iptables < 1.3.2-1, ipw2200-firmware < 2.4, iwl4965-firmware < 228.57.2, selinux-policy-targeted < 1.25.3-14, squashfs-tools < 4.0, wireless-tools < 29-3 + +# +# The ld.so.conf.d file we install uses syntax older ldconfig's don't grok. +# +%define kernel_xen_conflicts glibc < 2.3.5-1, xen < 3.0.1 + +%define kernel_PAE_obsoletes kernel-smp < 2.6.17, kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_PAE_provides kernel-xen = %{rpmversion}-%{pkg_release} + +%ifarch x86_64 +%define kernel_obsoletes kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_provides kernel-xen = %{rpmversion}-%{pkg_release} +%endif + +# We moved the drm include files into kernel-headers, make sure there's +# a recent enough libdrm-devel on the system that doesn't have those. +%define kernel_headers_conflicts libdrm-devel < 2.4.0-0.15 + +# +# Packages that need to be installed before the kernel is, because the %post +# scripts use them. +# +%define kernel_prereq fileutils, module-init-tools, initscripts >= 8.11.1-1, kernel-firmware >= %{rpmversion}-%{pkg_release}, grubby >= 7.0.4-1 +%if %{with_dracut} +%define initrd_prereq dracut >= 001-7 +%else +%define initrd_prereq mkinitrd >= 6.0.61-1 +%endif + +# +# This macro does requires, provides, conflicts, obsoletes for a kernel package. +# %%kernel_reqprovconf +# It uses any kernel__conflicts and kernel__obsoletes +# macros defined above. 
+# +%define kernel_reqprovconf \ +Provides: kernel = %{rpmversion}-%{pkg_release}\ +Provides: kernel-%{_target_cpu} = %{rpmversion}-%{pkg_release}%{?1:.%{1}}\ +Provides: kernel-drm = 4.3.0\ +Provides: kernel-drm-nouveau = 15\ +Provides: kernel-modeset = 1\ +Provides: kernel-uname-r = %{KVERREL}%{?1:.%{1}}\ +Requires(pre): %{kernel_prereq}\ +Requires(pre): %{initrd_prereq}\ +Requires(post): /sbin/new-kernel-pkg\ +Requires(preun): /sbin/new-kernel-pkg\ +Conflicts: %{kernel_dot_org_conflicts}\ +Conflicts: %{package_conflicts}\ +%{expand:%%{?kernel%{?1:_%{1}}_conflicts:Conflicts: %%{kernel%{?1:_%{1}}_conflicts}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_obsoletes:Obsoletes: %%{kernel%{?1:_%{1}}_obsoletes}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_provides:Provides: %%{kernel%{?1:_%{1}}_provides}}}\ +# We can't let RPM do the dependencies automatic because it'll then pick up\ +# a correct but undesirable perl dependency from the module headers which\ +# isn't required for the kernel proper to function\ +AutoReq: no\ +AutoProv: yes\ +%{nil} + +Name: kernel%{?variant} +Group: System Environment/Kernel +License: GPLv2 +URL: http://www.kernel.org/ +Version: %{rpmversion} +Release: %{pkg_release} +# DO NOT CHANGE THE 'ExclusiveArch' LINE TO TEMPORARILY EXCLUDE AN ARCHITECTURE BUILD. +# SET %%nobuildarches (ABOVE) INSTEAD +ExclusiveArch: noarch %{all_x86} x86_64 ppc ppc64 ia64 sparc sparc64 s390x alpha alphaev56 %{arm} +ExclusiveOS: Linux + +%kernel_reqprovconf +%ifarch x86_64 sparc64 +Obsoletes: kernel-smp +%endif + + +# +# List the packages used during the kernel build +# +BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, sh-utils, tar +BuildRequires: bzip2, findutils, gzip, m4, perl, make >= 3.78, diffutils, gawk +BuildRequires: gcc >= 3.4.2, binutils >= 2.12, redhat-rpm-config +BuildRequires: net-tools +BuildRequires: xmlto, asciidoc +%if %{with_sparse} +BuildRequires: sparse >= 0.4.1 +%endif +%if %{with_perftool} +BuildRequires: elfutils-libelf-devel zlib-devel binutils-devel +%endif +BuildConflicts: rhbuildsys(DiskFree) < 500Mb + +%define fancy_debuginfo 0 +%if %{with_debuginfo} +%if 0%{?fedora} >= 8 || 0%{?rhel} >= 6 +%define fancy_debuginfo 1 +%endif +%endif + +%if %{fancy_debuginfo} +# Fancy new debuginfo generation introduced in Fedora 8. +BuildRequires: rpm-build >= 4.4.2.1-4 +%define debuginfo_args --strict-build-id +%endif + +Source0: ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-%{kversion}.tar.bz2 + +Source11: genkey +Source14: find-provides +Source15: merge.pl + +Source20: Makefile.config +Source21: config-debug +Source22: config-nodebug +Source23: config-generic +Source24: config-rhel-generic + +Source30: config-x86-generic +Source31: config-i686-PAE + +Source40: config-x86_64-generic + +Source50: config-powerpc-generic +Source51: config-powerpc32-generic +Source52: config-powerpc32-smp +Source53: config-powerpc64 + +Source60: config-ia64-generic + +Source70: config-s390x + +Source90: config-sparc64-generic + +Source100: config-arm + +Source200: perf + +# Here should be only the patches up to the upstream canonical Linus tree. 
+ +# For a stable release kernel +%if 0%{?stable_update} +%if 0%{?stable_base} +%define stable_patch_00 patch-2.6.%{base_sublevel}.%{stable_base}.bz2 +Patch00: %{stable_patch_00} +%endif +%if 0%{?stable_rc} +%define stable_patch_01 patch-2.6.%{base_sublevel}.%{stable_update}-rc%{stable_rc}.bz2 +Patch01: %{stable_patch_01} +%endif + +# non-released_kernel case +# These are automagically defined by the rcrev and gitrev values set up +# near the top of this spec file. +%else +%if 0%{?rcrev} +Patch00: patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} +Patch01: patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +Patch00: patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif +%endif + +%if %{using_upstream_branch} +### BRANCH PATCH ### +%endif + +Patch02: git-linus.diff + +# we always need nonintconfig, even for -vanilla kernels +Patch03: linux-2.6-build-nonintconfig.patch + +# we also need compile fixes for -vanilla +Patch04: linux-2.6-compile-fixes.patch + +# build tweak for build ID magic, even for -vanilla +Patch05: linux-2.6-makefile-after_link.patch + +%if !%{nopatches} + +# revert upstream patches we get via other methods +Patch09: linux-2.6-upstream-reverts.patch +# Git trees. +Patch10: git-cpufreq.patch +Patch11: git-bluetooth.patch + +# Standalone patches +Patch20: linux-2.6-hotfixes.patch + +Patch21: linux-2.6-tracehook.patch +Patch22: linux-2.6-utrace.patch + +Patch30: sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +Patch31: disable-stackprotector-all.patch + +# Intel IOMMU fixes/workarounds +Patch100: linux-2.6-die-closed-source-bios-muppets-die.patch +Patch101: linux-2.6-intel-iommu-updates.patch +Patch102: linux-2.6-iommu-at-zero.patch +Patch103: linux-2.6-iommu-dmar-all-1s.patch +Patch104: linux-2.6-iommu-another-hp-screwup.patch +Patch105: linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +Patch106: linux-2.6-iommu-hp-cantiga-resume.patch + +Patch141: linux-2.6-ps3-storage-alias.patch +Patch143: linux-2.6-g5-therm-shutdown.patch +Patch144: linux-2.6-vio-modalias.patch +Patch147: linux-2.6-imac-transparent-bridge.patch + +Patch150: linux-2.6.29-sparc-IOC_TYPECHECK.patch + +Patch160: linux-2.6-execshield.patch + +Patch250: linux-2.6-debug-sizeof-structs.patch +Patch260: linux-2.6-debug-nmi-timeout.patch +Patch270: linux-2.6-debug-taint-vm.patch +Patch280: linux-2.6-debug-spinlock-taint.patch +Patch300: linux-2.6-driver-level-usb-autosuspend.diff +Patch302: linux-2.6-qcserial-autosuspend.diff +Patch303: linux-2.6-bluetooth-autosuspend.diff +Patch304: linux-2.6-usb-uvc-autosuspend.diff +Patch340: linux-2.6-debug-vm-would-have-oomkilled.patch +Patch360: linux-2.6-debug-always-inline-kzalloc.patch +Patch380: linux-2.6-defaults-pci_no_msi.patch +Patch381: linux-2.6-pciehp-update.patch +Patch382: linux-2.6-defaults-pciehp.patch +Patch383: linux-2.6-defaults-aspm.patch +Patch390: linux-2.6-defaults-acpi-video.patch +Patch391: linux-2.6-acpi-video-dos.patch +Patch450: linux-2.6-input-kill-stupid-messages.patch +Patch451: linux-2.6-input-fix-toshiba-hotkeys.patch +Patch452: linux-2.6.30-no-pcspkr-modalias.patch + +Patch460: linux-2.6-serial-460800.patch + +Patch470: die-floppy-die.patch + +Patch500: linux-2.6.31-copy_from_user-bounds.patch + +Patch510: linux-2.6-silence-noise.patch +Patch520: linux-2.6.30-hush-rom-warning.patch +Patch530: linux-2.6-silence-fbcon-logo.patch +Patch570: linux-2.6-selinux-mprotect-checks.patch +Patch580: 
linux-2.6-sparc-selinux-mprotect-checks.patch + +Patch600: linux-2.6-defaults-alsa-hda-beep-off.patch +Patch601: linux-2.6-alsa-improve-hda-powerdown.patch +Patch610: hda_intel-prealloc-4mb-dmabuffer.patch +Patch611: alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +Patch670: linux-2.6-ata-quirk.patch +Patch671: linux-2.6-ahci-export-capabilities.patch + +Patch680: prism54-remove-pci-dev-table.patch +Patch681: linux-2.6-ath9k-fixes.patch + +Patch800: linux-2.6-crash-driver.patch + +Patch900: linux-2.6-pci-cacheline-sizing.patch + +# ACPI +Patch1100: linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +Patch1110: acpi-ec-merge-irq-and-poll-modes.patch +Patch1120: acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +Patch1130: acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +Patch1515: lirc-2.6.31.patch +Patch1517: hdpvr-ir-enable.patch +Patch1518: hid-ignore-all-recent-imon-devices.patch + +# virt + ksm patches +Patch1550: linux-2.6-ksm.patch +Patch1551: linux-2.6-ksm-kvm.patch +Patch1552: linux-2.6-ksm-updates.patch +Patch1553: linux-2.6-ksm-fix-munlock.patch +Patch1554: linux-2.6-ksm-updates-from-32.patch +Patch1579: linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +Patch1583: linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +Patch1584: linux-2.6-xen-improvement-to-wait_for_devices.patch +Patch1585: linux-2.6-xen-increase-device-connection-timeout.patch +Patch1586: linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# nouveau + drm fixes +Patch1810: kms-offb-handoff.patch +Patch1812: drm-next-b390f944.patch +Patch1813: drm-radeon-pm.patch +Patch1814: drm-nouveau.patch +Patch1818: drm-i915-resume-force-mode.patch +# intel drm is all merged upstream +Patch1824: drm-intel-next.patch +Patch1825: drm-intel-pm.patch +Patch1826: drm-intel-no-tv-hotplug.patch +Patch1827: drm-i915-fix-tvmode-oops.patch +Patch1831: drm-conservative-fallback-modes.patch +Patch1832: drm-edid-retry.patch +Patch1834: drm-edid-header-fixup.patch +Patch1835: drm-default-mode.patch +Patch1837: drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +Patch1839: drm-radeon-misc-fixes.patch +Patch1840: drm-radeon-rv410-test-fix.patch + +# vga arb +Patch1900: linux-2.6-vga-arb.patch +Patch1901: drm-vga-arb.patch +Patch1902: drm-radeon-kms-arbiter-return-ignore.patch + +# make harmless fbcon debug less loud +Patch1903: fbcon-lower-debug.patch + +# kludge to make ich9 e1000 work +Patch2000: linux-2.6-e1000-ich9.patch + +# linux1394 git patches +Patch2200: linux-2.6-firewire-git-update.patch +Patch2201: linux-2.6-firewire-git-pending.patch + +# Quiet boot fixes +# silence the ACPI blacklist code +Patch2802: linux-2.6-silence-acpi-blacklist.patch + +Patch2899: linux-2.6-v4l-dvb-fixes.patch +Patch2900: linux-2.6-v4l-dvb-update.patch +Patch2901: linux-2.6-v4l-dvb-experimental.patch +Patch2904: v4l-dvb-fix-cx25840-firmware-loading.patch + +# fs fixes + +#btrfs +Patch3000: linux-2.6-btrfs-upstream.patch + +# NFSv4 +Patch3050: linux-2.6-nfsd4-proots.patch +Patch3060: linux-2.6-nfs4-ver4opt.patch +Patch3061: linux-2.6-nfs4-callback-hidden.patch + +# VIA Nano / VX8xx updates +Patch11010: via-hwmon-temp-sensor.patch + +# patches headed upstream +Patch12010: linux-2.6-dell-laptop-rfkill-fix.patch +Patch12011: linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch +Patch12012: linux-2.6-rtc-show-hctosys.patch +Patch12013: linux-2.6-rfkill-all.patch +Patch12014: linux-2.6-selinux-module-load-perms.patch + +# sched fixes cherry-picked from 
2.6.32 +Patch13100: sched-deal-with-low-load-in-wake-affine.patch +Patch13101: sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +Patch13102: sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +Patch13110: sched-retune-scheduler-latency-defaults.patch +# Fix huge wakeup latencies +Patch13120: sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +Patch14010: perf-make-perf-counter-h-available-to-userspace.patch + +# fix resource counter issues on *big* machines +Patch14101: improve-resource-counter-scalability.patch + +# fix perf for sysprof +Patch14420: perf-events-fix-swevent-hrtimer-sampling.patch +Patch14421: perf-events-dont-generate-events-for-the-idle-task.patch + +Patch14430: crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +Patch14451: tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +Patch14452: tg3-02-fix-tso-test-against-wrong-flags-var.patch +Patch14453: tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +Patch14454: tg3-04-prevent-tx-bd-corruption.patch +Patch14455: tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +Patch14456: tg3-06-fix-5906-transmit-hangs.patch + +Patch14460: highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +Patch14461: highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +Patch14462: highmem-fix-arm-powerpc-kmap_types.patch + +Patch14463: dlm-fix-connection-close-handling.patch + +# rhbz#544144 [bbf31bf18d34caa87dd01f08bf713635593697f2] +Patch14464: ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +%endif + +BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root + +%description +The kernel package contains the Linux kernel (vmlinuz), the core of any +Linux operating system. The kernel handles the basic functions +of the operating system: memory allocation, process allocation, device +input and output, etc. + + +%package doc +Summary: Various documentation bits found in the kernel source +Group: Documentation +%description doc +This package contains documentation files from the kernel +source. Various bits of information about the Linux kernel and the +device drivers shipped with it are documented in these files. + +You'll want to install this package if you need a reference to the +options that can be passed to Linux kernel modules at load time. + + +%package headers +Summary: Header files for the Linux kernel for use by glibc +Group: Development/System +Obsoletes: glibc-kernheaders +Provides: glibc-kernheaders = 3.0-46 +%description headers +Kernel-headers includes the C header files that specify the interface +between the Linux kernel and userspace libraries and programs. The +header files define structures and constants that are needed for +building most standard programs and are also needed for rebuilding the +glibc package. + +%package firmware +Summary: Firmware files used by the Linux kernel +Group: Development/System +# This is... complicated. +# Look at the WHENCE file. +License: GPL+ and GPLv2+ and MIT and Redistributable, no modification permitted +%if "x%{?variant}" != "x" +Provides: kernel-firmware = %{rpmversion}-%{pkg_release} +%endif +%description firmware +Kernel-firmware includes firmware files required for some devices to +operate. 
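+# A quick, illustrative way to sanity-check what these subpackages end up
+# carrying once built (file names below are placeholders, not real output):
+#   rpm -qlp kernel-firmware-<version>-<release>.rpm | head
+#   rpm -qp --provides kernel-headers-<version>-<release>.rpm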
+ +%package bootwrapper +Summary: Boot wrapper files for generating combined kernel + initrd images +Group: Development/System +Requires: gzip +%description bootwrapper +Kernel-bootwrapper contains the wrapper code which makes bootable "zImage" +files combining both kernel and initial ramdisk. + +%package debuginfo-common-%{_target_cpu} +Summary: Kernel source files used by %{name}-debuginfo packages +Group: Development/Debug +%description debuginfo-common-%{_target_cpu} +This package is required by %{name}-debuginfo subpackages. +It provides the kernel source files common to all builds. + +%package -n perf +Summary: Performance monitoring for the Linux kernel +Group: Development/System +License: GPLv2 +%description -n perf +This package provides the supporting documentation for the perf tool +shipped in each kernel image subpackage. + +# +# This macro creates a kernel--debuginfo package. +# %%kernel_debuginfo_package +# +%define kernel_debuginfo_package() \ +%package %{?1:%{1}-}debuginfo\ +Summary: Debug information for package %{name}%{?1:-%{1}}\ +Group: Development/Debug\ +Requires: %{name}-debuginfo-common-%{_target_cpu} = %{version}-%{release}\ +Provides: %{name}%{?1:-%{1}}-debuginfo-%{_target_cpu} = %{version}-%{release}\ +AutoReqProv: no\ +%description -n %{name}%{?1:-%{1}}-debuginfo\ +This package provides debug information for package %{name}%{?1:-%{1}}.\ +This is required to use SystemTap with %{name}%{?1:-%{1}}-%{KVERREL}.\ +%{expand:%%global debuginfo_args %{?debuginfo_args} -p '/.*/%%{KVERREL}%{?1:\.%{1}}/.*|/.*%%{KVERREL}%{?1:\.%{1}}(\.debug)?' -o debuginfo%{?1}.list}\ +%{nil} + +# +# This macro creates a kernel--devel package. +# %%kernel_devel_package +# +%define kernel_devel_package() \ +%package %{?1:%{1}-}devel\ +Summary: Development package for building kernel modules to match the %{?2:%{2} }kernel\ +Group: System Environment/Kernel\ +Provides: kernel%{?1:-%{1}}-devel-%{_target_cpu} = %{version}-%{release}\ +Provides: kernel-devel-%{_target_cpu} = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel-uname-r = %{KVERREL}%{?1:.%{1}}\ +AutoReqProv: no\ +Requires(pre): /usr/bin/find\ +%description -n kernel%{?variant}%{?1:-%{1}}-devel\ +This package provides kernel headers and makefiles sufficient to build modules\ +against the %{?2:%{2} }kernel package.\ +%{nil} + +# +# This macro creates a kernel- and its -devel and -debuginfo too. +# %%define variant_summary The Linux kernel compiled for +# %%kernel_variant_package [-n ] +# +%define kernel_variant_package(n:) \ +%package %1\ +Summary: %{variant_summary}\ +Group: System Environment/Kernel\ +%kernel_reqprovconf\ +%{expand:%%kernel_devel_package %1 %{!?-n:%1}%{?-n:%{-n*}}}\ +%{expand:%%kernel_debuginfo_package %1}\ +%{nil} + + +# First the auxiliary packages of the main kernel package. +%kernel_devel_package +%kernel_debuginfo_package + + +# Now, each variant package. + +%define variant_summary The Linux kernel compiled for SMP machines +%kernel_variant_package -n SMP smp +%description smp +This package includes a SMP version of the Linux kernel. It is +required only on machines with two or more CPUs as well as machines with +hyperthreading technology. + +Install the kernel-smp package if your machine uses two or more CPUs. + + +%define variant_summary The Linux kernel compiled for PAE capable machines +%kernel_variant_package PAE +%description PAE +This package includes a version of the Linux kernel with support for up to +64GB of high memory. 
It requires a CPU with Physical Address Extensions (PAE).
+The non-PAE kernel can only address up to 4GB of memory.
+Install the kernel-PAE package if your machine has more than 4GB of memory.
+
+
+%define variant_summary The Linux kernel compiled with extra debugging enabled for PAE capable machines
+%kernel_variant_package PAEdebug
+Obsoletes: kernel-PAE-debug
+%description PAEdebug
+This package includes a version of the Linux kernel with support for up to
+64GB of high memory. It requires a CPU with Physical Address Extensions (PAE).
+The non-PAE kernel can only address up to 4GB of memory.
+Install the kernel-PAE package if your machine has more than 4GB of memory.
+
+This variant of the kernel has numerous debugging options enabled.
+It should only be installed when trying to gather additional information
+on kernel bugs, as some of these options impact performance noticeably.
+
+
+%define variant_summary The Linux kernel compiled with extra debugging enabled
+%kernel_variant_package debug
+%description debug
+The kernel package contains the Linux kernel (vmlinuz), the core of any
+Linux operating system. The kernel handles the basic functions
+of the operating system: memory allocation, process allocation, device
+input and output, etc.
+
+This variant of the kernel has numerous debugging options enabled.
+It should only be installed when trying to gather additional information
+on kernel bugs, as some of these options impact performance noticeably.
+
+
+%define variant_summary A minimal Linux kernel compiled for crash dumps
+%kernel_variant_package kdump
+%description kdump
+This package includes a kdump version of the Linux kernel. It is
+required only on machines which will use the kexec-based kernel crash dump
+mechanism.
+
+
+%prep
+# do a few sanity-checks for --with *only builds
+%if %{with_baseonly}
+%if !%{with_up}%{with_pae}
+echo "Cannot build --with baseonly, up build is disabled"
+exit 1
+%endif
+%endif
+
+%if %{with_smponly}
+%if !%{with_smp}
+echo "Cannot build --with smponly, smp build is disabled"
+exit 1
+%endif
+%endif
+
+# more sanity checking; do it quietly
+if [ "%{patches}" != "%%{patches}" ] ; then
+  for patch in %{patches} ; do
+    if [ ! -f $patch ] ; then
+      echo "ERROR: Patch ${patch##/*/} listed in specfile but is missing"
+      exit 1
+    fi
+  done
+fi 2>/dev/null
+
+patch_command='patch -p1 -F1 -s'
+ApplyPatch()
+{
+  local patch=$1
+  shift
+  if [ ! -f $RPM_SOURCE_DIR/$patch ]; then
+    exit 1
+  fi
+  if ! egrep "^Patch[0-9]+: $patch\$" %{_specdir}/${RPM_PACKAGE_NAME%%%%%{?variant}}.spec ; then
+    if [ "${patch:0:10}" != "patch-2.6." ] ; then
+      echo "ERROR: Patch $patch not listed as a source patch in specfile"
+      exit 1
+    fi
+  fi 2>/dev/null
+  case "$patch" in
+  *.bz2) bunzip2 < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;;
+  *.gz) gunzip < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;;
+  *) $patch_command ${1+"$@"} < "$RPM_SOURCE_DIR/$patch" ;;
+  esac
+}
+
+# don't apply a patch if it's empty (or nearly so: fewer than 10 lines)
+ApplyOptionalPatch()
+{
+  local patch=$1
+  shift
+  if [ ! -f $RPM_SOURCE_DIR/$patch ]; then
+    exit 1
+  fi
+  local C=$(wc -l $RPM_SOURCE_DIR/$patch | awk '{print $1}')
+  if [ "$C" -gt 9 ]; then
+    ApplyPatch $patch ${1+"$@"}
+  fi
+}
+
+# we don't want a .config file when building firmware: it just confuses the build system
+%define build_firmware \
+    mv .config .config.firmware_save \
+    make INSTALL_FW_PATH=$RPM_BUILD_ROOT/lib/firmware firmware_install \
+    mv .config.firmware_save .config
+
+# First we unpack the kernel tarball.
+# If this isn't the first make prep, we use links to the existing clean tarball +# which speeds things up quite a bit. + +# Update to latest upstream. +%if 0%{?released_kernel} +%define vanillaversion 2.6.%{base_sublevel} +# non-released_kernel case +%else +%if 0%{?rcrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev} +%if 0%{?gitrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev} +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +%define vanillaversion 2.6.%{base_sublevel}-git%{gitrev} +%endif +%endif +%endif + +# We can share hardlinked source trees by putting a list of +# directory names of the CVS checkouts that we want to share +# with in .shared-srctree. (Full pathnames are required.) +[ -f .shared-srctree ] && sharedirs=$(cat .shared-srctree) + +if [ ! -d kernel-%{kversion}/vanilla-%{vanillaversion} ]; then + + if [ -d kernel-%{kversion}/vanilla-%{kversion} ]; then + + cd kernel-%{kversion} + + # Any vanilla-* directories other than the base one are stale. + for dir in vanilla-*; do + [ "$dir" = vanilla-%{kversion} ] || rm -rf $dir & + done + + else + + # Ok, first time we do a make prep. + rm -f pax_global_header + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then +%setup -q -n kernel-%{kversion} -c -T + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{kversion} . + else +%setup -q -n kernel-%{kversion} -c + mv linux-%{kversion} vanilla-%{kversion} + fi + + fi + +%if "%{kversion}" != "%{vanillaversion}" + + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} . + + else + + cp -rl vanilla-%{kversion} vanilla-%{vanillaversion} + cd vanilla-%{vanillaversion} + +# Update vanilla to the latest upstream. +# (non-released_kernel case only) +%if 0%{?rcrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif + + cd .. + + fi + +%endif + +else + # We already have a vanilla dir. + cd kernel-%{kversion} +fi + +if [ -d linux-%{kversion}.%{_target_cpu} ]; then + # Just in case we ctrl-c'd a prep already + rm -rf deleteme.%{_target_cpu} + # Move away the stale away, and delete in background. + mv linux-%{kversion}.%{_target_cpu} deleteme.%{_target_cpu} + rm -rf deleteme.%{_target_cpu} & +fi + +cp -rl vanilla-%{vanillaversion} linux-%{kversion}.%{_target_cpu} + +cd linux-%{kversion}.%{_target_cpu} + +# released_kernel with possible stable updates +%if 0%{?stable_base} +ApplyPatch %{stable_patch_00} +%endif +%if 0%{?stable_rc} +ApplyPatch %{stable_patch_01} +%endif + +%if %{using_upstream_branch} +### BRANCH APPLY ### +%endif + +# Drop some necessary files from the source dir into the buildroot +cp $RPM_SOURCE_DIR/config-* . +cp %{SOURCE15} . 
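+# The .shared-srctree handling above can be exercised roughly like this
+# (the path is only an example, not something this spec requires):
+#   echo /home/builder/other-kernel-checkout > .shared-srctree
+# If that checkout already holds kernel-%{kversion}/vanilla-%{kversion},
+# the tree is reused via "cp -rl", i.e. hardlinked rather than re-extracted,
+# which is what keeps repeated "make prep" runs fast.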
+ +# Dynamically generate kernel .config files from config-* files +make -f %{SOURCE20} VERSION=%{version} configs + +#if a rhel kernel, apply the rhel config options +%if 0%{?rhel} + for i in %{all_arch_configs} + do + mv $i $i.tmp + ./merge.pl config-rhel-generic $i.tmp > $i + rm $i.tmp + done +%endif + +#ApplyOptionalPatch git-linus.diff + +# This patch adds a "make nonint_oldconfig" which is non-interactive and +# also gives a list of missing options at the end. Useful for automated +# builds (as used in the buildsystem). +ApplyPatch linux-2.6-build-nonintconfig.patch + +ApplyPatch linux-2.6-makefile-after_link.patch + +# +# misc small stuff to make things compile +# +ApplyOptionalPatch linux-2.6-compile-fixes.patch + +%if !%{nopatches} + +# revert patches from upstream that conflict or that we get via other means +ApplyOptionalPatch linux-2.6-upstream-reverts.patch -R + +ApplyOptionalPatch git-cpufreq.patch +#ApplyOptionalPatch git-bluetooth.patch + +ApplyPatch linux-2.6-hotfixes.patch + +# Roland's utrace ptrace replacement. +ApplyPatch linux-2.6-tracehook.patch +ApplyPatch linux-2.6-utrace.patch + +ApplyPatch sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +ApplyPatch disable-stackprotector-all.patch + +# Architecture patches +# x86(-64) +ApplyPatch via-hwmon-temp-sensor.patch +ApplyPatch linux-2.6-dell-laptop-rfkill-fix.patch + +# +# Intel IOMMU +# +# Quiesce USB host controllers before setting up the IOMMU +ApplyPatch linux-2.6-die-closed-source-bios-muppets-die.patch +# Some performance fixes, unify hardware/software passthrough support, and +# most importantly: notice when the BIOS points us to a region that returns +# all 0xFF, and claims that there's an IOMMU there. +ApplyPatch linux-2.6-intel-iommu-updates.patch +ApplyPatch linux-2.6-iommu-at-zero.patch +ApplyPatch linux-2.6-iommu-dmar-all-1s.patch +# Check for RMRRs which end before they start +ApplyPatch linux-2.6-iommu-another-hp-screwup.patch +# Apply the 'at zero' and 'all 0xFF' sanity checks for intr_remap too +ApplyPatch linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +# Fix up MMIO BAR for integrated graphics on HP laptops on resume (#536675) +ApplyPatch linux-2.6-iommu-hp-cantiga-resume.patch + +# +# PowerPC +# +### NOT (YET) UPSTREAM: +# The storage alias patch is Fedora-local, and allows the old 'ps3_storage' +# module name to work on upgrades. 
Otherwise, I believe mkinitrd will fail +# to pull the module in, +ApplyPatch linux-2.6-ps3-storage-alias.patch +# Alleviate G5 thermal shutdown problems +ApplyPatch linux-2.6-g5-therm-shutdown.patch +# Provide modalias in sysfs for vio devices +ApplyPatch linux-2.6-vio-modalias.patch +# Work around PCIe bridge setup on iSight +ApplyPatch linux-2.6-imac-transparent-bridge.patch + +# +# SPARC64 +# +ApplyPatch linux-2.6.29-sparc-IOC_TYPECHECK.patch + +# +# Exec shield +# +ApplyPatch linux-2.6-execshield.patch + +# +# bugfixes to drivers and filesystems +# + +# ext4 + +# xfs + +# btrfs +ApplyPatch linux-2.6-btrfs-upstream.patch + +# eCryptfs + +# NFSv4 +ApplyPatch linux-2.6-nfsd4-proots.patch +ApplyPatch linux-2.6-nfs4-ver4opt.patch +ApplyPatch linux-2.6-nfs4-callback-hidden.patch + +# USB +ApplyPatch linux-2.6-driver-level-usb-autosuspend.diff +ApplyPatch linux-2.6-qcserial-autosuspend.diff +ApplyPatch linux-2.6-bluetooth-autosuspend.diff +ApplyPatch linux-2.6-usb-uvc-autosuspend.diff + +# ACPI +ApplyPatch linux-2.6-defaults-acpi-video.patch +ApplyPatch linux-2.6-acpi-video-dos.patch +# cpuidle: Fix the menu governor to boost IO performance +ApplyPatch linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +ApplyPatch acpi-ec-merge-irq-and-poll-modes.patch +ApplyPatch acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +ApplyPatch acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +# Various low-impact patches to aid debugging. +ApplyPatch linux-2.6-debug-sizeof-structs.patch +ApplyPatch linux-2.6-debug-nmi-timeout.patch +ApplyPatch linux-2.6-debug-taint-vm.patch +ApplyPatch linux-2.6-debug-spinlock-taint.patch +ApplyPatch linux-2.6-debug-vm-would-have-oomkilled.patch +ApplyPatch linux-2.6-debug-always-inline-kzalloc.patch + +# +# PCI +# +# disable message signaled interrupts +ApplyPatch linux-2.6-defaults-pci_no_msi.patch +# update the pciehp driver +#ApplyPatch linux-2.6-pciehp-update.patch +# default to enabling passively listening for hotplug events +#ApplyPatch linux-2.6-defaults-pciehp.patch +# enable ASPM by default on hardware we expect to work +ApplyPatch linux-2.6-defaults-aspm.patch + +# +# SCSI Bits. +# + +# ALSA +# squelch hda_beep by default +ApplyPatch linux-2.6-defaults-alsa-hda-beep-off.patch +ApplyPatch linux-2.6-alsa-improve-hda-powerdown.patch +ApplyPatch hda_intel-prealloc-4mb-dmabuffer.patch +ApplyPatch alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +# Networking + +# Misc fixes +# The input layer spews crap no-one cares about. +ApplyPatch linux-2.6-input-kill-stupid-messages.patch + +# stop floppy.ko from autoloading during udev... 
+ApplyPatch die-floppy-die.patch + +# make copy_from_user to a stack slot provable right +# hosed stuff, just drop this close to beta +#ApplyPatch linux-2.6.31-copy_from_user-bounds.patch + +# Get away from having to poll Toshibas +#ApplyPatch linux-2.6-input-fix-toshiba-hotkeys.patch + +ApplyPatch linux-2.6.30-no-pcspkr-modalias.patch + +# Allow to use 480600 baud on 16C950 UARTs +ApplyPatch linux-2.6-serial-460800.patch + +# Silence some useless messages that still get printed with 'quiet' +ApplyPatch linux-2.6-silence-noise.patch +ApplyPatch linux-2.6.30-hush-rom-warning.patch + +# Make fbcon not show the penguins with 'quiet' +ApplyPatch linux-2.6-silence-fbcon-logo.patch + +# Fix the SELinux mprotect checks on executable mappings +#ApplyPatch linux-2.6-selinux-mprotect-checks.patch +# Fix SELinux for sparc +#ApplyPatch linux-2.6-sparc-selinux-mprotect-checks.patch + +# Changes to upstream defaults. + + +# ia64 ata quirk +ApplyPatch linux-2.6-ata-quirk.patch + +# Make it possible to identify non-hotplug SATA ports +ApplyPatch linux-2.6-ahci-export-capabilities.patch + +# prism54: remove pci modinfo device table +ApplyPatch prism54-remove-pci-dev-table.patch + +# ath9k: add fixes suggested by upstream maintainer +ApplyPatch linux-2.6-ath9k-fixes.patch + +# /dev/crash driver. +ApplyPatch linux-2.6-crash-driver.patch + +# Determine cacheline sizes in a generic manner. +ApplyPatch linux-2.6-pci-cacheline-sizing.patch + +# http://www.lirc.org/ +ApplyPatch lirc-2.6.31.patch +# enable IR receiver on Hauppauge HD PVR (v4l-dvb merge pending) +ApplyPatch hdpvr-ir-enable.patch +# tell usbhid to ignore all imon devices (sent upstream 2009.07.31) +ApplyPatch hid-ignore-all-recent-imon-devices.patch + +# Add kernel KSM support +ApplyPatch linux-2.6-ksm.patch +ApplyPatch linux-2.6-ksm-updates.patch +ApplyPatch linux-2.6-ksm-fix-munlock.patch +ApplyPatch linux-2.6-ksm-updates-from-32.patch +# Optimize KVM for KSM support +ApplyPatch linux-2.6-ksm-kvm.patch + +# Assorted Virt Fixes +ApplyPatch linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +ApplyPatch linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +ApplyPatch linux-2.6-xen-improvement-to-wait_for_devices.patch +ApplyPatch linux-2.6-xen-increase-device-connection-timeout.patch +ApplyPatch linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# Fix block I/O errors in KVM +ApplyPatch linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch + +ApplyPatch linux-2.6-e1000-ich9.patch + +# Nouveau DRM + drm fixes +ApplyPatch kms-offb-handoff.patch +ApplyPatch drm-next-b390f944.patch +ApplyPatch drm-radeon-misc-fixes.patch +ApplyPatch drm-radeon-rv410-test-fix.patch +ApplyPatch drm-conservative-fallback-modes.patch +ApplyPatch drm-edid-retry.patch +ApplyPatch drm-edid-header-fixup.patch +ApplyPatch drm-default-mode.patch + +ApplyPatch drm-nouveau.patch +# pm broken on my thinkpad t60p - airlied +#ApplyPatch drm-radeon-pm.patch +ApplyPatch drm-i915-resume-force-mode.patch +ApplyOptionalPatch drm-intel-next.patch +#this appears to be upstream - mjg59? 
+#ApplyPatch drm-intel-pm.patch +ApplyPatch drm-intel-no-tv-hotplug.patch +ApplyPatch drm-i915-fix-tvmode-oops.patch +ApplyPatch drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +#ApplyPatch drm-disable-r600-aspm.patch + +# VGA arb + drm +ApplyPatch linux-2.6-vga-arb.patch +ApplyPatch drm-vga-arb.patch +ApplyPatch drm-radeon-kms-arbiter-return-ignore.patch + +# Lower debug level of fbcon handover messages (rh#538526) +ApplyPatch fbcon-lower-debug.patch + +# linux1394 git patches +# apply if non-empty +ApplyOptionalPatch linux-2.6-firewire-git-update.patch +ApplyOptionalPatch linux-2.6-firewire-git-pending.patch + +# silence the ACPI blacklist code +ApplyPatch linux-2.6-silence-acpi-blacklist.patch + +# V4L/DVB updates/fixes/experimental drivers +# apply if non-empty +ApplyOptionalPatch linux-2.6-v4l-dvb-fixes.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-update.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-experimental.patch + +ApplyPatch v4l-dvb-fix-cx25840-firmware-loading.patch + +# Patches headed upstream +ApplyPatch linux-2.6-rtc-show-hctosys.patch +ApplyPatch linux-2.6-rfkill-all.patch +ApplyPatch linux-2.6-selinux-module-load-perms.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +ApplyPatch perf-make-perf-counter-h-available-to-userspace.patch + +ApplyPatch improve-resource-counter-scalability.patch + +# fix perf for sysprof +ApplyPatch perf-events-fix-swevent-hrtimer-sampling.patch +ApplyPatch perf-events-dont-generate-events-for-the-idle-task.patch + +# Fix oops in padlock +ApplyPatch crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +ApplyPatch tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +ApplyPatch tg3-02-fix-tso-test-against-wrong-flags-var.patch +ApplyPatch tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +ApplyPatch tg3-04-prevent-tx-bd-corruption.patch +ApplyPatch tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +ApplyPatch tg3-06-fix-5906-transmit-hangs.patch + +# sched fixes cherry-picked from 2.6.32 +ApplyPatch sched-deal-with-low-load-in-wake-affine.patch +ApplyPatch sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +ApplyPatch sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +ApplyPatch sched-retune-scheduler-latency-defaults.patch +# fix wakeup latency +ApplyPatch sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +ApplyPatch highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +ApplyPatch highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +ApplyPatch highmem-fix-arm-powerpc-kmap_types.patch + +ApplyPatch dlm-fix-connection-close-handling.patch + +# rhbz#544144 +ApplyPatch ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +# END OF PATCH APPLICATIONS + +%endif + +# Any further pre-build tree manipulations happen here. 
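+# For reference, the per-arch config handling below relies on each generated
+# kernel-%{version}-<arch>[-<variant>].config starting with a comment line
+# that names its arch, e.g. (hypothetical snippet):
+#   # x86_64
+#   CONFIG_64BIT=y
+# "head -1 .config | cut -b 3-" recovers that arch for "make oldconfig", and
+# the same "# <arch>" header is re-emitted at the top of the copy saved
+# under configs/.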
+ +chmod +x scripts/checkpatch.pl + +# only deal with configs if we are going to build for the arch +%ifnarch %nobuildarches + +mkdir configs + +# Remove configs not for the buildarch +for cfg in kernel-%{version}-*.config; do + if [ `echo %{all_arch_configs} | grep -c $cfg` -eq 0 ]; then + rm -f $cfg + fi +done + +%if !%{debugbuildsenabled} +rm -f kernel-%{version}-*debug.config +%endif + +# now run oldconfig over all the config files +for i in *.config +do + mv $i .config + Arch=`head -1 .config | cut -b 3-` + make ARCH=$Arch %{oldconfig_target} + echo "# $Arch" > configs/$i + cat .config >> configs/$i +done +# end of kernel config +%endif + +# get rid of unwanted files resulting from patch fuzz +find . \( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null + +cd .. + +### +### build +### +%build + +%if %{with_sparse} +%define sparse_mflags C=1 +%endif + +%if %{fancy_debuginfo} +# This override tweaks the kernel makefiles so that we run debugedit on an +# object before embedding it. When we later run find-debuginfo.sh, it will +# run debugedit again. The edits it does change the build ID bits embedded +# in the stripped object, but repeating debugedit is a no-op. We do it +# beforehand to get the proper final build ID bits into the embedded image. +# This affects the vDSO images in vmlinux, and the vmlinux image in bzImage. +export AFTER_LINK=\ +'sh -xc "/usr/lib/rpm/debugedit -b $$RPM_BUILD_DIR -d /usr/src/debug -i $@"' +%endif + +cp_vmlinux() +{ + eu-strip --remove-comment -o "$2" "$1" +} + +BuildKernel() { + MakeTarget=$1 + KernelImage=$2 + Flavour=$3 + InstallName=${4:-vmlinuz} + + # Pick the right config file for the kernel we're building + Config=kernel-%{version}-%{_target_cpu}${Flavour:+-${Flavour}}.config + DevelDir=/usr/src/kernels/%{KVERREL}${Flavour:+.${Flavour}} + + # When the bootable image is just the ELF kernel, strip it. + # We already copy the unstripped file into the debuginfo package. + if [ "$KernelImage" = vmlinux ]; then + CopyKernel=cp_vmlinux + else + CopyKernel=cp + fi + + KernelVer=%{version}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}} + echo BUILDING A KERNEL FOR ${Flavour} %{_target_cpu}... + + # make sure EXTRAVERSION says what we want it to say + perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{?stablerev}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}}/" Makefile + + # if pre-rc1 devel kernel, must fix up SUBLEVEL for our versioning scheme + %if !0%{?rcrev} + %if 0%{?gitrev} + perl -p -i -e 's/^SUBLEVEL.*/SUBLEVEL = %{upstream_sublevel}/' Makefile + %endif + %endif + + # and now to start the build process + + make -s mrproper + cp configs/$Config .config + + Arch=`head -1 .config | cut -b 3-` + echo USING ARCH=$Arch + + make -s ARCH=$Arch %{oldconfig_target} > /dev/null + make -s ARCH=$Arch V=1 %{?_smp_mflags} $MakeTarget %{?sparse_mflags} + make -s ARCH=$Arch V=1 %{?_smp_mflags} modules %{?sparse_mflags} || exit 1 + +%if %{with_perftool} + pushd tools/perf +# make sure the scripts are executable... 
won't be in tarball until 2.6.31 :/ + chmod +x util/generate-cmdlist.sh util/PERF-VERSION-GEN + make -s V=1 %{?_smp_mflags} perf + mkdir -p $RPM_BUILD_ROOT/usr/libexec/ + install -m 755 perf $RPM_BUILD_ROOT/usr/libexec/perf.$KernelVer + popd +%endif + + # Start installing the results +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/boot + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/%{image_install_path} +%endif + mkdir -p $RPM_BUILD_ROOT/%{image_install_path} + install -m 644 .config $RPM_BUILD_ROOT/boot/config-$KernelVer + install -m 644 System.map $RPM_BUILD_ROOT/boot/System.map-$KernelVer +%if %{with_dracut} + # We estimate the size of the initramfs because rpm needs to take this size + # into consideration when performing disk space calculations. (See bz #530778) + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initramfs-$KernelVer.img bs=1M count=20 +%else + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initrd-$KernelVer.img bs=1M count=5 +%endif + if [ -f arch/$Arch/boot/zImage.stub ]; then + cp arch/$Arch/boot/zImage.stub $RPM_BUILD_ROOT/%{image_install_path}/zImage.stub-$KernelVer || : + fi + $CopyKernel $KernelImage \ + $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + chmod 755 $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer + # Override $(mod-fw) because we don't want it to install any firmware + # We'll do that ourselves with 'make firmware_install' + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT modules_install KERNELRELEASE=$KernelVer mod-fw= +%ifarch %{vdso_arches} + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT vdso_install KERNELRELEASE=$KernelVer + if grep '^CONFIG_XEN=y$' .config >/dev/null; then + echo > ldconfig-kernel.conf "\ +# This directive teaches ldconfig to search in nosegneg subdirectories +# and cache the DSOs there with extra bit 0 set in their hwcap match +# fields. In Xen guest kernels, the vDSO tells the dynamic linker to +# search in nosegneg subdirectories and to match this extra hwcap bit +# in the ld.so.cache file. +hwcap 0 nosegneg" + fi + if [ ! -s ldconfig-kernel.conf ]; then + echo > ldconfig-kernel.conf "\ +# Placeholder file, no vDSO hwcap entries used in this kernel." 
+ fi + %{__install} -D -m 444 ldconfig-kernel.conf \ + $RPM_BUILD_ROOT/etc/ld.so.conf.d/kernel-$KernelVer.conf +%endif + + # And save the headers/makefiles etc for building modules against + # + # This all looks scary, but the end result is supposed to be: + # * all arch relevant include/ files + # * all Makefile/Kconfig files + # * all script/ files + + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/source + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + (cd $RPM_BUILD_ROOT/lib/modules/$KernelVer ; ln -s build source) + # dirs for additional modules per module-init-tools, kbuild/modules.txt + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/extra + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/updates + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/weak-updates + # first copy everything + cp --parents `find -type f -name "Makefile*" -o -name "Kconfig*"` $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp Module.symvers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp System.map $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -s Module.markers ]; then + cp Module.markers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + fi + # then drop all but the needed Makefiles/Kconfig files + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Documentation + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cp .config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp -a scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -d arch/$Arch/scripts ]; then + cp -a arch/$Arch/scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch} || : + fi + if [ -f arch/$Arch/*lds ]; then + cp -a arch/$Arch/*lds $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch}/ || : + fi + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*.o + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*/*.o +%ifarch ppc + cp -a --parents arch/powerpc/lib/crtsavres.[So] $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ +%endif + if [ -d arch/%{asmarch}/include ]; then + cp -a --parents arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ + fi + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cd include + cp -a acpi config crypto keys linux math-emu media mtd net pcmcia rdma rxrpc scsi sound trace video drm asm-generic $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + asmdir=$(readlink asm) + cp -a $asmdir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/ + pushd $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + ln -s $asmdir asm + popd + # Make sure the Makefile and version.h have a matching timestamp so that + # external modules can be built + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Makefile $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/version.h + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/autoconf.h + # Copy .config to include/config/auto.conf so "make prepare" is unnecessary. + cp $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/config/auto.conf + cd .. 
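+  # With the build/ tree assembled above, an external module would typically
+  # be compiled against it roughly like this (illustration only, nothing in
+  # this spec runs it):
+  #   make -C /lib/modules/$KernelVer/build M=/path/to/module modules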
+ + # + # save the vmlinux file for kernel debugging into the kernel-debuginfo rpm + # +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer + cp vmlinux $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer +%endif + + find $RPM_BUILD_ROOT/lib/modules/$KernelVer -name "*.ko" -type f >modnames + + # mark modules executable so that strip-to-file can strip them + xargs --no-run-if-empty chmod u+x < modnames + + # Generate a list of modules for block and networking. + + fgrep /drivers/ modnames | xargs --no-run-if-empty nm -upA | + sed -n 's,^.*/\([^/]*\.ko\): *U \(.*\)$,\1 \2,p' > drivers.undef + + collect_modules_list() + { + sed -r -n -e "s/^([^ ]+) \\.?($2)\$/\\1/p" drivers.undef | + LC_ALL=C sort -u > $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$1 + } + + collect_modules_list networking \ + 'register_netdev|ieee80211_register_hw|usbnet_probe' + collect_modules_list block \ + 'ata_scsi_ioctl|scsi_add_host|blk_init_queue|register_mtd_blktrans|scsi_esp_register|scsi_register_device_handler' + collect_modules_list drm \ + 'drm_open|drm_init' + collect_modules_list modesetting \ + 'drm_crtc_init' + + # detect missing or incorrect license tags + rm -f modinfo + while read i + do + echo -n "${i#$RPM_BUILD_ROOT/lib/modules/$KernelVer/} " >> modinfo + /sbin/modinfo -l $i >> modinfo + done < modnames + + egrep -v \ + 'GPL( v2)?$|Dual BSD/GPL$|Dual MPL/GPL$|GPL and additional rights$' \ + modinfo && exit 1 + + rm -f modinfo modnames + + # remove files that will be auto generated by depmod at rpm -i time + for i in alias alias.bin ccwmap dep dep.bin ieee1394map inputmap isapnpmap ofmap pcimap seriomap symbols symbols.bin usbmap + do + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$i + done + + # Move the devel headers out of the root file system + mkdir -p $RPM_BUILD_ROOT/usr/src/kernels + mv $RPM_BUILD_ROOT/lib/modules/$KernelVer/build $RPM_BUILD_ROOT/$DevelDir + ln -sf ../../..$DevelDir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build +} + +### +# DO it... +### + +# prepare directories +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT/boot + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_debug} +BuildKernel %make_target %kernel_image debug +%endif + +%if %{with_pae_debug} +BuildKernel %make_target %kernel_image PAEdebug +%endif + +%if %{with_pae} +BuildKernel %make_target %kernel_image PAE +%endif + +%if %{with_up} +BuildKernel %make_target %kernel_image +%endif + +%if %{with_smp} +BuildKernel %make_target %kernel_image smp +%endif + +%if %{with_kdump} +BuildKernel vmlinux vmlinux kdump vmlinux +%endif + +%if %{with_doc} +# Make the HTML and man pages. +# XXX nix %{?_smp_mflags} here, buggy Documentation/*/Makefile! +make htmldocs mandocs || %{doc_build_fail} + +# sometimes non-world-readable files sneak into the kernel source tree +chmod -R a=rX Documentation +find Documentation -type d | xargs chmod u+w +%endif + +%if %{with_perf} +pushd tools/perf +make %{?_smp_mflags} man || %{doc_build_fail} +popd +%endif + +### +### Special hacks for debuginfo subpackages. +### + +# This macro is used by %%install, so we must redefine it before that. 
+%define debug_package %{nil} + +%if %{fancy_debuginfo} +%define __debug_install_post \ + /usr/lib/rpm/find-debuginfo.sh %{debuginfo_args} %{_builddir}/%{?buildsubdir}\ +%{nil} +%endif + +%if %{with_debuginfo} +%ifnarch noarch +%global __debug_package 1 +%files -f debugfiles.list debuginfo-common-%{_target_cpu} +%defattr(-,root,root) +%endif +%endif + +### +### install +### + +%install + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_doc} +docdir=$RPM_BUILD_ROOT%{_datadir}/doc/kernel-doc-%{rpmversion} +man9dir=$RPM_BUILD_ROOT%{_datadir}/man/man9 + +# copy the source over +mkdir -p $docdir +tar -f - --exclude=man --exclude='.*' -c Documentation | tar xf - -C $docdir + +# Install man pages for the kernel API. +mkdir -p $man9dir +find Documentation/DocBook/man -name '*.9.gz' -print0 | +xargs -0 --no-run-if-empty %{__install} -m 444 -t $man9dir $m +ls $man9dir | grep -q '' || > $man9dir/BROKEN +%endif # with_doc + +# perf docs +%if %{with_perf} +mandir=$RPM_BUILD_ROOT%{_datadir}/man +man1dir=$mandir/man1 +pushd tools/perf/Documentation +make install-man mandir=$mandir +popd + +pushd $man1dir +for d in *.1; do + gzip $d; +done +popd +%endif # with_perf + +# perf shell wrapper +%if %{with_perf} +mkdir -p $RPM_BUILD_ROOT/usr/sbin/ +cp $RPM_SOURCE_DIR/perf $RPM_BUILD_ROOT/usr/sbin/perf +chmod 0755 $RPM_BUILD_ROOT/usr/sbin/perf +mkdir -p $RPM_BUILD_ROOT%{_datadir}/doc/perf +%endif + +%if %{with_headers} +# Install kernel headers +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_install + +# Do headers_check but don't die if it fails. +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_check \ + > hdrwarnings.txt || : +if grep -q exist hdrwarnings.txt; then + sed s:^$RPM_BUILD_ROOT/usr/include/:: hdrwarnings.txt + # Temporarily cause a build failure if header inconsistencies. + # exit 1 +fi + +find $RPM_BUILD_ROOT/usr/include \ + \( -name .install -o -name .check -o \ + -name ..install.cmd -o -name ..check.cmd \) | xargs rm -f + +# glibc provides scsi headers for itself, for now +rm -rf $RPM_BUILD_ROOT/usr/include/scsi +rm -f $RPM_BUILD_ROOT/usr/include/asm*/atomic.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/io.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/irq.h +%endif + +%if %{with_firmware} +%{build_firmware} +%endif + +%if %{with_bootwrapper} +make DESTDIR=$RPM_BUILD_ROOT bootwrapper_install WRAPPER_OBJDIR=%{_libdir}/kernel-wrapper WRAPPER_DTSDIR=%{_libdir}/kernel-wrapper/dts +%endif + + +### +### clean +### + +%clean +rm -rf $RPM_BUILD_ROOT + +### +### scripts +### + +# +# This macro defines a %%post script for a kernel*-devel package. +# %%kernel_devel_post [] +# +%define kernel_devel_post() \ +%{expand:%%post %{?1:%{1}-}devel}\ +if [ -f /etc/sysconfig/kernel ]\ +then\ + . /etc/sysconfig/kernel || exit $?\ +fi\ +if [ "$HARDLINK" != "no" -a -x /usr/sbin/hardlink ]\ +then\ + (cd /usr/src/kernels/%{KVERREL}%{?1:.%{1}} &&\ + /usr/bin/find . -type f | while read f; do\ + hardlink -c /usr/src/kernels/*.fc*.*/$f $f\ + done)\ +fi\ +%{nil} + +# This macro defines a %%posttrans script for a kernel package. +# %%kernel_variant_posttrans [] +# More text can follow to go at the end of this variant's %%post. +# +%define kernel_variant_posttrans() \ +%{expand:%%posttrans %{?1}}\ +/sbin/new-kernel-pkg --package kernel%{?1:-%{1}} --rpmposttrans %{KVERREL}%{?1:.%{1}} || exit $?\ +%{nil} + +# +# This macro defines a %%post script for a kernel package and its devel package. +# %%kernel_variant_post [-v ] [-r ] +# More text can follow to go at the end of this variant's %%post. 
+# +%define kernel_variant_post(v:r:) \ +%{expand:%%kernel_devel_post %{?-v*}}\ +%{expand:%%kernel_variant_posttrans %{?-v*}}\ +%{expand:%%post %{?-v*}}\ +%{-r:\ +if [ `uname -i` == "x86_64" -o `uname -i` == "i386" ] &&\ + [ -f /etc/sysconfig/kernel ]; then\ + /bin/sed -r -i -e 's/^DEFAULTKERNEL=%{-r*}$/DEFAULTKERNEL=kernel%{?-v:-%{-v*}}/' /etc/sysconfig/kernel || exit $?\ +fi}\ +%{expand:\ +%if %{with_dracut}\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --dracut --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%else\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%endif}\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --add-kernel %{KVERREL}%{?-v*} || exit $?\ +#fi\ +%{nil} + +# +# This macro defines a %%preun script for a kernel package. +# %%kernel_variant_preun +# +%define kernel_variant_preun() \ +%{expand:%%preun %{?1}}\ +/sbin/new-kernel-pkg --rminitrd --rmmoddep --remove %{KVERREL}%{?1:.%{1}} || exit $?\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --remove-kernel %{KVERREL}%{?1} || exit $?\ +#fi\ +%{nil} + +%kernel_variant_preun +%ifarch x86_64 +%kernel_variant_post -r (kernel-smp|kernel-xen) +%else +%kernel_variant_post -r kernel-smp +%endif + +%kernel_variant_preun smp +%kernel_variant_post -v smp + +%kernel_variant_preun PAE +%kernel_variant_post -v PAE -r (kernel|kernel-smp|kernel-xen) + +%kernel_variant_preun debug +%kernel_variant_post -v debug + +%kernel_variant_post -v PAEdebug -r (kernel|kernel-smp|kernel-xen) +%kernel_variant_preun PAEdebug + +if [ -x /sbin/ldconfig ] +then + /sbin/ldconfig -X || exit $? +fi + +### +### file lists +### + +%if %{with_headers} +%files headers +%defattr(-,root,root) +/usr/include/* +%endif + +%if %{with_firmware} +%files firmware +%defattr(-,root,root) +/lib/firmware/* +%doc linux-%{kversion}.%{_target_cpu}/firmware/WHENCE +%endif + +%if %{with_bootwrapper} +%files bootwrapper +%defattr(-,root,root) +/usr/sbin/* +%{_libdir}/kernel-wrapper +%endif + +# only some architecture builds need kernel-doc +%if %{with_doc} +%files doc +%defattr(-,root,root) +%{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation/* +%dir %{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation +%dir %{_datadir}/doc/kernel-doc-%{rpmversion} +%{_datadir}/man/man9/* +%endif + +%if %{with_perf} +%files -n perf +%defattr(-,root,root) +%{_datadir}/doc/perf +/usr/sbin/perf +%{_datadir}/man/man1/* +%endif + +# This is %{image_install_path} on an arch where that includes ELF files, +# or empty otherwise. +%define elf_image_install_path %{?kernel_image_elf:%{image_install_path}} + +# +# This macro defines the %%files sections for a kernel package +# and its devel and debuginfo packages. 
+# %%kernel_variant_files [-k vmlinux] +# +%define kernel_variant_files(k:) \ +%if %{1}\ +%{expand:%%files %{?2}}\ +%defattr(-,root,root)\ +/%{image_install_path}/%{?-k:%{-k*}}%{!?-k:vmlinuz}-%{KVERREL}%{?2:.%{2}}\ +/boot/System.map-%{KVERREL}%{?2:.%{2}}\ +%if %{with_perftool}\ +/usr/libexec/perf.%{KVERREL}%{?2:.%{2}}\ +%endif\ +#/boot/symvers-%{KVERREL}%{?2:.%{2}}.gz\ +/boot/config-%{KVERREL}%{?2:.%{2}}\ +%dir /lib/modules/%{KVERREL}%{?2:.%{2}}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/kernel\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/build\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/source\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/extra\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/updates\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/weak-updates\ +%ifarch %{vdso_arches}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/vdso\ +/etc/ld.so.conf.d/kernel-%{KVERREL}%{?2:.%{2}}.conf\ +%endif\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/modules.*\ +%if %{with_dracut}\ +/boot/initramfs-%{KVERREL}%{?2:.%{2}}.img\ +%else\ +/boot/initrd-%{KVERREL}%{?2:.%{2}}.img\ +%endif\ +%{expand:%%files %{?2:%{2}-}devel}\ +%defattr(-,root,root)\ +%dir /usr/src/kernels\ +%verify(not mtime) /usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%if %{with_debuginfo}\ +%ifnarch noarch\ +%if %{fancy_debuginfo}\ +%{expand:%%files -f debuginfo%{?2}.list %{?2:%{2}-}debuginfo}\ +%else\ +%{expand:%%files %{?2:%{2}-}debuginfo}\ +%endif\ +%defattr(-,root,root)\ +%if !%{fancy_debuginfo}\ +%if "%{elf_image_install_path}" != ""\ +%{debuginfodir}/%{elf_image_install_path}/*-%{KVERREL}%{?2:.%{2}}.debug\ +%endif\ +%{debuginfodir}/lib/modules/%{KVERREL}%{?2:.%{2}}\ +%{debuginfodir}/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%endif\ +%endif\ +%endif\ +%endif\ +%{nil} + + +%kernel_variant_files %{with_up} +%kernel_variant_files %{with_smp} smp +%kernel_variant_files %{with_debug} debug +%kernel_variant_files %{with_pae} PAE +%kernel_variant_files %{with_pae_debug} PAEdebug +%kernel_variant_files -k vmlinux %{with_kdump} kdump + +# plz don't put in a version string unless you're going to tag +# and build. + +%changelog +* Thu Dec 03 2009 Kyle McMartin 2.6.31.6-162 +- ipv4-fix-null-ptr-deref-in-ip_fragment.patch: null ptr deref + bug fix. + +* Thu Dec 03 2009 Dave Airlie 2.6.31.6-161 +- rv410 LVDS on resume test fix from AMD (#541562) + +* Wed Dec 02 2009 John W. Linville 2.6.31.6-160 +- ath9k: add fixes suggested by upstream maintainer + +* Wed Dec 02 2009 Dave Airlie 2.6.31.6-159 +- drm-radeon-misc-fixes.patch: r400 LVDS, r600 digital dpms, cursor fix, tv property + +* Wed Dec 02 2009 Ben Skeggs 2.6.31.6-158 +- nouveau: more complete lvds script selection on >=G80 (rh#522690, rh#529859) +- nouveau: more complete tmds script selection on >=G80 (rh#537853) +- nouveau: TV detection fixes + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-157 +- div/0 fix harder (#540593) - also ignore unposted GPUs with no BIOS + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-156 +- drm-next: fixes LVDS resume on r4xx, div/0 on no bios (#540593) + lockup on tv-out only startup. + +* Mon Nov 30 2009 Kyle McMartin +- drm-i915-fix-sync-to-vbl-when-vga-is-off.patch: add (rhbz#541670) + +* Sun Nov 29 2009 Kyle McMartin +- Drop linux-2.6-sysrq-c.patch, made consistent upstream. 
+ +* Fri Nov 27 2009 Jarod Wilson 2.6.31.6-153 +- add device name to lirc_zilog, fixes issues w/multiple target devices +- add lirc_imon pure input mode support for onboard decode devices + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-152 +- Fix intel_tv_mode_set oops (#540218) + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-151 +- VT-d: Work around yet more HP BIOS brokenness (#536675) + +* Wed Nov 25 2009 Kyle McMartin +- dlm: fix connection close handling. + Fix by lmb, requested by fabio. + +* Wed Nov 25 2009 David Woodhouse 2.6.31.6-149 +- VT-d: Work around more HP BIOS brokenness. + +* Tue Nov 24 2009 Dave Airlie 2.6.31.6-148 +- radeon: flush HDP cache on rendering wait - fixes r600 rendercheck failure + +* Mon Nov 23 2009 Adam Jackson +- drm-default-mode.patch: Default to 1024x768 to match UMS. (#538761) + +* Mon Nov 23 2009 Roland McGrath 2.6.31.6-146 +- Fix oops in x86-32 kernel's iret handling for bogus user %cs. (#540580) + +* Fri Nov 21 2009 Kyle McMartin +- Fix up ssp' highmem fixes with fixes for arm & ppc. + +* Thu Nov 20 2009 Chris Wright 2.6.31.6-144 +- VT-d: another fallback for another BIOS bug (#524808) + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-142 +- Oops, add new patch to spec file + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-141 +- Lower debug level of fbcon handover messages (rh#538526) + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-140 +- drm-next-44c83571.patch: oops pulled the wrong tree into my f12 tree + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-139 +- nouveau: s/r fixes on chipsets using bios opcode 0x87 +- nouveau: fixes to bios opcode 0x8e +- nouveau: hopefully fix nv1x context switching issues (rh#526577) +- nouveau: support for NVA5 (GeForce G220) +- nouveau: fixes for NVAA support + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-138 +- drm-next-d56672a9.patch: fix some rn50 cloning issues + +* Wed Nov 18 2009 David Woodhouse 2.6.31.6-137 +- Actually force the IOMMU not to be used when we detect the HP/Acer bug. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-136 +- ACPI embedded controller fixes from Fedora 11. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-135 +- Scheduler fixes and latency tuning patches from F-11. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-134 +- glad to see edid retry patch was compiled. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-133 +- drm-next-984d1f3c.patch: rebase with upstream fixes - drop all merged + +* Thu Nov 12 2009 Adam Jackson +- Actually apply the EDID retry patch +- drm-edid-header-fixup.patch: Fix up some broken EDID headers (#534120) + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-130 +- Use ApplyOptionalPatch for v4l and firewire updates. +- Drop unused v4l ABI fix. + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-129 +- Linux 2.6.31.6 +- Drop merged patches: + linux-2.6-iwlwifi-reduce-noise-when-skb-allocation-fails.patch + linux-2.6-libertas-crash.patch + pci-increase-alignment-to-make-more-space.patch + acpi-revert-attach-device-to-handle-early.patch + ahci-revert-restore-sb600-sata-controller-64-bit-dma.patch + acpi-pci-fix-null-pointer-dereference-in-acpi-get-pci-dev.patch + af_unix-fix-deadlock-connecting-to-shutdown-socket.patch + keys-get_instantiation_keyring-should-inc-the-keyring-refcount.patch + netlink-fix-typo-in-initialization.patch + fs-pipe-null-ptr-deref-fix.patch + +* Wed Nov 11 2009 Justin M. Forbes 2.6.31.5-128 +- Fix KSM for i686 users. (#532215) +- Add KSM fixes from 2.6.32 + +* Sun Nov 08 2009 David Woodhouse 2.6.31.5-127 +- Apply fix for fallback when HP/Acer BIOS bug detected (#524808) +- Re-enable DMAR. 
+- Fix libertas crash due to skb pointer bug + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-126 +- Re-enable linux-2.6-die-closed-source-bios-muppets-die.patch, DMAR + still defaulting to off. + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-125 +- Disable linux-2.6-die-closed-source-bios-muppets-die.patch and + default DMAR to off (can be re-enabled with intel_iommu=on on the + command line due to last minute issues and reversion upstream.) + +* Thu Nov 05 2009 Jarod Wilson +- Add --with dbgonly rpmbuild option to build only debug kernels + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-122 +- comment out kmap atomic for now, it breaks ppc build + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-121 +- drm-radeon-fix-agp-resume.patch (#531825) + +* Thu Nov 05 2009 Kyle McMartin +- Add two patches from Soren from mingo/linux-2.6-x86.git to fix + debug_kmap_atomic prints. + +* Thu Nov 05 2009 Ben Skeggs +- nouveau: fix rh#532924 + +* Wed Nov 04 2009 Kyle McMartin +- Make JBD2_DEBUG a toggleable debug setting. Leave it the way it was. + (Double checked resulting configs, don't fret.) + +* Wed Nov 04 2009 Adam Jackson 2.6.31.5-117 +- drm-edid-retry.patch: Try DDC up to four times, like X. (#532957) + +* Wed Nov 04 2009 Chuck Ebbert 2.6.31.5-116 +- tg3 bug fixes (#527209) + +* Wed Nov 04 2009 Kyle McMartin 2.6.31.5-115 +- fs/pipe.c: fix null pointer dereference (CVE-2009-3547) + +* Wed Nov 04 2009 Ben Skeggs 2.6.31.5-114 +- nouveau: provide info userspace needs to handle low memory situations +- nouveau: fix for rh#532711 +- nouveau: add option to provide more debug info for rh#532579 +- patch only so large because of included register rename + +* Tue Nov 03 2009 Adam Jackson 2.6.31.5-113 +- drm-conservative-fallback-modes.patch: When an output is connected but + fails EDID, only add modes with refresh rates <= 60 (#514600) + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-112 +- drm-r600-lenovo-w500-fix.patch: add second patch from upstream fix + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-111 +- drm-r600-lenovo-w500-fix.patch: fix lenovo w500 acpi video kill laptop dead +- drop aspm r600 patch as correct fix should be in 110 + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-110 +- r600: fix for ring setup RMW issue. + +* Mon Nov 02 2009 John W. Linville 2.6.31.5-109 +- prism54: remove pci modinfo device table (#447047) + +* Mon Nov 02 2009 Chuck Ebbert 2.6.31.5-108 +- Enable acerhdf driver for fan speed control on Acer Aspire One notebook (#532463) + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-107 +- r600: back that out, thanks to yaneti for testing. + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-106 +- r600: ring size guesswork fix. + +* Fri Oct 30 2009 Dave Airlie 2.6.31.5-105 +- drm-radeon-agp-font-fix.patch: hopefully fix AGP coherency issue + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-104 +- drm-next-ea1495a6.patch: fix rs400 resume on my test box + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-103 +- drm-next-fc7f7119.patch: fix oops in SS code, fix multi-card, dvo. +- drm-radeon-kms-arbiter-return-ignore.patch: fix arbiter for non-VGA display + +* Tue Oct 27 2009 Chuck Ebbert +- Fix oops in VIA padlock-aes code. + +* Tue Oct 27 2009 Dave Airlie +- kms: add offb handoff patch for ppc to work + +* Tue Oct 27 2009 Ben Skeggs +- drm-nouveau.patch: misc fixes, very initial NVA8 work + +* Tue Oct 27 2009 Dave Airlie +- fix dd command lines + +* Mon Oct 26 2009 Dave Jones +- Make a 20MB initramfs file so rpm gets its diskspace calculations right. 
(#530778) + +* Mon Oct 26 2009 Dave Airlie 2.6.31.5-97 +- drm: rebase to drm-next, drop palette fix, merged upstream +- drm-intel-big-hammer.patch: drop, proper fix in 2.6.31.5 +- drm-disable-r600-aspm.patch: test patch to disable aspm on r600/r700 for now + +* Fri Oct 23 2009 Kyle McMartin 2.6.31.5-96 +- Bump NR_CPUS to 256 on x86_64. +- Add two backports (ugh, just had to go renaming perf counters to events...) + for fixing sysprof with perf. + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-95 +- re enable MSI + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-94 +- disable debug + stackprotector + +* Fri Oct 23 2009 Chuck Ebbert +- Linux 2.6.31.5 + +* Thu Oct 22 2009 Chuck Ebbert +- Fix exploitable OOPS in keyring code. (CVE-2009-3624) +- Fix kernel memory leak to userspace. (CVE-2009-3612) + +* Thu Oct 22 2009 Dave Airlie 2.6.31.5-91.rc1 +- kms: fix palette + +* Wed Oct 21 2009 Chuck Ebbert +- Disable powersave by default for AC97 audio devices. (#524414) + +* Wed Oct 21 2009 Chuck Ebbert +- Linux 2.6.31.5-rc1 +- Remove the merged HP DC7900 workaround from iommu-updates patch. +- Drop merged patch: + linux-2.6-raidlockdep.patch + +* Mon Oct 19 2009 Kyle McMartin +- af_unix-fix-deadlock-connecting-to-shutdown-socket.patch: fix for + rhbz#529626. + +* Sat Oct 17 2009 Chuck Ebbert +- Replace linux-2.6-bluetooth-autosuspend.diff with upstream version. + +* Fri Oct 16 2009 Josef Bacik +- Update btrfs to latest upstream + +* Fri Oct 16 2009 Chuck Ebbert 2.6.31.4-85 +- Fix another ACPI boot hang (#513680) + +* Fri Oct 16 2009 Ben Skeggs 2.6.31.4-84 +- nouveau: more vbios opcodes, minor fixes, hopeful fix for rh#529292 + +* Wed Oct 14 2009 Roland McGrath 2.6.31.4-83 +- Remove work-around for gcc bug #521991, now fixed. +- Build *docs non-parallel, working around kernel's makefile bugs. + +* Wed Oct 14 2009 Peter Jones +- Add scsi_register_device_handler to modules.block's symbol list so + we'll have scsi device handlers in installer images. + +* Tue Oct 13 2009 Steve Dickson 2.6.31.4-81 +- Fixed hang during NFS installs (bz 528537) + +* Tue Oct 13 2009 Chuck Ebbert 2.6.31.4-80 +- Disable 64-bit DMA on SB600 SATA controllers. + +* Tue Oct 13 2009 Kyle McMartin +- Always build perf docs, regardless of whether we build kernel-doc. + Seems rather unfair to not ship the manpages half the time. + Also, drop BuildRequires %if when not with_doc, the rules about %if + there are f*!&^ing complicated. + +* Mon Oct 12 2009 Kyle McMartin +- Build the perf manpages properly. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-77 +- Fix boot hang with ACPI on some systems. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-76 +- Linux 2.6.31.4 + +* Mon Oct 12 2009 Kyle McMartin 2.6.31.4-75.rc2 +- improve-resource-counter-scalability.patch: Fix scalability issues + on big machines, requested by prarit. + +* Mon Oct 12 2009 Jarod Wilson +- Fix irq status check bugs in lirc_ene0100 + +* Mon Oct 12 2009 Chuck Ebbert +- Fix 2.6.31 regression that caused device failures with ACPI enabled. + +* Sun Oct 11 2009 Chuck Ebbert +- Linux 2.6.31.4-rc2 +- Drop merged patch: linux-2.6-frace-fixes.patch + +* Sat Oct 10 2009 Chuck Ebbert +- Make performance counter API available to userspace programs (#527264) + +* Sat Oct 10 2009 Dave Jones +- Drop the NX kernel data patch for now. Causes no-boot on some systems. + +* Fri Oct 09 2009 Dave Jones +- Backport two critical ftrace fixes. 
+  ftrace: check for failure for all conversions
+  tracing: correct module boundaries for ftrace_release
+
+* Fri Oct 09 2009 Jarod Wilson
+- Build docs sub-package again
+
+* Thu Oct 08 2009 Kyle McMartin 2.6.31.3-67
+- Linux 2.6.31.3
+- rebase drm-next trivially.
+- dropped merged upstream patches,
+  - linux-2.6-fix-usb-serial-autosuspend.diff
+  - linux-2.6-iwlagn-modify-digital-SVR-for-1000.patch
+  - linux-2.6-iwlwifi-Handle-new-firmware-file-with-ucode-build-number-in-header.patch
+  - linux-2.6-iwlwifi-fix-debugfs-buffer-handling.patch
+  - linux-2.6-iwlwifi-fix-unloading-driver-while-scanning.patch
+  - linux-2.6-iwlwifi-remove-deprecated-6000-series-adapters.patch
+  - linux-2.6-iwlwifi-traverse-linklist-to-find-the-valid-OTP-block.patch
+  - linux-2.6-iwlwifi-update-1000-series-API-version-to-match-firmware.patch
+  - linux-2.6-xen-check-efer-fix.patch
+  - linux-2.6-xen-spinlock-enable-interrupts-only-when-blocking.patch
+  - linux-2.6-xen-spinlock-stronger-barrier.patch
+  - linux-2.6-xen-stack-protector-fix.patch
+  - linux-2.6.31-cpufreq-powernow-k8-oops.patch
+
+* Thu Oct 08 2009 Ben Skeggs
+- ppc: compile nvidiafb as a module only, nvidiafb+nouveau = bang! (rh#491308)
+
+* Thu Oct 08 2009 Ben Skeggs 2.6.31.1-65
+- nouveau: {drm-next,context,fbcon,misc} fixes, connector forcing
+
+* Thu Oct 08 2009 Dave Airlie 2.6.31.1-64
+- rebase latest drm-next, fixes many s/r and r600 problems
+
+* Wed Oct 07 2009 Dave Jones
+- Don't mark the initramfs file as a ghost.
+
+* Wed Oct 07 2009 Dave Jones
+- Enable FUNCTION_GRAPH_TRACER on x86-64.
+
+* Wed Oct 07 2009 Dave Jones
+- Disable CONFIG_IRQSOFF_TRACER on srostedt's recommendation.
+  (Adds unwanted overhead when not in use).
+
+* Tue Oct 6 2009 Justin M. Forbes
+- virtio_blk: add support for cache flush (#526869)
+
+* Fri Oct 2 2009 John W. Linville
+- Backport "iwlwifi: reduce noise when skb allocation fails"
+
+* Wed Sep 30 2009 David Woodhouse
+- Update IOMMU code; mostly a bunch more workarounds for broken BIOSes.
+
+* Wed Sep 30 2009 Dave Airlie 2.6.31.1-56
+- revert all the arjan patches until someone tests them.
+
+* Tue Sep 29 2009 Steve Dickson 2.6.31.1-55
+- Updated the NFS4 pseudo root code with a fix from upstream
+
+* Tue Sep 29 2009 Dave Airlie 2.6.31.1-54
+- Fix broken capabilities that stopped dbus working due to copy from user
+  fixups.
+
+* Tue Sep 29 2009 Dave Airlie 2.6.31.1-53
+- drm-next-4c57edba4.patch: fix r600 dri1 memory leak and r600 bugs
+
+* Mon Sep 28 2009 Dave Jones 2.6.31.1-52
+- Use __builtin_object_size to validate the buffer size for copy_from_user
+  + associated fixes to various copy_from_user invocations.
+
+* Mon Sep 28 2009 Justin M. Forbes 2.6.31.1-50
+- Increase timeout for xen frontend devices to connect.
+
+* Sat Sep 26 2009 Chuck Ebbert 2.6.31.1-49
+- Add Xen spinlock patches to improve scalability.
+
+* Sat Sep 26 2009 Dave Airlie 2.6.31.1-48
+- drm-next-8ef8678c8.patch: fix intel/nouveau kms
+
+* Fri Sep 25 2009 Justin M. Forbes 2.6.31.1-47
+- Fix xen guest booting when NX is disabled (#525290)
+
+* Fri Sep 25 2009 Ben Skeggs 2.6.31.1-46
+- drm-nouveau.patch: cleanups, fixes, pre-G80 s/r fixes, init rework
+
+* Fri Sep 25 2009 Dave Airlie 2.6.31.1-45
+- drm-next-adea4796c.patch: fix r600 glxgears
+
+* Fri Sep 25 2009 Dave Airlie 2.6.31.1-44
+- bump an extra one because I accidentally CVS.
+
+* Thu Sep 24 2009 Dave Airlie 2.6.31.1-42
+- drm-next update - fix r600 s/r, and command line mode picking and r600 tv
+
+* Thu Sep 24 2009 Chuck Ebbert 2.6.31.1-41
+- Linux 2.6.31.1
+- Drop patches merged upstream:
+  linux-2.6-kvm-vmx-check-cpl-before-emulating-debug-register-access.patch
+  linux-2.6-use-__pa_symbol-to-calculate-address-of-C-symbol.patch
+  linux-2.6-kvm-pvmmu-do-not-batch-pte-updates-from-interrupt-context.patch
+  linux-2.6-scsi-sd-fix-oops-during-scanning.patch
+  linux-2.6-scsi-sg-fix-oops-in-error-path.patch
+
+* Thu Sep 24 2009 Chuck Ebbert 2.6.31-40
+- Drop the modules-ro-nx patch: it's causing ftrace to be unable
+  to NOP out module function call tracking. (#524042)
+
+* Wed Sep 23 2009 Kyle McMartin 2.6.31-39
+- touch initramfs-$foo not dracut-$foo.
+
+* Wed Sep 23 2009 Adam Jackson 2.6.31-37
+- drm: Fix various buglets in EDID parsing.
+
+* Mon Sep 21 2009 Ben Skeggs
+- nouveau: more on rh#522649, added some useful info to debugfs
+- lots of coding style cleanups, which is the reason for the huge commit
+
+* Fri Sep 18 2009 Dave Jones
+- %ghost the dracut initramfs file.
+
+* Thu Sep 17 2009 Hans de Goede
+- Now that we have %%post generation of dracut images we do not need to
+  Require dracut-kernel anymore
+
+* Thu Sep 17 2009 Kyle McMartin 2.6.31-33
+- Turn off CONFIG_CC_OPTIMIZE_FOR_SIZE on ppc64 until ld decides to play nice
+  and generate the save/restore stubs.
+
+* Thu Sep 17 2009 Kristian Høgsberg
+- Drop drm page-flip patch for F12.
+
+* Thu Sep 17 2009 Dave Jones
+- cpuidle: Fix the menu governor to boost IO performance.
+
+* Wed Sep 16 2009 John W. Linville
+- Add a few more iwl1000 support patches.
+- Remove support for deprecated iwl6000 parts.
+
+* Wed Sep 16 2009 Eric Paris
+- Do not check CAP_SYS_MODULE when networking tries to autoload a module
+
+* Wed Sep 16 2009 John W. Linville
+- Add iwl1000 support patches.
+
+* Wed Sep 16 2009 Adam Jackson
+- Disable hotplug interrupts on TV connectors on i915.
+
+* Wed Sep 16 2009 Dave Jones
+- Fix NULL deref in powernow-k8 driver.
(korg #13780)
+
+* Wed Sep 16 2009 Hans de Goede
+- Fix lockdep warning (and potential real deadlock) in mdraid10 code,
+  requested for -stable, rh#515471
+
+* Wed Sep 16 2009 Ben Skeggs 2.6.31-17
+- nouveau: potential fix for rh#522649 + misc other fixes
+
+* Tue Sep 15 2009 Chuck Ebbert
+- Add unused-kernel-patches Make target, change some patches to
+  use ApplyOptionalPatch
+
+* Tue Sep 15 2009 Ben Skeggs
+- nouveau: misc fixes to context-related issues, fixes some severe nv4x bugs
+
+* Tue Sep 15 2009 Ben Skeggs
+- nouveau: temporarily disable fbcon accel, it's racing with ttm
+
+* Mon Sep 14 2009 Steve Dickson
+- Added support for -o v4 mount parsing
+
+* Mon Sep 14 2009 Ben Skeggs
+- nouveau: avoid PFIFO IRQ hardlock, misc LVDS mode fixes, nv5x RAMFC cleanup
+
+* Sun Sep 13 2009 Chuck Ebbert
+- SCSI oops fixes requested for -stable
+
+* Fri Sep 11 2009 Dave Jones
+- Apply NX/RO to modules
+
+* Fri Sep 11 2009 Dave Jones
+- Mark kernel data section as NX
+
+* Fri Sep 11 2009 Ben Skeggs
+- nouveau: bring in Matthew Garrett's initial switchable graphics support
+
+* Fri Sep 11 2009 Ben Skeggs
+- nouveau: fixed use of strap-based panel mode when required (rh#522649)
+- nouveau: temporarily block accel on NVAC chipsets (rh#522361, rh#522575)
+
+* Thu Sep 10 2009 Matthew Garrett
+- linux-2.6-ahci-export-capabilities.patch: Backport from upstream
+- linux-2.6-rtc-show-hctosys.patch: Export the hctosys state of an rtc
+- linux-2.6-rfkill-all.patch: Support for keys that toggle all rfkill state
+
+* Thu Sep 10 2009 Ben Skeggs
+- drm-nouveau.patch: add some scaler-only modes for LVDS, GEM/TTM fixes
+
+* Wed Sep 09 2009 Dennis Gilmore 2.6.31-2
+- touch the dracut initrd file when using %%{with_dracut}
+
+* Wed Sep 09 2009 Chuck Ebbert 2.6.31-1
+- Linux 2.6.31
+
+* Wed Sep 09 2009 Chuck Ebbert
+- Enable VXpocket and PDaudioCF PCMCIA sound drivers.
+
+* Wed Sep 09 2009 Hans de Goede
+- Move to %%post generation of dracut initrd, because of GPL issues surrounding
+  shipping a prebuilt initrd
+- Require grubby >= 7.0.4-1, for %%post generation
+
+* Wed Sep 9 2009 Steve Dickson
+- Updated the NFS4 pseudo root code to the latest release.
+
+* Wed Sep 09 2009 Justin M. Forbes
+- Revert virtio_blk to rotational mode.
(#509383)
+
+* Wed Sep 09 2009 Dave Airlie 2.6.31-0.219.rc9.git
+- uggh lost nouveau bits in page flip
+
+* Wed Sep 09 2009 Dave Airlie 2.6.31-0.218.rc9.git2
+- fix r600 oops with page flip patch (#520766)
+
+* Wed Sep 09 2009 Ben Skeggs
+- drm-nouveau.patch: fix display resume on pre-G8x chips
+
+* Wed Sep 09 2009 Ben Skeggs
+- drm-nouveau.patch: add getparam to know using tile_flags is ok for scanout
+
+* Wed Sep 09 2009 Chuck Ebbert
+- 2.6.31-rc9-git2
+
+* Wed Sep 9 2009 Roland McGrath 2.6.31-0.214.rc9.git1
+- compile with -fno-var-tracking-assignments, work around gcc bug #521991
+
+* Wed Sep 09 2009 Dave Airlie 2.6.31-0.213.rc9.git1
+- fix two bugs in r600 kms, fencing + mobile lvds
+
+* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.212.rc9.git1
+- drm-nouveau.patch: fix ppc build
+
+* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.211.rc9.git1
+- drm-nouveau.patch: more misc fixes
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.210.rc9.git1
+- drm-page-flip.patch: rebase again
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.209.rc9.git1
+- drm-next.patch: fix r600 signal interruption return value
+
+* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.208.rc9.git1
+- drm-nouveau.patch: latest upstream + rebase onto drm-next
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.207.rc9.git1
+- drm-vga-arb.patch: update to avoid lockdep + add r600 support
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.206.rc9.git1
+- drm: rebase to drm-next - r600 accel + kms should start working now
+
+* Mon Sep 07 2009 Chuck Ebbert 2.6.31-0.205.rc9.git1
+- 2.6.31-rc9-git1
+- Temporarily hack the drm-next patch so it still applies; the result
+  should still be safe to build.
+
+* Sat Sep 05 2009 Chuck Ebbert 2.6.31-0.204.rc9
+- 2.6.31-rc9
+
+* Fri Sep 04 2009 Chuck Ebbert 2.6.31-0.203.rc8.git2
+- Fix kernel build errors when building firmware by removing the
+  .config file before that step and restoring it afterward.
+
+* Thu Sep 03 2009 Adam Jackson
+- drm-ddc-caching-bug.patch: Empty the connector's mode list when it's
+  disconnected.
+
+* Thu Sep 03 2009 Jarod Wilson
+- Update hdpvr and lirc_zilog drivers for 2.6.31 i2c
+
+* Thu Sep 03 2009 Justin M. Forbes
+- Fix xen guest with stack protector. (#508120)
+- Small kvm fixes.
+
+* Wed Sep 02 2009 Adam Jackson 2.6.31-0.199.rc8.git2
+- drm-intel-pm.patch: Disable by default, too flickery on too many machines.
+  Enable with i915.powersave=1.
+
+* Wed Sep 02 2009 Dave Jones
+- Add missing scriptlet dependency. (#520788)
+
+* Tue Sep 01 2009 Adam Jackson
+- Make DRM less chatty about EDID failures. No one cares.
+
+* Tue Sep 01 2009 Chuck Ebbert
+- 2.6.31-rc8-git2
+- Blank out drm-intel-next: entire contents are now upstream.
+
+* Tue Sep 01 2009 Dave Jones
+- Make firmware buildarch noarch. (Suggested by drago01 on irc)
+
+* Tue Sep 01 2009 Jarod Wilson
+- Fix up lirc_zilog to enable functional IR transmit and receive
+  on the Hauppauge HD PVR
+- Fix audio on PVR-500 when used in same system as HVR-1800 (#480728)
+
+* Sun Aug 30 2009 Chuck Ebbert
+- 2.6.31-rc8-git1
+- Drop linux-2.6-inotify-accounting.patch, merged upstream.
+ +* Sun Aug 30 2009 Jarod Wilson +- fix lirc_imon oops on older devices w/o tx ctrl ep (#520008) + +* Fri Aug 28 2009 Eric Paris 2.6.31-0.190.rc8 +- fix inotify length accounting and send inotify events + +* Fri Aug 28 2009 David Woodhouse +- Enable Solos DSL driver + +* Fri Aug 28 2009 Chuck Ebbert +- 2.6.31-rc8 + +* Thu Aug 27 2009 Chuck Ebbert 2.6.31-0.185.rc7.git6 +- 2.6.31-rc7-git6 +- Drop patch merged upstream: + xen-fb-probe-fix.patch + +* Thu Aug 27 2009 Adam Jackson +- drm-rv710-ucode-fix.patch: Treat successful microcode load on RV710 as, + you know, success. (#519718) + +* Thu Aug 27 2009 Chuck Ebbert +- 2.6.31-rc7-git5 +- Drop patch linux-2.6-ima-leak.patch, now merged upstream. + +* Wed Aug 26 2009 Jarod Wilson +- Fix up hdpvr ir enable patch for use w/modular i2c (David Engel) + +* Wed Aug 26 2009 Eric Paris +- fix iint_cache leak in IMA code + drop the ima=0 patch + +* Wed Aug 26 2009 Justin M. Forbes +- Fix munlock with KSM (#516909) +- Re-enable KSM + +* Wed Aug 26 2009 Chuck Ebbert +- 2.6.31-rc7-git4 +- Drop patches merged upstream: + xen-x86-fix-stackprotect.patch + xen-x86-no-stackprotect.patch + +* Wed Aug 26 2009 Adam Jackson +- drm-intel-next.patch: Update, various output setup fixes. + +* Wed Aug 26 2009 David Woodhouse +- Make WiMAX modular (#512070) + +* Tue Aug 25 2009 Kyle McMartin +- allow-disabling-ima.diff: debugging patch... adds ima=0 kernel + param to disable initialization of IMA. + +* Tue Aug 25 2009 Ben Skeggs 2.6.31-0.174.rc7.git2 +- drm-nouveau.patch: upstream update, pre-nv50 tv-out + misc fixes + +* Tue Aug 25 2009 Chuck Ebbert 2.6.31-0.173.rc7.git2 +- Fix Xen boot (#508120) + +* Tue Aug 25 2009 Dave Airlie +- pull in drm-next tree + rebase around it + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git2 + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git1 + +* Sat Aug 22 2009 Chuck Ebbert +- 2.6.31-rc7 + +* Thu Aug 20 2009 Mark McLoughlin +- Disable LZMA for xen (#515831) + +* Thu Aug 20 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Fix up drm-r600-kms.patch +- Drop fix-perf-make-man-failure.patch + +* Wed Aug 19 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Revert linux-2.6-debug-vm-would-have-oomkilled.patch to v1.2 + because upstream changes to oom-kill.c were all reverted. + +* Tue Aug 18 2009 Kyle McMartin +- Fix up perf so that it builds docs now that they are fixed. +- with_docs disables perf docs too. be warned. (logic is that the + build deps are (mostly) the same, so if you don't want one, odds are...) + +* Tue Aug 18 2009 Dave Jones +- 2.6.31-rc6-git3 + +* Mon Aug 17 2009 Dave Jones 2.6.31-0.161.rc6.git2 +- 2.6.31-rc6-git2 + +* Mon Aug 17 2009 Chuck Ebbert +- Stop generating the (unused) ppc64-kdump.config file. + +* Mon Aug 17 2009 Jarod Wilson +- Add new lirc driver for built-in ENE0100 device on some laptops + +* Sun Aug 16 2009 Kyle McMartin 2.6.31-0.158.rc6 +- Improve the perf script so it prints something helpful if the + perf binary doesn't exist. + +* Sat Aug 15 2009 Dave Jones 2.6.31-0.157.rc6 +- Disable KSM patches on a hunch. Chasing the "encrypted VGs don't work" bug. + +* Fri Aug 14 2009 Dave Jones 2.6.31-0.155.rc6 +- 2.6.31-rc6 + +* Wed Aug 12 2009 Kyle McMartin +- fix perf. +- move perf to perf.$ver instead of perf-$ver... + +* Wed Aug 12 2009 Dennis Gilmore +- Obsolete kernel-smp on sparc64 +- Require grubby >= 7.0.2-1 since thats what introduces the dracut options we use + +* Wed Aug 12 2009 Kristian Høgsberg +- Fix drm-page-flip.patch to not break radeon kms and to not reset + crtc offset into fb on flip. 
+
+* Wed Aug 12 2009 Adam Jackson
+- Update drm-intel-next patch
+
+* Tue Aug 11 2009 Dennis Gilmore - 2.6.31-0.149.rc5.git3
+- disable building the -smp kernel on sparc64
+- disable building kernel-perf on sparc64, syscalls not supported
+
+* Tue Aug 11 2009 Eric Paris
+- Enable config IMA
+
+* Tue Aug 11 2009 Ben Skeggs
+- nouveau: various cleanups and fixes + more sanity checking in dma paths
+
+* Mon Aug 10 2009 Jarod Wilson
+- Add new device ID to lirc_mceusb (#512483)
+- Fix some lockdep false positives
+- Add support for setting and enabling iMON clock via sysfs
+- Add tunable pad threshold support to lirc_imon
+- Add new pseudo-IR protocol to lirc_imon for universals w/o a pad
+- Fix mouse device support on older iMON devices
+
+* Mon Aug 10 2009 David Woodhouse 2.6.31-0.145.rc5.git3
+- Merge latest Intel IOMMU fixes and BIOS workarounds, re-enable by default.
+
+* Sun Aug 09 2009 Kyle McMartin
+- btusb autosuspend: fix build on !CONFIG_PM by stubbing out
+  suspend/resume methods.
+
+* Sat Aug 08 2009 Dennis Gilmore 2.6.31-0.141.rc5.git3
+- disable kgdb on sparc64 uni-processor kernel
+- set max cpus to 256 on sparc64
+- enable AT keyboard on sparc64
+
+* Fri Aug 07 2009 Justin M. Forbes
+- Apply KSM updates from upstream
+
+* Fri Aug 07 2009 Hans de Goede
+- When building a dracut generic initrd tell new-kernel-pkg to use that
+  instead of running mkinitrd
+
+* Fri Aug 07 2009 Dave Airlie 2.6.31-0.139.rc5.git3
+- drm-r600-kms.patch - update r600 KMS
+- drm-radeon-fixes.patch - patches for queue to Linus
+
+* Thu Aug 06 2009 Justin M. Forbes 2.6.31-0.138.rc5.git3
+- Fix kvm virtio_blk errors (#514901)
+
+* Thu Aug 06 2009 Adam Jackson
+- Hush DRM vblank warnings, they're constant (and harmless) under DRI2.
+
+* Thu Aug 06 2009 Dave Airlie 2.6.31.0.134.rc5.git3
+- fixup vga arb warning at startup and handover between gpus
+
+* Thu Aug 06 2009 Kyle McMartin 2.6.31.0.133.rc5.git3
+- die-floppy-die.patch: it's the 21st century, let's not rely on
+  steam powered technology.
+
+* Wed Aug 05 2009 Dave Airlie 2.6.31.0.132.rc5.git3
+- revert-ftrace-powerpc-snafu.patch - fix ppc build
+
+* Wed Aug 05 2009 Ben Skeggs
+- nouveau: respect nomodeset
+
+* Wed Aug 05 2009 Chuck Ebbert
+- Fix /usr/sbin/perf script. (#515494)
+
+* Wed Aug 05 2009 Dave Jones
+- Fix shift in pci cacheline size printk.
+
+* Wed Aug 05 2009 Dave Airlie 2.6.31.0.128.rc5.git3
+- 2.6.31-rc5-git3
+- drop cpufreq + set memory fixes
+
+* Wed Aug 05 2009 Dave Airlie
+- Add Jerome's initial r600 kms work.
+- rebase arb patch
+
+* Tue Aug 04 2009 Kyle McMartin
+- alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch: apply patch
+  destined for 2.6.32, requested by Lennart.
+
+* Tue Aug 04 2009 Ben Skeggs
+- nouveau: more code share between nv50/
+- update VGA arb patches again
+
+* Mon Aug 03 2009 Adam Jackson
+- Update intel drm from anholt's tree
+- Rebase drm-intel-pm.patch to match
+- Drop gen3 fb hack, merged
+- Drop previous watermark setup change
+
+* Mon Aug 03 2009 Dave Jones 2.6.31-0.122.rc5.git2
+- 2.6.31-rc5-git2
+
+* Mon Aug 03 2009 Adam Jackson
+- (Attempt to) fix watermark setup on Intel 9xx parts.
+ +* Mon Aug 03 2009 Jarod Wilson +- make usbhid driver ignore all recent SoundGraph iMON devices, so the + lirc_imon driver can grab them instead + +* Mon Aug 03 2009 Dave Airlie +- update VGA arb patches + +* Sat Aug 01 2009 David Woodhouse 2.6.31-0.118.rc5 +- Fix boot failures on ppc32 (#514010, #505071) + +* Fri Jul 31 2009 Kyle McMartin 2.6.31-0.117.rc5 +- Linux 2.6.31-rc5 + +* Fri Jul 31 2009 Matthew Garrett +- linux-2.6-dell-laptop-rfkill-fix.patch: Fix up Dell rfkill + +* Fri Jul 31 2009 Ben Skeggs +- nouveau: build against 2.6.31-rc4-git6, fix script parsing on some G8x chips + +* Thu Jul 30 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git6 + New config item: CONFIG_BATTERY_DS2782 is not set +- Add last-minute set_memory_wc() fix from LKML. + +* Thu Jul 30 2009 Matthew Garrett +- drm-intel-pm.patch: Don't reclock external outputs. Increase the reduced + clock slightly to avoid upsetting some hardware. Disable renderclock + adjustment for the moment - it's breaking on some hardware. + +* Thu Jul 30 2009 Ben Skeggs +- nouveau: another DCB 1.5 entry, G80 corruption fixes, small +- fix VGA ARB + kms + +* Wed Jul 29 2009 Dave Jones +- Add support for dracut. (Harald Hoyer) + +* Wed Jul 29 2009 Ben Skeggs +- drm-nouveau.patch: nv50/nva0 tiled scanout fixes, nv40 kms fixes + +* Wed Jul 29 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git3 +- Drop linux-2.6-ecryptfs-overflow-fixes.patch, merged upstream now. + +* Wed Jul 29 2009 Dave Airlie +- update VGA arb patches + +* Tue Jul 28 2009 Adam Jackson +- Remove the pcspkr modalias. If you're still living in 1994, load it + by hand. + +* Tue Jul 28 2009 Eric Sandeen 2.6.31-0.102.rc4.git2 +- Fix eCryptfs overflow issues (CVE-2009-2406, CVE-2009-2407) + +* Tue Jul 28 2009 Kyle McMartin 2.6.31-0.101.rc4.git2 +- 2.6.31-rc4-git2 +- rebase linux-2.6-fix-usb-serial-autosuspend.diff +- config changes: + - USB_GSPCA_SN9C20X=m (_EVDEV=y) + +* Tue Jul 28 2009 Ben Skeggs +- drm-nouveau.patch: cleanup userspace API, various bugfixes. + Looks worse than it is, register macros got cleaned up, which + touches pretty much everywhere.. + +* Mon Jul 27 2009 Adam Jackson +- Warn quieter about not finding PCI bus parents for ROM BARs, they're + not usually needed and there's nothing you can do about it anyway. + +* Mon Jul 27 2009 Matthew Garrett +- linux-2.6-alsa-improve-hda-powerdown.patch - attempt to reduce audio glitches + caused by HDA powerdown +- disable CONFIG_DEBUG_KOBJECT again for now, since it produces huge dmesg spew + +* Mon Jul 27 2009 Dave Airlie +- update vga arb code + +* Mon Jul 27 2009 Matthew Garrett +- drm-intel-pm.patch - Add runtime PM for Intel graphics + +* Fri Jul 24 2009 Kristian Høgsberg +- Add drm-page-flip.patch to support vsynced page flipping on intel + chipsets. +- Really add patch. +- Fix patch to not break nouveau. + +* Fri Jul 24 2009 Chuck Ebbert +- Enable CONFIG_DEBUG_KOBJECT in debug kernels. (#513606) + +* Thu Jul 23 2009 Kyle McMartin +- perf BuildRequires binutils-devel now. + +* Thu Jul 23 2009 Justin M. Forbes +- Add KSM support + +* Thu Jul 23 2009 Kyle McMartin 2.6.31-0.87.rc4 +- Linux 2.6.31-rc4 +- config changes: + - USB_CDC_PHONET=m [all] + - EVENT_PROFILE=y [i386, x86_64, powerpc, s390] + +* Wed Jul 22 2009 Tom "spot" Callaway +- We have to override the new %%install behavior because, well... the kernel is special. 
+ +* Wed Jul 22 2009 Dave Jones +- 2.6.31-rc3-git5 + +* Wed Jul 22 2009 Ben Skeggs 2.6.31-0.82.rc3.git4 +- Enable KMS for nouveau + +* Wed Jul 22 2009 Ben Skeggs +- Update nouveau from upstream (initial suspend/resume + misc bugfixes) + +* Mon Jul 20 2009 Adam Jackson +- Disable VGA arbiter patches for a moment + +* Mon Jul 20 2009 Adam Jackson +- Revive 4k framebuffers for intel gen3 + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.78.rc3.git4 +- Enable CONFIG_RTC_HCTOSYS (#489494) + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.77.rc3.git4 +- Don't build 586 kernels any more. + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.75.rc3.git4 +- build a 'full' package on i686 (Bill Nottingham) + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.74.rc3.git4 +- 2.6.31-rc3-git4 + +* Sat Jul 18 2009 Matthew Garrett +- linux-2.6-driver-level-usb-autosuspend.diff - allow drivers to enable autopm +- linux-2.6-fix-usb-serial-autosuspend.diff - fix generic usb-serial autopm +- linux-2.6-qcserial-autosuspend.diff - enable autopm by default on qcserial +- linux-2.6-bluetooth-autosuspend.diff - enable autopm by default on btusb +- linux-2.6-usb-uvc-autosuspend.diff - enable autopm by default on uvc + +* Thu Jul 16 2009 Chuck Ebbert +- 2.6.31-rc3-git3 + +* Thu Jul 16 2009 Matthew Garrett +- linux-2.6-defaults-aspm.patch - default ASPM to on for PCIe >= 1.1 hardware + +* Thu Jul 16 2009 Dave Airlie 2.6.31-0.69.rc3 +- linux-2.6-vga-arb.patch - add VGA arbiter. +- drm-vga-arb.patch - add VGA arbiter support to drm + +* Tue Jul 14 2009 Kyle McMartin 2.6.31-0.68-rc3 +- 2.6.31-rc3 +- config changes: + - RTL8192SU is not set, (staging) + +* Mon Jul 13 2009 Kyle McMartin 2.6.31-0.67.rc2.git9 +- 2.6.31-rc2-git9 +- config changes: + - BLK_DEV_OSD=m + +* Mon Jul 13 2009 Ben Skeggs +- drm-nouveau.patch: update from upstream + +* Fri Jul 10 2009 Chuck Ebbert +- 2.6.31-rc2-git6 +- Drop dmadebug-spinlock patch -- merged upstream. + +* Fri Jul 10 2009 Dave Jones 2.6.31-0.64.rc2.git5 +- Don't jump through hoops that ppc powerbooks have to on sensible systems + in cpufreq_suspend. + +* Fri Jul 10 2009 Dave Jones +- 2.6.31-rc2-git5 + +* Thu Jul 09 2009 Dave Jones 2.6.31-0.62.rc2.git4 +- Use correct spinlock initialization in dma-debug + +* Thu Jul 09 2009 Chuck Ebbert 2.6.31-0.61.rc2.git4 +- 2.6.31-rc2-git4 + +* Thu Jul 09 2009 Jarod Wilson +- Enable IR receiver on the Hauppauge HD PVR +- Trim the changelog, axing everything before 2.6.29 (see cvs + if you still really want to see that far back) + +* Wed Jul 08 2009 Dave Jones +- Enable a bunch of debugging options that were missed somehow. + +* Wed Jul 08 2009 Kyle McMartin +- Bump NR_CPUS on x86_64 to 512. + +* Wed Jul 08 2009 Adam Jackson +- drm-no-gem-on-i8xx.patch: Drop, intel 2D driver requires GEM now. This + should be entertaining. + +* Wed Jul 08 2009 Kyle McMartin +- First cut of /usr/sbin/perf wrapper script and 'perf' + subpackage. + +* Wed Jul 08 2009 Kyle McMartin 2.6.31-0.54.rc2.git2 +- Rebase and re-apply all the Fedora-specific linux-2.6-debug-* + patches. +- Cull a bunch of upstreamed patches from the spec. + +* Wed Jul 08 2009 Steve Dickson +- Added NFSD v4 dynamic pseudo root patch which allows + NFS v3 exports to be mounted by v4 clients. + +* Tue Jul 07 2009 Jarod Wilson +- See if we can't make lirc_streamzap behave better... (#508952) + +* Tue Jul 07 2009 Chuck Ebbert 2.6.31-0.47.rc2.git2 +- 2.6.31-rc2-git2 + +* Tue Jul 07 2009 Jarod Wilson +- Make lirc_i2c actually work with 2.6.31 i2c + +* Mon Jul 06 2009 Chuck Ebbert +- Use LZMA for kernel compression on X86. 
+ +* Mon Jul 06 2009 Jarod Wilson +- Hack up lirc_i2c and lirc_zilog to compile with 2.6.31 i2c + changes. The drivers might not actually be functional now, but + at least they compile again. Will fix later, if need be... + +* Sat Jul 04 2009 Dave Jones 2.6.31-0.42.rc2 +- 2.6.31-rc2 + +* Sat Jul 04 2009 Chuck Ebbert +- 2.6.31-rc1-git11 + +* Fri Jul 03 2009 Hans de Goede +- Disable v4l1 ov511 and quickcam_messenger drivers (obsoleted by + v4l2 gspca subdrivers) + +* Thu Jul 02 2009 Kyle McMartin 2.6.31-0.39.rc1.git9 +- 2.6.31-rc1-git9 +- linux-2.6-dm-fix-exstore-search.patch: similar patch merged upstream. + +* Tue Jun 30 2009 Chuck Ebbert 2.6.31-0.38.rc1.git7 +- 2.6.31-rc1-git7 + +* Tue Jun 30 2009 Dave Jones 2.6.31-0.37.rc1.git5 +- Disable kmemleak. Way too noisy, and not finding any real bugs. + +* Tue Jun 30 2009 Ben Skeggs +- drm-nouveau.patch: match upstream + +* Mon Jun 29 2009 Chuck Ebbert 2.6.31-0.35.rc1.git5 +- 2.6.31-rc1-git5 +- CONFIG_LEDS_LP3944=m + +* Mon Jun 29 2009 Chuck Ebbert +- Try to fix the dm overlay bug for real (#505121) + +* Sat Jun 27 2009 Ben Skeggs 2.6.31-0.33.rc1.git2 +- drm-nouveau.patch: fix conflicts from 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.31.rc1.git2 +- Further improvements to kmemleak + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.30.rc1.git2 +- 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Ben Skeggs +- drm-nouveau.patch: latest upstream + reenable + +* Thu Jun 25 2009 Dave Jones 2.6.31-0.29.rc1 +- Make kmemleak scan process stacks by default. + Should reduce false positives (which does also increase false negatives, + but that's at least less noisy) + +* Wed Jun 24 2009 Kyle McMartin 2.6.31-0.28.rc1 +- 2.6.31-rc1 +- linux-2.6-utrace.patch: rebase on kernel/Makefile changes +- config changes: + - generic: + - CONFIG_DM_LOG_USERSPACE=m + - CONFIG_DM_MULTIPATH_QL=m + - CONFIG_DM_MULTIPATH_ST=m + - CONFIG_BATTERY_MAX17040=m + - CONFIG_I2C_DESIGNWARE is off (depends on clk.h) + +* Wed Jun 24 2009 Kyle McMartin +- Move perf to /usr/libexec/perf-$KernelVer. + +* Wed Jun 24 2009 Kyle McMartin +- config changes: + - generic: + - CONFIG_SCSI_DEBUG=m (was off, requested by davidz) + +* Wed Jun 24 2009 Dave Jones 2.6.31-0.22.rc0.git22 +- 2.6.30-git22 + +* Tue Jun 23 2009 Dave Jones 2.6.31-0.22.rc0.git20 +- 2.6.30-git20 + +* Mon Jun 22 2009 Kyle McMartin 2.6.31-0.24.rc0.git18 +- Enable tools/perf, installed as /bin/perf-$KernelVer. Docs and a /bin/perf + wrapper come next if this builds ok. + +* Mon Jun 22 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: pull in + two fixes from Mike Galbraith from tip.git + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.21.rc0.git18 +- Add patch to possibly fix the pktlen problem on via-velocity. + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.20.rc0.git18 +- 2.6.30-git18 + VIA crypto & mmc patches now upstream. + +* Sun Jun 21 2009 Dave Jones +- Determine cacheline sizes in a generic manner. + +* Sun Jun 21 2009 Chuck Ebbert 2.6.31-0.18.rc0.git17 +- 2.6.30-git17 +- Config changes: + - powerpc32-generic + CONFIG_PERF_COUNTERS=y + - generic + CONFIG_KEYBOARD_LM8323 is not set + CONFIG_MOUSE_SYNAPTICS_I2C=m + CONFIG_TOUCHSCREEN_EETI=m + CONFIG_TOUCHSCREEN_W90X900=m +- Dropped agp-set_memory_ucwb.patch, all fixed upstream now. + +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.17.rc0.git15 +- config changes: + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR=y (switched... chrp fails otherwise, stack + frame size.) 
+ +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.16.rc0.git15 +- 2.6.30-git15 +- config changes: + - generic: + - CONFIG_LBDAF=y + - staging: + - CONFIG_USB_SERIAL_QUATECH2 is not set + - CONFIG_VT6655 is not set + - CONFIG_USB_CPC is not set + - CONFIG_RDC_17F3101X is not set + - CONFIG_FB_UDL is not set + - ppc32: + - CONFIG_KMETER1=y + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR is not set +- lirc disabled due to i2c detach_client removal. + +* Sat Jun 20 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: add, + queued in tip/sched/core (ca94c442535a44d508c99a77e54f21a59f4fc462) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31.0.15.rc0.git14 +- Fix up ptrace, hopefully. Builds on x86_64 at least. + +* Fri Jun 19 2009 Chuck Ebbert +- linux-2.6-tip.git-203abd67b75f7714ce98ab0cdbd6cfd7ad79dec4.patch + Fixes oops on boot with qemu (#507007) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31-0.13.rc0.git14 +- 2.6.30-git14 + +* Fri Jun 19 2009 Chuck Ebbert +- Fix up the via-sdmmc and via-hwmon-temp-sensor patches. +- Drop VIA Padlock patches merged upstream: + via-rng-enable-64bit.patch + via-padlock-10-enable-64bit.patch + via-padlock-20-add-x86-dependency.patch + +* Thu Jun 18 2009 Kyle McMartin 2.6.31-0.11.rc0.git13 +- 2.6.30-git13 +- config changes: + - arm: + - CONFIG_UACCESS_WITH_MEMCPY is not set + - i686-PAE: + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - ia64: + - CONFIG_RCU_FANOUT=64 + - nodebug: + - CONFIG_DEBUG_KMEMLEAK is not set + - CONFIG_DEBUG_KMEMLEAK_TEST=m + - powerpc: + - CONFIG_CAN_SJA1000_OF_PLATFORM=m + - CONFIG_PPC_EMULATED_STATS=y + - CONFIG_SWIOTLB=y + - CONFIG_RDS is not set (broken on ppc32) + - powerpc32: + - CONFIG_RCU_FANOUT=32 + - powerpc64: + - CONFIG_RCU_FANOUT=64 + - CONFIG_PERF_COUNTERS=y + - s390x: + - CONFIG_RCU_FANOUT=64 + - CONFIG_SECCOMP=y + - CONFIG_PM=y + - CONFIG_HIBERNATION=y + - CONFIG_PM_STD_PARTITION="/dev/jokes" + - sparc64: + - CONFIG_RCU_FANOUT=64 + - x86: + - CONFIG_RCU_FANOUT=32 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_OLD_MCE is not set + - CONFIG_X86_MCE_INTEL=y + - CONFIG_X86_MCE_AMD=y + - CONFIG_X86_ANCIENT_MCE is not set + - CONFIG_X86_MCE_INJECT is not set + - x86_64: + - CONFIG_EDAC_AMD64=m + - CONFIG_EDAC_AMD64_ERROR_INJECTION is not set + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - CONFIG_RCU_FANOUT=64 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_MCE_INJECT is not set + - generic: + - CONFIG_RCU_FANOUT=32 + - CONFIG_MMC_SDHCI_PLTFM=m + - CONFIG_MMC_CB710=m + - CONFIG_CB710_CORE=m + - CONFIG_CB710_DEBUG is not set + - CONFIG_SCSI_MVSAS_DEBUG is not set + - CONFIG_SCSI_BNX2_ISCSI=m + - CONFIG_NETFILTER_XT_MATCH_OSF=m + - CONFIG_RFKILL_INPUT=y (used to be =m, which was invalid) + - CONFIG_DE2104X_DSL=0 + - CONFIG_KS8842 is not set + - CONFIG_CFG80211_DEBUGFS=y + - CONFIG_MAC80211_DEFAULT_PS=y + - CONFIG_IWM=m + - CONFIG_IWM_DEBUG is not set + - CONFIG_RT2800USB=m + - CONFIG_CAN_DEV=m + - CONFIG_CAN_CALC_BITTIMING=y + - CONFIG_CAN_SJA1000=m + - CONFIG_CAN_SJA1000_PLATFORM=m + - CONFIG_CAN_EMS_PCI=m + - CONFIG_CAN_KVASER_PCI=m + - CONFIG_EEPROM_MAX6875=m + - CONFIG_SENSORS_TMP401=m + - CONFIG_MEDIA_SUPPORT=m + - CONFIG_SND_CTXFI=m + - CONFIG_SND_LX6464ES=m + - CONFIG_SND_HDA_CODEC_CA0110=y + - CONFIG_USB_XHCI_HCD=m + - CONFIG_USB_XHCI_HCD_DEBUGGING is not set + - CONFIG_DRAGONRISE_FF=y (used to be =m) + - CONFIG_GREENASIA_FF=y (used to be =m) + - CONFIG_SMARTJOYPLUS_FF=y (used to be =m) + - CONFIG_USB_NET_INT51X1=m + - 
CONFIG_CUSE=m + - CONFIG_FUNCTION_PROFILER=y + - CONFIG_RING_BUFFER_BENCHMARK=m + - CONFIG_REGULATOR_USERSPACE_CONSUMER=m + - CONFIG_REGULATOR_MAX1586=m + - CONFIG_REGULATOR_LP3971=m + - CONFIG_RCU_FANOUT_EXACT is not set + - CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 + - CONFIG_FSNOTIFY=y + - CONFIG_IEEE802154=m + - CONFIG_IEEE802154_DRIVERS=m + - CONFIG_IEEE802154_FAKEHARD=m + - CONFIG_CNIC=m + +* Wed Jun 17 2009 Jarod Wilson +- New lirc_imon hotness, update 2: + * support dual-interface devices with a single lirc device + * directional pad functions as an input device mouse + * touchscreen devices finally properly supported + * support for using MCE/RC-6 protocol remotes + * fix oops in RF remote association code (F10 bug #475496) + * fix re-enabling case/panel buttons and/or knobs +- Add some misc additional lirc_mceusb2 transceiver IDs +- Add missing unregister_chrdev_region() call to lirc_dev exit +- Add it8720 support to lirc_it87 + +* Tue Jun 16 2009 Chuck Ebbert +- Update via-sdmmc driver + +* Mon Jun 15 2009 Jarod Wilson +- Update lirc patches w/new imon hotness + +* Fri Jun 12 2009 Chuck Ebbert +- Update VIA temp sensor and mmc drivers. + +* Fri Jun 12 2009 John W. Linville 2.6.30-6 +- neigh: fix state transition INCOMPLETE->FAILED via Netlink request +- enable CONFIG_ARPD (used by OpenNHRP) + +* Wed Jun 10 2009 Chuck Ebbert +- VIA Nano updates: + Enable Padlock AES encryption and random number generator on x86-64 + Add via-sdmmc and via-cputemp drivers + +* Wed Jun 10 2009 Kyle McMartin 2.6.30-1 +- Linux 2.6.30 rebase. + +* Tue Jun 09 2009 John W. Linville +- Clean-up some wireless bits in config-generic + +* Tue Jun 09 2009 Chuck Ebbert +- Add support for ACPI P-states on VIA processors. +- Disable the e_powersaver driver. + +* Tue Jun 09 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git6 + +* Fri Jun 05 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git1 + +* Wed Jun 03 2009 Kyle McMartin +- Linux 2.6.30-rc8 + +* Tue Jun 2 2009 Roland McGrath +- utrace update (fixes stap PR10185) + +* Tue Jun 02 2009 Dave Jones +- For reasons unknown, RT2X00 driver was being built-in. + Make it modular. + +* Tue Jun 02 2009 Dave Jones +- 2.6.30-rc7-git5 + +* Sat May 30 2009 Dave Jones +- 2.6.30-rc7-git4 + +* Thu May 28 2009 Dave Jones +- 2.6.30-rc7-git2 + +* Tue May 26 2009 Dave Jones +- Various cpufreq patches from git. + +* Tue May 26 2009 Dave Jones +- 2.6.30-rc7-git1 + +* Mon May 25 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: drop patch, issue is fixed upstream. + +* Sat May 23 2009 Dave Jones +- 2.6.30-rc7 + +* Thu May 21 2009 Dave Jones +- 2.6.30-rc6-git6 + +* Wed May 20 2009 Chuck Ebbert +- Enable Divas (formerly Eicon) ISDN drivers on x86_64. (#480837) + +* Wed May 20 2009 Dave Jones +- 2.6.30-rc6-git5 + +* Mon May 18 2009 Dave Jones +- 2.6.30-rc6-git3 + +* Sun May 17 2009 Dave Jones +- 2.6.30-rc6-git2 + +* Sat May 16 2009 Dave Jones +- 2.6.30-rc6 + +* Mon May 11 2009 Kyle McMartin +- Linux 2.6.30-rc5-git1 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc5 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc4-git4 + +* Wed May 06 2009 Kyle McMartin +- Linux 2.6.30-rc4-git3 +- linux-2.6-cdrom-door-status.patch: merged upstream. +- linux-2.6-iwl3945-remove-useless-exports.patch: merged upstream. 
+- linux-2.6-utrace.patch: rebase against changes to fs/proc/array.c +- USB_NET_CDC_EEM=m + +* Fri May 01 2009 Eric Sandeen +- Fix ext4 corruption on partial write into prealloc block + +* Thu Apr 30 2009 Kyle McMartin +- 2.6.30-rc4 + +* Wed Apr 29 2009 Dave Jones +- 2.6.30-rc3-git6 + +* Tue Apr 28 2009 Dave Jones +- 2.6.30-rc3-git4 + +* Tue Apr 28 2009 Chuck Ebbert +- Make the kernel-vanilla package buildable again. +- Allow building with older versions of RPM. + +* Tue Apr 28 2009 Neil Horman +- Backport missing snmp stats (bz 492391) + +* Tue Apr 28 2009 Chuck Ebbert 2.6.30-0.72.rc3.git3 +- Drop unused exports from the iwl3945 driver. + +* Tue Apr 28 2009 Chuck Ebbert +- Linux 2.6.30-rc3-git3 + +* Mon Apr 27 2009 Dave Jones +- 2.6.30-rc3-git2 + +* Sun Apr 26 2009 Chuck Ebbert 2.6.30-0.68.rc3.git1 +- Linux 2.6.30-rc3-git1 + +* Wed Apr 22 2009 Dave Jones 2.6.30-0.67.rc3 +- Disable SYSFS_DEPRECATED on ia64 + +* Wed Apr 22 2009 Kyle McMartin +- Linux 2.6.30-rc3 +- PROC_VMCORE=y: Exports the dump image of crashed + kernel in ELF format + +* Wed Apr 22 2009 Neil Horman +- Enable RELOCATABLE and CRASH_DUMP for powerpc64 +- With this we can remove the -kdump build variant +- for the ppc64 arch + +* Tue Apr 21 2009 Chuck Ebbert +- Don't include the modules.*.bin files in the RPM package. + +* Tue Apr 21 2009 Dave Jones +- 2.6.30-rc2-git7 + +* Mon Apr 20 2009 Dave Jones +- Various s390x config tweaks. (#496596, #496601, #496605, #496607) + +* Mon Apr 20 2009 Dave Jones +- 2.6.30-rc2-git6 + +* Sat Apr 18 2009 Chuck Ebbert +- Set CONFIG_UEVENT_HELPER_PATH to the empty string (#496296) + +* Fri Apr 17 2009 Dave Jones +- 2.6.30-rc2-git3 + +* Thu Apr 16 2009 Kyle McMartin 2.6.30-0.58.rc2.git1 +- 2.6.30-rc2-git1 + +* Wed Apr 15 2009 Kyle McMartin 2.6.30-0.57.rc2 +- 2.6.30-rc2 + +* Tue Apr 14 2009 Kyle McMartin +- 2.6.30-rc1-git7 +- CONFIG_TOUCHSCREEN_AD7879_I2C=m +- CONFIG_STRIP_ASM_SYMS=y, off for -debug + +* Mon Apr 13 2009 Kyle McMartin +- ppc-fix-parport_pc.patch: add from linuxppc-dev@ + +* Mon Apr 13 2009 Kyle McMartin +- execshield: fix build (load_user_cs_desc is 32-bit only in tlb.c) + +* Sun Apr 12 2009 Kyle McMartin +- 2.6.30-rc1-git5 +- revert-fix-modules_install-via-nfs.patch: reverted upstream + +* Thu Apr 09 2009 Kyle McMartin +- actually drop utrace-ftrace from srpm. + +* Thu Apr 09 2009 Kyle McMartin +- 2.6.30-rc1-git2 +- CONFIG_IGBVF=m +- CONFIG_NETFILTER_XT_TARGET_LED=m + +* Thu Apr 09 2009 Dave Jones +- Bring back the /dev/crash driver. (#492803) + +* Wed Apr 08 2009 Dave Jones +- disable MMIOTRACE in non-debug builds (#494584) + +* Wed Apr 08 2009 Kyle McMartin 2.6.30-0.44.rc1 +- 2.6.30-rc1 +- linux-2.6-hwmon-atk0110.patch: drop +- CONFIG_DETECT_HUNG_TASK=y +- # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set + +* Tue Apr 7 2009 Roland McGrath +- utrace update, drop unfinished utrace-ftrace + +* Tue Apr 07 2009 Kyle McMartin +- Linux 2.6.29-git15 +- EXT3_DEFAULTS_TO_ORDERED on for now. +- X86_X2APIC enabled. +- LEDS_LP5521, LEDS_BD2802 off... look not generally relevant. +- LIBFCOE on. 
+ +* Tue Apr 07 2009 Dave Jones +- Enable CONFIG_CIFS_STATS (#494545) + +* Mon Apr 06 2009 Kyle McMartin +- linux-2.6-execshield.patch: rebase for 2.6.30 + +* Mon Apr 06 2009 Kyle McMartin +- Linux 2.6.29-git13 +- drop patches merged upstream: + - fix-ppc-debug_kmap_atomic.patch + - fix-staging-at76.patch + - linux-2.6-acpi-video-didl-intel-outputs.patch + - linux-2.6-acpi-strict-resources.patch + - linux-2.6-sony-laptop-rfkill.patch + - linux-2.6-btrfs-fix-umount-hang.patch + - linux-2.6-fiemap-header-install.patch + - linux-2.6-debug-dma-api.patch + - dma-api-debug-fixes.patch + - linux-2.6-ext4-flush-on-close.patch + - linux-2.6-relatime-by-default.patch + - linux-2.6-pci-sysfs-remove-id.patch + - linux-2.6-scsi-cpqarray-set-master.patch + - alsa-rewrite-hw_ptr-updaters.patch + - alsa-pcm-always-reset-invalid-position.patch + - alsa-pcm-fix-delta-calc-at-overlap.patch + - alsa-pcm-safer-boundary-checks.patch + - linux-2.6-input-hid-extra-gamepad.patch + - linux-2.6-ipw2x00-age-scan-results-on-resume.patch + - linux-2.6-dropwatch-protocol.patch + - linux-2.6-net-fix-gro-bug.patch + - linux-2.6-net-fix-another-gro-bug.patch + - linux-2.6-net-xfrm-fix-spin-unlock.patch + - linux-2.6.29-pat-change-is_linear_pfn_mapping-to-not-use-vm_pgoff.patch + - linux-2.6.29-pat-pci-change-prot-for-inherit.patch + +* Thu Apr 02 2009 Josef Bacik +- linux-2.6-btrfs-fix-umount-hang.patch: fix umount hang on btrfs + +* Thu Apr 02 2009 Kyle McMartin +- fix-ppc-debug_kmap_atomic.patch: fix build failures on ppc. + +* Thu Apr 02 2009 Kyle McMartin +- Linux 2.6.29-git9 + +* Tue Mar 31 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: add +- at76-netdev_ops.patch: add + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git8 +- linux-2.6-net-fix-another-gro-bug.patch: upstream. + +* Tue Mar 31 2009 Eric Sandeen +- add fiemap.h to kernel-headers +- build ext4 (and jbd2 and crc16) into the kernel + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git7 +- fix-staging-at76.patch: pull patch from linux-wireless to fix... + +* Mon Mar 30 2009 Kyle McMartin 2.6.30-0.28.rc0.git6 +- Linux 2.6.29-git6 +- Bunch of stuff disabled, most merged, some needs rebasing. + +* Mon Mar 30 2009 Chuck Ebbert +- Make the .shared-srctree file a list so more than two checkouts + can share source files. + +* Mon Mar 30 2009 Chuck Ebbert +- Separate PAT fixes that are headed for -stable from our out-of-tree ones. + +* Mon Mar 30 2009 Dave Jones +- Make io schedulers selectable at boot time again. (#492817) + +* Mon Mar 30 2009 Dave Jones +- Add a strict-devmem=0 boot argument (#492803) + +* Mon Mar 30 2009 Adam Jackson +- linux-2.6.29-pat-fixes.patch: Fix PAT/GTT interaction + +* Mon Mar 30 2009 Mauro Carvalho Chehab +- some fixes of troubles caused by v4l2 subdev conversion + +* Mon Mar 30 2009 Mark McLoughlin 2.6.29-21 +- Fix guest->remote network stall with virtio/GSO (#490266) + +* Mon Mar 30 2009 Ben Skeggs +- drm-nouveau.patch + - rewrite nouveau PCI(E) GART functions, should fix rh#492492 + - kms: kernel option to allow dual-link dvi + - modinfo descriptions for module parameters + +* Sun Mar 29 2009 Mauro Carvalho Chehab +- more v4l/dvb updates: v4l subdev conversion and some driver improvements + +* Sun Mar 29 2009 Chuck Ebbert +- More fixes for ALSA hardware pointer updating. 
+ +* Sat Mar 28 2009 Mauro Carvalho Chehab +- linux-2.6-revert-dvb-net-kabi-change.patch: attempt to fix dvb net breakage +- update v4l fixes patch to reflect what's ready for 2.6.30 +- update v4l devel patch to reflect what will be kept on linux-next for a while + +* Fri Mar 27 2009 Chuck Ebbert 2.6.29-16 +- Fix 2.6.29 networking lockups. +- Fix locking in net/xfrm/xfrm_state.c (#489764) + +* Fri Mar 27 2009 Ben Skeggs +- drm-nouveau.patch: do nothing for dac_{prepare,commit}, it's useless + and breaks some things in strange ways. + +* Fri Mar 27 2009 Ben Skeggs +- nv50: clear 0x1900/8 on init, possible fix for rh#492240 +- forcibly disable GEM also if KMS requested where not supported +- inform the user if we disable KMS because of it not being supported + +* Thu Mar 26 2009 Matthew Garrett +- linux-2.6-relatime-by-default.patch: Backport relatime code from 2.6.30 + +* Thu Mar 26 2009 Dave Jones +- Check for modesetting enabled before forcing mode on 915. (#490336) + +* Thu Mar 26 2009 Dave Jones +- Set kernel-PAE as default in grub. (#487578) + +* Thu Mar 26 2009 Dave Jones +- Enable CONFIG_MOUSE_PS2_ELANTECH (#492163) + +* Thu Mar 26 2009 Kyle McMartin +- linux-2.6-v4l-pvrusb2-fixes.patch: fix build for uncle steve. + +* Thu Mar 26 2009 Mauro Carvalho Chehab +- Move all 2.6.30 stuff into linux-2.6-v4l-dvb-fixes.patch, in + preparation for upstream pull; +- Added two new drivers: gspca sq905c and DVB Intel ce6230 +- Updated to the latest v4l-dvb drivers. + +* Wed Mar 25 2009 Mauro Carvalho Chehab +- remove duplicated Cinergy T2 entry at config-generic + +* Wed Mar 25 2009 Neil Horman +- Add dropmonitor/dropwatch protocol from 2.6.30 + +* Wed Mar 25 2009 Kyle McMartin +- alsa-rewrite-hw_ptr-updaters.patch: snd_pcm_update_hw_ptr() tries to + detect the unexpected hwptr jumps more strictly to avoid the position + mess-up, which often results in the bad quality I/O with pulseaudio. + +* Wed Mar 25 2009 Ben Skeggs +- drm-nouveau.patch: idle channels better before destroying them + +* Tue Mar 24 2009 Kyle McMartin +- Disable DMAR by default until suspend & resume is fixed. + +* Tue Mar 24 2009 Josef Bacik +- fsync replay fixes for btrfs + +* Mon Mar 23 2009 Dave Jones +- 2.6.29 + +### +# The following Emacs magic makes C-c C-e use UTC dates. 
+# Local Variables: +# rpm-change-log-uses-utc: t +# End: +### diff --git a/original/linux-2.6-btrfs-upstream.patch b/original/linux-2.6-btrfs-upstream.patch new file mode 100644 index 000000000..46ae7fff5 --- /dev/null +++ b/original/linux-2.6-btrfs-upstream.patch @@ -0,0 +1,10829 @@ +diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c +index f128427..3616042 100644 +--- a/fs/btrfs/acl.c ++++ b/fs/btrfs/acl.c +@@ -27,7 +27,7 @@ + #include "btrfs_inode.h" + #include "xattr.h" + +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + + static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) + { +@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { + .set = btrfs_xattr_acl_access_set, + }; + +-#else /* CONFIG_FS_POSIX_ACL */ ++#else /* CONFIG_BTRFS_FS_POSIX_ACL */ + + int btrfs_acl_chmod(struct inode *inode) + { +@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) + return 0; + } + +-#endif /* CONFIG_FS_POSIX_ACL */ ++#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ +diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c +index 019e8af..c0861e7 100644 +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -48,6 +48,9 @@ struct btrfs_worker_thread { + /* number of things on the pending list */ + atomic_t num_pending; + ++ /* reference counter for this struct */ ++ atomic_t refs; ++ + unsigned long sequence; + + /* protects the pending list. */ +@@ -61,6 +64,51 @@ struct btrfs_worker_thread { + }; + + /* ++ * btrfs_start_workers uses kthread_run, which can block waiting for memory ++ * for a very long time. It will actually throttle on page writeback, ++ * and so it may not make progress until after our btrfs worker threads ++ * process all of the pending work structs in their queue ++ * ++ * This means we can't use btrfs_start_workers from inside a btrfs worker ++ * thread that is used as part of cleaning dirty memory, which pretty much ++ * involves all of the worker threads. ++ * ++ * Instead we have a helper queue who never has more than one thread ++ * where we scheduler thread start operations. This worker_start struct ++ * is used to contain the work and hold a pointer to the queue that needs ++ * another worker. ++ */ ++struct worker_start { ++ struct btrfs_work work; ++ struct btrfs_workers *queue; ++}; ++ ++static void start_new_worker_func(struct btrfs_work *work) ++{ ++ struct worker_start *start; ++ start = container_of(work, struct worker_start, work); ++ btrfs_start_workers(start->queue, 1); ++ kfree(start); ++} ++ ++static int start_new_worker(struct btrfs_workers *queue) ++{ ++ struct worker_start *start; ++ int ret; ++ ++ start = kzalloc(sizeof(*start), GFP_NOFS); ++ if (!start) ++ return -ENOMEM; ++ ++ start->work.func = start_new_worker_func; ++ start->queue = queue; ++ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); ++ if (ret) ++ kfree(start); ++ return ret; ++} ++ ++/* + * helper function to move a thread onto the idle list after it + * has finished some requests. 
+ */ +@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; +- list_move(&worker->worker_list, &worker->workers->idle_list); ++ ++ /* the list may be empty if the worker is just starting */ ++ if (!list_empty(&worker->worker_list)) { ++ list_move(&worker->worker_list, ++ &worker->workers->idle_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } +@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; +- list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ ++ if (!list_empty(&worker->worker_list)) { ++ list_move_tail(&worker->worker_list, ++ &worker->workers->worker_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } + +-static noinline int run_ordered_completions(struct btrfs_workers *workers, +- struct btrfs_work *work) ++static void check_pending_worker_creates(struct btrfs_worker_thread *worker) + { ++ struct btrfs_workers *workers = worker->workers; + unsigned long flags; + ++ rmb(); ++ if (!workers->atomic_start_pending) ++ return; ++ ++ spin_lock_irqsave(&workers->lock, flags); ++ if (!workers->atomic_start_pending) ++ goto out; ++ ++ workers->atomic_start_pending = 0; ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) ++ goto out; ++ ++ workers->num_workers_starting += 1; ++ spin_unlock_irqrestore(&workers->lock, flags); ++ start_new_worker(workers); ++ return; ++ ++out: ++ spin_unlock_irqrestore(&workers->lock, flags); ++} ++ ++static noinline int run_ordered_completions(struct btrfs_workers *workers, ++ struct btrfs_work *work) ++{ + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + + while (1) { + if (!list_empty(&workers->prio_order_list)) { +@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + list_del(&work->order_list); + work->ordered_free(work); + } + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + return 0; + } + ++static void put_worker(struct btrfs_worker_thread *worker) ++{ ++ if (atomic_dec_and_test(&worker->refs)) ++ kfree(worker); ++} ++ ++static int try_worker_shutdown(struct btrfs_worker_thread *worker) ++{ ++ int freeit = 0; ++ ++ spin_lock_irq(&worker->lock); ++ spin_lock(&worker->workers->lock); ++ if (worker->workers->num_workers > 1 && ++ worker->idle && ++ !worker->working && ++ !list_empty(&worker->worker_list) && ++ list_empty(&worker->prio_pending) && ++ list_empty(&worker->pending) && ++ atomic_read(&worker->num_pending) == 0) { ++ freeit = 1; ++ list_del_init(&worker->worker_list); ++ worker->workers->num_workers--; ++ } ++ spin_unlock(&worker->workers->lock); ++ spin_unlock_irq(&worker->lock); ++ ++ if (freeit) ++ put_worker(worker); ++ return freeit; ++} ++ ++static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, ++ struct list_head *prio_head, ++ struct 
list_head *head) ++{ ++ struct btrfs_work *work = NULL; ++ struct list_head *cur = NULL; ++ ++ if(!list_empty(prio_head)) ++ cur = prio_head->next; ++ ++ smp_mb(); ++ if (!list_empty(&worker->prio_pending)) ++ goto refill; ++ ++ if (!list_empty(head)) ++ cur = head->next; ++ ++ if (cur) ++ goto out; ++ ++refill: ++ spin_lock_irq(&worker->lock); ++ list_splice_tail_init(&worker->prio_pending, prio_head); ++ list_splice_tail_init(&worker->pending, head); ++ ++ if (!list_empty(prio_head)) ++ cur = prio_head->next; ++ else if (!list_empty(head)) ++ cur = head->next; ++ spin_unlock_irq(&worker->lock); ++ ++ if (!cur) ++ goto out_fail; ++ ++out: ++ work = list_entry(cur, struct btrfs_work, list); ++ ++out_fail: ++ return work; ++} ++ + /* + * main loop for servicing work items + */ + static int worker_loop(void *arg) + { + struct btrfs_worker_thread *worker = arg; +- struct list_head *cur; ++ struct list_head head; ++ struct list_head prio_head; + struct btrfs_work *work; ++ ++ INIT_LIST_HEAD(&head); ++ INIT_LIST_HEAD(&prio_head); ++ + do { +- spin_lock_irq(&worker->lock); +-again_locked: ++again: + while (1) { +- if (!list_empty(&worker->prio_pending)) +- cur = worker->prio_pending.next; +- else if (!list_empty(&worker->pending)) +- cur = worker->pending.next; +- else ++ ++ ++ work = get_next_work(worker, &prio_head, &head); ++ if (!work) + break; + +- work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; +- spin_unlock_irq(&worker->lock); + + work->func(work); + +@@ -175,9 +329,13 @@ again_locked: + */ + run_ordered_completions(worker->workers, work); + +- spin_lock_irq(&worker->lock); +- check_idle_worker(worker); ++ check_pending_worker_creates(worker); ++ + } ++ ++ spin_lock_irq(&worker->lock); ++ check_idle_worker(worker); ++ + if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); +@@ -216,8 +374,10 @@ again_locked: + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending) || +- !list_empty(&worker->prio_pending)) +- goto again_locked; ++ !list_empty(&worker->prio_pending)) { ++ spin_unlock_irq(&worker->lock); ++ goto again; ++ } + + /* + * this makes sure we get a wakeup when someone +@@ -226,8 +386,13 @@ again_locked: + worker->working = 0; + spin_unlock_irq(&worker->lock); + +- if (!kthread_should_stop()) +- schedule(); ++ if (!kthread_should_stop()) { ++ schedule_timeout(HZ * 120); ++ if (!worker->working && ++ try_worker_shutdown(worker)) { ++ return 0; ++ } ++ } + } + __set_current_state(TASK_RUNNING); + } +@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers) + { + struct list_head *cur; + struct btrfs_worker_thread *worker; ++ int can_stop; + ++ spin_lock_irq(&workers->lock); + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); +- kthread_stop(worker->task); +- list_del(&worker->worker_list); +- kfree(worker); ++ ++ atomic_inc(&worker->refs); ++ workers->num_workers -= 1; ++ if (!list_empty(&worker->worker_list)) { ++ list_del_init(&worker->worker_list); ++ put_worker(worker); ++ can_stop = 1; ++ } else ++ can_stop = 0; ++ spin_unlock_irq(&workers->lock); ++ if (can_stop) ++ kthread_stop(worker->task); ++ spin_lock_irq(&workers->lock); ++ put_worker(worker); + } ++ spin_unlock_irq(&workers->lock); + return 0; + } + + /* + * simple 
init on struct btrfs_workers + */ +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_helper) + { + workers->num_workers = 0; ++ workers->num_workers_starting = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); + spin_lock_init(&workers->lock); ++ spin_lock_init(&workers->order_lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; ++ workers->atomic_start_pending = 0; ++ workers->atomic_worker_start = async_helper; + } + + /* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++static int __btrfs_start_workers(struct btrfs_workers *workers, ++ int num_workers) + { + struct btrfs_worker_thread *worker; + int ret = 0; +@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); ++ + atomic_set(&worker->num_pending, 0); ++ atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, +@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + kfree(worker); + goto fail; + } +- + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; ++ workers->num_workers_starting--; ++ WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + } + return 0; +@@ -316,6 +504,14 @@ fail: + return ret; + } + ++int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++{ ++ spin_lock_irq(&workers->lock); ++ workers->num_workers_starting += num_workers; ++ spin_unlock_irq(&workers->lock); ++ return __btrfs_start_workers(workers, num_workers); ++} ++ + /* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. 
This can return null if we aren't yet at the thread +@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + struct list_head *next; +- int enforce_min = workers->num_workers < workers->max_workers; ++ int enforce_min; ++ ++ enforce_min = (workers->num_workers + workers->num_workers_starting) < ++ workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the +@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); +- atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) +@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + unsigned long flags; ++ struct list_head *fallback; + + again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); +- spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { +- spin_lock_irqsave(&workers->lock, flags); +- if (workers->num_workers >= workers->max_workers) { +- struct list_head *fallback = NULL; +- /* +- * we have failed to find any workers, just +- * return the force one +- */ +- if (!list_empty(&workers->worker_list)) +- fallback = workers->worker_list.next; +- if (!list_empty(&workers->idle_list)) +- fallback = workers->idle_list.next; +- BUG_ON(!fallback); +- worker = list_entry(fallback, +- struct btrfs_worker_thread, worker_list); +- spin_unlock_irqrestore(&workers->lock, flags); ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) { ++ goto fallback; ++ } else if (workers->atomic_worker_start) { ++ workers->atomic_start_pending = 1; ++ goto fallback; + } else { ++ workers->num_workers_starting++; + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ +- btrfs_start_workers(workers, 1); ++ __btrfs_start_workers(workers, 1); + goto again; + } + } ++ goto found; ++ ++fallback: ++ fallback = NULL; ++ /* ++ * we have failed to find any workers, just ++ * return the first one we can find. 
++ */ ++ if (!list_empty(&workers->worker_list)) ++ fallback = workers->worker_list.next; ++ if (!list_empty(&workers->idle_list)) ++ fallback = workers->idle_list.next; ++ BUG_ON(!fallback); ++ worker = list_entry(fallback, ++ struct btrfs_worker_thread, worker_list); ++found: ++ /* ++ * this makes sure the worker doesn't exit before it is placed ++ * onto a busy/idle list ++ */ ++ atomic_inc(&worker->num_pending); ++ spin_unlock_irqrestore(&workers->lock, flags); + return worker; + } + +@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work) + spin_lock(&worker->workers->lock); + worker->idle = 0; + list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ &worker->workers->worker_list); + spin_unlock(&worker->workers->lock); + } + if (!worker->working) { +@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work) + worker->working = 1; + } + +- spin_unlock_irqrestore(&worker->lock, flags); + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); + out: + + return 0; +@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + + worker = find_worker(workers); + if (workers->ordered) { +- spin_lock_irqsave(&workers->lock, flags); ++ /* ++ * you're not allowed to do ordered queues from an ++ * interrupt handler ++ */ ++ spin_lock(&workers->order_lock); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + } else { + INIT_LIST_HEAD(&work->order_list); + } +@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); +- atomic_inc(&worker->num_pending); + check_busy_worker(worker); + + /* +@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + wake = 1; + worker->working = 1; + +- spin_unlock_irqrestore(&worker->lock, flags); +- + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); ++ + out: + return 0; + } +diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h +index 1b511c1..5077746 100644 +--- a/fs/btrfs/async-thread.h ++++ b/fs/btrfs/async-thread.h +@@ -64,6 +64,8 @@ struct btrfs_workers { + /* current number of running workers */ + int num_workers; + ++ int num_workers_starting; ++ + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + +@@ -73,6 +75,16 @@ struct btrfs_workers { + /* force completions in the order they were queued */ + int ordered; + ++ /* more workers required, but in an interrupt handler */ ++ int atomic_start_pending; ++ ++ /* ++ * are we allowed to sleep while starting workers or are we required ++ * to start them at a later time? If we can't sleep, this indicates ++ * which queue we need to use to schedule thread creation. ++ */ ++ struct btrfs_workers *atomic_worker_start; ++ + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. 
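
The atomic_start_pending and atomic_worker_start fields added above exist because the queue-work path can run at interrupt time, where spawning a kthread is not allowed: the queueing code only records that another worker is wanted, and a helper pool that is allowed to sleep performs the actual start later (see the find_worker() change earlier in this patch). What follows is a minimal, self-contained userspace sketch of that deferral idea only, written with POSIX threads instead of kthreads; the names (struct workers, request_worker_atomic, run_pending_starts, deferred_start.c) are hypothetical illustrations, not the btrfs implementation.

#include <pthread.h>
#include <stdio.h>

struct workers {
    pthread_mutex_t lock;           /* in the kernel this role is played by a spinlock */
    int num_workers;                /* threads actually running */
    int atomic_start_pending;       /* a start was requested from a no-sleep path */
};

static void *worker_loop(void *arg)
{
    /* a real pool would pull work items off a list here */
    (void)arg;
    return NULL;
}

/* May sleep: create the thread and account for it. */
static void start_one_worker(struct workers *w)
{
    pthread_t t;

    if (pthread_create(&t, NULL, worker_loop, w) == 0) {
        pthread_detach(t);
        pthread_mutex_lock(&w->lock);
        w->num_workers++;
        pthread_mutex_unlock(&w->lock);
    }
}

/*
 * Stand-in for a caller that must not sleep (e.g. interrupt context):
 * it only records that another worker is wanted instead of creating
 * the thread here.
 */
static void request_worker_atomic(struct workers *w)
{
    pthread_mutex_lock(&w->lock);
    w->atomic_start_pending = 1;
    pthread_mutex_unlock(&w->lock);
}

/* Run later from a context that is allowed to sleep (the helper pool). */
static void run_pending_starts(struct workers *w)
{
    int pending;

    pthread_mutex_lock(&w->lock);
    pending = w->atomic_start_pending;
    w->atomic_start_pending = 0;
    pthread_mutex_unlock(&w->lock);

    if (pending)
        start_one_worker(w);
}

int main(void)
{
    struct workers w = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

    request_worker_atomic(&w);      /* the "can't sleep" path defers */
    run_pending_starts(&w);         /* the helper catches up later */

    pthread_mutex_lock(&w.lock);
    printf("workers running: %d\n", w.num_workers);
    pthread_mutex_unlock(&w.lock);
    return 0;
}

Build with "cc -pthread deferred_start.c" and run; the point is only the split between a path that merely sets a pending flag and a later, sleep-capable path that does the real thread creation, which is the shape the generic_worker helper pool gives btrfs further down in this patch.
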
+@@ -90,6 +102,9 @@ struct btrfs_workers { + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + ++ /* lock for the ordered lists */ ++ spinlock_t order_lock; ++ + /* extra name for this worker, used for current->name */ + char *name; + }; +@@ -97,7 +112,8 @@ struct btrfs_workers { + int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); + int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); + int btrfs_stop_workers(struct btrfs_workers *workers); +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_starter); + int btrfs_requeue_work(struct btrfs_work *work); + void btrfs_set_work_high_prio(struct btrfs_work *work); + #endif +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index ea1ea0a..f6783a4 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -86,6 +86,12 @@ struct btrfs_inode { + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; ++ ++ /* ++ * log transid when this inode was last modified ++ */ ++ u64 last_sub_trans; ++ + /* + * transid that last logged this inode + */ +@@ -128,6 +134,16 @@ struct btrfs_inode { + u64 last_unlink_trans; + + /* ++ * Counters to keep track of the number of extent item's we may use due ++ * to delalloc and such. outstanding_extents is the number of extent ++ * items we think we'll end up using, and reserved_extents is the number ++ * of extent items we've reserved metadata for. ++ */ ++ spinlock_t accounting_lock; ++ int reserved_extents; ++ int outstanding_extents; ++ ++ /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the +@@ -138,6 +154,7 @@ struct btrfs_inode { + * of these. 
+ */ + unsigned ordered_data_close:1; ++ unsigned dummy_inode:1; + + struct inode vfs_inode; + }; +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index 9d8ba4d..a11a320 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || +@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 3fdcc05..ec96f3a 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, + int split; + int num_doubles = 0; + ++ l = path->nodes[0]; ++ slot = path->slots[0]; ++ if (extend && data_size + btrfs_item_size_nr(l, slot) + ++ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) ++ return -EOVERFLOW; ++ + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 837435c..e5dd628 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -114,6 +114,10 @@ struct btrfs_ordered_sum; + */ + #define BTRFS_DEV_ITEMS_OBJECTID 1ULL + ++#define BTRFS_BTREE_INODE_OBJECTID 1 ++ ++#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 ++ + /* + * we can actually store much bigger names, but lets not confuse the rest + * of linux +@@ -670,21 +674,29 @@ struct btrfs_space_info { + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ +- +- /* delalloc accounting */ +- u64 bytes_delalloc; /* number of bytes reserved for allocation, +- this space is not necessarily reserved yet +- by the allocator */ ++ u64 bytes_super; /* total bytes reserved for the super blocks */ ++ u64 bytes_root; /* the number of bytes needed to commit a ++ transaction */ + u64 bytes_may_use; /* number of bytes that may be used for +- delalloc */ ++ delalloc/allocations */ ++ u64 bytes_delalloc; /* number of bytes currently reserved for ++ delayed allocation */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ ++ int force_delalloc; /* make people start doing filemap_flush until ++ we're under a threshold */ + + struct list_head list; + ++ /* for controlling how we free up space for allocations */ ++ wait_queue_head_t allocate_wait; ++ wait_queue_head_t flush_wait; ++ int allocating_chunk; ++ int flushing; ++ + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t 
lock; +@@ -726,6 +738,15 @@ enum btrfs_caching_type { + BTRFS_CACHE_FINISHED = 2, + }; + ++struct btrfs_caching_control { ++ struct list_head list; ++ struct mutex mutex; ++ wait_queue_head_t wait; ++ struct btrfs_block_group_cache *block_group; ++ u64 progress; ++ atomic_t count; ++}; ++ + struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; +@@ -733,6 +754,7 @@ struct btrfs_block_group_cache { + spinlock_t lock; + u64 pinned; + u64 reserved; ++ u64 bytes_super; + u64 flags; + u64 sectorsize; + int extents_thresh; +@@ -742,8 +764,9 @@ struct btrfs_block_group_cache { + int dirty; + + /* cache tracking stuff */ +- wait_queue_head_t caching_q; + int cached; ++ struct btrfs_caching_control *caching_ctl; ++ u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + +@@ -782,13 +805,16 @@ struct btrfs_fs_info { + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; ++ ++ spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + +- struct extent_io_tree pinned_extents; ++ struct extent_io_tree freed_extents[2]; ++ struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; +@@ -822,11 +848,7 @@ struct btrfs_fs_info { + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; +- struct mutex drop_mutex; + struct mutex volume_mutex; +- struct mutex tree_reloc_mutex; +- struct rw_semaphore extent_commit_sem; +- + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make +@@ -835,10 +857,16 @@ struct btrfs_fs_info { + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; ++ struct rw_semaphore extent_commit_sem; ++ ++ struct rw_semaphore subvol_sem; ++ ++ struct srcu_struct subvol_srcu; + + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; ++ struct list_head caching_block_groups; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; +@@ -882,6 +910,7 @@ struct btrfs_fs_info { + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ ++ struct btrfs_workers generic_worker; + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; +@@ -889,6 +918,7 @@ struct btrfs_fs_info { + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; ++ struct btrfs_workers enospc_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. 
It happens +@@ -979,7 +1009,10 @@ struct btrfs_root { + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; ++ unsigned long last_log_commit; + unsigned long log_batch; ++ pid_t log_start_pid; ++ bool log_multiple_pids; + + u64 objectid; + u64 last_trans; +@@ -996,10 +1029,12 @@ struct btrfs_root { + u32 stripesize; + + u32 type; +- u64 highest_inode; +- u64 last_inode_alloc; ++ ++ u64 highest_objectid; + int ref_cows; + int track_dirty; ++ int in_radix; ++ + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; +@@ -1118,6 +1153,7 @@ struct btrfs_root { + #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) + #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) + #define BTRFS_MOUNT_NOSSD (1 << 9) ++#define BTRFS_MOUNT_DISCARD (1 << 10) + + #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) + #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); + int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); + int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin); ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num, int reserved); + int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); + int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, +@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset); + + int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root); + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin); ++ struct btrfs_root *root); + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); + int btrfs_free_block_groups(struct btrfs_fs_info *info); + int btrfs_read_block_groups(struct btrfs_root *root); ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); + int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, +@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); + void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); + void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +-int btrfs_check_metadata_free_space(struct btrfs_root *root); ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); + int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); + void btrfs_free_reserved_data_space(struct btrfs_root *root, +@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode 
*inode, + u64 bytes); + void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info); + /* ctree.c */ + int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct extent_buffer *parent); + /* root-item.c */ + int btrfs_find_root_ref(struct btrfs_root *tree_root, +- struct btrfs_path *path, +- u64 root_id, u64 ref_id); ++ struct btrfs_path *path, ++ u64 root_id, u64 ref_id); + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, ++ const char *name, int name_len); ++int btrfs_del_root_ref(struct btrfs_trans_handle *trans, ++ struct btrfs_root *tree_root, ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); + int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root); + int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); + /* dir-item.c */ +@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len); + struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + + /* inode-map.c */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, +@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len); + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, +@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); + int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint); + int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); +@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait); + void btrfs_dirty_inode(struct 
inode *inode); + struct inode *btrfs_alloc_inode(struct super_block *sb); + void btrfs_destroy_inode(struct inode *inode); ++void btrfs_drop_inode(struct inode *inode); + int btrfs_init_cachep(void); + void btrfs_destroy_cachep(void); + long btrfs_ioctl_trans_end(struct file *file); +@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); + int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); + void btrfs_orphan_cleanup(struct btrfs_root *root); + int btrfs_cont_expand(struct inode *inode, loff_t size); ++int btrfs_invalidate_inodes(struct btrfs_root *root); ++extern const struct dentry_operations btrfs_dentry_operations; + + /* ioctl.c */ + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +@@ -2290,7 +2348,7 @@ extern struct file_operations btrfs_file_operations; + int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_block); ++ u64 inline_limit, u64 *hint_block, int drop_cache); + int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); + int btrfs_sync_fs(struct super_block *sb, int wait); + + /* acl.c */ +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + int btrfs_check_acl(struct inode *inode, int mask); + #else + #define btrfs_check_acl NULL +diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c +index 1d70236..f3a6075 100644 +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + return btrfs_match_dir_item_name(root, path, name, name_len); + } + ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u32 nritems; ++ int ret; ++ ++ key.objectid = dirid; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = 0; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ ++ while (1) { ++ if (path->slots[0] >= nritems) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ if (ret > 0) ++ break; ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ continue; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) ++ break; ++ ++ di = btrfs_match_dir_item_name(root, path, name, name_len); ++ if (di) ++ return di; ++ ++ path->slots[0]++; ++ } ++ return NULL; ++} ++ + struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index e83be2e..d4132aa 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -41,6 +41,7 @@ + + static struct extent_io_ops btree_extent_io_ops; + static void end_workqueue_fn(struct btrfs_work *work); ++static void free_fs_root(struct btrfs_root *root); + + static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); + +@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode, + struct extent_map *em; + int ret; + +- 
spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + goto out; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { +@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; +@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +@@ -828,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf) + int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) + { + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, +- buf->start, buf->start + buf->len - 1); ++ buf->start >> PAGE_CACHE_SHIFT, ++ (buf->start + buf->len - 1) >> ++ PAGE_CACHE_SHIFT); + } + + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, +@@ -895,8 +898,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; +- root->highest_inode = 0; +- root->last_inode_alloc = 0; ++ root->highest_objectid = 0; + root->name = NULL; + root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; +@@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; ++ root->last_log_commit = 0; + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + +@@ -952,14 +955,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); ++ if (ret > 0) ++ return -ENOENT; + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); +- root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); ++ root->commit_root = btrfs_root_node(root); + return 0; + } + +@@ -1085,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; ++ root->last_log_commit = 0; + return 0; + } + +@@ -1095,7 +1101,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; +- u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; +@@ -1110,7 +1115,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + kfree(root); + return ERR_PTR(ret); + } +- goto insert; ++ goto out; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, +@@ -1120,39 +1125,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, 
path, 0, 0); +- if (ret != 0) { +- if (ret > 0) +- ret = -ENOENT; +- goto out; ++ if (ret == 0) { ++ l = path->nodes[0]; ++ read_extent_buffer(l, &root->root_item, ++ btrfs_item_ptr_offset(l, path->slots[0]), ++ sizeof(root->root_item)); ++ memcpy(&root->root_key, location, sizeof(*location)); + } +- l = path->nodes[0]; +- read_extent_buffer(l, &root->root_item, +- btrfs_item_ptr_offset(l, path->slots[0]), +- sizeof(root->root_item)); +- memcpy(&root->root_key, location, sizeof(*location)); +- ret = 0; +-out: +- btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { +- kfree(root); ++ if (ret > 0) ++ ret = -ENOENT; + return ERR_PTR(ret); + } ++ + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); +-insert: +- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { ++out: ++ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + root->ref_cows = 1; +- ret = btrfs_find_highest_inode(root, &highest_inode); +- if (ret == 0) { +- root->highest_inode = highest_inode; +- root->last_inode_alloc = highest_inode; +- } +- } ++ + return root; + } + +@@ -1187,39 +1183,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; +- ++again: ++ spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); + if (root) + return root; + ++ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); ++ if (ret == 0) ++ ret = -ENOENT; ++ if (ret < 0) ++ return ERR_PTR(ret); ++ + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + ++ WARN_ON(btrfs_root_refs(&root->root_item) == 0); + set_anon_super(&root->anon_super, NULL); + ++ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); ++ if (ret) ++ goto fail; ++ ++ spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); ++ if (ret == 0) ++ root->in_radix = 1; ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ radix_tree_preload_end(); + if (ret) { +- free_extent_buffer(root->node); +- kfree(root); +- return ERR_PTR(ret); ++ if (ret == -EEXIST) { ++ free_fs_root(root); ++ goto again; ++ } ++ goto fail; + } +- if (!(fs_info->sb->s_flags & MS_RDONLY)) { +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root->root_key.objectid); +- BUG_ON(ret); ++ ++ ret = btrfs_find_dead_roots(fs_info->tree_root, ++ root->root_key.objectid); ++ WARN_ON(ret); ++ ++ if (!(fs_info->sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(root); +- } ++ + return root; ++fail: ++ free_fs_root(root); ++ return ERR_PTR(ret); + } + + struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) + { ++ return btrfs_read_fs_root_no_name(fs_info, location); ++#if 0 + struct btrfs_root *root; + int ret; + +@@ -1236,7 +1259,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#if 0 ++ + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); +@@ -1244,9 +1267,9 @@ struct btrfs_root 
*btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#endif + root->in_sysfs = 1; + return root; ++#endif + } + + static int btrfs_congested_fn(void *congested_data, int bdi_bits) +@@ -1325,9 +1348,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; +@@ -1359,8 +1382,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) + + err = bdi_register(bdi, NULL, "btrfs-%d", + atomic_inc_return(&btrfs_bdi_num)); +- if (err) ++ if (err) { ++ bdi_destroy(bdi); + return err; ++ } + + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->unplug_io_fn = btrfs_unplug_io_fn; +@@ -1450,9 +1475,12 @@ static int cleaner_kthread(void *arg) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(root); +- mutex_unlock(&root->fs_info->cleaner_mutex); ++ ++ if (!(root->fs_info->sb->s_flags & MS_RDONLY) && ++ mutex_trylock(&root->fs_info->cleaner_mutex)) { ++ btrfs_clean_old_snapshots(root); ++ mutex_unlock(&root->fs_info->cleaner_mutex); ++ } + + if (freezing(current)) { + refrigerator(); +@@ -1557,15 +1585,36 @@ struct btrfs_root *open_ctree(struct super_block *sb, + err = -ENOMEM; + goto fail; + } +- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); ++ ++ ret = init_srcu_struct(&fs_info->subvol_srcu); ++ if (ret) { ++ err = ret; ++ goto fail; ++ } ++ ++ ret = setup_bdi(fs_info, &fs_info->bdi); ++ if (ret) { ++ err = ret; ++ goto fail_srcu; ++ } ++ ++ fs_info->btree_inode = new_inode(sb); ++ if (!fs_info->btree_inode) { ++ err = -ENOMEM; ++ goto fail_bdi; ++ } ++ ++ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); ++ INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); ++ spin_lock_init(&fs_info->fs_roots_radix_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; +@@ -1584,12 +1633,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; +- if (setup_bdi(fs_info, &fs_info->bdi)) +- goto fail_bdi; +- fs_info->btree_inode = new_inode(sb); +- fs_info->btree_inode->i_ino = 1; +- fs_info->btree_inode->i_nlink = 1; +- fs_info->metadata_ratio = 8; ++ fs_info->metadata_ratio = 0; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); +@@ -1600,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + ++ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; ++ fs_info->btree_inode->i_nlink = 1; + /* + * we set the i_size on the btree inode to the max possible int. 
+ * the real end of the address space is determined by all of +@@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + ++ BTRFS_I(fs_info->btree_inode)->root = tree_root; ++ memset(&BTRFS_I(fs_info->btree_inode)->location, 0, ++ sizeof(struct btrfs_key)); ++ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; ++ insert_inode_hash(fs_info->btree_inode); ++ + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + +- extent_io_tree_init(&fs_info->pinned_extents, ++ extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping, GFP_NOFS); ++ extent_io_tree_init(&fs_info->freed_extents[1], ++ fs_info->btree_inode->i_mapping, GFP_NOFS); ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + +- BTRFS_I(fs_info->btree_inode)->root = tree_root; +- memset(&BTRFS_I(fs_info->btree_inode)->location, 0, +- sizeof(struct btrfs_key)); +- insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); +- mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); +- mutex_init(&fs_info->tree_reloc_mutex); + init_rwsem(&fs_info->extent_commit_sem); ++ init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); +@@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb, + goto fail_iput; + } + +- /* +- * we need to start all the end_io workers up front because the +- * queue work function gets called at interrupt time, and so it +- * cannot dynamically grow. 
+- */ ++ btrfs_init_workers(&fs_info->generic_worker, ++ "genwork", 1, NULL); ++ + btrfs_init_workers(&fs_info->workers, "worker", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, +- fs_info->thread_pool_size)); ++ fs_info->thread_pool_size), ++ &fs_info->generic_worker); ++ btrfs_init_workers(&fs_info->enospc_workers, "enospc", ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the +@@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + +- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); ++ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_workers, "endio", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_write_workers, +- "endio-meta-write", fs_info->thread_pool_size); ++ "endio-meta-write", fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very +@@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + +- fs_info->endio_write_workers.idle_thresh = 64; +- fs_info->endio_meta_write_workers.idle_thresh = 64; ++ fs_info->endio_write_workers.idle_thresh = 2; ++ fs_info->endio_meta_write_workers.idle_thresh = 2; + + btrfs_start_workers(&fs_info->workers, 1); ++ btrfs_start_workers(&fs_info->generic_worker, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); +- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_write_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_write_workers, +- fs_info->thread_pool_size); ++ btrfs_start_workers(&fs_info->endio_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); ++ btrfs_start_workers(&fs_info->endio_write_workers, 1); ++ btrfs_start_workers(&fs_info->enospc_workers, 1); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, +@@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, + } + } + ++ ret = btrfs_find_orphan_roots(tree_root); ++ BUG_ON(ret); ++ + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_recover_relocation(tree_root); + BUG_ON(ret); +@@ -1959,6 +2020,7 @@ fail_chunk_root: + free_extent_buffer(chunk_root->node); + 
free_extent_buffer(chunk_root->commit_root); + fail_sb_buffer: ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -1967,6 +2029,7 @@ fail_sb_buffer: + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ btrfs_stop_workers(&fs_info->enospc_workers); + fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +@@ -1975,6 +2038,8 @@ fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + fail_bdi: + bdi_destroy(&fs_info->bdi); ++fail_srcu: ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + fail: + kfree(extent_root); + kfree(tree_root); +@@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans, + + int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) + { +- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); ++ spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ ++ if (btrfs_root_refs(&root->root_item) == 0) ++ synchronize_srcu(&fs_info->subvol_srcu); ++ ++ free_fs_root(root); ++ return 0; ++} ++ ++static void free_fs_root(struct btrfs_root *root) ++{ ++ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } +- if (root->node) +- free_extent_buffer(root->node); +- if (root->commit_root) +- free_extent_buffer(root->commit_root); ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); +- return 0; + } + + static int del_fs_roots(struct btrfs_fs_info *fs_info) +@@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) + struct btrfs_root *gang[8]; + int i; + ++ while (!list_empty(&fs_info->dead_roots)) { ++ gang[0] = list_entry(fs_info->dead_roots.next, ++ struct btrfs_root, root_list); ++ list_del(&gang[0]->root_list); ++ ++ if (gang[0]->in_radix) { ++ btrfs_free_fs_root(fs_info, gang[0]); ++ } else { ++ free_extent_buffer(gang[0]->node); ++ free_extent_buffer(gang[0]->commit_root); ++ kfree(gang[0]); ++ } ++ } ++ + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, +@@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) + root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root_objectid); +- BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + } + root_objectid++; +@@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root) + free_extent_buffer(root->fs_info->csum_root->commit_root); + + btrfs_free_block_groups(root->fs_info); +- btrfs_free_pinned_extents(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root) + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ 
btrfs_stop_workers(&fs_info->enospc_workers); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); +diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c +index 9596b40..ba5c3fd 100644 +--- a/fs/btrfs/export.c ++++ b/fs/btrfs/export.c +@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + +- fid->objectid = BTRFS_I(inode)->location.objectid; ++ fid->objectid = inode->i_ino; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + +@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + } + + static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, +- u64 root_objectid, u32 generation) ++ u64 root_objectid, u32 generation, ++ int check_generation) + { ++ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_root *root; ++ struct dentry *dentry; + struct inode *inode; + struct btrfs_key key; ++ int index; ++ int err = 0; ++ ++ if (objectid < BTRFS_FIRST_FREE_OBJECTID) ++ return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + +- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ index = srcu_read_lock(&fs_info->subvol_srcu); ++ ++ root = btrfs_read_fs_root_no_name(fs_info, &key); ++ if (IS_ERR(root)) { ++ err = PTR_ERR(root); ++ goto fail; ++ } ++ ++ if (btrfs_root_refs(&root->root_item) == 0) { ++ err = -ENOENT; ++ goto fail; ++ } + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root); +- if (IS_ERR(inode)) +- return (void *)inode; ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto fail; ++ } ++ ++ srcu_read_unlock(&fs_info->subvol_srcu, index); + +- if (generation != inode->i_generation) { ++ if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + +- return d_obtain_alias(inode); ++ dentry = d_obtain_alias(inode); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ srcu_read_unlock(&fs_info->subvol_srcu, index); ++ return ERR_PTR(err); + } + + static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, +@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + objectid = fid->parent_objectid; + generation = fid->parent_gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, +@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + root_objectid = fid->root_objectid; + generation = fid->gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_get_parent(struct dentry *child) + { + struct inode *dir = child->d_inode; ++ static struct dentry *dentry; + struct btrfs_root *root = BTRFS_I(dir)->root; +- struct btrfs_key key; + struct btrfs_path *path; + struct 
extent_buffer *leaf; +- int slot; +- u64 objectid; ++ struct btrfs_root_ref *ref; ++ struct btrfs_key key; ++ struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + +- key.objectid = dir->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); +- key.offset = (u64)-1; ++ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_BACKREF_KEY; ++ key.offset = (u64)-1; ++ root = root->fs_info->tree_root; ++ } else { ++ key.objectid = dir->i_ino; ++ key.type = BTRFS_INODE_REF_KEY; ++ key.offset = (u64)-1; ++ } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) { +- /* Error */ +- btrfs_free_path(path); +- return ERR_PTR(ret); ++ if (ret < 0) ++ goto fail; ++ ++ BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = -ENOENT; ++ goto fail; + } ++ ++ path->slots[0]--; + leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (ret) { +- /* btrfs_search_slot() returns the slot where we'd want to +- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. +- The _real_ backref, telling us what the parent inode +- _actually_ is, will be in the slot _before_ the one +- that btrfs_search_slot() returns. */ +- if (!slot) { +- /* Unless there is _no_ key in the tree before... */ +- btrfs_free_path(path); +- return ERR_PTR(-EIO); +- } +- slot--; ++ ++ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); ++ if (found_key.objectid != key.objectid || found_key.type != key.type) { ++ ret = -ENOENT; ++ goto fail; + } + +- btrfs_item_key_to_cpu(leaf, &key, slot); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ key.objectid = btrfs_root_ref_dirid(leaf, ref); ++ } else { ++ key.objectid = found_key.offset; ++ } + btrfs_free_path(path); + +- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) +- return ERR_PTR(-EINVAL); +- +- objectid = key.offset; +- +- /* If we are already at the root of a subvol, return the real root */ +- if (objectid == dir->i_ino) +- return dget(dir->i_sb->s_root); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ return btrfs_get_dentry(root->fs_info->sb, key.objectid, ++ found_key.offset, 0, 0); ++ } + +- /* Build a new key for the inode item */ +- key.objectid = objectid; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; +- +- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ btrfs_free_path(path); ++ return ERR_PTR(ret); + } + + const struct export_operations btrfs_export_ops = { +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 72a2b9c..c56f916 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -32,12 +32,12 @@ + #include "locking.h" + #include "free-space-cache.h" + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve); + static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct 
btrfs_trans_handle *trans, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); +- + static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); ++static int pin_down_bytes(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, ++ struct extent_buffer **must_clean); ++static int find_next_key(struct btrfs_path *path, int level, ++ struct btrfs_key *key); ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups); + + static noinline int + block_group_cache_done(struct btrfs_block_group_cache *cache) +@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + return ret; + } + +-/* +- * We always set EXTENT_LOCKED for the super mirror extents so we don't +- * overwrite them, so those bits need to be unset. Also, if we are unmounting +- * with pinned extents still sitting there because we had a block group caching, +- * we need to clear those now, since we are done. +- */ +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info) ++static int add_excluded_extent(struct btrfs_root *root, ++ u64 start, u64 num_bytes) + { +- u64 start, end, last = 0; +- int ret; ++ u64 end = start + num_bytes - 1; ++ set_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ set_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ return 0; ++} + +- while (1) { +- ret = find_first_extent_bit(&info->pinned_extents, last, +- &start, &end, +- EXTENT_LOCKED|EXTENT_DIRTY); +- if (ret) +- break; ++static void free_excluded_extents(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) ++{ ++ u64 start, end; + +- clear_extent_bits(&info->pinned_extents, start, end, +- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); +- last = end+1; +- } ++ start = cache->key.objectid; ++ end = start + cache->key.offset - 1; ++ ++ clear_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ clear_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); + } + +-static int remove_sb_from_cache(struct btrfs_root *root, +- struct btrfs_block_group_cache *cache) ++static int exclude_super_stripes(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) + { +- struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; +@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); ++ + while (nr--) { +- try_lock_extent(&fs_info->pinned_extents, +- logical[nr], +- logical[nr] + stripe_len - 1, GFP_NOFS); ++ cache->bytes_super += stripe_len; ++ ret = add_excluded_extent(root, logical[nr], ++ stripe_len); ++ BUG_ON(ret); + } ++ + kfree(logical); + } +- + return 0; + } + ++static struct btrfs_caching_control * ++get_caching_control(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *ctl; ++ ++ spin_lock(&cache->lock); ++ if (cache->cached != BTRFS_CACHE_STARTED) { ++ spin_unlock(&cache->lock); ++ return NULL; ++ } ++ ++ ctl = cache->caching_ctl; ++ atomic_inc(&ctl->count); ++ spin_unlock(&cache->lock); ++ return ctl; ++} ++ ++static void put_caching_control(struct btrfs_caching_control *ctl) ++{ ++ if (atomic_dec_and_test(&ctl->count)) ++ 
kfree(ctl); ++} ++ + /* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet +@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + int ret; + + while (start < end) { +- ret = find_first_extent_bit(&info->pinned_extents, start, ++ ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, +- EXTENT_DIRTY|EXTENT_LOCKED); ++ EXTENT_DIRTY | EXTENT_UPTODATE); + if (ret) + break; + +@@ -249,22 +283,27 @@ static int caching_kthread(void *data) + { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; +- u64 last = 0; ++ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; ++ struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; +- int ret = 0; +- struct btrfs_key key; + struct extent_buffer *leaf; +- int slot; ++ struct btrfs_key key; + u64 total_found = 0; +- +- BUG_ON(!fs_info); ++ u64 last = 0; ++ u32 nritems; ++ int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +- atomic_inc(&block_group->space_info->caching_threads); ++ exclude_super_stripes(extent_root, block_group); ++ spin_lock(&block_group->space_info->lock); ++ block_group->space_info->bytes_super += block_group->bytes_super; ++ spin_unlock(&block_group->space_info->lock); ++ + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); ++ + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent +@@ -277,74 +316,64 @@ static int caching_kthread(void *data) + + key.objectid = last; + key.offset = 0; +- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); ++ key.type = BTRFS_EXTENT_ITEM_KEY; + again: ++ mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + +- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); ++ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ + while (1) { + smp_mb(); +- if (block_group->fs_info->closing > 1) { ++ if (fs_info->closing > 1) { + last = (u64)-1; + break; + } + +- leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(leaf)) { +- ret = btrfs_next_leaf(fs_info->extent_root, path); +- if (ret < 0) +- goto err; +- else if (ret) ++ if (path->slots[0] < nritems) { ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ } else { ++ ret = find_next_key(path, 0, &key); ++ if (ret) + break; + +- if (need_resched() || +- btrfs_transaction_in_commit(fs_info)) { +- leaf = path->nodes[0]; +- +- /* this shouldn't happen, but if the +- * leaf is empty just move on. +- */ +- if (btrfs_header_nritems(leaf) == 0) +- break; +- /* +- * we need to copy the key out so that +- * we are sure the next search advances +- * us forward in the btree. 
+- */ +- btrfs_item_key_to_cpu(leaf, &key, 0); +- btrfs_release_path(fs_info->extent_root, path); +- up_read(&fs_info->extent_commit_sem); ++ caching_ctl->progress = last; ++ btrfs_release_path(extent_root, path); ++ up_read(&fs_info->extent_commit_sem); ++ mutex_unlock(&caching_ctl->mutex); ++ if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); +- goto again; +- } ++ else ++ cond_resched(); ++ goto again; ++ } + ++ if (key.objectid < block_group->key.objectid) { ++ path->slots[0]++; + continue; + } +- btrfs_item_key_to_cpu(leaf, &key, slot); +- if (key.objectid < block_group->key.objectid) +- goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + +- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { ++ if (key.type == BTRFS_EXTENT_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + last = key.objectid + key.offset; +- } + +- if (total_found > (1024 * 1024 * 2)) { +- total_found = 0; +- wake_up(&block_group->caching_q); ++ if (total_found > (1024 * 1024 * 2)) { ++ total_found = 0; ++ wake_up(&caching_ctl->wait); ++ } + } +-next: + path->slots[0]++; + } + ret = 0; +@@ -352,33 +381,65 @@ next: + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); ++ caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); ++ block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + + err: + btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); +- atomic_dec(&block_group->space_info->caching_threads); +- wake_up(&block_group->caching_q); + ++ free_excluded_extents(extent_root, block_group); ++ ++ mutex_unlock(&caching_ctl->mutex); ++ wake_up(&caching_ctl->wait); ++ ++ put_caching_control(caching_ctl); ++ atomic_dec(&block_group->space_info->caching_threads); + return 0; + } + + static int cache_block_group(struct btrfs_block_group_cache *cache) + { ++ struct btrfs_fs_info *fs_info = cache->fs_info; ++ struct btrfs_caching_control *caching_ctl; + struct task_struct *tsk; + int ret = 0; + ++ smp_mb(); ++ if (cache->cached != BTRFS_CACHE_NO) ++ return 0; ++ ++ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); ++ BUG_ON(!caching_ctl); ++ ++ INIT_LIST_HEAD(&caching_ctl->list); ++ mutex_init(&caching_ctl->mutex); ++ init_waitqueue_head(&caching_ctl->wait); ++ caching_ctl->block_group = cache; ++ caching_ctl->progress = cache->key.objectid; ++ /* one for caching kthread, one for caching block group list */ ++ atomic_set(&caching_ctl->count, 2); ++ + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); +- return ret; ++ kfree(caching_ctl); ++ return 0; + } ++ cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + ++ down_write(&fs_info->extent_commit_sem); ++ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); ++ up_write(&fs_info->extent_commit_sem); ++ ++ atomic_inc(&cache->space_info->caching_threads); ++ + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { +@@ -1507,22 +1568,22 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, + return ret; + } + +-#ifdef BIO_RW_DISCARD + static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) + { + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + } +-#endif + + static int btrfs_discard_extent(struct 
btrfs_root *root, u64 bytenr, + u64 num_bytes) + { +-#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + ++ if (!btrfs_test_opt(root, DISCARD)) ++ return 0; ++ + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); +@@ -1542,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + } + + return ret; +-#else +- return 0; +-#endif + } + + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, +@@ -1656,7 +1714,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, +@@ -1782,7 +1839,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, +@@ -1817,16 +1873,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { ++ int mark_free = 0; ++ struct extent_buffer *must_clean = NULL; ++ ++ ret = pin_down_bytes(trans, root, NULL, ++ node->bytenr, node->num_bytes, ++ head->is_data, 1, &must_clean); ++ if (ret > 0) ++ mark_free = 1; ++ ++ if (must_clean) { ++ clean_tree_block(NULL, root, must_clean); ++ btrfs_tree_unlock(must_clean); ++ free_extent_buffer(must_clean); ++ } + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } +- btrfs_update_pinned_extents(root, node->bytenr, +- node->num_bytes, 1); +- update_reserved_extents(root, node->bytenr, +- node->num_bytes, 0); ++ if (mark_free) { ++ ret = btrfs_free_reserved_extent(root, ++ node->bytenr, ++ node->num_bytes); ++ BUG_ON(ret); ++ } + } + mutex_unlock(&head->mutex); + return 0; +@@ -2691,60 +2763,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) + alloc_target); + } + ++static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) ++{ ++ u64 num_bytes; ++ int level; ++ ++ level = BTRFS_MAX_LEVEL - 2; ++ /* ++ * NOTE: these calculations are absolutely the worst possible case. ++ * This assumes that _every_ item we insert will require a new leaf, and ++ * that the tree has grown to its maximum level size. ++ */ ++ ++ /* ++ * for every item we insert we could insert both an extent item and a ++ * extent ref item. Then for ever item we insert, we will need to cow ++ * both the original leaf, plus the leaf to the left and right of it. ++ * ++ * Unless we are talking about the extent root, then we just want the ++ * number of items * 2, since we just need the extent item plus its ref. ++ */ ++ if (root == root->fs_info->extent_root) ++ num_bytes = num_items * 2; ++ else ++ num_bytes = (num_items + (2 * num_items)) * 3; ++ ++ /* ++ * num_bytes is total number of leaves we could need times the leaf ++ * size, and then for every leaf we could end up cow'ing 2 nodes per ++ * level, down to the leaf level. 
++ */ ++ num_bytes = (num_bytes * root->leafsize) + ++ (num_bytes * (level * 2)) * root->nodesize; ++ ++ return num_bytes; ++} ++ + /* +- * for now this just makes sure we have at least 5% of our metadata space free +- * for use. ++ * Unreserve metadata space for delalloc. If we have less reserved credits than ++ * we have extents, this function does nothing. + */ +-int btrfs_check_metadata_free_space(struct btrfs_root *root) ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) + { + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; +- u64 alloc_target, thresh; +- int committed = 0, ret; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + +-again: ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++ + spin_lock(&meta_sinfo->lock); +- if (!meta_sinfo->full) +- thresh = meta_sinfo->total_bytes * 80; +- else +- thresh = meta_sinfo->total_bytes * 95; ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ if (BTRFS_I(inode)->reserved_extents <= ++ BTRFS_I(inode)->outstanding_extents) { ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ spin_unlock(&meta_sinfo->lock); ++ return 0; ++ } ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ BTRFS_I(inode)->reserved_extents--; ++ BUG_ON(BTRFS_I(inode)->reserved_extents < 0); ++ ++ if (meta_sinfo->bytes_delalloc < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_delalloc = 0; ++ } else { ++ meta_sinfo->bytes_delalloc -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) ++{ ++ u64 thresh; ++ ++ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use; + ++ thresh = meta_sinfo->total_bytes - thresh; ++ thresh *= 80; + do_div(thresh, 100); ++ if (thresh <= meta_sinfo->bytes_delalloc) ++ meta_sinfo->force_delalloc = 1; ++ else ++ meta_sinfo->force_delalloc = 0; ++} + +- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + +- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { +- struct btrfs_trans_handle *trans; +- if (!meta_sinfo->full) { +- meta_sinfo->force_alloc = 1; +- spin_unlock(&meta_sinfo->lock); ++struct async_flush { ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; ++ struct btrfs_work work; ++}; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) +- return -ENOMEM; ++static noinline void flush_delalloc_async(struct btrfs_work *work) ++{ ++ struct async_flush *async; ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; + +- ret = do_chunk_alloc(trans, root->fs_info->extent_root, +- 2 * 1024 * 1024, alloc_target, 0); +- btrfs_end_transaction(trans, root); ++ async = container_of(work, struct async_flush, work); ++ root = async->root; ++ info = async->info; ++ ++ btrfs_start_delalloc_inodes(root); ++ wake_up(&info->flush_wait); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++ ++ kfree(async); ++} ++ ++static void wait_on_flush(struct btrfs_space_info *info) ++{ ++ DEFINE_WAIT(wait); ++ u64 used; ++ ++ while (1) { ++ 
prepare_to_wait(&info->flush_wait, &wait, ++ TASK_UNINTERRUPTIBLE); ++ spin_lock(&info->lock); ++ if (!info->flushing) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ ++ used = info->bytes_used + info->bytes_reserved + ++ info->bytes_pinned + info->bytes_readonly + ++ info->bytes_super + info->bytes_root + ++ info->bytes_may_use + info->bytes_delalloc; ++ if (used < info->total_bytes) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ spin_unlock(&info->lock); ++ schedule(); ++ } ++ finish_wait(&info->flush_wait, &wait); ++} ++ ++static void flush_delalloc(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct async_flush *async; ++ bool wait = false; ++ ++ spin_lock(&info->lock); ++ ++ if (!info->flushing) { ++ info->flushing = 1; ++ init_waitqueue_head(&info->flush_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_on_flush(info); ++ return; ++ } ++ ++ async = kzalloc(sizeof(*async), GFP_NOFS); ++ if (!async) ++ goto flush; ++ ++ async->root = root; ++ async->info = info; ++ async->work.func = flush_delalloc_async; ++ ++ btrfs_queue_worker(&root->fs_info->enospc_workers, ++ &async->work); ++ wait_on_flush(info); ++ return; ++ ++flush: ++ btrfs_start_delalloc_inodes(root); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++} ++ ++static int maybe_allocate_chunk(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct btrfs_super_block *disk_super = &root->fs_info->super_copy; ++ struct btrfs_trans_handle *trans; ++ bool wait = false; ++ int ret = 0; ++ u64 min_metadata; ++ u64 free_space; ++ ++ free_space = btrfs_super_total_bytes(disk_super); ++ /* ++ * we allow the metadata to grow to a max of either 5gb or 5% of the ++ * space in the volume. ++ */ ++ min_metadata = min((u64)5 * 1024 * 1024 * 1024, ++ div64_u64(free_space * 5, 100)); ++ if (info->total_bytes >= min_metadata) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (info->full) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (!info->allocating_chunk) { ++ info->force_alloc = 1; ++ info->allocating_chunk = 1; ++ init_waitqueue_head(&info->allocate_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_event(info->allocate_wait, ++ !info->allocating_chunk); ++ return 1; ++ } ++ ++ trans = btrfs_start_transaction(root, 1); ++ if (!trans) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = do_chunk_alloc(trans, root->fs_info->extent_root, ++ 4096 + 2 * 1024 * 1024, ++ info->flags, 0); ++ btrfs_end_transaction(trans, root); ++ if (ret) ++ goto out; ++out: ++ spin_lock(&info->lock); ++ info->allocating_chunk = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->allocate_wait); ++ ++ if (ret) ++ return 0; ++ return 1; ++} ++ ++/* ++ * Reserve metadata space for delalloc. 
++ */ ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int flushed = 0; ++ int force_delalloc; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ force_delalloc = meta_sinfo->force_delalloc; ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!flushed) ++ meta_sinfo->bytes_delalloc += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ flushed++; ++ ++ if (flushed == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ flushed++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (flushed == 2) { ++ filemap_flush(inode->i_mapping); ++ goto again; ++ } else if (flushed == 3) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_delalloc -= num_bytes; + spin_unlock(&meta_sinfo->lock); ++ printk(KERN_ERR "enospc, has %d, reserved %d\n", ++ BTRFS_I(inode)->outstanding_extents, ++ BTRFS_I(inode)->reserved_extents); ++ dump_space_info(meta_sinfo, 0, 0); ++ return -ENOSPC; ++ } + +- if (!committed) { +- committed = 1; +- trans = btrfs_join_transaction(root, 1); +- if (!trans) +- return -ENOMEM; +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- return ret; ++ BTRFS_I(inode)->reserved_extents++; ++ check_force_delalloc(meta_sinfo); ++ spin_unlock(&meta_sinfo->lock); ++ ++ if (!flushed && force_delalloc) ++ filemap_flush(inode->i_mapping); ++ ++ return 0; ++} ++ ++/* ++ * unreserve num_items number of items worth of metadata space. This needs to ++ * be paired with btrfs_reserve_metadata_space. ++ * ++ * NOTE: if you have the option, run this _AFTER_ you do a ++ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref ++ * oprations which will result in more used metadata, so we want to make sure we ++ * can do that without issue. ++ */ ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++ ++ spin_lock(&meta_sinfo->lock); ++ if (meta_sinfo->bytes_may_use < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_may_use = 0; ++ } else { ++ meta_sinfo->bytes_may_use -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++/* ++ * Reserve some metadata space for use. We'll calculate the worste case number ++ * of bytes that would be needed to modify num_items number of items. If we ++ * have space, fantastic, if not, you get -ENOSPC. 
Please call ++ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of ++ * items you reserved, since whatever metadata you needed should have already ++ * been allocated. ++ * ++ * This will commit the transaction to make more space if we don't have enough ++ * metadata space. THe only time we don't do this is if we're reserving space ++ * inside of a transaction, then we will just return -ENOSPC and it is the ++ * callers responsibility to handle it properly. ++ */ ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int retries = 0; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!retries) ++ meta_sinfo->bytes_may_use += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ retries++; ++ if (retries == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ retries++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (retries == 2) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_may_use -= num_bytes; ++ spin_unlock(&meta_sinfo->lock); ++ ++ dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } ++ ++ check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + return 0; +@@ -2764,13 +3224,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; ++ if (!data_sinfo) ++ goto alloc; ++ + again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - +- data_sinfo->bytes_may_use < bytes) { ++ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { + struct btrfs_trans_handle *trans; + + /* +@@ -2782,7 +3245,7 @@ again: + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); +- ++alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) +@@ -2794,12 +3257,17 @@ again: + btrfs_end_transaction(trans, root); + if (ret) + return ret; ++ ++ if (!data_sinfo) { ++ btrfs_set_inode_space_info(root, inode); ++ data_sinfo = BTRFS_I(inode)->space_info; ++ } + goto again; + } + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +- if (!committed) { ++ if (!committed && !root->fs_info->open_ioctl_trans) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) +@@ -2827,7 +3295,7 @@ again: + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + +- return btrfs_check_metadata_free_space(root); ++ return 0; + } + + /* +@@ -2926,17 +3394,15 @@ static 
int do_chunk_alloc(struct btrfs_trans_handle *trans, + BUG_ON(!space_info); + + spin_lock(&space_info->lock); +- if (space_info->force_alloc) { ++ if (space_info->force_alloc) + force = 1; +- space_info->force_alloc = 0; +- } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = space_info->total_bytes - space_info->bytes_readonly; +- thresh = div_factor(thresh, 6); ++ thresh = div_factor(thresh, 8); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { +@@ -2950,7 +3416,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ +- if (flags & BTRFS_BLOCK_GROUP_DATA) { ++ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) +@@ -2958,8 +3424,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + } + + ret = btrfs_alloc_chunk(trans, extent_root, flags); ++ spin_lock(&space_info->lock); + if (ret) + space_info->full = 1; ++ space_info->force_alloc = 0; ++ spin_unlock(&space_info->lock); + out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +@@ -3008,10 +3477,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; ++ btrfs_set_block_group_used(&cache->item, old_val); ++ cache->reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; +- btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { +@@ -3056,127 +3527,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) + return bytenr; + } + +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin) ++/* ++ * this function must be called within transaction ++ */ ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num_bytes, int reserved) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache; + +- if (pin) +- set_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + num - 1, GFP_NOFS); +- +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); +- if (pin) { +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- cache->pinned += len; +- cache->space_info->bytes_pinned += len; +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- fs_info->total_pinned += len; +- } else { +- int unpin = 0; ++ cache = btrfs_lookup_block_group(fs_info, bytenr); ++ BUG_ON(!cache); + +- /* +- * in order to not race with the block group caching, we +- * only want to unpin the extent if we are cached. If +- * we aren't cached, we want to start async caching this +- * block group so we can free the extent the next time +- * around. 
+- */ +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- unpin = (cache->cached == BTRFS_CACHE_FINISHED); +- if (likely(unpin)) { +- cache->pinned -= len; +- cache->space_info->bytes_pinned -= len; +- fs_info->total_pinned -= len; +- } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned += num_bytes; ++ cache->space_info->bytes_pinned += num_bytes; ++ if (reserved) { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; ++ } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + +- if (likely(unpin)) +- clear_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + len -1, +- GFP_NOFS); +- else +- cache_block_group(cache); ++ btrfs_put_block_group(cache); + +- if (unpin) +- btrfs_add_free_space(cache, bytenr, len); +- } +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; ++ set_extent_dirty(fs_info->pinned_extents, ++ bytenr, bytenr + num_bytes - 1, GFP_NOFS); ++ return 0; ++} ++ ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve) ++{ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ if (reserve) { ++ cache->reserved += num_bytes; ++ cache->space_info->bytes_reserved += num_bytes; ++ } else { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + return 0; + } + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve) ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_caching_control *next; ++ struct btrfs_caching_control *caching_ctl; ++ struct btrfs_block_group_cache *cache; + +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); ++ down_write(&fs_info->extent_commit_sem); + +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- if (reserve) { +- cache->reserved += len; +- cache->space_info->bytes_reserved += len; ++ list_for_each_entry_safe(caching_ctl, next, ++ &fs_info->caching_block_groups, list) { ++ cache = caching_ctl->block_group; ++ if (block_group_cache_done(cache)) { ++ cache->last_byte_to_unpin = (u64)-1; ++ list_del_init(&caching_ctl->list); ++ put_caching_control(caching_ctl); + } else { +- cache->reserved -= len; +- cache->space_info->bytes_reserved -= len; ++ cache->last_byte_to_unpin = caching_ctl->progress; + } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; + } ++ ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ fs_info->pinned_extents = &fs_info->freed_extents[1]; ++ else ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; ++ ++ up_write(&fs_info->extent_commit_sem); + return 0; + } + +-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) ++static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) + { +- u64 last = 0; +- u64 start; +- u64 end; +- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; +- int ret; ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache = 
NULL; ++ u64 len; + +- while (1) { +- ret = find_first_extent_bit(pinned_extents, last, +- &start, &end, EXTENT_DIRTY); +- if (ret) +- break; ++ while (start <= end) { ++ if (!cache || ++ start >= cache->key.objectid + cache->key.offset) { ++ if (cache) ++ btrfs_put_block_group(cache); ++ cache = btrfs_lookup_block_group(fs_info, start); ++ BUG_ON(!cache); ++ } + +- set_extent_dirty(copy, start, end, GFP_NOFS); +- last = end + 1; ++ len = cache->key.objectid + cache->key.offset - start; ++ len = min(len, end + 1 - start); ++ ++ if (start < cache->last_byte_to_unpin) { ++ len = min(len, cache->last_byte_to_unpin - start); ++ btrfs_add_free_space(cache, start, len); ++ } ++ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned -= len; ++ cache->space_info->bytes_pinned -= len; ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); ++ ++ start += len; + } ++ ++ if (cache) ++ btrfs_put_block_group(cache); + return 0; + } + + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin) ++ struct btrfs_root *root) + { ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ unpin = &fs_info->freed_extents[1]; ++ else ++ unpin = &fs_info->freed_extents[0]; ++ + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); +@@ -3185,10 +3665,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + + ret = btrfs_discard_extent(root, start, end + 1 - start); + +- /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); +- ++ unpin_extent_range(root, start, end); + cond_resched(); + } + +@@ -3198,7 +3676,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- u64 bytenr, u64 num_bytes, int is_data, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, + struct extent_buffer **must_clean) + { + int err = 0; +@@ -3207,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + if (is_data) + goto pinit; + ++ /* ++ * discard is sloooow, and so triggering discards on ++ * individual btree blocks isn't a good plan. Just ++ * pin everything in discard mode. 
++ */ ++ if (btrfs_test_opt(root, DISCARD)) ++ goto pinit; ++ + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; +@@ -3230,15 +3717,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + } + free_extent_buffer(buf); + pinit: +- btrfs_set_path_blocking(path); ++ if (path) ++ btrfs_set_path_blocking(path); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); ++ btrfs_pin_extent(root, bytenr, num_bytes, reserved); + + BUG_ON(err < 0); + return 0; + } + +- + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -3412,7 +3899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + } + + ret = pin_down_bytes(trans, root, path, bytenr, +- num_bytes, is_data, &must_clean); ++ num_bytes, is_data, 0, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); +@@ -3543,8 +4030,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); +- update_reserved_extents(root, bytenr, num_bytes, 0); ++ btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, +@@ -3584,19 +4070,33 @@ static noinline int + wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) + { ++ struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + +- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); +- +- if (block_group_cache_done(cache)) { +- finish_wait(&cache->caching_q, &wait); ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) + return 0; +- } +- schedule(); +- finish_wait(&cache->caching_q, &wait); + +- wait_event(cache->caching_q, block_group_cache_done(cache) || ++ wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); ++ ++ put_caching_control(caching_ctl); ++ return 0; ++} ++ ++static noinline int ++wait_block_group_cache_done(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *caching_ctl; ++ DEFINE_WAIT(wait); ++ ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) ++ return 0; ++ ++ wait_event(caching_ctl->wait, block_group_cache_done(cache)); ++ ++ put_caching_control(caching_ctl); + return 0; + } + +@@ -3634,6 +4134,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, + int last_ptr_loop = 0; + int loop = 0; + bool found_uncached_bg = false; ++ bool failed_cluster_refill = false; ++ bool failed_alloc = false; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); +@@ -3731,7 +4233,16 @@ have_block_group: + if (unlikely(block_group->ro)) + goto loop; + +- if (last_ptr) { ++ /* ++ * Ok we want to try and use the cluster allocator, so lets look ++ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will ++ * have tried the cluster allocator plenty of times at this ++ * point and not have found anything, so we are likely way too ++ * fragmented for the clustering stuff to find anything, so lets ++ * just skip it and let the allocator find whatever block it can ++ * find ++ */ ++ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + /* + * the refill lock keeps out other + * people trying to start a new cluster +@@ -3806,9 +4317,11 @@ 
refill_cluster: + spin_unlock(&last_ptr->refill_lock); + goto checks; + } +- } else if (!cached && loop > LOOP_CACHING_NOWAIT) { ++ } else if (!cached && loop > LOOP_CACHING_NOWAIT ++ && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + ++ failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; +@@ -3820,25 +4333,30 @@ refill_cluster: + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ +- if (loop < LOOP_NO_EMPTY_SIZE) { +- btrfs_return_cluster_to_free_space(NULL, +- last_ptr); +- spin_unlock(&last_ptr->refill_lock); +- goto loop; +- } ++ btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); ++ goto loop; + } + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); +- if (!offset && (cached || (!cached && +- loop == LOOP_CACHING_NOWAIT))) { +- goto loop; +- } else if (!offset && (!cached && +- loop > LOOP_CACHING_NOWAIT)) { ++ /* ++ * If we didn't find a chunk, and we haven't failed on this ++ * block group before, and this block group is in the middle of ++ * caching and we are ok with waiting, then go ahead and wait ++ * for progress to be made, and set failed_alloc to true. ++ * ++ * If failed_alloc is true then we've already waited on this ++ * block group once and should move on to the next block group. ++ */ ++ if (!offset && !failed_alloc && !cached && ++ loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, +- num_bytes + empty_size); ++ num_bytes + empty_size); ++ failed_alloc = true; + goto have_block_group; ++ } else if (!offset) { ++ goto loop; + } + checks: + search_start = stripe_align(root, offset); +@@ -3880,9 +4398,13 @@ checks: + search_start - offset); + BUG_ON(offset > search_start); + ++ update_reserved_extents(block_group, num_bytes, 1); ++ + /* we are all good, lets return */ + break; + loop: ++ failed_cluster_refill = false; ++ failed_alloc = false; + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); +@@ -3940,21 +4462,32 @@ loop: + return ret; + } + +-static void dump_space_info(struct btrfs_space_info *info, u64 bytes) ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups) + { + struct btrfs_block_group_cache *cache; + ++ spin_lock(&info->lock); + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - +- info->bytes_pinned - info->bytes_reserved), ++ info->bytes_pinned - info->bytes_reserved - ++ info->bytes_super), + (info->full) ? 
"" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," +- " may_use=%llu, used=%llu\n", ++ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" ++ "\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, +- (unsigned long long)info->bytes_used); ++ (unsigned long long)info->bytes_used, ++ (unsigned long long)info->bytes_root, ++ (unsigned long long)info->bytes_super, ++ (unsigned long long)info->bytes_reserved); ++ spin_unlock(&info->lock); ++ ++ if (!dump_block_groups) ++ return; + + down_read(&info->groups_sem); + list_for_each_entry(cache, &info->block_groups, list) { +@@ -3972,12 +4505,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) + up_read(&info->groups_sem); + } + +-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) ++int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ u64 num_bytes, u64 min_alloc_size, ++ u64 empty_size, u64 hint_byte, ++ u64 search_end, struct btrfs_key *ins, ++ u64 data) + { + int ret; + u64 search_start = 0; +@@ -4022,7 +4555,7 @@ again: + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); +- dump_space_info(sinfo, num_bytes); ++ dump_space_info(sinfo, num_bytes, 1); + } + + return ret; +@@ -4043,25 +4576,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); ++ update_reserved_extents(cache, len, 0); + btrfs_put_block_group(cache); +- update_reserved_extents(root, start, len, 0); +- +- return ret; +-} +- +-int btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) +-{ +- int ret; +- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, +- empty_size, hint_byte, search_end, ins, +- data); +- if (!ret) +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + + return ret; + } +@@ -4222,15 +4738,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + { + int ret; + struct btrfs_block_group_cache *block_group; ++ struct btrfs_caching_control *caching_ctl; ++ u64 start = ins->objectid; ++ u64 num_bytes = ins->offset; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(block_group); +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ caching_ctl = get_caching_control(block_group); + +- ret = btrfs_remove_free_space(block_group, ins->objectid, +- ins->offset); +- BUG_ON(ret); ++ if (!caching_ctl) { ++ BUG_ON(!block_group_cache_done(block_group)); ++ ret = btrfs_remove_free_space(block_group, start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ mutex_lock(&caching_ctl->mutex); ++ ++ if (start >= caching_ctl->progress) { ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } else if (start + num_bytes <= caching_ctl->progress) { ++ ret = btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ num_bytes = caching_ctl->progress - start; ++ ret = 
btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ ++ start = caching_ctl->progress; ++ num_bytes = ins->objectid + ins->offset - ++ caching_ctl->progress; ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } ++ ++ mutex_unlock(&caching_ctl->mutex); ++ put_caching_control(caching_ctl); ++ } ++ ++ update_reserved_extents(block_group, ins->offset, 1); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); +@@ -4254,9 +4801,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + int ret; + u64 flags = 0; + +- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, +- empty_size, hint_byte, search_end, +- ins, 0); ++ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, ++ empty_size, hint_byte, search_end, ++ ins, 0); + if (ret) + return ret; + +@@ -4267,7 +4814,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + } else + BUG_ON(parent > 0); + +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); +@@ -4346,452 +4892,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + return buf; + } + +-#if 0 +-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, struct extent_buffer *leaf) +-{ +- u64 disk_bytenr; +- u64 num_bytes; +- struct btrfs_key key; +- struct btrfs_file_extent_item *fi; +- u32 nritems; +- int i; +- int ret; +- +- BUG_ON(!btrfs_is_leaf(leaf)); +- nritems = btrfs_header_nritems(leaf); +- +- for (i = 0; i < nritems; i++) { +- cond_resched(); +- btrfs_item_key_to_cpu(leaf, &key, i); +- +- /* only extents have references, skip everything else */ +- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) +- continue; +- +- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); +- +- /* inline extents live in the btree, they don't have refs */ +- if (btrfs_file_extent_type(leaf, fi) == +- BTRFS_FILE_EXTENT_INLINE) +- continue; +- +- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); +- +- /* holes don't have refs */ +- if (disk_bytenr == 0) +- continue; +- +- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); +- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, +- leaf->start, 0, key.objectid, 0); +- BUG_ON(ret); +- } +- return 0; +-} +- +-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_leaf_ref *ref) +-{ +- int i; +- int ret; +- struct btrfs_extent_info *info; +- struct refsort *sorted; +- +- if (ref->nritems == 0) +- return 0; +- +- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); +- for (i = 0; i < ref->nritems; i++) { +- sorted[i].bytenr = ref->extents[i].bytenr; +- sorted[i].slot = i; +- } +- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); +- +- /* +- * the items in the ref were sorted when the ref was inserted +- * into the ref cache, so this is already in order +- */ +- for (i = 0; i < ref->nritems; i++) { +- info = ref->extents + sorted[i].slot; +- ret = btrfs_free_extent(trans, root, info->bytenr, +- info->num_bytes, ref->bytenr, +- ref->owner, ref->generation, +- info->objectid, 0); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- BUG_ON(ret); +- info++; +- } +- +- kfree(sorted); +- return 0; 
+-} +- +- +-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, u64 start, +- u64 len, u32 *refs) +-{ +- int ret; +- +- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); +- BUG_ON(ret); +- +-#if 0 /* some debugging code in case we see problems here */ +- /* if the refs count is one, it won't get increased again. But +- * if the ref count is > 1, someone may be decreasing it at +- * the same time we are. +- */ +- if (*refs != 1) { +- struct extent_buffer *eb = NULL; +- eb = btrfs_find_create_tree_block(root, start, len); +- if (eb) +- btrfs_tree_lock(eb); +- +- mutex_lock(&root->fs_info->alloc_mutex); +- ret = lookup_extent_ref(NULL, root, start, len, refs); +- BUG_ON(ret); +- mutex_unlock(&root->fs_info->alloc_mutex); +- +- if (eb) { +- btrfs_tree_unlock(eb); +- free_extent_buffer(eb); +- } +- if (*refs == 1) { +- printk(KERN_ERR "btrfs block %llu went down to one " +- "during drop_snap\n", (unsigned long long)start); +- } +- +- } +-#endif +- +- cond_resched(); +- return ret; +-} ++struct walk_control { ++ u64 refs[BTRFS_MAX_LEVEL]; ++ u64 flags[BTRFS_MAX_LEVEL]; ++ struct btrfs_key update_progress; ++ int stage; ++ int level; ++ int shared_level; ++ int update_ref; ++ int keep_locks; ++ int reada_slot; ++ int reada_count; ++}; + ++#define DROP_REFERENCE 1 ++#define UPDATE_BACKREF 2 + +-/* +- * this is used while deleting old snapshots, and it drops the refs +- * on a whole subtree starting from a level 1 node. +- * +- * The idea is to sort all the leaf pointers, and then drop the +- * ref on all the leaves in order. Most of the time the leaves +- * will have ref cache entries, so no leaf IOs will be required to +- * find the extents they have references on. +- * +- * For each leaf, any references it has are also dropped in order +- * +- * This ends up dropping the references in something close to optimal +- * order for reading and modifying the extent allocation tree. 
+- */ +-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path) ++static noinline void reada_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct walk_control *wc, ++ struct btrfs_path *path) + { + u64 bytenr; +- u64 root_owner; +- u64 root_gen; +- struct extent_buffer *eb = path->nodes[1]; +- struct extent_buffer *leaf; +- struct btrfs_leaf_ref *ref; +- struct refsort *sorted = NULL; +- int nritems = btrfs_header_nritems(eb); ++ u64 generation; ++ u64 refs; ++ u64 flags; ++ u64 last = 0; ++ u32 nritems; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *eb; + int ret; +- int i; +- int refi = 0; +- int slot = path->slots[1]; +- u32 blocksize = btrfs_level_size(root, 0); +- u32 refs; +- +- if (nritems == 0) +- goto out; +- +- root_owner = btrfs_header_owner(eb); +- root_gen = btrfs_header_generation(eb); +- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); ++ int slot; ++ int nread = 0; + +- /* +- * step one, sort all the leaf pointers so we don't scribble +- * randomly into the extent allocation tree +- */ +- for (i = slot; i < nritems; i++) { +- sorted[refi].bytenr = btrfs_node_blockptr(eb, i); +- sorted[refi].slot = i; +- refi++; ++ if (path->slots[wc->level] < wc->reada_slot) { ++ wc->reada_count = wc->reada_count * 2 / 3; ++ wc->reada_count = max(wc->reada_count, 2); ++ } else { ++ wc->reada_count = wc->reada_count * 3 / 2; ++ wc->reada_count = min_t(int, wc->reada_count, ++ BTRFS_NODEPTRS_PER_BLOCK(root)); + } + +- /* +- * nritems won't be zero, but if we're picking up drop_snapshot +- * after a crash, slot might be > 0, so double check things +- * just in case. +- */ +- if (refi == 0) +- goto out; ++ eb = path->nodes[wc->level]; ++ nritems = btrfs_header_nritems(eb); ++ blocksize = btrfs_level_size(root, wc->level - 1); + +- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); ++ for (slot = path->slots[wc->level]; slot < nritems; slot++) { ++ if (nread >= wc->reada_count) ++ break; + +- /* +- * the first loop frees everything the leaves point to +- */ +- for (i = 0; i < refi; i++) { +- u64 ptr_gen; ++ cond_resched(); ++ bytenr = btrfs_node_blockptr(eb, slot); ++ generation = btrfs_node_ptr_generation(eb, slot); + +- bytenr = sorted[i].bytenr; ++ if (slot == path->slots[wc->level]) ++ goto reada; + +- /* +- * check the reference count on this leaf. If it is > 1 +- * we just decrement it below and don't update any +- * of the refs the leaf points to. +- */ +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- if (refs != 1) ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) + continue; + +- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); +- +- /* +- * the leaf only had one reference, which means the +- * only thing pointing to this leaf is the snapshot +- * we're deleting. It isn't possible for the reference +- * count to increase again later +- * +- * The reference cache is checked for the leaf, +- * and if found we'll be able to drop any refs held by +- * the leaf without needing to read it in. 
+- */ +- ref = btrfs_lookup_leaf_ref(root, bytenr); +- if (ref && ref->generation != ptr_gen) { +- btrfs_free_leaf_ref(root, ref); +- ref = NULL; +- } +- if (ref) { +- ret = cache_drop_leaf_ref(trans, root, ref); +- BUG_ON(ret); +- btrfs_remove_leaf_ref(root, ref); +- btrfs_free_leaf_ref(root, ref); +- } else { +- /* +- * the leaf wasn't in the reference cache, so +- * we have to read it. +- */ +- leaf = read_tree_block(root, bytenr, blocksize, +- ptr_gen); +- ret = btrfs_drop_leaf_ref(trans, root, leaf); +- BUG_ON(ret); +- free_extent_buffer(leaf); +- } +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +- +- /* +- * run through the loop again to free the refs on the leaves. +- * This is faster than doing it in the loop above because +- * the leaves are likely to be clustered together. We end up +- * working in nice chunks on the extent allocation tree. +- */ +- for (i = 0; i < refi; i++) { +- bytenr = sorted[i].bytenr; +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, eb->start, +- root_owner, root_gen, 0, 1); ++ /* We don't lock the tree block, it's OK to be racy here */ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &refs, &flags); + BUG_ON(ret); ++ BUG_ON(refs == 0); + +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +-out: +- kfree(sorted); +- +- /* +- * update the path to show we've processed the entire level 1 +- * node. This will get saved into the root's drop_snapshot_progress +- * field so these drops are not repeated again if this transaction +- * commits. +- */ +- path->slots[1] = nritems; +- return 0; +-} +- +-/* +- * helper function for drop_snapshot, this walks down the tree dropping ref +- * counts as it goes. +- */ +-static noinline int walk_down_tree(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path, int *level) +-{ +- u64 root_owner; +- u64 root_gen; +- u64 bytenr; +- u64 ptr_gen; +- struct extent_buffer *next; +- struct extent_buffer *cur; +- struct extent_buffer *parent; +- u32 blocksize; +- int ret; +- u32 refs; +- +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, +- path->nodes[*level]->len, &refs); +- BUG_ON(ret); +- if (refs > 1) +- goto out; +- +- /* +- * walk down to the last node level and free all the leaves +- */ +- while (*level >= 0) { +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- cur = path->nodes[*level]; +- +- if (btrfs_header_level(cur) != *level) +- WARN_ON(1); +- +- if (path->slots[*level] >= +- btrfs_header_nritems(cur)) +- break; ++ if (wc->stage == DROP_REFERENCE) { ++ if (refs == 1) ++ goto reada; + +- /* the new code goes down to level 1 and does all the +- * leaves pointed to that node in bulk. So, this check +- * for level 0 will always be false. +- * +- * But, the disk format allows the drop_snapshot_progress +- * field in the root to leave things in a state where +- * a leaf will need cleaning up here. If someone crashes +- * with the old code and then boots with the new code, +- * we might find a leaf here. 
+- */ +- if (*level == 0) { +- ret = btrfs_drop_leaf_ref(trans, root, cur); +- BUG_ON(ret); +- break; ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ continue; ++ btrfs_node_key_to_cpu(eb, &key, slot); ++ ret = btrfs_comp_cpu_keys(&key, ++ &wc->update_progress); ++ if (ret < 0) ++ continue; ++ } else { ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; + } +- +- /* +- * once we get to level one, process the whole node +- * at once, including everything below it. +- */ +- if (*level == 1) { +- ret = drop_level_one_refs(trans, root, path); +- BUG_ON(ret); ++reada: ++ ret = readahead_tree_block(root, bytenr, blocksize, ++ generation); ++ if (ret) + break; +- } +- +- bytenr = btrfs_node_blockptr(cur, path->slots[*level]); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); +- blocksize = btrfs_level_size(root, *level - 1); +- +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- +- /* +- * if there is more than one reference, we don't need +- * to read that node to drop any references it has. We +- * just drop the ref we hold on that node and move on to the +- * next slot in this level. +- */ +- if (refs != 1) { +- parent = path->nodes[*level]; +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- path->slots[*level]++; +- +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, parent->start, +- root_owner, root_gen, +- *level - 1, 1); +- BUG_ON(ret); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- continue; +- } +- +- /* +- * we need to keep freeing things in the next level down. +- * read the block and loop around to process it +- */ +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- WARN_ON(*level <= 0); +- if (path->nodes[*level-1]) +- free_extent_buffer(path->nodes[*level-1]); +- path->nodes[*level-1] = next; +- *level = btrfs_header_level(next); +- path->slots[*level] = 0; +- cond_resched(); ++ last = bytenr + blocksize; ++ nread++; + } +-out: +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- +- if (path->nodes[*level] == root->node) { +- parent = path->nodes[*level]; +- bytenr = path->nodes[*level]->start; +- } else { +- parent = path->nodes[*level + 1]; +- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); +- } +- +- blocksize = btrfs_level_size(root, *level); +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- +- /* +- * cleanup and free the reference on the last node +- * we processed +- */ +- ret = btrfs_free_extent(trans, root, bytenr, blocksize, +- parent->start, root_owner, root_gen, +- *level, 1); +- free_extent_buffer(path->nodes[*level]); +- path->nodes[*level] = NULL; +- +- *level += 1; +- BUG_ON(ret); +- +- cond_resched(); +- return 0; ++ wc->reada_slot = slot; + } +-#endif +- +-struct walk_control { +- u64 refs[BTRFS_MAX_LEVEL]; +- u64 flags[BTRFS_MAX_LEVEL]; +- struct btrfs_key update_progress; +- int stage; +- int level; +- int shared_level; +- int update_ref; +- int keep_locks; +-}; +- +-#define DROP_REFERENCE 1 +-#define UPDATE_BACKREF 2 + + /* + * hepler to process tree block while walking down the tree. + * +- * when wc->stage == DROP_REFERENCE, this function checks +- * reference count of the block. 
if the block is shared and +- * we need update back refs for the subtree rooted at the +- * block, this function changes wc->stage to UPDATE_BACKREF +- * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * +@@ -4800,11 +5002,10 @@ struct walk_control { + static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- struct walk_control *wc) ++ struct walk_control *wc, int lookup_info) + { + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; +- struct btrfs_key key; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + +@@ -4816,8 +5017,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ +- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || +- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { ++ if (lookup_info && ++ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || ++ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, +@@ -4827,21 +5029,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + BUG_ON(wc->refs[level] == 0); + } + +- if (wc->stage == DROP_REFERENCE && +- wc->update_ref && wc->refs[level] > 1) { +- BUG_ON(eb == root->node); +- BUG_ON(path->slots[level] > 0); +- if (level == 0) +- btrfs_item_key_to_cpu(eb, &key, path->slots[level]); +- else +- btrfs_node_key_to_cpu(eb, &key, path->slots[level]); +- if (btrfs_header_owner(eb) == root->root_key.objectid && +- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { +- wc->stage = UPDATE_BACKREF; +- wc->shared_level = level; +- } +- } +- + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; +@@ -4878,6 +5065,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + } + + /* ++ * hepler to process tree block pointer. ++ * ++ * when wc->stage == DROP_REFERENCE, this function checks ++ * reference count of the block pointed to. if the block ++ * is shared and we need update back refs for the subtree ++ * rooted at the block, this function changes wc->stage to ++ * UPDATE_BACKREF. if the block is shared and there is no ++ * need to update back, this function drops the reference ++ * to the block. ++ * ++ * NOTE: return value 1 means we should stop walking down. 
++ */ ++static noinline int do_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct walk_control *wc, int *lookup_info) ++{ ++ u64 bytenr; ++ u64 generation; ++ u64 parent; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *next; ++ int level = wc->level; ++ int reada = 0; ++ int ret = 0; ++ ++ generation = btrfs_node_ptr_generation(path->nodes[level], ++ path->slots[level]); ++ /* ++ * if the lower level block was created before the snapshot ++ * was created, we know there is no need to update back refs ++ * for the subtree ++ */ ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) { ++ *lookup_info = 1; ++ return 1; ++ } ++ ++ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); ++ blocksize = btrfs_level_size(root, level - 1); ++ ++ next = btrfs_find_tree_block(root, bytenr, blocksize); ++ if (!next) { ++ next = btrfs_find_create_tree_block(root, bytenr, blocksize); ++ reada = 1; ++ } ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &wc->refs[level - 1], ++ &wc->flags[level - 1]); ++ BUG_ON(ret); ++ BUG_ON(wc->refs[level - 1] == 0); ++ *lookup_info = 0; ++ ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->refs[level - 1] > 1) { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ goto skip; ++ ++ btrfs_node_key_to_cpu(path->nodes[level], &key, ++ path->slots[level]); ++ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); ++ if (ret < 0) ++ goto skip; ++ ++ wc->stage = UPDATE_BACKREF; ++ wc->shared_level = level - 1; ++ } ++ } else { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ } ++ ++ if (!btrfs_buffer_uptodate(next, generation)) { ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ next = NULL; ++ *lookup_info = 1; ++ } ++ ++ if (!next) { ++ if (reada && level == 1) ++ reada_walk_down(trans, root, wc, path); ++ next = read_tree_block(root, bytenr, blocksize, generation); ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ } ++ ++ level--; ++ BUG_ON(level != btrfs_header_level(next)); ++ path->nodes[level] = next; ++ path->slots[level] = 0; ++ path->locks[level] = 1; ++ wc->level = level; ++ if (wc->level == 1) ++ wc->reada_slot = 0; ++ return 0; ++skip: ++ wc->refs[level - 1] = 0; ++ wc->flags[level - 1] = 0; ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ++ parent = path->nodes[level]->start; ++ } else { ++ BUG_ON(root->root_key.objectid != ++ btrfs_header_owner(path->nodes[level])); ++ parent = 0; ++ } ++ ++ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, ++ root->root_key.objectid, level - 1, 0); ++ BUG_ON(ret); ++ } ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ *lookup_info = 1; ++ return 1; ++} ++ ++/* + * hepler to process tree block while walking up the tree. 
+ * + * when wc->stage == DROP_REFERENCE, this function drops +@@ -4904,7 +5221,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + if (level < wc->shared_level) + goto out; + +- BUG_ON(wc->refs[level] <= 1); + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; +@@ -4935,8 +5251,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + path->locks[level] = 0; + return 1; + } +- } else { +- BUG_ON(level != 0); + } + } + +@@ -4989,39 +5303,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct walk_control *wc) + { +- struct extent_buffer *next; +- struct extent_buffer *cur; +- u64 bytenr; +- u64 ptr_gen; +- u32 blocksize; + int level = wc->level; ++ int lookup_info = 1; + int ret; + + while (level >= 0) { +- cur = path->nodes[level]; +- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); ++ if (path->slots[level] >= ++ btrfs_header_nritems(path->nodes[level])) ++ break; + +- ret = walk_down_proc(trans, root, path, wc); ++ ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + +- bytenr = btrfs_node_blockptr(cur, path->slots[level]); +- blocksize = btrfs_level_size(root, level - 1); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); +- +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- btrfs_tree_lock(next); +- btrfs_set_lock_blocking(next); +- +- level--; +- BUG_ON(level != btrfs_header_level(next)); +- path->nodes[level] = next; +- path->slots[level] = 0; +- path->locks[level] = 1; +- wc->level = level; ++ ret = do_walk_down(trans, root, path, wc, &lookup_info); ++ if (ret > 0) { ++ path->slots[level]++; ++ continue; ++ } ++ level = wc->level; + } + return 0; + } +@@ -5111,9 +5414,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + err = ret; + goto out; + } +- btrfs_node_key_to_cpu(path->nodes[level], &key, +- path->slots[level]); +- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); ++ WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this +@@ -5148,6 +5449,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + ret = walk_down_tree(trans, root, path, wc); +@@ -5200,9 +5502,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + +- free_extent_buffer(root->node); +- free_extent_buffer(root->commit_root); +- kfree(root); ++ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { ++ ret = btrfs_find_last_root(tree_root, root->root_key.objectid, ++ NULL, NULL); ++ BUG_ON(ret < 0); ++ if (ret > 0) { ++ ret = btrfs_del_orphan_item(trans, tree_root, ++ root->root_key.objectid); ++ BUG_ON(ret); ++ } ++ } ++ ++ if (root->in_radix) { ++ btrfs_free_fs_root(tree_root->fs_info, root); ++ } else { ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); ++ kfree(root); ++ } + out: + btrfs_end_transaction(trans, tree_root); + kfree(wc); +@@ -5254,6 +5571,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); +@@ -5396,9 +5714,9 @@ static noinline int 
relocate_data_extent(struct inode *reloc_inode, + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -6841,287 +7159,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + return 0; + } + +-#if 0 +-static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) +-{ +- struct btrfs_path *path; +- struct btrfs_inode_item *item; +- struct extent_buffer *leaf; +- int ret; +- +- path = btrfs_alloc_path(); +- if (!path) +- return -ENOMEM; +- +- path->leave_spinning = 1; +- ret = btrfs_insert_empty_inode(trans, root, path, objectid); +- if (ret) +- goto out; +- +- leaf = path->nodes[0]; +- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); +- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); +- btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); +- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); +- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); +- btrfs_mark_buffer_dirty(leaf); +- btrfs_release_path(root, path); +-out: +- btrfs_free_path(path); +- return ret; +-} +- +-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, +- struct btrfs_block_group_cache *group) ++/* ++ * checks to see if its even possible to relocate this block group. ++ * ++ * @return - -1 if it's not a good idea to relocate this block group, 0 if its ++ * ok to go ahead and try. ++ */ ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) + { +- struct inode *inode = NULL; +- struct btrfs_trans_handle *trans; +- struct btrfs_root *root; +- struct btrfs_key root_key; +- u64 objectid = BTRFS_FIRST_FREE_OBJECTID; +- int err = 0; ++ struct btrfs_block_group_cache *block_group; ++ struct btrfs_space_info *space_info; ++ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; ++ struct btrfs_device *device; ++ int full = 0; ++ int ret = 0; + +- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; +- root_key.type = BTRFS_ROOT_ITEM_KEY; +- root_key.offset = (u64)-1; +- root = btrfs_read_fs_root_no_name(fs_info, &root_key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + +- trans = btrfs_start_transaction(root, 1); +- BUG_ON(!trans); ++ /* odd, couldn't find the block group, leave it alone */ ++ if (!block_group) ++ return -1; + +- err = btrfs_find_free_objectid(trans, root, objectid, &objectid); +- if (err) ++ /* no bytes used, we're good */ ++ if (!btrfs_block_group_used(&block_group->item)) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); +- BUG_ON(err); +- +- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); +- if (inode->i_state & I_NEW) { +- BTRFS_I(inode)->root = root; +- BTRFS_I(inode)->location.objectid = objectid; +- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; +- BTRFS_I(inode)->location.offset = 0; +- btrfs_read_locked_inode(inode); +- unlock_new_inode(inode); +- BUG_ON(is_bad_inode(inode)); +- } else { +- BUG_ON(1); +- } +- BTRFS_I(inode)->index_cnt = group->key.objectid; +- +- err = 
btrfs_orphan_add(trans, inode); +-out: +- btrfs_end_transaction(trans, root); +- if (err) { +- if (inode) +- iput(inode); +- inode = ERR_PTR(err); +- } +- return inode; +-} +- +-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +-{ +- +- struct btrfs_ordered_sum *sums; +- struct btrfs_sector_sum *sector_sum; +- struct btrfs_ordered_extent *ordered; +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct list_head list; +- size_t offset; +- int ret; +- u64 disk_bytenr; +- +- INIT_LIST_HEAD(&list); +- +- ordered = btrfs_lookup_ordered_extent(inode, file_pos); +- BUG_ON(ordered->file_offset != file_pos || ordered->len != len); +- +- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; +- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, +- disk_bytenr + len - 1, &list); +- +- while (!list_empty(&list)) { +- sums = list_entry(list.next, struct btrfs_ordered_sum, list); +- list_del_init(&sums->list); +- +- sector_sum = sums->sums; +- sums->bytenr = ordered->start; ++ space_info = block_group->space_info; ++ spin_lock(&space_info->lock); + +- offset = 0; +- while (offset < sums->len) { +- sector_sum->bytenr += ordered->start - disk_bytenr; +- sector_sum++; +- offset += root->sectorsize; +- } ++ full = space_info->full; + +- btrfs_add_ordered_sum(inode, ordered, sums); ++ /* ++ * if this is the last block group we have in this space, we can't ++ * relocate it unless we're able to allocate a new chunk below. ++ * ++ * Otherwise, we need to make sure we have room in the space to handle ++ * all of the extents from this block group. If we can, we're good ++ */ ++ if ((space_info->total_bytes != block_group->key.offset) && ++ (space_info->bytes_used + space_info->bytes_reserved + ++ space_info->bytes_pinned + space_info->bytes_readonly + ++ btrfs_block_group_used(&block_group->item) < ++ space_info->total_bytes)) { ++ spin_unlock(&space_info->lock); ++ goto out; + } +- btrfs_put_ordered_extent(ordered); +- return 0; +-} +- +-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +-{ +- struct btrfs_trans_handle *trans; +- struct btrfs_path *path; +- struct btrfs_fs_info *info = root->fs_info; +- struct extent_buffer *leaf; +- struct inode *reloc_inode; +- struct btrfs_block_group_cache *block_group; +- struct btrfs_key key; +- u64 skipped; +- u64 cur_byte; +- u64 total_found; +- u32 nritems; +- int ret; +- int progress; +- int pass = 0; +- +- root = root->fs_info->extent_root; +- +- block_group = btrfs_lookup_block_group(info, group_start); +- BUG_ON(!block_group); +- +- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", +- (unsigned long long)block_group->key.objectid, +- (unsigned long long)block_group->flags); +- +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- +- reloc_inode = create_reloc_inode(info, block_group); +- BUG_ON(IS_ERR(reloc_inode)); +- +- __alloc_chunk_for_shrink(root, block_group, 1); +- set_block_group_readonly(block_group); +- +- btrfs_start_delalloc_inodes(info->tree_root); +- btrfs_wait_ordered_extents(info->tree_root, 0); +-again: +- skipped = 0; +- total_found = 0; +- progress = 0; +- key.objectid = block_group->key.objectid; +- key.offset = 0; +- key.type = 0; +- cur_byte = key.objectid; +- +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ spin_unlock(&space_info->lock); + +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(info->tree_root); +- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); +- 
mutex_unlock(&root->fs_info->cleaner_mutex); ++ /* ++ * ok we don't have enough space, but maybe we have free space on our ++ * devices to allocate new chunks for relocation, so loop through our ++ * alloc devices and guess if we have enough space. However, if we ++ * were marked as full, then we know there aren't enough chunks, and we ++ * can just return. ++ */ ++ ret = -1; ++ if (full) ++ goto out; + +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ mutex_lock(&root->fs_info->chunk_mutex); ++ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { ++ u64 min_free = btrfs_block_group_used(&block_group->item); ++ u64 dev_offset, max_avail; + +- while (1) { +- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) +- goto out; +-next: +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- if (path->slots[0] >= nritems) { +- ret = btrfs_next_leaf(root, path); +- if (ret < 0) +- goto out; +- if (ret == 1) { +- ret = 0; ++ /* ++ * check to make sure we can actually find a chunk with enough ++ * space to fit our block group in. ++ */ ++ if (device->total_bytes > device->bytes_used + min_free) { ++ ret = find_free_dev_extent(NULL, device, min_free, ++ &dev_offset, &max_avail); ++ if (!ret) + break; +- } +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- } +- +- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +- +- if (key.objectid >= block_group->key.objectid + +- block_group->key.offset) +- break; +- +- if (progress && need_resched()) { +- btrfs_release_path(root, path); +- cond_resched(); +- progress = 0; +- continue; +- } +- progress = 1; +- +- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || +- key.objectid + key.offset <= cur_byte) { +- path->slots[0]++; +- goto next; ++ ret = -1; + } +- +- total_found++; +- cur_byte = key.objectid + key.offset; +- btrfs_release_path(root, path); +- +- __alloc_chunk_for_shrink(root, block_group, 0); +- ret = relocate_one_extent(root, path, &key, block_group, +- reloc_inode, pass); +- BUG_ON(ret < 0); +- if (ret > 0) +- skipped++; +- +- key.objectid = cur_byte; +- key.type = 0; +- key.offset = 0; + } +- +- btrfs_release_path(root, path); +- +- if (pass == 0) { +- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); +- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); +- } +- +- if (total_found > 0) { +- printk(KERN_INFO "btrfs found %llu extents in pass %d\n", +- (unsigned long long)total_found, pass); +- pass++; +- if (total_found == skipped && pass > 2) { +- iput(reloc_inode); +- reloc_inode = create_reloc_inode(info, block_group); +- pass = 0; +- } +- goto again; +- } +- +- /* delete reloc_inode */ +- iput(reloc_inode); +- +- /* unpin extents in this range */ +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); +- +- spin_lock(&block_group->lock); +- WARN_ON(block_group->pinned > 0); +- WARN_ON(block_group->reserved > 0); +- WARN_ON(btrfs_block_group_used(&block_group->item) > 0); +- spin_unlock(&block_group->lock); +- btrfs_put_block_group(block_group); +- ret = 0; ++ mutex_unlock(&root->fs_info->chunk_mutex); + out: +- btrfs_free_path(path); ++ btrfs_put_block_group(block_group); + return ret; + } +-#endif + + static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +@@ -7164,8 +7281,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + { + struct btrfs_block_group_cache *block_group; + struct 
btrfs_space_info *space_info; ++ struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + ++ down_write(&info->extent_commit_sem); ++ while (!list_empty(&info->caching_block_groups)) { ++ caching_ctl = list_entry(info->caching_block_groups.next, ++ struct btrfs_caching_control, list); ++ list_del(&caching_ctl->list); ++ put_caching_control(caching_ctl); ++ } ++ up_write(&info->extent_commit_sem); ++ + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, +@@ -7179,8 +7306,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +@@ -7250,7 +7376,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); + cache->fs_info = info; +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7272,8 +7397,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + +- remove_sb_from_cache(root, cache); +- + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't +@@ -7282,13 +7405,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; ++ free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); ++ free_excluded_extents(root, cache); + } + + ret = update_space_info(info, cache->flags, found_key.offset, +@@ -7296,6 +7425,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) + &space_info); + BUG_ON(ret); + cache->space_info = space_info; ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups); + up_write(&space_info->groups_sem); +@@ -7345,7 +7478,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7354,15 +7486,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; +- remove_sb_from_cache(root, cache); ++ exclude_super_stripes(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ++ free_excluded_extents(root, cache); ++ + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + 
&cache->space_info); + BUG_ON(ret); ++ ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); +@@ -7428,8 +7568,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 6826018..96577e8 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + return NULL; + } + ++static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, ++ struct extent_state *other) ++{ ++ if (tree->ops && tree->ops->merge_extent_hook) ++ tree->ops->merge_extent_hook(tree->mapping->host, new, ++ other); ++} ++ + /* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single +@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); +@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); ++ state = NULL; + } + } ++ + return 0; + } + +-static void set_state_cb(struct extent_io_tree *tree, ++static int set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { + if (tree->ops && tree->ops->set_bit_hook) { +- tree->ops->set_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); ++ return tree->ops->set_bit_hook(tree->mapping->host, ++ state->start, state->end, ++ state->state, bits); + } ++ ++ return 0; + } + + static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { +- if (tree->ops && tree->ops->clear_bit_hook) { +- tree->ops->clear_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); +- } ++ if (tree->ops && tree->ops->clear_bit_hook) ++ tree->ops->clear_bit_hook(tree->mapping->host, state, bits); + } + + /* +@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, + int bits) + { + struct rb_node *node; ++ int ret; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", +@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree, + (unsigned long long)start); + WARN_ON(1); + } ++ state->start = start; ++ state->end = end; ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; +- set_state_cb(tree, state, bits); + state->state |= bits; +- state->start = start; +- state->end = end; + node = 
tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; +@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, + return 0; + } + ++static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, ++ u64 split) ++{ ++ if (tree->ops && tree->ops->split_extent_hook) ++ return tree->ops->split_extent_hook(tree->mapping->host, ++ orig, split); ++ return 0; ++} ++ + /* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an +@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) + { + struct rb_node *node; ++ ++ split_cb(tree, orig, split); ++ + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; +@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) + { +- int ret = state->state & bits; ++ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; ++ int ret = state->state & bits_to_clear; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; +@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); +- state->state &= ~bits; ++ state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { +@@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree, + * bits were already set, or zero if none of the bits were already set. + */ + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask) ++ int bits, int wake, int delete, ++ struct extent_state **cached_state, ++ gfp_t mask) + { + struct extent_state *state; ++ struct extent_state *cached; + struct extent_state *prealloc = NULL; ++ struct rb_node *next_node; + struct rb_node *node; + u64 last_end; + int err; +@@ -488,6 +522,17 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state) { ++ cached = *cached_state; ++ *cached_state = NULL; ++ cached_state = NULL; ++ if (cached && cached->tree && cached->start == start) { ++ atomic_dec(&cached->refs); ++ state = cached; ++ goto hit_next; ++ } ++ free_extent_state(cached); ++ } + /* + * this search will find the extents that end after + * our range starts +@@ -496,6 +541,7 @@ again: + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); +@@ -526,13 +572,11 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set |= clear_state_bit(tree, state, bits, +- wake, delete); ++ set |= clear_state_bit(tree, state, bits, wake, ++ delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -547,19 +591,30 @@ again: + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); +- + if (wake) + wake_up(&state->wq); +- set |= clear_state_bit(tree, prealloc, bits, +- wake, delete); ++ ++ set |= clear_state_bit(tree, prealloc, bits, wake, delete); ++ + prealloc = NULL; + goto out; + } + ++ if (state->end < end && prealloc && !need_resched()) ++ next_node = rb_next(&state->rb_node); ++ else ++ next_node = NULL; ++ + set |= 
clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; ++ if (start <= end && next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } + goto search_again; + + out: +@@ -641,40 +696,59 @@ out: + return 0; + } + +-static void set_state_bits(struct extent_io_tree *tree, ++static int set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) + { ++ int ret; ++ ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } +- set_state_cb(tree, state, bits); + state->state |= bits; ++ ++ return 0; ++} ++ ++static void cache_state(struct extent_state *state, ++ struct extent_state **cached_ptr) ++{ ++ if (cached_ptr && !(*cached_ptr)) { ++ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { ++ *cached_ptr = state; ++ atomic_inc(&state->refs); ++ } ++ } + } + + /* +- * set some bits on a range in the tree. This may require allocations +- * or sleeping, so the gfp mask is used to indicate what is allowed. ++ * set some bits on a range in the tree. This may require allocations or ++ * sleeping, so the gfp mask is used to indicate what is allowed. + * +- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the +- * range already has the desired bits set. The start of the existing +- * range is returned in failed_start in this case. ++ * If any of the exclusive bits are set, this will fail with -EEXIST if some ++ * part of the range already has the desired bits set. The start of the ++ * existing range is returned in failed_start in this case. + * +- * [start, end] is inclusive +- * This takes the tree lock. ++ * [start, end] is inclusive This takes the tree lock. + */ ++ + static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int exclusive, u64 *failed_start, ++ int bits, int exclusive_bits, u64 *failed_start, ++ struct extent_state **cached_state, + gfp_t mask) + { + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; +- int set; + u64 last_start; + u64 last_end; ++ + again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); +@@ -683,6 +757,13 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state && *cached_state) { ++ state = *cached_state; ++ if (state->start == start && state->tree) { ++ node = &state->rb_node; ++ goto hit_next; ++ } ++ } + /* + * this search will find all the extents that end after + * our range starts. 
+@@ -694,8 +775,8 @@ again: + BUG_ON(err == -EEXIST); + goto out; + } +- + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + last_start = state->start; + last_end = state->end; + +@@ -706,17 +787,32 @@ again: + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { +- set = state->state & bits; +- if (set && exclusive) { ++ struct rb_node *next_node; ++ if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } +- set_state_bits(tree, state, bits); ++ ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; ++ + start = last_end + 1; ++ if (start < end && prealloc && !need_resched()) { ++ next_node = rb_next(node); ++ if (next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } ++ } + goto search_again; + } + +@@ -737,8 +833,7 @@ again: + * desired bit on it. + */ + if (state->start < start) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -749,13 +844,14 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set_state_bits(tree, state, bits); ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -774,10 +870,13 @@ again: + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); +- prealloc = NULL; + BUG_ON(err == -EEXIST); +- if (err) ++ if (err) { ++ prealloc = NULL; + goto out; ++ } ++ cache_state(prealloc, cached_state); ++ prealloc = NULL; + start = this_end + 1; + goto search_again; + } +@@ -788,8 +887,7 @@ again: + * on the first half + */ + if (state->start <= end && state->end > end) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -797,7 +895,12 @@ again: + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + +- set_state_bits(tree, prealloc, bits); ++ err = set_state_bits(tree, prealloc, bits); ++ if (err) { ++ prealloc = NULL; ++ goto out; ++ } ++ cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; +@@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, +- mask); +-} +- +-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); ++ NULL, mask); + } + + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { + return set_extent_bit(tree, start, end, bits, 0, NULL, +- mask); ++ NULL, mask); + } + + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, bits, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + } + + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, +- EXTENT_DELALLOC | 
EXTENT_DIRTY, +- 0, NULL, mask); ++ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, ++ 0, NULL, NULL, mask); + } + + int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return clear_extent_bit(tree, start, end, +- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +-} +- +-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); ++ EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, 0, 0, ++ NULL, mask); + } + + int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, ++ NULL, mask); + } + + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +-} +- +-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, +- 0, NULL, mask); +-} +- +-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, +- u64 end, gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, ++ NULL, mask); + } + + int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +@@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. 
+ */ +-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached_state, gfp_t mask) + { + int err; + u64 failed_start; + while (1) { +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, ++ EXTENT_LOCKED, &failed_start, ++ cached_state, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; +@@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) + return err; + } + ++int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++{ ++ return lock_extent_bits(tree, start, end, 0, NULL, mask); ++} ++ + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + int err; + u64 failed_start; + +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, ++ &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, +- EXTENT_LOCKED, 1, 0, mask); ++ EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; + } + ++int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, ++ struct extent_state **cached, gfp_t mask) ++{ ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, ++ mask); ++} ++ + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, ++ mask); + } + + /* +@@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, + u64 delalloc_start; + u64 delalloc_end; + u64 found; ++ struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +@@ -1269,6 +1365,7 @@ again: + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ ++ free_extent_state(cached_state); + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; +@@ -1282,18 +1379,21 @@ again: + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ +- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ lock_extent_bits(tree, delalloc_start, delalloc_end, ++ 0, &cached_state, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, +- EXTENT_DELALLOC, 1); ++ EXTENT_DELALLOC, 1, cached_state); + if (!ret) { +- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ unlock_extent_cached(tree, delalloc_start, delalloc_end, ++ &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } ++ free_extent_state(cached_state); + 
*start = delalloc_start; + *end = delalloc_end; + out_failed: +@@ -1303,11 +1403,7 @@ out_failed: + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_pages, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback) ++ unsigned long op) + { + int ret; + struct page *pages[16]; +@@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + int i; + int clear_bits = 0; + +- if (clear_unlock) ++ if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_bits |= EXTENT_DIRTY; + +- if (clear_delalloc) ++ if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + +- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); +- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) ++ if (op & EXTENT_CLEAR_ACCOUNTING) ++ clear_bits |= EXTENT_DO_ACCOUNTING; ++ ++ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); ++ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | ++ EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { +@@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { ++ ++ if (op & EXTENT_SET_PRIVATE2) ++ SetPagePrivate2(pages[i]); ++ + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); +- if (set_writeback) ++ if (op & EXTENT_SET_WRITEBACK) + set_page_writeback(pages[i]); +- if (end_writeback) ++ if (op & EXTENT_END_WRITEBACK) + end_page_writeback(pages[i]); +- if (unlock_pages) ++ if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } +@@ -1476,14 +1581,17 @@ out: + * range is found set. 
+ */ + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled) ++ int bits, int filled, struct extent_state *cached) + { + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); +- node = tree_search(tree, start); ++ if (cached && cached->tree && cached->start == start) ++ node = &cached->rb_node; ++ else ++ node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + +@@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + bitset = 0; + break; + } ++ ++ if (state->end == (u64)-1) ++ break; ++ + start = state->end + 1; + if (start > end) + break; +@@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) ++ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); + return 0; + } +@@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) ++ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) + unlock_page(page); + return 0; + } +@@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree, + static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) + { +- u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +- u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) +- end_page_writeback(page); ++ end_page_writeback(page); + return 0; + } + +@@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) + } + + if (!uptodate) { +- clear_extent_uptodate(tree, start, end, GFP_ATOMIC); ++ clear_extent_uptodate(tree, start, end, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + +- clear_extent_writeback(tree, start, end, GFP_ATOMIC); +- + if (whole_page) + end_page_writeback(page); + else +@@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, + continue; + } + /* the get_extent function already copied into the page */ +- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { ++ if (test_range_bit(tree, cur, cur_end, ++ EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; +@@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + u64 iosize; + u64 unlock_start; + sector_t sector; ++ struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + int ret; +@@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { ++ u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. 
+@@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); ++ /* ++ * delalloc_end is already one less than the total ++ * length, so we don't subtract one from ++ * PAGE_CACHE_SIZE ++ */ ++ delalloc_to_write += (delalloc_end - delalloc_start + ++ PAGE_CACHE_SIZE) >> ++ PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } ++ if (wbc->nr_to_write < delalloc_to_write) { ++ int thresh = 8192; ++ ++ if (delalloc_to_write < thresh * 2) ++ thresh = delalloc_to_write; ++ wbc->nr_to_write = min_t(u64, delalloc_to_write, ++ thresh); ++ } + + /* did the fill delalloc function already unlock and start + * the IO? +@@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done_unlocked; + } + } +- lock_extent(tree, start, page_end, GFP_NOFS); +- +- unlock_start = start; +- + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { +- unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); + unlock_page(page); +@@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; +- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) +- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); +- + if (last_byte <= start) { +- clear_extent_dirty(tree, start, page_end, GFP_NOFS); +- unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); +@@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done; + } + +- set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { +- clear_extent_dirty(tree, cur, page_end, GFP_NOFS); +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); +@@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { +- clear_extent_dirty(tree, cur, +- cur + iosize - 1, GFP_NOFS); +- +- unlock_extent(tree, unlock_start, cur + iosize - 1, +- GFP_NOFS); +- + /* + * end_io notification does not happen here for + * compressed extents +@@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, +- EXTENT_DIRTY, 0)) { ++ EXTENT_DIRTY, 0, NULL)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + +- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); +@@ -2309,12 +2415,12 @@ done: + set_page_writeback(page); + end_page_writeback(page); + } +- if (unlock_start <= page_end) +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + + done_unlocked: + ++ /* drop our reference on any cached states 
*/ ++ free_extent_state(cached_state); + return 0; + } + +@@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) + { +- struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; ++ int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; +@@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + scanned = 1; + } + retry: +- while (!done && (index <= end) && ++ while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { +@@ -2412,12 +2518,15 @@ retry: + unlock_page(page); + ret = 0; + } +- if (ret || wbc->nr_to_write <= 0) +- done = 1; +- if (wbc->nonblocking && bdi_write_congested(bdi)) { +- wbc->encountered_congestion = 1; ++ if (ret) + done = 1; +- } ++ ++ /* ++ * the filesystem may choose to bump up nr_to_write. ++ * We have to make sure to honor the new nr_to_write ++ * at any time ++ */ ++ nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); +@@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, + return 0; + + lock_extent(tree, start, end, GFP_NOFS); +- wait_on_extent_writeback(tree, start, end); ++ wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, +- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, +- 1, 1, GFP_NOFS); ++ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, ++ 1, 1, NULL, GFP_NOFS); + return 0; + } + +@@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; +@@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, +- EXTENT_LOCKED, 0, NULL, GFP_NOFS); ++ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, +@@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map, + int ret = 1; + + if (test_range_bit(tree, start, end, +- EXTENT_IOBITS | EXTENT_ORDERED, 0)) ++ EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; +- clear_extent_bit(tree, start, end, EXTENT_UPTODATE, +- 1, 1, mask); ++ /* ++ * at this point we can safely clear everything except the ++ * locked bit and the nodatasum bit ++ */ ++ clear_extent_bit(tree, start, end, ++ ~(EXTENT_LOCKED | EXTENT_NODATASUM), ++ 0, 0, NULL, mask); + } + return ret; + } +@@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map, + u64 len; + while (start <= end) { + len = end - start + 1; +- spin_lock(&map->lock); ++ write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, +- EXTENT_LOCKED | EXTENT_WRITEBACK | +- EXTENT_ORDERED, +- 0)) { ++ 
EXTENT_LOCKED | EXTENT_WRITEBACK, ++ 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); +@@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int uptodate; + unsigned long index; + +- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); ++ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + while (start <= end) { +@@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1); ++ EXTENT_UPTODATE, 1, NULL); + if (ret) + return ret; + +@@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + +diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h +index 5bc20ab..36de250 100644 +--- a/fs/btrfs/extent_io.h ++++ b/fs/btrfs/extent_io.h +@@ -13,10 +13,9 @@ + #define EXTENT_DEFRAG (1 << 6) + #define EXTENT_DEFRAG_DONE (1 << 7) + #define EXTENT_BUFFER_FILLED (1 << 8) +-#define EXTENT_ORDERED (1 << 9) +-#define EXTENT_ORDERED_METADATA (1 << 10) +-#define EXTENT_BOUNDARY (1 << 11) +-#define EXTENT_NODATASUM (1 << 12) ++#define EXTENT_BOUNDARY (1 << 9) ++#define EXTENT_NODATASUM (1 << 10) ++#define EXTENT_DO_ACCOUNTING (1 << 11) + #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + + /* flags for bio submission */ +@@ -27,6 +26,16 @@ + #define EXTENT_BUFFER_BLOCKING 1 + #define EXTENT_BUFFER_DIRTY 2 + ++/* these are flags for extent_clear_unlock_delalloc */ ++#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 ++#define EXTENT_CLEAR_UNLOCK 0x2 ++#define EXTENT_CLEAR_DELALLOC 0x4 ++#define EXTENT_CLEAR_DIRTY 0x8 ++#define EXTENT_SET_WRITEBACK 0x10 ++#define EXTENT_END_WRITEBACK 0x20 ++#define EXTENT_SET_PRIVATE2 0x40 ++#define EXTENT_CLEAR_ACCOUNTING 0x80 ++ + /* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. 
+@@ -62,8 +71,13 @@ struct extent_io_ops { + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); +- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits); ++ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, ++ unsigned long bits); ++ int (*merge_extent_hook)(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other); ++ int (*split_extent_hook)(struct inode *inode, ++ struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page); + }; + +@@ -81,10 +95,14 @@ struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; ++ ++ /* ADD NEW ELEMENTS AFTER THIS */ + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; ++ u64 split_start; ++ u64 split_end; + + /* for use by the FS */ + u64 private; +@@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); + int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached, gfp_t mask); + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +@@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree, + u64 max_bytes, unsigned long bits); + + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled); ++ int bits, int filled, struct extent_state *cached_state); + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask); ++ int bits, int wake, int delete, struct extent_state **cached, ++ gfp_t mask); + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, +@@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_page, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback); ++ unsigned long op); + #endif +diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c +index 30c9365..2c726b7 100644 +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -36,7 +36,7 @@ void extent_map_exit(void) + void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) + { + tree->map.rb_node = NULL; +- spin_lock_init(&tree->lock); ++ rwlock_init(&tree->lock); + } + + /** +@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) + return 0; + } + ++int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) ++{ ++ int ret = 0; ++ struct extent_map *merge = NULL; ++ struct rb_node *rb; ++ struct extent_map *em; ++ ++ write_lock(&tree->lock); ++ em = lookup_extent_mapping(tree, start, len); ++ ++ WARN_ON(em->start != start || !em); ++ ++ if (!em) ++ goto out; ++ ++ clear_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ if (em->start != 0) { ++ rb = rb_prev(&em->rb_node); ++ if (rb) ++ 
merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(merge, em)) { ++ em->start = merge->start; ++ em->len += merge->len; ++ em->block_len += merge->block_len; ++ em->block_start = merge->block_start; ++ merge->in_tree = 0; ++ rb_erase(&merge->rb_node, &tree->map); ++ free_extent_map(merge); ++ } ++ } ++ ++ rb = rb_next(&em->rb_node); ++ if (rb) ++ merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(em, merge)) { ++ em->len += merge->len; ++ em->block_len += merge->len; ++ rb_erase(&merge->rb_node, &tree->map); ++ merge->in_tree = 0; ++ free_extent_map(merge); ++ } ++ ++ free_extent_map(em); ++out: ++ write_unlock(&tree->lock); ++ return ret; ++ ++} ++ + /** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in +@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree, + ret = -EEXIST; + goto out; + } +- assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; +@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + struct rb_node *next = NULL; + u64 end = range_end(start, len); + +- assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); +@@ -319,6 +367,54 @@ out: + } + + /** ++ * search_extent_mapping - find a nearby extent map ++ * @tree: tree to lookup in ++ * @start: byte offset to start the search ++ * @len: length of the lookup range ++ * ++ * Find and return the first extent_map struct in @tree that intersects the ++ * [start, len] range. ++ * ++ * If one can't be found, any nearby extent may be returned ++ */ ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len) ++{ ++ struct extent_map *em; ++ struct rb_node *rb_node; ++ struct rb_node *prev = NULL; ++ struct rb_node *next = NULL; ++ ++ rb_node = __tree_search(&tree->map, start, &prev, &next); ++ if (!rb_node && prev) { ++ em = rb_entry(prev, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node && next) { ++ em = rb_entry(next, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node) { ++ em = NULL; ++ goto out; ++ } ++ if (IS_ERR(rb_node)) { ++ em = ERR_PTR(PTR_ERR(rb_node)); ++ goto out; ++ } ++ em = rb_entry(rb_node, struct extent_map, rb_node); ++ goto found; ++ ++ em = NULL; ++ goto out; ++ ++found: ++ atomic_inc(&em->refs); ++out: ++ return em; ++} ++ ++/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed +@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); +- assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h +index fb6eeef..ab6d74b 100644 +--- a/fs/btrfs/extent_map.h ++++ b/fs/btrfs/extent_map.h +@@ -31,7 +31,7 @@ struct extent_map { + + struct extent_map_tree { + struct rb_root map; +- spinlock_t lock; ++ rwlock_t lock; + }; + + static inline u64 extent_map_end(struct extent_map *em) +@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask); + void free_extent_map(struct extent_map *em); + int __init extent_map_init(void); + void extent_map_exit(void); ++int unpin_extent_cache(struct extent_map_tree *tree, u64 
start, u64 len); ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len); + #endif +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 4b83397..4599113 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +- u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; +@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; ++ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); ++ if (err) ++ return err; + +- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- trans = btrfs_join_transaction(root, 1); +- if (!trans) { +- err = -ENOMEM; +- goto out_unlock; +- } +- btrfs_set_trans_block_group(trans, inode); +- hint_byte = 0; +- +- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- +- /* check for reserved extents on each page, we don't want +- * to reset the delalloc bit on things that already have +- * extents reserved. +- */ +- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); +@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + * at this time. + */ + } +- err = btrfs_end_transaction(trans, root); +-out_unlock: +- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; + } + +@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { +- spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); ++ write_unlock(&em_tree->lock); + break; + } + if (start < em->start) { +@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + start = em->start + em->len; + } + free_extent_map(em); ++ write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + free_extent_map(split); + split = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); +@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_byte) ++ u64 inline_limit, u64 *hint_byte, int drop_cache) + { + u64 extent_end = 0; + u64 search_start = start; +@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + int ret; + + inline_limit = 0; +- btrfs_drop_extent_cache(inode, start, end - 1, 0); ++ if (drop_cache) ++ btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = 
btrfs_alloc_path(); + if (!path) +@@ -894,7 +878,8 @@ again: + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, +- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, ++ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); +@@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); ++ ++ /* do the reserve before the mutex lock in case we have to do some ++ * flushing. We wouldn't deadlock, but this is more polite. ++ */ ++ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (err) ++ goto out_nolock; ++ ++ mutex_lock(&inode->i_mutex); ++ + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) +- goto out_nolock; ++ goto out; ++ + if (count == 0) +- goto out_nolock; ++ goto out; + + err = file_remove_suid(file); + if (err) +- goto out_nolock; ++ goto out; ++ + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + +- mutex_lock(&inode->i_mutex); ++ /* generic_write_checks can change our pos */ ++ start_pos = pos; ++ + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; +@@ -1047,6 +1046,7 @@ out: + mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + + out_nolock: + kfree(pages); +@@ -1087,8 +1087,10 @@ out_nolock: + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); +- } else { ++ } else if (ret != BTRFS_NO_LOG_SYNC) { + btrfs_commit_transaction(trans, root); ++ } else { ++ btrfs_end_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { +@@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + int ret = 0; + struct btrfs_trans_handle *trans; + ++ ++ /* we wait first, since the writeback may change the inode */ ++ root->log_batch++; ++ /* the VFS called filemap_fdatawrite for us */ ++ btrfs_wait_ordered_range(inode, 0, (u64)-1); ++ root->log_batch++; ++ + /* + * check the transaction that last modified this inode + * and see if its already been committed +@@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + if (!BTRFS_I(inode)->last_trans) + goto out; + ++ /* ++ * if the last transaction that changed this file was before ++ * the current transaction, we can bail out now without any ++ * syncing ++ */ + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { +@@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + } + mutex_unlock(&root->fs_info->trans_mutex); + +- root->log_batch++; +- filemap_fdatawrite(inode->i_mapping); +- btrfs_wait_ordered_range(inode, 0, (u64)-1); +- root->log_batch++; +- +- if (datasync && !(inode->i_state & I_DIRTY_PAGES)) +- goto out; + /* + * ok we haven't committed the transaction yet, lets do a commit + */ +@@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + */ + mutex_unlock(&dentry->d_inode->i_mutex); + +- if (ret > 0) { +- ret = btrfs_commit_transaction(trans, root); +- } else { +- ret = btrfs_sync_log(trans, root); +- if (ret == 0) +- ret = 
btrfs_end_transaction(trans, root); +- else ++ if (ret != BTRFS_NO_LOG_SYNC) { ++ if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); ++ } else { ++ ret = btrfs_sync_log(trans, root); ++ if (ret == 0) ++ ret = btrfs_end_transaction(trans, root); ++ else ++ ret = btrfs_commit_transaction(trans, root); ++ } ++ } else { ++ ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&dentry->d_inode->i_mutex); + out: +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 5edcee3..5c2caad 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, + + static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + { +- u64 max_bytes, possible_bytes; ++ u64 max_bytes; ++ u64 bitmap_bytes; ++ u64 extent_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space +@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + +- possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + +- (sizeof(struct btrfs_free_space) * +- block_group->extents_thresh); ++ /* ++ * we want to account for 1 more bitmap than what we have so we can make ++ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as ++ * we add more bitmaps. ++ */ ++ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; + +- if (possible_bytes > max_bytes) { +- int extent_bytes = max_bytes - +- (block_group->total_bitmaps * PAGE_CACHE_SIZE); ++ if (bitmap_bytes >= max_bytes) { ++ block_group->extents_thresh = 0; ++ return; ++ } + +- if (extent_bytes <= 0) { +- block_group->extents_thresh = 0; +- return; +- } ++ /* ++ * we want the extent entry threshold to always be at most 1/2 the maxw ++ * bytes we can have, or whatever is less than that. 
++ */ ++ extent_bytes = max_bytes - bitmap_bytes; ++ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + +- block_group->extents_thresh = extent_bytes / +- (sizeof(struct btrfs_free_space)); +- } ++ block_group->extents_thresh = ++ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + } + + static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, +@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); ++ info->bytes = 0; + link_free_space(block_group, info); + block_group->total_bitmaps++; + +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index 6b627c6..72ce3c1 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { ++ if (ret == -EOVERFLOW) ++ ret = -EMLINK; + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], +@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); +- if (ret == 0 && objectid > root->highest_inode) +- root->highest_inode = objectid; + return ret; + } + +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 9abbced..c56eb59 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); +- *objectid = found_key.objectid; ++ *objectid = max_t(u64, found_key.objectid, ++ BTRFS_FIRST_FREE_OBJECTID - 1); + } else { +- *objectid = BTRFS_FIRST_FREE_OBJECTID; ++ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; + error: +@@ -53,91 +54,27 @@ error: + return ret; + } + +-/* +- * walks the btree of allocated inodes and find a hole. +- */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) + { +- struct btrfs_path *path; +- struct btrfs_key key; + int ret; +- int slot = 0; +- u64 last_ino = 0; +- int start_found; +- struct extent_buffer *l; +- struct btrfs_key search_key; +- u64 search_start = dirid; +- + mutex_lock(&root->objectid_mutex); +- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && +- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { +- *objectid = ++root->last_inode_alloc; +- mutex_unlock(&root->objectid_mutex); +- return 0; +- } +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); +- search_key.objectid = search_start; +- search_key.type = 0; +- search_key.offset = 0; +- +- start_found = 0; +- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); +- if (ret < 0) +- goto error; + +- while (1) { +- l = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(l)) { +- ret = btrfs_next_leaf(root, path); +- if (ret == 0) +- continue; +- if (ret < 0) +- goto error; +- if (!start_found) { +- *objectid = search_start; +- start_found = 1; +- goto found; +- } +- *objectid = last_ino > search_start ? 
+- last_ino : search_start; +- goto found; +- } +- btrfs_item_key_to_cpu(l, &key, slot); +- if (key.objectid >= search_start) { +- if (start_found) { +- if (last_ino < search_start) +- last_ino = search_start; +- if (key.objectid > last_ino) { +- *objectid = last_ino; +- goto found; +- } +- } else if (key.objectid > search_start) { +- *objectid = search_start; +- goto found; +- } +- } +- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) +- break; ++ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_find_highest_inode(root, &root->highest_objectid); ++ if (ret) ++ goto out; ++ } + +- start_found = 1; +- last_ino = key.objectid + 1; +- path->slots[0]++; ++ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ++ ret = -ENOSPC; ++ goto out; + } +- BUG_ON(1); +-found: +- btrfs_release_path(root, path); +- btrfs_free_path(path); +- BUG_ON(*objectid < search_start); +- mutex_unlock(&root->objectid_mutex); +- return 0; +-error: +- btrfs_release_path(root, path); +- btrfs_free_path(path); ++ ++ *objectid = ++root->highest_objectid; ++ ret = 0; ++out: + mutex_unlock(&root->objectid_mutex); + return ret; + } +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 59cba18..f69e5e0 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + } + + ret = btrfs_drop_extents(trans, root, inode, start, +- aligned_end, aligned_end, start, &hint_byte); ++ aligned_end, aligned_end, start, ++ &hint_byte, 1); + BUG_ON(ret); + + if (isize > actual_end) +@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); +- btrfs_drop_extent_cache(inode, start, aligned_end, 0); ++ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + return 0; + } + +@@ -423,9 +424,12 @@ again: + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 0, +- 0, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + ret = 0; + goto free_pages_out; + } +@@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode, + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode, + * clear dirty, set writeback and unlock the pages. 
+ */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- async_extent->start, +- async_extent->start + +- async_extent->ram_size - 1, +- NULL, 1, 1, 0, 1, 1, 0); ++ &BTRFS_I(inode)->io_tree, ++ async_extent->start, ++ async_extent->start + ++ async_extent->ram_size - 1, ++ NULL, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, +@@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 1, +- 1, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | ++ EXTENT_END_WRITEBACK); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; +@@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode, + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + ++ ++ read_lock(&BTRFS_I(inode)->extent_tree.lock); ++ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, ++ start, num_bytes); ++ if (em) { ++ alloc_hint = em->block_start; ++ free_extent_map(em); ++ } ++ read_unlock(&BTRFS_I(inode)->extent_tree.lock); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { ++ unsigned long op; ++ + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, +@@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode, + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; +- + ram_size = ins.offset; + em->len = ins.offset; + +@@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode, + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode, + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits ++ * ++ * Do set the Private2 bit so we know this page was properly ++ * setup for writepage + */ ++ op = unlock ? 
EXTENT_CLEAR_UNLOCK_PAGE : 0; ++ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2; ++ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, +- locked_page, unlock, 1, +- 1, 0, 0, 0); ++ locked_page, op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; +@@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 cur_end; + int limit = 10 * 1024 * 1042; + +- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | +- EXTENT_DELALLOC, 1, 0, GFP_NOFS); ++ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, ++ 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; +@@ -994,6 +1023,7 @@ next_slot: + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; ++ extent_type = 0; + goto out_check; + } + +@@ -1080,9 +1110,9 @@ out_check: + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -1100,8 +1130,10 @@ out_check: + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, +- cur_offset, cur_offset + num_bytes - 1, +- locked_page, 1, 1, 1, 0, 0, 0); ++ cur_offset, cur_offset + num_bytes - 1, ++ locked_page, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2); + cur_offset = extent_end; + if (cur_offset > end) + break; +@@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + return ret; + } + ++static int btrfs_split_extent_hook(struct inode *inode, ++ struct extent_state *orig, u64 split) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 size; ++ ++ if (!(orig->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ size = orig->end - orig->start + 1; ++ if (size > root->fs_info->max_extent) { ++ u64 num_extents; ++ u64 new_size; ++ ++ new_size = orig->end - split + 1; ++ num_extents = div64_u64(size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ ++ /* ++ * if we break a large extent up then leave oustanding_extents ++ * be, since we've already accounted for the large extent. ++ */ ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) < num_extents) ++ return 0; ++ } ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ ++/* ++ * extent_io.c merge_extent_hook, used to track merged delayed allocation ++ * extents so we can keep track of new extents that are just merged onto old ++ * extents, such as when we are doing sequential writes, so we can properly ++ * account for the metadata space we'll need. 
++ */ ++static int btrfs_merge_extent_hook(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 new_size, old_size; ++ u64 num_extents; ++ ++ /* not delalloc, ignore it */ ++ if (!(other->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ old_size = other->end - other->start + 1; ++ if (new->start < other->start) ++ new_size = other->end - new->start + 1; ++ else ++ new_size = new->end - other->start + 1; ++ ++ /* we're not bigger than the max, unreserve the space and go */ ++ if (new_size <= root->fs_info->max_extent) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ return 0; ++ } ++ ++ /* ++ * If we grew by another max_extent, just return, we want to keep that ++ * reserved amount. ++ */ ++ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) > num_extents) ++ return 0; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ + /* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that +@@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) + { ++ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC +@@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_delalloc_reserve_space(root, inode, end - start + 1); + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; +@@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + /* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits) ++static int btrfs_clear_bit_hook(struct inode *inode, ++ struct extent_state *state, unsigned long bits) + { + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ +- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { ++ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + ++ if (bits & EXTENT_DO_ACCOUNTING) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ } ++ + spin_lock(&root->fs_info->delalloc_lock); +- if (end - start + 1 > root->fs_info->delalloc_bytes) { ++ if (state->end - state->start + 1 > ++ root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", +- (unsigned long long)end - start + 1, 
++ (unsigned long long) ++ state->end - state->start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); +@@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + btrfs_delalloc_free_space(root, inode, +- end - start + 1); +- root->fs_info->delalloc_bytes -= end - start + 1; +- BTRFS_I(inode)->delalloc_bytes -= end - start + 1; ++ state->end - ++ state->start + 1); ++ root->fs_info->delalloc_bytes -= state->end - ++ state->start + 1; ++ BTRFS_I(inode)->delalloc_bytes -= state->end - ++ state->start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +@@ -1374,10 +1506,8 @@ again: + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? We're done */ +- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, +- EXTENT_ORDERED, 0)) { ++ if (PagePrivate2(page)) + goto out; +- } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { +@@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; +- int ret; + +- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, +- EXTENT_ORDERED, 0); +- if (ret) ++ /* this page is properly in the ordered list */ ++ if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) +@@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + BUG_ON(!path); + + path->leave_spinning = 1; ++ ++ /* ++ * we may be replacing one extent in the tree with another. ++ * The new extent is pinned in the extent map, and we don't want ++ * to drop it from the cache until it is completely in the btree. ++ * ++ * So, tell btrfs_drop_extents to leave this extent in the cache. ++ * the caller is expected to unpin it and allow it to be merged ++ * with the others. 
++ */ + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, locked_end, +- file_pos, &hint); ++ file_pos, &hint, 0); + BUG_ON(ret); + + ins.objectid = inode->i_ino; +@@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); +- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; +@@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); ++ unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ++ ordered_extent->file_offset, ++ ordered_extent->len); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, +@@ -1623,6 +1763,7 @@ nocow: + static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) + { ++ ClearPagePrivate2(page); + return btrfs_finish_ordered_io(page->mapping->host, start, end); + } + +@@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, + failrec->last_mirror = 0; + failrec->bio_flags = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); +@@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && +- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { ++ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; +@@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) + return ret; + } + ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len) ++{ ++ struct btrfs_path *path; ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u64 index; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, ++ name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, ++ objectid, root->root_key.objectid, ++ dir->i_ino, &index, name, name_len); ++ if (ret < 0) { ++ BUG_ON(ret != -ENOENT); ++ di = btrfs_search_dir_index_item(root, path, dir->i_ino, ++ name, name_len); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(root, path); ++ index = key.offset; ++ } ++ ++ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, ++ index, name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = 
btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ btrfs_i_size_write(dir, dir->i_size - name_len * 2); ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ret = btrfs_update_inode(trans, root, dir); ++ BUG_ON(ret); ++ dir->i_sb->s_dirt = 1; ++ ++ btrfs_free_path(path); ++ return 0; ++} ++ + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + { + struct inode *inode = dentry->d_inode; +@@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + +- /* +- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir +- * the root of a subvolume or snapshot +- */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || +- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -ENOTEMPTY; +- } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + ++ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ err = btrfs_unlink_subvol(trans, root, dir, ++ BTRFS_I(inode)->location.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ goto out; ++ } ++ + err = btrfs_orphan_add(trans, inode); + if (err) +- goto fail_trans; ++ goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +- +-fail_trans: ++out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); +@@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) + + if ((offset & (blocksize - 1)) == 0) + goto out; ++ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); ++ if (ret) ++ goto out; ++ ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) ++ goto out; + + ret = -ENOMEM; + again: + page = grab_cache_page(mapping, index); +- if (!page) ++ if (!page) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + goto out; ++ } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; +@@ -2864,7 +3080,16 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ goto out_unlock; ++ } ++ + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); +@@ -2877,6 +3102,9 @@ again: + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ if (ret) ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + unlock_page(page); + page_cache_release(page); + out: +@@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + u64 last_byte; + u64 cur_offset; + u64 hole_size; +- int err; ++ int err = 0; + + if (size <= hole_start) + return 0; + +- err = btrfs_check_metadata_free_space(root); ++ err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); +- + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, 
hole_start, +@@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + cur_offset, + cur_offset + hole_size, + block_end, +- cur_offset, &hint_byte); ++ cur_offset, &hint_byte, 1); + if (err) + break; ++ ++ err = btrfs_reserve_metadata_space(root, 1); ++ if (err) ++ break; ++ + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); ++ btrfs_unreserve_metadata_space(root, 1); + } + free_extent_map(em); + cur_offset = last_byte; +@@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode) + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + ++ if (inode->i_nlink > 0) { ++ BUG_ON(btrfs_root_refs(&root->root_item) != 0); ++ goto no_delete; ++ } ++ + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + +@@ -3070,29 +3307,67 @@ out_err: + * is kind of like crossing a mount point. + */ + static int fixup_tree_root_location(struct btrfs_root *root, +- struct btrfs_key *location, +- struct btrfs_root **sub_root, +- struct dentry *dentry) ++ struct inode *dir, ++ struct dentry *dentry, ++ struct btrfs_key *location, ++ struct btrfs_root **sub_root) + { +- struct btrfs_root_item *ri; ++ struct btrfs_path *path; ++ struct btrfs_root *new_root; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; ++ int ret; ++ int err = 0; + +- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) +- return 0; +- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) +- return 0; ++ path = btrfs_alloc_path(); ++ if (!path) { ++ err = -ENOMEM; ++ goto out; ++ } + +- *sub_root = btrfs_read_fs_root(root->fs_info, location, +- dentry->d_name.name, +- dentry->d_name.len); +- if (IS_ERR(*sub_root)) +- return PTR_ERR(*sub_root); ++ err = -ENOENT; ++ ret = btrfs_find_root_ref(root->fs_info->tree_root, path, ++ BTRFS_I(dir)->root->root_key.objectid, ++ location->objectid); ++ if (ret) { ++ if (ret < 0) ++ err = ret; ++ goto out; ++ } + +- ri = &(*sub_root)->root_item; +- location->objectid = btrfs_root_dirid(ri); +- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); +- location->offset = 0; ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); ++ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || ++ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) ++ goto out; + +- return 0; ++ ret = memcmp_extent_buffer(leaf, dentry->d_name.name, ++ (unsigned long)(ref + 1), ++ dentry->d_name.len); ++ if (ret) ++ goto out; ++ ++ btrfs_release_path(root->fs_info->tree_root, path); ++ ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, location); ++ if (IS_ERR(new_root)) { ++ err = PTR_ERR(new_root); ++ goto out; ++ } ++ ++ if (btrfs_root_refs(&new_root->root_item) == 0) { ++ err = -ENOENT; ++ goto out; ++ } ++ ++ *sub_root = new_root; ++ location->objectid = btrfs_root_dirid(&new_root->root_item); ++ location->type = BTRFS_INODE_ITEM_KEY; ++ location->offset = 0; ++ err = 0; ++out: ++ btrfs_free_path(path); ++ return err; + } + + static void inode_tree_add(struct inode *inode) +@@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode) + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; +- + again: + p = &root->inode_tree.rb_node; + parent = NULL; + ++ if (hlist_unhashed(&inode->i_hash)) ++ return; ++ + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; +@@ -3132,13 +3409,87 @@ again: + static void inode_tree_del(struct inode *inode) + { + struct btrfs_root 
*root = BTRFS_I(inode)->root; ++ int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ } ++ spin_unlock(&root->inode_lock); ++ ++ if (empty && btrfs_root_refs(&root->root_item) == 0) { ++ synchronize_srcu(&root->fs_info->subvol_srcu); ++ spin_lock(&root->inode_lock); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ spin_unlock(&root->inode_lock); ++ if (empty) ++ btrfs_add_dead_root(root); ++ } ++} ++ ++int btrfs_invalidate_inodes(struct btrfs_root *root) ++{ ++ struct rb_node *node; ++ struct rb_node *prev; ++ struct btrfs_inode *entry; ++ struct inode *inode; ++ u64 objectid = 0; ++ ++ WARN_ON(btrfs_root_refs(&root->root_item) != 0); ++ ++ spin_lock(&root->inode_lock); ++again: ++ node = root->inode_tree.rb_node; ++ prev = NULL; ++ while (node) { ++ prev = node; ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ ++ if (objectid < entry->vfs_inode.i_ino) ++ node = node->rb_left; ++ else if (objectid > entry->vfs_inode.i_ino) ++ node = node->rb_right; ++ else ++ break; ++ } ++ if (!node) { ++ while (prev) { ++ entry = rb_entry(prev, struct btrfs_inode, rb_node); ++ if (objectid <= entry->vfs_inode.i_ino) { ++ node = prev; ++ break; ++ } ++ prev = rb_next(prev); ++ } ++ } ++ while (node) { ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ objectid = entry->vfs_inode.i_ino + 1; ++ inode = igrab(&entry->vfs_inode); ++ if (inode) { ++ spin_unlock(&root->inode_lock); ++ if (atomic_read(&inode->i_count) > 1) ++ d_prune_aliases(inode); ++ /* ++ * btrfs_drop_inode will remove it from ++ * the inode cache when its usage count ++ * hits zero. ++ */ ++ iput(inode); ++ cond_resched(); ++ spin_lock(&root->inode_lock); ++ goto again; ++ } ++ ++ if (cond_resched_lock(&root->inode_lock)) ++ goto again; ++ ++ node = rb_next(node); + } + spin_unlock(&root->inode_lock); ++ return 0; + } + + static noinline void init_btrfs_i(struct inode *inode) +@@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode) + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; ++ bi->last_sub_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; +@@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + return inode; + } + ++static struct inode *new_simple_dir(struct super_block *s, ++ struct btrfs_key *key, ++ struct btrfs_root *root) ++{ ++ struct inode *inode = new_inode(s); ++ ++ if (!inode) ++ return ERR_PTR(-ENOMEM); ++ ++ init_btrfs_i(inode); ++ ++ BTRFS_I(inode)->root = root; ++ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); ++ BTRFS_I(inode)->dummy_inode = 1; ++ ++ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; ++ inode->i_op = &simple_dir_inode_operations; ++ inode->i_fop = &simple_dir_operations; ++ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ ++ return inode; ++} ++ + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + { + struct inode *inode; +- struct btrfs_inode *bi = BTRFS_I(dir); +- struct btrfs_root *root = bi->root; ++ struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; ++ int index; + int ret; + ++ dentry->d_op = &btrfs_dentry_operations; ++ + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +@@ 
-3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + if (ret < 0) + return ERR_PTR(ret); + +- inode = NULL; +- if (location.objectid) { +- ret = fixup_tree_root_location(root, &location, &sub_root, +- dentry); +- if (ret < 0) +- return ERR_PTR(ret); +- if (ret > 0) +- return ERR_PTR(-ENOENT); ++ if (location.objectid == 0) ++ return NULL; ++ ++ if (location.type == BTRFS_INODE_ITEM_KEY) { ++ inode = btrfs_iget(dir->i_sb, &location, root); ++ return inode; ++ } ++ ++ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); ++ ++ index = srcu_read_lock(&root->fs_info->subvol_srcu); ++ ret = fixup_tree_root_location(root, dir, dentry, ++ &location, &sub_root); ++ if (ret < 0) { ++ if (ret != -ENOENT) ++ inode = ERR_PTR(ret); ++ else ++ inode = new_simple_dir(dir->i_sb, &location, sub_root); ++ } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root); +- if (IS_ERR(inode)) +- return ERR_CAST(inode); + } ++ srcu_read_unlock(&root->fs_info->subvol_srcu, index); ++ + return inode; + } + ++static int btrfs_dentry_delete(struct dentry *dentry) ++{ ++ struct btrfs_root *root; ++ ++ if (!dentry->d_inode && !IS_ROOT(dentry)) ++ dentry = dentry->d_parent; ++ ++ if (dentry->d_inode) { ++ root = BTRFS_I(dentry->d_inode)->root; ++ if (btrfs_root_refs(&root->root_item) == 0) ++ return 1; ++ } ++ return 0; ++} ++ + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) + { + struct inode *inode; + +- if (dentry->d_name.len > BTRFS_NAME_LEN) +- return ERR_PTR(-ENAMETOOLONG); +- + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); +@@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + if (ret != 0) + goto fail; + +- if (objectid > root->highest_inode) +- root->highest_inode = objectid; +- + inode->i_uid = current_fsuid(); + + if (dir && (dir->i_mode & S_ISGID)) { +@@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) + { +- int ret; ++ int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + +- key.objectid = inode->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); +- key.offset = 0; ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); ++ } else { ++ key.objectid = inode->i_ino; ++ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.offset = 0; ++ } ++ ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, ++ key.objectid, root->root_key.objectid, ++ parent_inode->i_ino, ++ index, name, name_len); ++ } else if (add_backref) { ++ ret = btrfs_insert_inode_ref(trans, root, ++ name, name_len, inode->i_ino, ++ parent_inode->i_ino, index); ++ } + +- ret = btrfs_insert_dir_item(trans, root, name, name_len, +- parent_inode->i_ino, +- &key, btrfs_inode_type(inode), +- index); + if (ret == 0) { +- if (add_backref) { +- ret = btrfs_insert_inode_ref(trans, root, +- name, name_len, +- inode->i_ino, +- parent_inode->i_ino, +- index); +- } ++ ret = btrfs_insert_dir_item(trans, root, name, name_len, ++ parent_inode->i_ino, &key, ++ btrfs_inode_type(inode), index); ++ BUG_ON(ret); ++ + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; +@@ -3732,11 +4139,18 @@ 
static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + if (!new_valid_dev(rdev)) + return -EINVAL; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3774,6 +4188,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, + u64 objectid; + u64 index = 0; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; ++ + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3838,6 +4261,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + if (inode->i_nlink == 0) + return -ENOENT; + +- btrfs_inc_nlink(inode); +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 item for inode ref ++ * 2 items for dir items ++ */ ++ err = btrfs_reserve_metadata_space(root, 3); + if (err) +- goto fail; ++ return err; ++ ++ btrfs_inc_nlink(inode); ++ + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; +@@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + +- if (err) +- drop_inode = 1; +- +- btrfs_update_inode_block_group(trans, dir); +- err = btrfs_update_inode(trans, root, inode); +- +- if (err) ++ if (err) { + drop_inode = 1; ++ } else { ++ btrfs_update_inode_block_group(trans, dir); ++ err = btrfs_update_inode(trans, root, inode); ++ BUG_ON(err); ++ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); ++ } + + nr = trans->blocks_used; +- +- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 3); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) + u64 index = 0; + unsigned long nr = 1; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode and ref ++ * 2 items for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_unlock; ++ return err; + + trans = btrfs_start_transaction(root, 1); +- btrfs_set_trans_block_group(trans, dir); +- +- if (IS_ERR(trans)) { +- err = PTR_ERR(trans); ++ if (!trans) { ++ err = -ENOMEM; + goto out_unlock; + } ++ btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { +@@ -3967,6 +4400,7 @@ out_fail: + 
btrfs_end_transaction_throttle(trans, root); + + out_unlock: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); +@@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + int compressed; + + again: +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) +@@ -4215,6 +4649,11 @@ again: + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); ++ if (pg_offset + copy_size < PAGE_CACHE_SIZE) { ++ memset(map + pg_offset + copy_size, 0, ++ PAGE_CACHE_SIZE - pg_offset - ++ copy_size); ++ } + kunmap(page); + } + flush_dcache_page(page); +@@ -4259,7 +4698,7 @@ insert: + } + + err = 0; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that +@@ -4299,7 +4738,7 @@ insert: + err = 0; + } + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + out: + if (path) + btrfs_free_path(path); +@@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + ++ ++ /* ++ * we have the page locked, so new writeback can't start, ++ * and the dirty bit won't be cleared while we are here. ++ * ++ * Wait for IO on this page so that we can safely clear ++ * the PagePrivate2 bit and do ordered accounting ++ */ + wait_on_page_writeback(page); ++ + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } +- + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); +@@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_LOCKED, 1, 0, GFP_NOFS); +- btrfs_finish_ordered_io(page->mapping->host, +- page_start, page_end); ++ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, ++ NULL, GFP_NOFS); ++ /* ++ * whoever cleared the private bit is responsible ++ * for the finish_ordered_io ++ */ ++ if (TestClearPagePrivate2(page)) { ++ btrfs_finish_ordered_io(page->mapping->host, ++ page_start, page_end); ++ } + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_ORDERED, +- 1, 1, GFP_NOFS); ++ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); +@@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + goto out; + } + ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ + again: + lock_page(page); +@@ -4504,7 +4964,24 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ /* ++ * XXX - page_mkwrite gets called 
every time the page is dirtied, even ++ * if it was already dirty, so for space accounting reasons we need to ++ * clear any delalloc bits for the range we are fixing to save. There ++ * is probably a better way to do this, but for now keep consistent with ++ * prepare_pages in the normal write path. ++ */ ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ ret = VM_FAULT_SIGBUS; ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ goto out_unlock; ++ } + ret = 0; + + /* page is wholly or partially inside EOF */ +@@ -4521,11 +4998,17 @@ again: + } + ClearPageChecked(page); + set_page_dirty(page); ++ SetPageUptodate(page); ++ ++ BTRFS_I(inode)->last_trans = root->fs_info->generation; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + +- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ if (!ret) ++ return VM_FAULT_LOCKED; + unlock_page(page); + out: + return ret; +@@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ if (ret) ++ return; + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); +@@ -4594,11 +5079,11 @@ out: + * create a new subvolume directory/inode (helper for the ioctl). + */ + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint) + { + struct inode *inode; +- int error; ++ int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, +@@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + +- error = btrfs_update_inode(trans, new_root, inode); +- if (error) +- return error; ++ err = btrfs_update_inode(trans, new_root, inode); ++ BUG_ON(err); + +- d_instantiate(dentry, inode); ++ iput(inode); + return 0; + } + +@@ -4640,7 +5124,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + if (!ei) + return NULL; + ei->last_trans = 0; ++ ei->last_sub_trans = 0; + ei->logged_trans = 0; ++ ei->outstanding_extents = 0; ++ ei->reserved_extents = 0; ++ ei->root = NULL; ++ spin_lock_init(&ei->accounting_lock); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); +@@ -4656,6 +5145,14 @@ void btrfs_destroy_inode(struct inode *inode) + WARN_ON(inode->i_data.nrpages); + + /* ++ * This can happen where we create an inode, but somebody else also ++ * created the same inode and we need to destroy the one we already ++ * created. ++ */ ++ if (!root) ++ goto free; ++ ++ /* + * Make sure we're properly removed from the ordered operation + * lists. 
+ */ +@@ -4690,9 +5187,20 @@ void btrfs_destroy_inode(struct inode *inode) + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); ++free: + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + } + ++void btrfs_drop_inode(struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) ++ generic_delete_inode(inode); ++ else ++ generic_drop_inode(inode); ++} ++ + static void init_once(void *foo) + { + struct btrfs_inode *ei = (struct btrfs_inode *) foo; +@@ -4761,31 +5269,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; ++ struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; ++ u64 root_objectid; + int ret; + +- /* we're not allowed to rename between subvolumes */ +- if (BTRFS_I(old_inode)->root->root_key.objectid != +- BTRFS_I(new_dir)->root->root_key.objectid) ++ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) ++ return -EPERM; ++ ++ /* we only allow rename subvolume link between subvolumes */ ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + ++ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || ++ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) ++ return -ENOTEMPTY; ++ + if (S_ISDIR(old_inode->i_mode) && new_inode && +- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { ++ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; +- } + +- /* to rename a snapshot or subvolume, we need to juggle the +- * backrefs. This isn't coded yet ++ /* ++ * 2 items for dir items ++ * 1 item for orphan entry ++ * 1 item for ref + */ +- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) +- return -EXDEV; +- +- ret = btrfs_check_metadata_free_space(root); ++ ret = btrfs_reserve_metadata_space(root, 4); + if (ret) +- goto out_unlock; ++ return ret; + + /* + * we're using rename to replace one file with another. +@@ -4796,8 +5310,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + ++ /* close the racy window with snapshot create/destroy ioctl */ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ down_read(&root->fs_info->subvol_sem); ++ + trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, new_dir); ++ ++ if (dest != root) ++ btrfs_record_root_in_trans(trans, dest); ++ ++ ret = btrfs_set_inode_index(new_dir, &index); ++ if (ret) ++ goto out_fail; + ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ /* force full log commit if subvolume involved. */ ++ root->fs_info->last_trans_log_full_commit = trans->transid; ++ } else { ++ ret = btrfs_insert_inode_ref(trans, dest, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, ++ old_inode->i_ino, ++ new_dir->i_ino, index); ++ if (ret) ++ goto out_fail; ++ /* ++ * this is an ugly little race, but the rename is required ++ * to make sure that if we crash, the inode is either at the ++ * old name or the new one. pinning the log transaction lets ++ * us make sure we don't allow a log commit to come in after ++ * we unlink the name but before we add the new name back in. 
++ */ ++ btrfs_pin_log_trans(root); ++ } + /* + * make sure the inode gets flushed if it is replacing + * something. +@@ -4807,18 +5353,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + btrfs_add_ordered_operation(trans, root, old_inode); + } + +- /* +- * this is an ugly little race, but the rename is required to make +- * sure that if we crash, the inode is either at the old name +- * or the new one. pinning the log transaction lets us make sure +- * we don't allow a log commit to come in after we unlink the +- * name but before we add the new name back in. +- */ +- btrfs_pin_log_trans(root); +- +- btrfs_set_trans_block_group(trans, new_dir); +- +- btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; +@@ -4826,47 +5360,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + +- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, +- old_dentry->d_name.name, +- old_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; ++ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } else { ++ btrfs_inc_nlink(old_dentry->d_inode); ++ ret = btrfs_unlink_inode(trans, root, old_dir, ++ old_dentry->d_inode, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } ++ BUG_ON(ret); + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; +- ret = btrfs_unlink_inode(trans, root, new_dir, +- new_dentry->d_inode, +- new_dentry->d_name.name, +- new_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(new_inode->i_ino == ++ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ root_objectid = BTRFS_I(new_inode)->location.objectid; ++ ret = btrfs_unlink_subvol(trans, dest, new_dir, ++ root_objectid, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ BUG_ON(new_inode->i_nlink == 0); ++ } else { ++ ret = btrfs_unlink_inode(trans, dest, new_dir, ++ new_dentry->d_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ } ++ BUG_ON(ret); + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); +- if (ret) +- goto out_fail; ++ BUG_ON(ret); + } +- + } +- ret = btrfs_set_inode_index(new_dir, &index); +- if (ret) +- goto out_fail; + +- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, +- old_inode, new_dentry->d_name.name, +- new_dentry->d_name.len, 1, index); +- if (ret) +- goto out_fail; ++ ret = btrfs_add_link(trans, new_dir, old_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, 0, index); ++ BUG_ON(ret); + +- btrfs_log_new_name(trans, old_inode, old_dir, +- new_dentry->d_parent); ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ btrfs_log_new_name(trans, old_inode, old_dir, ++ new_dentry->d_parent); ++ btrfs_end_log_trans(root); ++ } + out_fail: +- +- /* this btrfs_end_log_trans just allows the current +- * log-sub transaction to complete +- */ +- btrfs_end_log_trans(root); + btrfs_end_transaction_throttle(trans, root); +-out_unlock: ++ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ up_read(&root->fs_info->subvol_sem); ++ ++ btrfs_unreserve_metadata_space(root, 4); + return ret; + } + +@@ -4938,11 +5485,18 @@ static int btrfs_symlink(struct inode 
*dir, struct dentry *dentry, + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode item and ref ++ * 2 items for dir items ++ * 1 item for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto out_fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -5023,6 +5577,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + out_fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -5044,6 +5599,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); ++ ++ ret = btrfs_reserve_metadata_space(root, 1); ++ if (ret) ++ goto out; ++ + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); +@@ -5058,9 +5618,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); ++ btrfs_drop_extent_cache(inode, cur_offset, ++ cur_offset + ins.offset -1, 0); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; ++ btrfs_unreserve_metadata_space(root, 1); + } + out: + if (cur_offset > start) { +@@ -5223,6 +5786,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, + }; ++ + static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, +@@ -5245,6 +5809,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, ++ .merge_extent_hook = btrfs_merge_extent_hook, ++ .split_extent_hook = btrfs_split_extent_hook, + }; + + /* +@@ -5309,3 +5875,7 @@ static struct inode_operations btrfs_symlink_inode_operations = { + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + }; ++ ++const struct dentry_operations btrfs_dentry_operations = { ++ .d_delete = btrfs_dentry_delete, ++}; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index bd88f25..cdbb054 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root, + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; +- struct btrfs_root *new_root = root; +- struct inode *dir; ++ struct btrfs_root *new_root; ++ struct inode *dir = dentry->d_parent->d_inode; + int ret; + int err; + u64 objectid; +@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root, + u64 index = 0; + unsigned long nr = 1; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) +- goto fail_commit; ++ return ret; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); +@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root, + if (ret) + goto fail; + ++ key.offset = (u64)-1; ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); ++ BUG_ON(IS_ERR(new_root)); 
++ ++ btrfs_record_root_in_trans(trans, new_root); ++ ++ ret = btrfs_create_subvol_root(trans, new_root, new_dirid, ++ BTRFS_I(dir)->block_group); + /* + * insert the directory item + */ +- key.offset = (u64)-1; +- dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + +@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root, + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- objectid, BTRFS_ROOT_BACKREF_KEY, +- root->root_key.objectid, ++ objectid, root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- root->root_key.objectid, BTRFS_ROOT_REF_KEY, +- objectid, +- dir->i_ino, index, name, namelen); +- +- BUG_ON(ret); +- +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- goto fail_commit; +- +- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); +- BUG_ON(!new_root); +- +- trans = btrfs_start_transaction(new_root, 1); +- BUG_ON(!trans); +- +- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, +- BTRFS_I(dir)->block_group); +- if (ret) +- goto fail; +- ++ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + fail: + nr = trans->blocks_used; +- err = btrfs_commit_transaction(trans, new_root); ++ err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; +-fail_commit: ++ ++ btrfs_unreserve_metadata_space(root, 6); + btrfs_btree_balance_dirty(root, nr); + return ret; + } +@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + if (!root->ref_cows) + return -EINVAL; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); +@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. 
+ */ +-static noinline int btrfs_mksubvol(struct path *parent, char *name, +- int mode, int namelen, ++static noinline int btrfs_mksubvol(struct path *parent, ++ char *name, int namelen, + struct btrfs_root *snap_src) + { ++ struct inode *dir = parent->dentry->d_inode; + struct dentry *dentry; + int error; + +- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); +@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, + if (dentry->d_inode) + goto out_dput; + +- if (!IS_POSIXACL(parent->dentry->d_inode)) +- mode &= ~current_umask(); +- + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + +- error = btrfs_may_create(parent->dentry->d_inode, dentry); ++ error = btrfs_may_create(dir, dentry); + if (error) + goto out_drop_write; + +- /* +- * Actually perform the low-level subvolume creation after all +- * this VFS fuzz. +- * +- * Eventually we want to pass in an inode under which we create this +- * subvolume, but for now all are under the filesystem root. +- * +- * Also we should pass on the mode eventually to allow creating new +- * subvolume with specific mode bits. +- */ ++ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); ++ ++ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) ++ goto out_up_read; ++ + if (snap_src) { +- struct dentry *dir = dentry->d_parent; +- struct dentry *test = dir->d_parent; +- struct btrfs_path *path = btrfs_alloc_path(); +- int ret; +- u64 test_oid; +- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; +- +- test_oid = snap_src->root_key.objectid; +- +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, parent_oid, test_oid); +- if (ret == 0) +- goto create; +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- +- /* we need to make sure we aren't creating a directory loop +- * by taking a snapshot of something that has our current +- * subvol in its directory tree. So, this loops through +- * the dentries and checks the forward refs for each subvolume +- * to see if is references the subvolume where we are +- * placing this new snapshot. 
+- */ +- while (1) { +- if (!test || +- dir == snap_src->fs_info->sb->s_root || +- test == snap_src->fs_info->sb->s_root || +- test->d_inode->i_sb != snap_src->fs_info->sb) { +- break; +- } +- if (S_ISLNK(test->d_inode->i_mode)) { +- printk(KERN_INFO "Btrfs symlink in snapshot " +- "path, failed\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- test_oid = +- BTRFS_I(test->d_inode)->root->root_key.objectid; +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, test_oid, parent_oid); +- if (ret == 0) { +- printk(KERN_INFO "Btrfs snapshot creation " +- "failed, looping\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- test = test->d_parent; +- } +-create: +- btrfs_free_path(path); +- error = create_snapshot(snap_src, dentry, name, namelen); ++ error = create_snapshot(snap_src, dentry, ++ name, namelen); + } else { +- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, +- dentry, name, namelen); ++ error = create_subvol(BTRFS_I(dir)->root, dentry, ++ name, namelen); + } +- if (error) +- goto out_drop_write; +- +- fsnotify_mkdir(parent->dentry->d_inode, dentry); ++ if (!error) ++ fsnotify_mkdir(dir, dentry); ++out_up_read: ++ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + out_drop_write: + mnt_drop_write(parent->mnt); + out_dput: + dput(dentry); + out_unlock: +- mutex_unlock(&parent->dentry->d_inode->i_mutex); ++ mutex_unlock(&dir->i_mutex); + return error; + } + +- + static int btrfs_defrag_file(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -596,9 +534,8 @@ again: + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); +- +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); +@@ -609,7 +546,8 @@ out_unlock: + return 0; + } + +-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) ++static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ++ void __user *arg) + { + u64 new_size; + u64 old_size; +@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; +- struct btrfs_dir_item *di; +- struct btrfs_path *path; + struct file *src_file; +- u64 root_dirid; + int namelen; + int ret = 0; + +@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + goto out; + } + +- path = btrfs_alloc_path(); +- if (!path) { +- ret = -ENOMEM; +- goto out; +- } +- +- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, +- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, +- path, root_dirid, +- vol_args->name, namelen, 0); +- btrfs_free_path(path); +- +- if (di && !IS_ERR(di)) { +- ret = -EEXIST; +- goto out; +- } +- +- if (IS_ERR(di)) { +- ret = PTR_ERR(di); +- goto out; +- } +- + if (subvol) { +- ret = btrfs_mksubvol(&file->f_path, vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, NULL); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); +@@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + fput(src_file); + goto out; + } +- ret = btrfs_mksubvol(&file->f_path, 
vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, BTRFS_I(src_inode)->root); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ BTRFS_I(src_inode)->root); + fput(src_file); + } +- + out: + kfree(vol_args); + return ret; + } + ++/* ++ * helper to check if the subvolume references other subvolumes ++ */ ++static noinline int may_destroy_subvol(struct btrfs_root *root) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = (u64)-1; ++ ++ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, ++ &key, path, 0, 0); ++ if (ret < 0) ++ goto out; ++ BUG_ON(ret == 0); ++ ++ ret = 0; ++ if (path->slots[0] > 0) { ++ path->slots[0]--; ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid == root->root_key.objectid && ++ key.type == BTRFS_ROOT_REF_KEY) ++ ret = -ENOTEMPTY; ++ } ++out: ++ btrfs_free_path(path); ++ return ret; ++} ++ ++static noinline int btrfs_ioctl_snap_destroy(struct file *file, ++ void __user *arg) ++{ ++ struct dentry *parent = fdentry(file); ++ struct dentry *dentry; ++ struct inode *dir = parent->d_inode; ++ struct inode *inode; ++ struct btrfs_root *root = BTRFS_I(dir)->root; ++ struct btrfs_root *dest = NULL; ++ struct btrfs_ioctl_vol_args *vol_args; ++ struct btrfs_trans_handle *trans; ++ int namelen; ++ int ret; ++ int err = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ vol_args = memdup_user(arg, sizeof(*vol_args)); ++ if (IS_ERR(vol_args)) ++ return PTR_ERR(vol_args); ++ ++ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ++ namelen = strlen(vol_args->name); ++ if (strchr(vol_args->name, '/') || ++ strncmp(vol_args->name, "..", namelen) == 0) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = mnt_want_write(file->f_path.mnt); ++ if (err) ++ goto out; ++ ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); ++ dentry = lookup_one_len(vol_args->name, parent, namelen); ++ if (IS_ERR(dentry)) { ++ err = PTR_ERR(dentry); ++ goto out_unlock_dir; ++ } ++ ++ if (!dentry->d_inode) { ++ err = -ENOENT; ++ goto out_dput; ++ } ++ ++ inode = dentry->d_inode; ++ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ err = -EINVAL; ++ goto out_dput; ++ } ++ ++ dest = BTRFS_I(inode)->root; ++ ++ mutex_lock(&inode->i_mutex); ++ err = d_invalidate(dentry); ++ if (err) ++ goto out_unlock; ++ ++ down_write(&root->fs_info->subvol_sem); ++ ++ err = may_destroy_subvol(dest); ++ if (err) ++ goto out_up_write; ++ ++ trans = btrfs_start_transaction(root, 1); ++ ret = btrfs_unlink_subvol(trans, root, dir, ++ dest->root_key.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ BUG_ON(ret); ++ ++ btrfs_record_root_in_trans(trans, dest); ++ ++ memset(&dest->root_item.drop_progress, 0, ++ sizeof(dest->root_item.drop_progress)); ++ dest->root_item.drop_level = 0; ++ btrfs_set_root_refs(&dest->root_item, 0); ++ ++ ret = btrfs_insert_orphan_item(trans, ++ root->fs_info->tree_root, ++ dest->root_key.objectid); ++ BUG_ON(ret); ++ ++ ret = btrfs_commit_transaction(trans, root); ++ BUG_ON(ret); ++ inode->i_flags |= S_DEAD; ++out_up_write: ++ up_write(&root->fs_info->subvol_sem); ++out_unlock: ++ mutex_unlock(&inode->i_mutex); ++ if (!err) { ++ shrink_dcache_sb(root->fs_info->sb); ++ btrfs_invalidate_inodes(dest); ++ d_delete(dentry); ++ } ++out_dput: ++ dput(dentry); ++out_unlock_dir: ++ mutex_unlock(&dir->i_mutex); ++ mnt_drop_write(file->f_path.mnt); ++out: ++ 
kfree(vol_args); ++ return err; ++} ++ + static int btrfs_ioctl_defrag(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) + return ret; + } + +-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, +- u64 off, u64 olen, u64 destoff) ++static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ++ u64 off, u64 olen, u64 destoff) + { + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; +@@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off + len, +- off + len, 0, &hint_byte); ++ off + len, 0, &hint_byte, 1); + + /* clone data */ + key.objectid = src->i_ino; +@@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + datao += off - key.offset; + datal -= off - key.offset; + } +- if (key.offset + datao + datal + key.offset > +- off + len) +- datal = off + len - key.offset - datao; ++ ++ if (key.offset + datal > off + len) ++ datal = off + len - key.offset; ++ + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; +@@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; ++ int ret; + ++ ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; ++ goto out; + +- if (file->private_data) { +- ret = -EINPROGRESS; ++ ret = -EINPROGRESS; ++ if (file->private_data) + goto out; +- } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) +@@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file) + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + ++ ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root, 0); +- if (trans) +- file->private_data = trans; +- else +- ret = -ENOMEM; +- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ ++ if (!trans) ++ goto out_drop; ++ ++ file->private_data = trans; ++ return 0; ++ ++out_drop: ++ mutex_lock(&root->fs_info->trans_mutex); ++ root->fs_info->open_ioctl_trans--; ++ mutex_unlock(&root->fs_info->trans_mutex); ++ mnt_drop_write(file->f_path.mnt); + out: + return ret; + } +@@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; + + trans = file->private_data; +- if (!trans) { +- ret = -EINVAL; +- goto out; +- } +- btrfs_end_transaction(trans, root); ++ if (!trans) ++ return -EINVAL; + file->private_data = NULL; + ++ btrfs_end_transaction(trans, root); ++ + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); +- +-out: +- return ret; ++ return 0; + } + + long btrfs_ioctl(struct file *file, unsigned int +@@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); ++ case BTRFS_IOC_SNAP_DESTROY: ++ return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: +diff --git a/fs/btrfs/ioctl.h 
b/fs/btrfs/ioctl.h +index b320b10..bc49914 100644 +--- a/fs/btrfs/ioctl.h ++++ b/fs/btrfs/ioctl.h +@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args { + + #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +- ++#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ ++ struct btrfs_ioctl_vol_args) + #endif +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index d6f0806..ab21c29 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * + * len is the length of the extent + * +- * This also sets the EXTENT_ORDERED bit on the range in the inode. +- * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; ++ entry->bytes_left = len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); +@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + &entry->rb_node); + BUG_ON(node); + +- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, +- entry_end(entry) - 1, GFP_NOFS); +- + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); +@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); +- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, +- GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; +@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + goto out; + } + +- ret = test_range_bit(io_tree, entry->file_offset, +- entry->file_offset + entry->len - 1, +- EXTENT_ORDERED, 0); +- if (ret == 0) ++ if (io_size > entry->bytes_left) { ++ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", ++ (unsigned long long)entry->bytes_left, ++ (unsigned long long)io_size); ++ } ++ entry->bytes_left -= io_size; ++ if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); ++ else ++ ret = 1; + out: + mutex_unlock(&tree->mutex); + return ret == 0; +@@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, ++ inode, 1); ++ + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + +@@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; ++ int found; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); +@@ -502,6 +507,7 @@ again: + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; ++ found = 0; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) +@@ -514,6 +520,7 @@ 
again: + btrfs_put_ordered_extent(ordered); + break; + } ++ found++; + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); +@@ -521,8 +528,8 @@ again: + break; + end--; + } +- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, +- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { ++ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, ++ EXTENT_DELALLOC, 0, NULL)) { + schedule_timeout(1); + goto again; + } +@@ -613,7 +620,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + goto out; + } + /* +@@ -664,7 +671,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index 3d31c88..993a7ea 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -85,6 +85,9 @@ struct btrfs_ordered_extent { + /* extent length on disk */ + u64 disk_len; + ++ /* number of bytes that still need writing */ ++ u64 bytes_left; ++ + /* flags (described above) */ + unsigned long flags; + +diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c +index 3c0d52a..79cba5f 100644 +--- a/fs/btrfs/orphan.c ++++ b/fs/btrfs/orphan.c +@@ -65,3 +65,23 @@ out: + btrfs_free_path(path); + return ret; + } ++ ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = offset; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ ++ btrfs_free_path(path); ++ return ret; ++} +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index c04f7f2..cfcc93c 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -121,6 +121,15 @@ struct inodevec { + int nr; + }; + ++#define MAX_EXTENTS 128 ++ ++struct file_extent_cluster { ++ u64 start; ++ u64 end; ++ u64 boundary[MAX_EXTENTS]; ++ unsigned int nr; ++}; ++ + struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; +@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) + { + if (test_range_bit(&rc->processed_blocks, bytenr, +- bytenr + blocksize - 1, EXTENT_DIRTY, 1)) ++ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; + } +@@ -2529,56 +2538,94 @@ out: + } + + static noinline_for_stack +-int relocate_inode_pages(struct inode *inode, u64 start, u64 len) ++int setup_extent_mapping(struct inode *inode, u64 start, u64 end, ++ u64 block_start) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; ++ struct extent_map *em; ++ int ret = 0; ++ ++ em = alloc_extent_map(GFP_NOFS); ++ if (!em) ++ return -ENOMEM; ++ ++ em->start = start; ++ em->len = end + 1 - start; ++ em->block_len = em->len; ++ em->block_start = block_start; ++ em->bdev = root->fs_info->fs_devices->latest_bdev; ++ set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, start, end, 
GFP_NOFS); ++ while (1) { ++ write_lock(&em_tree->lock); ++ ret = add_extent_mapping(em_tree, em); ++ write_unlock(&em_tree->lock); ++ if (ret != -EEXIST) { ++ free_extent_map(em); ++ break; ++ } ++ btrfs_drop_extent_cache(inode, start, end, 0); ++ } ++ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); ++ return ret; ++} ++ ++static int relocate_file_extent_cluster(struct inode *inode, ++ struct file_extent_cluster *cluster) + { + u64 page_start; + u64 page_end; +- unsigned long i; +- unsigned long first_index; ++ u64 offset = BTRFS_I(inode)->index_cnt; ++ unsigned long index; + unsigned long last_index; +- unsigned int total_read = 0; +- unsigned int total_dirty = 0; ++ unsigned int dirty_page = 0; + struct page *page; + struct file_ra_state *ra; +- struct btrfs_ordered_extent *ordered; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; ++ int nr = 0; + int ret = 0; + ++ if (!cluster->nr) ++ return 0; ++ + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + ++ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; ++ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; ++ + mutex_lock(&inode->i_mutex); +- first_index = start >> PAGE_CACHE_SHIFT; +- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + +- /* make sure the dirty trick played by the caller work */ +- while (1) { +- ret = invalidate_inode_pages2_range(inode->i_mapping, +- first_index, last_index); +- if (ret != -EBUSY) +- break; +- schedule_timeout(HZ/10); +- } ++ i_size_write(inode, cluster->end + 1 - offset); ++ ret = setup_extent_mapping(inode, cluster->start - offset, ++ cluster->end - offset, cluster->start); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + +- for (i = first_index ; i <= last_index; i++) { +- if (total_read % ra->ra_pages == 0) { +- btrfs_force_ra(inode->i_mapping, ra, NULL, i, +- min(last_index, ra->ra_pages + i - 1)); +- } +- total_read++; +-again: +- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) +- BUG_ON(1); +- page = grab_cache_page(inode->i_mapping, i); ++ WARN_ON(cluster->start != cluster->boundary[0]); ++ while (index <= last_index) { ++ page = find_lock_page(inode->i_mapping, index); + if (!page) { +- ret = -ENOMEM; +- goto out_unlock; ++ page_cache_sync_readahead(inode->i_mapping, ++ ra, NULL, index, ++ last_index + 1 - index); ++ page = grab_cache_page(inode->i_mapping, index); ++ if (!page) { ++ ret = -ENOMEM; ++ goto out_unlock; ++ } ++ } ++ ++ if (PageReadahead(page)) { ++ page_cache_async_readahead(inode->i_mapping, ++ ra, NULL, page, index, ++ last_index + 1 - index); + } ++ + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); +@@ -2589,75 +2636,79 @@ again: + goto out_unlock; + } + } +- wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; +- lock_extent(io_tree, page_start, page_end, GFP_NOFS); +- +- ordered = btrfs_lookup_ordered_extent(inode, page_start); +- if (ordered) { +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); +- unlock_page(page); +- page_cache_release(page); +- btrfs_start_ordered_extent(inode, ordered, 1); +- btrfs_put_ordered_extent(ordered); +- goto again; +- } ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); ++ + set_page_extent_mapped(page); + +- if (i == first_index) +- set_extent_bits(io_tree, page_start, page_end, ++ if (nr < cluster->nr && ++ page_start + offset == cluster->boundary[nr]) { ++ set_extent_bits(&BTRFS_I(inode)->io_tree, ++ 
page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); ++ nr++; ++ } + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); +- total_dirty++; ++ dirty_page++; + +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ unlock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); ++ ++ index++; ++ if (nr < cluster->nr && ++ page_end + 1 + offset == cluster->boundary[nr]) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); ++ dirty_page = 0; ++ } ++ } ++ if (dirty_page) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); + } ++ WARN_ON(nr != cluster->nr); + out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); +- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; + } + + static noinline_for_stack +-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) ++int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, ++ struct file_extent_cluster *cluster) + { +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +- struct extent_map *em; +- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; +- u64 end = start + extent_key->offset - 1; +- +- em = alloc_extent_map(GFP_NOFS); +- em->start = start; +- em->len = extent_key->offset; +- em->block_len = extent_key->offset; +- em->block_start = extent_key->objectid; +- em->bdev = root->fs_info->fs_devices->latest_bdev; +- set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ int ret; + +- /* setup extent map to cheat btrfs_readpage */ +- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); +- while (1) { +- int ret; +- spin_lock(&em_tree->lock); +- ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); +- if (ret != -EEXIST) { +- free_extent_map(em); +- break; +- } +- btrfs_drop_extent_cache(inode, start, end, 0); ++ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; + } +- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + +- return relocate_inode_pages(inode, start, extent_key->offset); ++ if (!cluster->nr) ++ cluster->start = extent_key->objectid; ++ else ++ BUG_ON(cluster->nr >= MAX_EXTENTS); ++ cluster->end = extent_key->objectid + extent_key->offset - 1; ++ cluster->boundary[cluster->nr] = extent_key->objectid; ++ cluster->nr++; ++ ++ if (cluster->nr >= MAX_EXTENTS) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; ++ } ++ return 0; + } + + #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags) + return 0; + } + ++ + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + { + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; ++ struct file_extent_cluster *cluster; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; +@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + int ret; + int err = 0; + ++ cluster = kzalloc(sizeof(*cluster), GFP_NOFS); ++ if (!cluster) ++ return -ENOMEM; ++ + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++ rc->extents_found = 0; ++ rc->extents_skipped = 0; ++ + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, 
(u64)-1, EXTENT_DIRTY, + GFP_NOFS); +@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + nr = trans->blocks_used; +- btrfs_end_transaction_throttle(trans, rc->extent_root); ++ btrfs_end_transaction(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; +- ret = relocate_data_extent(rc->data_inode, &key); ++ ret = relocate_data_extent(rc->data_inode, ++ &key, cluster); + if (ret < 0) { + err = ret; + break; +@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + ++ if (!err) { ++ ret = relocate_file_extent_cluster(rc->data_inode, cluster); ++ if (ret < 0) ++ err = ret; ++ } ++ ++ kfree(cluster); ++ + rc->create_reloc_root = 0; + smp_mb(); + +@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) ++ struct btrfs_root *root, u64 objectid) + { + struct btrfs_path *path; + struct btrfs_inode_item *item; +@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); ++ btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); +@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + if (err) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); ++ err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; +@@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, NULL); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); +@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + btrfs_wait_ordered_extents(fs_info->tree_root, 0); + + while (1) { +- mutex_lock(&fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(fs_info->tree_root); +- mutex_unlock(&fs_info->cleaner_mutex); +- + rc->extents_found = 0; + rc->extents_skipped = 0; + ++ mutex_lock(&fs_info->cleaner_mutex); ++ ++ btrfs_clean_old_snapshots(fs_info->tree_root); + ret = relocate_block_group(rc); ++ ++ mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + break; +@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + } + } + +- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, +- rc->block_group->key.objectid, +- rc->block_group->key.objectid + +- rc->block_group->key.offset - 1); ++ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, ++ rc->block_group->key.objectid, ++ 
rc->block_group->key.objectid + ++ rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); +@@ -3530,6 +3594,26 @@ out: + return err; + } + ++static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) ++{ ++ struct btrfs_trans_handle *trans; ++ int ret; ++ ++ trans = btrfs_start_transaction(root->fs_info->tree_root, 1); ++ ++ memset(&root->root_item.drop_progress, 0, ++ sizeof(root->root_item.drop_progress)); ++ root->root_item.drop_level = 0; ++ btrfs_set_root_refs(&root->root_item, 0); ++ ret = btrfs_update_root(trans, root->fs_info->tree_root, ++ &root->root_key, &root->root_item); ++ BUG_ON(ret); ++ ++ ret = btrfs_end_transaction(trans, root->fs_info->tree_root); ++ BUG_ON(ret); ++ return 0; ++} ++ + /* + * recover relocation interrupted by system crash. + * +@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { +- err = PTR_ERR(fs_root); +- goto out; ++ ret = PTR_ERR(fs_root); ++ if (ret != -ENOENT) { ++ err = ret; ++ goto out; ++ } ++ mark_garbage_root(reloc_root); + } + } + +@@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", +- root->fs_info->thread_pool_size); ++ root->fs_info->thread_pool_size, NULL); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); +diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c +index 0ddc6d6..9351428 100644 +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + goto out; + + BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = 1; ++ goto out; ++ } + l = path->nodes[0]; +- BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); +- if (found_key.objectid != objectid) { ++ if (found_key.objectid != objectid || ++ found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } +- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), +- sizeof(*item)); +- memcpy(key, &found_key, sizeof(found_key)); ++ if (item) ++ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), ++ sizeof(*item)); ++ if (key) ++ memcpy(key, &found_key, sizeof(found_key)); + ret = 0; + out: + btrfs_free_path(path); +@@ -249,6 +255,59 @@ err: + return ret; + } + ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int err = 0; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = 0; ++ ++ while (1) { ++ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); ++ if (ret < 0) { ++ err = ret; ++ break; ++ } ++ ++ leaf = path->nodes[0]; ++ if (path->slots[0] >= btrfs_header_nritems(leaf)) { ++ ret = btrfs_next_leaf(tree_root, path); ++ if (ret < 0) ++ err = ret; ++ if (ret != 0) ++ break; ++ leaf = path->nodes[0]; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(tree_root, path); ++ ++ if (key.objectid != BTRFS_ORPHAN_OBJECTID || ++ key.type != BTRFS_ORPHAN_ITEM_KEY) ++ break; ++ ++ ret = btrfs_find_dead_roots(tree_root, key.offset); ++ if (ret) { ++ err = ret; ++ break; ++ } ++ ++ key.offset++; ++ } ++ ++ 
btrfs_free_path(path); ++ return err; ++} ++ + /* drop the root item for 'key' from 'root' */ + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +@@ -278,31 +337,57 @@ out: + return ret; + } + +-#if 0 /* this will get used when snapshot deletion is implemented */ + int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id) ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, ++ const char *name, int name_len) ++ + { ++ struct btrfs_path *path; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; + struct btrfs_key key; ++ unsigned long ptr; ++ int err = 0; + int ret; +- struct btrfs_path *path; + + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); +- BUG_ON(ret); +- +- ret = btrfs_del_item(trans, tree_root, path); +- BUG_ON(ret); ++ BUG_ON(ret < 0); ++ if (ret == 0) { ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ ++ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); ++ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ++ ptr = (unsigned long)(ref + 1); ++ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); ++ *sequence = btrfs_root_ref_sequence(leaf, ref); ++ ++ ret = btrfs_del_item(trans, tree_root, path); ++ BUG_ON(ret); ++ } else ++ err = -ENOENT; ++ ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } + + btrfs_free_path(path); +- return ret; ++ return err; + } +-#endif + + int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, +@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + return ret; + } + +- + /* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. 
+@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + */ + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) + { + struct btrfs_key key; +@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf; + unsigned long ptr; + +- + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); +@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } ++ + btrfs_free_path(path); +- return ret; ++ return 0; + } +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 6d6d06c..939b68f 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -66,7 +66,8 @@ enum { + Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, +- Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, ++ Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, ++ Opt_discard, Opt_err, + }; + + static match_table_t tokens = { +@@ -88,6 +89,7 @@ static match_table_t tokens = { + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, ++ {Opt_discard, "discard"}, + {Opt_err, NULL}, + }; + +@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) + info->metadata_ratio); + } + break; ++ case Opt_discard: ++ btrfs_set_opt(info->mount_opt, DISCARD); ++ break; + default: + break; + } +@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb, + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; ++#endif + + tree_root = open_ctree(sb, fs_devices, (char *)data); + +@@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb) + } + + static struct super_operations btrfs_super_ops = { ++ .drop_inode = btrfs_drop_inode, + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index cdbb502..bca82a4 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + { + if (root->ref_cows && root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); +- WARN_ON(root->root_item.refs == 0); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, +@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; + ++ if (!current->journal_info) ++ current->journal_info = h; ++ + root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, 
root); + mutex_unlock(&root->fs_info->trans_mutex); +@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); ++ ++ if (current->journal_info == trans) ++ current->journal_info = NULL; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + +@@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + /* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of +- * those extents are on disk for transaction or log commit ++ * those extents are sent to disk but does not wait on them + */ +-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages) ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) + { + int ret; + int err = 0; +@@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + page_cache_release(page); + } + } ++ if (err) ++ werr = err; ++ return werr; ++} ++ ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. This is used to make sure all of ++ * those extents are on disk for transaction or log commit. We wait ++ * on all the pages and clear them from the dirty pages state tree ++ */ ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int err = 0; ++ int werr = 0; ++ struct page *page; ++ struct inode *btree_inode = root->fs_info->btree_inode; ++ u64 start = 0; ++ u64 end; ++ unsigned long index; ++ + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); +@@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + return werr; + } + ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. 
This is used to make sure all of ++ * those extents are on disk for transaction or log commit ++ */ ++int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int ret2; ++ ++ ret = btrfs_write_marked_extents(root, dirty_pages); ++ ret2 = btrfs_wait_marked_extents(root, dirty_pages); ++ return ret || ret2; ++} ++ + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) + { +@@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; +- key.offset = 0; ++ /* record when the snapshot was created in key.offset */ ++ key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); +@@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(&pending->root_key, &key, sizeof(key)); + fail: + kfree(new_root_item); ++ btrfs_unreserve_metadata_space(root, 6); + return ret; + } + +@@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, +- BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, +- parent_root->root_key.objectid, +- BTRFS_ROOT_REF_KEY, +- pending->root_key.objectid, +- parent_inode->i_ino, index, pending->name, +- namelen); +- + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); + fail: +@@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; +- struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + int should_grow = 0; +@@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + return 0; + } + +- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); +- if (!pinned_copy) +- return -ENOMEM; +- +- extent_io_tree_init(pinned_copy, +- root->fs_info->btree_inode->i_mapping, GFP_NOFS); +- + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + if (cur_trans->list.prev != &root->fs_info->trans_list) { +@@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + ret = commit_cowonly_roots(trans, root); + BUG_ON(ret); + ++ btrfs_prepare_extent_commit(trans, root); ++ + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; +@@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + +- btrfs_copy_pinned(root, pinned_copy); +- + trans->transaction->blocked = 0; + + wake_up(&root->fs_info->transaction_wait); +@@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + +- btrfs_finish_extent_commit(trans, root, pinned_copy); +- kfree(pinned_copy); ++ 
btrfs_finish_extent_commit(trans, root); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); +@@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + + mutex_unlock(&root->fs_info->trans_mutex); + ++ if (current->journal_info == trans) ++ current->journal_info = NULL; ++ + kmem_cache_free(btrfs_trans_handle_cachep, trans); + return ret; + } +@@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) + + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); +- list_del_init(&root->root_list); +- btrfs_drop_snapshot(root, 0); ++ list_del(&root->root_list); ++ ++ if (btrfs_header_backref_rev(root->node) < ++ BTRFS_MIXED_BACKREF_REV) ++ btrfs_drop_snapshot(root, 0); ++ else ++ btrfs_drop_snapshot(root, 1); + } + return 0; + } +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index 663c674..d4e3e7a 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) + { + BTRFS_I(inode)->last_trans = trans->transaction->transid; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + } + + int btrfs_end_transaction(struct btrfs_trans_handle *trans, +@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); + int btrfs_transaction_in_commit(struct btrfs_fs_info *info); + #endif +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index d91b0de..f51bf13 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, + + mutex_lock(&root->log_mutex); + if (root->log_root) { ++ if (!root->log_start_pid) { ++ root->log_start_pid = current->pid; ++ root->log_multiple_pids = false; ++ } else if (root->log_start_pid != current->pid) { ++ root->log_multiple_pids = true; ++ } ++ + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } ++ root->log_multiple_pids = false; ++ root->log_start_pid = current->pid; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); +@@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log, + struct walk_control *wc, u64 gen) + { + if (wc->pin) +- btrfs_update_pinned_extents(log->fs_info->extent_root, +- eb->start, eb->len, 1); ++ btrfs_pin_extent(log->fs_info->extent_root, ++ eb->start, eb->len, 0); + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) +@@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, +- start, extent_end, extent_end, start, &alloc_hint); ++ start, extent_end, extent_end, start, &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || +@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root 
*log_root_tree = root->fs_info->log_root_tree; ++ u64 log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; +@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + while (1) { + unsigned long batch = root->log_batch; +- mutex_unlock(&root->log_mutex); +- schedule_timeout_uninterruptible(1); +- mutex_lock(&root->log_mutex); +- ++ if (root->log_multiple_pids) { ++ mutex_unlock(&root->log_mutex); ++ schedule_timeout_uninterruptible(1); ++ mutex_lock(&root->log_mutex); ++ } + wait_for_writer(trans, root); + if (batch == root->log_batch) + break; +@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + goto out; + } + +- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); ++ /* we start IO on all the marked extents here, but we don't actually ++ * wait for them until later. ++ */ ++ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; ++ log_transid = root->log_transid; + root->log_transid++; + log->log_transid = root->log_transid; ++ root->log_start_pid = 0; + smp_mb(); + /* + * log tree has been flushed to disk, new modifications of +@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); +@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; +@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages); + BUG_ON(ret); ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); +@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * the running transaction open, so a full commit can't hop + * in and cause problems either. 
+ */ +- write_ctree_super(trans, root->fs_info->tree_root, 2); ++ write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = 0; + ++ mutex_lock(&root->log_mutex); ++ if (root->last_log_commit < log_transid) ++ root->last_log_commit = log_transid; ++ mutex_unlock(&root->log_mutex); ++ + out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); +@@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2852,6 +2876,21 @@ out: + return ret; + } + ++static int inode_in_log(struct btrfs_trans_handle *trans, ++ struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ int ret = 0; ++ ++ mutex_lock(&root->log_mutex); ++ if (BTRFS_I(inode)->logged_trans == trans->transid && ++ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) ++ ret = 1; ++ mutex_unlock(&root->log_mutex); ++ return ret; ++} ++ ++ + /* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref +@@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + goto end_no_trans; + } + ++ if (root != BTRFS_I(inode)->root || ++ btrfs_root_refs(&root->root_item) == 0) { ++ ret = 1; ++ goto end_no_trans; ++ } ++ + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + ++ if (inode_in_log(trans, inode)) { ++ ret = BTRFS_NO_LOG_SYNC; ++ goto end_no_trans; ++ } ++ + start_log_trans(trans, root); + + ret = btrfs_log_inode(trans, root, inode, inode_only); +@@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + break; + + inode = parent->d_inode; ++ if (root != BTRFS_I(inode)->root) ++ break; ++ + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; +- u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, +@@ -3010,11 +3062,6 @@ again: + path); + BUG_ON(ret); + } +- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); +- if (ret == 0) { +- wc.replay_dest->highest_inode = highest_inode; +- wc.replay_dest->last_inode_alloc = highest_inode; +- } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index d09c760..0776eac 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -19,6 +19,9 @@ + #ifndef __TREE_LOG_ + #define __TREE_LOG_ + ++/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ ++#define BTRFS_NO_LOG_SYNC 256 ++ + int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 5dbefd1..20cbd2e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -276,7 +276,7 @@ loop_lock: + * is now congested. 
Back off and let other work structs + * run instead + */ +- if (pending && bdi_write_congested(bdi) && batch_run > 32 && ++ if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + +@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); +- if (!device->name) ++ if (!device->name) { ++ kfree(device); + goto error; ++ } + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; +@@ -719,10 +721,9 @@ error: + * called very infrequently and that a given device has a small number + * of extents + */ +-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, +- struct btrfs_device *device, +- u64 num_bytes, u64 *start, +- u64 *max_avail) ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail) + { + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; +@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + ++ ret = btrfs_can_relocate(extent_root, chunk_offset); ++ if (ret) ++ return -ENOSPC; ++ + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); +@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + * step two, delete the device extents and the + * chunk tree entries + */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); +@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; +@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; ++ bool retried = false; ++ int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; +@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ if (ret == -ENOSPC) ++ failed++; ++ else if (ret) ++ BUG(); + } + + if (found_key.offset == 0) +@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + key.offset = found_key.offset - 1; + } + ret = 0; ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ WARN_ON(1); ++ ret = -ENOSPC; ++ } + error: + btrfs_free_path(path); + return ret; +@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); ++ if (ret == -ENOSPC) ++ break; + BUG_ON(ret); + + trans = 
btrfs_start_transaction(dev_root, 1); +@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); +- key.offset = found_key.offset; + /* chunk zero is special */ +- if (key.offset == 0) ++ if (found_key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); +@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ BUG_ON(ret && ret != -ENOSPC); ++ key.offset = found_key.offset - 1; + } + ret = 0; + error: +@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + u64 chunk_offset; + int ret; + int slot; ++ int failed = 0; ++ bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); ++ u64 old_size = device->total_bytes; + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) +@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (!path) + return -ENOMEM; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) { +- ret = -ENOMEM; +- goto done; +- } +- + path->reada = 2; + + lock_chunks(root); +@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + unlock_chunks(root); +- btrfs_end_transaction(trans, root); + ++again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; +@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + goto done; + if (ret) { + ret = 0; ++ btrfs_release_path(root, path); + break; + } + +@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + +- if (key.objectid != device->devid) ++ if (key.objectid != device->devid) { ++ btrfs_release_path(root, path); + break; ++ } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + +- if (key.offset + length <= new_size) ++ if (key.offset + length <= new_size) { ++ btrfs_release_path(root, path); + break; ++ } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); +@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); +- if (ret) ++ if (ret && ret != -ENOSPC) + goto done; ++ if (ret == -ENOSPC) ++ failed++; ++ key.offset -= 1; ++ } ++ ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ ret = -ENOSPC; ++ lock_chunks(root); ++ ++ device->total_bytes = old_size; ++ if (device->writeable) ++ device->fs_devices->total_rw_bytes += diff; ++ unlock_chunks(root); ++ goto done; + } + + /* Shrinking succeeded, else we would be at "done". 
*/ +@@ -2294,9 +2335,9 @@ again: + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + +@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) + int readonly = 0; + int i; + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + +@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) + struct extent_map *em; + + while (1) { +- spin_lock(&tree->map_tree.lock); ++ write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); +- spin_unlock(&tree->map_tree.lock); ++ write_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); +@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); +@@ -2604,9 +2645,9 @@ again: + atomic_set(&multi->error, 0); + } + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; +@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 stripe_nr; + int i, j, nr = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; +@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + + /* already mapped? 
*/ + if (em && em->start <= logical && em->start + em->len > logical) { +@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + map->stripes[i].dev->in_fs_metadata = 1; + } + +- spin_lock(&map_tree->map_tree.lock); ++ write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); +- spin_unlock(&map_tree->map_tree.lock); ++ write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 5139a83..31b0fab 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root); + void btrfs_unlock_volumes(void); + void btrfs_lock_volumes(void); + int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail); + #endif +diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c +index a9d3bf4..b6dd596 100644 +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -260,7 +260,7 @@ err: + * attributes are handled directly. + */ + struct xattr_handler *btrfs_xattr_handlers[] = { +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, + #endif diff --git a/original/linux-2.6-debug-vm-would-have-oomkilled.patch b/original/linux-2.6-debug-vm-would-have-oomkilled.patch new file mode 100644 index 000000000..5c6302644 --- /dev/null +++ b/original/linux-2.6-debug-vm-would-have-oomkilled.patch @@ -0,0 +1,65 @@ +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b2a2d68..3b132ee 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -67,6 +67,7 @@ extern int sysctl_overcommit_ratio; + extern int sysctl_panic_on_oom; + extern int sysctl_oom_kill_allocating_task; + extern int sysctl_oom_dump_tasks; ++extern int sysctl_would_have_oomkilled; + extern int max_threads; + extern int core_uses_pid; + extern int suid_dumpable; +@@ -861,6 +862,14 @@ static struct ctl_table vm_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "would_have_oomkilled", ++ .data = &sysctl_would_have_oomkilled, ++ .maxlen = sizeof(sysctl_would_have_oomkilled), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = VM_OVERCOMMIT_RATIO, + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index f255eda..3335a94 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -31,6 +31,7 @@ + int sysctl_panic_on_oom; + int sysctl_oom_kill_allocating_task; + int sysctl_oom_dump_tasks; ++int sysctl_would_have_oomkilled; + static DEFINE_SPINLOCK(zone_scan_lock); + /* #define DEBUG */ + +@@ -321,6 +322,12 @@ static void __oom_kill_task(struct task_struct *p, int verbose) + return; + } + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return; ++ } ++ + if (verbose) + printk(KERN_ERR "Killed process %d (%s)\n", + task_pid_nr(p), p->comm); +@@ -363,6 +370,12 @@ static int oom_kill_task(struct task_struct *p) + return 1; + } while_each_thread(g, q); + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). 
But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return 1; ++ } ++ + __oom_kill_task(p, 1); + + /* diff --git a/original/linux-2.6-execshield.patch b/original/linux-2.6-execshield.patch new file mode 100644 index 000000000..a98b90f5b --- /dev/null +++ b/original/linux-2.6-execshield.patch @@ -0,0 +1,1013 @@ +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index c45f415..3a6dbad 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +@@ -94,6 +95,9 @@ static inline int desc_empty(const void *ptr) + + #define load_TLS(t, cpu) native_load_tls(t, cpu) + #define set_ldt native_set_ldt ++#ifdef CONFIG_X86_32 ++#define load_user_cs_desc native_load_user_cs_desc ++#endif /*CONFIG_X86_32*/ + + #define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +@@ -380,4 +384,25 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); + } + ++#ifdef CONFIG_X86_32 ++static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) ++{ ++ limit = (limit - 1) / PAGE_SIZE; ++ desc->a = limit & 0xffff; ++ desc->b = (limit & 0xf0000) | 0x00c0fb00; ++} ++ ++static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; ++} ++ ++#define arch_add_exec_range arch_add_exec_range ++#define arch_remove_exec_range arch_remove_exec_range ++#define arch_flush_exec_range arch_flush_exec_range ++extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_flush_exec_range(struct mm_struct *mm); ++#endif /* CONFIG_X86_32 */ ++ + #endif /* _ASM_X86_DESC_H */ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 80a1dee..8314c66 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -7,12 +7,19 @@ + /* + * The x86 doesn't have a mmu context, but + * we put the segment information here. ++ * ++ * exec_limit is used to track the range PROT_EXEC ++ * mappings span. 
+ */ + typedef struct { + void *ldt; + int size; + struct mutex lock; + void *vdso; ++#ifdef CONFIG_X86_32 ++ struct desc_struct user_cs; ++ unsigned long exec_limit; ++#endif + } mm_context_t; + + #ifdef CONFIG_SMP +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 4fb37c8..d5cc31c 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -139,6 +139,9 @@ struct pv_cpu_ops { + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); ++#ifdef CONFIG_X86_32 ++ void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); ++#endif /*CONFIG_X86_32*/ + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); + #ifdef CONFIG_X86_64 +@@ -955,6 +958,12 @@ static inline void set_ldt(const void *addr, unsigned entries) + { + PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); + } ++#ifdef CONFIG_X86_32 ++static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) ++{ ++ PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); ++} ++#endif /*CONFIG_X86_32*/ + static inline void store_gdt(struct desc_ptr *dtr) + { + PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index c776826..fb6b579 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -160,6 +160,9 @@ static inline int hlt_works(int cpu) + + #define cache_line_size() (boot_cpu_data.x86_cache_alignment) + ++#define __HAVE_ARCH_ALIGN_STACK ++extern unsigned long arch_align_stack(unsigned long sp); ++ + extern void cpu_detect(struct cpuinfo_x86 *c); + + extern struct pt_regs *idle_regs(struct pt_regs *); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 3ffdcfa..62cba96 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -804,6 +804,20 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + ++ /* ++ * emulation of NX with segment limits unfortunately means ++ * we have to disable the fast system calls, due to the way that ++ * sysexit clears the segment limits on return. ++ * If we have either disabled exec-shield on the boot command line, ++ * or we have NX, then we don't need to do this. ++ */ ++ if (exec_shield != 0) { ++#ifdef CONFIG_X86_PAE ++ if (!test_cpu_cap(c, X86_FEATURE_NX)) ++#endif ++ clear_cpu_cap(c, X86_FEATURE_SEP); ++ } ++ + /* If the model name is still unset, do table lookup. 
*/ + if (!c->x86_model_id[0]) { + const char *p; +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index 70ec9b9..d956b8c 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -369,6 +369,9 @@ struct pv_cpu_ops pv_cpu_ops = { + .read_tscp = native_read_tscp, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = native_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 59f4524..068e286 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -299,7 +299,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, + void + start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + { ++ int cpu; ++ + set_user_gs(regs, 0); ++ + regs->fs = 0; + set_fs(USER_DS); + regs->ds = __USER_DS; +@@ -308,6 +311,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, current->mm); ++ put_cpu(); ++ + /* + * Free the old FP and other extended state + */ +@@ -354,7 +362,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + __unlazy_fpu(prev_p); +- ++ if (next_p->mm) ++ load_user_cs_desc(cpu, next_p->mm); + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) +@@ -495,3 +504,40 @@ unsigned long get_wchan(struct task_struct *p) + return 0; + } + ++static void modify_cs(struct mm_struct *mm, unsigned long limit) ++{ ++ mm->context.exec_limit = limit; ++ set_user_cs(&mm->context.user_cs, limit); ++ if (mm == current->mm) { ++ int cpu; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, mm); ++ put_cpu(); ++ } ++} ++ ++void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) ++{ ++ if (limit > mm->context.exec_limit) ++ modify_cs(mm, limit); ++} ++ ++void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) ++{ ++ struct vm_area_struct *vma; ++ unsigned long limit = PAGE_SIZE; ++ ++ if (old_end == mm->context.exec_limit) { ++ for (vma = mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ modify_cs(mm, limit); ++ } ++} ++ ++void arch_flush_exec_range(struct mm_struct *mm) ++{ ++ mm->context.exec_limit = 0; ++ set_user_cs(&mm->context.user_cs, 0); ++} +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 07d60c8..41e9129 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -118,6 +118,76 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) + if (!user_mode_vm(regs)) + die(str, regs, err); + } ++ ++static inline int ++__compare_user_cs_desc(const struct desc_struct *desc1, ++ const struct desc_struct *desc2) ++{ ++ return ((desc1->limit0 != desc2->limit0) || ++ (desc1->limit != desc2->limit) || ++ (desc1->base0 != desc2->base0) || ++ (desc1->base1 != desc2->base1) || ++ (desc1->base2 != desc2->base2)); ++} ++ ++/* ++ * lazy-check for CS validity on exec-shield binaries: ++ * ++ * the original non-exec stack patch was written by ++ * Solar Designer . Thanks! 
++ */ ++static int ++check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) ++{ ++ struct desc_struct *desc1, *desc2; ++ struct vm_area_struct *vma; ++ unsigned long limit; ++ ++ if (current->mm == NULL) ++ return 0; ++ ++ limit = -1UL; ++ if (current->mm->context.exec_limit != -1UL) { ++ limit = PAGE_SIZE; ++ spin_lock(¤t->mm->page_table_lock); ++ for (vma = current->mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ vma = get_gate_vma(current); ++ if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ spin_unlock(¤t->mm->page_table_lock); ++ if (limit >= TASK_SIZE) ++ limit = -1UL; ++ current->mm->context.exec_limit = limit; ++ } ++ set_user_cs(¤t->mm->context.user_cs, limit); ++ ++ desc1 = ¤t->mm->context.user_cs; ++ desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; ++ ++ if (__compare_user_cs_desc(desc1, desc2)) { ++ /* ++ * The CS was not in sync - reload it and retry the ++ * instruction. If the instruction still faults then ++ * we won't hit this branch next time around. ++ */ ++ if (print_fatal_signals >= 2) { ++ printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, ++ smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ desc1->a, desc1->b, desc2->a, desc2->b); ++ } ++ ++ load_user_cs_desc(cpu, current->mm); ++ ++ return 1; ++ } ++ ++ return 0; ++} + #endif + + static void __kprobes +@@ -276,6 +346,29 @@ do_general_protection(struct pt_regs *regs, long error_code) + if (!user_mode(regs)) + goto gp_in_kernel; + ++#ifdef CONFIG_X86_32 ++{ ++ int cpu; ++ int ok; ++ ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (ok) ++ return; ++ ++ if (print_fatal_signals) { ++ printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ current->mm->context.user_cs.a, ++ current->mm->context.user_cs.b); ++ } ++} ++#endif /*CONFIG_X86_32*/ ++ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + +@@ -885,19 +978,37 @@ do_device_not_available(struct pt_regs *regs, long error_code) + } + + #ifdef CONFIG_X86_32 ++/* ++ * The fixup code for errors in iret jumps to here (iret_exc). It loses ++ * the original trap number and erorr code. The bogus trap 32 and error ++ * code 0 are what the vanilla kernel delivers via: ++ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) ++ * ++ * NOTE: Because of the final "1" in the macro we need to enable interrupts. ++ * ++ * In case of a general protection fault in the iret instruction, we ++ * need to check for a lazy CS update for exec-shield. 
++ */ + dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) + { +- siginfo_t info; ++ int ok; ++ int cpu; ++ + local_irq_enable(); + +- info.si_signo = SIGILL; +- info.si_errno = 0; +- info.si_code = ILL_BADSTK; +- info.si_addr = NULL; +- if (notify_die(DIE_TRAP, "iret exception", +- regs, error_code, 32, SIGILL) == NOTIFY_STOP) +- return; +- do_trap(32, SIGILL, "iret exception", regs, error_code, &info); ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (!ok && notify_die(DIE_TRAP, "iret exception", regs, ++ error_code, 32, SIGSEGV) != NOTIFY_STOP) { ++ siginfo_t info; ++ info.si_signo = SIGSEGV; ++ info.si_errno = 0; ++ info.si_code = ILL_BADSTK; ++ info.si_addr = 0; ++ do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info); ++ } + } + #endif + +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 34c1bfb..32c3d8d 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -228,6 +228,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); ++#ifdef CONFIG_X86_32 ++ else ++ if (exec_shield) ++ printk(KERN_INFO "Using x86 segment limits to approximate " ++ "NX protection\n"); ++#endif + + /* Enable PSE if available */ + if (cpu_has_pse) +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 949708d..c1373b6 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -587,6 +587,54 @@ void zap_low_mappings(void) + pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); + EXPORT_SYMBOL_GPL(__supported_pte_mask); + ++#ifdef CONFIG_X86_PAE ++ ++static int disable_nx __initdata; ++ ++/* ++ * noexec = on|off ++ * ++ * Control non executable mappings. 
++ * ++ * on Enable ++ * off Disable (disables exec-shield too) ++ */ ++static int __init noexec_setup(char *str) ++{ ++ if (!str || !strcmp(str, "on")) { ++ if (cpu_has_nx) { ++ __supported_pte_mask |= _PAGE_NX; ++ disable_nx = 0; ++ } ++ } else if (!strcmp(str, "off")) { ++ disable_nx = 1; ++ __supported_pte_mask &= ~_PAGE_NX; ++ exec_shield = 0; ++ } else ++ return -EINVAL; ++ ++ return 0; ++} ++early_param("noexec", noexec_setup); ++ ++void __init set_nx(void) ++{ ++ unsigned int v[4], l, h; ++ ++ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { ++ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); ++ ++ if ((v[3] & (1 << 20)) && !disable_nx) { ++ rdmsr(MSR_EFER, l, h); ++ l |= EFER_NX; ++ wrmsr(MSR_EFER, l, h); ++ nx_enabled = 1; ++ __supported_pte_mask |= _PAGE_NX; ++ } ++ } ++} ++#endif ++ + /* user-defined highmem size */ + static unsigned int highmem_pages = -1; + +diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c +index 1658296..72056cf 100644 +--- a/arch/x86/mm/mmap.c ++++ b/arch/x86/mm/mmap.c +@@ -111,13 +111,16 @@ static unsigned long mmap_legacy_base(void) + */ + void arch_pick_mmap_layout(struct mm_struct *mm) + { +- if (mmap_is_legacy()) { ++ if (!(2 & exec_shield) && mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; ++ if (!(current->personality & READ_IMPLIES_EXEC) ++ && mmap_is_ia32()) ++ mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; + mm->unmap_area = arch_unmap_area_topdown; + } + } +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 821e970..ea5a4c3 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,6 +6,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -129,6 +130,12 @@ void smp_invalidate_interrupt(struct pt_regs *regs) + union smp_flush_state *f; + + cpu = smp_processor_id(); ++ ++#ifdef CONFIG_X86_32 ++ if (current->active_mm) ++ load_user_cs_desc(cpu, current->active_mm); ++#endif ++ + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. 
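
A quick way to see what the "Using x86 segment limits to approximate NX protection" message above really means is to run the descriptor arithmetic from set_user_cs() (added to asm/desc.h earlier in this patch) by hand. The following is a small standalone sketch in plain userspace C, not kernel code: the struct is a local stand-in for desc_struct, the example address is made up, and a 4 KiB page size is assumed.

#include <stdio.h>

struct demo_desc { unsigned int a, b; };

/* same arithmetic as set_user_cs() in this patch */
static void demo_set_user_cs(struct demo_desc *desc, unsigned long limit)
{
	limit = (limit - 1) / 4096;               /* last page covered by CS   */
	desc->a = limit & 0xffff;                 /* limit bits 15..0          */
	desc->b = (limit & 0xf0000) | 0x00c0fb00; /* limit 19..16, G=1, 32-bit,
	                                             DPL3 readable code segment */
}

int main(void)
{
	struct demo_desc d;

	/* hypothetical: highest PROT_EXEC mapping ends at 0x0804a000 */
	demo_set_user_cs(&d, 0x0804a000);

	/*
	 * The limit field comes out as 0x08049, so CS covers 0..0x08049fff
	 * and an instruction fetch at or above 0x0804a000 raises #GP --
	 * which is how the segment limit stands in for NX on CPUs without it.
	 */
	printf("CS descriptor words: %#010x %#010x\n", d.b, d.a);
	return 0;
}
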
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c +index 58bc00f..1fdafb5 100644 +--- a/arch/x86/vdso/vdso32-setup.c ++++ b/arch/x86/vdso/vdso32-setup.c +@@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) + if (compat) + addr = VDSO_HIGH_BASE; + else { +- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); ++ addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index 0a1700a..37b8744 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -321,6 +321,24 @@ static void xen_set_ldt(const void *addr, unsigned entries) + xen_mc_issue(PARAVIRT_LAZY_CPU); + } + ++#ifdef CONFIG_X86_32 ++static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ void *gdt; ++ xmaddr_t mgdt; ++ u64 descriptor; ++ struct desc_struct user_cs; ++ ++ gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]; ++ mgdt = virt_to_machine(gdt); ++ ++ user_cs = mm->context.user_cs; ++ descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32; ++ ++ HYPERVISOR_update_descriptor(mgdt.maddr, descriptor); ++} ++#endif /*CONFIG_X86_32*/ ++ + static void xen_load_gdt(const struct desc_ptr *dtr) + { + unsigned long va = dtr->address; +@@ -886,6 +904,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = xen_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, +diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c +index 40381df..f856fab 100644 +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -73,7 +73,7 @@ static struct linux_binfmt elf_format = { + .hasvdso = 1 + }; + +-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) ++#define BAD_ADDR(x) IS_ERR_VALUE(x) + + static int set_brk(unsigned long start, unsigned long end) + { +@@ -721,6 +721,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + break; + } + ++ if (current->personality == PER_LINUX && (exec_shield & 2)) { ++ executable_stack = EXSTACK_DISABLE_X; ++ current->flags |= PF_RANDOMIZE; ++ } ++ + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + retval = -ELIBBAD; +@@ -740,6 +745,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + if (retval) + goto out_free_dentry; + ++#ifdef CONFIG_X86_32 ++ /* ++ * Turn off the CS limit completely if exec-shield disabled or ++ * NX active: ++ */ ++ if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled) ++ arch_add_exec_range(current->mm, -1); ++#endif ++ + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->mm->def_flags = def_flags; +@@ -747,7 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. 
*/ + SET_PERSONALITY(loc->elf_ex); +- if (elf_read_implies_exec(loc->elf_ex, executable_stack)) ++ if (!(exec_shield & 2) && ++ elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) +@@ -912,7 +927,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + interpreter, + &interp_map_addr, + load_bias); +- if (!IS_ERR((void *)elf_entry)) { ++ if (!BAD_ADDR(elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment +diff --git a/include/linux/mm.h b/include/linux/mm.h +index ad613ed..08f08d0 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1135,7 +1135,13 @@ extern int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long flags, struct page **pages); + +-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); ++extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int); ++ ++static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, ++ unsigned long len, unsigned long pgoff, unsigned long flags) ++{ ++ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); ++} + + extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 0e80e26..af904ea 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -198,6 +198,9 @@ struct mm_struct { + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); ++ unsigned long (*get_unmapped_exec_area) (struct file *filp, ++ unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags); + void (*unmap_area) (struct mm_struct *mm, unsigned long addr); + unsigned long mmap_base; /* base of mmap area */ + unsigned long task_size; /* size of task vm space */ +diff --git a/include/linux/resource.h b/include/linux/resource.h +index 40fc7e6..68c2549 100644 +--- a/include/linux/resource.h ++++ b/include/linux/resource.h +@@ -55,8 +55,11 @@ struct rlimit { + /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. ++ * ++ * (2MB more to cover randomization effects.) + */ +-#define _STK_LIM (8*1024*1024) ++#define _STK_LIM (10*1024*1024) ++#define EXEC_STACK_BIAS (2*1024*1024) + + /* + * GPG2 wants 64kB of mlocked memory, to make sure pass phrases +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4896fdf..3513e03 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -101,6 +101,9 @@ struct fs_struct; + struct bts_context; + struct perf_counter_context; + ++extern int exec_shield; ++extern int print_fatal_signals; ++ + /* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. 
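
The _STK_LIM/EXEC_STACK_BIAS change above pairs with the over_stack_limit() helper added to mm/mmap.c further down in this patch, which subtracts the bias before comparing against RLIMIT_STACK. A minimal standalone sketch of that arithmetic, with the rlimit passed in explicitly and an 8 MiB value assumed purely for illustration:

#include <stdio.h>

#define EXEC_STACK_BIAS (2*1024*1024)

/* same check as over_stack_limit() in this patch, parameterized on the rlimit */
static int over_stack_limit(unsigned long size, unsigned long rlim_cur)
{
	if (size < EXEC_STACK_BIAS)
		return 0;
	return (size - EXEC_STACK_BIAS) > rlim_cur;
}

int main(void)
{
	unsigned long rlim = 8*1024*1024;	/* assumed RLIMIT_STACK of 8 MiB */

	/* 9 MiB of stack: 9 - 2 = 7 MiB <= 8 MiB, growth is still allowed   */
	printf("9 MiB over limit?  %d\n", over_stack_limit(9*1024*1024, rlim));
	/* 11 MiB of stack: 11 - 2 = 9 MiB > 8 MiB, acct_stack_growth() fails */
	printf("11 MiB over limit? %d\n", over_stack_limit(11*1024*1024, rlim));
	return 0;
}
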
+@@ -359,6 +362,10 @@ extern int sysctl_max_map_count; + extern unsigned long + arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); ++ ++extern unsigned long ++arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, ++ unsigned long, unsigned long); + extern unsigned long + arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index ce664f9..1905e22 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -87,6 +87,26 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; + #ifndef CONFIG_MMU + extern int sysctl_nr_trim_pages; + #endif ++ ++int exec_shield = (1<<0); ++/* exec_shield is a bitmask: ++ * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE ++ * (1<<0) 1: on [also on if !=0] ++ * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK ++ * The old settings ++ * (1<<2) 4: vdso just below .text of main (unless too low) ++ * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low) ++ * are ignored because the vdso is placed completely randomly ++ */ ++ ++static int __init setup_exec_shield(char *str) ++{ ++ get_option(&str, &exec_shield); ++ ++ return 1; ++} ++__setup("exec-shield=", setup_exec_shield); ++ + #ifdef CONFIG_RCU_TORTURE_TEST + extern int rcutorture_runnable; + #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +@@ -382,6 +402,14 @@ static struct ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "exec-shield", ++ .data = &exec_shield, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, +diff --git a/mm/mmap.c b/mm/mmap.c +index 34579b2..260bb3c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -45,6 +46,18 @@ + #define arch_rebalance_pgtables(addr, len) (addr) + #endif + ++/* No sane architecture will #define these to anything else */ ++#ifndef arch_add_exec_range ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#endif ++#ifndef arch_flush_exec_range ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#endif ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ ++ + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); +@@ -392,6 +405,8 @@ static inline void + __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) + { ++ if (vma->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, vma->vm_end); + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; +@@ -494,6 +509,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, vma->vm_end); + } + + /* +@@ -803,6 +820,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ if (prev->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, prev->vm_end); + return prev; + } + +@@ -957,7 +976,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + /* Obtain the address to map to. 
we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ +- addr = get_unmapped_area(file, addr, len, pgoff, flags); ++ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, ++ prot & PROT_EXEC); + if (addr & ~PAGE_MASK) + return addr; + +@@ -1442,13 +1462,17 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) + } + + unsigned long +-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, +- unsigned long pgoff, unsigned long flags) ++get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags, int exec) + { + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + +- get_area = current->mm->get_unmapped_area; ++ if (exec && current->mm->get_unmapped_exec_area) ++ get_area = current->mm->get_unmapped_exec_area; ++ else ++ get_area = current->mm->get_unmapped_area; ++ + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); +@@ -1462,8 +1486,76 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + + return arch_rebalance_pgtables(addr, len); + } ++EXPORT_SYMBOL(get_unmapped_area_prot); ++ ++#define SHLIB_BASE 0x00110000 ++ ++unsigned long ++arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0, ++ unsigned long len0, unsigned long pgoff, unsigned long flags) ++{ ++ unsigned long addr = addr0, len = len0; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long tmp; ++ ++ if (len > TASK_SIZE) ++ return -ENOMEM; ++ ++ if (flags & MAP_FIXED) ++ return addr; ++ ++ if (!addr) ++ addr = randomize_range(SHLIB_BASE, 0x01000000, len); ++ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (TASK_SIZE - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) ++ return addr; ++ } ++ ++ addr = SHLIB_BASE; ++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { ++ /* At this point: (!vma || addr < vma->vm_end). */ ++ if (TASK_SIZE - len < addr) ++ return -ENOMEM; ++ ++ if (!vma || addr + len <= vma->vm_start) { ++ /* ++ * Must not let a PROT_EXEC mapping get into the ++ * brk area: ++ */ ++ if (addr + len > mm->brk) ++ goto failed; ++ ++ /* ++ * Up until the brk area we randomize addresses ++ * as much as possible: ++ */ ++ if (addr >= 0x01000000) { ++ tmp = randomize_range(0x01000000, ++ PAGE_ALIGN(max(mm->start_brk, ++ (unsigned long)0x08000000)), len); ++ vma = find_vma(mm, tmp); ++ if (TASK_SIZE - len >= tmp && ++ (!vma || tmp + len <= vma->vm_start)) ++ return tmp; ++ } ++ /* ++ * Ok, randomization didnt work out - return ++ * the result of the linear search: ++ */ ++ return addr; ++ } ++ addr = vma->vm_end; ++ } ++ ++failed: ++ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags); ++} + +-EXPORT_SYMBOL(get_unmapped_area); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +@@ -1538,6 +1630,14 @@ out: + return prev ? prev->vm_next : vma; + } + ++static int over_stack_limit(unsigned long sz) ++{ ++ if (sz < EXEC_STACK_BIAS) ++ return 0; ++ return (sz - EXEC_STACK_BIAS) > ++ current->signal->rlim[RLIMIT_STACK].rlim_cur; ++} ++ + /* + * Verify that the stack growth is acceptable and + * update accounting. 
This is shared with both the +@@ -1554,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns + return -ENOMEM; + + /* Stack limit test */ +- if (size > rlim[RLIMIT_STACK].rlim_cur) ++ if (over_stack_limit(size)) + return -ENOMEM; + + /* mlock limit tests */ +@@ -1864,10 +1964,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- if (new_below) ++ if (new_below) { ++ unsigned long old_end = vma->vm_end; ++ + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); +- else ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, old_end); ++ } else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +@@ -2116,6 +2220,7 @@ void exit_mmap(struct mm_struct *mm) + vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(tlb, 0, end); ++ arch_flush_exec_range(mm); + + /* + * Walk the list again, actually closing and freeing it, +diff --git a/mm/mprotect.c b/mm/mprotect.c +index d80311b..032423d 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -26,9 +26,14 @@ + #include + #include + #include ++#include + #include + #include + ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ + #ifndef pgprot_modify + static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) + { +@@ -139,7 +144,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; +- unsigned long charged = 0; ++ unsigned long charged = 0, old_end = vma->vm_end; + pgoff_t pgoff; + int error; + int dirty_accountable = 0; +@@ -204,6 +209,9 @@ success: + dirty_accountable = 1; + } + ++ if (oldflags & VM_EXEC) ++ arch_remove_exec_range(current->mm, old_end); ++ + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); +diff --git a/mm/mremap.c b/mm/mremap.c +index a39b7b9..6bebfde 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -400,8 +400,8 @@ unsigned long do_mremap(unsigned long addr, + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + +- new_addr = get_unmapped_area(vma->vm_file, 0, new_len, +- vma->vm_pgoff, map_flags); ++ new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, ++ vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; diff --git a/original/linux-2.6-utrace.patch b/original/linux-2.6-utrace.patch new file mode 100644 index 000000000..861080917 --- /dev/null +++ b/original/linux-2.6-utrace.patch @@ -0,0 +1,4102 @@ +diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile +index 9632444..bf4b9e8 100644 +--- a/Documentation/DocBook/Makefile ++++ b/Documentation/DocBook/Makefile +@@ -9,7 +9,7 @@ + DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ + kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ + procfs-guide.xml writing_usb_driver.xml networking.xml \ +- kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ ++ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml utrace.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ + genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ + mac80211.xml debugobjects.xml sh.xml regulator.xml \ +diff --git 
a/Documentation/DocBook/utrace.tmpl b/Documentation/DocBook/utrace.tmpl +new file mode 100644 +index 0000000..6cc58a1 +--- /dev/null ++++ b/Documentation/DocBook/utrace.tmpl +@@ -0,0 +1,590 @@ ++ ++ ++ ++ ++ ++ The utrace User Debugging Infrastructure ++ ++ ++ ++ ++ utrace concepts ++ ++ Introduction ++ ++ ++ utrace is infrastructure code for tracing ++ and controlling user threads. This is the foundation for writing ++ tracing engines, which can be loadable kernel modules. ++ ++ ++ ++ The basic actors in utrace are the thread ++ and the tracing engine. A tracing engine is some body of code that ++ calls into the <linux/utrace.h> ++ interfaces, represented by a struct ++ utrace_engine_ops. (Usually it's a kernel module, ++ though the legacy ptrace support is a tracing ++ engine that is not in a kernel module.) The interface operates on ++ individual threads (struct task_struct). ++ If an engine wants to treat several threads as a group, that is up ++ to its higher-level code. ++ ++ ++ ++ Tracing begins by attaching an engine to a thread, using ++ utrace_attach_task or ++ utrace_attach_pid. If successful, it returns a ++ pointer that is the handle used in all other calls. ++ ++ ++ ++ ++ Events and Callbacks ++ ++ ++ An attached engine does nothing by default. An engine makes something ++ happen by requesting callbacks via utrace_set_events ++ and poking the thread with utrace_control. ++ The synchronization issues related to these two calls ++ are discussed further below in . ++ ++ ++ ++ Events are specified using the macro ++ UTRACE_EVENT(type). ++ Each event type is associated with a callback in struct ++ utrace_engine_ops. A tracing engine can leave unused ++ callbacks NULL. The only callbacks required ++ are those used by the event flags it sets. ++ ++ ++ ++ Many engines can be attached to each thread. When a thread has an ++ event, each engine gets a callback if it has set the event flag for ++ that event type. For most events, engines are called in the order they ++ attached. Engines that attach after the event has occurred do not get ++ callbacks for that event. This includes any new engines just attached ++ by an existing engine's callback function. Once the sequence of ++ callbacks for that one event has completed, such new engines are then ++ eligible in the next sequence that starts when there is another event. ++ ++ ++ ++ Event reporting callbacks have details particular to the event type, ++ but are all called in similar environments and have the same ++ constraints. Callbacks are made from safe points, where no locks ++ are held, no special resources are pinned (usually), and the ++ user-mode state of the thread is accessible. So, callback code has ++ a pretty free hand. But to be a good citizen, callback code should ++ never block for long periods. It is fine to block in ++ kmalloc and the like, but never wait for i/o or ++ for user mode to do something. If you need the thread to wait, use ++ UTRACE_STOP and return from the callback ++ quickly. When your i/o finishes or whatever, you can use ++ utrace_control to resume the thread. ++ ++ ++ ++ The UTRACE_EVENT(SYSCALL_ENTRY) event is a special ++ case. While other events happen in the kernel when it will return to ++ user mode soon, this event happens when entering the kernel before it ++ will proceed with the work requested from user mode. Because of this ++ difference, the report_syscall_entry callback is ++ special in two ways. 
For this event, engines are called in reverse of ++ the normal order (this includes the report_quiesce ++ call that precedes a report_syscall_entry call). ++ This preserves the semantics that the last engine to attach is called ++ "closest to user mode"--the engine that is first to see a thread's user ++ state when it enters the kernel is also the last to see that state when ++ the thread returns to user mode. For the same reason, if these ++ callbacks use UTRACE_STOP (see the next section), ++ the thread stops immediately after callbacks rather than only when it's ++ ready to return to user mode; when allowed to resume, it will actually ++ attempt the system call indicated by the register values at that time. ++ ++ ++ ++ ++ Stopping Safely ++ ++ Writing well-behaved callbacks ++ ++ ++ Well-behaved callbacks are important to maintain two essential ++ properties of the interface. The first of these is that unrelated ++ tracing engines should not interfere with each other. If your engine's ++ event callback does not return quickly, then another engine won't get ++ the event notification in a timely manner. The second important ++ property is that tracing should be as noninvasive as possible to the ++ normal operation of the system overall and of the traced thread in ++ particular. That is, attached tracing engines should not perturb a ++ thread's behavior, except to the extent that changing its user-visible ++ state is explicitly what you want to do. (Obviously some perturbation ++ is unavoidable, primarily timing changes, ranging from small delays due ++ to the overhead of tracing, to arbitrary pauses in user code execution ++ when a user stops a thread with a debugger for examination.) Even when ++ you explicitly want the perturbation of making the traced thread block, ++ just blocking directly in your callback has more unwanted effects. For ++ example, the CLONE event callbacks are called when ++ the new child thread has been created but not yet started running; the ++ child can never be scheduled until the CLONE ++ tracing callbacks return. (This allows engines tracing the parent to ++ attach to the child.) If a CLONE event callback ++ blocks the parent thread, it also prevents the child thread from ++ running (even to process a SIGKILL). If what you ++ want is to make both the parent and child block, then use ++ utrace_attach_task on the child and then use ++ UTRACE_STOP on both threads. A more crucial ++ problem with blocking in callbacks is that it can prevent ++ SIGKILL from working. A thread that is blocking ++ due to UTRACE_STOP will still wake up and die ++ immediately when sent a SIGKILL, as all threads ++ should. Relying on the utrace ++ infrastructure rather than on private synchronization calls in event ++ callbacks is an important way to help keep tracing robustly ++ noninvasive. ++ ++ ++ ++ ++ Using <constant>UTRACE_STOP</constant> ++ ++ ++ To control another thread and access its state, it must be stopped ++ with UTRACE_STOP. This means that it is ++ stopped and won't start running again while we access it. When a ++ thread is not already stopped, utrace_control ++ returns -EINPROGRESS and an engine must wait ++ for an event callback when the thread is ready to stop. The thread ++ may be running on another CPU or may be blocked. When it is ready ++ to be examined, it will make callbacks to engines that set the ++ UTRACE_EVENT(QUIESCE) event bit. To wake up an ++ interruptible wait, use UTRACE_INTERRUPT. 
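
As a concrete illustration of the sequence just described -- attach, request QUIESCE callbacks, ask for UTRACE_STOP (possibly getting -EINPROGRESS), keep the thread stopped from the callback, and resume later -- here is a rough sketch of a minimal engine. It is not compile-tested against this utrace revision; the prototypes, the report_quiesce() signature in particular, and the ERR_PTR error convention assumed for utrace_attach_task() should be checked against the <linux/utrace.h> added by this patch.

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/utrace.h>

/* prototype assumed; verify against <linux/utrace.h> from this patch */
static u32 demo_report_quiesce(enum utrace_resume_action action,
			       struct utrace_engine *engine,
			       struct task_struct *task,
			       unsigned long event)
{
	/*
	 * The thread is now at a safe point.  Returning UTRACE_STOP keeps
	 * it stopped, as described above, until demo_release_task() runs.
	 */
	return UTRACE_STOP;
}

static const struct utrace_engine_ops demo_ops = {
	.report_quiesce = demo_report_quiesce,
};

/* attach to @task, ask it to stop, and hand back the engine reference */
static struct utrace_engine *demo_stop_task(struct task_struct *task)
{
	struct utrace_engine *engine;
	int ret;

	engine = utrace_attach_task(task, UTRACE_ATTACH_CREATE,
				    &demo_ops, NULL);
	if (IS_ERR(engine))
		return engine;

	ret = utrace_set_events(task, engine, UTRACE_EVENT(QUIESCE));
	if (!ret)
		ret = utrace_control(task, engine, UTRACE_STOP);

	/*
	 * -EINPROGRESS only means the thread was not already at a safe
	 * point; it will stop at its next QUIESCE report.
	 */
	if (ret && ret != -EINPROGRESS) {
		utrace_control(task, engine, UTRACE_DETACH);
		utrace_engine_put(engine);
		return ERR_PTR(ret);
	}
	return engine;			/* caller keeps this reference */
}

/* later: let the thread run again and drop the attachment */
static void demo_release_task(struct task_struct *task,
			      struct utrace_engine *engine)
{
	utrace_control(task, engine, UTRACE_RESUME);	/* clear our stop      */
	utrace_control(task, engine, UTRACE_DETACH);	/* detach the engine   */
	utrace_engine_put(engine);			/* drop our reference  */
}
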
++ ++ ++ ++ As long as some engine has used UTRACE_STOP and ++ not called utrace_control to resume the thread, ++ then the thread will remain stopped. SIGKILL ++ will wake it up, but it will not run user code. When the stop is ++ cleared with utrace_control or a callback ++ return value, the thread starts running again. ++ (See also .) ++ ++ ++ ++ ++ ++ ++ Tear-down Races ++ ++ Primacy of <constant>SIGKILL</constant> ++ ++ Ordinarily synchronization issues for tracing engines are kept fairly ++ straightforward by using UTRACE_STOP. You ask a ++ thread to stop, and then once it makes the ++ report_quiesce callback it cannot do anything else ++ that would result in another callback, until you let it with a ++ utrace_control call. This simple arrangement ++ avoids complex and error-prone code in each one of a tracing engine's ++ event callbacks to keep them serialized with the engine's other ++ operations done on that thread from another thread of control. ++ However, giving tracing engines complete power to keep a traced thread ++ stuck in place runs afoul of a more important kind of simplicity that ++ the kernel overall guarantees: nothing can prevent or delay ++ SIGKILL from making a thread die and release its ++ resources. To preserve this important property of ++ SIGKILL, it as a special case can break ++ UTRACE_STOP like nothing else normally can. This ++ includes both explicit SIGKILL signals and the ++ implicit SIGKILL sent to each other thread in the ++ same thread group by a thread doing an exec, or processing a fatal ++ signal, or making an exit_group system call. A ++ tracing engine can prevent a thread from beginning the exit or exec or ++ dying by signal (other than SIGKILL) if it is ++ attached to that thread, but once the operation begins, no tracing ++ engine can prevent or delay all other threads in the same thread group ++ dying. ++ ++ ++ ++ Final callbacks ++ ++ The report_reap callback is always the final event ++ in the life cycle of a traced thread. Tracing engines can use this as ++ the trigger to clean up their own data structures. The ++ report_death callback is always the penultimate ++ event a tracing engine might see; it's seen unless the thread was ++ already in the midst of dying when the engine attached. Many tracing ++ engines will have no interest in when a parent reaps a dead process, ++ and nothing they want to do with a zombie thread once it dies; for ++ them, the report_death callback is the natural ++ place to clean up data structures and detach. To facilitate writing ++ such engines robustly, given the asynchrony of ++ SIGKILL, and without error-prone manual ++ implementation of synchronization schemes, the ++ utrace infrastructure provides some special ++ guarantees about the report_death and ++ report_reap callbacks. It still takes some care ++ to be sure your tracing engine is robust to tear-down races, but these ++ rules make it reasonably straightforward and concise to handle a lot of ++ corner cases correctly. ++ ++ ++ ++ Engine and task pointers ++ ++ The first sort of guarantee concerns the core data structures ++ themselves. struct utrace_engine is ++ a reference-counted data structure. While you hold a reference, an ++ engine pointer will always stay valid so that you can safely pass it to ++ any utrace call. Each call to ++ utrace_attach_task or ++ utrace_attach_pid returns an engine pointer with a ++ reference belonging to the caller. You own that reference until you ++ drop it using utrace_engine_put. 
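
For instance, a hypothetical module-side attach routine might take and then drop that initial reference as follows; my_example_ops, the helper name, and the flag combination are placeholders chosen for illustration, not requirements of the interface.

        #include <linux/err.h>
        #include <linux/sched.h>
        #include <linux/utrace.h>

        static const struct utrace_engine_ops my_example_ops; /* callbacks not shown */

        static int example_attach(struct task_struct *target, void *private_data)
        {
                struct utrace_engine *engine;

                engine = utrace_attach_task(target,
                                            UTRACE_ATTACH_CREATE |
                                            UTRACE_ATTACH_EXCLUSIVE |
                                            UTRACE_ATTACH_MATCH_OPS,
                                            &my_example_ops, private_data);
                if (IS_ERR(engine))
                        return PTR_ERR(engine);

                /*
                 * The returned pointer came with one reference that belongs
                 * to us.  The attachment itself holds its own implicit
                 * reference, so we may drop ours once we keep no private
                 * copy of the pointer that needs protecting.
                 */
                utrace_engine_put(engine);
                return 0;
        }
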
There is an ++ implicit reference on the engine while it is attached. So if you drop ++ your only reference, and then use ++ utrace_attach_task without ++ UTRACE_ATTACH_CREATE to look up that same engine, ++ you will get the same pointer with a new reference to replace the one ++ you dropped, just like calling utrace_engine_get. ++ When an engine has been detached, either explicitly with ++ UTRACE_DETACH or implicitly after ++ report_reap, then any references you hold are all ++ that keep the old engine pointer alive. ++ ++ ++ ++ There is nothing a kernel module can do to keep a struct ++ task_struct alive outside of ++ rcu_read_lock. When the task dies and is reaped ++ by its parent (or itself), that structure can be freed so that any ++ dangling pointers you have stored become invalid. ++ utrace will not prevent this, but it can ++ help you detect it safely. By definition, a task that has been reaped ++ has had all its engines detached. All ++ utrace calls can be safely called on a ++ detached engine if the caller holds a reference on that engine pointer, ++ even if the task pointer passed in the call is invalid. All calls ++ return -ESRCH for a detached engine, which tells ++ you that the task pointer you passed could be invalid now. Since ++ utrace_control and ++ utrace_set_events do not block, you can call those ++ inside a rcu_read_lock section and be sure after ++ they don't return -ESRCH that the task pointer is ++ still valid until rcu_read_unlock. The ++ infrastructure never holds task references of its own. Though neither ++ rcu_read_lock nor any other lock is held while ++ making a callback, it's always guaranteed that the struct ++ task_struct and the struct ++ utrace_engine passed as arguments remain valid ++ until the callback function returns. ++ ++ ++ ++ The common means for safely holding task pointers that is available to ++ kernel modules is to use struct pid, which ++ permits put_pid from kernel modules. When using ++ that, the calls utrace_attach_pid, ++ utrace_control_pid, ++ utrace_set_events_pid, and ++ utrace_barrier_pid are available. ++ ++ ++ ++ ++ ++ Serialization of <constant>DEATH</constant> and <constant>REAP</constant> ++ ++ ++ The second guarantee is the serialization of ++ DEATH and REAP event ++ callbacks for a given thread. The actual reaping by the parent ++ (release_task call) can occur simultaneously ++ while the thread is still doing the final steps of dying, including ++ the report_death callback. If a tracing engine ++ has requested both DEATH and ++ REAP event reports, it's guaranteed that the ++ report_reap callback will not be made until ++ after the report_death callback has returned. ++ If the report_death callback itself detaches ++ from the thread, then the report_reap callback ++ will never be made. Thus it is safe for a ++ report_death callback to clean up data ++ structures and detach. ++ ++ ++ ++ Interlock with final callbacks ++ ++ The final sort of guarantee is that a tracing engine will know for sure ++ whether or not the report_death and/or ++ report_reap callbacks will be made for a certain ++ thread. These tear-down races are disambiguated by the error return ++ values of utrace_set_events and ++ utrace_control. Normally ++ utrace_control called with ++ UTRACE_DETACH returns zero, and this means that no ++ more callbacks will be made. 
If the thread is in the midst of dying, ++ it returns -EALREADY to indicate that the ++ report_death callback may already be in progress; ++ when you get this error, you know that any cleanup your ++ report_death callback does is about to happen or ++ has just happened--note that if the report_death ++ callback does not detach, the engine remains attached until the thread ++ gets reaped. If the thread is in the midst of being reaped, ++ utrace_control returns -ESRCH ++ to indicate that the report_reap callback may ++ already be in progress; this means the engine is implicitly detached ++ when the callback completes. This makes it possible for a tracing ++ engine that has decided asynchronously to detach from a thread to ++ safely clean up its data structures, knowing that no ++ report_death or report_reap ++ callback will try to do the same. utrace_detach ++ returns -ESRCH when the struct ++ utrace_engine has already been detached, but is ++ still a valid pointer because of its reference count. A tracing engine ++ can use this to safely synchronize its own independent multiple threads ++ of control with each other and with its event callbacks that detach. ++ ++ ++ ++ In the same vein, utrace_set_events normally ++ returns zero; if the target thread was stopped before the call, then ++ after a successful call, no event callbacks not requested in the new ++ flags will be made. It fails with -EALREADY if ++ you try to clear UTRACE_EVENT(DEATH) when the ++ report_death callback may already have begun, if ++ you try to clear UTRACE_EVENT(REAP) when the ++ report_reap callback may already have begun, or if ++ you try to newly set UTRACE_EVENT(DEATH) or ++ UTRACE_EVENT(QUIESCE) when the target is already ++ dead or dying. Like utrace_control, it returns ++ -ESRCH when the thread has already been detached ++ (including forcible detach on reaping). This lets the tracing engine ++ know for sure which event callbacks it will or won't see after ++ utrace_set_events has returned. By checking for ++ errors, it can know whether to clean up its data structures immediately ++ or to let its callbacks do the work. ++ ++ ++ ++ Using <function>utrace_barrier</function> ++ ++ When a thread is safely stopped, calling ++ utrace_control with UTRACE_DETACH ++ or calling utrace_set_events to disable some events ++ ensures synchronously that your engine won't get any more of the callbacks ++ that have been disabled (none at all when detaching). But these can also ++ be used while the thread is not stopped, when it might be simultaneously ++ making a callback to your engine. For this situation, these calls return ++ -EINPROGRESS when it's possible a callback is in ++ progress. If you are not prepared to have your old callbacks still run, ++ then you can synchronize to be sure all the old callbacks are finished, ++ using utrace_barrier. This is necessary if the ++ kernel module containing your callback code is going to be unloaded. ++ ++ ++ After using UTRACE_DETACH once, further calls to ++ utrace_control with the same engine pointer will ++ return -ESRCH. In contrast, after getting ++ -EINPROGRESS from ++ utrace_set_events, you can call ++ utrace_set_events again later and if it returns zero ++ then know the old callbacks have finished. ++ ++ ++ Unlike all other calls, utrace_barrier (and ++ utrace_barrier_pid) will accept any engine pointer you ++ hold a reference on, even if UTRACE_DETACH has already ++ been used. 
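
Putting these rules together, a module about to unload might detach and synchronize roughly as sketched below; the function name and the warning message are illustrative, and a real engine would likely treat -ERESTARTSYS less bluntly than this loop does.

        #include <linux/errno.h>
        #include <linux/kernel.h>
        #include <linux/sched.h>
        #include <linux/utrace.h>

        static void example_detach_and_sync(struct task_struct *task,
                                            struct utrace_engine *engine)
        {
                int ret;

                ret = utrace_control(task, engine, UTRACE_DETACH);
                if (ret && ret != -EINPROGRESS &&
                    ret != -EALREADY && ret != -ESRCH)
                        printk(KERN_WARNING
                               "example: utrace_control(DETACH): %d\n", ret);

                /*
                 * Whatever the detach returned, wait until no callback to
                 * this engine can still be running before the module text
                 * goes away.  utrace_barrier() accepts the engine pointer
                 * even after UTRACE_DETACH, as long as we hold a reference.
                 */
                do {
                        ret = utrace_barrier(task, engine);
                } while (ret == -ERESTARTSYS);

                utrace_engine_put(engine);      /* drop our own reference */
        }
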
After any utrace_control or ++ utrace_set_events call (these do not block), you can ++ call utrace_barrier to block until callbacks have ++ finished. This returns -ESRCH only if the engine is ++ completely detached (finished all callbacks). Otherwise it waits ++ until the thread is definitely not in the midst of a callback to this ++ engine and then returns zero, but can return ++ -ERESTARTSYS if its wait is interrupted. ++ ++ ++ ++ ++ ++ ++ ++utrace core API ++ ++ ++ The utrace API is declared in <linux/utrace.h>. ++ ++ ++!Iinclude/linux/utrace.h ++!Ekernel/utrace.c ++ ++ ++ ++Machine State ++ ++ ++ The task_current_syscall function can be used on any ++ valid struct task_struct at any time, and does ++ not even require that utrace_attach_task was used at all. ++ ++ ++ ++ The other ways to access the registers and other machine-dependent state of ++ a task can only be used on a task that is at a known safe point. The safe ++ points are all the places where utrace_set_events can ++ request callbacks (except for the DEATH and ++ REAP events). So at any event callback, it is safe to ++ examine current. ++ ++ ++ ++ One task can examine another only after a callback in the target task that ++ returns UTRACE_STOP so that task will not return to user ++ mode after the safe point. This guarantees that the task will not resume ++ until the same engine uses utrace_control, unless the ++ task dies suddenly. To examine safely, one must use a pair of calls to ++ utrace_prepare_examine and ++ utrace_finish_examine surrounding the calls to ++ struct user_regset functions or direct examination ++ of task data structures. utrace_prepare_examine returns ++ an error if the task is not properly stopped and not dead. After a ++ successful examination, the paired utrace_finish_examine ++ call returns an error if the task ever woke up during the examination. If ++ so, any data gathered may be scrambled and should be discarded. This means ++ there was a spurious wake-up (which should not happen), or a sudden death. ++ ++ ++<structname>struct user_regset</structname> ++ ++ ++ The struct user_regset API ++ is declared in <linux/regset.h>. ++ ++ ++!Finclude/linux/regset.h ++ ++ ++ ++ ++ <filename>System Call Information</filename> ++ ++ ++ This function is declared in <linux/ptrace.h>. ++ ++ ++!Elib/syscall.c ++ ++ ++ ++<filename>System Call Tracing</filename> ++ ++ ++ The arch API for system call information is declared in ++ <asm/syscall.h>. ++ Each of these calls can be used only at system call entry tracing, ++ or can be used only at system call exit and the subsequent safe points ++ before returning to user mode. ++ At system call entry tracing means either during a ++ report_syscall_entry callback, ++ or any time after that callback has returned UTRACE_STOP. ++ ++ ++!Finclude/asm-generic/syscall.h ++ ++ ++ ++ ++ ++Kernel Internals ++ ++ ++ This chapter covers the interface to the tracing infrastructure ++ from the core of the kernel and the architecture-specific code. ++ This is for maintainers of the kernel and arch code, and not relevant ++ to using the tracing facilities described in preceding chapters. ++ ++ ++Core Calls In ++ ++ ++ These calls are declared in <linux/tracehook.h>. ++ The core kernel calls these functions at various important places. ++ ++ ++!Finclude/linux/tracehook.h ++ ++ ++ ++Architecture Calls Out ++ ++ ++ An arch that has done all these things sets ++ CONFIG_HAVE_ARCH_TRACEHOOK. ++ This is required to enable the utrace code. 
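
Before turning to the architecture details, here is a sketch of the examination protocol from the Machine State chapter above; it assumes the engine has already stopped the task with UTRACE_STOP, and the actual regset accesses are elided.

        #include <linux/errno.h>
        #include <linux/sched.h>
        #include <linux/utrace.h>

        /*
         * Illustrative only: read some of @task's state while it is stopped.
         * @engine must be the engine whose UTRACE_STOP keeps @task stopped.
         */
        static int example_examine(struct task_struct *task,
                                   struct utrace_engine *engine)
        {
                struct utrace_examiner exam;
                int ret;

                ret = utrace_prepare_examine(task, engine, &exam);
                if (ret)
                        return ret;     /* not properly stopped, and not dead */

                /*
                 * ... examine task state here, e.g. through struct
                 * user_regset calls or direct reads of task data ...
                 */

                ret = utrace_finish_examine(task, engine, &exam);
                if (ret)
                        return ret;     /* task woke up; discard what we read */

                return 0;
        }
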
++ ++ ++<filename><asm/ptrace.h></filename> ++ ++ ++ An arch defines these in <asm/ptrace.h> ++ if it supports hardware single-step or block-step features. ++ ++ ++!Finclude/linux/ptrace.h arch_has_single_step arch_has_block_step ++!Finclude/linux/ptrace.h user_enable_single_step user_enable_block_step ++!Finclude/linux/ptrace.h user_disable_single_step ++ ++ ++ ++ ++ <filename><asm/syscall.h></filename> ++ ++ ++ An arch provides <asm/syscall.h> that ++ defines these as inlines, or declares them as exported functions. ++ These interfaces are described in . ++ ++ ++ ++ ++ ++ <filename><linux/tracehook.h></filename> ++ ++ ++ An arch must define TIF_NOTIFY_RESUME ++ and TIF_SYSCALL_TRACE ++ in its <asm/thread_info.h>. ++ The arch code must call the following functions, all declared ++ in <linux/tracehook.h> and ++ described in : ++ ++ ++ ++ tracehook_notify_resume ++ ++ ++ tracehook_report_syscall_entry ++ ++ ++ tracehook_report_syscall_exit ++ ++ ++ tracehook_signal_handler ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 725a650..e299a63 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -82,6 +82,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -188,6 +189,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + ++ task_utrace_proc_status(m, p); ++ + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 5368fbd..aecd24e 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -167,6 +167,7 @@ extern struct cred init_cred; + [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ + }, \ + .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ ++ INIT_UTRACE(tsk) \ + INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ + INIT_TRACE_IRQFLAGS \ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4d07542..2060aa1 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -59,6 +59,7 @@ struct sched_param { + #include + #include + #include ++#include + + #include + #include +@@ -1313,6 +1314,11 @@ struct task_struct { + #endif + seccomp_t seccomp; + ++#ifdef CONFIG_UTRACE ++ struct utrace utrace; ++ unsigned long utrace_flags; ++#endif ++ + /* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h +index 7c2bfd9..a91d9a4 100644 +--- a/include/linux/tracehook.h ++++ b/include/linux/tracehook.h +@@ -49,6 +49,7 @@ + #include + #include + #include ++#include + struct linux_binprm; + + /** +@@ -63,6 +64,8 @@ struct linux_binprm; + */ + static inline int tracehook_expect_breakpoints(struct task_struct *task) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_CORE))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -111,6 +114,9 @@ static inline void ptrace_report_syscall(struct pt_regs *regs) + static inline __must_check int tracehook_report_syscall_entry( + struct pt_regs *regs) + { ++ if ((task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_ENTRY)) && ++ utrace_report_syscall_entry(regs)) ++ return 1; + ptrace_report_syscall(regs); + return 0; + } +@@ -134,6 +140,8 @@ static inline __must_check int tracehook_report_syscall_entry( + */ + static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_EXIT)) ++ 
utrace_report_syscall_exit(regs); + ptrace_report_syscall(regs); + } + +@@ -194,6 +202,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + struct linux_binprm *bprm, + struct pt_regs *regs) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXEC))) ++ utrace_report_exec(fmt, bprm, regs); + if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) && + unlikely(task_ptrace(current) & PT_PTRACED)) + send_sig(SIGTRAP, current, 0); +@@ -211,6 +221,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + */ + static inline void tracehook_report_exit(long *exit_code) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXIT))) ++ utrace_report_exit(exit_code); + ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code); + } + +@@ -254,6 +266,7 @@ static inline int tracehook_prepare_clone(unsigned clone_flags) + static inline void tracehook_finish_clone(struct task_struct *child, + unsigned long clone_flags, int trace) + { ++ utrace_init_task(child); + ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace); + } + +@@ -278,6 +291,8 @@ static inline void tracehook_report_clone(struct pt_regs *regs, + unsigned long clone_flags, + pid_t pid, struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE))) ++ utrace_report_clone(clone_flags, child); + if (unlikely(task_ptrace(child))) { + /* + * It doesn't matter who attached/attaching to this +@@ -310,6 +325,9 @@ static inline void tracehook_report_clone_complete(int trace, + pid_t pid, + struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE)) && ++ (clone_flags & CLONE_VFORK)) ++ utrace_finish_vfork(current); + if (unlikely(trace)) + ptrace_event(0, trace, pid); + } +@@ -344,6 +362,7 @@ static inline void tracehook_report_vfork_done(struct task_struct *child, + */ + static inline void tracehook_prepare_release_task(struct task_struct *task) + { ++ utrace_release_task(task); + } + + /** +@@ -358,6 +377,7 @@ static inline void tracehook_prepare_release_task(struct task_struct *task) + static inline void tracehook_finish_release_task(struct task_struct *task) + { + ptrace_release_task(task); ++ BUG_ON(task->exit_state != EXIT_DEAD); + } + + /** +@@ -379,6 +399,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + const struct k_sigaction *ka, + struct pt_regs *regs, int stepping) + { ++ if (task_utrace_flags(current)) ++ utrace_signal_handler(current, stepping); + if (stepping) + ptrace_notify(SIGTRAP); + } +@@ -396,6 +418,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + static inline int tracehook_consider_ignored_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_IGN))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -415,6 +439,9 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, + static inline int tracehook_consider_fatal_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & (UTRACE_EVENT(SIGNAL_TERM) | ++ UTRACE_EVENT(SIGNAL_CORE)))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -429,6 +456,8 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task, + */ + static inline int tracehook_force_sigpending(void) + { ++ if (unlikely(task_utrace_flags(current))) ++ return utrace_interrupt_pending(); + return 0; + } + +@@ -458,6 +487,8 @@ static inline int 
tracehook_get_signal(struct task_struct *task, + siginfo_t *info, + struct k_sigaction *return_ka) + { ++ if (unlikely(task_utrace_flags(task))) ++ return utrace_get_signal(task, regs, info, return_ka); + return 0; + } + +@@ -485,6 +516,8 @@ static inline int tracehook_get_signal(struct task_struct *task, + */ + static inline int tracehook_notify_jctl(int notify, int why) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(JCTL)) ++ utrace_report_jctl(notify, why); + return notify ?: (current->ptrace & PT_PTRACED) ? why : 0; + } + +@@ -508,6 +541,8 @@ static inline int tracehook_notify_jctl(int notify, int why) + static inline int tracehook_notify_death(struct task_struct *task, + void **death_cookie, int group_dead) + { ++ *death_cookie = task_utrace_struct(task); ++ + if (task_detached(task)) + return task->ptrace ? SIGCHLD : DEATH_REAP; + +@@ -544,6 +579,20 @@ static inline void tracehook_report_death(struct task_struct *task, + int signal, void *death_cookie, + int group_dead) + { ++ /* ++ * This barrier ensures that our caller's setting of ++ * @task->exit_state precedes checking @task->utrace_flags here. ++ * If utrace_set_events() was just called to enable ++ * UTRACE_EVENT(DEATH), then we are obliged to call ++ * utrace_report_death() and not miss it. utrace_set_events() ++ * uses tasklist_lock to synchronize enabling the bit with the ++ * actual change to @task->exit_state, but we need this barrier ++ * to be sure we see a flags change made just before our caller ++ * took the tasklist_lock. ++ */ ++ smp_mb(); ++ if (task_utrace_flags(task) & _UTRACE_DEATH_EVENTS) ++ utrace_report_death(task, death_cookie, group_dead, signal); + } + + #ifdef TIF_NOTIFY_RESUME +@@ -573,10 +622,20 @@ static inline void set_notify_resume(struct task_struct *task) + * asynchronously, this will be called again before we return to + * user mode. + * +- * Called without locks. ++ * Called without locks. However, on some machines this may be ++ * called with interrupts disabled. + */ + static inline void tracehook_notify_resume(struct pt_regs *regs) + { ++ struct task_struct *task = current; ++ /* ++ * This pairs with the barrier implicit in set_notify_resume(). ++ * It ensures that we read the nonzero utrace_flags set before ++ * set_notify_resume() was called by utrace setup. ++ */ ++ smp_rmb(); ++ if (task_utrace_flags(task)) ++ utrace_resume(task, regs); + } + #endif /* TIF_NOTIFY_RESUME */ + +diff --git a/include/linux/utrace.h b/include/linux/utrace.h +new file mode 100644 +index 0000000..f877ec6 +--- /dev/null ++++ b/include/linux/utrace.h +@@ -0,0 +1,692 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ * ++ * This interface allows for notification of interesting events in a ++ * thread. It also mediates access to thread state such as registers. ++ * Multiple unrelated users can be associated with a single thread. ++ * We call each of these a tracing engine. ++ * ++ * A tracing engine starts by calling utrace_attach_task() or ++ * utrace_attach_pid() on the chosen thread, passing in a set of hooks ++ * (&struct utrace_engine_ops), and some associated data. This produces a ++ * &struct utrace_engine, which is the handle used for all other ++ * operations. 
An attached engine has its ops vector, its data, and an ++ * event mask controlled by utrace_set_events(). ++ * ++ * For each event bit that is set, that engine will get the ++ * appropriate ops->report_*() callback when the event occurs. The ++ * &struct utrace_engine_ops need not provide callbacks for an event ++ * unless the engine sets one of the associated event bits. ++ */ ++ ++#ifndef _LINUX_UTRACE_H ++#define _LINUX_UTRACE_H 1 ++ ++#include ++#include ++#include ++#include ++ ++struct linux_binprm; ++struct pt_regs; ++struct utrace; ++struct user_regset; ++struct user_regset_view; ++ ++/* ++ * Event bits passed to utrace_set_events(). ++ * These appear in &struct task_struct.@utrace_flags ++ * and &struct utrace_engine.@flags. ++ */ ++enum utrace_events { ++ _UTRACE_EVENT_QUIESCE, /* Thread is available for examination. */ ++ _UTRACE_EVENT_REAP, /* Zombie reaped, no more tracing possible. */ ++ _UTRACE_EVENT_CLONE, /* Successful clone/fork/vfork just done. */ ++ _UTRACE_EVENT_EXEC, /* Successful execve just completed. */ ++ _UTRACE_EVENT_EXIT, /* Thread exit in progress. */ ++ _UTRACE_EVENT_DEATH, /* Thread has died. */ ++ _UTRACE_EVENT_SYSCALL_ENTRY, /* User entered kernel for system call. */ ++ _UTRACE_EVENT_SYSCALL_EXIT, /* Returning to user after system call. */ ++ _UTRACE_EVENT_SIGNAL, /* Signal delivery will run a user handler. */ ++ _UTRACE_EVENT_SIGNAL_IGN, /* No-op signal to be delivered. */ ++ _UTRACE_EVENT_SIGNAL_STOP, /* Signal delivery will suspend. */ ++ _UTRACE_EVENT_SIGNAL_TERM, /* Signal delivery will terminate. */ ++ _UTRACE_EVENT_SIGNAL_CORE, /* Signal delivery will dump core. */ ++ _UTRACE_EVENT_JCTL, /* Job control stop or continue completed. */ ++ _UTRACE_NEVENTS ++}; ++#define UTRACE_EVENT(type) (1UL << _UTRACE_EVENT_##type) ++ ++/* ++ * All the kinds of signal events. ++ * These all use the @report_signal() callback. ++ */ ++#define UTRACE_EVENT_SIGNAL_ALL (UTRACE_EVENT(SIGNAL) \ ++ | UTRACE_EVENT(SIGNAL_IGN) \ ++ | UTRACE_EVENT(SIGNAL_STOP) \ ++ | UTRACE_EVENT(SIGNAL_TERM) \ ++ | UTRACE_EVENT(SIGNAL_CORE)) ++/* ++ * Both kinds of syscall events; these call the @report_syscall_entry() ++ * and @report_syscall_exit() callbacks, respectively. ++ */ ++#define UTRACE_EVENT_SYSCALL \ ++ (UTRACE_EVENT(SYSCALL_ENTRY) | UTRACE_EVENT(SYSCALL_EXIT)) ++ ++/* ++ * The event reports triggered synchronously by task death. ++ */ ++#define _UTRACE_DEATH_EVENTS (UTRACE_EVENT(DEATH) | UTRACE_EVENT(QUIESCE)) ++ ++/* ++ * Hooks in call these entry points to the ++ * utrace dispatch. They are weak references here only so ++ * tracehook.h doesn't need to #ifndef CONFIG_UTRACE them to ++ * avoid external references in case of unoptimized compilation. 
++ */ ++bool utrace_interrupt_pending(void) ++ __attribute__((weak)); ++void utrace_resume(struct task_struct *, struct pt_regs *) ++ __attribute__((weak)); ++int utrace_get_signal(struct task_struct *, struct pt_regs *, ++ siginfo_t *, struct k_sigaction *) ++ __attribute__((weak)); ++void utrace_report_clone(unsigned long, struct task_struct *) ++ __attribute__((weak)); ++void utrace_finish_vfork(struct task_struct *) ++ __attribute__((weak)); ++void utrace_report_exit(long *exit_code) ++ __attribute__((weak)); ++void utrace_report_death(struct task_struct *, struct utrace *, bool, int) ++ __attribute__((weak)); ++void utrace_report_jctl(int notify, int type) ++ __attribute__((weak)); ++void utrace_report_exec(struct linux_binfmt *, struct linux_binprm *, ++ struct pt_regs *regs) ++ __attribute__((weak)); ++bool utrace_report_syscall_entry(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_report_syscall_exit(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_signal_handler(struct task_struct *, int) ++ __attribute__((weak)); ++ ++#ifndef CONFIG_UTRACE ++ ++/* ++ * uses these accessors to avoid #ifdef CONFIG_UTRACE. ++ */ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return 0; ++} ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return NULL; ++} ++static inline void utrace_init_task(struct task_struct *child) ++{ ++} ++static inline void utrace_release_task(struct task_struct *task) ++{ ++} ++ ++static inline void task_utrace_proc_status(struct seq_file *m, ++ struct task_struct *p) ++{ ++} ++ ++#else /* CONFIG_UTRACE */ ++ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return task->utrace_flags; ++} ++ ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return &task->utrace; ++} ++ ++static inline void utrace_init_task(struct task_struct *task) ++{ ++ task->utrace_flags = 0; ++ memset(&task->utrace, 0, sizeof(task->utrace)); ++ INIT_LIST_HEAD(&task->utrace.attached); ++ INIT_LIST_HEAD(&task->utrace.attaching); ++ spin_lock_init(&task->utrace.lock); ++} ++ ++void utrace_release_task(struct task_struct *); ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p); ++ ++ ++/* ++ * Version number of the API defined in this file. This will change ++ * whenever a tracing engine's code would need some updates to keep ++ * working. We maintain this here for the benefit of tracing engine code ++ * that is developed concurrently with utrace API improvements before they ++ * are merged into the kernel, making LINUX_VERSION_CODE checks unwieldy. ++ */ ++#define UTRACE_API_VERSION 20090416 ++ ++/** ++ * enum utrace_resume_action - engine's choice of action for a traced task ++ * @UTRACE_STOP: Stay quiescent after callbacks. ++ * @UTRACE_REPORT: Make some callback soon. ++ * @UTRACE_INTERRUPT: Make @report_signal() callback soon. ++ * @UTRACE_SINGLESTEP: Resume in user mode for one instruction. ++ * @UTRACE_BLOCKSTEP: Resume in user mode until next branch. ++ * @UTRACE_RESUME: Resume normally in user mode. ++ * @UTRACE_DETACH: Detach my engine (implies %UTRACE_RESUME). ++ * ++ * See utrace_control() for detailed descriptions of each action. This is ++ * encoded in the @action argument and the return value for every callback ++ * with a &u32 return value. ++ * ++ * The order of these is important. When there is more than one engine, ++ * each supplies its choice and the smallest value prevails. 
++ */ ++enum utrace_resume_action { ++ UTRACE_STOP, ++ UTRACE_REPORT, ++ UTRACE_INTERRUPT, ++ UTRACE_SINGLESTEP, ++ UTRACE_BLOCKSTEP, ++ UTRACE_RESUME, ++ UTRACE_DETACH ++}; ++#define UTRACE_RESUME_MASK 0x0f ++ ++/** ++ * utrace_resume_action - &enum utrace_resume_action from callback action ++ * @action: &u32 callback @action argument or return value ++ * ++ * This extracts the &enum utrace_resume_action from @action, ++ * which is the @action argument to a &struct utrace_engine_ops ++ * callback or the return value from one. ++ */ ++static inline enum utrace_resume_action utrace_resume_action(u32 action) ++{ ++ return action & UTRACE_RESUME_MASK; ++} ++ ++/** ++ * enum utrace_signal_action - disposition of signal ++ * @UTRACE_SIGNAL_DELIVER: Deliver according to sigaction. ++ * @UTRACE_SIGNAL_IGN: Ignore the signal. ++ * @UTRACE_SIGNAL_TERM: Terminate the process. ++ * @UTRACE_SIGNAL_CORE: Terminate with core dump. ++ * @UTRACE_SIGNAL_STOP: Deliver as absolute stop. ++ * @UTRACE_SIGNAL_TSTP: Deliver as job control stop. ++ * @UTRACE_SIGNAL_REPORT: Reporting before pending signals. ++ * @UTRACE_SIGNAL_HANDLER: Reporting after signal handler setup. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_signal() callback. It says what will happen to the ++ * signal described by the &siginfo_t parameter to the callback. ++ * ++ * The %UTRACE_SIGNAL_REPORT value is used in an @action argument when ++ * a tracing report is being made before dequeuing any pending signal. ++ * If this is immediately after a signal handler has been set up, then ++ * %UTRACE_SIGNAL_HANDLER is used instead. A @report_signal callback ++ * that uses %UTRACE_SIGNAL_DELIVER|%UTRACE_SINGLESTEP will ensure ++ * it sees a %UTRACE_SIGNAL_HANDLER report. ++ */ ++enum utrace_signal_action { ++ UTRACE_SIGNAL_DELIVER = 0x00, ++ UTRACE_SIGNAL_IGN = 0x10, ++ UTRACE_SIGNAL_TERM = 0x20, ++ UTRACE_SIGNAL_CORE = 0x30, ++ UTRACE_SIGNAL_STOP = 0x40, ++ UTRACE_SIGNAL_TSTP = 0x50, ++ UTRACE_SIGNAL_REPORT = 0x60, ++ UTRACE_SIGNAL_HANDLER = 0x70 ++}; ++#define UTRACE_SIGNAL_MASK 0xf0 ++#define UTRACE_SIGNAL_HOLD 0x100 /* Flag, push signal back on queue. */ ++ ++/** ++ * utrace_signal_action - &enum utrace_signal_action from callback action ++ * @action: @report_signal callback @action argument or return value ++ * ++ * This extracts the &enum utrace_signal_action from @action, which ++ * is the @action argument to a @report_signal callback or the ++ * return value from one. ++ */ ++static inline enum utrace_signal_action utrace_signal_action(u32 action) ++{ ++ return action & UTRACE_SIGNAL_MASK; ++} ++ ++/** ++ * enum utrace_syscall_action - disposition of system call attempt ++ * @UTRACE_SYSCALL_RUN: Run the system call. ++ * @UTRACE_SYSCALL_ABORT: Don't run the system call. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_syscall_entry callback. ++ */ ++enum utrace_syscall_action { ++ UTRACE_SYSCALL_RUN = 0x00, ++ UTRACE_SYSCALL_ABORT = 0x10 ++}; ++#define UTRACE_SYSCALL_MASK 0xf0 ++ ++/** ++ * utrace_syscall_action - &enum utrace_syscall_action from callback action ++ * @action: @report_syscall_entry callback @action or return value ++ * ++ * This extracts the &enum utrace_syscall_action from @action, which ++ * is the @action argument to a @report_syscall_entry callback or the ++ * return value from one. 
++ */ ++static inline enum utrace_syscall_action utrace_syscall_action(u32 action) ++{ ++ return action & UTRACE_SYSCALL_MASK; ++} ++ ++/* ++ * Flags for utrace_attach_task() and utrace_attach_pid(). ++ */ ++#define UTRACE_ATTACH_CREATE 0x0010 /* Attach a new engine. */ ++#define UTRACE_ATTACH_EXCLUSIVE 0x0020 /* Refuse if existing match. */ ++#define UTRACE_ATTACH_MATCH_OPS 0x0001 /* Match engines on ops. */ ++#define UTRACE_ATTACH_MATCH_DATA 0x0002 /* Match engines on data. */ ++#define UTRACE_ATTACH_MATCH_MASK 0x000f ++ ++/** ++ * struct utrace_engine - per-engine structure ++ * @ops: &struct utrace_engine_ops pointer passed to utrace_attach_task() ++ * @data: engine-private &void * passed to utrace_attach_task() ++ * @flags: event mask set by utrace_set_events() plus internal flag bits ++ * ++ * The task itself never has to worry about engines detaching while ++ * it's doing event callbacks. These structures are removed from the ++ * task's active list only when it's stopped, or by the task itself. ++ * ++ * utrace_engine_get() and utrace_engine_put() maintain a reference count. ++ * When it drops to zero, the structure is freed. One reference is held ++ * implicitly while the engine is attached to its task. ++ */ ++struct utrace_engine { ++/* private: */ ++ struct kref kref; ++ struct list_head entry; ++ ++/* public: */ ++ const struct utrace_engine_ops *ops; ++ void *data; ++ ++ unsigned long flags; ++}; ++ ++/** ++ * utrace_engine_get - acquire a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you get another. ++ */ ++static inline void utrace_engine_get(struct utrace_engine *engine) ++{ ++ kref_get(&engine->kref); ++} ++ ++void __utrace_engine_release(struct kref *); ++ ++/** ++ * utrace_engine_put - release a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you lose that reference. ++ * If it was the last one, @engine becomes an invalid pointer. ++ */ ++static inline void utrace_engine_put(struct utrace_engine *engine) ++{ ++ kref_put(&engine->kref, __utrace_engine_release); ++} ++ ++/** ++ * struct utrace_engine_ops - tracing engine callbacks ++ * ++ * Each @report_*() callback corresponds to an %UTRACE_EVENT(*) bit. ++ * utrace_set_events() calls on @engine choose which callbacks will be made ++ * to @engine from @task. ++ * ++ * Most callbacks take an @action argument, giving the resume action ++ * chosen by other tracing engines. All callbacks take an @engine ++ * argument, and a @task argument, which is always equal to @current. ++ * For some calls, @action also includes bits specific to that event ++ * and utrace_resume_action() is used to extract the resume action. ++ * This shows what would happen if @engine wasn't there, or will if ++ * the callback's return value uses %UTRACE_RESUME. This always ++ * starts as %UTRACE_RESUME when no other tracing is being done on ++ * this task. ++ * ++ * All return values contain &enum utrace_resume_action bits. For ++ * some calls, other bits specific to that kind of event are added to ++ * the resume action bits with OR. These are the same bits used in ++ * the @action argument. The resume action returned by a callback ++ * does not override previous engines' choices, it only says what ++ * @engine wants done. What @task actually does is the action that's ++ * most constrained among the choices made by all attached engines. 
++ * See utrace_control() for more information on the actions. ++ * ++ * When %UTRACE_STOP is used in @report_syscall_entry, then @task ++ * stops before attempting the system call. In other cases, the ++ * resume action does not take effect until @task is ready to check ++ * for signals and return to user mode. If there are more callbacks ++ * to be made, the last round of calls determines the final action. ++ * A @report_quiesce callback with @event zero, or a @report_signal ++ * callback, will always be the last one made before @task resumes. ++ * Only %UTRACE_STOP is "sticky"--if @engine returned %UTRACE_STOP ++ * then @task stays stopped unless @engine returns different from a ++ * following callback. ++ * ++ * The report_death() and report_reap() callbacks do not take @action ++ * arguments, and only %UTRACE_DETACH is meaningful in the return value ++ * from a report_death() callback. None of the resume actions applies ++ * to a dead thread. ++ * ++ * All @report_*() hooks are called with no locks held, in a generally ++ * safe environment when we will be returning to user mode soon (or just ++ * entered the kernel). It is fine to block for memory allocation and ++ * the like, but all hooks are asynchronous and must not block on ++ * external events! If you want the thread to block, use %UTRACE_STOP ++ * in your hook's return value; then later wake it up with utrace_control(). ++ * ++ * @report_quiesce: ++ * Requested by %UTRACE_EVENT(%QUIESCE). ++ * This does not indicate any event, but just that @task (the current ++ * thread) is in a safe place for examination. This call is made ++ * before each specific event callback, except for @report_reap. ++ * The @event argument gives the %UTRACE_EVENT(@which) value for ++ * the event occurring. This callback might be made for events @engine ++ * has not requested, if some other engine is tracing the event; ++ * calling utrace_set_events() call here can request the immediate ++ * callback for this occurrence of @event. @event is zero when there ++ * is no other event, @task is now ready to check for signals and ++ * return to user mode, and some engine has used %UTRACE_REPORT or ++ * %UTRACE_INTERRUPT to request this callback. For this case, ++ * if @report_signal is not %NULL, the @report_quiesce callback ++ * may be replaced with a @report_signal callback passing ++ * %UTRACE_SIGNAL_REPORT in its @action argument, whenever @task is ++ * entering the signal-check path anyway. ++ * ++ * @report_signal: ++ * Requested by %UTRACE_EVENT(%SIGNAL_*) or %UTRACE_EVENT(%QUIESCE). ++ * Use utrace_signal_action() and utrace_resume_action() on @action. ++ * The signal action is %UTRACE_SIGNAL_REPORT when some engine has ++ * used %UTRACE_REPORT or %UTRACE_INTERRUPT; the callback can choose ++ * to stop or to deliver an artificial signal, before pending signals. ++ * It's %UTRACE_SIGNAL_HANDLER instead when signal handler setup just ++ * finished (after a previous %UTRACE_SIGNAL_DELIVER return); this ++ * serves in lieu of any %UTRACE_SIGNAL_REPORT callback requested by ++ * %UTRACE_REPORT or %UTRACE_INTERRUPT, and is also implicitly ++ * requested by %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP into the ++ * signal delivery. The other signal actions indicate a signal about ++ * to be delivered; the previous engine's return value sets the signal ++ * action seen by the the following engine's callback. The @info data ++ * can be changed at will, including @info->si_signo. The settings in ++ * @return_ka determines what %UTRACE_SIGNAL_DELIVER does. 
@orig_ka ++ * is what was in force before other tracing engines intervened, and ++ * it's %NULL when this report began as %UTRACE_SIGNAL_REPORT or ++ * %UTRACE_SIGNAL_HANDLER. For a report without a new signal, @info ++ * is left uninitialized and must be set completely by an engine that ++ * chooses to deliver a signal; if there was a previous @report_signal ++ * callback ending in %UTRACE_STOP and it was just resumed using ++ * %UTRACE_REPORT or %UTRACE_INTERRUPT, then @info is left unchanged ++ * from the previous callback. In this way, the original signal can ++ * be left in @info while returning %UTRACE_STOP|%UTRACE_SIGNAL_IGN ++ * and then found again when resuming @task with %UTRACE_INTERRUPT. ++ * The %UTRACE_SIGNAL_HOLD flag bit can be OR'd into the return value, ++ * and might be in @action if the previous engine returned it. This ++ * flag asks that the signal in @info be pushed back on @task's queue ++ * so that it will be seen again after whatever action is taken now. ++ * ++ * @report_clone: ++ * Requested by %UTRACE_EVENT(%CLONE). ++ * Event reported for parent, before the new task @child might run. ++ * @clone_flags gives the flags used in the clone system call, ++ * or equivalent flags for a fork() or vfork() system call. ++ * This function can use utrace_attach_task() on @child. It's guaranteed ++ * that asynchronous utrace_attach_task() calls will be ordered after ++ * any calls in @report_clone callbacks for the parent. Thus ++ * when using %UTRACE_ATTACH_EXCLUSIVE in the asynchronous calls, ++ * you can be sure that the parent's @report_clone callback has ++ * already attached to @child or chosen not to. Passing %UTRACE_STOP ++ * to utrace_control() on @child here keeps the child stopped before ++ * it ever runs in user mode, %UTRACE_REPORT or %UTRACE_INTERRUPT ++ * ensures a callback from @child before it starts in user mode. ++ * ++ * @report_jctl: ++ * Requested by %UTRACE_EVENT(%JCTL). ++ * Job control event; @type is %CLD_STOPPED or %CLD_CONTINUED, ++ * indicating whether we are stopping or resuming now. If @notify ++ * is nonzero, @task is the last thread to stop and so will send ++ * %SIGCHLD to its parent after this callback; @notify reflects ++ * what the parent's %SIGCHLD has in @si_code, which can sometimes ++ * be %CLD_STOPPED even when @type is %CLD_CONTINUED. ++ * ++ * @report_exec: ++ * Requested by %UTRACE_EVENT(%EXEC). ++ * An execve system call has succeeded and the new program is about to ++ * start running. The initial user register state is handy to be tweaked ++ * directly in @regs. @fmt and @bprm gives the details of this exec. ++ * ++ * @report_syscall_entry: ++ * Requested by %UTRACE_EVENT(%SYSCALL_ENTRY). ++ * Thread has entered the kernel to request a system call. ++ * The user register state is handy to be tweaked directly in @regs. ++ * The @action argument contains an &enum utrace_syscall_action, ++ * use utrace_syscall_action() to extract it. The return value ++ * overrides the last engine's action for the system call. ++ * If the final action is %UTRACE_SYSCALL_ABORT, no system call ++ * is made. The details of the system call being attempted can ++ * be fetched here with syscall_get_nr() and syscall_get_arguments(). ++ * The parameter registers can be changed with syscall_set_arguments(). ++ * ++ * @report_syscall_exit: ++ * Requested by %UTRACE_EVENT(%SYSCALL_EXIT). ++ * Thread is about to leave the kernel after a system call request. ++ * The user register state is handy to be tweaked directly in @regs. 
++ * The results of the system call attempt can be examined here using ++ * syscall_get_error() and syscall_get_return_value(). It is safe ++ * here to call syscall_set_return_value() or syscall_rollback(). ++ * ++ * @report_exit: ++ * Requested by %UTRACE_EVENT(%EXIT). ++ * Thread is exiting and cannot be prevented from doing so, ++ * but all its state is still live. The @code value will be ++ * the wait result seen by the parent, and can be changed by ++ * this engine or others. The @orig_code value is the real ++ * status, not changed by any tracing engine. Returning %UTRACE_STOP ++ * here keeps @task stopped before it cleans up its state and dies, ++ * so it can be examined by other processes. When @task is allowed ++ * to run, it will die and get to the @report_death callback. ++ * ++ * @report_death: ++ * Requested by %UTRACE_EVENT(%DEATH). ++ * Thread is really dead now. It might be reaped by its parent at ++ * any time, or self-reap immediately. Though the actual reaping ++ * may happen in parallel, a report_reap() callback will always be ++ * ordered after a report_death() callback. ++ * ++ * @report_reap: ++ * Requested by %UTRACE_EVENT(%REAP). ++ * Called when someone reaps the dead task (parent, init, or self). ++ * This means the parent called wait, or else this was a detached ++ * thread or a process whose parent ignores SIGCHLD. ++ * No more callbacks are made after this one. ++ * The engine is always detached. ++ * There is nothing more a tracing engine can do about this thread. ++ * After this callback, the @engine pointer will become invalid. ++ * The @task pointer may become invalid if get_task_struct() hasn't ++ * been used to keep it alive. ++ * An engine should always request this callback if it stores the ++ * @engine pointer or stores any pointer in @engine->data, so it ++ * can clean up its data structures. ++ * Unlike other callbacks, this can be called from the parent's context ++ * rather than from the traced thread itself--it must not delay the ++ * parent by blocking. 
++ */ ++struct utrace_engine_ops { ++ u32 (*report_quiesce)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event); ++ u32 (*report_signal)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs, ++ siginfo_t *info, ++ const struct k_sigaction *orig_ka, ++ struct k_sigaction *return_ka); ++ u32 (*report_clone)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *parent, ++ unsigned long clone_flags, ++ struct task_struct *child); ++ u32 (*report_jctl)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ int type, int notify); ++ u32 (*report_exec)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ const struct linux_binfmt *fmt, ++ const struct linux_binprm *bprm, ++ struct pt_regs *regs); ++ u32 (*report_syscall_entry)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_syscall_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ long orig_code, long *code); ++ u32 (*report_death)(struct utrace_engine *engine, ++ struct task_struct *task, ++ bool group_dead, int signal); ++ void (*report_reap)(struct utrace_engine *engine, ++ struct task_struct *task); ++}; ++ ++/** ++ * struct utrace_examiner - private state for using utrace_prepare_examine() ++ * ++ * The members of &struct utrace_examiner are private to the implementation. ++ * This data type holds the state from a call to utrace_prepare_examine() ++ * to be used by a call to utrace_finish_examine(). ++ */ ++struct utrace_examiner { ++/* private: */ ++ long state; ++ unsigned long ncsw; ++}; ++ ++/* ++ * These are the exported entry points for tracing engines to use. ++ * See kernel/utrace.c for their kerneldoc comments with interface details. ++ */ ++struct utrace_engine *utrace_attach_task(struct task_struct *, int, ++ const struct utrace_engine_ops *, ++ void *); ++struct utrace_engine *utrace_attach_pid(struct pid *, int, ++ const struct utrace_engine_ops *, ++ void *); ++int __must_check utrace_control(struct task_struct *, ++ struct utrace_engine *, ++ enum utrace_resume_action); ++int __must_check utrace_set_events(struct task_struct *, ++ struct utrace_engine *, ++ unsigned long eventmask); ++int __must_check utrace_barrier(struct task_struct *, ++ struct utrace_engine *); ++int __must_check utrace_prepare_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++int __must_check utrace_finish_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++ ++/** ++ * utrace_control_pid - control a thread being traced by a tracing engine ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is the same as utrace_control(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. 
++ */ ++static inline __must_check int utrace_control_pid( ++ struct pid *pid, struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ /* ++ * We don't bother with rcu_read_lock() here to protect the ++ * task_struct pointer, because utrace_control will return ++ * -ESRCH without looking at that pointer if the engine is ++ * already detached. A task_struct pointer can't die before ++ * all the engines are detached in release_task() first. ++ */ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_control(task, engine, action); ++} ++ ++/** ++ * utrace_set_events_pid - choose which event reports a tracing engine gets ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @eventmask: new event mask ++ * ++ * This is the same as utrace_set_events(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_set_events_pid( ++ struct pid *pid, struct utrace_engine *engine, unsigned long eventmask) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : ++ utrace_set_events(task, engine, eventmask); ++} ++ ++/** ++ * utrace_barrier_pid - synchronize with simultaneous tracing callbacks ++ * @pid: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This is the same as utrace_barrier(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_barrier_pid(struct pid *pid, ++ struct utrace_engine *engine) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_barrier(task, engine); ++} ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace.h */ +diff --git a/include/linux/utrace_struct.h b/include/linux/utrace_struct.h +new file mode 100644 +index 0000000..aba7e09 +--- /dev/null ++++ b/include/linux/utrace_struct.h +@@ -0,0 +1,58 @@ ++/* ++ * 'struct utrace' data structure for kernel/utrace.c private use. ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ */ ++ ++#ifndef _LINUX_UTRACE_STRUCT_H ++#define _LINUX_UTRACE_STRUCT_H 1 ++ ++#ifdef CONFIG_UTRACE ++ ++#include ++#include ++ ++/* ++ * Per-thread structure private to utrace implementation. This properly ++ * belongs in kernel/utrace.c and its use is entirely private to the code ++ * there. It is only defined in a header file so that it can be embedded ++ * in the struct task_struct layout. It is here rather than in utrace.h ++ * to avoid header nesting order issues getting too complex. 
++ * ++ */ ++struct utrace { ++ struct task_struct *cloning; ++ ++ struct list_head attached, attaching; ++ spinlock_t lock; ++ ++ struct utrace_engine *reporting; ++ ++ unsigned int stopped:1; ++ unsigned int report:1; ++ unsigned int interrupt:1; ++ unsigned int signal_handler:1; ++ unsigned int vfork_stop:1; /* need utrace_stop() before vfork wait */ ++ unsigned int death:1; /* in utrace_report_death() now */ ++ unsigned int reap:1; /* release_task() has run */ ++}; ++ ++# define INIT_UTRACE(tsk) \ ++ .utrace_flags = 0, \ ++ .utrace = { \ ++ .lock = __SPIN_LOCK_UNLOCKED(tsk.utrace.lock), \ ++ .attached = LIST_HEAD_INIT(tsk.utrace.attached), \ ++ .attaching = LIST_HEAD_INIT(tsk.utrace.attaching), \ ++ }, ++ ++#else ++ ++# define INIT_UTRACE(tsk) /* Nothing. */ ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace_struct.h */ +diff --git a/init/Kconfig b/init/Kconfig +index 1ce05a4..f720929 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1191,6 +1191,15 @@ config STOP_MACHINE + help + Need stop_machine() primitive. + ++menuconfig UTRACE ++ bool "Infrastructure for tracing and debugging user processes" ++ depends on EXPERIMENTAL ++ depends on HAVE_ARCH_TRACEHOOK ++ help ++ Enable the utrace process tracing interface. This is an internal ++ kernel interface exported to kernel modules, to track events in ++ user threads, extract and change user thread state. ++ + source "block/Kconfig" + + config PREEMPT_NOTIFIERS +diff --git a/kernel/Makefile b/kernel/Makefile +index 780c8dc..cd16d49 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -69,6 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o + obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o + obj-$(CONFIG_STOP_MACHINE) += stop_machine.o + obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o ++obj-$(CONFIG_UTRACE) += utrace.o + obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o + obj-$(CONFIG_AUDITSYSCALL) += auditsc.o + obj-$(CONFIG_GCOV_KERNEL) += gcov/ +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 61c78b2..935eeee 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -164,6 +165,14 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) + return !err; + } + ++/* ++ * For experimental use of utrace, exclude ptrace on the same task. ++ */ ++static inline bool exclude_ptrace(struct task_struct *task) ++{ ++ return unlikely(!!task_utrace_flags(task)); ++} ++ + int ptrace_attach(struct task_struct *task) + { + int retval; +@@ -186,6 +195,13 @@ int ptrace_attach(struct task_struct *task) + goto out; + + task_lock(task); ++ ++ if (exclude_ptrace(task)) { ++ retval = -EBUSY; ++ task_unlock(task); ++ goto unlock_creds; ++ } ++ + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + task_unlock(task); + if (retval) +@@ -226,7 +242,9 @@ int ptrace_traceme(void) + + write_lock_irq(&tasklist_lock); + /* Are we already being traced? 
*/ +- if (!current->ptrace) { ++ if (exclude_ptrace(current)) { ++ ret = -EBUSY; ++ } else if (!current->ptrace) { + ret = security_ptrace_traceme(current->parent); + /* + * Check PF_EXITING to ensure ->real_parent has not passed +@@ -577,7 +595,17 @@ int ptrace_request(struct task_struct *child, long request, + return ret; + } + +-static struct task_struct *ptrace_get_task_struct(pid_t pid) ++/** ++ * ptrace_get_task_struct -- grab a task struct reference for ptrace ++ * @pid: process id to grab a task_struct reference of ++ * ++ * This function is a helper for ptrace implementations. It checks ++ * permissions and then grabs a task struct for use of the actual ++ * ptrace implementation. ++ * ++ * Returns the task_struct for @pid or an ERR_PTR() on failure. ++ */ ++struct task_struct *ptrace_get_task_struct(pid_t pid) + { + struct task_struct *child; + +diff --git a/kernel/utrace.c b/kernel/utrace.c +new file mode 100644 +index 0000000..74b5fc5 +--- /dev/null ++++ b/kernel/utrace.c +@@ -0,0 +1,2357 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Rules for 'struct utrace', defined in ++ * but used entirely privately in this file. ++ * ++ * The common event reporting loops are done by the task making the ++ * report without ever taking any locks. To facilitate this, the two ++ * lists @attached and @attaching work together for smooth asynchronous ++ * attaching with low overhead. Modifying either list requires @lock. ++ * The @attaching list can be modified any time while holding @lock. ++ * New engines being attached always go on this list. ++ * ++ * The @attached list is what the task itself uses for its reporting ++ * loops. When the task itself is not quiescent, it can use the ++ * @attached list without taking any lock. Nobody may modify the list ++ * when the task is not quiescent. When it is quiescent, that means ++ * that it won't run again without taking @lock itself before using ++ * the list. ++ * ++ * At each place where we know the task is quiescent (or it's current), ++ * while holding @lock, we call splice_attaching(), below. This moves ++ * the @attaching list members on to the end of the @attached list. ++ * Since this happens at the start of any reporting pass, any new ++ * engines attached asynchronously go on the stable @attached list ++ * in time to have their callbacks seen. ++ */ ++ ++static struct kmem_cache *utrace_engine_cachep; ++static const struct utrace_engine_ops utrace_detached_ops; /* forward decl */ ++ ++static int __init utrace_init(void) ++{ ++ utrace_engine_cachep = KMEM_CACHE(utrace_engine, SLAB_PANIC); ++ return 0; ++} ++module_init(utrace_init); ++ ++/* ++ * This is called with @utrace->lock held when the task is safely ++ * quiescent, i.e. it won't consult utrace->attached without the lock. ++ * Move any engines attached asynchronously from @utrace->attaching ++ * onto the @utrace->attached list. 
++ */ ++static void splice_attaching(struct utrace *utrace) ++{ ++ list_splice_tail_init(&utrace->attaching, &utrace->attached); ++} ++ ++/* ++ * This is the exported function used by the utrace_engine_put() inline. ++ */ ++void __utrace_engine_release(struct kref *kref) ++{ ++ struct utrace_engine *engine = container_of(kref, struct utrace_engine, ++ kref); ++ BUG_ON(!list_empty(&engine->entry)); ++ kmem_cache_free(utrace_engine_cachep, engine); ++} ++EXPORT_SYMBOL_GPL(__utrace_engine_release); ++ ++static bool engine_matches(struct utrace_engine *engine, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ if ((flags & UTRACE_ATTACH_MATCH_OPS) && engine->ops != ops) ++ return false; ++ if ((flags & UTRACE_ATTACH_MATCH_DATA) && engine->data != data) ++ return false; ++ return engine->ops && engine->ops != &utrace_detached_ops; ++} ++ ++static struct utrace_engine *matching_engine( ++ struct utrace *utrace, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine; ++ list_for_each_entry(engine, &utrace->attached, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ list_for_each_entry(engine, &utrace->attaching, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ return NULL; ++} ++ ++/* ++ * For experimental use, utrace attach is mutually exclusive with ptrace. ++ */ ++static inline bool exclude_utrace(struct task_struct *task) ++{ ++ return unlikely(!!task->ptrace); ++} ++ ++/* ++ * Called without locks, when we might be the first utrace engine to attach. ++ * If this is a newborn thread and we are not the creator, we have to wait ++ * for it. The creator gets the first chance to attach. The PF_STARTING ++ * flag is cleared after its report_clone hook has had a chance to run. ++ */ ++static inline int utrace_attach_delay(struct task_struct *target) ++{ ++ if ((target->flags & PF_STARTING) && ++ current->utrace.cloning != target) ++ do { ++ schedule_timeout_interruptible(1); ++ if (signal_pending(current)) ++ return -ERESTARTNOINTR; ++ } while (target->flags & PF_STARTING); ++ ++ return 0; ++} ++ ++/* ++ * Enqueue @engine, or maybe don't if UTRACE_ATTACH_EXCLUSIVE. ++ */ ++static int utrace_add_engine(struct task_struct *target, ++ struct utrace *utrace, ++ struct utrace_engine *engine, ++ int flags, ++ const struct utrace_engine_ops *ops, ++ void *data) ++{ ++ int ret; ++ ++ spin_lock(&utrace->lock); ++ ++ if (utrace->reap) { ++ /* ++ * Already entered utrace_release_task(), cannot attach now. ++ */ ++ ret = -ESRCH; ++ } else if ((flags & UTRACE_ATTACH_EXCLUSIVE) && ++ unlikely(matching_engine(utrace, flags, ops, data))) { ++ ret = -EEXIST; ++ } else { ++ /* ++ * Put the new engine on the pending ->attaching list. ++ * Make sure it gets onto the ->attached list by the next ++ * time it's examined. ++ * ++ * When target == current, it would be safe just to call ++ * splice_attaching() right here. But if we're inside a ++ * callback, that would mean the new engine also gets ++ * notified about the event that precipitated its own ++ * creation. This is not what the user wants. ++ * ++ * Setting ->report ensures that start_report() takes the ++ * lock and does it next time. Whenever setting ->report, ++ * we must maintain the invariant that TIF_NOTIFY_RESUME is ++ * also set. Otherwise utrace_control() or utrace_do_stop() ++ * might skip setting TIF_NOTIFY_RESUME upon seeing ->report ++ * already set, and we'd miss a necessary callback. 
++ * ++ * In case we had no engines before, make sure that ++ * utrace_flags is not zero when tracehook_notify_resume() ++ * checks. That would bypass utrace reporting clearing ++ * TIF_NOTIFY_RESUME, and thus violate the same invariant. ++ */ ++ target->utrace_flags |= UTRACE_EVENT(REAP); ++ list_add_tail(&engine->entry, &utrace->attaching); ++ utrace->report = 1; ++ set_notify_resume(target); ++ ++ ret = 0; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++ ++/** ++ * utrace_attach_task - attach new engine, or look up an attached engine ++ * @target: thread to attach to ++ * @flags: flag bits combined with OR, see below ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * The caller must ensure that the @target thread does not get freed, ++ * i.e. hold a ref or be its parent. It is always safe to call this ++ * on @current, or on the @child pointer in a @report_clone callback. ++ * For most other cases, it's easier to use utrace_attach_pid() instead. ++ * ++ * UTRACE_ATTACH_CREATE: ++ * Create a new engine. If %UTRACE_ATTACH_CREATE is not specified, you ++ * only look up an existing engine already attached to the thread. ++ * ++ * UTRACE_ATTACH_EXCLUSIVE: ++ * Attempting to attach a second (matching) engine fails with -%EEXIST. ++ * ++ * UTRACE_ATTACH_MATCH_OPS: Only consider engines matching @ops. ++ * UTRACE_ATTACH_MATCH_DATA: Only consider engines matching @data. ++ * ++ * Calls with neither %UTRACE_ATTACH_MATCH_OPS nor %UTRACE_ATTACH_MATCH_DATA ++ * match the first among any engines attached to @target. That means that ++ * %UTRACE_ATTACH_EXCLUSIVE in such a call fails with -%EEXIST if there ++ * are any engines on @target at all. ++ */ ++struct utrace_engine *utrace_attach_task( ++ struct task_struct *target, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace *utrace; ++ struct utrace_engine *engine; ++ int ret; ++ ++ utrace = &target->utrace; ++ ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * The target has already been reaped. ++ * Check this early, though it's not synchronized. ++ * utrace_add_engine() will do the final check. ++ */ ++ if (!(flags & UTRACE_ATTACH_CREATE)) ++ return ERR_PTR(-ENOENT); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (!(flags & UTRACE_ATTACH_CREATE)) { ++ spin_lock(&utrace->lock); ++ engine = matching_engine(utrace, flags, ops, data); ++ if (engine) ++ utrace_engine_get(engine); ++ spin_unlock(&utrace->lock); ++ return engine ?: ERR_PTR(-ENOENT); ++ } ++ ++ if (unlikely(!ops) || unlikely(ops == &utrace_detached_ops)) ++ return ERR_PTR(-EINVAL); ++ ++ if (unlikely(target->flags & PF_KTHREAD)) ++ /* ++ * Silly kernel, utrace is for users! ++ */ ++ return ERR_PTR(-EPERM); ++ ++ engine = kmem_cache_alloc(utrace_engine_cachep, GFP_KERNEL); ++ if (unlikely(!engine)) ++ return ERR_PTR(-ENOMEM); ++ ++ /* ++ * Initialize the new engine structure. It starts out with two ++ * refs: one ref to return, and one ref for being attached. 
++ */ ++ kref_set(&engine->kref, 2); ++ engine->flags = 0; ++ engine->ops = ops; ++ engine->data = data; ++ ++ ret = utrace_attach_delay(target); ++ if (likely(!ret)) ++ ret = utrace_add_engine(target, utrace, engine, ++ flags, ops, data); ++ ++ if (unlikely(ret)) { ++ kmem_cache_free(utrace_engine_cachep, engine); ++ engine = ERR_PTR(ret); ++ } ++ ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_task); ++ ++/** ++ * utrace_attach_pid - attach new engine, or look up an attached engine ++ * @pid: &struct pid pointer representing thread to attach to ++ * @flags: flag bits combined with OR, see utrace_attach_task() ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * This is the same as utrace_attach_task(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++struct utrace_engine *utrace_attach_pid( ++ struct pid *pid, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine = ERR_PTR(-ESRCH); ++ struct task_struct *task = get_pid_task(pid, PIDTYPE_PID); ++ if (task) { ++ engine = utrace_attach_task(task, flags, ops, data); ++ put_task_struct(task); ++ } ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_pid); ++ ++/* ++ * When an engine is detached, the target thread may still see it and ++ * make callbacks until it quiesces. We install a special ops vector ++ * with these two callbacks. When the target thread quiesces, it can ++ * safely free the engine itself. For any event we will always get ++ * the report_quiesce() callback first, so we only need this one ++ * pointer to be set. The only exception is report_reap(), so we ++ * supply that callback too. ++ */ ++static u32 utrace_detached_quiesce(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event) ++{ ++ return UTRACE_DETACH; ++} ++ ++static void utrace_detached_reap(struct utrace_engine *engine, ++ struct task_struct *task) ++{ ++} ++ ++static const struct utrace_engine_ops utrace_detached_ops = { ++ .report_quiesce = &utrace_detached_quiesce, ++ .report_reap = &utrace_detached_reap ++}; ++ ++/* ++ * After waking up from TASK_TRACED, clear bookkeeping in @utrace. ++ * Returns true if we were woken up prematurely by SIGKILL. ++ */ ++static inline bool finish_utrace_stop(struct task_struct *task, ++ struct utrace *utrace) ++{ ++ bool killed = false; ++ ++ /* ++ * utrace_wakeup() clears @utrace->stopped before waking us up. ++ * We're officially awake if it's clear. ++ */ ++ spin_lock(&utrace->lock); ++ if (unlikely(utrace->stopped)) { ++ /* ++ * If we're here with it still set, it must have been ++ * signal_wake_up() instead, waking us up for a SIGKILL. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ WARN_ON(!sigismember(&task->pending.signal, SIGKILL)); ++ spin_unlock_irq(&task->sighand->siglock); ++ utrace->stopped = 0; ++ killed = true; ++ } ++ spin_unlock(&utrace->lock); ++ ++ return killed; ++} ++ ++/* ++ * Perform %UTRACE_STOP, i.e. block in TASK_TRACED until woken up. ++ * @task == current, @utrace == current->utrace, which is not locked. ++ * Return true if we were woken up by SIGKILL even though some utrace ++ * engine may still want us to stay stopped. 
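As a usage sketch for utrace_attach_task() and utrace_attach_pid() documented above (not part of the patch: the <linux/utrace.h> header name and the demo_* identifiers are assumptions), a module would typically create one exclusive engine per task and later drop the returned reference with utrace_engine_put():

#include <linux/err.h>
#include <linux/sched.h>
#include <linux/utrace.h>       /* assumed name of the public utrace header */

static const struct utrace_engine_ops demo_ops; /* callbacks as in the later sketches */

static struct utrace_engine *demo_attach(struct task_struct *task)
{
        /*
         * Create a new engine, refusing a second copy of demo_ops on the
         * same task (-EEXIST).  On success the engine comes back with the
         * extra reference described above; the caller eventually drops it
         * with utrace_engine_put().
         */
        return utrace_attach_task(task,
                                  UTRACE_ATTACH_CREATE |
                                  UTRACE_ATTACH_EXCLUSIVE |
                                  UTRACE_ATTACH_MATCH_OPS,
                                  &demo_ops, NULL);
}

The later fragments reuse these includes and the demo_ops table.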
++ */ ++static bool utrace_stop(struct task_struct *task, struct utrace *utrace, ++ bool report) ++{ ++ bool killed; ++ ++ /* ++ * @utrace->stopped is the flag that says we are safely ++ * inside this function. It should never be set on entry. ++ */ ++ BUG_ON(utrace->stopped); ++ ++ /* ++ * The siglock protects us against signals. As well as SIGKILL ++ * waking us up, we must synchronize with the signal bookkeeping ++ * for stop signals and SIGCONT. ++ */ ++ spin_lock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (unlikely(sigismember(&task->pending.signal, SIGKILL))) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ return true; ++ } ++ ++ if (report) { ++ /* ++ * Ensure a reporting pass when we're resumed. ++ */ ++ utrace->report = 1; ++ set_thread_flag(TIF_NOTIFY_RESUME); ++ } ++ ++ utrace->stopped = 1; ++ __set_current_state(TASK_TRACED); ++ ++ /* ++ * If there is a group stop in progress, ++ * we must participate in the bookkeeping. ++ */ ++ if (task->signal->group_stop_count > 0) ++ --task->signal->group_stop_count; ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ ++ schedule(); ++ ++ /* ++ * While in TASK_TRACED, we were considered "frozen enough". ++ * Now that we woke up, it's crucial if we're supposed to be ++ * frozen that we freeze now before running anything substantial. ++ */ ++ try_to_freeze(); ++ ++ killed = finish_utrace_stop(task, utrace); ++ ++ /* ++ * While we were in TASK_TRACED, complete_signal() considered ++ * us "uninterested" in signal wakeups. Now make sure our ++ * TIF_SIGPENDING state is correct for normal running. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ return killed; ++} ++ ++/* ++ * The caller has to hold a ref on the engine. If the attached flag is ++ * true (all but utrace_barrier() calls), the engine is supposed to be ++ * attached. If the attached flag is false (utrace_barrier() only), ++ * then return -ERESTARTSYS for an engine marked for detach but not yet ++ * fully detached. The task pointer can be invalid if the engine is ++ * detached. ++ * ++ * Get the utrace lock for the target task. ++ * Returns the struct if locked, or ERR_PTR(-errno). ++ * ++ * This has to be robust against races with: ++ * utrace_control(target, UTRACE_DETACH) calls ++ * UTRACE_DETACH after reports ++ * utrace_report_death ++ * utrace_release_task ++ */ ++static struct utrace *get_utrace_lock(struct task_struct *target, ++ struct utrace_engine *engine, ++ bool attached) ++ __acquires(utrace->lock) ++{ ++ struct utrace *utrace; ++ ++ rcu_read_lock(); ++ ++ /* ++ * If this engine was already detached, bail out before we look at ++ * the task_struct pointer at all. If it's detached after this ++ * check, then RCU is still keeping this task_struct pointer valid. ++ * ++ * The ops pointer is NULL when the engine is fully detached. ++ * It's &utrace_detached_ops when it's marked detached but still ++ * on the list. In the latter case, utrace_barrier() still works, ++ * since the target might be in the middle of an old callback. ++ */ ++ if (unlikely(!engine->ops)) { ++ rcu_read_unlock(); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (unlikely(engine->ops == &utrace_detached_ops)) { ++ rcu_read_unlock(); ++ return attached ? ERR_PTR(-ESRCH) : ERR_PTR(-ERESTARTSYS); ++ } ++ ++ utrace = &target->utrace; ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * If all engines detached already, utrace is clear. 
++ * Otherwise, we're called after utrace_release_task might ++ * have started. A call to this engine's report_reap ++ * callback might already be in progress. ++ */ ++ utrace = ERR_PTR(-ESRCH); ++ } else { ++ spin_lock(&utrace->lock); ++ if (unlikely(!engine->ops) || ++ unlikely(engine->ops == &utrace_detached_ops)) { ++ /* ++ * By the time we got the utrace lock, ++ * it had been reaped or detached already. ++ */ ++ spin_unlock(&utrace->lock); ++ utrace = ERR_PTR(-ESRCH); ++ if (!attached && engine->ops == &utrace_detached_ops) ++ utrace = ERR_PTR(-ERESTARTSYS); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return utrace; ++} ++ ++/* ++ * Now that we don't hold any locks, run through any ++ * detached engines and free their references. Each ++ * engine had one implicit ref while it was attached. ++ */ ++static void put_detached_list(struct list_head *list) ++{ ++ struct utrace_engine *engine, *next; ++ list_for_each_entry_safe(engine, next, list, entry) { ++ list_del_init(&engine->entry); ++ utrace_engine_put(engine); ++ } ++} ++ ++/* ++ * Called with utrace->lock held. ++ * Notify and clean up all engines, then free utrace. ++ */ ++static void utrace_reap(struct task_struct *target, struct utrace *utrace) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ const struct utrace_engine_ops *ops; ++ LIST_HEAD(detached); ++ ++restart: ++ splice_attaching(utrace); ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ ops = engine->ops; ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ ++ /* ++ * If it didn't need a callback, we don't need to drop ++ * the lock. Now nothing else refers to this engine. ++ */ ++ if (!(engine->flags & UTRACE_EVENT(REAP))) ++ continue; ++ ++ /* ++ * This synchronizes with utrace_barrier(). Since we ++ * need the utrace->lock here anyway (unlike the other ++ * reporting loops), we don't need any memory barrier ++ * as utrace_barrier() holds the lock. ++ */ ++ utrace->reporting = engine; ++ spin_unlock(&utrace->lock); ++ ++ (*ops->report_reap)(engine, target); ++ ++ utrace->reporting = NULL; ++ ++ put_detached_list(&detached); ++ ++ spin_lock(&utrace->lock); ++ goto restart; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * Called by release_task. After this, target->utrace must be cleared. ++ */ ++void utrace_release_task(struct task_struct *target) ++{ ++ struct utrace *utrace; ++ ++ utrace = &target->utrace; ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->reap = 1; ++ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) { ++ utrace_reap(target, utrace); /* Unlocks and frees. */ ++ return; ++ } ++ ++ /* ++ * The target will do some final callbacks but hasn't ++ * finished them yet. We know because it clears these ++ * event bits after it's done. Instead of cleaning up here ++ * and requiring utrace_report_death to cope with it, we ++ * delay the REAP report and the teardown until after the ++ * target finishes its death reports. ++ */ ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/* ++ * We use an extra bit in utrace_engine.flags past the event bits, ++ * to record whether the engine is keeping the target thread stopped. 
++ */ ++#define ENGINE_STOP (1UL << _UTRACE_NEVENTS) ++ ++static void mark_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags |= ENGINE_STOP; ++} ++ ++static void clear_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags &= ~ENGINE_STOP; ++} ++ ++static bool engine_wants_stop(struct utrace_engine *engine) ++{ ++ return (engine->flags & ENGINE_STOP) != 0; ++} ++ ++/** ++ * utrace_set_events - choose which event reports a tracing engine gets ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @events: new event mask ++ * ++ * This changes the set of events for which @engine wants callbacks made. ++ * ++ * This fails with -%EALREADY and does nothing if you try to clear ++ * %UTRACE_EVENT(%DEATH) when the @report_death callback may already have ++ * begun, if you try to clear %UTRACE_EVENT(%REAP) when the @report_reap ++ * callback may already have begun, or if you try to newly set ++ * %UTRACE_EVENT(%DEATH) or %UTRACE_EVENT(%QUIESCE) when @target is ++ * already dead or dying. ++ * ++ * This can fail with -%ESRCH when @target has already been detached, ++ * including forcible detach on reaping. ++ * ++ * If @target was stopped before the call, then after a successful call, ++ * no event callbacks not requested in @events will be made; if ++ * %UTRACE_EVENT(%QUIESCE) is included in @events, then a @report_quiesce ++ * callback will be made when @target resumes. If @target was not stopped, ++ * and was about to make a callback to @engine, this returns -%EINPROGRESS. ++ * In this case, the callback in progress might be one excluded from the ++ * new @events setting. When this returns zero, you can be sure that no ++ * event callbacks you've disabled in @events can be made. ++ * ++ * To synchronize after an -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * When @target is @current, -%EINPROGRESS is not returned. But ++ * note that a newly-created engine will not receive any callbacks ++ * related to an event notification already in progress. This call ++ * enables @events callbacks to be made as soon as @engine becomes ++ * eligible for any callbacks, see utrace_attach_task(). ++ * ++ * These rules provide for coherent synchronization based on %UTRACE_STOP, ++ * even when %SIGKILL is breaking its normal simple rules. ++ */ ++int utrace_set_events(struct task_struct *target, ++ struct utrace_engine *engine, ++ unsigned long events) ++{ ++ struct utrace *utrace; ++ unsigned long old_flags, old_utrace_flags, set_utrace_flags; ++ int ret; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ old_utrace_flags = target->utrace_flags; ++ set_utrace_flags = events; ++ old_flags = engine->flags; ++ ++ if (target->exit_state && ++ (((events & ~old_flags) & _UTRACE_DEATH_EVENTS) || ++ (utrace->death && ++ ((old_flags & ~events) & _UTRACE_DEATH_EVENTS)) || ++ (utrace->reap && ((old_flags & ~events) & UTRACE_EVENT(REAP))))) { ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ ++ /* ++ * When setting these flags, it's essential that we really ++ * synchronize with exit_notify(). They cannot be set after ++ * exit_notify() takes the tasklist_lock. By holding the read ++ * lock here while setting the flags, we ensure that the calls ++ * to tracehook_notify_death() and tracehook_report_death() will ++ * see the new flags. This ensures that utrace_release_task() ++ * knows positively that utrace_report_death() will be called or ++ * that it won't. 
++ */ ++ if ((set_utrace_flags & ~old_utrace_flags) & _UTRACE_DEATH_EVENTS) { ++ read_lock(&tasklist_lock); ++ if (unlikely(target->exit_state)) { ++ read_unlock(&tasklist_lock); ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ target->utrace_flags |= set_utrace_flags; ++ read_unlock(&tasklist_lock); ++ } ++ ++ engine->flags = events | (engine->flags & ENGINE_STOP); ++ target->utrace_flags |= set_utrace_flags; ++ ++ if ((set_utrace_flags & UTRACE_EVENT_SYSCALL) && ++ !(old_utrace_flags & UTRACE_EVENT_SYSCALL)) ++ set_tsk_thread_flag(target, TIF_SYSCALL_TRACE); ++ ++ ret = 0; ++ if (!utrace->stopped && target != current) { ++ /* ++ * This barrier ensures that our engine->flags changes ++ * have hit before we examine utrace->reporting, ++ * pairing with the barrier in start_callback(). If ++ * @target has not yet hit finish_callback() to clear ++ * utrace->reporting, we might be in the middle of a ++ * callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_set_events); ++ ++/* ++ * Asynchronously mark an engine as being detached. ++ * ++ * This must work while the target thread races with us doing ++ * start_callback(), defined below. It uses smp_rmb() between checking ++ * @engine->flags and using @engine->ops. Here we change @engine->ops ++ * first, then use smp_wmb() before changing @engine->flags. This ensures ++ * it can check the old flags before using the old ops, or check the old ++ * flags before using the new ops, or check the new flags before using the ++ * new ops, but can never check the new flags before using the old ops. ++ * Hence, utrace_detached_ops might be used with any old flags in place. ++ * It has report_quiesce() and report_reap() callbacks to handle all cases. ++ */ ++static void mark_engine_detached(struct utrace_engine *engine) ++{ ++ engine->ops = &utrace_detached_ops; ++ smp_wmb(); ++ engine->flags = UTRACE_EVENT(QUIESCE); ++} ++ ++/* ++ * Get @target to stop and return true if it is already stopped now. ++ * If we return false, it will make some event callback soonish. ++ * Called with @utrace locked. ++ */ ++static bool utrace_do_stop(struct task_struct *target, struct utrace *utrace) ++{ ++ bool stopped = false; ++ ++ spin_lock_irq(&target->sighand->siglock); ++ if (unlikely(target->exit_state)) { ++ /* ++ * On the exit path, it's only truly quiescent ++ * if it has already been through ++ * utrace_report_death(), or never will. ++ */ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) ++ utrace->stopped = stopped = true; ++ } else if (task_is_stopped(target)) { ++ /* ++ * Stopped is considered quiescent; when it wakes up, it will ++ * go through utrace_get_signal() before doing anything else. ++ */ ++ utrace->stopped = stopped = true; ++ } else if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ spin_unlock_irq(&target->sighand->siglock); ++ ++ return stopped; ++} ++ ++/* ++ * If the target is not dead it should not be in tracing ++ * stop any more. Wake it unless it's in job control stop. ++ * ++ * Called with @utrace->lock held and @utrace->stopped set. 
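A short sketch of utrace_set_events() under the rules just documented (same assumed includes and names as the attach sketch above); the -EINPROGRESS case is closed with utrace_barrier(), described further below:

/* Ask for syscall-entry and quiesce reports from an already-attached engine. */
static int demo_enable_events(struct task_struct *task,
                              struct utrace_engine *engine)
{
        int ret = utrace_set_events(task, engine,
                                    UTRACE_EVENT(SYSCALL_ENTRY) |
                                    UTRACE_EVENT(QUIESCE));
        if (ret == -EINPROGRESS)
                /*
                 * A callback to this engine may still be in progress; wait
                 * until no just-disabled callback can still be running.
                 */
                ret = utrace_barrier(task, engine);
        return ret;
}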
++ */ ++static void utrace_wakeup(struct task_struct *target, struct utrace *utrace) ++{ ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ ++ utrace->stopped = 0; ++ ++ sighand = lock_task_sighand(target, &irqflags); ++ if (unlikely(!sighand)) ++ return; ++ ++ if (likely(task_is_stopped_or_traced(target))) { ++ if (target->signal->flags & SIGNAL_STOP_STOPPED) ++ target->state = TASK_STOPPED; ++ else ++ wake_up_state(target, __TASK_STOPPED | __TASK_TRACED); ++ } ++ ++ unlock_task_sighand(target, &irqflags); ++} ++ ++/* ++ * This is called when there might be some detached engines on the list or ++ * some stale bits in @task->utrace_flags. Clean them up and recompute the ++ * flags. ++ * ++ * @action is NULL when @task is stopped and @utrace->stopped is set; wake ++ * it up if it should not be. @action is set when @task is current; if ++ * we're fully detached, reset *@action to UTRACE_RESUME. ++ * ++ * Called with @utrace->lock held, returns with it released. ++ * After this returns, @utrace might be freed if everything detached. ++ */ ++static void utrace_reset(struct task_struct *task, struct utrace *utrace, ++ enum utrace_resume_action *action) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ unsigned long flags = 0; ++ LIST_HEAD(detached); ++ bool wake = !action; ++ BUG_ON(wake != (task != current)); ++ ++ splice_attaching(utrace); ++ ++ /* ++ * Update the set of events of interest from the union ++ * of the interests of the remaining tracing engines. ++ * For any engine marked detached, remove it from the list. ++ * We'll collect them on the detached list. ++ */ ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ if (engine->ops == &utrace_detached_ops) { ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ } else { ++ flags |= engine->flags | UTRACE_EVENT(REAP); ++ wake = wake && !engine_wants_stop(engine); ++ } ++ } ++ ++ if (task->exit_state) { ++ /* ++ * Once it's already dead, we never install any flags ++ * except REAP. When ->exit_state is set and events ++ * like DEATH are not set, then they never can be set. ++ * This ensures that utrace_release_task() knows ++ * positively that utrace_report_death() can never run. ++ */ ++ BUG_ON(utrace->death); ++ flags &= UTRACE_EVENT(REAP); ++ wake = false; ++ } else if (!(flags & UTRACE_EVENT_SYSCALL) && ++ test_tsk_thread_flag(task, TIF_SYSCALL_TRACE)) { ++ clear_tsk_thread_flag(task, TIF_SYSCALL_TRACE); ++ } ++ ++ task->utrace_flags = flags; ++ ++ if (wake) ++ utrace_wakeup(task, utrace); ++ ++ /* ++ * If any engines are left, we're done. ++ */ ++ spin_unlock(&utrace->lock); ++ if (!flags) { ++ /* ++ * No more engines, cleared out the utrace. ++ */ ++ ++ if (action) ++ *action = UTRACE_RESUME; ++ } ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * You can't do anything to a dead task but detach it. ++ * If release_task() has been called, you can't do that. ++ * ++ * On the exit path, DEATH and QUIESCE event bits are set only ++ * before utrace_report_death() has taken the lock. At that point, ++ * the death report will come soon, so disallow detach until it's ++ * done. This prevents us from racing with it detaching itself. ++ * ++ * Called with utrace->lock held, when @target->exit_state is nonzero. 
++ */ ++static inline int utrace_control_dead(struct task_struct *target, ++ struct utrace *utrace, ++ enum utrace_resume_action action) ++{ ++ if (action != UTRACE_DETACH || unlikely(utrace->reap)) ++ return -ESRCH; ++ ++ if (unlikely(utrace->death)) ++ /* ++ * We have already started the death report. We can't ++ * prevent the report_death and report_reap callbacks, ++ * so tell the caller they will happen. ++ */ ++ return -EALREADY; ++ ++ return 0; ++} ++ ++/** ++ * utrace_control - control a thread being traced by a tracing engine ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is how a tracing engine asks a traced thread to do something. ++ * This call is controlled by the @action argument, which has the ++ * same meaning as the &enum utrace_resume_action value returned by ++ * event reporting callbacks. ++ * ++ * If @target is already dead (@target->exit_state nonzero), ++ * all actions except %UTRACE_DETACH fail with -%ESRCH. ++ * ++ * The following sections describe each option for the @action argument. ++ * ++ * UTRACE_DETACH: ++ * ++ * After this, the @engine data structure is no longer accessible, ++ * and the thread might be reaped. The thread will start running ++ * again if it was stopped and no longer has any attached engines ++ * that want it stopped. ++ * ++ * If the @report_reap callback may already have begun, this fails ++ * with -%ESRCH. If the @report_death callback may already have ++ * begun, this fails with -%EALREADY. ++ * ++ * If @target is not already stopped, then a callback to this engine ++ * might be in progress or about to start on another CPU. If so, ++ * then this returns -%EINPROGRESS; the detach happens as soon as ++ * the pending callback is finished. To synchronize after an ++ * -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * If @target is properly stopped before utrace_control() is called, ++ * then after successful return it's guaranteed that no more callbacks ++ * to the @engine->ops vector will be made. ++ * ++ * The only exception is %SIGKILL (and exec or group-exit by another ++ * thread in the group), which can cause asynchronous @report_death ++ * and/or @report_reap callbacks even when %UTRACE_STOP was used. ++ * (In that event, this fails with -%ESRCH or -%EALREADY, see above.) ++ * ++ * UTRACE_STOP: ++ * This asks that @target stop running. This returns 0 only if ++ * @target is already stopped, either for tracing or for job ++ * control. Then @target will remain stopped until another ++ * utrace_control() call is made on @engine; @target can be woken ++ * only by %SIGKILL (or equivalent, such as exec or termination by ++ * another thread in the same thread group). ++ * ++ * This returns -%EINPROGRESS if @target is not already stopped. ++ * Then the effect is like %UTRACE_REPORT. A @report_quiesce or ++ * @report_signal callback will be made soon. Your callback can ++ * then return %UTRACE_STOP to keep @target stopped. ++ * ++ * This does not interrupt system calls in progress, including ones ++ * that sleep for a long time. For that, use %UTRACE_INTERRUPT. ++ * To interrupt system calls and then keep @target stopped, your ++ * @report_signal callback can return %UTRACE_STOP. ++ * ++ * UTRACE_RESUME: ++ * ++ * Just let @target continue running normally, reversing the effect ++ * of a previous %UTRACE_STOP. If another engine is keeping @target ++ * stopped, then it remains stopped until all engines let it resume. 
++ * If @target was not stopped, this has no effect. ++ * ++ * UTRACE_REPORT: ++ * ++ * This is like %UTRACE_RESUME, but also ensures that there will be ++ * a @report_quiesce or @report_signal callback made soon. If ++ * @target had been stopped, then there will be a callback before it ++ * resumes running normally. If another engine is keeping @target ++ * stopped, then there might be no callbacks until all engines let ++ * it resume. ++ * ++ * UTRACE_INTERRUPT: ++ * ++ * This is like %UTRACE_REPORT, but ensures that @target will make a ++ * @report_signal callback before it resumes or delivers signals. ++ * If @target was in a system call or about to enter one, work in ++ * progress will be interrupted as if by %SIGSTOP. If another ++ * engine is keeping @target stopped, then there might be no ++ * callbacks until all engines let it resume. ++ * ++ * This gives @engine an opportunity to introduce a forced signal ++ * disposition via its @report_signal callback. ++ * ++ * UTRACE_SINGLESTEP: ++ * ++ * It's invalid to use this unless arch_has_single_step() returned true. ++ * This is like %UTRACE_RESUME, but resumes for one user instruction ++ * only. It's invalid to use this in utrace_control() unless @target ++ * had been stopped by @engine previously. ++ * ++ * Note that passing %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP to ++ * utrace_control() or returning it from an event callback alone does ++ * not necessarily ensure that stepping will be enabled. If there are ++ * more callbacks made to any engine before returning to user mode, ++ * then the resume action is chosen only by the last set of callbacks. ++ * To be sure, enable %UTRACE_EVENT(%QUIESCE) and look for the ++ * @report_quiesce callback with a zero event mask, or the ++ * @report_signal callback with %UTRACE_SIGNAL_REPORT. ++ * ++ * UTRACE_BLOCKSTEP: ++ * ++ * It's invalid to use this unless arch_has_block_step() returned true. ++ * This is like %UTRACE_SINGLESTEP, but resumes for one whole basic ++ * block of user instructions. ++ * ++ * %UTRACE_BLOCKSTEP devolves to %UTRACE_SINGLESTEP when another ++ * tracing engine is using %UTRACE_SINGLESTEP at the same time. ++ */ ++int utrace_control(struct task_struct *target, ++ struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ struct utrace *utrace; ++ bool resume; ++ int ret; ++ ++ if (unlikely(action > UTRACE_DETACH)) ++ return -EINVAL; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ if (target->exit_state) { ++ ret = utrace_control_dead(target, utrace, action); ++ if (ret) { ++ spin_unlock(&utrace->lock); ++ return ret; ++ } ++ } ++ ++ resume = utrace->stopped; ++ ret = 0; ++ ++ clear_engine_wants_stop(engine); ++ switch (action) { ++ case UTRACE_STOP: ++ mark_engine_wants_stop(engine); ++ if (!resume && !utrace_do_stop(target, utrace)) ++ ret = -EINPROGRESS; ++ resume = false; ++ break; ++ ++ case UTRACE_DETACH: ++ mark_engine_detached(engine); ++ resume = resume || utrace_do_stop(target, utrace); ++ if (!resume) { ++ /* ++ * As in utrace_set_events(), this barrier ensures ++ * that our engine->flags changes have hit before we ++ * examine utrace->reporting, pairing with the barrier ++ * in start_callback(). If @target has not yet hit ++ * finish_callback() to clear utrace->reporting, we ++ * might be in the middle of a callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ break; ++ } ++ /* Fall through. 
*/ ++ ++ case UTRACE_RESUME: ++ /* ++ * This and all other cases imply resuming if stopped. ++ * There might not be another report before it just ++ * resumes, so make sure single-step is not left set. ++ */ ++ if (likely(resume)) ++ user_disable_single_step(target); ++ break; ++ ++ case UTRACE_REPORT: ++ /* ++ * Make the thread call tracehook_notify_resume() soon. ++ * But don't bother if it's already been interrupted. ++ * In that case, utrace_get_signal() will be reporting soon. ++ */ ++ if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ break; ++ ++ case UTRACE_INTERRUPT: ++ /* ++ * Make the thread call tracehook_get_signal() soon. ++ */ ++ if (utrace->interrupt) ++ break; ++ utrace->interrupt = 1; ++ ++ /* ++ * If it's not already stopped, interrupt it now. ++ * We need the siglock here in case it calls ++ * recalc_sigpending() and clears its own ++ * TIF_SIGPENDING. By taking the lock, we've ++ * serialized any later recalc_sigpending() after ++ * our setting of utrace->interrupt to force it on. ++ */ ++ if (resume) { ++ /* ++ * This is really just to keep the invariant ++ * that TIF_SIGPENDING is set with utrace->interrupt. ++ * When it's stopped, we know it's always going ++ * through utrace_get_signal and will recalculate. ++ */ ++ set_tsk_thread_flag(target, TIF_SIGPENDING); ++ } else { ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ sighand = lock_task_sighand(target, &irqflags); ++ if (likely(sighand)) { ++ signal_wake_up(target, 0); ++ unlock_task_sighand(target, &irqflags); ++ } ++ } ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ /* ++ * Resume from stopped, step one block. ++ */ ++ if (unlikely(!arch_has_block_step())) { ++ WARN_ON(1); ++ /* Fall through to treat it as SINGLESTEP. */ ++ } else if (likely(resume)) { ++ user_enable_block_step(target); ++ break; ++ } ++ ++ case UTRACE_SINGLESTEP: ++ /* ++ * Resume from stopped, step one instruction. ++ */ ++ if (unlikely(!arch_has_single_step())) { ++ WARN_ON(1); ++ resume = false; ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (likely(resume)) ++ user_enable_single_step(target); ++ else ++ /* ++ * You were supposed to stop it before asking ++ * it to step. ++ */ ++ ret = -EAGAIN; ++ break; ++ } ++ ++ /* ++ * Let the thread resume running. If it's not stopped now, ++ * there is nothing more we need to do. ++ */ ++ if (resume) ++ utrace_reset(target, utrace, NULL); ++ else ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_control); ++ ++/** ++ * utrace_barrier - synchronize with simultaneous tracing callbacks ++ * @target: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This blocks while @target might be in the midst of making a callback to ++ * @engine. It can be interrupted by signals and will return -%ERESTARTSYS. ++ * A return value of zero means no callback from @target to @engine was ++ * in progress. Any effect of its return value (such as %UTRACE_STOP) has ++ * already been applied to @engine. ++ * ++ * It's not necessary to keep the @target pointer alive for this call. ++ * It's only necessary to hold a ref on @engine. This will return ++ * safely even if @target has been reaped and has no task refs. ++ * ++ * A successful return from utrace_barrier() guarantees its ordering ++ * with respect to utrace_set_events() and utrace_control() calls. 
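As a sketch of driving utrace_control() with the actions documented above (names illustrative): a zero return from UTRACE_STOP means the target is already stopped, while -EINPROGRESS means a report_quiesce or report_signal callback will arrive soon and can itself return UTRACE_STOP to keep the task stopped.

static int demo_stop(struct task_struct *task, struct utrace_engine *engine)
{
        int ret = utrace_control(task, engine, UTRACE_STOP);

        /* 0: already stopped; -EINPROGRESS: a callback will be made soon. */
        return ret == -EINPROGRESS ? 0 : ret;
}

static int demo_resume(struct task_struct *task, struct utrace_engine *engine)
{
        /* Reverse a previous UTRACE_STOP from this engine. */
        return utrace_control(task, engine, UTRACE_RESUME);
}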
If ++ * @target was not properly stopped, event callbacks just disabled might ++ * still be in progress; utrace_barrier() waits until there is no chance ++ * an unwanted callback can be in progress. ++ */ ++int utrace_barrier(struct task_struct *target, struct utrace_engine *engine) ++{ ++ struct utrace *utrace; ++ int ret = -ERESTARTSYS; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ do { ++ utrace = get_utrace_lock(target, engine, false); ++ if (unlikely(IS_ERR(utrace))) { ++ ret = PTR_ERR(utrace); ++ if (ret != -ERESTARTSYS) ++ break; ++ } else { ++ /* ++ * All engine state changes are done while ++ * holding the lock, i.e. before we get here. ++ * Since we have the lock, we only need to ++ * worry about @target making a callback. ++ * When it has entered start_callback() but ++ * not yet gotten to finish_callback(), we ++ * will see utrace->reporting == @engine. ++ * When @target doesn't take the lock, it uses ++ * barriers to order setting utrace->reporting ++ * before it examines the engine state. ++ */ ++ if (utrace->reporting != engine) ++ ret = 0; ++ spin_unlock(&utrace->lock); ++ if (!ret) ++ break; ++ } ++ schedule_timeout_interruptible(1); ++ } while (!signal_pending(current)); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_barrier); ++ ++/* ++ * This is local state used for reporting loops, perhaps optimized away. ++ */ ++struct utrace_report { ++ enum utrace_resume_action action; ++ u32 result; ++ bool detaches; ++ bool reports; ++ bool takers; ++ bool killed; ++}; ++ ++#define INIT_REPORT(var) \ ++ struct utrace_report var = { UTRACE_RESUME, 0, \ ++ false, false, false, false } ++ ++/* ++ * We are now making the report, so clear the flag saying we need one. ++ */ ++static void start_report(struct utrace *utrace) ++{ ++ BUG_ON(utrace->stopped); ++ if (utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * Complete a normal reporting pass, pairing with a start_report() call. ++ * This handles any UTRACE_DETACH or UTRACE_REPORT or UTRACE_INTERRUPT ++ * returns from engine callbacks. If any engine's last callback used ++ * UTRACE_STOP, we do UTRACE_REPORT here to ensure we stop before user ++ * mode. If there were no callbacks made, it will recompute ++ * @task->utrace_flags to avoid another false-positive. ++ */ ++static void finish_report(struct utrace_report *report, ++ struct task_struct *task, struct utrace *utrace) ++{ ++ bool clean = (report->takers && !report->detaches); ++ ++ if (report->action <= UTRACE_REPORT && !utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 1; ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } else if (report->action == UTRACE_INTERRUPT && !utrace->interrupt) { ++ spin_lock(&utrace->lock); ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else if (clean) { ++ return; ++ } else { ++ spin_lock(&utrace->lock); ++ } ++ ++ if (clean) ++ spin_unlock(&utrace->lock); ++ else ++ utrace_reset(task, utrace, &report->action); ++} ++ ++/* ++ * Apply the return value of one engine callback to @report. ++ * Returns true if @engine detached and should not get any more callbacks. ++ */ ++static bool finish_callback(struct utrace *utrace, ++ struct utrace_report *report, ++ struct utrace_engine *engine, ++ u32 ret) ++{ ++ enum utrace_resume_action action = utrace_resume_action(ret); ++ ++ report->result = ret & ~UTRACE_RESUME_MASK; ++ ++ /* ++ * If utrace_control() was used, treat that like UTRACE_DETACH here. 
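Putting utrace_control() and utrace_barrier() together, the detach pattern implied by the kernel-doc above looks roughly like this (sketch only, same assumed names):

static void demo_detach(struct task_struct *task, struct utrace_engine *engine)
{
        int ret = utrace_control(task, engine, UTRACE_DETACH);

        if (ret == -EINPROGRESS)
                /*
                 * A callback is in flight; the detach completes when it
                 * finishes.  utrace_barrier() waits for that point (and can
                 * return -ERESTARTSYS if we are interrupted by a signal).
                 */
                utrace_barrier(task, engine);
        /* -ESRCH / -EALREADY: reap or death reporting already took over. */

        utrace_engine_put(engine);      /* drop the reference taken at attach time */
}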
++ */ ++ if (action == UTRACE_DETACH || engine->ops == &utrace_detached_ops) { ++ engine->ops = &utrace_detached_ops; ++ report->detaches = true; ++ } else { ++ if (action < report->action) ++ report->action = action; ++ ++ if (action == UTRACE_STOP) { ++ if (!engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ mark_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } else { ++ if (action == UTRACE_REPORT) ++ report->reports = true; ++ ++ if (engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ clear_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } ++ } ++ ++ /* ++ * Now that we have applied the effect of the return value, ++ * clear this so that utrace_barrier() can stop waiting. ++ * A subsequent utrace_control() can stop or resume @engine ++ * and know this was ordered after its callback's action. ++ * ++ * We don't need any barriers here because utrace_barrier() ++ * takes utrace->lock. If we touched engine->flags above, ++ * the lock guaranteed this change was before utrace_barrier() ++ * examined utrace->reporting. ++ */ ++ utrace->reporting = NULL; ++ ++ /* ++ * This is a good place to make sure tracing engines don't ++ * introduce too much latency under voluntary preemption. ++ */ ++ if (need_resched()) ++ cond_resched(); ++ ++ return engine->ops == &utrace_detached_ops; ++} ++ ++/* ++ * Start the callbacks for @engine to consider @event (a bit mask). ++ * This makes the report_quiesce() callback first. If @engine wants ++ * a specific callback for @event, we return the ops vector to use. ++ * If not, we return NULL. The return value from the ops->callback ++ * function called should be passed to finish_callback(). ++ */ ++static const struct utrace_engine_ops *start_callback( ++ struct utrace *utrace, struct utrace_report *report, ++ struct utrace_engine *engine, struct task_struct *task, ++ unsigned long event) ++{ ++ const struct utrace_engine_ops *ops; ++ unsigned long want; ++ ++ /* ++ * This barrier ensures that we've set utrace->reporting before ++ * we examine engine->flags or engine->ops. utrace_barrier() ++ * relies on this ordering to indicate that the effect of any ++ * utrace_control() and utrace_set_events() calls is in place ++ * by the time utrace->reporting can be seen to be NULL. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(). ++ * It makes sure that we never see the old ops vector with ++ * the new flags, in case the original vector had no report_quiesce. ++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if (want & UTRACE_EVENT(QUIESCE)) { ++ if (finish_callback(utrace, report, engine, ++ (*ops->report_quiesce)(report->action, ++ engine, task, ++ event))) ++ return NULL; ++ ++ /* ++ * finish_callback() reset utrace->reporting after the ++ * quiesce callback. Now we set it again (as above) ++ * before re-examining engine->flags, which could have ++ * been changed synchronously by ->report_quiesce or ++ * asynchronously by utrace_control() or utrace_set_events(). ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ want = engine->flags; ++ } ++ ++ if (want & ENGINE_STOP) ++ report->action = UTRACE_STOP; ++ ++ if (want & event) { ++ report->takers = true; ++ return ops; ++ } ++ ++ utrace->reporting = NULL; ++ return NULL; ++} ++ ++/* ++ * Do a normal reporting pass for engines interested in @event. 
++ * @callback is the name of the member in the ops vector, and remaining ++ * args are the extras it takes after the standard three args. ++ */ ++#define REPORT(task, utrace, report, event, callback, ...) \ ++ do { \ ++ start_report(utrace); \ ++ REPORT_CALLBACKS(, task, utrace, report, event, callback, \ ++ (report)->action, engine, current, \ ++ ## __VA_ARGS__); \ ++ finish_report(report, task, utrace); \ ++ } while (0) ++#define REPORT_CALLBACKS(rev, task, utrace, report, event, callback, ...) \ ++ do { \ ++ struct utrace_engine *engine; \ ++ const struct utrace_engine_ops *ops; \ ++ list_for_each_entry##rev(engine, &utrace->attached, entry) { \ ++ ops = start_callback(utrace, report, engine, task, \ ++ event); \ ++ if (!ops) \ ++ continue; \ ++ finish_callback(utrace, report, engine, \ ++ (*ops->callback)(__VA_ARGS__)); \ ++ } \ ++ } while (0) ++ ++/* ++ * Called iff UTRACE_EVENT(EXEC) flag is set. ++ */ ++void utrace_report_exec(struct linux_binfmt *fmt, struct linux_binprm *bprm, ++ struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXEC), ++ report_exec, fmt, bprm, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_ENTRY) flag is set. ++ * Return true to prevent the system call. ++ */ ++bool utrace_report_syscall_entry(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ start_report(utrace); ++ REPORT_CALLBACKS(_reverse, task, utrace, &report, ++ UTRACE_EVENT(SYSCALL_ENTRY), report_syscall_entry, ++ report.result | report.action, engine, current, regs); ++ finish_report(&report, task, utrace); ++ ++ if (report.action == UTRACE_STOP && ++ unlikely(utrace_stop(task, utrace, false))) ++ /* ++ * We are continuing despite UTRACE_STOP because of a ++ * SIGKILL. Don't let the system call actually proceed. ++ */ ++ return true; ++ ++ return report.result == UTRACE_SYSCALL_ABORT; ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_EXIT) flag is set. ++ */ ++void utrace_report_syscall_exit(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(SYSCALL_EXIT), ++ report_syscall_exit, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(CLONE) flag is set. ++ * This notification call blocks the wake_up_new_task call on the child. ++ * So we must not quiesce here. tracehook_report_clone_complete will do ++ * a quiescence check momentarily. ++ */ ++void utrace_report_clone(unsigned long clone_flags, struct task_struct *child) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ /* ++ * We don't use the REPORT() macro here, because we need ++ * to clear utrace->cloning before finish_report(). ++ * After finish_report(), utrace can be a stale pointer ++ * in cases when report.action is still UTRACE_RESUME. ++ */ ++ start_report(utrace); ++ utrace->cloning = child; ++ ++ REPORT_CALLBACKS(, task, utrace, &report, ++ UTRACE_EVENT(CLONE), report_clone, ++ report.action, engine, task, clone_flags, child); ++ ++ utrace->cloning = NULL; ++ finish_report(&report, task, utrace); ++ ++ /* ++ * For a vfork, we will go into an uninterruptible block waiting ++ * for the child. We need UTRACE_STOP to happen before this, not ++ * after. 
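For the syscall-entry pass above, a callback sketch (the signature follows the REPORT_CALLBACKS invocation; the policy helper is purely illustrative): combining UTRACE_SYSCALL_ABORT with a resume action prevents the system call from running.

/* Illustrative policy hook: decide whether this system call should be blocked. */
static bool demo_should_block(struct task_struct *task, struct pt_regs *regs)
{
        return false;
}

static u32 demo_report_syscall_entry(u32 action, struct utrace_engine *engine,
                                     struct task_struct *task,
                                     struct pt_regs *regs)
{
        if (demo_should_block(task, regs))
                return UTRACE_SYSCALL_ABORT | UTRACE_RESUME;

        return UTRACE_RESUME;   /* let the system call proceed normally */
}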
For CLONE_VFORK, utrace_finish_vfork() will be called. ++ */ ++ if (report.action == UTRACE_STOP && (clone_flags & CLONE_VFORK)) { ++ spin_lock(&utrace->lock); ++ utrace->vfork_stop = 1; ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * We're called after utrace_report_clone() for a CLONE_VFORK. ++ * If UTRACE_STOP was left from the clone report, we stop here. ++ * After this, we'll enter the uninterruptible wait_for_completion() ++ * waiting for the child. ++ */ ++void utrace_finish_vfork(struct task_struct *task) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ if (!utrace->vfork_stop) ++ spin_unlock(&utrace->lock); ++ else { ++ utrace->vfork_stop = 0; ++ spin_unlock(&utrace->lock); ++ utrace_stop(task, utrace, false); ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(JCTL) flag is set. ++ * ++ * Called with siglock held. ++ */ ++void utrace_report_jctl(int notify, int what) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ bool stop = task_is_stopped(task); ++ ++ /* ++ * We have to come out of TASK_STOPPED in case the event report ++ * hooks might block. Since we held the siglock throughout, it's ++ * as if we were never in TASK_STOPPED yet at all. ++ */ ++ if (stop) { ++ __set_current_state(TASK_RUNNING); ++ task->signal->flags &= ~SIGNAL_STOP_STOPPED; ++ ++task->signal->group_stop_count; ++ } ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ /* ++ * We get here with CLD_STOPPED when we've just entered ++ * TASK_STOPPED, or with CLD_CONTINUED when we've just come ++ * out but not yet been through utrace_get_signal() again. ++ * ++ * While in TASK_STOPPED, we can be considered safely ++ * stopped by utrace_do_stop() and detached asynchronously. ++ * If we woke up and checked task->utrace_flags before that ++ * was finished, we might be here with utrace already ++ * removed or in the middle of being removed. ++ * ++ * If we are indeed attached, then make sure we are no ++ * longer considered stopped while we run callbacks. ++ */ ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ /* ++ * Do start_report()'s work too since we already have the lock anyway. ++ */ ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(JCTL), ++ report_jctl, what, notify); ++ ++ /* ++ * Retake the lock, and go back into TASK_STOPPED ++ * unless the stop was just cleared. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ if (stop && task->signal->group_stop_count > 0) { ++ __set_current_state(TASK_STOPPED); ++ if (--task->signal->group_stop_count == 0) ++ task->signal->flags |= SIGNAL_STOP_STOPPED; ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(EXIT) flag is set. ++ */ ++void utrace_report_exit(long *exit_code) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ long orig_code = *exit_code; ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXIT), ++ report_exit, orig_code, exit_code); ++ ++ if (report.action == UTRACE_STOP) ++ utrace_stop(task, utrace, false); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(DEATH) or UTRACE_EVENT(QUIESCE) flag is set. ++ * ++ * It is always possible that we are racing with utrace_release_task here. ++ * For this reason, utrace_release_task checks for the event bits that get ++ * us here, and delays its cleanup for us to do. 
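The utrace_attach_task() kernel-doc above notes that the @child pointer is safe to use inside a report_clone callback; a sketch of an engine that follows children by attaching the same ops to each new thread (signature taken from the REPORT_CALLBACKS invocation, names illustrative):

static u32 demo_report_clone(enum utrace_resume_action action,
                             struct utrace_engine *engine,
                             struct task_struct *parent,
                             unsigned long clone_flags,
                             struct task_struct *child)
{
        struct utrace_engine *child_engine;

        /* The creator gets first chance to attach, so PF_STARTING is no obstacle here. */
        child_engine = utrace_attach_task(child, UTRACE_ATTACH_CREATE |
                                          UTRACE_ATTACH_EXCLUSIVE |
                                          UTRACE_ATTACH_MATCH_OPS,
                                          &demo_ops, NULL);
        if (!IS_ERR(child_engine)) {
                utrace_set_events(child, child_engine,
                                  UTRACE_EVENT(CLONE) | UTRACE_EVENT(QUIESCE));
                utrace_engine_put(child_engine);
        }
        return UTRACE_RESUME;
}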
++ */ ++void utrace_report_death(struct task_struct *task, struct utrace *utrace, ++ bool group_dead, int signal) ++{ ++ INIT_REPORT(report); ++ ++ BUG_ON(!task->exit_state); ++ ++ /* ++ * We are presently considered "quiescent"--which is accurate ++ * inasmuch as we won't run any more user instructions ever again. ++ * But for utrace_control and utrace_set_events to be robust, they ++ * must be sure whether or not we will run any more callbacks. If ++ * a call comes in before we do, taking the lock here synchronizes ++ * us so we don't run any callbacks just disabled. Calls that come ++ * in while we're running the callbacks will see the exit.death ++ * flag and know that we are not yet fully quiescent for purposes ++ * of detach bookkeeping. ++ */ ++ spin_lock(&utrace->lock); ++ BUG_ON(utrace->death); ++ utrace->death = 1; ++ utrace->report = 0; ++ utrace->interrupt = 0; ++ spin_unlock(&utrace->lock); ++ ++ REPORT_CALLBACKS(, task, utrace, &report, UTRACE_EVENT(DEATH), ++ report_death, engine, task, group_dead, signal); ++ ++ spin_lock(&utrace->lock); ++ ++ /* ++ * After we unlock (possibly inside utrace_reap for callbacks) with ++ * this flag clear, competing utrace_control/utrace_set_events calls ++ * know that we've finished our callbacks and any detach bookkeeping. ++ */ ++ utrace->death = 0; ++ ++ if (utrace->reap) ++ /* ++ * utrace_release_task() was already called in parallel. ++ * We must complete its work now. ++ */ ++ utrace_reap(task, utrace); ++ else ++ utrace_reset(task, utrace, &report.action); ++} ++ ++/* ++ * Finish the last reporting pass before returning to user mode. ++ */ ++static void finish_resume_report(struct utrace_report *report, ++ struct task_struct *task, ++ struct utrace *utrace) ++{ ++ if (report->detaches || !report->takers) { ++ spin_lock(&utrace->lock); ++ utrace_reset(task, utrace, &report->action); ++ } ++ ++ switch (report->action) { ++ case UTRACE_STOP: ++ report->killed = utrace_stop(task, utrace, report->reports); ++ break; ++ ++ case UTRACE_INTERRUPT: ++ if (!signal_pending(task)) ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ if (likely(arch_has_block_step())) { ++ user_enable_block_step(task); ++ break; ++ } ++ ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_block_step() itself. Warn and ++ * then fall through to treat it as SINGLESTEP. ++ */ ++ WARN_ON(1); ++ ++ case UTRACE_SINGLESTEP: ++ if (likely(arch_has_single_step())) ++ user_enable_single_step(task); ++ else ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_single_step() itself. Spew ++ * about it so the loser will fix his module. ++ */ ++ WARN_ON(1); ++ break; ++ ++ case UTRACE_REPORT: ++ case UTRACE_RESUME: ++ default: ++ user_disable_single_step(task); ++ break; ++ } ++} ++ ++/* ++ * This is called when TIF_NOTIFY_RESUME had been set (and is now clear). ++ * We are close to user mode, and this is the place to report or stop. ++ * When we return, we're going to user mode or into the signals code. ++ */ ++void utrace_resume(struct task_struct *task, struct pt_regs *regs) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ ++ /* ++ * Some machines get here with interrupts disabled. The same arch ++ * code path leads to calling into get_signal_to_deliver(), which ++ * implicitly reenables them by virtue of spin_unlock_irq. 
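A sketch of the death-side callbacks, matching how utrace_report_death() and utrace_reap() above invoke them (names illustrative): returning UTRACE_DETACH from report_death ends this engine's involvement, while an engine that instead stays attached for UTRACE_EVENT(REAP) would supply report_reap as well.

static u32 demo_report_death(struct utrace_engine *engine,
                             struct task_struct *task,
                             bool group_dead, int signal)
{
        pr_debug("demo: %d exited (group_dead=%d, signal=%d)\n",
                 task_pid_nr(task), group_dead, signal);
        return UTRACE_DETACH;           /* nothing more to do for this task */
}

static void demo_report_reap(struct utrace_engine *engine,
                             struct task_struct *task)
{
        /* Last look at @task before release_task() finishes tearing it down. */
}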
++ */ ++ local_irq_enable(); ++ ++ /* ++ * If this flag is still set it's because there was a signal ++ * handler setup done but no report_signal following it. Clear ++ * the flag before we get to user so it doesn't confuse us later. ++ */ ++ if (unlikely(utrace->signal_handler)) { ++ int skip; ++ spin_lock(&utrace->lock); ++ utrace->signal_handler = 0; ++ skip = !utrace->report; ++ spin_unlock(&utrace->lock); ++ if (skip) ++ return; ++ } ++ ++ /* ++ * If UTRACE_INTERRUPT was just used, we don't bother with a report ++ * here. We will report and stop in utrace_get_signal(). In case ++ * of a race with utrace_control(), make sure we don't momentarily ++ * return to user mode because TIF_SIGPENDING was not set yet. ++ */ ++ if (unlikely(utrace->interrupt)) { ++ set_thread_flag(TIF_SIGPENDING); ++ return; ++ } ++ ++ /* ++ * Do a simple reporting pass, with no callback after report_quiesce. ++ */ ++ start_report(utrace); ++ ++ list_for_each_entry(engine, &utrace->attached, entry) ++ start_callback(utrace, &report, engine, task, 0); ++ ++ /* ++ * Finish the report and either stop or get ready to resume. ++ */ ++ finish_resume_report(&report, task, utrace); ++} ++ ++/* ++ * Return true if current has forced signal_pending(). ++ * ++ * This is called only when current->utrace_flags is nonzero, so we know ++ * that current->utrace must be set. It's not inlined in tracehook.h ++ * just so that struct utrace can stay opaque outside this file. ++ */ ++bool utrace_interrupt_pending(void) ++{ ++ return task_utrace_struct(current)->interrupt; ++} ++ ++/* ++ * Take the siglock and push @info back on our queue. ++ * Returns with @task->sighand->siglock held. ++ */ ++static void push_back_signal(struct task_struct *task, siginfo_t *info) ++ __acquires(task->sighand->siglock) ++{ ++ struct sigqueue *q; ++ ++ if (unlikely(!info->si_signo)) { /* Oh, a wise guy! */ ++ spin_lock_irq(&task->sighand->siglock); ++ return; ++ } ++ ++ q = sigqueue_alloc(); ++ if (likely(q)) { ++ q->flags = 0; ++ copy_siginfo(&q->info, info); ++ } ++ ++ spin_lock_irq(&task->sighand->siglock); ++ ++ sigaddset(&task->pending.signal, info->si_signo); ++ if (likely(q)) ++ list_add(&q->list, &task->pending.list); ++ ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++} ++ ++/* ++ * This is the hook from the signals code, called with the siglock held. ++ * Here is the ideal place to stop. We also dequeue and intercept signals. ++ */ ++int utrace_get_signal(struct task_struct *task, struct pt_regs *regs, ++ siginfo_t *info, struct k_sigaction *return_ka) ++ __releases(task->sighand->siglock) ++ __acquires(task->sighand->siglock) ++{ ++ struct utrace *utrace; ++ struct k_sigaction *ka; ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ const struct utrace_engine_ops *ops; ++ unsigned long event, want; ++ u32 ret; ++ int signr; ++ ++ utrace = &task->utrace; ++ if (utrace->interrupt || utrace->report || utrace->signal_handler) { ++ /* ++ * We've been asked for an explicit report before we ++ * even check for pending signals. ++ */ ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ spin_lock(&utrace->lock); ++ ++ splice_attaching(utrace); ++ ++ if (unlikely(!utrace->interrupt) && unlikely(!utrace->report)) ++ report.result = UTRACE_SIGNAL_IGN; ++ else if (utrace->signal_handler) ++ report.result = UTRACE_SIGNAL_HANDLER; ++ else ++ report.result = UTRACE_SIGNAL_REPORT; ++ ++ /* ++ * We are now making the report and it's on the ++ * interrupt path, so clear the flags asking for those. 
++ */ ++ utrace->interrupt = utrace->report = utrace->signal_handler = 0; ++ utrace->stopped = 0; ++ ++ /* ++ * Make sure signal_pending() only returns true ++ * if there are real signals pending. ++ */ ++ if (signal_pending(task)) { ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ if (unlikely(report.result == UTRACE_SIGNAL_IGN)) ++ /* ++ * We only got here to clear utrace->signal_handler. ++ */ ++ return -1; ++ ++ /* ++ * Do a reporting pass for no signal, just for EVENT(QUIESCE). ++ * The engine callbacks can fill in *info and *return_ka. ++ * We'll pass NULL for the @orig_ka argument to indicate ++ * that there was no original signal. ++ */ ++ event = 0; ++ ka = NULL; ++ memset(return_ka, 0, sizeof *return_ka); ++ } else if ((task->utrace_flags & UTRACE_EVENT_SIGNAL_ALL) == 0 && ++ !utrace->stopped) { ++ /* ++ * If no engine is interested in intercepting signals, ++ * let the caller just dequeue them normally. ++ */ ++ return 0; ++ } else { ++ if (unlikely(utrace->stopped)) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ spin_unlock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * Steal the next signal so we can let tracing engines ++ * examine it. From the signal number and sigaction, ++ * determine what normal delivery would do. If no ++ * engine perturbs it, we'll do that by returning the ++ * signal number after setting *return_ka. ++ */ ++ signr = dequeue_signal(task, &task->blocked, info); ++ if (signr == 0) ++ return signr; ++ BUG_ON(signr != info->si_signo); ++ ++ ka = &task->sighand->action[signr - 1]; ++ *return_ka = *ka; ++ ++ /* ++ * We are never allowed to interfere with SIGKILL. ++ * Just punt after filling in *return_ka for our caller. ++ */ ++ if (signr == SIGKILL) ++ return signr; ++ ++ if (ka->sa.sa_handler == SIG_IGN) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (ka->sa.sa_handler != SIG_DFL) { ++ event = UTRACE_EVENT(SIGNAL); ++ report.result = UTRACE_SIGNAL_DELIVER; ++ } else if (sig_kernel_coredump(signr)) { ++ event = UTRACE_EVENT(SIGNAL_CORE); ++ report.result = UTRACE_SIGNAL_CORE; ++ } else if (sig_kernel_ignore(signr)) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (signr == SIGSTOP) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_STOP; ++ } else if (sig_kernel_stop(signr)) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_TSTP; ++ } else { ++ event = UTRACE_EVENT(SIGNAL_TERM); ++ report.result = UTRACE_SIGNAL_TERM; ++ } ++ ++ /* ++ * Now that we know what event type this signal is, we ++ * can short-circuit if no engines care about those. ++ */ ++ if ((task->utrace_flags & (event | UTRACE_EVENT(QUIESCE))) == 0) ++ return signr; ++ ++ /* ++ * We have some interested engines, so tell them about ++ * the signal and let them change its disposition. ++ */ ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * This reporting pass chooses what signal disposition we'll act on. ++ */ ++ list_for_each_entry(engine, &utrace->attached, entry) { ++ /* ++ * See start_callback() comment about this barrier. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(), ++ * see start_callback() comments. 
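[Illustrative aside, not part of the patch: the task->utrace_flags bits tested above are populated with utrace_set_events(), whose prototype is assumed from utrace.h; the helper name is hypothetical.]

#include <linux/utrace.h>

/*
 * Subscribe @engine to every signal event plus QUIESCE so the interception
 * path in utrace_get_signal() above is taken, rather than the early
 * "let the caller just dequeue them normally" return.
 */
static int demo_trace_signals(struct task_struct *task,
			      struct utrace_engine *engine)
{
	return utrace_set_events(task, engine,
				 UTRACE_EVENT_SIGNAL_ALL |
				 UTRACE_EVENT(QUIESCE));
}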
++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if ((want & (event | UTRACE_EVENT(QUIESCE))) == 0) { ++ utrace->reporting = NULL; ++ continue; ++ } ++ ++ if (ops->report_signal) ++ ret = (*ops->report_signal)( ++ report.result | report.action, engine, task, ++ regs, info, ka, return_ka); ++ else ++ ret = (report.result | (*ops->report_quiesce)( ++ report.action, engine, task, event)); ++ ++ /* ++ * Avoid a tight loop reporting again and again if some ++ * engine is too stupid. ++ */ ++ switch (utrace_resume_action(ret)) { ++ default: ++ break; ++ case UTRACE_INTERRUPT: ++ case UTRACE_REPORT: ++ ret = (ret & ~UTRACE_RESUME_MASK) | UTRACE_RESUME; ++ break; ++ } ++ ++ finish_callback(utrace, &report, engine, ret); ++ } ++ ++ /* ++ * We express the chosen action to the signals code in terms ++ * of a representative signal whose default action does it. ++ * Our caller uses our return value (signr) to decide what to ++ * do, but uses info->si_signo as the signal number to report. ++ */ ++ switch (utrace_signal_action(report.result)) { ++ case UTRACE_SIGNAL_TERM: ++ signr = SIGTERM; ++ break; ++ ++ case UTRACE_SIGNAL_CORE: ++ signr = SIGQUIT; ++ break; ++ ++ case UTRACE_SIGNAL_STOP: ++ signr = SIGSTOP; ++ break; ++ ++ case UTRACE_SIGNAL_TSTP: ++ signr = SIGTSTP; ++ break; ++ ++ case UTRACE_SIGNAL_DELIVER: ++ signr = info->si_signo; ++ ++ if (return_ka->sa.sa_handler == SIG_DFL) { ++ /* ++ * We'll do signr's normal default action. ++ * For ignore, we'll fall through below. ++ * For stop/death, break locks and returns it. ++ */ ++ if (likely(signr) && !sig_kernel_ignore(signr)) ++ break; ++ } else if (return_ka->sa.sa_handler != SIG_IGN && ++ likely(signr)) { ++ /* ++ * Complete the bookkeeping after the report. ++ * The handler will run. If an engine wanted to ++ * stop or step, then make sure we do another ++ * report after signal handler setup. ++ */ ++ if (report.action != UTRACE_RESUME) ++ report.action = UTRACE_INTERRUPT; ++ finish_report(&report, task, utrace); ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ /* ++ * We do the SA_ONESHOT work here since the ++ * normal path will only touch *return_ka now. ++ */ ++ if (unlikely(return_ka->sa.sa_flags & SA_ONESHOT)) { ++ return_ka->sa.sa_flags &= ~SA_ONESHOT; ++ if (likely(valid_signal(signr))) { ++ ka = &task->sighand->action[signr - 1]; ++ ka->sa.sa_handler = SIG_DFL; ++ } ++ } ++ ++ return signr; ++ } ++ ++ /* Fall through for an ignored signal. */ ++ ++ case UTRACE_SIGNAL_IGN: ++ case UTRACE_SIGNAL_REPORT: ++ default: ++ /* ++ * If the signal is being ignored, then we are on the way ++ * directly back to user mode. We can stop here, or step, ++ * as in utrace_resume(), above. After we've dealt with that, ++ * our caller will relock and come back through here. ++ */ ++ finish_resume_report(&report, task, utrace); ++ ++ if (unlikely(report.killed)) { ++ /* ++ * The only reason we woke up now was because of a ++ * SIGKILL. Don't do normal dequeuing in case it ++ * might get a signal other than SIGKILL. That would ++ * perturb the death state so it might differ from ++ * what the debugger would have allowed to happen. ++ * Instead, pluck out just the SIGKILL to be sure ++ * we'll die immediately with nothing else different ++ * from the quiescent state the debugger wanted us in. 
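[Illustrative aside, not part of the patch: a sketch of a report_signal callback matching the (*ops->report_signal)() call in the loop above. The callback name is hypothetical and the const on @orig_ka is assumed from utrace.h; @orig_ka is NULL on the QUIESCE-only pass described earlier, so it doubles as the "is this a real dequeued signal?" test.]

#include <linux/utrace.h>

/*
 * Turn delivery of SIGUSR1 into a job-control stop (the switch above maps
 * UTRACE_SIGNAL_STOP to SIGSTOP) and leave every other signal's
 * disposition untouched.
 */
static u32 demo_report_signal(u32 action, struct utrace_engine *engine,
			      struct task_struct *task, struct pt_regs *regs,
			      siginfo_t *info,
			      const struct k_sigaction *orig_ka,
			      struct k_sigaction *return_ka)
{
	if (orig_ka && info->si_signo == SIGUSR1)
		return UTRACE_SIGNAL_STOP | UTRACE_RESUME;

	/* No change: hand back the signal and resume actions we were given. */
	return utrace_signal_action(action) | utrace_resume_action(action);
}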
++ */ ++ sigset_t sigkill_only; ++ siginitsetinv(&sigkill_only, sigmask(SIGKILL)); ++ spin_lock_irq(&task->sighand->siglock); ++ signr = dequeue_signal(task, &sigkill_only, info); ++ BUG_ON(signr != SIGKILL); ++ *return_ka = task->sighand->action[SIGKILL - 1]; ++ return signr; ++ } ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) { ++ push_back_signal(task, info); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ return -1; ++ } ++ ++ /* ++ * Complete the bookkeeping after the report. ++ * This sets utrace->report if UTRACE_STOP was used. ++ */ ++ finish_report(&report, task, utrace); ++ ++ return_ka->sa.sa_handler = SIG_DFL; ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (sig_kernel_stop(signr)) ++ task->signal->flags |= SIGNAL_STOP_DEQUEUED; ++ ++ return signr; ++} ++ ++/* ++ * This gets called after a signal handler has been set up. ++ * We set a flag so the next report knows it happened. ++ * If we're already stepping, make sure we do a report_signal. ++ * If not, make sure we get into utrace_resume() where we can ++ * clear the signal_handler flag before resuming. ++ */ ++void utrace_signal_handler(struct task_struct *task, int stepping) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->signal_handler = 1; ++ if (stepping) { ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else { ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/** ++ * utrace_prepare_examine - prepare to examine thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: temporary state, a &struct utrace_examiner pointer ++ * ++ * This call prepares to safely examine the thread @target using ++ * &struct user_regset calls, or direct access to thread-synchronous fields. ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, it must held stopped via %UTRACE_STOP by @engine. ++ * ++ * This call may block the caller until @target stays stopped, so it must ++ * be called only after the caller is sure @target is about to unschedule. ++ * This means a zero return from a utrace_control() call on @engine giving ++ * %UTRACE_STOP, or a report_quiesce() or report_signal() callback to ++ * @engine that used %UTRACE_STOP in its return value. ++ * ++ * Returns -%ESRCH if @target is dead or -%EINVAL if %UTRACE_STOP was ++ * not used. If @target has started running again despite %UTRACE_STOP ++ * (for %SIGKILL or a spurious wakeup), this call returns -%EAGAIN. ++ * ++ * When this call returns zero, it's safe to use &struct user_regset ++ * calls and task_user_regset_view() on @target and to examine some of ++ * its fields directly. When the examination is complete, a ++ * utrace_finish_examine() call must follow to check whether it was ++ * completed safely. 
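[Illustrative aside, not part of the patch: a minimal caller of the examine API documented above, pairing utrace_prepare_examine() with utrace_finish_examine() around regset access. The function name, the choice of regset 0, and the 2.6.31-era user_regset ->get() signature are assumptions.]

#include <linux/regset.h>
#include <linux/utrace.h>

/*
 * Copy @target's first regset (general registers on most arches) into @buf
 * while @engine holds @target in UTRACE_STOP.
 */
static int demo_read_regs(struct task_struct *target,
			  struct utrace_engine *engine,
			  void *buf, unsigned int size)
{
	const struct user_regset_view *view = task_user_regset_view(target);
	struct utrace_examiner exam;
	int ret;

	ret = utrace_prepare_examine(target, engine, &exam);
	if (ret)
		return ret;

	ret = view->regsets[0].get(target, &view->regsets[0],
				   0, size, buf, NULL);

	if (!ret)
		/* A failure here means @target ran meanwhile: discard @buf. */
		ret = utrace_finish_examine(target, engine, &exam);
	return ret;
}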
++ */ ++int utrace_prepare_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->exit_state)) ++ ret = -ESRCH; ++ else { ++ exam->state = target->state; ++ if (unlikely(exam->state == TASK_RUNNING)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ } ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ exam->ncsw = wait_task_inactive(target, exam->state); ++ put_task_struct(target); ++ if (unlikely(!exam->ncsw)) ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_prepare_examine); ++ ++/** ++ * utrace_finish_examine - complete an examination of thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: pointer passed to utrace_prepare_examine() call ++ * ++ * This call completes an examination on the thread @target begun by a ++ * paired utrace_prepare_examine() call with the same arguments that ++ * returned success (zero). ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, this returns zero if @target has remained unscheduled ++ * since the paired utrace_prepare_examine() call returned zero. ++ * ++ * When this returns an error, any examination done since the paired ++ * utrace_prepare_examine() call is unreliable and the data extracted ++ * should be discarded. The error is -%EINVAL if @engine is not ++ * keeping @target stopped, or -%EAGAIN if @target woke up unexpectedly. ++ */ ++int utrace_finish_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->state != exam->state)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ unsigned long ncsw = wait_task_inactive(target, exam->state); ++ if (unlikely(ncsw != exam->ncsw)) ++ ret = -EAGAIN; ++ put_task_struct(target); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_finish_examine); ++ ++/* ++ * This is declared in linux/regset.h and defined in machine-dependent ++ * code. We put the export here to ensure no machine forgets it. ++ */ ++EXPORT_SYMBOL_GPL(task_user_regset_view); ++ ++/* ++ * Called with rcu_read_lock() held. ++ */ ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p) ++{ ++ struct utrace *utrace = &p->utrace; ++ seq_printf(m, "Utrace:\t%lx%s%s%s\n", ++ p->utrace_flags, ++ utrace->stopped ? " (stopped)" : "", ++ utrace->report ? " (report)" : "", ++ utrace->interrupt ? 
" (interrupt)" : ""); ++} diff --git a/rpmmacros.in b/rpmmacros.in new file mode 100644 index 000000000..745559038 --- /dev/null +++ b/rpmmacros.in @@ -0,0 +1,7 @@ +%_topdir PWD +%_tmppath PWD/tmp +%__spec_install_pre %{___build_pre} +%_install_langs C:en_US:en +%_netsharedpath /proc:/dev/pts:/usr/share/info +%_excludedocs 1 +%__file_context_path /dev/null diff --git a/rpmmacros.sh b/rpmmacros.sh new file mode 100755 index 000000000..a87f78520 --- /dev/null +++ b/rpmmacros.sh @@ -0,0 +1 @@ +sed -e "s,PWD,$(pwd),g" rpmmacros.in > .rpmmacros diff --git a/sources b/sources new file mode 100644 index 000000000..05c590b6f --- /dev/null +++ b/sources @@ -0,0 +1,2 @@ +e2a867bcb1ad901981707edefc8f936b24b27090 http://mirror.onelab.eu/third-party/kernel-2.6.31.6-162.fc12.src.rpm +c14f136d15db7130c3121bbc634ab98c41f06394 http://vserver.13thfloor.at/Experimental/patch-2.6.31.6-vs2.3.0.36.27.diff -- 2.43.0