From bc1f73059fa29dbdd87d683265e765b323a34994 Mon Sep 17 00:00:00 2001 From: Thierry Parmentelat Date: Fri, 11 Dec 2009 14:21:29 +0000 Subject: [PATCH] first draft for a plain f12 kernel --- Makefile | 95 + kernel.spec | 3897 ++++++ linux-2.6-btrfs-upstream.patch | 10828 +++++++++++++++ linux-2.6-debug-vm-would-have-oomkilled.patch | 65 + linux-2.6-execshield.patch | 1013 ++ linux-2.6-utrace.patch | 4102 ++++++ original/kernel.spec | 3886 ++++++ original/linux-2.6-btrfs-upstream.patch | 10829 ++++++++++++++++ ...ux-2.6-debug-vm-would-have-oomkilled.patch | 65 + original/linux-2.6-execshield.patch | 1013 ++ original/linux-2.6-utrace.patch | 4102 ++++++ rpmmacros.in | 7 + rpmmacros.sh | 1 + sources | 2 + 14 files changed, 39905 insertions(+) create mode 100644 Makefile create mode 100644 kernel.spec create mode 100644 linux-2.6-btrfs-upstream.patch create mode 100644 linux-2.6-debug-vm-would-have-oomkilled.patch create mode 100644 linux-2.6-execshield.patch create mode 100644 linux-2.6-utrace.patch create mode 100644 original/kernel.spec create mode 100644 original/linux-2.6-btrfs-upstream.patch create mode 100644 original/linux-2.6-debug-vm-would-have-oomkilled.patch create mode 100644 original/linux-2.6-execshield.patch create mode 100644 original/linux-2.6-utrace.patch create mode 100644 rpmmacros.in create mode 100755 rpmmacros.sh create mode 100644 sources diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..545e454a9 --- /dev/null +++ b/Makefile @@ -0,0 +1,95 @@ +CURL ?= $(shell if test -f /usr/bin/curl ; then echo "curl -H Pragma: -O -R -S --fail --show-error" ; fi) +WGET ?= $(shell if test -f /usr/bin/wget ; then echo "wget -nd -m" ; fi) +CLIENT ?= $(if $(CURL),$(CURL),$(if $(WGET),$(WGET))) +AWK = awk +SHA1SUM = sha1sum +SED = sed + +SPECFILE = kernel.spec + +# Thierry - when called from within the build, PWD is /build +PWD=$(shell pwd) + +# get nevr from specfile. +ifndef NAME +NAME := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{NAME}\n" --specfile $(SPECFILE) | head -1) +endif +ifndef EPOCH +EPOCH := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{EPOCH}\n" --specfile $(SPECFILE) | head -1 | sed 's/(none)//') +endif +ifeq ($(EPOCH),(none)) +override EPOCH := "" +endif +ifndef VERSION +VERSION := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{VERSION}\n" --specfile $(SPECFILE)| head -1) +endif +ifndef RELEASE +RELEASE := $(shell rpm $(RPMDEFS) $(DISTDEFS) -q --qf "%{RELEASE}\n" --specfile $(SPECFILE)| head -1) +endif + +define get_sources_sha1 +$(shell cat sources 2>/dev/null | awk 'gensub("^.*/", "", 1, $$2) == "$@" { print $$1; exit; }') +endef +define get_sources_url +$(shell cat sources 2>/dev/null | awk 'gensub("^.*/", "", 1, $$2) == "$@" { print $$2; exit; }') +endef +SOURCEFILES := $(shell cat sources 2>/dev/null | awk '{ print gensub("^.*/", "", 1, $$2) }') +SOURCE_RPM := $(firstword $(SOURCEFILES)) + +sources: $(SOURCEFILES) $(TARGETS) + +$(SOURCEFILES): #FORCE + @if [ ! -e "$@" ] ; then echo "$(CLIENT) $(get_sources_url)" ; $(CLIENT) $(get_sources_url) ; fi + @if [ ! -e "$@" ] ; then echo "Could not download source file: $@ does not exist" ; exit 1 ; fi + @if test "$$(sha1sum $@ | awk '{print $$1}')" != "$(get_sources_sha1)" ; then \ + echo "sha1sum of the downloaded $@ does not match the one from 'sources' file" ; \ + echo "Local copy: $$(sha1sum $@)" ; \ + echo "In sources: $$(grep $@ sources)" ; \ + exit 1 ; \ + else \ + ls -l $@ ; \ + fi + +download-sources: + @for i in $(SOURCES); do \ + if [ ! 
-e "$${i##*/}" ]; then \ + echo "$(CLIENT) $$i"; \ + $(CLIENT) $$i; \ + fi; \ + done + +replace-sources: + rm -f sources + @$(MAKE) new-sources + +new-sources: download-sources + @for i in $(SOURCES); do \ + echo "$(SHA1SUM) $$i >> sources"; \ + $(SHA1SUM) $${i##*/} | $(AWK) '{ printf "%s %s\n", $$1, "'"$$i"'" }' >> sources; \ + done + +PREPARCH ?= noarch +RPMDIRDEFS = --define "_sourcedir $(PWD)" --define "_builddir $(PWD)" --define "_srcrpmdir $(PWD)" --define "_rpmdir $(PWD)" +trees: sources + rpmbuild $(RPMDIRDEFS) $(RPMDEFS) --nodeps -bp --target $(PREPARCH) $(SPECFILE) + +# use the stock source rpm, unwrap it, +# install our own specfile and patched patches +# and patch configs for IPV6 +# then rewrap with rpm +srpm: sources + mkdir SOURCES SRPMS + (cd SOURCES; rpm2cpio ../$(SOURCE_RPM) | cpio -diu; \ + cp ../$(SPECFILE) . ; cp ../linux*.patch . ; \ + sed -i -e s,CONFIG_IPV6=m,CONFIG_IPV6=y, config-generic) + ./rpmmacros.sh + export HOME=$(shell pwd) ; rpmbuild $(RPMDIRDEFS) $(RPMDEFS) --nodeps -bs SOURCES/$(SPECFILE) + cp $(SOURCE_RPM) $(EXPECTED_SRPM) + +TARGET ?= $(shell uname -m) +rpm: sources + rpmbuild $(RPMDIRDEFS) $(RPMDEFS) --nodeps --target $(TARGET) -bb $(SPECFILE) + +clean: + rm -f *.rpm + diff --git a/kernel.spec b/kernel.spec new file mode 100644 index 000000000..983e1743d --- /dev/null +++ b/kernel.spec @@ -0,0 +1,3897 @@ +# We have to override the new %%install behavior because, well... the kernel is special. +%global __spec_install_pre %{___build_pre} + +Summary: The Linux kernel + +# For a stable, released kernel, released_kernel should be 1. For rawhide +# and/or a kernel built from an rc or git snapshot, released_kernel should +# be 0. +%global released_kernel 1 + +# Versions of various parts + +# Polite request for people who spin their own kernel rpms: +# please modify the "buildid" define in a way that identifies +# that the kernel isn't the stock distribution kernel, for example, +# by setting the define to ".local" or ".bz123456" +# +###-vs- +%define buildid .vs2.3.0.36.27 + +# fedora_build defines which build revision of this kernel version we're +# building. Rather than incrementing forever, as with the prior versioning +# setup, we set fedora_cvs_origin to the current cvs revision s/1.// of the +# kernel spec when the kernel is rebased, so fedora_build automatically +# works out to the offset from the rebase, so it doesn't get too ginormous. +# +# If you're building on a branch, the RCS revision will be something like +# 1.1205.1.1. In this case we drop the initial 1, subtract fedora_cvs_origin +# from the second number, and then append the rest of the RCS string as is. +# Don't stare at the awk too long, you'll go blind. +%define fedora_cvs_origin 1786 +%define fedora_cvs_revision() %2 +%global fedora_build %(echo %{fedora_cvs_origin}.%{fedora_cvs_revision $Revision: 1.1948 $} | awk -F . '{ OFS = "."; ORS = ""; print $3 - $1 ; i = 4 ; OFS = ""; while (i <= NF) { print ".", $i ; i++} }') + +# base_sublevel is the kernel version we're starting with and patching +# on top of -- for example, 2.6.22-rc7-git1 starts with a 2.6.21 base, +# which yields a base_sublevel of 21. +%define base_sublevel 31 + +## If this is a released kernel ## +%if 0%{?released_kernel} + +# Do we have a -stable update to apply? +%define stable_update 6 +# Is it a -stable RC? 
+%define stable_rc 0 +# Set rpm version accordingly +%if 0%{?stable_update} +%define stablerev .%{stable_update} +%define stable_base %{stable_update} +%if 0%{?stable_rc} +# stable RCs are incremental patches, so we need the previous stable patch +%define stable_base %(echo $((%{stable_update} - 1))) +%endif +%endif +%define rpmversion 2.6.%{base_sublevel}%{?stablerev} + +## The not-released-kernel case ## +%else +# The next upstream release sublevel (base_sublevel+1) +%define upstream_sublevel %(echo $((%{base_sublevel} + 1))) +# The rc snapshot level +%define rcrev 9 +# The git snapshot level +%define gitrev 2 +# Set rpm version accordingly +%define rpmversion 2.6.%{upstream_sublevel} +%endif +# Nb: The above rcrev and gitrev values automagically define Patch00 and Patch01 below. + +# What parts do we want to build? We must build at least one kernel. +# These are the kernels that are built IF the architecture allows it. +# All should default to 1 (enabled) and be flipped to 0 (disabled) +# by later arch-specific checks. + +# The following build options are enabled by default. +# Use either --without in your rpmbuild command or force values +# to 0 in here to disable them. +# +# standard kernel +%define with_up %{?_without_up: 0} %{?!_without_up: 1} +# kernel-smp (only valid for ppc 32-bit) +%define with_smp %{?_without_smp: 0} %{?!_without_smp: 1} +# kernel-kdump +%define with_kdump %{?_without_kdump: 0} %{?!_without_kdump: 1} +# kernel-debug +%define with_debug %{?_without_debug: 0} %{?!_without_debug: 1} +# kernel-doc +%define with_doc %{?_without_doc: 0} %{?!_without_doc: 1} +# kernel-headers +%define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} +# kernel-firmware +%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 0} +# tools/perf +%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} +# perf noarch subpkg +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} +# kernel-debuginfo +%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1} +# kernel-bootwrapper (for creating zImages from kernel + initrd) +%define with_bootwrapper %{?_without_bootwrapper: 0} %{?!_without_bootwrapper: 1} +# Want to build a the vsdo directories installed +%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 1} +# Use dracut instead of mkinitrd for initrd image generation +%define with_dracut %{?_without_dracut: 0} %{?!_without_dracut: 1} + +# Build the kernel-doc package, but don't fail the build if it botches. +# Here "true" means "continue" and "false" means "fail the build". +%if 0%{?released_kernel} +%define doc_build_fail false +%else +%define doc_build_fail true +%endif + +%define rawhide_skip_docs 0 +%if 0%{?rawhide_skip_docs} +%define with_doc 0 +%endif + +# Additional options for user-friendly one-off kernel building: +# +# Only build the base kernel (--with baseonly): +%define with_baseonly %{?_with_baseonly: 1} %{?!_with_baseonly: 0} +# Only build the smp kernel (--with smponly): +%define with_smponly %{?_with_smponly: 1} %{?!_with_smponly: 0} +# Only build the debug kernel (--with dbgonly): +%define with_dbgonly %{?_with_dbgonly: 1} %{?!_with_dbgonly: 0} + +# should we do C=1 builds with sparse +%define with_sparse %{?_with_sparse: 1} %{?!_with_sparse: 0} + +# Set debugbuildsenabled to 1 for production (build separate debug kernels) +# and 0 for rawhide (all kernels are debug kernels). +# See also 'make debug' and 'make release'. 
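+# Editor's note (illustrative): the with_* options above follow the usual
+# rpmbuild convention, e.g. "rpmbuild --without debug ..." defines
+# %%{_without_debug} and flips with_debug to 0, and "--with baseonly" defines
+# %%{_with_baseonly}; debugbuildsenabled below is a plain in-spec switch with
+# no corresponding command-line flag.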
+%define debugbuildsenabled 1 + +# Want to build a vanilla kernel build without any non-upstream patches? +# (well, almost none, we need nonintconfig for build purposes). Default to 0 (off). +%define with_vanilla %{?_with_vanilla: 1} %{?!_with_vanilla: 0} + +# pkg_release is what we'll fill in for the rpm Release: field +%if 0%{?released_kernel} + +%if 0%{?stable_rc} +%define stable_rctag .rc%{stable_rc} +%endif +%define pkg_release %{fedora_build}%{?stable_rctag}%{?buildid}%{?dist} + +%else + +# non-released_kernel +%if 0%{?rcrev} +%define rctag .rc%rcrev +%endif +%if 0%{?gitrev} +%define gittag .git%gitrev +%if !0%{?rcrev} +%define rctag .rc0 +%endif +%endif +%define pkg_release 0.%{fedora_build}%{?rctag}%{?gittag}%{?buildid}%{?dist} + +%endif + +# The kernel tarball/base version +%define kversion 2.6.%{base_sublevel} + +%define make_target bzImage + +%define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE}.%{_target_cpu} +%define hdrarch %_target_cpu +%define asmarch %_target_cpu + +%if 0%{!?nopatches:1} +%define nopatches 0 +%endif + +%if %{with_vanilla} +%define nopatches 1 +%endif + +%if %{nopatches} +%define with_bootwrapper 0 +%define variant -vanilla +%else +%define variant_fedora -fedora +%endif + +%define using_upstream_branch 0 +%if 0%{?upstream_branch:1} +%define stable_update 0 +%define using_upstream_branch 1 +%define variant -%{upstream_branch}%{?variant_fedora} +%define pkg_release 0.%{fedora_build}%{upstream_branch_tag}%{?buildid}%{?dist} +%endif + +%if !%{debugbuildsenabled} +%define with_debug 0 +%endif + +%if !%{with_debuginfo} +%define _enable_debug_packages 0 +%endif +%define debuginfodir /usr/lib/debug + +# kernel-PAE is only built on i686. +%ifarch i686 +%define with_pae 1 +%else +%define with_pae 0 +%endif + +# if requested, only build base kernel +%if %{with_baseonly} +%define with_smp 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build smp kernel +%if %{with_smponly} +%define with_up 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build debug kernel +%if %{with_dbgonly} +%if %{debugbuildsenabled} +%define with_up 0 +%endif +%define with_smp 0 +%define with_pae 0 +%define with_xen 0 +%define with_kdump 0 +%define with_perftool 0 +%endif + +%define all_x86 i386 i686 + +%if %{with_vdso_install} +# These arches install vdso/ directories. +%define vdso_arches %{all_x86} x86_64 ppc ppc64 +%endif + +# Overrides for generic default options + +# only ppc and alphav56 need separate smp kernels +%ifnarch ppc alphaev56 +%define with_smp 0 +%endif + +# only build kernel-kdump on ppc64 +# (no relocatable kernel support upstream yet) +#FIXME: Temporarily disabled to speed up builds. 
+#ifnarch ppc64 +%define with_kdump 0 +#endif + +# don't do debug builds on anything but i686 and x86_64 +%ifnarch i686 x86_64 +%define with_debug 0 +%endif + +# only package docs noarch +%ifnarch noarch +%define with_doc 0 +%define with_perf 0 +%endif + +# don't build noarch kernels or headers (duh) +%ifarch noarch +%define with_up 0 +%define with_headers 0 +%define all_arch_configs kernel-%{version}-*.config +%define with_firmware %{?_without_firmware: 0} %{?!_without_firmware: 1} +%endif + +# bootwrapper is only on ppc +%ifnarch ppc ppc64 +%define with_bootwrapper 0 +%endif + +# sparse blows up on ppc64 alpha and sparc64 +%ifarch ppc64 ppc alpha sparc64 +%define with_sparse 0 +%endif + +# Per-arch tweaks + +%ifarch %{all_x86} +%define asmarch x86 +%define hdrarch i386 +%define all_arch_configs kernel-%{version}-i?86*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch x86_64 +%define asmarch x86 +%define all_arch_configs kernel-%{version}-x86_64*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch ppc64 +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc64*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch s390x +%define asmarch s390 +%define hdrarch s390 +%define all_arch_configs kernel-%{version}-s390x.config +%define image_install_path boot +%define make_target image +%define kernel_image arch/s390/boot/image +%endif + +%ifarch sparc +# We only build sparc headers since we dont support sparc32 hardware +%endif + +%ifarch sparc64 +%define asmarch sparc +%define all_arch_configs kernel-%{version}-sparc64*.config +%define make_target image +%define kernel_image arch/sparc/boot/image +%define image_install_path boot +%define with_perftool 0 +%endif + +%ifarch ppc +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc{-,.}*config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch ia64 +%define all_arch_configs kernel-%{version}-ia64*.config +%define image_install_path boot/efi/EFI/redhat +%define make_target compressed +%define kernel_image vmlinux.gz +%endif + +%ifarch alpha alphaev56 +%define all_arch_configs kernel-%{version}-alpha*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%ifarch %{arm} +%define all_arch_configs kernel-%{version}-arm*.config +%define image_install_path boot +%define hdrarch arm +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%if %{nopatches} +# XXX temporary until last vdso patches are upstream +%define vdso_arches ppc ppc64 +%endif + +%if %{nopatches}%{using_upstream_branch} +# Ignore unknown options in our config-* files. +# Some options go with patches we're not applying. +%define oldconfig_target loose_nonint_oldconfig +%else +%define oldconfig_target nonint_oldconfig +%endif + +# To temporarily exclude an architecture from being built, add it to +# %nobuildarches. Do _NOT_ use the ExclusiveArch: line, because if we +# don't build kernel-headers then the new build system will no longer let +# us use the previous build of that package -- it'll just be completely AWOL. +# Which is a BadThing(tm). + +# We don't build a kernel on i386; we only do kernel-headers there, +# and we no longer build for 31bit S390. 
Same for 32bit sparc and arm. +%define nobuildarches i386 s390 sparc %{arm} + +%ifarch %nobuildarches +%define with_up 0 +%define with_smp 0 +%define with_pae 0 +%define with_kdump 0 +%define with_debuginfo 0 +%define with_perftool 0 +%define _enable_debug_packages 0 +%endif + +%define with_pae_debug 0 +%if %{with_pae} +%define with_pae_debug %{with_debug} +%endif + +# +# Three sets of minimum package version requirements in the form of Conflicts: +# to versions below the minimum +# + +# +# First the general kernel 2.6 required versions as per +# Documentation/Changes +# +%define kernel_dot_org_conflicts ppp < 2.4.3-3, isdn4k-utils < 3.2-32, nfs-utils < 1.0.7-12, e2fsprogs < 1.37-4, util-linux < 2.12, jfsutils < 1.1.7-2, reiserfs-utils < 3.6.19-2, xfsprogs < 2.6.13-4, procps < 3.2.5-6.3, oprofile < 0.9.1-2 + +# +# Then a series of requirements that are distribution specific, either +# because we add patches for something, or the older versions have +# problems with the newer kernel or lack certain things that make +# integration in the distro harder than needed. +# +%define package_conflicts initscripts < 7.23, udev < 063-6, iptables < 1.3.2-1, ipw2200-firmware < 2.4, iwl4965-firmware < 228.57.2, selinux-policy-targeted < 1.25.3-14, squashfs-tools < 4.0, wireless-tools < 29-3 + +# +# The ld.so.conf.d file we install uses syntax older ldconfig's don't grok. +# +%define kernel_xen_conflicts glibc < 2.3.5-1, xen < 3.0.1 + +%define kernel_PAE_obsoletes kernel-smp < 2.6.17, kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_PAE_provides kernel-xen = %{rpmversion}-%{pkg_release} + +%ifarch x86_64 +%define kernel_obsoletes kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_provides kernel-xen = %{rpmversion}-%{pkg_release} +%endif + +# We moved the drm include files into kernel-headers, make sure there's +# a recent enough libdrm-devel on the system that doesn't have those. +%define kernel_headers_conflicts libdrm-devel < 2.4.0-0.15 + +# +# Packages that need to be installed before the kernel is, because the %post +# scripts use them. +# +%define kernel_prereq fileutils, module-init-tools, initscripts >= 8.11.1-1, kernel-firmware >= %{rpmversion}-%{fedora_build}, grubby >= 7.0.4-1 +%if %{with_dracut} +%define initrd_prereq dracut >= 001-7 +%else +%define initrd_prereq mkinitrd >= 6.0.61-1 +%endif + +# +# This macro does requires, provides, conflicts, obsoletes for a kernel package. +# %%kernel_reqprovconf +# It uses any kernel__conflicts and kernel__obsoletes +# macros defined above. 
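+# Editor's note (sketch, not authoritative): the %%{expand:...} lines in the
+# macro below look up kernel%%{?1:_%%{1}}_conflicts / _obsoletes / _provides,
+# so a bare %%kernel_reqprovconf picks up the plain kernel_obsoletes and
+# kernel_provides (e.g. the x86_64 ones above), while a variant argument such
+# as PAE would make it look for kernel_PAE_obsoletes and kernel_PAE_provides
+# instead.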
+# +%define kernel_reqprovconf \ +Provides: kernel = %{rpmversion}-%{pkg_release}\ +Provides: kernel-%{_target_cpu} = %{rpmversion}-%{pkg_release}%{?1:.%{1}}\ +Provides: kernel-drm = 4.3.0\ +Provides: kernel-drm-nouveau = 15\ +Provides: kernel-modeset = 1\ +Provides: kernel-uname-r = %{KVERREL}%{?1:.%{1}}\ +Requires(pre): %{kernel_prereq}\ +Requires(pre): %{initrd_prereq}\ +Requires(post): /sbin/new-kernel-pkg\ +Requires(preun): /sbin/new-kernel-pkg\ +Conflicts: %{kernel_dot_org_conflicts}\ +Conflicts: %{package_conflicts}\ +%{expand:%%{?kernel%{?1:_%{1}}_conflicts:Conflicts: %%{kernel%{?1:_%{1}}_conflicts}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_obsoletes:Obsoletes: %%{kernel%{?1:_%{1}}_obsoletes}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_provides:Provides: %%{kernel%{?1:_%{1}}_provides}}}\ +# We can't let RPM do the dependencies automatic because it'll then pick up\ +# a correct but undesirable perl dependency from the module headers which\ +# isn't required for the kernel proper to function\ +AutoReq: no\ +AutoProv: yes\ +%{nil} + +Name: kernel%{?variant} +Group: System Environment/Kernel +License: GPLv2 +URL: http://www.kernel.org/ +Version: %{rpmversion} +Release: %{pkg_release} +# DO NOT CHANGE THE 'ExclusiveArch' LINE TO TEMPORARILY EXCLUDE AN ARCHITECTURE BUILD. +# SET %%nobuildarches (ABOVE) INSTEAD +ExclusiveArch: noarch %{all_x86} x86_64 ppc ppc64 ia64 sparc sparc64 s390x alpha alphaev56 %{arm} +ExclusiveOS: Linux + +%kernel_reqprovconf +%ifarch x86_64 sparc64 +Obsoletes: kernel-smp +%endif + + +# +# List the packages used during the kernel build +# +BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, sh-utils, tar +BuildRequires: bzip2, findutils, gzip, m4, perl, make >= 3.78, diffutils, gawk +BuildRequires: gcc >= 3.4.2, binutils >= 2.12, redhat-rpm-config +BuildRequires: net-tools +BuildRequires: xmlto, asciidoc +%if %{with_sparse} +BuildRequires: sparse >= 0.4.1 +%endif +%if %{with_perftool} +BuildRequires: elfutils-libelf-devel zlib-devel binutils-devel +%endif +BuildConflicts: rhbuildsys(DiskFree) < 500Mb + +%define fancy_debuginfo 0 +%if %{with_debuginfo} +%if 0%{?fedora} >= 8 || 0%{?rhel} >= 6 +%define fancy_debuginfo 1 +%endif +%endif + +%if %{fancy_debuginfo} +# Fancy new debuginfo generation introduced in Fedora 8. +BuildRequires: rpm-build >= 4.4.2.1-4 +%define debuginfo_args --strict-build-id +%endif + +Source0: ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-%{kversion}.tar.bz2 + +Source11: genkey +Source14: find-provides +Source15: merge.pl + +Source20: Makefile.config +Source21: config-debug +Source22: config-nodebug +Source23: config-generic +Source24: config-rhel-generic + +Source30: config-x86-generic +Source31: config-i686-PAE + +Source40: config-x86_64-generic + +Source50: config-powerpc-generic +Source51: config-powerpc32-generic +Source52: config-powerpc32-smp +Source53: config-powerpc64 + +Source60: config-ia64-generic + +Source70: config-s390x + +Source90: config-sparc64-generic + +Source100: config-arm + +Source200: perf + +# Here should be only the patches up to the upstream canonical Linus tree. 
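+# Editor's note, worked example: with the values used in this spec
+# (base_sublevel=31, stable_update=6, stable_rc=0) the macros above yield
+# stable_base=6 and rpmversion=2.6.31.6, so Patch00 below resolves to
+# patch-2.6.31.6.bz2 and no -rc Patch01 is generated.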
+ +# For a stable release kernel +%if 0%{?stable_update} +%if 0%{?stable_base} +%define stable_patch_00 patch-2.6.%{base_sublevel}.%{stable_base}.bz2 +Patch00: %{stable_patch_00} +%endif +%if 0%{?stable_rc} +%define stable_patch_01 patch-2.6.%{base_sublevel}.%{stable_update}-rc%{stable_rc}.bz2 +Patch01: %{stable_patch_01} +%endif + +# non-released_kernel case +# These are automagically defined by the rcrev and gitrev values set up +# near the top of this spec file. +%else +%if 0%{?rcrev} +Patch00: patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} +Patch01: patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +Patch00: patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif +%endif + +%if %{using_upstream_branch} +### BRANCH PATCH ### +%endif + +Patch02: git-linus.diff + +# we always need nonintconfig, even for -vanilla kernels +Patch03: linux-2.6-build-nonintconfig.patch + +# we also need compile fixes for -vanilla +Patch04: linux-2.6-compile-fixes.patch + +# build tweak for build ID magic, even for -vanilla +Patch05: linux-2.6-makefile-after_link.patch + +###-vs- http://vserver.13thfloor.at/ExperimentalT/patch-2.6.31.6-vs2.3.0.36.27.diff +Patch06: patch-2.6.31.6-vs2.3.0.36.27.diff + +%if !%{nopatches} + +# revert upstream patches we get via other methods +Patch09: linux-2.6-upstream-reverts.patch +# Git trees. +Patch10: git-cpufreq.patch +Patch11: git-bluetooth.patch + +# Standalone patches +Patch20: linux-2.6-hotfixes.patch + +Patch21: linux-2.6-tracehook.patch +Patch22: linux-2.6-utrace.patch + +Patch30: sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +Patch31: disable-stackprotector-all.patch + +# Intel IOMMU fixes/workarounds +Patch100: linux-2.6-die-closed-source-bios-muppets-die.patch +Patch101: linux-2.6-intel-iommu-updates.patch +Patch102: linux-2.6-iommu-at-zero.patch +Patch103: linux-2.6-iommu-dmar-all-1s.patch +Patch104: linux-2.6-iommu-another-hp-screwup.patch +Patch105: linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +Patch106: linux-2.6-iommu-hp-cantiga-resume.patch + +Patch141: linux-2.6-ps3-storage-alias.patch +Patch143: linux-2.6-g5-therm-shutdown.patch +Patch144: linux-2.6-vio-modalias.patch +Patch147: linux-2.6-imac-transparent-bridge.patch + +Patch150: linux-2.6.29-sparc-IOC_TYPECHECK.patch + +Patch160: linux-2.6-execshield.patch + +Patch250: linux-2.6-debug-sizeof-structs.patch +Patch260: linux-2.6-debug-nmi-timeout.patch +Patch270: linux-2.6-debug-taint-vm.patch +Patch280: linux-2.6-debug-spinlock-taint.patch +Patch300: linux-2.6-driver-level-usb-autosuspend.diff +Patch302: linux-2.6-qcserial-autosuspend.diff +Patch303: linux-2.6-bluetooth-autosuspend.diff +Patch304: linux-2.6-usb-uvc-autosuspend.diff +Patch340: linux-2.6-debug-vm-would-have-oomkilled.patch +Patch360: linux-2.6-debug-always-inline-kzalloc.patch +Patch380: linux-2.6-defaults-pci_no_msi.patch +Patch381: linux-2.6-pciehp-update.patch +Patch382: linux-2.6-defaults-pciehp.patch +Patch383: linux-2.6-defaults-aspm.patch +Patch390: linux-2.6-defaults-acpi-video.patch +Patch391: linux-2.6-acpi-video-dos.patch +Patch450: linux-2.6-input-kill-stupid-messages.patch +Patch451: linux-2.6-input-fix-toshiba-hotkeys.patch +Patch452: linux-2.6.30-no-pcspkr-modalias.patch + +Patch460: linux-2.6-serial-460800.patch + +Patch470: die-floppy-die.patch + +Patch500: linux-2.6.31-copy_from_user-bounds.patch + +Patch510: linux-2.6-silence-noise.patch +Patch520: linux-2.6.30-hush-rom-warning.patch +Patch530: 
linux-2.6-silence-fbcon-logo.patch +Patch570: linux-2.6-selinux-mprotect-checks.patch +Patch580: linux-2.6-sparc-selinux-mprotect-checks.patch + +Patch600: linux-2.6-defaults-alsa-hda-beep-off.patch +Patch601: linux-2.6-alsa-improve-hda-powerdown.patch +Patch610: hda_intel-prealloc-4mb-dmabuffer.patch +Patch611: alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +Patch670: linux-2.6-ata-quirk.patch +Patch671: linux-2.6-ahci-export-capabilities.patch + +Patch680: prism54-remove-pci-dev-table.patch +Patch681: linux-2.6-ath9k-fixes.patch + +Patch800: linux-2.6-crash-driver.patch + +Patch900: linux-2.6-pci-cacheline-sizing.patch + +# ACPI +Patch1100: linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +Patch1110: acpi-ec-merge-irq-and-poll-modes.patch +Patch1120: acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +Patch1130: acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +Patch1515: lirc-2.6.31.patch +Patch1517: hdpvr-ir-enable.patch +Patch1518: hid-ignore-all-recent-imon-devices.patch + +# virt + ksm patches +Patch1550: linux-2.6-ksm.patch +Patch1551: linux-2.6-ksm-kvm.patch +Patch1552: linux-2.6-ksm-updates.patch +Patch1553: linux-2.6-ksm-fix-munlock.patch +Patch1554: linux-2.6-ksm-updates-from-32.patch +Patch1579: linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +Patch1583: linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +Patch1584: linux-2.6-xen-improvement-to-wait_for_devices.patch +Patch1585: linux-2.6-xen-increase-device-connection-timeout.patch +Patch1586: linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# nouveau + drm fixes +Patch1810: kms-offb-handoff.patch +Patch1812: drm-next-b390f944.patch +Patch1813: drm-radeon-pm.patch +Patch1814: drm-nouveau.patch +Patch1818: drm-i915-resume-force-mode.patch +# intel drm is all merged upstream +Patch1824: drm-intel-next.patch +Patch1825: drm-intel-pm.patch +Patch1826: drm-intel-no-tv-hotplug.patch +Patch1827: drm-i915-fix-tvmode-oops.patch +Patch1831: drm-conservative-fallback-modes.patch +Patch1832: drm-edid-retry.patch +Patch1834: drm-edid-header-fixup.patch +Patch1835: drm-default-mode.patch +Patch1837: drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +Patch1839: drm-radeon-misc-fixes.patch +Patch1840: drm-radeon-rv410-test-fix.patch + +# vga arb +Patch1900: linux-2.6-vga-arb.patch +Patch1901: drm-vga-arb.patch +Patch1902: drm-radeon-kms-arbiter-return-ignore.patch + +# make harmless fbcon debug less loud +Patch1903: fbcon-lower-debug.patch + +# kludge to make ich9 e1000 work +Patch2000: linux-2.6-e1000-ich9.patch + +# linux1394 git patches +Patch2200: linux-2.6-firewire-git-update.patch +Patch2201: linux-2.6-firewire-git-pending.patch + +# Quiet boot fixes +# silence the ACPI blacklist code +Patch2802: linux-2.6-silence-acpi-blacklist.patch + +Patch2899: linux-2.6-v4l-dvb-fixes.patch +Patch2900: linux-2.6-v4l-dvb-update.patch +Patch2901: linux-2.6-v4l-dvb-experimental.patch +Patch2904: v4l-dvb-fix-cx25840-firmware-loading.patch + +# fs fixes + +#btrfs +Patch3000: linux-2.6-btrfs-upstream.patch + +# NFSv4 +Patch3050: linux-2.6-nfsd4-proots.patch +Patch3060: linux-2.6-nfs4-ver4opt.patch +Patch3061: linux-2.6-nfs4-callback-hidden.patch + +# VIA Nano / VX8xx updates +Patch11010: via-hwmon-temp-sensor.patch + +# patches headed upstream +Patch12010: linux-2.6-dell-laptop-rfkill-fix.patch +Patch12011: linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch +Patch12012: linux-2.6-rtc-show-hctosys.patch +Patch12013: 
linux-2.6-rfkill-all.patch +Patch12014: linux-2.6-selinux-module-load-perms.patch + +# sched fixes cherry-picked from 2.6.32 +Patch13100: sched-deal-with-low-load-in-wake-affine.patch +Patch13101: sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +Patch13102: sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +Patch13110: sched-retune-scheduler-latency-defaults.patch +# Fix huge wakeup latencies +Patch13120: sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +Patch14010: perf-make-perf-counter-h-available-to-userspace.patch + +# fix resource counter issues on *big* machines +Patch14101: improve-resource-counter-scalability.patch + +# fix perf for sysprof +Patch14420: perf-events-fix-swevent-hrtimer-sampling.patch +Patch14421: perf-events-dont-generate-events-for-the-idle-task.patch + +Patch14430: crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +Patch14451: tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +Patch14452: tg3-02-fix-tso-test-against-wrong-flags-var.patch +Patch14453: tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +Patch14454: tg3-04-prevent-tx-bd-corruption.patch +Patch14455: tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +Patch14456: tg3-06-fix-5906-transmit-hangs.patch + +Patch14460: highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +Patch14461: highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +Patch14462: highmem-fix-arm-powerpc-kmap_types.patch + +Patch14463: dlm-fix-connection-close-handling.patch + +# rhbz#544144 [bbf31bf18d34caa87dd01f08bf713635593697f2] +Patch14464: ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +%endif + +BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root + +%description +The kernel package contains the Linux kernel (vmlinuz), the core of any +Linux operating system. The kernel handles the basic functions +of the operating system: memory allocation, process allocation, device +input and output, etc. + + +%package doc +Summary: Various documentation bits found in the kernel source +Group: Documentation +%description doc +This package contains documentation files from the kernel +source. Various bits of information about the Linux kernel and the +device drivers shipped with it are documented in these files. + +You'll want to install this package if you need a reference to the +options that can be passed to Linux kernel modules at load time. + + +%package headers +Summary: Header files for the Linux kernel for use by glibc +Group: Development/System +Obsoletes: glibc-kernheaders +Provides: glibc-kernheaders = 3.0-46 +%description headers +Kernel-headers includes the C header files that specify the interface +between the Linux kernel and userspace libraries and programs. The +header files define structures and constants that are needed for +building most standard programs and are also needed for rebuilding the +glibc package. + +%package firmware +Summary: Firmware files used by the Linux kernel +Group: Development/System +# This is... complicated. +# Look at the WHENCE file. +License: GPL+ and GPLv2+ and MIT and Redistributable, no modification permitted +%if "x%{?variant}" != "x" +Provides: kernel-firmware = %{rpmversion}-%{pkg_release} +%endif +%description firmware +Kernel-firmware includes firmware files required for some devices to +operate. 
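+# Editor's note (illustrative): since Name: is kernel%%{?variant}, a variant
+# build (e.g. %%{variant} = -vanilla) ships this subpackage as
+# kernel-vanilla-firmware; the conditional Provides above appears intended to
+# keep the plain "kernel-firmware >= ..." requirement from %%{kernel_prereq}
+# satisfiable, which can be checked on an installed system with e.g.
+#   rpm -q --provides kernel-vanilla-firmware | grep '^kernel-firmware'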
+ +%package bootwrapper +Summary: Boot wrapper files for generating combined kernel + initrd images +Group: Development/System +Requires: gzip +%description bootwrapper +Kernel-bootwrapper contains the wrapper code which makes bootable "zImage" +files combining both kernel and initial ramdisk. + +%package debuginfo-common-%{_target_cpu} +Summary: Kernel source files used by %{name}-debuginfo packages +Group: Development/Debug +%description debuginfo-common-%{_target_cpu} +This package is required by %{name}-debuginfo subpackages. +It provides the kernel source files common to all builds. + +%package -n perf +Summary: Performance monitoring for the Linux kernel +Group: Development/System +License: GPLv2 +%description -n perf +This package provides the supporting documentation for the perf tool +shipped in each kernel image subpackage. + +# +# This macro creates a kernel--debuginfo package. +# %%kernel_debuginfo_package +# +%define kernel_debuginfo_package() \ +%package %{?1:%{1}-}debuginfo\ +Summary: Debug information for package %{name}%{?1:-%{1}}\ +Group: Development/Debug\ +Requires: %{name}-debuginfo-common-%{_target_cpu} = %{version}-%{release}\ +Provides: %{name}%{?1:-%{1}}-debuginfo-%{_target_cpu} = %{version}-%{release}\ +AutoReqProv: no\ +%description -n %{name}%{?1:-%{1}}-debuginfo\ +This package provides debug information for package %{name}%{?1:-%{1}}.\ +This is required to use SystemTap with %{name}%{?1:-%{1}}-%{KVERREL}.\ +%{expand:%%global debuginfo_args %{?debuginfo_args} -p '/.*/%%{KVERREL}%{?1:\.%{1}}/.*|/.*%%{KVERREL}%{?1:\.%{1}}(\.debug)?' -o debuginfo%{?1}.list}\ +%{nil} + +# +# This macro creates a kernel--devel package. +# %%kernel_devel_package +# +%define kernel_devel_package() \ +%package %{?1:%{1}-}devel\ +Summary: Development package for building kernel modules to match the %{?2:%{2} }kernel\ +Group: System Environment/Kernel\ +Provides: kernel%{?1:-%{1}}-devel-%{_target_cpu} = %{version}-%{release}\ +Provides: kernel-devel-%{_target_cpu} = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel-uname-r = %{KVERREL}%{?1:.%{1}}\ +AutoReqProv: no\ +Requires(pre): /usr/bin/find\ +%description -n kernel%{?variant}%{?1:-%{1}}-devel\ +This package provides kernel headers and makefiles sufficient to build modules\ +against the %{?2:%{2} }kernel package.\ +%{nil} + +# +# This macro creates a kernel- and its -devel and -debuginfo too. +# %%define variant_summary The Linux kernel compiled for +# %%kernel_variant_package [-n ] +# +%define kernel_variant_package(n:) \ +%package %1\ +Summary: %{variant_summary}\ +Group: System Environment/Kernel\ +%kernel_reqprovconf\ +%{expand:%%kernel_devel_package %1 %{!?-n:%1}%{?-n:%{-n*}}}\ +%{expand:%%kernel_debuginfo_package %1}\ +%{nil} + + +# First the auxiliary packages of the main kernel package. +%kernel_devel_package +%kernel_debuginfo_package + + +# Now, each variant package. + +%define variant_summary The Linux kernel compiled for SMP machines +%kernel_variant_package -n SMP smp +%description smp +This package includes a SMP version of the Linux kernel. It is +required only on machines with two or more CPUs as well as machines with +hyperthreading technology. + +Install the kernel-smp package if your machine uses two or more CPUs. + + +%define variant_summary The Linux kernel compiled for PAE capable machines +%kernel_variant_package PAE +%description PAE +This package includes a version of the Linux kernel with support for up to +64GB of high memory. 
It requires a CPU with Physical Address Extensions (PAE). +The non-PAE kernel can only address up to 4GB of memory. +Install the kernel-PAE package if your machine has more than 4GB of memory. + + +%define variant_summary The Linux kernel compiled with extra debugging enabled for PAE capable machines +%kernel_variant_package PAEdebug +Obsoletes: kernel-PAE-debug +%description PAEdebug +This package includes a version of the Linux kernel with support for up to +64GB of high memory. It requires a CPU with Physical Address Extensions (PAE). +The non-PAE kernel can only address up to 4GB of memory. +Install the kernel-PAE package if your machine has more than 4GB of memory. + +This variant of the kernel has numerous debugging options enabled. +It should only be installed when trying to gather additional information +on kernel bugs, as some of these options impact performance noticably. + + +%define variant_summary The Linux kernel compiled with extra debugging enabled +%kernel_variant_package debug +%description debug +The kernel package contains the Linux kernel (vmlinuz), the core of any +Linux operating system. The kernel handles the basic functions +of the operating system: memory allocation, process allocation, device +input and output, etc. + +This variant of the kernel has numerous debugging options enabled. +It should only be installed when trying to gather additional information +on kernel bugs, as some of these options impact performance noticably. + + +%define variant_summary A minimal Linux kernel compiled for crash dumps +%kernel_variant_package kdump +%description kdump +This package includes a kdump version of the Linux kernel. It is +required only on machines which will use the kexec-based kernel crash dump +mechanism. + + +%prep +# do a few sanity-checks for --with *only builds +%if %{with_baseonly} +%if !%{with_up}%{with_pae} +echo "Cannot build --with baseonly, up build is disabled" +exit 1 +%endif +%endif + +%if %{with_smponly} +%if !%{with_smp} +echo "Cannot build --with smponly, smp build is disabled" +exit 1 +%endif +%endif + +# more sanity checking; do it quietly +if [ "%{patches}" != "%%{patches}" ] ; then + for patch in %{patches} ; do + if [ ! -f $patch ] ; then + echo "ERROR: Patch ${patch##/*/} listed in specfile but is missing" + exit 1 + fi + done +fi 2>/dev/null + +patch_command='patch -p1 -F1 -s' +ApplyPatch() +{ + local patch=$1 + shift + if [ ! -f $RPM_SOURCE_DIR/$patch ]; then + exit 1 + fi + if ! egrep "^Patch[0-9]+: $patch\$" %{_specdir}/${RPM_PACKAGE_NAME%%%%%{?variant}}.spec ; then + if [ "${patch:0:10}" != "patch-2.6." ] ; then + echo "ERROR: Patch $patch not listed as a source patch in specfile" + exit 1 + fi + fi 2>/dev/null + case "$patch" in + *.bz2) bunzip2 < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;; + *.gz) gunzip < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;; + *) $patch_command ${1+"$@"} < "$RPM_SOURCE_DIR/$patch" ;; + esac +} + +# don't apply patch if it's empty +ApplyOptionalPatch() +{ + local patch=$1 + shift + if [ ! -f $RPM_SOURCE_DIR/$patch ]; then + exit 1 + fi + local C=$(wc -l $RPM_SOURCE_DIR/$patch | awk '{print $1}') + if [ "$C" -gt 9 ]; then + ApplyPatch $patch ${1+"$@"} + fi +} + +# we don't want a .config file when building firmware: it just confuses the build system +%define build_firmware \ + mv .config .config.firmware_save \ + make INSTALL_FW_PATH=$RPM_BUILD_ROOT/lib/firmware firmware_install \ + mv .config.firmware_save .config + +# First we unpack the kernel tarball. 
+# If this isn't the first make prep, we use links to the existing clean tarball +# which speeds things up quite a bit. + +# Update to latest upstream. +%if 0%{?released_kernel} +%define vanillaversion 2.6.%{base_sublevel} +# non-released_kernel case +%else +%if 0%{?rcrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev} +%if 0%{?gitrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev} +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +%define vanillaversion 2.6.%{base_sublevel}-git%{gitrev} +%endif +%endif +%endif + +# We can share hardlinked source trees by putting a list of +# directory names of the CVS checkouts that we want to share +# with in .shared-srctree. (Full pathnames are required.) +[ -f .shared-srctree ] && sharedirs=$(cat .shared-srctree) + +if [ ! -d kernel-%{kversion}/vanilla-%{vanillaversion} ]; then + + if [ -d kernel-%{kversion}/vanilla-%{kversion} ]; then + + cd kernel-%{kversion} + + # Any vanilla-* directories other than the base one are stale. + for dir in vanilla-*; do + [ "$dir" = vanilla-%{kversion} ] || rm -rf $dir & + done + + else + + # Ok, first time we do a make prep. + rm -f pax_global_header + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then +%setup -q -n kernel-%{kversion} -c -T + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{kversion} . + else +%setup -q -n kernel-%{kversion} -c + mv linux-%{kversion} vanilla-%{kversion} + fi + + fi + +%if "%{kversion}" != "%{vanillaversion}" + + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} . + + else + + cp -rl vanilla-%{kversion} vanilla-%{vanillaversion} + cd vanilla-%{vanillaversion} + +# Update vanilla to the latest upstream. +# (non-released_kernel case only) +%if 0%{?rcrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif + + cd .. + + fi + +%endif + +else + # We already have a vanilla dir. + cd kernel-%{kversion} +fi + +if [ -d linux-%{kversion}.%{_target_cpu} ]; then + # Just in case we ctrl-c'd a prep already + rm -rf deleteme.%{_target_cpu} + # Move away the stale away, and delete in background. + mv linux-%{kversion}.%{_target_cpu} deleteme.%{_target_cpu} + rm -rf deleteme.%{_target_cpu} & +fi + +cp -rl vanilla-%{vanillaversion} linux-%{kversion}.%{_target_cpu} + +cd linux-%{kversion}.%{_target_cpu} + +# released_kernel with possible stable updates +%if 0%{?stable_base} +ApplyPatch %{stable_patch_00} +%endif +%if 0%{?stable_rc} +ApplyPatch %{stable_patch_01} +%endif + +%if %{using_upstream_branch} +### BRANCH APPLY ### +%endif + +# Drop some necessary files from the source dir into the buildroot +cp $RPM_SOURCE_DIR/config-* . +cp %{SOURCE15} . 
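+# Editor's note: %%{SOURCE15} is merge.pl (Source15 above); the RHEL branch
+# below uses it to overlay config-rhel-generic onto each arch config, roughly
+#   ./merge.pl config-rhel-generic <base-config> > <merged-config>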
+ +# Dynamically generate kernel .config files from config-* files +make -f %{SOURCE20} VERSION=%{version} configs + +#if a rhel kernel, apply the rhel config options +%if 0%{?rhel} + for i in %{all_arch_configs} + do + mv $i $i.tmp + ./merge.pl config-rhel-generic $i.tmp > $i + rm $i.tmp + done +%endif + +#ApplyOptionalPatch git-linus.diff + +# This patch adds a "make nonint_oldconfig" which is non-interactive and +# also gives a list of missing options at the end. Useful for automated +# builds (as used in the buildsystem). +ApplyPatch linux-2.6-build-nonintconfig.patch + +ApplyPatch linux-2.6-makefile-after_link.patch + +###-vs- +ApplyPatch patch-2.6.31.6-vs2.3.0.36.27.diff + +# +# misc small stuff to make things compile +# +ApplyOptionalPatch linux-2.6-compile-fixes.patch + +%if !%{nopatches} + +# revert patches from upstream that conflict or that we get via other means +ApplyOptionalPatch linux-2.6-upstream-reverts.patch -R + +ApplyOptionalPatch git-cpufreq.patch +#ApplyOptionalPatch git-bluetooth.patch + +ApplyPatch linux-2.6-hotfixes.patch + +# Roland's utrace ptrace replacement. +ApplyPatch linux-2.6-tracehook.patch +###-vs- +ApplyPatch linux-2.6-utrace.patch -F3 + +ApplyPatch sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +ApplyPatch disable-stackprotector-all.patch + +# Architecture patches +# x86(-64) +ApplyPatch via-hwmon-temp-sensor.patch +ApplyPatch linux-2.6-dell-laptop-rfkill-fix.patch + +# +# Intel IOMMU +# +# Quiesce USB host controllers before setting up the IOMMU +ApplyPatch linux-2.6-die-closed-source-bios-muppets-die.patch +# Some performance fixes, unify hardware/software passthrough support, and +# most importantly: notice when the BIOS points us to a region that returns +# all 0xFF, and claims that there's an IOMMU there. +ApplyPatch linux-2.6-intel-iommu-updates.patch +ApplyPatch linux-2.6-iommu-at-zero.patch +ApplyPatch linux-2.6-iommu-dmar-all-1s.patch +# Check for RMRRs which end before they start +ApplyPatch linux-2.6-iommu-another-hp-screwup.patch +# Apply the 'at zero' and 'all 0xFF' sanity checks for intr_remap too +ApplyPatch linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +# Fix up MMIO BAR for integrated graphics on HP laptops on resume (#536675) +ApplyPatch linux-2.6-iommu-hp-cantiga-resume.patch + +# +# PowerPC +# +### NOT (YET) UPSTREAM: +# The storage alias patch is Fedora-local, and allows the old 'ps3_storage' +# module name to work on upgrades. 
Otherwise, I believe mkinitrd will fail +# to pull the module in, +ApplyPatch linux-2.6-ps3-storage-alias.patch +# Alleviate G5 thermal shutdown problems +ApplyPatch linux-2.6-g5-therm-shutdown.patch +# Provide modalias in sysfs for vio devices +ApplyPatch linux-2.6-vio-modalias.patch +# Work around PCIe bridge setup on iSight +ApplyPatch linux-2.6-imac-transparent-bridge.patch + +# +# SPARC64 +# +ApplyPatch linux-2.6.29-sparc-IOC_TYPECHECK.patch + +# +# Exec shield +# +###-vs- +ApplyPatch linux-2.6-execshield.patch -F3 + +# +# bugfixes to drivers and filesystems +# + +# ext4 + +# xfs + +# btrfs +###-vs- +ApplyPatch linux-2.6-btrfs-upstream.patch + +# eCryptfs + +# NFSv4 +ApplyPatch linux-2.6-nfsd4-proots.patch +ApplyPatch linux-2.6-nfs4-ver4opt.patch +ApplyPatch linux-2.6-nfs4-callback-hidden.patch + +# USB +ApplyPatch linux-2.6-driver-level-usb-autosuspend.diff +ApplyPatch linux-2.6-qcserial-autosuspend.diff +ApplyPatch linux-2.6-bluetooth-autosuspend.diff +ApplyPatch linux-2.6-usb-uvc-autosuspend.diff + +# ACPI +ApplyPatch linux-2.6-defaults-acpi-video.patch +ApplyPatch linux-2.6-acpi-video-dos.patch +# cpuidle: Fix the menu governor to boost IO performance +ApplyPatch linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +ApplyPatch acpi-ec-merge-irq-and-poll-modes.patch +ApplyPatch acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +ApplyPatch acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +# Various low-impact patches to aid debugging. +ApplyPatch linux-2.6-debug-sizeof-structs.patch +ApplyPatch linux-2.6-debug-nmi-timeout.patch +ApplyPatch linux-2.6-debug-taint-vm.patch +ApplyPatch linux-2.6-debug-spinlock-taint.patch +###-vs- +ApplyPatch linux-2.6-debug-vm-would-have-oomkilled.patch +ApplyPatch linux-2.6-debug-always-inline-kzalloc.patch + +# +# PCI +# +# disable message signaled interrupts +ApplyPatch linux-2.6-defaults-pci_no_msi.patch +# update the pciehp driver +#ApplyPatch linux-2.6-pciehp-update.patch +# default to enabling passively listening for hotplug events +#ApplyPatch linux-2.6-defaults-pciehp.patch +# enable ASPM by default on hardware we expect to work +ApplyPatch linux-2.6-defaults-aspm.patch + +# +# SCSI Bits. +# + +# ALSA +# squelch hda_beep by default +ApplyPatch linux-2.6-defaults-alsa-hda-beep-off.patch +ApplyPatch linux-2.6-alsa-improve-hda-powerdown.patch +ApplyPatch hda_intel-prealloc-4mb-dmabuffer.patch +ApplyPatch alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +# Networking + +# Misc fixes +# The input layer spews crap no-one cares about. +ApplyPatch linux-2.6-input-kill-stupid-messages.patch + +# stop floppy.ko from autoloading during udev... 
+ApplyPatch die-floppy-die.patch + +# make copy_from_user to a stack slot provable right +# hosed stuff, just drop this close to beta +#ApplyPatch linux-2.6.31-copy_from_user-bounds.patch + +# Get away from having to poll Toshibas +#ApplyPatch linux-2.6-input-fix-toshiba-hotkeys.patch + +ApplyPatch linux-2.6.30-no-pcspkr-modalias.patch + +# Allow to use 480600 baud on 16C950 UARTs +ApplyPatch linux-2.6-serial-460800.patch + +# Silence some useless messages that still get printed with 'quiet' +ApplyPatch linux-2.6-silence-noise.patch +ApplyPatch linux-2.6.30-hush-rom-warning.patch + +# Make fbcon not show the penguins with 'quiet' +ApplyPatch linux-2.6-silence-fbcon-logo.patch + +# Fix the SELinux mprotect checks on executable mappings +#ApplyPatch linux-2.6-selinux-mprotect-checks.patch +# Fix SELinux for sparc +#ApplyPatch linux-2.6-sparc-selinux-mprotect-checks.patch + +# Changes to upstream defaults. + + +# ia64 ata quirk +ApplyPatch linux-2.6-ata-quirk.patch + +# Make it possible to identify non-hotplug SATA ports +ApplyPatch linux-2.6-ahci-export-capabilities.patch + +# prism54: remove pci modinfo device table +ApplyPatch prism54-remove-pci-dev-table.patch + +# ath9k: add fixes suggested by upstream maintainer +ApplyPatch linux-2.6-ath9k-fixes.patch + +# /dev/crash driver. +ApplyPatch linux-2.6-crash-driver.patch + +# Determine cacheline sizes in a generic manner. +ApplyPatch linux-2.6-pci-cacheline-sizing.patch + +# http://www.lirc.org/ +ApplyPatch lirc-2.6.31.patch +# enable IR receiver on Hauppauge HD PVR (v4l-dvb merge pending) +ApplyPatch hdpvr-ir-enable.patch +# tell usbhid to ignore all imon devices (sent upstream 2009.07.31) +ApplyPatch hid-ignore-all-recent-imon-devices.patch + +# Add kernel KSM support +ApplyPatch linux-2.6-ksm.patch +ApplyPatch linux-2.6-ksm-updates.patch +ApplyPatch linux-2.6-ksm-fix-munlock.patch +ApplyPatch linux-2.6-ksm-updates-from-32.patch +# Optimize KVM for KSM support +ApplyPatch linux-2.6-ksm-kvm.patch + +# Assorted Virt Fixes +ApplyPatch linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +ApplyPatch linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +ApplyPatch linux-2.6-xen-improvement-to-wait_for_devices.patch +ApplyPatch linux-2.6-xen-increase-device-connection-timeout.patch +ApplyPatch linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# Fix block I/O errors in KVM +ApplyPatch linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch + +ApplyPatch linux-2.6-e1000-ich9.patch + +# Nouveau DRM + drm fixes +ApplyPatch kms-offb-handoff.patch +ApplyPatch drm-next-b390f944.patch +ApplyPatch drm-radeon-misc-fixes.patch +ApplyPatch drm-radeon-rv410-test-fix.patch +ApplyPatch drm-conservative-fallback-modes.patch +ApplyPatch drm-edid-retry.patch +ApplyPatch drm-edid-header-fixup.patch +ApplyPatch drm-default-mode.patch + +ApplyPatch drm-nouveau.patch +# pm broken on my thinkpad t60p - airlied +#ApplyPatch drm-radeon-pm.patch +ApplyPatch drm-i915-resume-force-mode.patch +ApplyOptionalPatch drm-intel-next.patch +#this appears to be upstream - mjg59? 
+#ApplyPatch drm-intel-pm.patch +ApplyPatch drm-intel-no-tv-hotplug.patch +ApplyPatch drm-i915-fix-tvmode-oops.patch +ApplyPatch drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +#ApplyPatch drm-disable-r600-aspm.patch + +# VGA arb + drm +ApplyPatch linux-2.6-vga-arb.patch +ApplyPatch drm-vga-arb.patch +ApplyPatch drm-radeon-kms-arbiter-return-ignore.patch + +# Lower debug level of fbcon handover messages (rh#538526) +ApplyPatch fbcon-lower-debug.patch + +# linux1394 git patches +# apply if non-empty +ApplyOptionalPatch linux-2.6-firewire-git-update.patch +ApplyOptionalPatch linux-2.6-firewire-git-pending.patch + +# silence the ACPI blacklist code +ApplyPatch linux-2.6-silence-acpi-blacklist.patch + +# V4L/DVB updates/fixes/experimental drivers +# apply if non-empty +ApplyOptionalPatch linux-2.6-v4l-dvb-fixes.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-update.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-experimental.patch + +ApplyPatch v4l-dvb-fix-cx25840-firmware-loading.patch + +# Patches headed upstream +ApplyPatch linux-2.6-rtc-show-hctosys.patch +ApplyPatch linux-2.6-rfkill-all.patch +ApplyPatch linux-2.6-selinux-module-load-perms.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +ApplyPatch perf-make-perf-counter-h-available-to-userspace.patch + +ApplyPatch improve-resource-counter-scalability.patch + +# fix perf for sysprof +ApplyPatch perf-events-fix-swevent-hrtimer-sampling.patch +ApplyPatch perf-events-dont-generate-events-for-the-idle-task.patch + +# Fix oops in padlock +ApplyPatch crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +ApplyPatch tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +ApplyPatch tg3-02-fix-tso-test-against-wrong-flags-var.patch +ApplyPatch tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +ApplyPatch tg3-04-prevent-tx-bd-corruption.patch +ApplyPatch tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +ApplyPatch tg3-06-fix-5906-transmit-hangs.patch + +# sched fixes cherry-picked from 2.6.32 +ApplyPatch sched-deal-with-low-load-in-wake-affine.patch +ApplyPatch sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +ApplyPatch sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +ApplyPatch sched-retune-scheduler-latency-defaults.patch +# fix wakeup latency +ApplyPatch sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +ApplyPatch highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +ApplyPatch highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +ApplyPatch highmem-fix-arm-powerpc-kmap_types.patch + +ApplyPatch dlm-fix-connection-close-handling.patch + +# rhbz#544144 +ApplyPatch ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +# END OF PATCH APPLICATIONS + +%endif + +# Any further pre-build tree manipulations happen here. 
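+# Editor's note on the config handling below: each kernel-%%{version}-*.config
+# file carries its target arch in its first line (e.g. "# x86_64"), which the
+# loop extracts with `head -1 .config | cut -b 3-` before running the
+# non-interactive %%{oldconfig_target} and saving the refreshed file under
+# configs/ for use in %%build.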
+ +chmod +x scripts/checkpatch.pl + +# only deal with configs if we are going to build for the arch +%ifnarch %nobuildarches + +mkdir configs + +# Remove configs not for the buildarch +for cfg in kernel-%{version}-*.config; do + if [ `echo %{all_arch_configs} | grep -c $cfg` -eq 0 ]; then + rm -f $cfg + fi +done + +%if !%{debugbuildsenabled} +rm -f kernel-%{version}-*debug.config +%endif + +# now run oldconfig over all the config files +for i in *.config +do + mv $i .config + Arch=`head -1 .config | cut -b 3-` + make ARCH=$Arch %{oldconfig_target} + echo "# $Arch" > configs/$i + cat .config >> configs/$i +done +# end of kernel config +%endif + +# get rid of unwanted files resulting from patch fuzz +find . \( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null + +cd .. + +### +### build +### +%build + +%if %{with_sparse} +%define sparse_mflags C=1 +%endif + +%if %{fancy_debuginfo} +# This override tweaks the kernel makefiles so that we run debugedit on an +# object before embedding it. When we later run find-debuginfo.sh, it will +# run debugedit again. The edits it does change the build ID bits embedded +# in the stripped object, but repeating debugedit is a no-op. We do it +# beforehand to get the proper final build ID bits into the embedded image. +# This affects the vDSO images in vmlinux, and the vmlinux image in bzImage. +export AFTER_LINK=\ +'sh -xc "/usr/lib/rpm/debugedit -b $$RPM_BUILD_DIR -d /usr/src/debug -i $@"' +%endif + +cp_vmlinux() +{ + eu-strip --remove-comment -o "$2" "$1" +} + +BuildKernel() { + MakeTarget=$1 + KernelImage=$2 + Flavour=$3 + InstallName=${4:-vmlinuz} + + # Pick the right config file for the kernel we're building + Config=kernel-%{version}-%{_target_cpu}${Flavour:+-${Flavour}}.config + DevelDir=/usr/src/kernels/%{KVERREL}${Flavour:+.${Flavour}} + + # When the bootable image is just the ELF kernel, strip it. + # We already copy the unstripped file into the debuginfo package. + if [ "$KernelImage" = vmlinux ]; then + CopyKernel=cp_vmlinux + else + CopyKernel=cp + fi + + KernelVer=%{version}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}} + echo BUILDING A KERNEL FOR ${Flavour} %{_target_cpu}... + + # make sure EXTRAVERSION says what we want it to say + perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{?stablerev}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}}/" Makefile + + # if pre-rc1 devel kernel, must fix up SUBLEVEL for our versioning scheme + %if !0%{?rcrev} + %if 0%{?gitrev} + perl -p -i -e 's/^SUBLEVEL.*/SUBLEVEL = %{upstream_sublevel}/' Makefile + %endif + %endif + + # and now to start the build process + + make -s mrproper + cp configs/$Config .config + + Arch=`head -1 .config | cut -b 3-` + echo USING ARCH=$Arch + + make -s ARCH=$Arch %{oldconfig_target} > /dev/null + make -s ARCH=$Arch V=1 %{?_smp_mflags} $MakeTarget %{?sparse_mflags} + make -s ARCH=$Arch V=1 %{?_smp_mflags} modules %{?sparse_mflags} || exit 1 + +%if %{with_perftool} + pushd tools/perf +# make sure the scripts are executable... 
won't be in tarball until 2.6.31 :/ + chmod +x util/generate-cmdlist.sh util/PERF-VERSION-GEN + make -s V=1 %{?_smp_mflags} perf + mkdir -p $RPM_BUILD_ROOT/usr/libexec/ + install -m 755 perf $RPM_BUILD_ROOT/usr/libexec/perf.$KernelVer + popd +%endif + + # Start installing the results +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/boot + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/%{image_install_path} +%endif + mkdir -p $RPM_BUILD_ROOT/%{image_install_path} + install -m 644 .config $RPM_BUILD_ROOT/boot/config-$KernelVer + install -m 644 System.map $RPM_BUILD_ROOT/boot/System.map-$KernelVer +%if %{with_dracut} + # We estimate the size of the initramfs because rpm needs to take this size + # into consideration when performing disk space calculations. (See bz #530778) + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initramfs-$KernelVer.img bs=1M count=20 +%else + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initrd-$KernelVer.img bs=1M count=5 +%endif + if [ -f arch/$Arch/boot/zImage.stub ]; then + cp arch/$Arch/boot/zImage.stub $RPM_BUILD_ROOT/%{image_install_path}/zImage.stub-$KernelVer || : + fi + $CopyKernel $KernelImage \ + $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + chmod 755 $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer + # Override $(mod-fw) because we don't want it to install any firmware + # We'll do that ourselves with 'make firmware_install' + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT modules_install KERNELRELEASE=$KernelVer mod-fw= +%ifarch %{vdso_arches} + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT vdso_install KERNELRELEASE=$KernelVer + if grep '^CONFIG_XEN=y$' .config >/dev/null; then + echo > ldconfig-kernel.conf "\ +# This directive teaches ldconfig to search in nosegneg subdirectories +# and cache the DSOs there with extra bit 0 set in their hwcap match +# fields. In Xen guest kernels, the vDSO tells the dynamic linker to +# search in nosegneg subdirectories and to match this extra hwcap bit +# in the ld.so.cache file. +hwcap 0 nosegneg" + fi + if [ ! -s ldconfig-kernel.conf ]; then + echo > ldconfig-kernel.conf "\ +# Placeholder file, no vDSO hwcap entries used in this kernel." 
+ fi + %{__install} -D -m 444 ldconfig-kernel.conf \ + $RPM_BUILD_ROOT/etc/ld.so.conf.d/kernel-$KernelVer.conf +%endif + + # And save the headers/makefiles etc for building modules against + # + # This all looks scary, but the end result is supposed to be: + # * all arch relevant include/ files + # * all Makefile/Kconfig files + # * all script/ files + + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/source + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + (cd $RPM_BUILD_ROOT/lib/modules/$KernelVer ; ln -s build source) + # dirs for additional modules per module-init-tools, kbuild/modules.txt + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/extra + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/updates + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/weak-updates + # first copy everything + cp --parents `find -type f -name "Makefile*" -o -name "Kconfig*"` $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp Module.symvers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp System.map $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -s Module.markers ]; then + cp Module.markers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + fi + # then drop all but the needed Makefiles/Kconfig files + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Documentation + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cp .config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp -a scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -d arch/$Arch/scripts ]; then + cp -a arch/$Arch/scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch} || : + fi + if [ -f arch/$Arch/*lds ]; then + cp -a arch/$Arch/*lds $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch}/ || : + fi + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*.o + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*/*.o +%ifarch ppc + cp -a --parents arch/powerpc/lib/crtsavres.[So] $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ +%endif + if [ -d arch/%{asmarch}/include ]; then + cp -a --parents arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ + fi + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cd include + cp -a acpi config crypto keys linux math-emu media mtd net pcmcia rdma rxrpc scsi sound trace video drm asm-generic $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + asmdir=$(readlink asm) + cp -a $asmdir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/ + pushd $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + ln -s $asmdir asm + popd + # Make sure the Makefile and version.h have a matching timestamp so that + # external modules can be built + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Makefile $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/version.h + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/autoconf.h + # Copy .config to include/config/auto.conf so "make prepare" is unnecessary. + cp $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/config/auto.conf + cd .. 
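+
+  # For reference only, nothing in this comment is executed: the build/ tree
+  # assembled above is what out-of-tree modules compile against once this
+  # kernel is installed, via the standard kbuild invocation (path shown
+  # illustratively with the KernelVer of this flavour):
+  #
+  #   make -C /lib/modules/$KernelVer/build M=$PWD modules
+  #
+  # which is why the Makefiles, Kconfig files, scripts/ and headers are copied
+  # in, and why the Makefile and version.h timestamps are synchronized above.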
+ + # + # save the vmlinux file for kernel debugging into the kernel-debuginfo rpm + # +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer + cp vmlinux $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer +%endif + + find $RPM_BUILD_ROOT/lib/modules/$KernelVer -name "*.ko" -type f >modnames + + # mark modules executable so that strip-to-file can strip them + xargs --no-run-if-empty chmod u+x < modnames + + # Generate a list of modules for block and networking. + + fgrep /drivers/ modnames | xargs --no-run-if-empty nm -upA | + sed -n 's,^.*/\([^/]*\.ko\): *U \(.*\)$,\1 \2,p' > drivers.undef + + collect_modules_list() + { + sed -r -n -e "s/^([^ ]+) \\.?($2)\$/\\1/p" drivers.undef | + LC_ALL=C sort -u > $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$1 + } + + collect_modules_list networking \ + 'register_netdev|ieee80211_register_hw|usbnet_probe' + collect_modules_list block \ + 'ata_scsi_ioctl|scsi_add_host|blk_init_queue|register_mtd_blktrans|scsi_esp_register|scsi_register_device_handler' + collect_modules_list drm \ + 'drm_open|drm_init' + collect_modules_list modesetting \ + 'drm_crtc_init' + + # detect missing or incorrect license tags + rm -f modinfo + while read i + do + echo -n "${i#$RPM_BUILD_ROOT/lib/modules/$KernelVer/} " >> modinfo + /sbin/modinfo -l $i >> modinfo + done < modnames + + egrep -v \ + 'GPL( v2)?$|Dual BSD/GPL$|Dual MPL/GPL$|GPL and additional rights$' \ + modinfo && exit 1 + + rm -f modinfo modnames + + # remove files that will be auto generated by depmod at rpm -i time + for i in alias alias.bin ccwmap dep dep.bin ieee1394map inputmap isapnpmap ofmap pcimap seriomap symbols symbols.bin usbmap + do + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$i + done + + # Move the devel headers out of the root file system + mkdir -p $RPM_BUILD_ROOT/usr/src/kernels + mv $RPM_BUILD_ROOT/lib/modules/$KernelVer/build $RPM_BUILD_ROOT/$DevelDir + ln -sf ../../..$DevelDir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build +} + +### +# DO it... +### + +# prepare directories +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT/boot + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_debug} +BuildKernel %make_target %kernel_image debug +%endif + +%if %{with_pae_debug} +BuildKernel %make_target %kernel_image PAEdebug +%endif + +%if %{with_pae} +BuildKernel %make_target %kernel_image PAE +%endif + +%if %{with_up} +BuildKernel %make_target %kernel_image +%endif + +%if %{with_smp} +BuildKernel %make_target %kernel_image smp +%endif + +%if %{with_kdump} +BuildKernel vmlinux vmlinux kdump vmlinux +%endif + +%if %{with_doc} +# Make the HTML and man pages. +# XXX nix %{?_smp_mflags} here, buggy Documentation/*/Makefile! +make htmldocs mandocs || %{doc_build_fail} + +# sometimes non-world-readable files sneak into the kernel source tree +chmod -R a=rX Documentation +find Documentation -type d | xargs chmod u+w +%endif + +%if %{with_perf} +pushd tools/perf +make %{?_smp_mflags} man || %{doc_build_fail} +popd +%endif + +### +### Special hacks for debuginfo subpackages. +### + +# This macro is used by %%install, so we must redefine it before that. 
+%define debug_package %{nil} + +%if %{fancy_debuginfo} +%define __debug_install_post \ + /usr/lib/rpm/find-debuginfo.sh %{debuginfo_args} %{_builddir}/%{?buildsubdir}\ +%{nil} +%endif + +%if %{with_debuginfo} +%ifnarch noarch +%global __debug_package 1 +%files -f debugfiles.list debuginfo-common-%{_target_cpu} +%defattr(-,root,root) +%endif +%endif + +### +### install +### + +%install + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_doc} +docdir=$RPM_BUILD_ROOT%{_datadir}/doc/kernel-doc-%{rpmversion} +man9dir=$RPM_BUILD_ROOT%{_datadir}/man/man9 + +# copy the source over +mkdir -p $docdir +tar -f - --exclude=man --exclude='.*' -c Documentation | tar xf - -C $docdir + +# Install man pages for the kernel API. +mkdir -p $man9dir +find Documentation/DocBook/man -name '*.9.gz' -print0 | +xargs -0 --no-run-if-empty %{__install} -m 444 -t $man9dir $m +ls $man9dir | grep -q '' || > $man9dir/BROKEN +%endif # with_doc + +# perf docs +%if %{with_perf} +mandir=$RPM_BUILD_ROOT%{_datadir}/man +man1dir=$mandir/man1 +pushd tools/perf/Documentation +make install-man mandir=$mandir +popd + +pushd $man1dir +for d in *.1; do + gzip $d; +done +popd +%endif # with_perf + +# perf shell wrapper +%if %{with_perf} +mkdir -p $RPM_BUILD_ROOT/usr/sbin/ +cp $RPM_SOURCE_DIR/perf $RPM_BUILD_ROOT/usr/sbin/perf +chmod 0755 $RPM_BUILD_ROOT/usr/sbin/perf +mkdir -p $RPM_BUILD_ROOT%{_datadir}/doc/perf +%endif + +%if %{with_headers} +# Install kernel headers +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_install + +# Do headers_check but don't die if it fails. +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_check \ + > hdrwarnings.txt || : +if grep -q exist hdrwarnings.txt; then + sed s:^$RPM_BUILD_ROOT/usr/include/:: hdrwarnings.txt + # Temporarily cause a build failure if header inconsistencies. + # exit 1 +fi + +find $RPM_BUILD_ROOT/usr/include \ + \( -name .install -o -name .check -o \ + -name ..install.cmd -o -name ..check.cmd \) | xargs rm -f + +# glibc provides scsi headers for itself, for now +rm -rf $RPM_BUILD_ROOT/usr/include/scsi +rm -f $RPM_BUILD_ROOT/usr/include/asm*/atomic.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/io.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/irq.h +%endif + +%if %{with_firmware} +%{build_firmware} +%endif + +%if %{with_bootwrapper} +make DESTDIR=$RPM_BUILD_ROOT bootwrapper_install WRAPPER_OBJDIR=%{_libdir}/kernel-wrapper WRAPPER_DTSDIR=%{_libdir}/kernel-wrapper/dts +%endif + + +### +### clean +### + +%clean +rm -rf $RPM_BUILD_ROOT + +### +### scripts +### + +# +# This macro defines a %%post script for a kernel*-devel package. +# %%kernel_devel_post [] +# +%define kernel_devel_post() \ +%{expand:%%post %{?1:%{1}-}devel}\ +if [ -f /etc/sysconfig/kernel ]\ +then\ + . /etc/sysconfig/kernel || exit $?\ +fi\ +if [ "$HARDLINK" != "no" -a -x /usr/sbin/hardlink ]\ +then\ + (cd /usr/src/kernels/%{KVERREL}%{?1:.%{1}} &&\ + /usr/bin/find . -type f | while read f; do\ + hardlink -c /usr/src/kernels/*.fc*.*/$f $f\ + done)\ +fi\ +%{nil} + +# This macro defines a %%posttrans script for a kernel package. +# %%kernel_variant_posttrans [] +# More text can follow to go at the end of this variant's %%post. +# +%define kernel_variant_posttrans() \ +%{expand:%%posttrans %{?1}}\ +/sbin/new-kernel-pkg --package kernel%{?1:-%{1}} --rpmposttrans %{KVERREL}%{?1:.%{1}} || exit $?\ +%{nil} + +# +# This macro defines a %%post script for a kernel package and its devel package. +# %%kernel_variant_post [-v ] [-r ] +# More text can follow to go at the end of this variant's %%post. 
+# +%define kernel_variant_post(v:r:) \ +%{expand:%%kernel_devel_post %{?-v*}}\ +%{expand:%%kernel_variant_posttrans %{?-v*}}\ +%{expand:%%post %{?-v*}}\ +%{-r:\ +if [ `uname -i` == "x86_64" -o `uname -i` == "i386" ] &&\ + [ -f /etc/sysconfig/kernel ]; then\ + /bin/sed -r -i -e 's/^DEFAULTKERNEL=%{-r*}$/DEFAULTKERNEL=kernel%{?-v:-%{-v*}}/' /etc/sysconfig/kernel || exit $?\ +fi}\ +%{expand:\ +%if %{with_dracut}\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --dracut --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%else\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%endif}\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --add-kernel %{KVERREL}%{?-v*} || exit $?\ +#fi\ +%{nil} + +# +# This macro defines a %%preun script for a kernel package. +# %%kernel_variant_preun +# +%define kernel_variant_preun() \ +%{expand:%%preun %{?1}}\ +/sbin/new-kernel-pkg --rminitrd --rmmoddep --remove %{KVERREL}%{?1:.%{1}} || exit $?\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --remove-kernel %{KVERREL}%{?1} || exit $?\ +#fi\ +%{nil} + +%kernel_variant_preun +%ifarch x86_64 +%kernel_variant_post -r (kernel-smp|kernel-xen) +%else +%kernel_variant_post -r kernel-smp +%endif + +%kernel_variant_preun smp +%kernel_variant_post -v smp + +%kernel_variant_preun PAE +%kernel_variant_post -v PAE -r (kernel|kernel-smp|kernel-xen) + +%kernel_variant_preun debug +%kernel_variant_post -v debug + +%kernel_variant_post -v PAEdebug -r (kernel|kernel-smp|kernel-xen) +%kernel_variant_preun PAEdebug + +if [ -x /sbin/ldconfig ] +then + /sbin/ldconfig -X || exit $? +fi + +### +### file lists +### + +%if %{with_headers} +%files headers +%defattr(-,root,root) +/usr/include/* +%endif + +%if %{with_firmware} +%files firmware +%defattr(-,root,root) +/lib/firmware/* +%doc linux-%{kversion}.%{_target_cpu}/firmware/WHENCE +%endif + +%if %{with_bootwrapper} +%files bootwrapper +%defattr(-,root,root) +/usr/sbin/* +%{_libdir}/kernel-wrapper +%endif + +# only some architecture builds need kernel-doc +%if %{with_doc} +%files doc +%defattr(-,root,root) +%{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation/* +%dir %{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation +%dir %{_datadir}/doc/kernel-doc-%{rpmversion} +%{_datadir}/man/man9/* +%endif + +%if %{with_perf} +%files -n perf +%defattr(-,root,root) +%{_datadir}/doc/perf +/usr/sbin/perf +%{_datadir}/man/man1/* +%endif + +# This is %{image_install_path} on an arch where that includes ELF files, +# or empty otherwise. +%define elf_image_install_path %{?kernel_image_elf:%{image_install_path}} + +# +# This macro defines the %%files sections for a kernel package +# and its devel and debuginfo packages. 
+# %%kernel_variant_files [-k vmlinux] +# +%define kernel_variant_files(k:) \ +%if %{1}\ +%{expand:%%files %{?2}}\ +%defattr(-,root,root)\ +/%{image_install_path}/%{?-k:%{-k*}}%{!?-k:vmlinuz}-%{KVERREL}%{?2:.%{2}}\ +/boot/System.map-%{KVERREL}%{?2:.%{2}}\ +%if %{with_perftool}\ +/usr/libexec/perf.%{KVERREL}%{?2:.%{2}}\ +%endif\ +#/boot/symvers-%{KVERREL}%{?2:.%{2}}.gz\ +/boot/config-%{KVERREL}%{?2:.%{2}}\ +%dir /lib/modules/%{KVERREL}%{?2:.%{2}}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/kernel\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/build\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/source\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/extra\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/updates\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/weak-updates\ +%ifarch %{vdso_arches}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/vdso\ +/etc/ld.so.conf.d/kernel-%{KVERREL}%{?2:.%{2}}.conf\ +%endif\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/modules.*\ +%if %{with_dracut}\ +/boot/initramfs-%{KVERREL}%{?2:.%{2}}.img\ +%else\ +/boot/initrd-%{KVERREL}%{?2:.%{2}}.img\ +%endif\ +%{expand:%%files %{?2:%{2}-}devel}\ +%defattr(-,root,root)\ +%dir /usr/src/kernels\ +%verify(not mtime) /usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%if %{with_debuginfo}\ +%ifnarch noarch\ +%if %{fancy_debuginfo}\ +%{expand:%%files -f debuginfo%{?2}.list %{?2:%{2}-}debuginfo}\ +%else\ +%{expand:%%files %{?2:%{2}-}debuginfo}\ +%endif\ +%defattr(-,root,root)\ +%if !%{fancy_debuginfo}\ +%if "%{elf_image_install_path}" != ""\ +%{debuginfodir}/%{elf_image_install_path}/*-%{KVERREL}%{?2:.%{2}}.debug\ +%endif\ +%{debuginfodir}/lib/modules/%{KVERREL}%{?2:.%{2}}\ +%{debuginfodir}/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%endif\ +%endif\ +%endif\ +%endif\ +%{nil} + + +%kernel_variant_files %{with_up} +%kernel_variant_files %{with_smp} smp +%kernel_variant_files %{with_debug} debug +%kernel_variant_files %{with_pae} PAE +%kernel_variant_files %{with_pae_debug} PAEdebug +%kernel_variant_files -k vmlinux %{with_kdump} kdump + +# plz don't put in a version string unless you're going to tag +# and build. + +%changelog +* Thu Dec 03 2009 Kyle McMartin 2.6.31.6-162 +- ipv4-fix-null-ptr-deref-in-ip_fragment.patch: null ptr deref + bug fix. + +* Thu Dec 03 2009 Dave Airlie 2.6.31.6-161 +- rv410 LVDS on resume test fix from AMD (#541562) + +* Wed Dec 02 2009 John W. Linville 2.6.31.6-160 +- ath9k: add fixes suggested by upstream maintainer + +* Wed Dec 02 2009 Dave Airlie 2.6.31.6-159 +- drm-radeon-misc-fixes.patch: r400 LVDS, r600 digital dpms, cursor fix, tv property + +* Wed Dec 02 2009 Ben Skeggs 2.6.31.6-158 +- nouveau: more complete lvds script selection on >=G80 (rh#522690, rh#529859) +- nouveau: more complete tmds script selection on >=G80 (rh#537853) +- nouveau: TV detection fixes + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-157 +- div/0 fix harder (#540593) - also ignore unposted GPUs with no BIOS + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-156 +- drm-next: fixes LVDS resume on r4xx, div/0 on no bios (#540593) + lockup on tv-out only startup. + +* Mon Nov 30 2009 Kyle McMartin +- drm-i915-fix-sync-to-vbl-when-vga-is-off.patch: add (rhbz#541670) + +* Sun Nov 29 2009 Kyle McMartin +- Drop linux-2.6-sysrq-c.patch, made consistent upstream. 
+ +* Fri Nov 27 2009 Jarod Wilson 2.6.31.6-153 +- add device name to lirc_zilog, fixes issues w/multiple target devices +- add lirc_imon pure input mode support for onboard decode devices + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-152 +- Fix intel_tv_mode_set oops (#540218) + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-151 +- VT-d: Work around yet more HP BIOS brokenness (#536675) + +* Wed Nov 25 2009 Kyle McMartin +- dlm: fix connection close handling. + Fix by lmb, requested by fabio. + +* Wed Nov 25 2009 David Woodhouse 2.6.31.6-149 +- VT-d: Work around more HP BIOS brokenness. + +* Tue Nov 24 2009 Dave Airlie 2.6.31.6-148 +- radeon: flush HDP cache on rendering wait - fixes r600 rendercheck failure + +* Mon Nov 23 2009 Adam Jackson +- drm-default-mode.patch: Default to 1024x768 to match UMS. (#538761) + +* Mon Nov 23 2009 Roland McGrath 2.6.31.6-146 +- Fix oops in x86-32 kernel's iret handling for bogus user %cs. (#540580) + +* Fri Nov 21 2009 Kyle McMartin +- Fix up ssp' highmem fixes with fixes for arm & ppc. + +* Thu Nov 20 2009 Chris Wright 2.6.31.6-144 +- VT-d: another fallback for another BIOS bug (#524808) + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-142 +- Oops, add new patch to spec file + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-141 +- Lower debug level of fbcon handover messages (rh#538526) + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-140 +- drm-next-44c83571.patch: oops pulled the wrong tree into my f12 tree + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-139 +- nouveau: s/r fixes on chipsets using bios opcode 0x87 +- nouveau: fixes to bios opcode 0x8e +- nouveau: hopefully fix nv1x context switching issues (rh#526577) +- nouveau: support for NVA5 (GeForce G220) +- nouveau: fixes for NVAA support + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-138 +- drm-next-d56672a9.patch: fix some rn50 cloning issues + +* Wed Nov 18 2009 David Woodhouse 2.6.31.6-137 +- Actually force the IOMMU not to be used when we detect the HP/Acer bug. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-136 +- ACPI embedded controller fixes from Fedora 11. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-135 +- Scheduler fixes and latency tuning patches from F-11. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-134 +- glad to see edid retry patch was compiled. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-133 +- drm-next-984d1f3c.patch: rebase with upstream fixes - drop all merged + +* Thu Nov 12 2009 Adam Jackson +- Actually apply the EDID retry patch +- drm-edid-header-fixup.patch: Fix up some broken EDID headers (#534120) + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-130 +- Use ApplyOptionalPatch for v4l and firewire updates. +- Drop unused v4l ABI fix. + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-129 +- Linux 2.6.31.6 +- Drop merged patches: + linux-2.6-iwlwifi-reduce-noise-when-skb-allocation-fails.patch + linux-2.6-libertas-crash.patch + pci-increase-alignment-to-make-more-space.patch + acpi-revert-attach-device-to-handle-early.patch + ahci-revert-restore-sb600-sata-controller-64-bit-dma.patch + acpi-pci-fix-null-pointer-dereference-in-acpi-get-pci-dev.patch + af_unix-fix-deadlock-connecting-to-shutdown-socket.patch + keys-get_instantiation_keyring-should-inc-the-keyring-refcount.patch + netlink-fix-typo-in-initialization.patch + fs-pipe-null-ptr-deref-fix.patch + +* Wed Nov 11 2009 Justin M. Forbes 2.6.31.5-128 +- Fix KSM for i686 users. (#532215) +- Add KSM fixes from 2.6.32 + +* Sun Nov 08 2009 David Woodhouse 2.6.31.5-127 +- Apply fix for fallback when HP/Acer BIOS bug detected (#524808) +- Re-enable DMAR. 
+- Fix libertas crash due to skb pointer bug + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-126 +- Re-enable linux-2.6-die-closed-source-bios-muppets-die.patch, DMAR + still defaulting to off. + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-125 +- Disable linux-2.6-die-closed-source-bios-muppets-die.patch and + default DMAR to off (can be re-enabled with intel_iommu=on on the + command line due to last minute issues and reversion upstream.) + +* Thu Nov 05 2009 Jarod Wilson +- Add --with dbgonly rpmbuild option to build only debug kernels + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-122 +- comment out kmap atomic for now, it breaks ppc build + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-121 +- drm-radeon-fix-agp-resume.patch (#531825) + +* Thu Nov 05 2009 Kyle McMartin +- Add two patches from Soren from mingo/linux-2.6-x86.git to fix + debug_kmap_atomic prints. + +* Thu Nov 05 2009 Ben Skeggs +- nouveau: fix rh#532924 + +* Wed Nov 04 2009 Kyle McMartin +- Make JBD2_DEBUG a toggleable debug setting. Leave it the way it was. + (Double checked resulting configs, don't fret.) + +* Wed Nov 04 2009 Adam Jackson 2.6.31.5-117 +- drm-edid-retry.patch: Try DDC up to four times, like X. (#532957) + +* Wed Nov 04 2009 Chuck Ebbert 2.6.31.5-116 +- tg3 bug fixes (#527209) + +* Wed Nov 04 2009 Kyle McMartin 2.6.31.5-115 +- fs/pipe.c: fix null pointer dereference (CVE-2009-3547) + +* Wed Nov 04 2009 Ben Skeggs 2.6.31.5-114 +- nouveau: provide info userspace needs to handle low memory situations +- nouveau: fix for rh#532711 +- nouveau: add option to provide more debug info for rh#532579 +- patch only so large because of included register rename + +* Tue Nov 03 2009 Adam Jackson 2.6.31.5-113 +- drm-conservative-fallback-modes.patch: When an output is connected but + fails EDID, only add modes with refresh rates <= 60 (#514600) + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-112 +- drm-r600-lenovo-w500-fix.patch: add second patch from upstream fix + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-111 +- drm-r600-lenovo-w500-fix.patch: fix lenovo w500 acpi video kill laptop dead +- drop aspm r600 patch as correct fix should be in 110 + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-110 +- r600: fix for ring setup RMW issue. + +* Mon Nov 02 2009 John W. Linville 2.6.31.5-109 +- prism54: remove pci modinfo device table (#447047) + +* Mon Nov 02 2009 Chuck Ebbert 2.6.31.5-108 +- Enable acerhdf driver for fan speed control on Acer Aspire One notebook (#532463) + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-107 +- r600: back that out, thanks to yaneti for testing. + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-106 +- r600: ring size guesswork fix. + +* Fri Oct 30 2009 Dave Airlie 2.6.31.5-105 +- drm-radeon-agp-font-fix.patch: hopefully fix AGP coherency issue + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-104 +- drm-next-ea1495a6.patch: fix rs400 resume on my test box + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-103 +- drm-next-fc7f7119.patch: fix oops in SS code, fix multi-card, dvo. +- drm-radeon-kms-arbiter-return-ignore.patch: fix arbiter for non-VGA display + +* Tue Oct 27 2009 Chuck Ebbert +- Fix oops in VIA padlock-aes code. + +* Tue Oct 27 2009 Dave Airlie +- kms: add offb handoff patch for ppc to work + +* Tue Oct 27 2009 Ben Skeggs +- drm-nouveau.patch: misc fixes, very initial NVA8 work + +* Tue Oct 27 2009 Dave Airlie +- fix dd command lines + +* Mon Oct 26 2009 Dave Jones +- Make a 20MB initramfs file so rpm gets its diskspace calculations right. 
(#530778) + +* Mon Oct 26 2009 Dave Airlie 2.6.31.5-97 +- drm: rebase to drm-next, drop palette fix, merged upstream +- drm-intel-big-hammer.patch: drop, proper fix in 2.6.31.5 +- drm-disable-r600-aspm.patch: test patch to disable aspm on r600/r700 for now + +* Fri Oct 23 2009 Kyle McMartin 2.6.31.5-96 +- Bump NR_CPUS to 256 on x86_64. +- Add two backports (ugh, just had to go renaming perf counters to events...) + for fixing sysprof with perf. + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-95 +- re enable MSI + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-94 +- disable debug + stackprotector + +* Fri Oct 23 2009 Chuck Ebbert +- Linux 2.6.31.5 + +* Thu Oct 22 2009 Chuck Ebbert +- Fix exploitable OOPS in keyring code. (CVE-2009-3624) +- Fix kernel memory leak to userspace. (CVE-2009-3612) + +* Thu Oct 22 2009 Dave Airlie 2.6.31.5-91.rc1 +- kms: fix palette + +* Wed Oct 21 2009 Chuck Ebbert +- Disable powersave by default for AC97 audio devices. (#524414) + +* Wed Oct 21 2009 Chuck Ebbert +- Linux 2.6.31.5-rc1 +- Remove the merged HP DC7900 workaround from iommu-updates patch. +- Drop merged patch: + linux-2.6-raidlockdep.patch + +* Mon Oct 19 2009 Kyle McMartin +- af_unix-fix-deadlock-connecting-to-shutdown-socket.patch: fix for + rhbz#529626. + +* Sat Oct 17 2009 Chuck Ebbert +- Replace linux-2.6-bluetooth-autosuspend.diff with upstream version. + +* Fri Oct 16 2009 Josef Bacik +- Update btrfs to latest upstream + +* Fri Oct 16 2009 Chuck Ebbert 2.6.31.4-85 +- Fix another ACPI boot hang (#513680) + +* Fri Oct 16 2009 Ben Skeggs 2.6.31.4-84 +- nouveau: more vbios opcodes, minor fixes, hopeful fix for rh#529292 + +* Wed Oct 14 2009 Roland McGrath 2.6.31.4-83 +- Remove work-around for gcc bug #521991, now fixed. +- Build *docs non-parallel, working around kernel's makefile bugs. + +* Wed Oct 14 2009 Peter Jones +- Add scsi_register_device_handler to modules.block's symbol list so + we'll have scsi device handlers in installer images. + +* Tue Oct 13 2009 Steve Dickson 2.6.31.4-81 +- Fixed hang during NFS installs (bz 528537) + +* Tue Oct 13 2009 Chuck Ebbert 2.6.31.4-80 +- Disable 64-bit DMA on SB600 SATA controllers. + +* Tue Oct 13 2009 Kyle McMartin +- Always build perf docs, regardless of whether we build kernel-doc. + Seems rather unfair to not ship the manpages half the time. + Also, drop BuildRequires %if when not with_doc, the rules about %if + there are f*!&^ing complicated. + +* Mon Oct 12 2009 Kyle McMartin +- Build the perf manpages properly. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-77 +- Fix boot hang with ACPI on some systems. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-76 +- Linux 2.6.31.4 + +* Mon Oct 12 2009 Kyle McMartin 2.6.31.4-75.rc2 +- improve-resource-counter-scalability.patch: Fix scalability issues + on big machines, requested by prarit. + +* Mon Oct 12 2009 Jarod Wilson +- Fix irq status check bugs in lirc_ene0100 + +* Mon Oct 12 2009 Chuck Ebbert +- Fix 2.6.31 regression that caused device failures with ACPI enabled. + +* Sun Oct 11 2009 Chuck Ebbert +- Linux 2.6.31.4-rc2 +- Drop merged patch: linux-2.6-frace-fixes.patch + +* Sat Oct 10 2009 Chuck Ebbert +- Make performance counter API available to userspace programs (#527264) + +* Sat Oct 10 2009 Dave Jones +- Drop the NX kernel data patch for now. Causes no-boot on some systems. + +* Fri Oct 09 2009 Dave Jones +- Backport two critical ftrace fixes. 
+ ftrace: check for failure for all conversions + tracing: correct module boundaries for ftrace_release + +* Fri Oct 09 2009 Jarod Wilson +- Build docs sub-package again + +* Thu Oct 08 2009 Kyle McMartin 2.6.31.3-67 +- Linux 2.6.31.3 +- rebase drm-next trivially. +- dropped merged upstream patches, + - linux-2.6-fix-usb-serial-autosuspend.diff + - linux-2.6-iwlagn-modify-digital-SVR-for-1000.patch + - linux-2.6-iwlwifi-Handle-new-firmware-file-with-ucode-build-number-in-header.patch + - linux-2.6-iwlwifi-fix-debugfs-buffer-handling.patch + - linux-2.6-iwlwifi-fix-unloading-driver-while-scanning.patch + - linux-2.6-iwlwifi-remove-deprecated-6000-series-adapters.patch + - linux-2.6-iwlwifi-traverse-linklist-to-find-the-valid-OTP-block.patch + - linux-2.6-iwlwifi-update-1000-series-API-version-to-match-firmware.patch + - linux-2.6-xen-check-efer-fix.patch + - linux-2.6-xen-spinlock-enable-interrupts-only-when-blocking.patch + - linux-2.6-xen-spinlock-stronger-barrier.patch + - linux-2.6-xen-stack-protector-fix.patch + - linux-2.6.31-cpufreq-powernow-k8-oops.patch + +* Thu Oct 08 2009 Ben Skeggs +- ppc: compile nvidiafb as a module only, nvidiafb+nouveau = bang! (rh#491308) + +* Thu Oct 08 2009 Ben Skeggs 2.6.31.1-65 +- nouveau: {drm-next,context,fbcon,misc} fixes, connector forcing + +* Thu Oct 08 2009 Dave Airlie 2.6.31.1-64 +- rebase latest drm-next, fixes many s/r and r600 problems + +* Wed Oct 07 2009 Dave Jones +- Don't mark the initramfs file as a ghost. + +* Wed Oct 07 2009 Dave Jones +- Enable FUNCTION_GRAPH_TRACER on x86-64. + +* Wed Oct 07 2009 Dave Jones +- Disable CONFIG_IRQSOFF_TRACER on srostedt's recommendation. + (Adds unwanted overhead when not in use). + +* Tue Oct 6 2009 Justin M. Forbes +- virtio_blk: add support for cache flush (#526869) + +* Fri Oct 2 2009 John W. Linville +- Backport "iwlwifi: reduce noise when skb allocation fails" + +* Wed Sep 30 2009 David Woodhouse +- Update IOMMU code; mostly a bunch more workarounds for broken BIOSes. + +* Wed Sep 30 2009 Dave Airlie 2.6.31.1-56 +- revert all the arjan patches until someone tests them. + +* Tue Sep 29 2009 Steve Dickson 2.6.31.1-55 +- Updated the NFS4 pseudo root code with a fix from upstream + +* Tue Sep 29 2009 Dave Airlie 2.6.31.1-54 +- Fix broken capabilties that stopped dbus working due to copy from user + fixups. + +* Tue Sep 29 2009 Dave Airlie 2.6.31.1-53 +- drm-next-4c57edba4.patch: fix r600 dri1 memory leak and r600 bugs + +* Mon Sep 28 2009 Dave Jones 2.6.31.1-52 +- Use __builtin_object_size to validate the buffer size for copy_from_user + + associated fixes to various copy_from_user invocations. + +* Mon Sep 28 2009 Justin M. Forbes 2.6.31.1-50 +- Increase timeout for xen frontend devices to connect. + +* Sat Sep 26 2009 Chuck Ebbert 2.6.31.1-49 +- Add Xen spinlock patches to improve scalability. + +* Sat Sep 26 2009 Dave Airlie 2.6.31.1-48 +- drm-next-8ef8678c8.patch: fix intel/nouveau kms + +* Fri Sep 25 2009 Justin M. Forbes 2.6.31.1-47 +- Fix xen guest booting when NX is disabled (#525290) + +* Fri Sep 25 2009 Ben Skeggs 2.6.31.1-46 +- drm-nouveau.patch: cleanups, fixes, pre-G80 s/r fixes, init rework + +* Fri Sep 25 2009 Dave Airlie 2.6.31.1-45 +- drm-next-adea4796c.patch: fix r600 glxgears + +* Fri Sep 25 2009 Dave Airlie 2.6.31.1-44 +- bump a extra one because I accidentially CVS. 
+ +* Thu Sep 24 2009 Dave Airlie 2.6.31.1-42 +- drm-next update - fix r600 s/r, and command line mode picking and r600 tv + +* Thu Sep 24 2009 Chuck Ebbert 2.6.31.1-41 +- Linux 2.6.31.1 +- Drop patches merged upstream: + linux-2.6-kvm-vmx-check-cpl-before-emulating-debug-register-access.patch + linux-2.6-use-__pa_symbol-to-calculate-address-of-C-symbol.patch + linux-2.6-kvm-pvmmu-do-not-batch-pte-updates-from-interrupt-context.patch + linux-2.6-scsi-sd-fix-oops-during-scanning.patch + linux-2.6-scsi-sg-fix-oops-in-error-path.patch + +* Thu Sep 24 2009 Chuck Ebbert 2.6.31-40 +- Drop the modules-ro-nx patch: it's causing ftrace to be unable + to NOP out module function call tracking. (#524042) + +* Wed Sep 23 2009 Kyle McMartin 2.6.31-39 +- touch initramfs-$foo not dracut-$foo. + +* Wed Sep 23 2009 Adam Jackson 2.6.31-37 +- drm: Fix various buglets in EDID parsing. + +* Mon Sep 21 2009 Ben Skeggs +- nouveau: more on rh#522649, added some useful info to debugfs +- lots of coding style cleanups, which is the reason for the huge commit + +* Fri Sep 18 2009 Dave Jones +- %ghost the dracut initramfs file. + +* Thu Sep 17 2009 Hans de Goede +- Now that we have %%post generation of dracut images we do not need to + Require dracut-kernel anymore + +* Thu Sep 17 2009 Kyle McMartin 2.6.31-33 +- Turn off CONFIG_CC_OPTIMIZE_FOR_SIZE on ppc64 until ld decides to play nice + and generate the save/restore stubs. + +* Thu Sep 17 2009 Kristian Høgsberg +- Drop drm page-flip patch for F12. + +* Thu Sep 17 2009 Dave Jones +- cpuidle: Fix the menu governor to boost IO performance. + +* Wed Sep 16 2009 John W. Linville +- Add a few more iwl1000 support patches. +- Remove support for deprecated iwl6000 parts. + +* Wed Sep 16 2009 Eric Paris +- Do not check CAP_SYS_MODULE when networking tres to autoload a module + +* Wed Sep 16 2009 John W. Linville +- Add iwl1000 support patches. + +* Wed Sep 16 2009 Adam Jackson +- Disable hotplug interrupts on TV connectors on i915. + +* Wed Sep 16 2009 Dave Jones +- Fix NULL deref in powernow-k8 driver. 
(korg #13780) + +* Wed Sep 16 2009 Hans de Goede +- Fix lockdep warning (and potential real deadlock) in mdraid10 code, + requested for -stable, rh#515471 + +* Wed Sep 16 2009 Ben Skeggs 2.6.31-17 +- nouveau: potential fix for rh#522649 + misc other fixes + +* Tue Sep 15 2009 Chuck Ebbert +- Add unused-kernel-patches Make target, change some patches to + use ApplyOptionalPatch + +* Tue Sep 15 2009 Ben Skeggs +- nouveau: misc fixes to context-related issues, fixes some severe nv4x bugs + +* Tue Sep 15 2009 Ben Skeggs +- nouveau: temporarily disable fbcon accel, it's racing with ttm + +* Mon Sep 14 2009 Steve Dickson +- Added support for -o v4 mount parsing + +* Mon Sep 14 2009 Ben Skeggs +- nouveau: avoid PFIFO IRQ hardlock, misc LVDS mode fixes, nv5x RAMFC cleanup + +* Sun Sep 13 2009 Chuck Ebbert +- SCSI oops fixes requested for -stable + +* Fri Sep 11 2009 Dave Jones +- Apply NX/RO to modules + +* Fri Sep 11 2009 Dave Jones +- Mark kernel data section as NX + +* Fri Sep 11 2009 Ben Skeggs +- nouveau: bring in Matthew Garret's initial switchable graphics support + +* Fri Sep 11 2009 Ben Skeggs +- nouveau: fixed use of strap-based panel mode when required (rh#522649) +- nouveau: temporarily block accel on NVAC chipsets (rh#522361, rh#522575) + +* Thu Sep 10 2009 Matthew Garrett +- linux-2.6-ahci-export-capabilities.patch: Backport from upstream +- linux-2.6-rtc-show-hctosys.patch: Export the hctosys state of an rtc +- linux-2.6-rfkill-all.patch: Support for keys that toggle all rfkill state + +* Thu Sep 10 2009 Ben Skeggs +- drm-nouveau.patch: add some scaler-only modes for LVDS, GEM/TTM fixes + +* Wed Sep 09 2009 Dennis Gilmore 2.6.31-2 +- touch the dracut initrd file when using %%{with_dracut} + +* Wed Sep 09 2009 Chuck Ebbert 2.6.31-1 +- Linux 2.6.31 + +* Wed Sep 09 2009 Chuck Ebbert +- Enable VXpocket and PDaudioCF PCMCIA sound drivers. + +* Wed Sep 09 2009 Hans de Goede +- Move to %%post generation of dracut initrd, because of GPL issues surrounding + shipping a prebuild initrd +- Require grubby >= 7.0.4-1, for %%post generation + +* Wed Sep 9 2009 Steve Dickson +- Updated the NFS4 pseudo root code to the latest release. + +* Wed Sep 09 2009 Justin M. Forbes +- Revert virtio_blk to rotational mode. 
(#509383) + +* Wed Sep 09 2009 Dave Airlie 2.6.31-0.219.rc9.git +- uggh lost nouveau bits in page flip + +* Wed Sep 09 2009 Dave Airlie 2.6.31-0.218.rc9.git2 +- fix r600 oops with page flip patch (#520766) + +* Wed Sep 09 2009 Ben Skeggs +- drm-nouveau.patch: fix display resume on pre-G8x chips + +* Wed Sep 09 2009 Ben Skeggs +- drm-nouveau.patch: add getparam to know using tile_flags is ok for scanout + +* Wed Sep 09 2009 Chuck Ebbert +- 2.6.31-rc9-git2 + +* Wed Sep 9 2009 Roland McGrath 2.6.31-0.214.rc9.git1 +- compile with -fno-var-tracking-assignments, work around gcc bug #521991 + +* Wed Sep 09 2009 Dave Airlie 2.6.31-0.213.rc9.git1 +- fix two bugs in r600 kms, fencing + mobile lvds + +* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.212.rc9.git1 +- drm-nouveau.patch: fix ppc build + +* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.211.rc9.git1 +- drm-nouveau.patch: more misc fixes + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.210.rc9.git1 +- drm-page-flip.patch: rebase again + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.209.rc9.git1 +- drm-next.patch: fix r600 signal interruption return value + +* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.208.rc9.git1 +- drm-nouveau.patch: latest upstream + rebase onto drm-next + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.207.rc9.git1 +- drm-vga-arb.patch: update to avoid lockdep + add r600 support + +* Tue Sep 08 2009 Dave Airlie 2.6.31-0.206.rc9.git1 +- drm: rebase to drm-next - r600 accel + kms should start working now + +* Mon Sep 07 2009 Chuck Ebbert 2.6.31-0.205.rc9.git1 +- 2.6.31-rc9-git1 +- Temporarily hack the drm-next patch so it still applies; the result + should still be safe to build. + +* Sat Sep 05 2009 Chuck Ebbert 2.6.31-0.204.rc9 +- 2.6.31-rc9 + +* Fri Sep 04 2009 Chuck Ebbert 2.6.31-0.203.rc8.git2 +- Fix kernel build errors when building firmware by removing the + .config file before that step and restoring it afterward. + +* Thu Sep 03 2009 Adam Jackson +- drm-ddc-caching-bug.patch: Empty the connector's mode list when it's + disconnected. + +* Thu Sep 03 2009 Jarod Wilson +- Update hdpvr and lirc_zilog drivers for 2.6.31 i2c + +* Thu Sep 03 2009 Justin M.Forbes +- Fix xen guest with stack protector. (#508120) +- Small kvm fixes. + +* Wed Sep 02 2009 Adam Jackson 2.6.31-0.199.rc8.git2 +- drm-intel-pm.patch: Disable by default, too flickery on too many machines. + Enable with i915.powersave=1. + +* Wed Sep 02 2009 Dave Jones +- Add missing scriptlet dependancy. (#520788) + +* Tue Sep 01 2009 Adam Jackson +- Make DRM less chatty about EDID failures. No one cares. + +* Tue Sep 01 2009 Chuck Ebbert +- 2.6.31-rc8-git2 +- Blank out drm-intel-next: entire contents are now upstream. + +* Tue Sep 01 2009 Dave Jones +- Make firmware buildarch noarch. (Suggested by drago01 on irc) + +* Tue Sep 01 2009 Jarod Wilson +- Fix up lirc_zilog to enable functional IR transmit and receive + on the Hauppauge HD PVR +- Fix audio on PVR-500 when used in same system as HVR-1800 (#480728) + +* Sun Aug 30 2009 Chuck Ebbert +- 2.6.31-rc8-git1 +- Drop linux-2.6-inotify-accounting.patch, merged upstream. 
+ +* Sun Aug 30 2009 Jarod Wilson +- fix lirc_imon oops on older devices w/o tx ctrl ep (#520008) + +* Fri Aug 28 2009 Eric Paris 2.6.31-0.190.rc8 +- fix inotify length accounting and send inotify events + +* Fri Aug 28 2009 David Woodhouse +- Enable Solos DSL driver + +* Fri Aug 28 2009 Chuck Ebbert +- 2.6.31-rc8 + +* Thu Aug 27 2009 Chuck Ebbert 2.6.31-0.185.rc7.git6 +- 2.6.31-rc7-git6 +- Drop patch merged upstream: + xen-fb-probe-fix.patch + +* Thu Aug 27 2009 Adam Jackson +- drm-rv710-ucode-fix.patch: Treat successful microcode load on RV710 as, + you know, success. (#519718) + +* Thu Aug 27 2009 Chuck Ebbert +- 2.6.31-rc7-git5 +- Drop patch linux-2.6-ima-leak.patch, now merged upstream. + +* Wed Aug 26 2009 Jarod Wilson +- Fix up hdpvr ir enable patch for use w/modular i2c (David Engel) + +* Wed Aug 26 2009 Eric Paris +- fix iint_cache leak in IMA code + drop the ima=0 patch + +* Wed Aug 26 2009 Justin M. Forbes +- Fix munlock with KSM (#516909) +- Re-enable KSM + +* Wed Aug 26 2009 Chuck Ebbert +- 2.6.31-rc7-git4 +- Drop patches merged upstream: + xen-x86-fix-stackprotect.patch + xen-x86-no-stackprotect.patch + +* Wed Aug 26 2009 Adam Jackson +- drm-intel-next.patch: Update, various output setup fixes. + +* Wed Aug 26 2009 David Woodhouse +- Make WiMAX modular (#512070) + +* Tue Aug 25 2009 Kyle McMartin +- allow-disabling-ima.diff: debugging patch... adds ima=0 kernel + param to disable initialization of IMA. + +* Tue Aug 25 2009 Ben Skeggs 2.6.31-0.174.rc7.git2 +- drm-nouveau.patch: upstream update, pre-nv50 tv-out + misc fixes + +* Tue Aug 25 2009 Chuck Ebbert 2.6.31-0.173.rc7.git2 +- Fix Xen boot (#508120) + +* Tue Aug 25 2009 Dave Airlie +- pull in drm-next tree + rebase around it + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git2 + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git1 + +* Sat Aug 22 2009 Chuck Ebbert +- 2.6.31-rc7 + +* Thu Aug 20 2009 Mark McLoughlin +- Disable LZMA for xen (#515831) + +* Thu Aug 20 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Fix up drm-r600-kms.patch +- Drop fix-perf-make-man-failure.patch + +* Wed Aug 19 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Revert linux-2.6-debug-vm-would-have-oomkilled.patch to v1.2 + because upstream changes to oom-kill.c were all reverted. + +* Tue Aug 18 2009 Kyle McMartin +- Fix up perf so that it builds docs now that they are fixed. +- with_docs disables perf docs too. be warned. (logic is that the + build deps are (mostly) the same, so if you don't want one, odds are...) + +* Tue Aug 18 2009 Dave Jones +- 2.6.31-rc6-git3 + +* Mon Aug 17 2009 Dave Jones 2.6.31-0.161.rc6.git2 +- 2.6.31-rc6-git2 + +* Mon Aug 17 2009 Chuck Ebbert +- Stop generating the (unused) ppc64-kdump.config file. + +* Mon Aug 17 2009 Jarod Wilson +- Add new lirc driver for built-in ENE0100 device on some laptops + +* Sun Aug 16 2009 Kyle McMartin 2.6.31-0.158.rc6 +- Improve the perf script so it prints something helpful if the + perf binary doesn't exist. + +* Sat Aug 15 2009 Dave Jones 2.6.31-0.157.rc6 +- Disable KSM patches on a hunch. Chasing the "encrypted VGs don't work" bug. + +* Fri Aug 14 2009 Dave Jones 2.6.31-0.155.rc6 +- 2.6.31-rc6 + +* Wed Aug 12 2009 Kyle McMartin +- fix perf. +- move perf to perf.$ver instead of perf-$ver... + +* Wed Aug 12 2009 Dennis Gilmore +- Obsolete kernel-smp on sparc64 +- Require grubby >= 7.0.2-1 since thats what introduces the dracut options we use + +* Wed Aug 12 2009 Kristian Høgsberg +- Fix drm-page-flip.patch to not break radeon kms and to not reset + crtc offset into fb on flip. 
+ +* Wed Aug 12 2009 Adam Jackson +- Update drm-intel-next patch + +* Tue Aug 11 2009 Dennis Gilmore - 2.6.31-0.149.rc5.git3 +- disable building the -smp kernel on sparc64 +- disable building kernel-perf on sparc64 syscalls not supported + +* Tue Aug 11 2009 Eric Paris +- Enable config IMA + +* Tue Aug 11 2009 Ben Skeggs +- nouveau: various cleanups and fixes + more sanity checking in dma paths + +* Mon Aug 10 2009 Jarod Wilson +- Add new device ID to lirc_mceusb (#512483) +- Fix some lockdep false positives +- Add support for setting and enabling iMON clock via sysfs +- Add tunable pad threshold support to lirc_imon +- Add new pseudo-IR protocl to lirc_imon for universals w/o a pad +- Fix mouse device support on older iMON devices + +* Mon Aug 10 2009 David Woodhouse 2.6.31-0.145.rc5.git3 +- Merge latest Intel IOMMU fixes and BIOS workarounds, re-enable by default. + +* Sun Aug 09 2009 Kyle McMartin +- btusb autosuspend: fix build on !CONFIG_PM by stubbing out + suspend/resume methods. + +* Sat Aug 08 2009 Dennis Gilmore 2.6.31-0.141.rc5.git3 +- disable kgdb on sparc64 uni-processor kernel +- set max cpus to 256 on sparc64 +- enable AT keyboard on sparc64 + +* Fri Aug 07 2009 Justin M. Forbes +- Apply KSM updates from upstream + +* Fri Aug 07 2009 Hans de Goede +- When building a dracut generic initrd tell new-kernel-pkg to use that + instead of running mkinitrd + +* Fri Aug 07 2009 Dave Airlie 2.6.31-0.139.rc5.git3 +- drm-r600-kms.patch - update r600 KMS +- drm-radeon-fixes.patch - patches for queue to Linus + +* Thu Aug 06 2009 Justin M. Forbes 2.6.31-0.138.rc5.git3 +- Fix kvm virtio_blk errors (#514901) + +* Thu Aug 06 2009 Adam Jackson +- Hush DRM vblank warnings, they're constant (and harmless) under DRI2. + +* Thu Aug 06 2009 Dave Airlie 2.6.31.0.134.rc5.git3 +- fixup vga arb warning at startup and handover between gpus + +* Thu Aug 06 2009 Kyle McMartin 2.6.31.0.133.rc5.git3 +- die-floppy-die.patch: it's the 21st century, let's not rely on + steam powered technology. + +* Wed Aug 05 2009 Dave Airlie 2.6.31.0.132.rc5.git3 +- revert-ftrace-powerpc-snafu.patch - fix ppc build + +* Wed Aug 05 2009 Ben Skeggs +- nouveau: respect nomodeset + +* Wed Aug 05 2009 Chuck Ebbert +- Fix /usr/sbin/perf script. (#515494) + +* Wed Aug 05 2009 Dave Jones +- Fix shift in pci cacheline size printk. + +* Wed Aug 05 2009 Dave Airlie 2.6.31.0.128.rc5.git3 +- 2.6.31-rc5-git3 +- drop cpufreq + set memory fixes + +* Wed Aug 05 2009 Dave Airlie +- Add Jeromes initial r600 kms work. +- rebase arb patch + +* Tue Aug 04 2009 Kyle McMartin +- alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch: apply patch + destined for 2.6.32, requested by Lennart. + +* Tue Aug 04 2009 Ben Skeggs +- nouveau: more code share between nv50/ +- update VGA arb patches again + +* Mon Aug 03 2009 Adam Jackson +- Update intel drm from anholt's tree +- Rebase drm-intel-pm.patch to match +- Drop gen3 fb hack, merged +- Drop previous watermark setup change + +* Mon Aug 03 2009 Dave Jones 2.6.31-0.122.rc5.git2 +- 2.6.31-rc5-git2 + +* Mon Aug 03 2009 Adam Jackson +- (Attempt to) fix watermark setup on Intel 9xx parts. 
+ +* Mon Aug 03 2009 Jarod Wilson +- make usbhid driver ignore all recent SoundGraph iMON devices, so the + lirc_imon driver can grab them instead + +* Mon Aug 03 2009 Dave Airlie +- update VGA arb patches + +* Sat Aug 01 2009 David Woodhouse 2.6.31-0.118.rc5 +- Fix boot failures on ppc32 (#514010, #505071) + +* Fri Jul 31 2009 Kyle McMartin 2.6.31-0.117.rc5 +- Linux 2.6.31-rc5 + +* Fri Jul 31 2009 Matthew Garrett +- linux-2.6-dell-laptop-rfkill-fix.patch: Fix up Dell rfkill + +* Fri Jul 31 2009 Ben Skeggs +- nouveau: build against 2.6.31-rc4-git6, fix script parsing on some G8x chips + +* Thu Jul 30 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git6 + New config item: CONFIG_BATTERY_DS2782 is not set +- Add last-minute set_memory_wc() fix from LKML. + +* Thu Jul 30 2009 Matthew Garrett +- drm-intel-pm.patch: Don't reclock external outputs. Increase the reduced + clock slightly to avoid upsetting some hardware. Disable renderclock + adjustment for the moment - it's breaking on some hardware. + +* Thu Jul 30 2009 Ben Skeggs +- nouveau: another DCB 1.5 entry, G80 corruption fixes, small +- fix VGA ARB + kms + +* Wed Jul 29 2009 Dave Jones +- Add support for dracut. (Harald Hoyer) + +* Wed Jul 29 2009 Ben Skeggs +- drm-nouveau.patch: nv50/nva0 tiled scanout fixes, nv40 kms fixes + +* Wed Jul 29 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git3 +- Drop linux-2.6-ecryptfs-overflow-fixes.patch, merged upstream now. + +* Wed Jul 29 2009 Dave Airlie +- update VGA arb patches + +* Tue Jul 28 2009 Adam Jackson +- Remove the pcspkr modalias. If you're still living in 1994, load it + by hand. + +* Tue Jul 28 2009 Eric Sandeen 2.6.31-0.102.rc4.git2 +- Fix eCryptfs overflow issues (CVE-2009-2406, CVE-2009-2407) + +* Tue Jul 28 2009 Kyle McMartin 2.6.31-0.101.rc4.git2 +- 2.6.31-rc4-git2 +- rebase linux-2.6-fix-usb-serial-autosuspend.diff +- config changes: + - USB_GSPCA_SN9C20X=m (_EVDEV=y) + +* Tue Jul 28 2009 Ben Skeggs +- drm-nouveau.patch: cleanup userspace API, various bugfixes. + Looks worse than it is, register macros got cleaned up, which + touches pretty much everywhere.. + +* Mon Jul 27 2009 Adam Jackson +- Warn quieter about not finding PCI bus parents for ROM BARs, they're + not usually needed and there's nothing you can do about it anyway. + +* Mon Jul 27 2009 Matthew Garrett +- linux-2.6-alsa-improve-hda-powerdown.patch - attempt to reduce audio glitches + caused by HDA powerdown +- disable CONFIG_DEBUG_KOBJECT again for now, since it produces huge dmesg spew + +* Mon Jul 27 2009 Dave Airlie +- update vga arb code + +* Mon Jul 27 2009 Matthew Garrett +- drm-intel-pm.patch - Add runtime PM for Intel graphics + +* Fri Jul 24 2009 Kristian Høgsberg +- Add drm-page-flip.patch to support vsynced page flipping on intel + chipsets. +- Really add patch. +- Fix patch to not break nouveau. + +* Fri Jul 24 2009 Chuck Ebbert +- Enable CONFIG_DEBUG_KOBJECT in debug kernels. (#513606) + +* Thu Jul 23 2009 Kyle McMartin +- perf BuildRequires binutils-devel now. + +* Thu Jul 23 2009 Justin M. Forbes +- Add KSM support + +* Thu Jul 23 2009 Kyle McMartin 2.6.31-0.87.rc4 +- Linux 2.6.31-rc4 +- config changes: + - USB_CDC_PHONET=m [all] + - EVENT_PROFILE=y [i386, x86_64, powerpc, s390] + +* Wed Jul 22 2009 Tom "spot" Callaway +- We have to override the new %%install behavior because, well... the kernel is special. 
+ +* Wed Jul 22 2009 Dave Jones +- 2.6.31-rc3-git5 + +* Wed Jul 22 2009 Ben Skeggs 2.6.31-0.82.rc3.git4 +- Enable KMS for nouveau + +* Wed Jul 22 2009 Ben Skeggs +- Update nouveau from upstream (initial suspend/resume + misc bugfixes) + +* Mon Jul 20 2009 Adam Jackson +- Disable VGA arbiter patches for a moment + +* Mon Jul 20 2009 Adam Jackson +- Revive 4k framebuffers for intel gen3 + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.78.rc3.git4 +- Enable CONFIG_RTC_HCTOSYS (#489494) + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.77.rc3.git4 +- Don't build 586 kernels any more. + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.75.rc3.git4 +- build a 'full' package on i686 (Bill Nottingham) + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.74.rc3.git4 +- 2.6.31-rc3-git4 + +* Sat Jul 18 2009 Matthew Garrett +- linux-2.6-driver-level-usb-autosuspend.diff - allow drivers to enable autopm +- linux-2.6-fix-usb-serial-autosuspend.diff - fix generic usb-serial autopm +- linux-2.6-qcserial-autosuspend.diff - enable autopm by default on qcserial +- linux-2.6-bluetooth-autosuspend.diff - enable autopm by default on btusb +- linux-2.6-usb-uvc-autosuspend.diff - enable autopm by default on uvc + +* Thu Jul 16 2009 Chuck Ebbert +- 2.6.31-rc3-git3 + +* Thu Jul 16 2009 Matthew Garrett +- linux-2.6-defaults-aspm.patch - default ASPM to on for PCIe >= 1.1 hardware + +* Thu Jul 16 2009 Dave Airlie 2.6.31-0.69.rc3 +- linux-2.6-vga-arb.patch - add VGA arbiter. +- drm-vga-arb.patch - add VGA arbiter support to drm + +* Tue Jul 14 2009 Kyle McMartin 2.6.31-0.68-rc3 +- 2.6.31-rc3 +- config changes: + - RTL8192SU is not set, (staging) + +* Mon Jul 13 2009 Kyle McMartin 2.6.31-0.67.rc2.git9 +- 2.6.31-rc2-git9 +- config changes: + - BLK_DEV_OSD=m + +* Mon Jul 13 2009 Ben Skeggs +- drm-nouveau.patch: update from upstream + +* Fri Jul 10 2009 Chuck Ebbert +- 2.6.31-rc2-git6 +- Drop dmadebug-spinlock patch -- merged upstream. + +* Fri Jul 10 2009 Dave Jones 2.6.31-0.64.rc2.git5 +- Don't jump through hoops that ppc powerbooks have to on sensible systems + in cpufreq_suspend. + +* Fri Jul 10 2009 Dave Jones +- 2.6.31-rc2-git5 + +* Thu Jul 09 2009 Dave Jones 2.6.31-0.62.rc2.git4 +- Use correct spinlock initialization in dma-debug + +* Thu Jul 09 2009 Chuck Ebbert 2.6.31-0.61.rc2.git4 +- 2.6.31-rc2-git4 + +* Thu Jul 09 2009 Jarod Wilson +- Enable IR receiver on the Hauppauge HD PVR +- Trim the changelog, axing everything before 2.6.29 (see cvs + if you still really want to see that far back) + +* Wed Jul 08 2009 Dave Jones +- Enable a bunch of debugging options that were missed somehow. + +* Wed Jul 08 2009 Kyle McMartin +- Bump NR_CPUS on x86_64 to 512. + +* Wed Jul 08 2009 Adam Jackson +- drm-no-gem-on-i8xx.patch: Drop, intel 2D driver requires GEM now. This + should be entertaining. + +* Wed Jul 08 2009 Kyle McMartin +- First cut of /usr/sbin/perf wrapper script and 'perf' + subpackage. + +* Wed Jul 08 2009 Kyle McMartin 2.6.31-0.54.rc2.git2 +- Rebase and re-apply all the Fedora-specific linux-2.6-debug-* + patches. +- Cull a bunch of upstreamed patches from the spec. + +* Wed Jul 08 2009 Steve Dickson +- Added NFSD v4 dynamic pseudo root patch which allows + NFS v3 exports to be mounted by v4 clients. + +* Tue Jul 07 2009 Jarod Wilson +- See if we can't make lirc_streamzap behave better... (#508952) + +* Tue Jul 07 2009 Chuck Ebbert 2.6.31-0.47.rc2.git2 +- 2.6.31-rc2-git2 + +* Tue Jul 07 2009 Jarod Wilson +- Make lirc_i2c actually work with 2.6.31 i2c + +* Mon Jul 06 2009 Chuck Ebbert +- Use LZMA for kernel compression on X86. 
+ +* Mon Jul 06 2009 Jarod Wilson +- Hack up lirc_i2c and lirc_zilog to compile with 2.6.31 i2c + changes. The drivers might not actually be functional now, but + at least they compile again. Will fix later, if need be... + +* Sat Jul 04 2009 Dave Jones 2.6.31-0.42.rc2 +- 2.6.31-rc2 + +* Sat Jul 04 2009 Chuck Ebbert +- 2.6.31-rc1-git11 + +* Fri Jul 03 2009 Hans de Goede +- Disable v4l1 ov511 and quickcam_messenger drivers (obsoleted by + v4l2 gspca subdrivers) + +* Thu Jul 02 2009 Kyle McMartin 2.6.31-0.39.rc1.git9 +- 2.6.31-rc1-git9 +- linux-2.6-dm-fix-exstore-search.patch: similar patch merged upstream. + +* Tue Jun 30 2009 Chuck Ebbert 2.6.31-0.38.rc1.git7 +- 2.6.31-rc1-git7 + +* Tue Jun 30 2009 Dave Jones 2.6.31-0.37.rc1.git5 +- Disable kmemleak. Way too noisy, and not finding any real bugs. + +* Tue Jun 30 2009 Ben Skeggs +- drm-nouveau.patch: match upstream + +* Mon Jun 29 2009 Chuck Ebbert 2.6.31-0.35.rc1.git5 +- 2.6.31-rc1-git5 +- CONFIG_LEDS_LP3944=m + +* Mon Jun 29 2009 Chuck Ebbert +- Try to fix the dm overlay bug for real (#505121) + +* Sat Jun 27 2009 Ben Skeggs 2.6.31-0.33.rc1.git2 +- drm-nouveau.patch: fix conflicts from 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.31.rc1.git2 +- Further improvements to kmemleak + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.30.rc1.git2 +- 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Ben Skeggs +- drm-nouveau.patch: latest upstream + reenable + +* Thu Jun 25 2009 Dave Jones 2.6.31-0.29.rc1 +- Make kmemleak scan process stacks by default. + Should reduce false positives (which does also increase false negatives, + but that's at least less noisy) + +* Wed Jun 24 2009 Kyle McMartin 2.6.31-0.28.rc1 +- 2.6.31-rc1 +- linux-2.6-utrace.patch: rebase on kernel/Makefile changes +- config changes: + - generic: + - CONFIG_DM_LOG_USERSPACE=m + - CONFIG_DM_MULTIPATH_QL=m + - CONFIG_DM_MULTIPATH_ST=m + - CONFIG_BATTERY_MAX17040=m + - CONFIG_I2C_DESIGNWARE is off (depends on clk.h) + +* Wed Jun 24 2009 Kyle McMartin +- Move perf to /usr/libexec/perf-$KernelVer. + +* Wed Jun 24 2009 Kyle McMartin +- config changes: + - generic: + - CONFIG_SCSI_DEBUG=m (was off, requested by davidz) + +* Wed Jun 24 2009 Dave Jones 2.6.31-0.22.rc0.git22 +- 2.6.30-git22 + +* Tue Jun 23 2009 Dave Jones 2.6.31-0.22.rc0.git20 +- 2.6.30-git20 + +* Mon Jun 22 2009 Kyle McMartin 2.6.31-0.24.rc0.git18 +- Enable tools/perf, installed as /bin/perf-$KernelVer. Docs and a /bin/perf + wrapper come next if this builds ok. + +* Mon Jun 22 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: pull in + two fixes from Mike Galbraith from tip.git + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.21.rc0.git18 +- Add patch to possibly fix the pktlen problem on via-velocity. + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.20.rc0.git18 +- 2.6.30-git18 + VIA crypto & mmc patches now upstream. + +* Sun Jun 21 2009 Dave Jones +- Determine cacheline sizes in a generic manner. + +* Sun Jun 21 2009 Chuck Ebbert 2.6.31-0.18.rc0.git17 +- 2.6.30-git17 +- Config changes: + - powerpc32-generic + CONFIG_PERF_COUNTERS=y + - generic + CONFIG_KEYBOARD_LM8323 is not set + CONFIG_MOUSE_SYNAPTICS_I2C=m + CONFIG_TOUCHSCREEN_EETI=m + CONFIG_TOUCHSCREEN_W90X900=m +- Dropped agp-set_memory_ucwb.patch, all fixed upstream now. + +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.17.rc0.git15 +- config changes: + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR=y (switched... chrp fails otherwise, stack + frame size.) 
+ +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.16.rc0.git15 +- 2.6.30-git15 +- config changes: + - generic: + - CONFIG_LBDAF=y + - staging: + - CONFIG_USB_SERIAL_QUATECH2 is not set + - CONFIG_VT6655 is not set + - CONFIG_USB_CPC is not set + - CONFIG_RDC_17F3101X is not set + - CONFIG_FB_UDL is not set + - ppc32: + - CONFIG_KMETER1=y + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR is not set +- lirc disabled due to i2c detach_client removal. + +* Sat Jun 20 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: add, + queued in tip/sched/core (ca94c442535a44d508c99a77e54f21a59f4fc462) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31.0.15.rc0.git14 +- Fix up ptrace, hopefully. Builds on x86_64 at least. + +* Fri Jun 19 2009 Chuck Ebbert +- linux-2.6-tip.git-203abd67b75f7714ce98ab0cdbd6cfd7ad79dec4.patch + Fixes oops on boot with qemu (#507007) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31-0.13.rc0.git14 +- 2.6.30-git14 + +* Fri Jun 19 2009 Chuck Ebbert +- Fix up the via-sdmmc and via-hwmon-temp-sensor patches. +- Drop VIA Padlock patches merged upstream: + via-rng-enable-64bit.patch + via-padlock-10-enable-64bit.patch + via-padlock-20-add-x86-dependency.patch + +* Thu Jun 18 2009 Kyle McMartin 2.6.31-0.11.rc0.git13 +- 2.6.30-git13 +- config changes: + - arm: + - CONFIG_UACCESS_WITH_MEMCPY is not set + - i686-PAE: + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - ia64: + - CONFIG_RCU_FANOUT=64 + - nodebug: + - CONFIG_DEBUG_KMEMLEAK is not set + - CONFIG_DEBUG_KMEMLEAK_TEST=m + - powerpc: + - CONFIG_CAN_SJA1000_OF_PLATFORM=m + - CONFIG_PPC_EMULATED_STATS=y + - CONFIG_SWIOTLB=y + - CONFIG_RDS is not set (broken on ppc32) + - powerpc32: + - CONFIG_RCU_FANOUT=32 + - powerpc64: + - CONFIG_RCU_FANOUT=64 + - CONFIG_PERF_COUNTERS=y + - s390x: + - CONFIG_RCU_FANOUT=64 + - CONFIG_SECCOMP=y + - CONFIG_PM=y + - CONFIG_HIBERNATION=y + - CONFIG_PM_STD_PARTITION="/dev/jokes" + - sparc64: + - CONFIG_RCU_FANOUT=64 + - x86: + - CONFIG_RCU_FANOUT=32 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_OLD_MCE is not set + - CONFIG_X86_MCE_INTEL=y + - CONFIG_X86_MCE_AMD=y + - CONFIG_X86_ANCIENT_MCE is not set + - CONFIG_X86_MCE_INJECT is not set + - x86_64: + - CONFIG_EDAC_AMD64=m + - CONFIG_EDAC_AMD64_ERROR_INJECTION is not set + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - CONFIG_RCU_FANOUT=64 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_MCE_INJECT is not set + - generic: + - CONFIG_RCU_FANOUT=32 + - CONFIG_MMC_SDHCI_PLTFM=m + - CONFIG_MMC_CB710=m + - CONFIG_CB710_CORE=m + - CONFIG_CB710_DEBUG is not set + - CONFIG_SCSI_MVSAS_DEBUG is not set + - CONFIG_SCSI_BNX2_ISCSI=m + - CONFIG_NETFILTER_XT_MATCH_OSF=m + - CONFIG_RFKILL_INPUT=y (used to be =m, which was invalid) + - CONFIG_DE2104X_DSL=0 + - CONFIG_KS8842 is not set + - CONFIG_CFG80211_DEBUGFS=y + - CONFIG_MAC80211_DEFAULT_PS=y + - CONFIG_IWM=m + - CONFIG_IWM_DEBUG is not set + - CONFIG_RT2800USB=m + - CONFIG_CAN_DEV=m + - CONFIG_CAN_CALC_BITTIMING=y + - CONFIG_CAN_SJA1000=m + - CONFIG_CAN_SJA1000_PLATFORM=m + - CONFIG_CAN_EMS_PCI=m + - CONFIG_CAN_KVASER_PCI=m + - CONFIG_EEPROM_MAX6875=m + - CONFIG_SENSORS_TMP401=m + - CONFIG_MEDIA_SUPPORT=m + - CONFIG_SND_CTXFI=m + - CONFIG_SND_LX6464ES=m + - CONFIG_SND_HDA_CODEC_CA0110=y + - CONFIG_USB_XHCI_HCD=m + - CONFIG_USB_XHCI_HCD_DEBUGGING is not set + - CONFIG_DRAGONRISE_FF=y (used to be =m) + - CONFIG_GREENASIA_FF=y (used to be =m) + - CONFIG_SMARTJOYPLUS_FF=y (used to be =m) + - CONFIG_USB_NET_INT51X1=m + - 
CONFIG_CUSE=m + - CONFIG_FUNCTION_PROFILER=y + - CONFIG_RING_BUFFER_BENCHMARK=m + - CONFIG_REGULATOR_USERSPACE_CONSUMER=m + - CONFIG_REGULATOR_MAX1586=m + - CONFIG_REGULATOR_LP3971=m + - CONFIG_RCU_FANOUT_EXACT is not set + - CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 + - CONFIG_FSNOTIFY=y + - CONFIG_IEEE802154=m + - CONFIG_IEEE802154_DRIVERS=m + - CONFIG_IEEE802154_FAKEHARD=m + - CONFIG_CNIC=m + +* Wed Jun 17 2009 Jarod Wilson +- New lirc_imon hotness, update 2: + * support dual-interface devices with a single lirc device + * directional pad functions as an input device mouse + * touchscreen devices finally properly supported + * support for using MCE/RC-6 protocol remotes + * fix oops in RF remote association code (F10 bug #475496) + * fix re-enabling case/panel buttons and/or knobs +- Add some misc additional lirc_mceusb2 transceiver IDs +- Add missing unregister_chrdev_region() call to lirc_dev exit +- Add it8720 support to lirc_it87 + +* Tue Jun 16 2009 Chuck Ebbert +- Update via-sdmmc driver + +* Mon Jun 15 2009 Jarod Wilson +- Update lirc patches w/new imon hotness + +* Fri Jun 12 2009 Chuck Ebbert +- Update VIA temp sensor and mmc drivers. + +* Fri Jun 12 2009 John W. Linville 2.6.30-6 +- neigh: fix state transition INCOMPLETE->FAILED via Netlink request +- enable CONFIG_ARPD (used by OpenNHRP) + +* Wed Jun 10 2009 Chuck Ebbert +- VIA Nano updates: + Enable Padlock AES encryption and random number generator on x86-64 + Add via-sdmmc and via-cputemp drivers + +* Wed Jun 10 2009 Kyle McMartin 2.6.30-1 +- Linux 2.6.30 rebase. + +* Tue Jun 09 2009 John W. Linville +- Clean-up some wireless bits in config-generic + +* Tue Jun 09 2009 Chuck Ebbert +- Add support for ACPI P-states on VIA processors. +- Disable the e_powersaver driver. + +* Tue Jun 09 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git6 + +* Fri Jun 05 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git1 + +* Wed Jun 03 2009 Kyle McMartin +- Linux 2.6.30-rc8 + +* Tue Jun 2 2009 Roland McGrath +- utrace update (fixes stap PR10185) + +* Tue Jun 02 2009 Dave Jones +- For reasons unknown, RT2X00 driver was being built-in. + Make it modular. + +* Tue Jun 02 2009 Dave Jones +- 2.6.30-rc7-git5 + +* Sat May 30 2009 Dave Jones +- 2.6.30-rc7-git4 + +* Thu May 28 2009 Dave Jones +- 2.6.30-rc7-git2 + +* Tue May 26 2009 Dave Jones +- Various cpufreq patches from git. + +* Tue May 26 2009 Dave Jones +- 2.6.30-rc7-git1 + +* Mon May 25 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: drop patch, issue is fixed upstream. + +* Sat May 23 2009 Dave Jones +- 2.6.30-rc7 + +* Thu May 21 2009 Dave Jones +- 2.6.30-rc6-git6 + +* Wed May 20 2009 Chuck Ebbert +- Enable Divas (formerly Eicon) ISDN drivers on x86_64. (#480837) + +* Wed May 20 2009 Dave Jones +- 2.6.30-rc6-git5 + +* Mon May 18 2009 Dave Jones +- 2.6.30-rc6-git3 + +* Sun May 17 2009 Dave Jones +- 2.6.30-rc6-git2 + +* Sat May 16 2009 Dave Jones +- 2.6.30-rc6 + +* Mon May 11 2009 Kyle McMartin +- Linux 2.6.30-rc5-git1 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc5 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc4-git4 + +* Wed May 06 2009 Kyle McMartin +- Linux 2.6.30-rc4-git3 +- linux-2.6-cdrom-door-status.patch: merged upstream. +- linux-2.6-iwl3945-remove-useless-exports.patch: merged upstream. 
+- linux-2.6-utrace.patch: rebase against changes to fs/proc/array.c +- USB_NET_CDC_EEM=m + +* Fri May 01 2009 Eric Sandeen +- Fix ext4 corruption on partial write into prealloc block + +* Thu Apr 30 2009 Kyle McMartin +- 2.6.30-rc4 + +* Wed Apr 29 2009 Dave Jones +- 2.6.30-rc3-git6 + +* Tue Apr 28 2009 Dave Jones +- 2.6.30-rc3-git4 + +* Tue Apr 28 2009 Chuck Ebbert +- Make the kernel-vanilla package buildable again. +- Allow building with older versions of RPM. + +* Tue Apr 28 2009 Neil Horman +- Backport missing snmp stats (bz 492391) + +* Tue Apr 28 2009 Chuck Ebbert 2.6.30-0.72.rc3.git3 +- Drop unused exports from the iwl3945 driver. + +* Tue Apr 28 2009 Chuck Ebbert +- Linux 2.6.30-rc3-git3 + +* Mon Apr 27 2009 Dave Jones +- 2.6.30-rc3-git2 + +* Sun Apr 26 2009 Chuck Ebbert 2.6.30-0.68.rc3.git1 +- Linux 2.6.30-rc3-git1 + +* Wed Apr 22 2009 Dave Jones 2.6.30-0.67.rc3 +- Disable SYSFS_DEPRECATED on ia64 + +* Wed Apr 22 2009 Kyle McMartin +- Linux 2.6.30-rc3 +- PROC_VMCORE=y: Exports the dump image of crashed + kernel in ELF format + +* Wed Apr 22 2009 Neil Horman +- Enable RELOCATABLE and CRASH_DUMP for powerpc64 +- With this we can remove the -kdump build variant +- for the ppc64 arch + +* Tue Apr 21 2009 Chuck Ebbert +- Don't include the modules.*.bin files in the RPM package. + +* Tue Apr 21 2009 Dave Jones +- 2.6.30-rc2-git7 + +* Mon Apr 20 2009 Dave Jones +- Various s390x config tweaks. (#496596, #496601, #496605, #496607) + +* Mon Apr 20 2009 Dave Jones +- 2.6.30-rc2-git6 + +* Sat Apr 18 2009 Chuck Ebbert +- Set CONFIG_UEVENT_HELPER_PATH to the empty string (#496296) + +* Fri Apr 17 2009 Dave Jones +- 2.6.30-rc2-git3 + +* Thu Apr 16 2009 Kyle McMartin 2.6.30-0.58.rc2.git1 +- 2.6.30-rc2-git1 + +* Wed Apr 15 2009 Kyle McMartin 2.6.30-0.57.rc2 +- 2.6.30-rc2 + +* Tue Apr 14 2009 Kyle McMartin +- 2.6.30-rc1-git7 +- CONFIG_TOUCHSCREEN_AD7879_I2C=m +- CONFIG_STRIP_ASM_SYMS=y, off for -debug + +* Mon Apr 13 2009 Kyle McMartin +- ppc-fix-parport_pc.patch: add from linuxppc-dev@ + +* Mon Apr 13 2009 Kyle McMartin +- execshield: fix build (load_user_cs_desc is 32-bit only in tlb.c) + +* Sun Apr 12 2009 Kyle McMartin +- 2.6.30-rc1-git5 +- revert-fix-modules_install-via-nfs.patch: reverted upstream + +* Thu Apr 09 2009 Kyle McMartin +- actually drop utrace-ftrace from srpm. + +* Thu Apr 09 2009 Kyle McMartin +- 2.6.30-rc1-git2 +- CONFIG_IGBVF=m +- CONFIG_NETFILTER_XT_TARGET_LED=m + +* Thu Apr 09 2009 Dave Jones +- Bring back the /dev/crash driver. (#492803) + +* Wed Apr 08 2009 Dave Jones +- disable MMIOTRACE in non-debug builds (#494584) + +* Wed Apr 08 2009 Kyle McMartin 2.6.30-0.44.rc1 +- 2.6.30-rc1 +- linux-2.6-hwmon-atk0110.patch: drop +- CONFIG_DETECT_HUNG_TASK=y +- # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set + +* Tue Apr 7 2009 Roland McGrath +- utrace update, drop unfinished utrace-ftrace + +* Tue Apr 07 2009 Kyle McMartin +- Linux 2.6.29-git15 +- EXT3_DEFAULTS_TO_ORDERED on for now. +- X86_X2APIC enabled. +- LEDS_LP5521, LEDS_BD2802 off... look not generally relevant. +- LIBFCOE on. 
+ +* Tue Apr 07 2009 Dave Jones +- Enable CONFIG_CIFS_STATS (#494545) + +* Mon Apr 06 2009 Kyle McMartin +- linux-2.6-execshield.patch: rebase for 2.6.30 + +* Mon Apr 06 2009 Kyle McMartin +- Linux 2.6.29-git13 +- drop patches merged upstream: + - fix-ppc-debug_kmap_atomic.patch + - fix-staging-at76.patch + - linux-2.6-acpi-video-didl-intel-outputs.patch + - linux-2.6-acpi-strict-resources.patch + - linux-2.6-sony-laptop-rfkill.patch + - linux-2.6-btrfs-fix-umount-hang.patch + - linux-2.6-fiemap-header-install.patch + - linux-2.6-debug-dma-api.patch + - dma-api-debug-fixes.patch + - linux-2.6-ext4-flush-on-close.patch + - linux-2.6-relatime-by-default.patch + - linux-2.6-pci-sysfs-remove-id.patch + - linux-2.6-scsi-cpqarray-set-master.patch + - alsa-rewrite-hw_ptr-updaters.patch + - alsa-pcm-always-reset-invalid-position.patch + - alsa-pcm-fix-delta-calc-at-overlap.patch + - alsa-pcm-safer-boundary-checks.patch + - linux-2.6-input-hid-extra-gamepad.patch + - linux-2.6-ipw2x00-age-scan-results-on-resume.patch + - linux-2.6-dropwatch-protocol.patch + - linux-2.6-net-fix-gro-bug.patch + - linux-2.6-net-fix-another-gro-bug.patch + - linux-2.6-net-xfrm-fix-spin-unlock.patch + - linux-2.6.29-pat-change-is_linear_pfn_mapping-to-not-use-vm_pgoff.patch + - linux-2.6.29-pat-pci-change-prot-for-inherit.patch + +* Thu Apr 02 2009 Josef Bacik +- linux-2.6-btrfs-fix-umount-hang.patch: fix umount hang on btrfs + +* Thu Apr 02 2009 Kyle McMartin +- fix-ppc-debug_kmap_atomic.patch: fix build failures on ppc. + +* Thu Apr 02 2009 Kyle McMartin +- Linux 2.6.29-git9 + +* Tue Mar 31 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: add +- at76-netdev_ops.patch: add + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git8 +- linux-2.6-net-fix-another-gro-bug.patch: upstream. + +* Tue Mar 31 2009 Eric Sandeen +- add fiemap.h to kernel-headers +- build ext4 (and jbd2 and crc16) into the kernel + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git7 +- fix-staging-at76.patch: pull patch from linux-wireless to fix... + +* Mon Mar 30 2009 Kyle McMartin 2.6.30-0.28.rc0.git6 +- Linux 2.6.29-git6 +- Bunch of stuff disabled, most merged, some needs rebasing. + +* Mon Mar 30 2009 Chuck Ebbert +- Make the .shared-srctree file a list so more than two checkouts + can share source files. + +* Mon Mar 30 2009 Chuck Ebbert +- Separate PAT fixes that are headed for -stable from our out-of-tree ones. + +* Mon Mar 30 2009 Dave Jones +- Make io schedulers selectable at boot time again. (#492817) + +* Mon Mar 30 2009 Dave Jones +- Add a strict-devmem=0 boot argument (#492803) + +* Mon Mar 30 2009 Adam Jackson +- linux-2.6.29-pat-fixes.patch: Fix PAT/GTT interaction + +* Mon Mar 30 2009 Mauro Carvalho Chehab +- some fixes of troubles caused by v4l2 subdev conversion + +* Mon Mar 30 2009 Mark McLoughlin 2.6.29-21 +- Fix guest->remote network stall with virtio/GSO (#490266) + +* Mon Mar 30 2009 Ben Skeggs +- drm-nouveau.patch + - rewrite nouveau PCI(E) GART functions, should fix rh#492492 + - kms: kernel option to allow dual-link dvi + - modinfo descriptions for module parameters + +* Sun Mar 29 2009 Mauro Carvalho Chehab +- more v4l/dvb updates: v4l subdev conversion and some driver improvements + +* Sun Mar 29 2009 Chuck Ebbert +- More fixes for ALSA hardware pointer updating. 
+ +* Sat Mar 28 2009 Mauro Carvalho Chehab +- linux-2.6-revert-dvb-net-kabi-change.patch: attempt to fix dvb net breakage +- update v4l fixes patch to reflect what's ready for 2.6.30 +- update v4l devel patch to reflect what will be kept on linux-next for a while + +* Fri Mar 27 2009 Chuck Ebbert 2.6.29-16 +- Fix 2.6.29 networking lockups. +- Fix locking in net/xfrm/xfrm_state.c (#489764) + +* Fri Mar 27 2009 Ben Skeggs +- drm-nouveau.patch: do nothing for dac_{prepare,commit}, it's useless + and breaks some things in strange ways. + +* Fri Mar 27 2009 Ben Skeggs +- nv50: clear 0x1900/8 on init, possible fix for rh#492240 +- forcibly disable GEM also if KMS requested where not supported +- inform the user if we disable KMS because of it not being supported + +* Thu Mar 26 2009 Matthew Garrett +- linux-2.6-relatime-by-default.patch: Backport relatime code from 2.6.30 + +* Thu Mar 26 2009 Dave Jones +- Check for modesetting enabled before forcing mode on 915. (#490336) + +* Thu Mar 26 2009 Dave Jones +- Set kernel-PAE as default in grub. (#487578) + +* Thu Mar 26 2009 Dave Jones +- Enable CONFIG_MOUSE_PS2_ELANTECH (#492163) + +* Thu Mar 26 2009 Kyle McMartin +- linux-2.6-v4l-pvrusb2-fixes.patch: fix build for uncle steve. + +* Thu Mar 26 2009 Mauro Carvalho Chehab +- Move all 2.6.30 stuff into linux-2.6-v4l-dvb-fixes.patch, in + preparation for upstream pull; +- Added two new drivers: gspca sq905c and DVB Intel ce6230 +- Updated to the latest v4l-dvb drivers. + +* Wed Mar 25 2009 Mauro Carvalho Chehab +- remove duplicated Cinergy T2 entry at config-generic + +* Wed Mar 25 2009 Neil Horman +- Add dropmonitor/dropwatch protocol from 2.6.30 + +* Wed Mar 25 2009 Kyle McMartin +- alsa-rewrite-hw_ptr-updaters.patch: snd_pcm_update_hw_ptr() tries to + detect the unexpected hwptr jumps more strictly to avoid the position + mess-up, which often results in the bad quality I/O with pulseaudio. + +* Wed Mar 25 2009 Ben Skeggs +- drm-nouveau.patch: idle channels better before destroying them + +* Tue Mar 24 2009 Kyle McMartin +- Disable DMAR by default until suspend & resume is fixed. + +* Tue Mar 24 2009 Josef Bacik +- fsync replay fixes for btrfs + +* Mon Mar 23 2009 Dave Jones +- 2.6.29 + +### +# The following Emacs magic makes C-c C-e use UTC dates. 
+# Local Variables: +# rpm-change-log-uses-utc: t +# End: +### diff --git a/linux-2.6-btrfs-upstream.patch b/linux-2.6-btrfs-upstream.patch new file mode 100644 index 000000000..d309773f2 --- /dev/null +++ b/linux-2.6-btrfs-upstream.patch @@ -0,0 +1,10828 @@ +diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c +index f128427..3616042 100644 +--- a/fs/btrfs/acl.c ++++ b/fs/btrfs/acl.c +@@ -27,7 +27,7 @@ + #include "btrfs_inode.h" + #include "xattr.h" + +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + + static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) + { +@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { + .set = btrfs_xattr_acl_access_set, + }; + +-#else /* CONFIG_FS_POSIX_ACL */ ++#else /* CONFIG_BTRFS_FS_POSIX_ACL */ + + int btrfs_acl_chmod(struct inode *inode) + { +@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) + return 0; + } + +-#endif /* CONFIG_FS_POSIX_ACL */ ++#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ +diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c +index 019e8af..c0861e7 100644 +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -48,6 +48,9 @@ struct btrfs_worker_thread { + /* number of things on the pending list */ + atomic_t num_pending; + ++ /* reference counter for this struct */ ++ atomic_t refs; ++ + unsigned long sequence; + + /* protects the pending list. */ +@@ -61,6 +64,51 @@ struct btrfs_worker_thread { + }; + + /* ++ * btrfs_start_workers uses kthread_run, which can block waiting for memory ++ * for a very long time. It will actually throttle on page writeback, ++ * and so it may not make progress until after our btrfs worker threads ++ * process all of the pending work structs in their queue ++ * ++ * This means we can't use btrfs_start_workers from inside a btrfs worker ++ * thread that is used as part of cleaning dirty memory, which pretty much ++ * involves all of the worker threads. ++ * ++ * Instead we have a helper queue who never has more than one thread ++ * where we scheduler thread start operations. This worker_start struct ++ * is used to contain the work and hold a pointer to the queue that needs ++ * another worker. ++ */ ++struct worker_start { ++ struct btrfs_work work; ++ struct btrfs_workers *queue; ++}; ++ ++static void start_new_worker_func(struct btrfs_work *work) ++{ ++ struct worker_start *start; ++ start = container_of(work, struct worker_start, work); ++ btrfs_start_workers(start->queue, 1); ++ kfree(start); ++} ++ ++static int start_new_worker(struct btrfs_workers *queue) ++{ ++ struct worker_start *start; ++ int ret; ++ ++ start = kzalloc(sizeof(*start), GFP_NOFS); ++ if (!start) ++ return -ENOMEM; ++ ++ start->work.func = start_new_worker_func; ++ start->queue = queue; ++ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); ++ if (ret) ++ kfree(start); ++ return ret; ++} ++ ++/* + * helper function to move a thread onto the idle list after it + * has finished some requests. 
+ */ +@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; +- list_move(&worker->worker_list, &worker->workers->idle_list); ++ ++ /* the list may be empty if the worker is just starting */ ++ if (!list_empty(&worker->worker_list)) { ++ list_move(&worker->worker_list, ++ &worker->workers->idle_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } +@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; +- list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ ++ if (!list_empty(&worker->worker_list)) { ++ list_move_tail(&worker->worker_list, ++ &worker->workers->worker_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } + +-static noinline int run_ordered_completions(struct btrfs_workers *workers, +- struct btrfs_work *work) ++static void check_pending_worker_creates(struct btrfs_worker_thread *worker) + { ++ struct btrfs_workers *workers = worker->workers; + unsigned long flags; + ++ rmb(); ++ if (!workers->atomic_start_pending) ++ return; ++ ++ spin_lock_irqsave(&workers->lock, flags); ++ if (!workers->atomic_start_pending) ++ goto out; ++ ++ workers->atomic_start_pending = 0; ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) ++ goto out; ++ ++ workers->num_workers_starting += 1; ++ spin_unlock_irqrestore(&workers->lock, flags); ++ start_new_worker(workers); ++ return; ++ ++out: ++ spin_unlock_irqrestore(&workers->lock, flags); ++} ++ ++static noinline int run_ordered_completions(struct btrfs_workers *workers, ++ struct btrfs_work *work) ++{ + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + + while (1) { + if (!list_empty(&workers->prio_order_list)) { +@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + list_del(&work->order_list); + work->ordered_free(work); + } + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + return 0; + } + ++static void put_worker(struct btrfs_worker_thread *worker) ++{ ++ if (atomic_dec_and_test(&worker->refs)) ++ kfree(worker); ++} ++ ++static int try_worker_shutdown(struct btrfs_worker_thread *worker) ++{ ++ int freeit = 0; ++ ++ spin_lock_irq(&worker->lock); ++ spin_lock(&worker->workers->lock); ++ if (worker->workers->num_workers > 1 && ++ worker->idle && ++ !worker->working && ++ !list_empty(&worker->worker_list) && ++ list_empty(&worker->prio_pending) && ++ list_empty(&worker->pending) && ++ atomic_read(&worker->num_pending) == 0) { ++ freeit = 1; ++ list_del_init(&worker->worker_list); ++ worker->workers->num_workers--; ++ } ++ spin_unlock(&worker->workers->lock); ++ spin_unlock_irq(&worker->lock); ++ ++ if (freeit) ++ put_worker(worker); ++ return freeit; ++} ++ ++static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, ++ struct list_head *prio_head, ++ struct 
list_head *head) ++{ ++ struct btrfs_work *work = NULL; ++ struct list_head *cur = NULL; ++ ++ if(!list_empty(prio_head)) ++ cur = prio_head->next; ++ ++ smp_mb(); ++ if (!list_empty(&worker->prio_pending)) ++ goto refill; ++ ++ if (!list_empty(head)) ++ cur = head->next; ++ ++ if (cur) ++ goto out; ++ ++refill: ++ spin_lock_irq(&worker->lock); ++ list_splice_tail_init(&worker->prio_pending, prio_head); ++ list_splice_tail_init(&worker->pending, head); ++ ++ if (!list_empty(prio_head)) ++ cur = prio_head->next; ++ else if (!list_empty(head)) ++ cur = head->next; ++ spin_unlock_irq(&worker->lock); ++ ++ if (!cur) ++ goto out_fail; ++ ++out: ++ work = list_entry(cur, struct btrfs_work, list); ++ ++out_fail: ++ return work; ++} ++ + /* + * main loop for servicing work items + */ + static int worker_loop(void *arg) + { + struct btrfs_worker_thread *worker = arg; +- struct list_head *cur; ++ struct list_head head; ++ struct list_head prio_head; + struct btrfs_work *work; ++ ++ INIT_LIST_HEAD(&head); ++ INIT_LIST_HEAD(&prio_head); ++ + do { +- spin_lock_irq(&worker->lock); +-again_locked: ++again: + while (1) { +- if (!list_empty(&worker->prio_pending)) +- cur = worker->prio_pending.next; +- else if (!list_empty(&worker->pending)) +- cur = worker->pending.next; +- else ++ ++ ++ work = get_next_work(worker, &prio_head, &head); ++ if (!work) + break; + +- work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; +- spin_unlock_irq(&worker->lock); + + work->func(work); + +@@ -175,9 +329,13 @@ again_locked: + */ + run_ordered_completions(worker->workers, work); + +- spin_lock_irq(&worker->lock); +- check_idle_worker(worker); ++ check_pending_worker_creates(worker); ++ + } ++ ++ spin_lock_irq(&worker->lock); ++ check_idle_worker(worker); ++ + if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); +@@ -216,8 +374,10 @@ again_locked: + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending) || +- !list_empty(&worker->prio_pending)) +- goto again_locked; ++ !list_empty(&worker->prio_pending)) { ++ spin_unlock_irq(&worker->lock); ++ goto again; ++ } + + /* + * this makes sure we get a wakeup when someone +@@ -226,8 +386,13 @@ again_locked: + worker->working = 0; + spin_unlock_irq(&worker->lock); + +- if (!kthread_should_stop()) +- schedule(); ++ if (!kthread_should_stop()) { ++ schedule_timeout(HZ * 120); ++ if (!worker->working && ++ try_worker_shutdown(worker)) { ++ return 0; ++ } ++ } + } + __set_current_state(TASK_RUNNING); + } +@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers) + { + struct list_head *cur; + struct btrfs_worker_thread *worker; ++ int can_stop; + ++ spin_lock_irq(&workers->lock); + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); +- kthread_stop(worker->task); +- list_del(&worker->worker_list); +- kfree(worker); ++ ++ atomic_inc(&worker->refs); ++ workers->num_workers -= 1; ++ if (!list_empty(&worker->worker_list)) { ++ list_del_init(&worker->worker_list); ++ put_worker(worker); ++ can_stop = 1; ++ } else ++ can_stop = 0; ++ spin_unlock_irq(&workers->lock); ++ if (can_stop) ++ kthread_stop(worker->task); ++ spin_lock_irq(&workers->lock); ++ put_worker(worker); + } ++ spin_unlock_irq(&workers->lock); + return 0; + } + + /* + * simple 
init on struct btrfs_workers + */ +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_helper) + { + workers->num_workers = 0; ++ workers->num_workers_starting = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); + spin_lock_init(&workers->lock); ++ spin_lock_init(&workers->order_lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; ++ workers->atomic_start_pending = 0; ++ workers->atomic_worker_start = async_helper; + } + + /* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++static int __btrfs_start_workers(struct btrfs_workers *workers, ++ int num_workers) + { + struct btrfs_worker_thread *worker; + int ret = 0; +@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); ++ + atomic_set(&worker->num_pending, 0); ++ atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, +@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + kfree(worker); + goto fail; + } +- + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; ++ workers->num_workers_starting--; ++ WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + } + return 0; +@@ -316,6 +504,14 @@ fail: + return ret; + } + ++int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++{ ++ spin_lock_irq(&workers->lock); ++ workers->num_workers_starting += num_workers; ++ spin_unlock_irq(&workers->lock); ++ return __btrfs_start_workers(workers, num_workers); ++} ++ + /* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. 
This can return null if we aren't yet at the thread +@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + struct list_head *next; +- int enforce_min = workers->num_workers < workers->max_workers; ++ int enforce_min; ++ ++ enforce_min = (workers->num_workers + workers->num_workers_starting) < ++ workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the +@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); +- atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) +@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + unsigned long flags; ++ struct list_head *fallback; + + again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); +- spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { +- spin_lock_irqsave(&workers->lock, flags); +- if (workers->num_workers >= workers->max_workers) { +- struct list_head *fallback = NULL; +- /* +- * we have failed to find any workers, just +- * return the force one +- */ +- if (!list_empty(&workers->worker_list)) +- fallback = workers->worker_list.next; +- if (!list_empty(&workers->idle_list)) +- fallback = workers->idle_list.next; +- BUG_ON(!fallback); +- worker = list_entry(fallback, +- struct btrfs_worker_thread, worker_list); +- spin_unlock_irqrestore(&workers->lock, flags); ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) { ++ goto fallback; ++ } else if (workers->atomic_worker_start) { ++ workers->atomic_start_pending = 1; ++ goto fallback; + } else { ++ workers->num_workers_starting++; + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ +- btrfs_start_workers(workers, 1); ++ __btrfs_start_workers(workers, 1); + goto again; + } + } ++ goto found; ++ ++fallback: ++ fallback = NULL; ++ /* ++ * we have failed to find any workers, just ++ * return the first one we can find. 
++ */ ++ if (!list_empty(&workers->worker_list)) ++ fallback = workers->worker_list.next; ++ if (!list_empty(&workers->idle_list)) ++ fallback = workers->idle_list.next; ++ BUG_ON(!fallback); ++ worker = list_entry(fallback, ++ struct btrfs_worker_thread, worker_list); ++found: ++ /* ++ * this makes sure the worker doesn't exit before it is placed ++ * onto a busy/idle list ++ */ ++ atomic_inc(&worker->num_pending); ++ spin_unlock_irqrestore(&workers->lock, flags); + return worker; + } + +@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work) + spin_lock(&worker->workers->lock); + worker->idle = 0; + list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ &worker->workers->worker_list); + spin_unlock(&worker->workers->lock); + } + if (!worker->working) { +@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work) + worker->working = 1; + } + +- spin_unlock_irqrestore(&worker->lock, flags); + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); + out: + + return 0; +@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + + worker = find_worker(workers); + if (workers->ordered) { +- spin_lock_irqsave(&workers->lock, flags); ++ /* ++ * you're not allowed to do ordered queues from an ++ * interrupt handler ++ */ ++ spin_lock(&workers->order_lock); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + } else { + INIT_LIST_HEAD(&work->order_list); + } +@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); +- atomic_inc(&worker->num_pending); + check_busy_worker(worker); + + /* +@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + wake = 1; + worker->working = 1; + +- spin_unlock_irqrestore(&worker->lock, flags); +- + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); ++ + out: + return 0; + } +diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h +index 1b511c1..5077746 100644 +--- a/fs/btrfs/async-thread.h ++++ b/fs/btrfs/async-thread.h +@@ -64,6 +64,8 @@ struct btrfs_workers { + /* current number of running workers */ + int num_workers; + ++ int num_workers_starting; ++ + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + +@@ -73,6 +75,16 @@ struct btrfs_workers { + /* force completions in the order they were queued */ + int ordered; + ++ /* more workers required, but in an interrupt handler */ ++ int atomic_start_pending; ++ ++ /* ++ * are we allowed to sleep while starting workers or are we required ++ * to start them at a later time? If we can't sleep, this indicates ++ * which queue we need to use to schedule thread creation. ++ */ ++ struct btrfs_workers *atomic_worker_start; ++ + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. 
+@@ -90,6 +102,9 @@ struct btrfs_workers { + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + ++ /* lock for the ordered lists */ ++ spinlock_t order_lock; ++ + /* extra name for this worker, used for current->name */ + char *name; + }; +@@ -97,7 +112,8 @@ struct btrfs_workers { + int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); + int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); + int btrfs_stop_workers(struct btrfs_workers *workers); +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_starter); + int btrfs_requeue_work(struct btrfs_work *work); + void btrfs_set_work_high_prio(struct btrfs_work *work); + #endif +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index ea1ea0a..f6783a4 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -86,6 +86,12 @@ struct btrfs_inode { + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; ++ ++ /* ++ * log transid when this inode was last modified ++ */ ++ u64 last_sub_trans; ++ + /* + * transid that last logged this inode + */ +@@ -128,6 +134,16 @@ struct btrfs_inode { + u64 last_unlink_trans; + + /* ++ * Counters to keep track of the number of extent item's we may use due ++ * to delalloc and such. outstanding_extents is the number of extent ++ * items we think we'll end up using, and reserved_extents is the number ++ * of extent items we've reserved metadata for. ++ */ ++ spinlock_t accounting_lock; ++ int reserved_extents; ++ int outstanding_extents; ++ ++ /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the +@@ -138,6 +154,7 @@ struct btrfs_inode { + * of these. 
+ */ + unsigned ordered_data_close:1; ++ unsigned dummy_inode:1; + + struct inode vfs_inode; + }; +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index 9d8ba4d..a11a320 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || +@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 3fdcc05..ec96f3a 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, + int split; + int num_doubles = 0; + ++ l = path->nodes[0]; ++ slot = path->slots[0]; ++ if (extend && data_size + btrfs_item_size_nr(l, slot) + ++ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) ++ return -EOVERFLOW; ++ + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 837435c..e5dd628 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -114,6 +114,10 @@ struct btrfs_ordered_sum; + */ + #define BTRFS_DEV_ITEMS_OBJECTID 1ULL + ++#define BTRFS_BTREE_INODE_OBJECTID 1 ++ ++#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 ++ + /* + * we can actually store much bigger names, but lets not confuse the rest + * of linux +@@ -670,21 +674,29 @@ struct btrfs_space_info { + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ +- +- /* delalloc accounting */ +- u64 bytes_delalloc; /* number of bytes reserved for allocation, +- this space is not necessarily reserved yet +- by the allocator */ ++ u64 bytes_super; /* total bytes reserved for the super blocks */ ++ u64 bytes_root; /* the number of bytes needed to commit a ++ transaction */ + u64 bytes_may_use; /* number of bytes that may be used for +- delalloc */ ++ delalloc/allocations */ ++ u64 bytes_delalloc; /* number of bytes currently reserved for ++ delayed allocation */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ ++ int force_delalloc; /* make people start doing filemap_flush until ++ we're under a threshold */ + + struct list_head list; + ++ /* for controlling how we free up space for allocations */ ++ wait_queue_head_t allocate_wait; ++ wait_queue_head_t flush_wait; ++ int allocating_chunk; ++ int flushing; ++ + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t 
lock; +@@ -726,6 +738,15 @@ enum btrfs_caching_type { + BTRFS_CACHE_FINISHED = 2, + }; + ++struct btrfs_caching_control { ++ struct list_head list; ++ struct mutex mutex; ++ wait_queue_head_t wait; ++ struct btrfs_block_group_cache *block_group; ++ u64 progress; ++ atomic_t count; ++}; ++ + struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; +@@ -733,6 +754,7 @@ struct btrfs_block_group_cache { + spinlock_t lock; + u64 pinned; + u64 reserved; ++ u64 bytes_super; + u64 flags; + u64 sectorsize; + int extents_thresh; +@@ -742,8 +764,9 @@ struct btrfs_block_group_cache { + int dirty; + + /* cache tracking stuff */ +- wait_queue_head_t caching_q; + int cached; ++ struct btrfs_caching_control *caching_ctl; ++ u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + +@@ -782,13 +805,16 @@ struct btrfs_fs_info { + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; ++ ++ spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + +- struct extent_io_tree pinned_extents; ++ struct extent_io_tree freed_extents[2]; ++ struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; +@@ -822,11 +848,7 @@ struct btrfs_fs_info { + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; +- struct mutex drop_mutex; + struct mutex volume_mutex; +- struct mutex tree_reloc_mutex; +- struct rw_semaphore extent_commit_sem; +- + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make +@@ -835,10 +857,16 @@ struct btrfs_fs_info { + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; ++ struct rw_semaphore extent_commit_sem; ++ ++ struct rw_semaphore subvol_sem; ++ ++ struct srcu_struct subvol_srcu; + + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; ++ struct list_head caching_block_groups; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; +@@ -882,6 +910,7 @@ struct btrfs_fs_info { + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ ++ struct btrfs_workers generic_worker; + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; +@@ -889,6 +918,7 @@ struct btrfs_fs_info { + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; ++ struct btrfs_workers enospc_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. 
It happens +@@ -979,7 +1009,10 @@ struct btrfs_root { + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; ++ unsigned long last_log_commit; + unsigned long log_batch; ++ pid_t log_start_pid; ++ bool log_multiple_pids; + + u64 objectid; + u64 last_trans; +@@ -996,10 +1029,12 @@ struct btrfs_root { + u32 stripesize; + + u32 type; +- u64 highest_inode; +- u64 last_inode_alloc; ++ ++ u64 highest_objectid; + int ref_cows; + int track_dirty; ++ int in_radix; ++ + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; +@@ -1118,6 +1153,7 @@ struct btrfs_root { + #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) + #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) + #define BTRFS_MOUNT_NOSSD (1 << 9) ++#define BTRFS_MOUNT_DISCARD (1 << 10) + + #define BTRFS_MOUNT_TAGGED (1 << 24) + +@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); + int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); + int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin); ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num, int reserved); + int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); + int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, +@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset); + + int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root); + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin); ++ struct btrfs_root *root); + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); + int btrfs_free_block_groups(struct btrfs_fs_info *info); + int btrfs_read_block_groups(struct btrfs_root *root); ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); + int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, +@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); + void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); + void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +-int btrfs_check_metadata_free_space(struct btrfs_root *root); ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); + int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); + void btrfs_free_reserved_data_space(struct btrfs_root *root, +@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); + void btrfs_delalloc_free_space(struct btrfs_root *root, 
struct inode *inode, + u64 bytes); +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info); + /* ctree.c */ + int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct extent_buffer *parent); + /* root-item.c */ + int btrfs_find_root_ref(struct btrfs_root *tree_root, +- struct btrfs_path *path, +- u64 root_id, u64 ref_id); ++ struct btrfs_path *path, ++ u64 root_id, u64 ref_id); + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, ++ const char *name, int name_len); ++int btrfs_del_root_ref(struct btrfs_trans_handle *trans, ++ struct btrfs_root *tree_root, ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); + int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root); + int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); + /* dir-item.c */ +@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len); + struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + + /* inode-map.c */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, +@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len); + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, +@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); + int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint); + int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); +@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait); + void btrfs_dirty_inode(struct inode *inode); + struct inode *btrfs_alloc_inode(struct super_block *sb); + void 
btrfs_destroy_inode(struct inode *inode); ++void btrfs_drop_inode(struct inode *inode); + int btrfs_init_cachep(void); + void btrfs_destroy_cachep(void); + long btrfs_ioctl_trans_end(struct file *file); +@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); + int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); + void btrfs_orphan_cleanup(struct btrfs_root *root); + int btrfs_cont_expand(struct inode *inode, loff_t size); ++int btrfs_invalidate_inodes(struct btrfs_root *root); ++extern const struct dentry_operations btrfs_dentry_operations; + + /* ioctl.c */ + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +@@ -2290,7 +2348,7 @@ extern struct file_operations btrfs_file_operations; + int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_block); ++ u64 inline_limit, u64 *hint_block, int drop_cache); + int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); + int btrfs_sync_fs(struct super_block *sb, int wait); + + /* acl.c */ +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + int btrfs_check_acl(struct inode *inode, int mask); + #else + #define btrfs_check_acl NULL +diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c +index 1d70236..f3a6075 100644 +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + return btrfs_match_dir_item_name(root, path, name, name_len); + } + ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u32 nritems; ++ int ret; ++ ++ key.objectid = dirid; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = 0; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ ++ while (1) { ++ if (path->slots[0] >= nritems) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ if (ret > 0) ++ break; ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ continue; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) ++ break; ++ ++ di = btrfs_match_dir_item_name(root, path, name, name_len); ++ if (di) ++ return di; ++ ++ path->slots[0]++; ++ } ++ return NULL; ++} ++ + struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index e83be2e..d4132aa 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -41,6 +41,7 @@ + + static struct extent_io_ops btree_extent_io_ops; + static void end_workqueue_fn(struct btrfs_work *work); ++static void free_fs_root(struct btrfs_root *root); + + static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); + +@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode, + struct extent_map *em; + int ret; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, 
start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + goto out; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { +@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; +@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +@@ -828,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf) + int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) + { + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, +- buf->start, buf->start + buf->len - 1); ++ buf->start >> PAGE_CACHE_SHIFT, ++ (buf->start + buf->len - 1) >> ++ PAGE_CACHE_SHIFT); + } + + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, +@@ -895,8 +898,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; +- root->highest_inode = 0; +- root->last_inode_alloc = 0; ++ root->highest_objectid = 0; + root->name = NULL; + root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; +@@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; ++ root->last_log_commit = 0; + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + +@@ -952,14 +955,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); ++ if (ret > 0) ++ return -ENOENT; + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); +- root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); ++ root->commit_root = btrfs_root_node(root); + return 0; + } + +@@ -1085,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; ++ root->last_log_commit = 0; + return 0; + } + +@@ -1095,7 +1101,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; +- u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; +@@ -1110,7 +1115,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + kfree(root); + return ERR_PTR(ret); + } +- goto insert; ++ goto out; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, +@@ -1120,39 +1125,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); +- if (ret != 0) { +- if (ret > 0) +- ret = -ENOENT; +- goto out; ++ if (ret == 0) { 
++ l = path->nodes[0]; ++ read_extent_buffer(l, &root->root_item, ++ btrfs_item_ptr_offset(l, path->slots[0]), ++ sizeof(root->root_item)); ++ memcpy(&root->root_key, location, sizeof(*location)); + } +- l = path->nodes[0]; +- read_extent_buffer(l, &root->root_item, +- btrfs_item_ptr_offset(l, path->slots[0]), +- sizeof(root->root_item)); +- memcpy(&root->root_key, location, sizeof(*location)); +- ret = 0; +-out: +- btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { +- kfree(root); ++ if (ret > 0) ++ ret = -ENOENT; + return ERR_PTR(ret); + } ++ + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); +-insert: +- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { ++out: ++ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + root->ref_cows = 1; +- ret = btrfs_find_highest_inode(root, &highest_inode); +- if (ret == 0) { +- root->highest_inode = highest_inode; +- root->last_inode_alloc = highest_inode; +- } +- } ++ + return root; + } + +@@ -1187,39 +1183,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; +- ++again: ++ spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); + if (root) + return root; + ++ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); ++ if (ret == 0) ++ ret = -ENOENT; ++ if (ret < 0) ++ return ERR_PTR(ret); ++ + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + ++ WARN_ON(btrfs_root_refs(&root->root_item) == 0); + set_anon_super(&root->anon_super, NULL); + ++ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); ++ if (ret) ++ goto fail; ++ ++ spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); ++ if (ret == 0) ++ root->in_radix = 1; ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ radix_tree_preload_end(); + if (ret) { +- free_extent_buffer(root->node); +- kfree(root); +- return ERR_PTR(ret); ++ if (ret == -EEXIST) { ++ free_fs_root(root); ++ goto again; ++ } ++ goto fail; + } +- if (!(fs_info->sb->s_flags & MS_RDONLY)) { +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root->root_key.objectid); +- BUG_ON(ret); ++ ++ ret = btrfs_find_dead_roots(fs_info->tree_root, ++ root->root_key.objectid); ++ WARN_ON(ret); ++ ++ if (!(fs_info->sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(root); +- } ++ + return root; ++fail: ++ free_fs_root(root); ++ return ERR_PTR(ret); + } + + struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) + { ++ return btrfs_read_fs_root_no_name(fs_info, location); ++#if 0 + struct btrfs_root *root; + int ret; + +@@ -1236,7 +1259,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#if 0 ++ + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); +@@ -1244,9 +1267,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#endif + 
root->in_sysfs = 1; + return root; ++#endif + } + + static int btrfs_congested_fn(void *congested_data, int bdi_bits) +@@ -1325,9 +1348,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; +@@ -1359,8 +1382,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) + + err = bdi_register(bdi, NULL, "btrfs-%d", + atomic_inc_return(&btrfs_bdi_num)); +- if (err) ++ if (err) { ++ bdi_destroy(bdi); + return err; ++ } + + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->unplug_io_fn = btrfs_unplug_io_fn; +@@ -1450,9 +1475,12 @@ static int cleaner_kthread(void *arg) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(root); +- mutex_unlock(&root->fs_info->cleaner_mutex); ++ ++ if (!(root->fs_info->sb->s_flags & MS_RDONLY) && ++ mutex_trylock(&root->fs_info->cleaner_mutex)) { ++ btrfs_clean_old_snapshots(root); ++ mutex_unlock(&root->fs_info->cleaner_mutex); ++ } + + if (freezing(current)) { + refrigerator(); +@@ -1557,15 +1585,36 @@ struct btrfs_root *open_ctree(struct super_block *sb, + err = -ENOMEM; + goto fail; + } +- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); ++ ++ ret = init_srcu_struct(&fs_info->subvol_srcu); ++ if (ret) { ++ err = ret; ++ goto fail; ++ } ++ ++ ret = setup_bdi(fs_info, &fs_info->bdi); ++ if (ret) { ++ err = ret; ++ goto fail_srcu; ++ } ++ ++ fs_info->btree_inode = new_inode(sb); ++ if (!fs_info->btree_inode) { ++ err = -ENOMEM; ++ goto fail_bdi; ++ } ++ ++ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); ++ INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); ++ spin_lock_init(&fs_info->fs_roots_radix_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; +@@ -1584,12 +1633,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; +- if (setup_bdi(fs_info, &fs_info->bdi)) +- goto fail_bdi; +- fs_info->btree_inode = new_inode(sb); +- fs_info->btree_inode->i_ino = 1; +- fs_info->btree_inode->i_nlink = 1; +- fs_info->metadata_ratio = 8; ++ fs_info->metadata_ratio = 0; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); +@@ -1600,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + ++ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; ++ fs_info->btree_inode->i_nlink = 1; + /* + * we set the i_size on the btree inode to the max possible int. 
+ * the real end of the address space is determined by all of +@@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + ++ BTRFS_I(fs_info->btree_inode)->root = tree_root; ++ memset(&BTRFS_I(fs_info->btree_inode)->location, 0, ++ sizeof(struct btrfs_key)); ++ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; ++ insert_inode_hash(fs_info->btree_inode); ++ + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + +- extent_io_tree_init(&fs_info->pinned_extents, ++ extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping, GFP_NOFS); ++ extent_io_tree_init(&fs_info->freed_extents[1], ++ fs_info->btree_inode->i_mapping, GFP_NOFS); ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + +- BTRFS_I(fs_info->btree_inode)->root = tree_root; +- memset(&BTRFS_I(fs_info->btree_inode)->location, 0, +- sizeof(struct btrfs_key)); +- insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); +- mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); +- mutex_init(&fs_info->tree_reloc_mutex); + init_rwsem(&fs_info->extent_commit_sem); ++ init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); +@@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb, + goto fail_iput; + } + +- /* +- * we need to start all the end_io workers up front because the +- * queue work function gets called at interrupt time, and so it +- * cannot dynamically grow. 
+- */ ++ btrfs_init_workers(&fs_info->generic_worker, ++ "genwork", 1, NULL); ++ + btrfs_init_workers(&fs_info->workers, "worker", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, +- fs_info->thread_pool_size)); ++ fs_info->thread_pool_size), ++ &fs_info->generic_worker); ++ btrfs_init_workers(&fs_info->enospc_workers, "enospc", ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the +@@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + +- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); ++ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_workers, "endio", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_write_workers, +- "endio-meta-write", fs_info->thread_pool_size); ++ "endio-meta-write", fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very +@@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + +- fs_info->endio_write_workers.idle_thresh = 64; +- fs_info->endio_meta_write_workers.idle_thresh = 64; ++ fs_info->endio_write_workers.idle_thresh = 2; ++ fs_info->endio_meta_write_workers.idle_thresh = 2; + + btrfs_start_workers(&fs_info->workers, 1); ++ btrfs_start_workers(&fs_info->generic_worker, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); +- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_write_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_write_workers, +- fs_info->thread_pool_size); ++ btrfs_start_workers(&fs_info->endio_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); ++ btrfs_start_workers(&fs_info->endio_write_workers, 1); ++ btrfs_start_workers(&fs_info->enospc_workers, 1); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, +@@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, + } + } + ++ ret = btrfs_find_orphan_roots(tree_root); ++ BUG_ON(ret); ++ + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_recover_relocation(tree_root); + BUG_ON(ret); +@@ -1959,6 +2020,7 @@ fail_chunk_root: + free_extent_buffer(chunk_root->node); + 
free_extent_buffer(chunk_root->commit_root); + fail_sb_buffer: ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -1967,6 +2029,7 @@ fail_sb_buffer: + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ btrfs_stop_workers(&fs_info->enospc_workers); + fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +@@ -1975,6 +2038,8 @@ fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + fail_bdi: + bdi_destroy(&fs_info->bdi); ++fail_srcu: ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + fail: + kfree(extent_root); + kfree(tree_root); +@@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans, + + int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) + { +- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); ++ spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ ++ if (btrfs_root_refs(&root->root_item) == 0) ++ synchronize_srcu(&fs_info->subvol_srcu); ++ ++ free_fs_root(root); ++ return 0; ++} ++ ++static void free_fs_root(struct btrfs_root *root) ++{ ++ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } +- if (root->node) +- free_extent_buffer(root->node); +- if (root->commit_root) +- free_extent_buffer(root->commit_root); ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); +- return 0; + } + + static int del_fs_roots(struct btrfs_fs_info *fs_info) +@@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) + struct btrfs_root *gang[8]; + int i; + ++ while (!list_empty(&fs_info->dead_roots)) { ++ gang[0] = list_entry(fs_info->dead_roots.next, ++ struct btrfs_root, root_list); ++ list_del(&gang[0]->root_list); ++ ++ if (gang[0]->in_radix) { ++ btrfs_free_fs_root(fs_info, gang[0]); ++ } else { ++ free_extent_buffer(gang[0]->node); ++ free_extent_buffer(gang[0]->commit_root); ++ kfree(gang[0]); ++ } ++ } ++ + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, +@@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) + root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root_objectid); +- BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + } + root_objectid++; +@@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root) + free_extent_buffer(root->fs_info->csum_root->commit_root); + + btrfs_free_block_groups(root->fs_info); +- btrfs_free_pinned_extents(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root) + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ 
btrfs_stop_workers(&fs_info->enospc_workers); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); +diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c +index 9596b40..ba5c3fd 100644 +--- a/fs/btrfs/export.c ++++ b/fs/btrfs/export.c +@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + +- fid->objectid = BTRFS_I(inode)->location.objectid; ++ fid->objectid = inode->i_ino; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + +@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + } + + static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, +- u64 root_objectid, u32 generation) ++ u64 root_objectid, u32 generation, ++ int check_generation) + { ++ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_root *root; ++ struct dentry *dentry; + struct inode *inode; + struct btrfs_key key; ++ int index; ++ int err = 0; ++ ++ if (objectid < BTRFS_FIRST_FREE_OBJECTID) ++ return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + +- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ index = srcu_read_lock(&fs_info->subvol_srcu); ++ ++ root = btrfs_read_fs_root_no_name(fs_info, &key); ++ if (IS_ERR(root)) { ++ err = PTR_ERR(root); ++ goto fail; ++ } ++ ++ if (btrfs_root_refs(&root->root_item) == 0) { ++ err = -ENOENT; ++ goto fail; ++ } + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root); +- if (IS_ERR(inode)) +- return (void *)inode; ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto fail; ++ } ++ ++ srcu_read_unlock(&fs_info->subvol_srcu, index); + +- if (generation != inode->i_generation) { ++ if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + +- return d_obtain_alias(inode); ++ dentry = d_obtain_alias(inode); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ srcu_read_unlock(&fs_info->subvol_srcu, index); ++ return ERR_PTR(err); + } + + static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, +@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + objectid = fid->parent_objectid; + generation = fid->parent_gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, +@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + root_objectid = fid->root_objectid; + generation = fid->gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_get_parent(struct dentry *child) + { + struct inode *dir = child->d_inode; ++ static struct dentry *dentry; + struct btrfs_root *root = BTRFS_I(dir)->root; +- struct btrfs_key key; + struct btrfs_path *path; + struct 
extent_buffer *leaf; +- int slot; +- u64 objectid; ++ struct btrfs_root_ref *ref; ++ struct btrfs_key key; ++ struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + +- key.objectid = dir->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); +- key.offset = (u64)-1; ++ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_BACKREF_KEY; ++ key.offset = (u64)-1; ++ root = root->fs_info->tree_root; ++ } else { ++ key.objectid = dir->i_ino; ++ key.type = BTRFS_INODE_REF_KEY; ++ key.offset = (u64)-1; ++ } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) { +- /* Error */ +- btrfs_free_path(path); +- return ERR_PTR(ret); ++ if (ret < 0) ++ goto fail; ++ ++ BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = -ENOENT; ++ goto fail; + } ++ ++ path->slots[0]--; + leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (ret) { +- /* btrfs_search_slot() returns the slot where we'd want to +- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. +- The _real_ backref, telling us what the parent inode +- _actually_ is, will be in the slot _before_ the one +- that btrfs_search_slot() returns. */ +- if (!slot) { +- /* Unless there is _no_ key in the tree before... */ +- btrfs_free_path(path); +- return ERR_PTR(-EIO); +- } +- slot--; ++ ++ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); ++ if (found_key.objectid != key.objectid || found_key.type != key.type) { ++ ret = -ENOENT; ++ goto fail; + } + +- btrfs_item_key_to_cpu(leaf, &key, slot); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ key.objectid = btrfs_root_ref_dirid(leaf, ref); ++ } else { ++ key.objectid = found_key.offset; ++ } + btrfs_free_path(path); + +- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) +- return ERR_PTR(-EINVAL); +- +- objectid = key.offset; +- +- /* If we are already at the root of a subvol, return the real root */ +- if (objectid == dir->i_ino) +- return dget(dir->i_sb->s_root); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ return btrfs_get_dentry(root->fs_info->sb, key.objectid, ++ found_key.offset, 0, 0); ++ } + +- /* Build a new key for the inode item */ +- key.objectid = objectid; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; +- +- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ btrfs_free_path(path); ++ return ERR_PTR(ret); + } + + const struct export_operations btrfs_export_ops = { +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 72a2b9c..c56f916 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -32,12 +32,12 @@ + #include "locking.h" + #include "free-space-cache.h" + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve); + static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct 
btrfs_trans_handle *trans, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); +- + static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); ++static int pin_down_bytes(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, ++ struct extent_buffer **must_clean); ++static int find_next_key(struct btrfs_path *path, int level, ++ struct btrfs_key *key); ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups); + + static noinline int + block_group_cache_done(struct btrfs_block_group_cache *cache) +@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + return ret; + } + +-/* +- * We always set EXTENT_LOCKED for the super mirror extents so we don't +- * overwrite them, so those bits need to be unset. Also, if we are unmounting +- * with pinned extents still sitting there because we had a block group caching, +- * we need to clear those now, since we are done. +- */ +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info) ++static int add_excluded_extent(struct btrfs_root *root, ++ u64 start, u64 num_bytes) + { +- u64 start, end, last = 0; +- int ret; ++ u64 end = start + num_bytes - 1; ++ set_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ set_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ return 0; ++} + +- while (1) { +- ret = find_first_extent_bit(&info->pinned_extents, last, +- &start, &end, +- EXTENT_LOCKED|EXTENT_DIRTY); +- if (ret) +- break; ++static void free_excluded_extents(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) ++{ ++ u64 start, end; + +- clear_extent_bits(&info->pinned_extents, start, end, +- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); +- last = end+1; +- } ++ start = cache->key.objectid; ++ end = start + cache->key.offset - 1; ++ ++ clear_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ clear_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); + } + +-static int remove_sb_from_cache(struct btrfs_root *root, +- struct btrfs_block_group_cache *cache) ++static int exclude_super_stripes(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) + { +- struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; +@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); ++ + while (nr--) { +- try_lock_extent(&fs_info->pinned_extents, +- logical[nr], +- logical[nr] + stripe_len - 1, GFP_NOFS); ++ cache->bytes_super += stripe_len; ++ ret = add_excluded_extent(root, logical[nr], ++ stripe_len); ++ BUG_ON(ret); + } ++ + kfree(logical); + } +- + return 0; + } + ++static struct btrfs_caching_control * ++get_caching_control(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *ctl; ++ ++ spin_lock(&cache->lock); ++ if (cache->cached != BTRFS_CACHE_STARTED) { ++ spin_unlock(&cache->lock); ++ return NULL; ++ } ++ ++ ctl = cache->caching_ctl; ++ atomic_inc(&ctl->count); ++ spin_unlock(&cache->lock); ++ return ctl; ++} ++ ++static void put_caching_control(struct btrfs_caching_control *ctl) ++{ ++ if (atomic_dec_and_test(&ctl->count)) ++ 
kfree(ctl); ++} ++ + /* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet +@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + int ret; + + while (start < end) { +- ret = find_first_extent_bit(&info->pinned_extents, start, ++ ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, +- EXTENT_DIRTY|EXTENT_LOCKED); ++ EXTENT_DIRTY | EXTENT_UPTODATE); + if (ret) + break; + +@@ -249,22 +283,27 @@ static int caching_kthread(void *data) + { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; +- u64 last = 0; ++ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; ++ struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; +- int ret = 0; +- struct btrfs_key key; + struct extent_buffer *leaf; +- int slot; ++ struct btrfs_key key; + u64 total_found = 0; +- +- BUG_ON(!fs_info); ++ u64 last = 0; ++ u32 nritems; ++ int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +- atomic_inc(&block_group->space_info->caching_threads); ++ exclude_super_stripes(extent_root, block_group); ++ spin_lock(&block_group->space_info->lock); ++ block_group->space_info->bytes_super += block_group->bytes_super; ++ spin_unlock(&block_group->space_info->lock); ++ + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); ++ + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent +@@ -277,74 +316,64 @@ static int caching_kthread(void *data) + + key.objectid = last; + key.offset = 0; +- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); ++ key.type = BTRFS_EXTENT_ITEM_KEY; + again: ++ mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + +- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); ++ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ + while (1) { + smp_mb(); +- if (block_group->fs_info->closing > 1) { ++ if (fs_info->closing > 1) { + last = (u64)-1; + break; + } + +- leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(leaf)) { +- ret = btrfs_next_leaf(fs_info->extent_root, path); +- if (ret < 0) +- goto err; +- else if (ret) ++ if (path->slots[0] < nritems) { ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ } else { ++ ret = find_next_key(path, 0, &key); ++ if (ret) + break; + +- if (need_resched() || +- btrfs_transaction_in_commit(fs_info)) { +- leaf = path->nodes[0]; +- +- /* this shouldn't happen, but if the +- * leaf is empty just move on. +- */ +- if (btrfs_header_nritems(leaf) == 0) +- break; +- /* +- * we need to copy the key out so that +- * we are sure the next search advances +- * us forward in the btree. 
+- */ +- btrfs_item_key_to_cpu(leaf, &key, 0); +- btrfs_release_path(fs_info->extent_root, path); +- up_read(&fs_info->extent_commit_sem); ++ caching_ctl->progress = last; ++ btrfs_release_path(extent_root, path); ++ up_read(&fs_info->extent_commit_sem); ++ mutex_unlock(&caching_ctl->mutex); ++ if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); +- goto again; +- } ++ else ++ cond_resched(); ++ goto again; ++ } + ++ if (key.objectid < block_group->key.objectid) { ++ path->slots[0]++; + continue; + } +- btrfs_item_key_to_cpu(leaf, &key, slot); +- if (key.objectid < block_group->key.objectid) +- goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + +- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { ++ if (key.type == BTRFS_EXTENT_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + last = key.objectid + key.offset; +- } + +- if (total_found > (1024 * 1024 * 2)) { +- total_found = 0; +- wake_up(&block_group->caching_q); ++ if (total_found > (1024 * 1024 * 2)) { ++ total_found = 0; ++ wake_up(&caching_ctl->wait); ++ } + } +-next: + path->slots[0]++; + } + ret = 0; +@@ -352,33 +381,65 @@ next: + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); ++ caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); ++ block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + + err: + btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); +- atomic_dec(&block_group->space_info->caching_threads); +- wake_up(&block_group->caching_q); + ++ free_excluded_extents(extent_root, block_group); ++ ++ mutex_unlock(&caching_ctl->mutex); ++ wake_up(&caching_ctl->wait); ++ ++ put_caching_control(caching_ctl); ++ atomic_dec(&block_group->space_info->caching_threads); + return 0; + } + + static int cache_block_group(struct btrfs_block_group_cache *cache) + { ++ struct btrfs_fs_info *fs_info = cache->fs_info; ++ struct btrfs_caching_control *caching_ctl; + struct task_struct *tsk; + int ret = 0; + ++ smp_mb(); ++ if (cache->cached != BTRFS_CACHE_NO) ++ return 0; ++ ++ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); ++ BUG_ON(!caching_ctl); ++ ++ INIT_LIST_HEAD(&caching_ctl->list); ++ mutex_init(&caching_ctl->mutex); ++ init_waitqueue_head(&caching_ctl->wait); ++ caching_ctl->block_group = cache; ++ caching_ctl->progress = cache->key.objectid; ++ /* one for caching kthread, one for caching block group list */ ++ atomic_set(&caching_ctl->count, 2); ++ + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); +- return ret; ++ kfree(caching_ctl); ++ return 0; + } ++ cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + ++ down_write(&fs_info->extent_commit_sem); ++ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); ++ up_write(&fs_info->extent_commit_sem); ++ ++ atomic_inc(&cache->space_info->caching_threads); ++ + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { +@@ -1507,22 +1568,22 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, + return ret; + } + +-#ifdef BIO_RW_DISCARD + static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) + { + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + } +-#endif + + static int btrfs_discard_extent(struct 
btrfs_root *root, u64 bytenr, + u64 num_bytes) + { +-#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + ++ if (!btrfs_test_opt(root, DISCARD)) ++ return 0; ++ + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); +@@ -1542,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + } + + return ret; +-#else +- return 0; +-#endif + } + + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, +@@ -1656,7 +1714,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, +@@ -1782,7 +1839,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, +@@ -1817,16 +1873,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { ++ int mark_free = 0; ++ struct extent_buffer *must_clean = NULL; ++ ++ ret = pin_down_bytes(trans, root, NULL, ++ node->bytenr, node->num_bytes, ++ head->is_data, 1, &must_clean); ++ if (ret > 0) ++ mark_free = 1; ++ ++ if (must_clean) { ++ clean_tree_block(NULL, root, must_clean); ++ btrfs_tree_unlock(must_clean); ++ free_extent_buffer(must_clean); ++ } + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } +- btrfs_update_pinned_extents(root, node->bytenr, +- node->num_bytes, 1); +- update_reserved_extents(root, node->bytenr, +- node->num_bytes, 0); ++ if (mark_free) { ++ ret = btrfs_free_reserved_extent(root, ++ node->bytenr, ++ node->num_bytes); ++ BUG_ON(ret); ++ } + } + mutex_unlock(&head->mutex); + return 0; +@@ -2691,60 +2763,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) + alloc_target); + } + ++static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) ++{ ++ u64 num_bytes; ++ int level; ++ ++ level = BTRFS_MAX_LEVEL - 2; ++ /* ++ * NOTE: these calculations are absolutely the worst possible case. ++ * This assumes that _every_ item we insert will require a new leaf, and ++ * that the tree has grown to its maximum level size. ++ */ ++ ++ /* ++ * for every item we insert we could insert both an extent item and a ++ * extent ref item. Then for ever item we insert, we will need to cow ++ * both the original leaf, plus the leaf to the left and right of it. ++ * ++ * Unless we are talking about the extent root, then we just want the ++ * number of items * 2, since we just need the extent item plus its ref. ++ */ ++ if (root == root->fs_info->extent_root) ++ num_bytes = num_items * 2; ++ else ++ num_bytes = (num_items + (2 * num_items)) * 3; ++ ++ /* ++ * num_bytes is total number of leaves we could need times the leaf ++ * size, and then for every leaf we could end up cow'ing 2 nodes per ++ * level, down to the leaf level. 
++ */ ++ num_bytes = (num_bytes * root->leafsize) + ++ (num_bytes * (level * 2)) * root->nodesize; ++ ++ return num_bytes; ++} ++ + /* +- * for now this just makes sure we have at least 5% of our metadata space free +- * for use. ++ * Unreserve metadata space for delalloc. If we have less reserved credits than ++ * we have extents, this function does nothing. + */ +-int btrfs_check_metadata_free_space(struct btrfs_root *root) ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) + { + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; +- u64 alloc_target, thresh; +- int committed = 0, ret; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + +-again: ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++ + spin_lock(&meta_sinfo->lock); +- if (!meta_sinfo->full) +- thresh = meta_sinfo->total_bytes * 80; +- else +- thresh = meta_sinfo->total_bytes * 95; ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ if (BTRFS_I(inode)->reserved_extents <= ++ BTRFS_I(inode)->outstanding_extents) { ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ spin_unlock(&meta_sinfo->lock); ++ return 0; ++ } ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ BTRFS_I(inode)->reserved_extents--; ++ BUG_ON(BTRFS_I(inode)->reserved_extents < 0); ++ ++ if (meta_sinfo->bytes_delalloc < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_delalloc = 0; ++ } else { ++ meta_sinfo->bytes_delalloc -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) ++{ ++ u64 thresh; ++ ++ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use; + ++ thresh = meta_sinfo->total_bytes - thresh; ++ thresh *= 80; + do_div(thresh, 100); ++ if (thresh <= meta_sinfo->bytes_delalloc) ++ meta_sinfo->force_delalloc = 1; ++ else ++ meta_sinfo->force_delalloc = 0; ++} + +- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + +- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { +- struct btrfs_trans_handle *trans; +- if (!meta_sinfo->full) { +- meta_sinfo->force_alloc = 1; +- spin_unlock(&meta_sinfo->lock); ++struct async_flush { ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; ++ struct btrfs_work work; ++}; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) +- return -ENOMEM; ++static noinline void flush_delalloc_async(struct btrfs_work *work) ++{ ++ struct async_flush *async; ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; + +- ret = do_chunk_alloc(trans, root->fs_info->extent_root, +- 2 * 1024 * 1024, alloc_target, 0); +- btrfs_end_transaction(trans, root); ++ async = container_of(work, struct async_flush, work); ++ root = async->root; ++ info = async->info; ++ ++ btrfs_start_delalloc_inodes(root); ++ wake_up(&info->flush_wait); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++ ++ kfree(async); ++} ++ ++static void wait_on_flush(struct btrfs_space_info *info) ++{ ++ DEFINE_WAIT(wait); ++ u64 used; ++ ++ while (1) { ++ 
prepare_to_wait(&info->flush_wait, &wait, ++ TASK_UNINTERRUPTIBLE); ++ spin_lock(&info->lock); ++ if (!info->flushing) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ ++ used = info->bytes_used + info->bytes_reserved + ++ info->bytes_pinned + info->bytes_readonly + ++ info->bytes_super + info->bytes_root + ++ info->bytes_may_use + info->bytes_delalloc; ++ if (used < info->total_bytes) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ spin_unlock(&info->lock); ++ schedule(); ++ } ++ finish_wait(&info->flush_wait, &wait); ++} ++ ++static void flush_delalloc(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct async_flush *async; ++ bool wait = false; ++ ++ spin_lock(&info->lock); ++ ++ if (!info->flushing) { ++ info->flushing = 1; ++ init_waitqueue_head(&info->flush_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_on_flush(info); ++ return; ++ } ++ ++ async = kzalloc(sizeof(*async), GFP_NOFS); ++ if (!async) ++ goto flush; ++ ++ async->root = root; ++ async->info = info; ++ async->work.func = flush_delalloc_async; ++ ++ btrfs_queue_worker(&root->fs_info->enospc_workers, ++ &async->work); ++ wait_on_flush(info); ++ return; ++ ++flush: ++ btrfs_start_delalloc_inodes(root); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++} ++ ++static int maybe_allocate_chunk(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct btrfs_super_block *disk_super = &root->fs_info->super_copy; ++ struct btrfs_trans_handle *trans; ++ bool wait = false; ++ int ret = 0; ++ u64 min_metadata; ++ u64 free_space; ++ ++ free_space = btrfs_super_total_bytes(disk_super); ++ /* ++ * we allow the metadata to grow to a max of either 5gb or 5% of the ++ * space in the volume. ++ */ ++ min_metadata = min((u64)5 * 1024 * 1024 * 1024, ++ div64_u64(free_space * 5, 100)); ++ if (info->total_bytes >= min_metadata) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (info->full) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (!info->allocating_chunk) { ++ info->force_alloc = 1; ++ info->allocating_chunk = 1; ++ init_waitqueue_head(&info->allocate_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_event(info->allocate_wait, ++ !info->allocating_chunk); ++ return 1; ++ } ++ ++ trans = btrfs_start_transaction(root, 1); ++ if (!trans) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = do_chunk_alloc(trans, root->fs_info->extent_root, ++ 4096 + 2 * 1024 * 1024, ++ info->flags, 0); ++ btrfs_end_transaction(trans, root); ++ if (ret) ++ goto out; ++out: ++ spin_lock(&info->lock); ++ info->allocating_chunk = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->allocate_wait); ++ ++ if (ret) ++ return 0; ++ return 1; ++} ++ ++/* ++ * Reserve metadata space for delalloc. 
++ */ ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int flushed = 0; ++ int force_delalloc; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ force_delalloc = meta_sinfo->force_delalloc; ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!flushed) ++ meta_sinfo->bytes_delalloc += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ flushed++; ++ ++ if (flushed == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ flushed++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (flushed == 2) { ++ filemap_flush(inode->i_mapping); ++ goto again; ++ } else if (flushed == 3) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_delalloc -= num_bytes; + spin_unlock(&meta_sinfo->lock); ++ printk(KERN_ERR "enospc, has %d, reserved %d\n", ++ BTRFS_I(inode)->outstanding_extents, ++ BTRFS_I(inode)->reserved_extents); ++ dump_space_info(meta_sinfo, 0, 0); ++ return -ENOSPC; ++ } + +- if (!committed) { +- committed = 1; +- trans = btrfs_join_transaction(root, 1); +- if (!trans) +- return -ENOMEM; +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- return ret; ++ BTRFS_I(inode)->reserved_extents++; ++ check_force_delalloc(meta_sinfo); ++ spin_unlock(&meta_sinfo->lock); ++ ++ if (!flushed && force_delalloc) ++ filemap_flush(inode->i_mapping); ++ ++ return 0; ++} ++ ++/* ++ * unreserve num_items number of items worth of metadata space. This needs to ++ * be paired with btrfs_reserve_metadata_space. ++ * ++ * NOTE: if you have the option, run this _AFTER_ you do a ++ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref ++ * oprations which will result in more used metadata, so we want to make sure we ++ * can do that without issue. ++ */ ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++ ++ spin_lock(&meta_sinfo->lock); ++ if (meta_sinfo->bytes_may_use < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_may_use = 0; ++ } else { ++ meta_sinfo->bytes_may_use -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++/* ++ * Reserve some metadata space for use. We'll calculate the worste case number ++ * of bytes that would be needed to modify num_items number of items. If we ++ * have space, fantastic, if not, you get -ENOSPC. 
Please call ++ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of ++ * items you reserved, since whatever metadata you needed should have already ++ * been allocated. ++ * ++ * This will commit the transaction to make more space if we don't have enough ++ * metadata space. THe only time we don't do this is if we're reserving space ++ * inside of a transaction, then we will just return -ENOSPC and it is the ++ * callers responsibility to handle it properly. ++ */ ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int retries = 0; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!retries) ++ meta_sinfo->bytes_may_use += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ retries++; ++ if (retries == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ retries++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (retries == 2) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_may_use -= num_bytes; ++ spin_unlock(&meta_sinfo->lock); ++ ++ dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } ++ ++ check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + return 0; +@@ -2764,13 +3224,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; ++ if (!data_sinfo) ++ goto alloc; ++ + again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - +- data_sinfo->bytes_may_use < bytes) { ++ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { + struct btrfs_trans_handle *trans; + + /* +@@ -2782,7 +3245,7 @@ again: + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); +- ++alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) +@@ -2794,12 +3257,17 @@ again: + btrfs_end_transaction(trans, root); + if (ret) + return ret; ++ ++ if (!data_sinfo) { ++ btrfs_set_inode_space_info(root, inode); ++ data_sinfo = BTRFS_I(inode)->space_info; ++ } + goto again; + } + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +- if (!committed) { ++ if (!committed && !root->fs_info->open_ioctl_trans) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) +@@ -2827,7 +3295,7 @@ again: + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + +- return btrfs_check_metadata_free_space(root); ++ return 0; + } + + /* +@@ -2926,17 +3394,15 @@ static 
int do_chunk_alloc(struct btrfs_trans_handle *trans, + BUG_ON(!space_info); + + spin_lock(&space_info->lock); +- if (space_info->force_alloc) { ++ if (space_info->force_alloc) + force = 1; +- space_info->force_alloc = 0; +- } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = space_info->total_bytes - space_info->bytes_readonly; +- thresh = div_factor(thresh, 6); ++ thresh = div_factor(thresh, 8); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { +@@ -2950,7 +3416,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ +- if (flags & BTRFS_BLOCK_GROUP_DATA) { ++ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) +@@ -2958,8 +3424,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + } + + ret = btrfs_alloc_chunk(trans, extent_root, flags); ++ spin_lock(&space_info->lock); + if (ret) + space_info->full = 1; ++ space_info->force_alloc = 0; ++ spin_unlock(&space_info->lock); + out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +@@ -3008,10 +3477,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; ++ btrfs_set_block_group_used(&cache->item, old_val); ++ cache->reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; +- btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { +@@ -3056,127 +3527,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) + return bytenr; + } + +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin) ++/* ++ * this function must be called within transaction ++ */ ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num_bytes, int reserved) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache; + +- if (pin) +- set_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + num - 1, GFP_NOFS); +- +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); +- if (pin) { +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- cache->pinned += len; +- cache->space_info->bytes_pinned += len; +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- fs_info->total_pinned += len; +- } else { +- int unpin = 0; ++ cache = btrfs_lookup_block_group(fs_info, bytenr); ++ BUG_ON(!cache); + +- /* +- * in order to not race with the block group caching, we +- * only want to unpin the extent if we are cached. If +- * we aren't cached, we want to start async caching this +- * block group so we can free the extent the next time +- * around. 
+- */ +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- unpin = (cache->cached == BTRFS_CACHE_FINISHED); +- if (likely(unpin)) { +- cache->pinned -= len; +- cache->space_info->bytes_pinned -= len; +- fs_info->total_pinned -= len; +- } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned += num_bytes; ++ cache->space_info->bytes_pinned += num_bytes; ++ if (reserved) { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; ++ } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + +- if (likely(unpin)) +- clear_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + len -1, +- GFP_NOFS); +- else +- cache_block_group(cache); ++ btrfs_put_block_group(cache); + +- if (unpin) +- btrfs_add_free_space(cache, bytenr, len); +- } +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; ++ set_extent_dirty(fs_info->pinned_extents, ++ bytenr, bytenr + num_bytes - 1, GFP_NOFS); ++ return 0; ++} ++ ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve) ++{ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ if (reserve) { ++ cache->reserved += num_bytes; ++ cache->space_info->bytes_reserved += num_bytes; ++ } else { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + return 0; + } + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve) ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_caching_control *next; ++ struct btrfs_caching_control *caching_ctl; ++ struct btrfs_block_group_cache *cache; + +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); ++ down_write(&fs_info->extent_commit_sem); + +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- if (reserve) { +- cache->reserved += len; +- cache->space_info->bytes_reserved += len; ++ list_for_each_entry_safe(caching_ctl, next, ++ &fs_info->caching_block_groups, list) { ++ cache = caching_ctl->block_group; ++ if (block_group_cache_done(cache)) { ++ cache->last_byte_to_unpin = (u64)-1; ++ list_del_init(&caching_ctl->list); ++ put_caching_control(caching_ctl); + } else { +- cache->reserved -= len; +- cache->space_info->bytes_reserved -= len; ++ cache->last_byte_to_unpin = caching_ctl->progress; + } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; + } ++ ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ fs_info->pinned_extents = &fs_info->freed_extents[1]; ++ else ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; ++ ++ up_write(&fs_info->extent_commit_sem); + return 0; + } + +-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) ++static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) + { +- u64 last = 0; +- u64 start; +- u64 end; +- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; +- int ret; ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache = 
NULL; ++ u64 len; + +- while (1) { +- ret = find_first_extent_bit(pinned_extents, last, +- &start, &end, EXTENT_DIRTY); +- if (ret) +- break; ++ while (start <= end) { ++ if (!cache || ++ start >= cache->key.objectid + cache->key.offset) { ++ if (cache) ++ btrfs_put_block_group(cache); ++ cache = btrfs_lookup_block_group(fs_info, start); ++ BUG_ON(!cache); ++ } + +- set_extent_dirty(copy, start, end, GFP_NOFS); +- last = end + 1; ++ len = cache->key.objectid + cache->key.offset - start; ++ len = min(len, end + 1 - start); ++ ++ if (start < cache->last_byte_to_unpin) { ++ len = min(len, cache->last_byte_to_unpin - start); ++ btrfs_add_free_space(cache, start, len); ++ } ++ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned -= len; ++ cache->space_info->bytes_pinned -= len; ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); ++ ++ start += len; + } ++ ++ if (cache) ++ btrfs_put_block_group(cache); + return 0; + } + + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin) ++ struct btrfs_root *root) + { ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ unpin = &fs_info->freed_extents[1]; ++ else ++ unpin = &fs_info->freed_extents[0]; ++ + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); +@@ -3185,10 +3665,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + + ret = btrfs_discard_extent(root, start, end + 1 - start); + +- /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); +- ++ unpin_extent_range(root, start, end); + cond_resched(); + } + +@@ -3198,7 +3676,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- u64 bytenr, u64 num_bytes, int is_data, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, + struct extent_buffer **must_clean) + { + int err = 0; +@@ -3207,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + if (is_data) + goto pinit; + ++ /* ++ * discard is sloooow, and so triggering discards on ++ * individual btree blocks isn't a good plan. Just ++ * pin everything in discard mode. 
++ */ ++ if (btrfs_test_opt(root, DISCARD)) ++ goto pinit; ++ + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; +@@ -3230,15 +3717,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + } + free_extent_buffer(buf); + pinit: +- btrfs_set_path_blocking(path); ++ if (path) ++ btrfs_set_path_blocking(path); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); ++ btrfs_pin_extent(root, bytenr, num_bytes, reserved); + + BUG_ON(err < 0); + return 0; + } + +- + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -3412,7 +3899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + } + + ret = pin_down_bytes(trans, root, path, bytenr, +- num_bytes, is_data, &must_clean); ++ num_bytes, is_data, 0, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); +@@ -3543,8 +4030,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); +- update_reserved_extents(root, bytenr, num_bytes, 0); ++ btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, +@@ -3584,19 +4070,33 @@ static noinline int + wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) + { ++ struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + +- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); +- +- if (block_group_cache_done(cache)) { +- finish_wait(&cache->caching_q, &wait); ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) + return 0; +- } +- schedule(); +- finish_wait(&cache->caching_q, &wait); + +- wait_event(cache->caching_q, block_group_cache_done(cache) || ++ wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); ++ ++ put_caching_control(caching_ctl); ++ return 0; ++} ++ ++static noinline int ++wait_block_group_cache_done(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *caching_ctl; ++ DEFINE_WAIT(wait); ++ ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) ++ return 0; ++ ++ wait_event(caching_ctl->wait, block_group_cache_done(cache)); ++ ++ put_caching_control(caching_ctl); + return 0; + } + +@@ -3634,6 +4134,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, + int last_ptr_loop = 0; + int loop = 0; + bool found_uncached_bg = false; ++ bool failed_cluster_refill = false; ++ bool failed_alloc = false; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); +@@ -3731,7 +4233,16 @@ have_block_group: + if (unlikely(block_group->ro)) + goto loop; + +- if (last_ptr) { ++ /* ++ * Ok we want to try and use the cluster allocator, so lets look ++ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will ++ * have tried the cluster allocator plenty of times at this ++ * point and not have found anything, so we are likely way too ++ * fragmented for the clustering stuff to find anything, so lets ++ * just skip it and let the allocator find whatever block it can ++ * find ++ */ ++ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + /* + * the refill lock keeps out other + * people trying to start a new cluster +@@ -3806,9 +4317,11 @@ 
refill_cluster: + spin_unlock(&last_ptr->refill_lock); + goto checks; + } +- } else if (!cached && loop > LOOP_CACHING_NOWAIT) { ++ } else if (!cached && loop > LOOP_CACHING_NOWAIT ++ && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + ++ failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; +@@ -3820,25 +4333,30 @@ refill_cluster: + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ +- if (loop < LOOP_NO_EMPTY_SIZE) { +- btrfs_return_cluster_to_free_space(NULL, +- last_ptr); +- spin_unlock(&last_ptr->refill_lock); +- goto loop; +- } ++ btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); ++ goto loop; + } + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); +- if (!offset && (cached || (!cached && +- loop == LOOP_CACHING_NOWAIT))) { +- goto loop; +- } else if (!offset && (!cached && +- loop > LOOP_CACHING_NOWAIT)) { ++ /* ++ * If we didn't find a chunk, and we haven't failed on this ++ * block group before, and this block group is in the middle of ++ * caching and we are ok with waiting, then go ahead and wait ++ * for progress to be made, and set failed_alloc to true. ++ * ++ * If failed_alloc is true then we've already waited on this ++ * block group once and should move on to the next block group. ++ */ ++ if (!offset && !failed_alloc && !cached && ++ loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, +- num_bytes + empty_size); ++ num_bytes + empty_size); ++ failed_alloc = true; + goto have_block_group; ++ } else if (!offset) { ++ goto loop; + } + checks: + search_start = stripe_align(root, offset); +@@ -3880,9 +4398,13 @@ checks: + search_start - offset); + BUG_ON(offset > search_start); + ++ update_reserved_extents(block_group, num_bytes, 1); ++ + /* we are all good, lets return */ + break; + loop: ++ failed_cluster_refill = false; ++ failed_alloc = false; + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); +@@ -3940,21 +4462,32 @@ loop: + return ret; + } + +-static void dump_space_info(struct btrfs_space_info *info, u64 bytes) ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups) + { + struct btrfs_block_group_cache *cache; + ++ spin_lock(&info->lock); + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - +- info->bytes_pinned - info->bytes_reserved), ++ info->bytes_pinned - info->bytes_reserved - ++ info->bytes_super), + (info->full) ? 
"" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," +- " may_use=%llu, used=%llu\n", ++ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" ++ "\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, +- (unsigned long long)info->bytes_used); ++ (unsigned long long)info->bytes_used, ++ (unsigned long long)info->bytes_root, ++ (unsigned long long)info->bytes_super, ++ (unsigned long long)info->bytes_reserved); ++ spin_unlock(&info->lock); ++ ++ if (!dump_block_groups) ++ return; + + down_read(&info->groups_sem); + list_for_each_entry(cache, &info->block_groups, list) { +@@ -3972,12 +4505,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) + up_read(&info->groups_sem); + } + +-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) ++int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ u64 num_bytes, u64 min_alloc_size, ++ u64 empty_size, u64 hint_byte, ++ u64 search_end, struct btrfs_key *ins, ++ u64 data) + { + int ret; + u64 search_start = 0; +@@ -4022,7 +4555,7 @@ again: + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); +- dump_space_info(sinfo, num_bytes); ++ dump_space_info(sinfo, num_bytes, 1); + } + + return ret; +@@ -4043,25 +4576,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); ++ update_reserved_extents(cache, len, 0); + btrfs_put_block_group(cache); +- update_reserved_extents(root, start, len, 0); +- +- return ret; +-} +- +-int btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) +-{ +- int ret; +- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, +- empty_size, hint_byte, search_end, ins, +- data); +- if (!ret) +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + + return ret; + } +@@ -4222,15 +4738,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + { + int ret; + struct btrfs_block_group_cache *block_group; ++ struct btrfs_caching_control *caching_ctl; ++ u64 start = ins->objectid; ++ u64 num_bytes = ins->offset; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(block_group); +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ caching_ctl = get_caching_control(block_group); + +- ret = btrfs_remove_free_space(block_group, ins->objectid, +- ins->offset); +- BUG_ON(ret); ++ if (!caching_ctl) { ++ BUG_ON(!block_group_cache_done(block_group)); ++ ret = btrfs_remove_free_space(block_group, start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ mutex_lock(&caching_ctl->mutex); ++ ++ if (start >= caching_ctl->progress) { ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } else if (start + num_bytes <= caching_ctl->progress) { ++ ret = btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ num_bytes = caching_ctl->progress - start; ++ ret = 
btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ ++ start = caching_ctl->progress; ++ num_bytes = ins->objectid + ins->offset - ++ caching_ctl->progress; ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } ++ ++ mutex_unlock(&caching_ctl->mutex); ++ put_caching_control(caching_ctl); ++ } ++ ++ update_reserved_extents(block_group, ins->offset, 1); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); +@@ -4254,9 +4801,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + int ret; + u64 flags = 0; + +- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, +- empty_size, hint_byte, search_end, +- ins, 0); ++ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, ++ empty_size, hint_byte, search_end, ++ ins, 0); + if (ret) + return ret; + +@@ -4267,7 +4814,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + } else + BUG_ON(parent > 0); + +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); +@@ -4346,452 +4892,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + return buf; + } + +-#if 0 +-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, struct extent_buffer *leaf) +-{ +- u64 disk_bytenr; +- u64 num_bytes; +- struct btrfs_key key; +- struct btrfs_file_extent_item *fi; +- u32 nritems; +- int i; +- int ret; +- +- BUG_ON(!btrfs_is_leaf(leaf)); +- nritems = btrfs_header_nritems(leaf); +- +- for (i = 0; i < nritems; i++) { +- cond_resched(); +- btrfs_item_key_to_cpu(leaf, &key, i); +- +- /* only extents have references, skip everything else */ +- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) +- continue; +- +- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); +- +- /* inline extents live in the btree, they don't have refs */ +- if (btrfs_file_extent_type(leaf, fi) == +- BTRFS_FILE_EXTENT_INLINE) +- continue; +- +- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); +- +- /* holes don't have refs */ +- if (disk_bytenr == 0) +- continue; +- +- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); +- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, +- leaf->start, 0, key.objectid, 0); +- BUG_ON(ret); +- } +- return 0; +-} +- +-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_leaf_ref *ref) +-{ +- int i; +- int ret; +- struct btrfs_extent_info *info; +- struct refsort *sorted; +- +- if (ref->nritems == 0) +- return 0; +- +- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); +- for (i = 0; i < ref->nritems; i++) { +- sorted[i].bytenr = ref->extents[i].bytenr; +- sorted[i].slot = i; +- } +- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); +- +- /* +- * the items in the ref were sorted when the ref was inserted +- * into the ref cache, so this is already in order +- */ +- for (i = 0; i < ref->nritems; i++) { +- info = ref->extents + sorted[i].slot; +- ret = btrfs_free_extent(trans, root, info->bytenr, +- info->num_bytes, ref->bytenr, +- ref->owner, ref->generation, +- info->objectid, 0); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- BUG_ON(ret); +- info++; +- } +- +- kfree(sorted); +- return 0; 
+-} +- +- +-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, u64 start, +- u64 len, u32 *refs) +-{ +- int ret; +- +- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); +- BUG_ON(ret); +- +-#if 0 /* some debugging code in case we see problems here */ +- /* if the refs count is one, it won't get increased again. But +- * if the ref count is > 1, someone may be decreasing it at +- * the same time we are. +- */ +- if (*refs != 1) { +- struct extent_buffer *eb = NULL; +- eb = btrfs_find_create_tree_block(root, start, len); +- if (eb) +- btrfs_tree_lock(eb); +- +- mutex_lock(&root->fs_info->alloc_mutex); +- ret = lookup_extent_ref(NULL, root, start, len, refs); +- BUG_ON(ret); +- mutex_unlock(&root->fs_info->alloc_mutex); +- +- if (eb) { +- btrfs_tree_unlock(eb); +- free_extent_buffer(eb); +- } +- if (*refs == 1) { +- printk(KERN_ERR "btrfs block %llu went down to one " +- "during drop_snap\n", (unsigned long long)start); +- } +- +- } +-#endif +- +- cond_resched(); +- return ret; +-} ++struct walk_control { ++ u64 refs[BTRFS_MAX_LEVEL]; ++ u64 flags[BTRFS_MAX_LEVEL]; ++ struct btrfs_key update_progress; ++ int stage; ++ int level; ++ int shared_level; ++ int update_ref; ++ int keep_locks; ++ int reada_slot; ++ int reada_count; ++}; + ++#define DROP_REFERENCE 1 ++#define UPDATE_BACKREF 2 + +-/* +- * this is used while deleting old snapshots, and it drops the refs +- * on a whole subtree starting from a level 1 node. +- * +- * The idea is to sort all the leaf pointers, and then drop the +- * ref on all the leaves in order. Most of the time the leaves +- * will have ref cache entries, so no leaf IOs will be required to +- * find the extents they have references on. +- * +- * For each leaf, any references it has are also dropped in order +- * +- * This ends up dropping the references in something close to optimal +- * order for reading and modifying the extent allocation tree. 
+- */ +-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path) ++static noinline void reada_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct walk_control *wc, ++ struct btrfs_path *path) + { + u64 bytenr; +- u64 root_owner; +- u64 root_gen; +- struct extent_buffer *eb = path->nodes[1]; +- struct extent_buffer *leaf; +- struct btrfs_leaf_ref *ref; +- struct refsort *sorted = NULL; +- int nritems = btrfs_header_nritems(eb); ++ u64 generation; ++ u64 refs; ++ u64 flags; ++ u64 last = 0; ++ u32 nritems; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *eb; + int ret; +- int i; +- int refi = 0; +- int slot = path->slots[1]; +- u32 blocksize = btrfs_level_size(root, 0); +- u32 refs; +- +- if (nritems == 0) +- goto out; +- +- root_owner = btrfs_header_owner(eb); +- root_gen = btrfs_header_generation(eb); +- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); ++ int slot; ++ int nread = 0; + +- /* +- * step one, sort all the leaf pointers so we don't scribble +- * randomly into the extent allocation tree +- */ +- for (i = slot; i < nritems; i++) { +- sorted[refi].bytenr = btrfs_node_blockptr(eb, i); +- sorted[refi].slot = i; +- refi++; ++ if (path->slots[wc->level] < wc->reada_slot) { ++ wc->reada_count = wc->reada_count * 2 / 3; ++ wc->reada_count = max(wc->reada_count, 2); ++ } else { ++ wc->reada_count = wc->reada_count * 3 / 2; ++ wc->reada_count = min_t(int, wc->reada_count, ++ BTRFS_NODEPTRS_PER_BLOCK(root)); + } + +- /* +- * nritems won't be zero, but if we're picking up drop_snapshot +- * after a crash, slot might be > 0, so double check things +- * just in case. +- */ +- if (refi == 0) +- goto out; ++ eb = path->nodes[wc->level]; ++ nritems = btrfs_header_nritems(eb); ++ blocksize = btrfs_level_size(root, wc->level - 1); + +- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); ++ for (slot = path->slots[wc->level]; slot < nritems; slot++) { ++ if (nread >= wc->reada_count) ++ break; + +- /* +- * the first loop frees everything the leaves point to +- */ +- for (i = 0; i < refi; i++) { +- u64 ptr_gen; ++ cond_resched(); ++ bytenr = btrfs_node_blockptr(eb, slot); ++ generation = btrfs_node_ptr_generation(eb, slot); + +- bytenr = sorted[i].bytenr; ++ if (slot == path->slots[wc->level]) ++ goto reada; + +- /* +- * check the reference count on this leaf. If it is > 1 +- * we just decrement it below and don't update any +- * of the refs the leaf points to. +- */ +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- if (refs != 1) ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) + continue; + +- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); +- +- /* +- * the leaf only had one reference, which means the +- * only thing pointing to this leaf is the snapshot +- * we're deleting. It isn't possible for the reference +- * count to increase again later +- * +- * The reference cache is checked for the leaf, +- * and if found we'll be able to drop any refs held by +- * the leaf without needing to read it in. 
+- */ +- ref = btrfs_lookup_leaf_ref(root, bytenr); +- if (ref && ref->generation != ptr_gen) { +- btrfs_free_leaf_ref(root, ref); +- ref = NULL; +- } +- if (ref) { +- ret = cache_drop_leaf_ref(trans, root, ref); +- BUG_ON(ret); +- btrfs_remove_leaf_ref(root, ref); +- btrfs_free_leaf_ref(root, ref); +- } else { +- /* +- * the leaf wasn't in the reference cache, so +- * we have to read it. +- */ +- leaf = read_tree_block(root, bytenr, blocksize, +- ptr_gen); +- ret = btrfs_drop_leaf_ref(trans, root, leaf); +- BUG_ON(ret); +- free_extent_buffer(leaf); +- } +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +- +- /* +- * run through the loop again to free the refs on the leaves. +- * This is faster than doing it in the loop above because +- * the leaves are likely to be clustered together. We end up +- * working in nice chunks on the extent allocation tree. +- */ +- for (i = 0; i < refi; i++) { +- bytenr = sorted[i].bytenr; +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, eb->start, +- root_owner, root_gen, 0, 1); ++ /* We don't lock the tree block, it's OK to be racy here */ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &refs, &flags); + BUG_ON(ret); ++ BUG_ON(refs == 0); + +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +-out: +- kfree(sorted); +- +- /* +- * update the path to show we've processed the entire level 1 +- * node. This will get saved into the root's drop_snapshot_progress +- * field so these drops are not repeated again if this transaction +- * commits. +- */ +- path->slots[1] = nritems; +- return 0; +-} +- +-/* +- * helper function for drop_snapshot, this walks down the tree dropping ref +- * counts as it goes. +- */ +-static noinline int walk_down_tree(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path, int *level) +-{ +- u64 root_owner; +- u64 root_gen; +- u64 bytenr; +- u64 ptr_gen; +- struct extent_buffer *next; +- struct extent_buffer *cur; +- struct extent_buffer *parent; +- u32 blocksize; +- int ret; +- u32 refs; +- +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, +- path->nodes[*level]->len, &refs); +- BUG_ON(ret); +- if (refs > 1) +- goto out; +- +- /* +- * walk down to the last node level and free all the leaves +- */ +- while (*level >= 0) { +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- cur = path->nodes[*level]; +- +- if (btrfs_header_level(cur) != *level) +- WARN_ON(1); +- +- if (path->slots[*level] >= +- btrfs_header_nritems(cur)) +- break; ++ if (wc->stage == DROP_REFERENCE) { ++ if (refs == 1) ++ goto reada; + +- /* the new code goes down to level 1 and does all the +- * leaves pointed to that node in bulk. So, this check +- * for level 0 will always be false. +- * +- * But, the disk format allows the drop_snapshot_progress +- * field in the root to leave things in a state where +- * a leaf will need cleaning up here. If someone crashes +- * with the old code and then boots with the new code, +- * we might find a leaf here. 
+- */ +- if (*level == 0) { +- ret = btrfs_drop_leaf_ref(trans, root, cur); +- BUG_ON(ret); +- break; ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ continue; ++ btrfs_node_key_to_cpu(eb, &key, slot); ++ ret = btrfs_comp_cpu_keys(&key, ++ &wc->update_progress); ++ if (ret < 0) ++ continue; ++ } else { ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; + } +- +- /* +- * once we get to level one, process the whole node +- * at once, including everything below it. +- */ +- if (*level == 1) { +- ret = drop_level_one_refs(trans, root, path); +- BUG_ON(ret); ++reada: ++ ret = readahead_tree_block(root, bytenr, blocksize, ++ generation); ++ if (ret) + break; +- } +- +- bytenr = btrfs_node_blockptr(cur, path->slots[*level]); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); +- blocksize = btrfs_level_size(root, *level - 1); +- +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- +- /* +- * if there is more than one reference, we don't need +- * to read that node to drop any references it has. We +- * just drop the ref we hold on that node and move on to the +- * next slot in this level. +- */ +- if (refs != 1) { +- parent = path->nodes[*level]; +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- path->slots[*level]++; +- +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, parent->start, +- root_owner, root_gen, +- *level - 1, 1); +- BUG_ON(ret); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- continue; +- } +- +- /* +- * we need to keep freeing things in the next level down. +- * read the block and loop around to process it +- */ +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- WARN_ON(*level <= 0); +- if (path->nodes[*level-1]) +- free_extent_buffer(path->nodes[*level-1]); +- path->nodes[*level-1] = next; +- *level = btrfs_header_level(next); +- path->slots[*level] = 0; +- cond_resched(); ++ last = bytenr + blocksize; ++ nread++; + } +-out: +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- +- if (path->nodes[*level] == root->node) { +- parent = path->nodes[*level]; +- bytenr = path->nodes[*level]->start; +- } else { +- parent = path->nodes[*level + 1]; +- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); +- } +- +- blocksize = btrfs_level_size(root, *level); +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- +- /* +- * cleanup and free the reference on the last node +- * we processed +- */ +- ret = btrfs_free_extent(trans, root, bytenr, blocksize, +- parent->start, root_owner, root_gen, +- *level, 1); +- free_extent_buffer(path->nodes[*level]); +- path->nodes[*level] = NULL; +- +- *level += 1; +- BUG_ON(ret); +- +- cond_resched(); +- return 0; ++ wc->reada_slot = slot; + } +-#endif +- +-struct walk_control { +- u64 refs[BTRFS_MAX_LEVEL]; +- u64 flags[BTRFS_MAX_LEVEL]; +- struct btrfs_key update_progress; +- int stage; +- int level; +- int shared_level; +- int update_ref; +- int keep_locks; +-}; +- +-#define DROP_REFERENCE 1 +-#define UPDATE_BACKREF 2 + + /* + * hepler to process tree block while walking down the tree. + * +- * when wc->stage == DROP_REFERENCE, this function checks +- * reference count of the block. 
if the block is shared and +- * we need update back refs for the subtree rooted at the +- * block, this function changes wc->stage to UPDATE_BACKREF +- * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * +@@ -4800,11 +5002,10 @@ struct walk_control { + static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- struct walk_control *wc) ++ struct walk_control *wc, int lookup_info) + { + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; +- struct btrfs_key key; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + +@@ -4816,8 +5017,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ +- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || +- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { ++ if (lookup_info && ++ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || ++ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, +@@ -4827,21 +5029,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + BUG_ON(wc->refs[level] == 0); + } + +- if (wc->stage == DROP_REFERENCE && +- wc->update_ref && wc->refs[level] > 1) { +- BUG_ON(eb == root->node); +- BUG_ON(path->slots[level] > 0); +- if (level == 0) +- btrfs_item_key_to_cpu(eb, &key, path->slots[level]); +- else +- btrfs_node_key_to_cpu(eb, &key, path->slots[level]); +- if (btrfs_header_owner(eb) == root->root_key.objectid && +- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { +- wc->stage = UPDATE_BACKREF; +- wc->shared_level = level; +- } +- } +- + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; +@@ -4878,6 +5065,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + } + + /* ++ * hepler to process tree block pointer. ++ * ++ * when wc->stage == DROP_REFERENCE, this function checks ++ * reference count of the block pointed to. if the block ++ * is shared and we need update back refs for the subtree ++ * rooted at the block, this function changes wc->stage to ++ * UPDATE_BACKREF. if the block is shared and there is no ++ * need to update back, this function drops the reference ++ * to the block. ++ * ++ * NOTE: return value 1 means we should stop walking down. 
++ */ ++static noinline int do_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct walk_control *wc, int *lookup_info) ++{ ++ u64 bytenr; ++ u64 generation; ++ u64 parent; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *next; ++ int level = wc->level; ++ int reada = 0; ++ int ret = 0; ++ ++ generation = btrfs_node_ptr_generation(path->nodes[level], ++ path->slots[level]); ++ /* ++ * if the lower level block was created before the snapshot ++ * was created, we know there is no need to update back refs ++ * for the subtree ++ */ ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) { ++ *lookup_info = 1; ++ return 1; ++ } ++ ++ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); ++ blocksize = btrfs_level_size(root, level - 1); ++ ++ next = btrfs_find_tree_block(root, bytenr, blocksize); ++ if (!next) { ++ next = btrfs_find_create_tree_block(root, bytenr, blocksize); ++ reada = 1; ++ } ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &wc->refs[level - 1], ++ &wc->flags[level - 1]); ++ BUG_ON(ret); ++ BUG_ON(wc->refs[level - 1] == 0); ++ *lookup_info = 0; ++ ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->refs[level - 1] > 1) { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ goto skip; ++ ++ btrfs_node_key_to_cpu(path->nodes[level], &key, ++ path->slots[level]); ++ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); ++ if (ret < 0) ++ goto skip; ++ ++ wc->stage = UPDATE_BACKREF; ++ wc->shared_level = level - 1; ++ } ++ } else { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ } ++ ++ if (!btrfs_buffer_uptodate(next, generation)) { ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ next = NULL; ++ *lookup_info = 1; ++ } ++ ++ if (!next) { ++ if (reada && level == 1) ++ reada_walk_down(trans, root, wc, path); ++ next = read_tree_block(root, bytenr, blocksize, generation); ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ } ++ ++ level--; ++ BUG_ON(level != btrfs_header_level(next)); ++ path->nodes[level] = next; ++ path->slots[level] = 0; ++ path->locks[level] = 1; ++ wc->level = level; ++ if (wc->level == 1) ++ wc->reada_slot = 0; ++ return 0; ++skip: ++ wc->refs[level - 1] = 0; ++ wc->flags[level - 1] = 0; ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ++ parent = path->nodes[level]->start; ++ } else { ++ BUG_ON(root->root_key.objectid != ++ btrfs_header_owner(path->nodes[level])); ++ parent = 0; ++ } ++ ++ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, ++ root->root_key.objectid, level - 1, 0); ++ BUG_ON(ret); ++ } ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ *lookup_info = 1; ++ return 1; ++} ++ ++/* + * hepler to process tree block while walking up the tree. 
+ * + * when wc->stage == DROP_REFERENCE, this function drops +@@ -4904,7 +5221,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + if (level < wc->shared_level) + goto out; + +- BUG_ON(wc->refs[level] <= 1); + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; +@@ -4935,8 +5251,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + path->locks[level] = 0; + return 1; + } +- } else { +- BUG_ON(level != 0); + } + } + +@@ -4989,39 +5303,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct walk_control *wc) + { +- struct extent_buffer *next; +- struct extent_buffer *cur; +- u64 bytenr; +- u64 ptr_gen; +- u32 blocksize; + int level = wc->level; ++ int lookup_info = 1; + int ret; + + while (level >= 0) { +- cur = path->nodes[level]; +- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); ++ if (path->slots[level] >= ++ btrfs_header_nritems(path->nodes[level])) ++ break; + +- ret = walk_down_proc(trans, root, path, wc); ++ ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + +- bytenr = btrfs_node_blockptr(cur, path->slots[level]); +- blocksize = btrfs_level_size(root, level - 1); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); +- +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- btrfs_tree_lock(next); +- btrfs_set_lock_blocking(next); +- +- level--; +- BUG_ON(level != btrfs_header_level(next)); +- path->nodes[level] = next; +- path->slots[level] = 0; +- path->locks[level] = 1; +- wc->level = level; ++ ret = do_walk_down(trans, root, path, wc, &lookup_info); ++ if (ret > 0) { ++ path->slots[level]++; ++ continue; ++ } ++ level = wc->level; + } + return 0; + } +@@ -5111,9 +5414,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + err = ret; + goto out; + } +- btrfs_node_key_to_cpu(path->nodes[level], &key, +- path->slots[level]); +- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); ++ WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this +@@ -5148,6 +5449,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + ret = walk_down_tree(trans, root, path, wc); +@@ -5200,9 +5502,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + +- free_extent_buffer(root->node); +- free_extent_buffer(root->commit_root); +- kfree(root); ++ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { ++ ret = btrfs_find_last_root(tree_root, root->root_key.objectid, ++ NULL, NULL); ++ BUG_ON(ret < 0); ++ if (ret > 0) { ++ ret = btrfs_del_orphan_item(trans, tree_root, ++ root->root_key.objectid); ++ BUG_ON(ret); ++ } ++ } ++ ++ if (root->in_radix) { ++ btrfs_free_fs_root(tree_root->fs_info, root); ++ } else { ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); ++ kfree(root); ++ } + out: + btrfs_end_transaction(trans, tree_root); + kfree(wc); +@@ -5254,6 +5571,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); +@@ -5396,9 +5714,9 @@ static noinline int 
relocate_data_extent(struct inode *reloc_inode, + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -6841,287 +7159,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + return 0; + } + +-#if 0 +-static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) +-{ +- struct btrfs_path *path; +- struct btrfs_inode_item *item; +- struct extent_buffer *leaf; +- int ret; +- +- path = btrfs_alloc_path(); +- if (!path) +- return -ENOMEM; +- +- path->leave_spinning = 1; +- ret = btrfs_insert_empty_inode(trans, root, path, objectid); +- if (ret) +- goto out; +- +- leaf = path->nodes[0]; +- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); +- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); +- btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); +- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); +- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); +- btrfs_mark_buffer_dirty(leaf); +- btrfs_release_path(root, path); +-out: +- btrfs_free_path(path); +- return ret; +-} +- +-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, +- struct btrfs_block_group_cache *group) ++/* ++ * checks to see if its even possible to relocate this block group. ++ * ++ * @return - -1 if it's not a good idea to relocate this block group, 0 if its ++ * ok to go ahead and try. ++ */ ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) + { +- struct inode *inode = NULL; +- struct btrfs_trans_handle *trans; +- struct btrfs_root *root; +- struct btrfs_key root_key; +- u64 objectid = BTRFS_FIRST_FREE_OBJECTID; +- int err = 0; ++ struct btrfs_block_group_cache *block_group; ++ struct btrfs_space_info *space_info; ++ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; ++ struct btrfs_device *device; ++ int full = 0; ++ int ret = 0; + +- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; +- root_key.type = BTRFS_ROOT_ITEM_KEY; +- root_key.offset = (u64)-1; +- root = btrfs_read_fs_root_no_name(fs_info, &root_key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + +- trans = btrfs_start_transaction(root, 1); +- BUG_ON(!trans); ++ /* odd, couldn't find the block group, leave it alone */ ++ if (!block_group) ++ return -1; + +- err = btrfs_find_free_objectid(trans, root, objectid, &objectid); +- if (err) ++ /* no bytes used, we're good */ ++ if (!btrfs_block_group_used(&block_group->item)) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); +- BUG_ON(err); +- +- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); +- if (inode->i_state & I_NEW) { +- BTRFS_I(inode)->root = root; +- BTRFS_I(inode)->location.objectid = objectid; +- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; +- BTRFS_I(inode)->location.offset = 0; +- btrfs_read_locked_inode(inode); +- unlock_new_inode(inode); +- BUG_ON(is_bad_inode(inode)); +- } else { +- BUG_ON(1); +- } +- BTRFS_I(inode)->index_cnt = group->key.objectid; +- +- err = 
btrfs_orphan_add(trans, inode); +-out: +- btrfs_end_transaction(trans, root); +- if (err) { +- if (inode) +- iput(inode); +- inode = ERR_PTR(err); +- } +- return inode; +-} +- +-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +-{ +- +- struct btrfs_ordered_sum *sums; +- struct btrfs_sector_sum *sector_sum; +- struct btrfs_ordered_extent *ordered; +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct list_head list; +- size_t offset; +- int ret; +- u64 disk_bytenr; +- +- INIT_LIST_HEAD(&list); +- +- ordered = btrfs_lookup_ordered_extent(inode, file_pos); +- BUG_ON(ordered->file_offset != file_pos || ordered->len != len); +- +- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; +- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, +- disk_bytenr + len - 1, &list); +- +- while (!list_empty(&list)) { +- sums = list_entry(list.next, struct btrfs_ordered_sum, list); +- list_del_init(&sums->list); +- +- sector_sum = sums->sums; +- sums->bytenr = ordered->start; ++ space_info = block_group->space_info; ++ spin_lock(&space_info->lock); + +- offset = 0; +- while (offset < sums->len) { +- sector_sum->bytenr += ordered->start - disk_bytenr; +- sector_sum++; +- offset += root->sectorsize; +- } ++ full = space_info->full; + +- btrfs_add_ordered_sum(inode, ordered, sums); ++ /* ++ * if this is the last block group we have in this space, we can't ++ * relocate it unless we're able to allocate a new chunk below. ++ * ++ * Otherwise, we need to make sure we have room in the space to handle ++ * all of the extents from this block group. If we can, we're good ++ */ ++ if ((space_info->total_bytes != block_group->key.offset) && ++ (space_info->bytes_used + space_info->bytes_reserved + ++ space_info->bytes_pinned + space_info->bytes_readonly + ++ btrfs_block_group_used(&block_group->item) < ++ space_info->total_bytes)) { ++ spin_unlock(&space_info->lock); ++ goto out; + } +- btrfs_put_ordered_extent(ordered); +- return 0; +-} +- +-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +-{ +- struct btrfs_trans_handle *trans; +- struct btrfs_path *path; +- struct btrfs_fs_info *info = root->fs_info; +- struct extent_buffer *leaf; +- struct inode *reloc_inode; +- struct btrfs_block_group_cache *block_group; +- struct btrfs_key key; +- u64 skipped; +- u64 cur_byte; +- u64 total_found; +- u32 nritems; +- int ret; +- int progress; +- int pass = 0; +- +- root = root->fs_info->extent_root; +- +- block_group = btrfs_lookup_block_group(info, group_start); +- BUG_ON(!block_group); +- +- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", +- (unsigned long long)block_group->key.objectid, +- (unsigned long long)block_group->flags); +- +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- +- reloc_inode = create_reloc_inode(info, block_group); +- BUG_ON(IS_ERR(reloc_inode)); +- +- __alloc_chunk_for_shrink(root, block_group, 1); +- set_block_group_readonly(block_group); +- +- btrfs_start_delalloc_inodes(info->tree_root); +- btrfs_wait_ordered_extents(info->tree_root, 0); +-again: +- skipped = 0; +- total_found = 0; +- progress = 0; +- key.objectid = block_group->key.objectid; +- key.offset = 0; +- key.type = 0; +- cur_byte = key.objectid; +- +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ spin_unlock(&space_info->lock); + +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(info->tree_root); +- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); +- 
mutex_unlock(&root->fs_info->cleaner_mutex); ++ /* ++ * ok we don't have enough space, but maybe we have free space on our ++ * devices to allocate new chunks for relocation, so loop through our ++ * alloc devices and guess if we have enough space. However, if we ++ * were marked as full, then we know there aren't enough chunks, and we ++ * can just return. ++ */ ++ ret = -1; ++ if (full) ++ goto out; + +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ mutex_lock(&root->fs_info->chunk_mutex); ++ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { ++ u64 min_free = btrfs_block_group_used(&block_group->item); ++ u64 dev_offset, max_avail; + +- while (1) { +- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) +- goto out; +-next: +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- if (path->slots[0] >= nritems) { +- ret = btrfs_next_leaf(root, path); +- if (ret < 0) +- goto out; +- if (ret == 1) { +- ret = 0; ++ /* ++ * check to make sure we can actually find a chunk with enough ++ * space to fit our block group in. ++ */ ++ if (device->total_bytes > device->bytes_used + min_free) { ++ ret = find_free_dev_extent(NULL, device, min_free, ++ &dev_offset, &max_avail); ++ if (!ret) + break; +- } +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- } +- +- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +- +- if (key.objectid >= block_group->key.objectid + +- block_group->key.offset) +- break; +- +- if (progress && need_resched()) { +- btrfs_release_path(root, path); +- cond_resched(); +- progress = 0; +- continue; +- } +- progress = 1; +- +- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || +- key.objectid + key.offset <= cur_byte) { +- path->slots[0]++; +- goto next; ++ ret = -1; + } +- +- total_found++; +- cur_byte = key.objectid + key.offset; +- btrfs_release_path(root, path); +- +- __alloc_chunk_for_shrink(root, block_group, 0); +- ret = relocate_one_extent(root, path, &key, block_group, +- reloc_inode, pass); +- BUG_ON(ret < 0); +- if (ret > 0) +- skipped++; +- +- key.objectid = cur_byte; +- key.type = 0; +- key.offset = 0; + } +- +- btrfs_release_path(root, path); +- +- if (pass == 0) { +- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); +- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); +- } +- +- if (total_found > 0) { +- printk(KERN_INFO "btrfs found %llu extents in pass %d\n", +- (unsigned long long)total_found, pass); +- pass++; +- if (total_found == skipped && pass > 2) { +- iput(reloc_inode); +- reloc_inode = create_reloc_inode(info, block_group); +- pass = 0; +- } +- goto again; +- } +- +- /* delete reloc_inode */ +- iput(reloc_inode); +- +- /* unpin extents in this range */ +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); +- +- spin_lock(&block_group->lock); +- WARN_ON(block_group->pinned > 0); +- WARN_ON(block_group->reserved > 0); +- WARN_ON(btrfs_block_group_used(&block_group->item) > 0); +- spin_unlock(&block_group->lock); +- btrfs_put_block_group(block_group); +- ret = 0; ++ mutex_unlock(&root->fs_info->chunk_mutex); + out: +- btrfs_free_path(path); ++ btrfs_put_block_group(block_group); + return ret; + } +-#endif + + static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +@@ -7164,8 +7281,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + { + struct btrfs_block_group_cache *block_group; + struct 
btrfs_space_info *space_info; ++ struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + ++ down_write(&info->extent_commit_sem); ++ while (!list_empty(&info->caching_block_groups)) { ++ caching_ctl = list_entry(info->caching_block_groups.next, ++ struct btrfs_caching_control, list); ++ list_del(&caching_ctl->list); ++ put_caching_control(caching_ctl); ++ } ++ up_write(&info->extent_commit_sem); ++ + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, +@@ -7179,8 +7306,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +@@ -7250,7 +7376,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); + cache->fs_info = info; +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7272,8 +7397,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + +- remove_sb_from_cache(root, cache); +- + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't +@@ -7282,13 +7405,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; ++ free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); ++ free_excluded_extents(root, cache); + } + + ret = update_space_info(info, cache->flags, found_key.offset, +@@ -7296,6 +7425,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) + &space_info); + BUG_ON(ret); + cache->space_info = space_info; ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups); + up_write(&space_info->groups_sem); +@@ -7345,7 +7478,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7354,15 +7486,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; +- remove_sb_from_cache(root, cache); ++ exclude_super_stripes(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ++ free_excluded_extents(root, cache); ++ + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + 
&cache->space_info); + BUG_ON(ret); ++ ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); +@@ -7428,8 +7568,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 6826018..96577e8 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + return NULL; + } + ++static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, ++ struct extent_state *other) ++{ ++ if (tree->ops && tree->ops->merge_extent_hook) ++ tree->ops->merge_extent_hook(tree->mapping->host, new, ++ other); ++} ++ + /* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single +@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); +@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); ++ state = NULL; + } + } ++ + return 0; + } + +-static void set_state_cb(struct extent_io_tree *tree, ++static int set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { + if (tree->ops && tree->ops->set_bit_hook) { +- tree->ops->set_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); ++ return tree->ops->set_bit_hook(tree->mapping->host, ++ state->start, state->end, ++ state->state, bits); + } ++ ++ return 0; + } + + static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { +- if (tree->ops && tree->ops->clear_bit_hook) { +- tree->ops->clear_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); +- } ++ if (tree->ops && tree->ops->clear_bit_hook) ++ tree->ops->clear_bit_hook(tree->mapping->host, state, bits); + } + + /* +@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, + int bits) + { + struct rb_node *node; ++ int ret; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", +@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree, + (unsigned long long)start); + WARN_ON(1); + } ++ state->start = start; ++ state->end = end; ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; +- set_state_cb(tree, state, bits); + state->state |= bits; +- state->start = start; +- state->end = end; + node = 
tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; +@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, + return 0; + } + ++static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, ++ u64 split) ++{ ++ if (tree->ops && tree->ops->split_extent_hook) ++ return tree->ops->split_extent_hook(tree->mapping->host, ++ orig, split); ++ return 0; ++} ++ + /* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an +@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) + { + struct rb_node *node; ++ ++ split_cb(tree, orig, split); ++ + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; +@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) + { +- int ret = state->state & bits; ++ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; ++ int ret = state->state & bits_to_clear; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; +@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); +- state->state &= ~bits; ++ state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { +@@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree, + * bits were already set, or zero if none of the bits were already set. + */ + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask) ++ int bits, int wake, int delete, ++ struct extent_state **cached_state, ++ gfp_t mask) + { + struct extent_state *state; ++ struct extent_state *cached; + struct extent_state *prealloc = NULL; ++ struct rb_node *next_node; + struct rb_node *node; + u64 last_end; + int err; +@@ -488,6 +522,17 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state) { ++ cached = *cached_state; ++ *cached_state = NULL; ++ cached_state = NULL; ++ if (cached && cached->tree && cached->start == start) { ++ atomic_dec(&cached->refs); ++ state = cached; ++ goto hit_next; ++ } ++ free_extent_state(cached); ++ } + /* + * this search will find the extents that end after + * our range starts +@@ -496,6 +541,7 @@ again: + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); +@@ -526,13 +572,11 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set |= clear_state_bit(tree, state, bits, +- wake, delete); ++ set |= clear_state_bit(tree, state, bits, wake, ++ delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -547,19 +591,30 @@ again: + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); +- + if (wake) + wake_up(&state->wq); +- set |= clear_state_bit(tree, prealloc, bits, +- wake, delete); ++ ++ set |= clear_state_bit(tree, prealloc, bits, wake, delete); ++ + prealloc = NULL; + goto out; + } + ++ if (state->end < end && prealloc && !need_resched()) ++ next_node = rb_next(&state->rb_node); ++ else ++ next_node = NULL; ++ + set |= 
clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; ++ if (start <= end && next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } + goto search_again; + + out: +@@ -641,40 +696,59 @@ out: + return 0; + } + +-static void set_state_bits(struct extent_io_tree *tree, ++static int set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) + { ++ int ret; ++ ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } +- set_state_cb(tree, state, bits); + state->state |= bits; ++ ++ return 0; ++} ++ ++static void cache_state(struct extent_state *state, ++ struct extent_state **cached_ptr) ++{ ++ if (cached_ptr && !(*cached_ptr)) { ++ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { ++ *cached_ptr = state; ++ atomic_inc(&state->refs); ++ } ++ } + } + + /* +- * set some bits on a range in the tree. This may require allocations +- * or sleeping, so the gfp mask is used to indicate what is allowed. ++ * set some bits on a range in the tree. This may require allocations or ++ * sleeping, so the gfp mask is used to indicate what is allowed. + * +- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the +- * range already has the desired bits set. The start of the existing +- * range is returned in failed_start in this case. ++ * If any of the exclusive bits are set, this will fail with -EEXIST if some ++ * part of the range already has the desired bits set. The start of the ++ * existing range is returned in failed_start in this case. + * +- * [start, end] is inclusive +- * This takes the tree lock. ++ * [start, end] is inclusive This takes the tree lock. + */ ++ + static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int exclusive, u64 *failed_start, ++ int bits, int exclusive_bits, u64 *failed_start, ++ struct extent_state **cached_state, + gfp_t mask) + { + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; +- int set; + u64 last_start; + u64 last_end; ++ + again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); +@@ -683,6 +757,13 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state && *cached_state) { ++ state = *cached_state; ++ if (state->start == start && state->tree) { ++ node = &state->rb_node; ++ goto hit_next; ++ } ++ } + /* + * this search will find all the extents that end after + * our range starts. 
+@@ -694,8 +775,8 @@ again: + BUG_ON(err == -EEXIST); + goto out; + } +- + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + last_start = state->start; + last_end = state->end; + +@@ -706,17 +787,32 @@ again: + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { +- set = state->state & bits; +- if (set && exclusive) { ++ struct rb_node *next_node; ++ if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } +- set_state_bits(tree, state, bits); ++ ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; ++ + start = last_end + 1; ++ if (start < end && prealloc && !need_resched()) { ++ next_node = rb_next(node); ++ if (next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } ++ } + goto search_again; + } + +@@ -737,8 +833,7 @@ again: + * desired bit on it. + */ + if (state->start < start) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -749,13 +844,14 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set_state_bits(tree, state, bits); ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -774,10 +870,13 @@ again: + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); +- prealloc = NULL; + BUG_ON(err == -EEXIST); +- if (err) ++ if (err) { ++ prealloc = NULL; + goto out; ++ } ++ cache_state(prealloc, cached_state); ++ prealloc = NULL; + start = this_end + 1; + goto search_again; + } +@@ -788,8 +887,7 @@ again: + * on the first half + */ + if (state->start <= end && state->end > end) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -797,7 +895,12 @@ again: + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + +- set_state_bits(tree, prealloc, bits); ++ err = set_state_bits(tree, prealloc, bits); ++ if (err) { ++ prealloc = NULL; ++ goto out; ++ } ++ cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; +@@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, +- mask); +-} +- +-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); ++ NULL, mask); + } + + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { + return set_extent_bit(tree, start, end, bits, 0, NULL, +- mask); ++ NULL, mask); + } + + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, bits, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + } + + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, +- EXTENT_DELALLOC | 
EXTENT_DIRTY, +- 0, NULL, mask); ++ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, ++ 0, NULL, NULL, mask); + } + + int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return clear_extent_bit(tree, start, end, +- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +-} +- +-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); ++ EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, 0, 0, ++ NULL, mask); + } + + int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, ++ NULL, mask); + } + + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +-} +- +-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, +- 0, NULL, mask); +-} +- +-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, +- u64 end, gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, ++ NULL, mask); + } + + int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +@@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. 
+ */ +-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached_state, gfp_t mask) + { + int err; + u64 failed_start; + while (1) { +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, ++ EXTENT_LOCKED, &failed_start, ++ cached_state, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; +@@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) + return err; + } + ++int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++{ ++ return lock_extent_bits(tree, start, end, 0, NULL, mask); ++} ++ + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + int err; + u64 failed_start; + +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, ++ &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, +- EXTENT_LOCKED, 1, 0, mask); ++ EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; + } + ++int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, ++ struct extent_state **cached, gfp_t mask) ++{ ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, ++ mask); ++} ++ + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, ++ mask); + } + + /* +@@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, + u64 delalloc_start; + u64 delalloc_end; + u64 found; ++ struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +@@ -1269,6 +1365,7 @@ again: + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ ++ free_extent_state(cached_state); + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; +@@ -1282,18 +1379,21 @@ again: + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ +- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ lock_extent_bits(tree, delalloc_start, delalloc_end, ++ 0, &cached_state, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, +- EXTENT_DELALLOC, 1); ++ EXTENT_DELALLOC, 1, cached_state); + if (!ret) { +- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ unlock_extent_cached(tree, delalloc_start, delalloc_end, ++ &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } ++ free_extent_state(cached_state); + 
*start = delalloc_start; + *end = delalloc_end; + out_failed: +@@ -1303,11 +1403,7 @@ out_failed: + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_pages, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback) ++ unsigned long op) + { + int ret; + struct page *pages[16]; +@@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + int i; + int clear_bits = 0; + +- if (clear_unlock) ++ if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_bits |= EXTENT_DIRTY; + +- if (clear_delalloc) ++ if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + +- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); +- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) ++ if (op & EXTENT_CLEAR_ACCOUNTING) ++ clear_bits |= EXTENT_DO_ACCOUNTING; ++ ++ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); ++ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | ++ EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { +@@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { ++ ++ if (op & EXTENT_SET_PRIVATE2) ++ SetPagePrivate2(pages[i]); ++ + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); +- if (set_writeback) ++ if (op & EXTENT_SET_WRITEBACK) + set_page_writeback(pages[i]); +- if (end_writeback) ++ if (op & EXTENT_END_WRITEBACK) + end_page_writeback(pages[i]); +- if (unlock_pages) ++ if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } +@@ -1476,14 +1581,17 @@ out: + * range is found set. 
+ */ + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled) ++ int bits, int filled, struct extent_state *cached) + { + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); +- node = tree_search(tree, start); ++ if (cached && cached->tree && cached->start == start) ++ node = &cached->rb_node; ++ else ++ node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + +@@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + bitset = 0; + break; + } ++ ++ if (state->end == (u64)-1) ++ break; ++ + start = state->end + 1; + if (start > end) + break; +@@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) ++ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); + return 0; + } +@@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) ++ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) + unlock_page(page); + return 0; + } +@@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree, + static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) + { +- u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +- u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) +- end_page_writeback(page); ++ end_page_writeback(page); + return 0; + } + +@@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) + } + + if (!uptodate) { +- clear_extent_uptodate(tree, start, end, GFP_ATOMIC); ++ clear_extent_uptodate(tree, start, end, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + +- clear_extent_writeback(tree, start, end, GFP_ATOMIC); +- + if (whole_page) + end_page_writeback(page); + else +@@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, + continue; + } + /* the get_extent function already copied into the page */ +- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { ++ if (test_range_bit(tree, cur, cur_end, ++ EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; +@@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + u64 iosize; + u64 unlock_start; + sector_t sector; ++ struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + int ret; +@@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { ++ u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. 
+@@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); ++ /* ++ * delalloc_end is already one less than the total ++ * length, so we don't subtract one from ++ * PAGE_CACHE_SIZE ++ */ ++ delalloc_to_write += (delalloc_end - delalloc_start + ++ PAGE_CACHE_SIZE) >> ++ PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } ++ if (wbc->nr_to_write < delalloc_to_write) { ++ int thresh = 8192; ++ ++ if (delalloc_to_write < thresh * 2) ++ thresh = delalloc_to_write; ++ wbc->nr_to_write = min_t(u64, delalloc_to_write, ++ thresh); ++ } + + /* did the fill delalloc function already unlock and start + * the IO? +@@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done_unlocked; + } + } +- lock_extent(tree, start, page_end, GFP_NOFS); +- +- unlock_start = start; +- + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { +- unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); + unlock_page(page); +@@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; +- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) +- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); +- + if (last_byte <= start) { +- clear_extent_dirty(tree, start, page_end, GFP_NOFS); +- unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); +@@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done; + } + +- set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { +- clear_extent_dirty(tree, cur, page_end, GFP_NOFS); +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); +@@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { +- clear_extent_dirty(tree, cur, +- cur + iosize - 1, GFP_NOFS); +- +- unlock_extent(tree, unlock_start, cur + iosize - 1, +- GFP_NOFS); +- + /* + * end_io notification does not happen here for + * compressed extents +@@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, +- EXTENT_DIRTY, 0)) { ++ EXTENT_DIRTY, 0, NULL)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + +- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); +@@ -2309,12 +2415,12 @@ done: + set_page_writeback(page); + end_page_writeback(page); + } +- if (unlock_start <= page_end) +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + + done_unlocked: + ++ /* drop our reference on any cached states 
*/ ++ free_extent_state(cached_state); + return 0; + } + +@@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) + { +- struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; ++ int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; +@@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + scanned = 1; + } + retry: +- while (!done && (index <= end) && ++ while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { +@@ -2412,12 +2518,15 @@ retry: + unlock_page(page); + ret = 0; + } +- if (ret || wbc->nr_to_write <= 0) +- done = 1; +- if (wbc->nonblocking && bdi_write_congested(bdi)) { +- wbc->encountered_congestion = 1; ++ if (ret) + done = 1; +- } ++ ++ /* ++ * the filesystem may choose to bump up nr_to_write. ++ * We have to make sure to honor the new nr_to_write ++ * at any time ++ */ ++ nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); +@@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, + return 0; + + lock_extent(tree, start, end, GFP_NOFS); +- wait_on_extent_writeback(tree, start, end); ++ wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, +- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, +- 1, 1, GFP_NOFS); ++ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, ++ 1, 1, NULL, GFP_NOFS); + return 0; + } + +@@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; +@@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, +- EXTENT_LOCKED, 0, NULL, GFP_NOFS); ++ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, +@@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map, + int ret = 1; + + if (test_range_bit(tree, start, end, +- EXTENT_IOBITS | EXTENT_ORDERED, 0)) ++ EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; +- clear_extent_bit(tree, start, end, EXTENT_UPTODATE, +- 1, 1, mask); ++ /* ++ * at this point we can safely clear everything except the ++ * locked bit and the nodatasum bit ++ */ ++ clear_extent_bit(tree, start, end, ++ ~(EXTENT_LOCKED | EXTENT_NODATASUM), ++ 0, 0, NULL, mask); + } + return ret; + } +@@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map, + u64 len; + while (start <= end) { + len = end - start + 1; +- spin_lock(&map->lock); ++ write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, +- EXTENT_LOCKED | EXTENT_WRITEBACK | +- EXTENT_ORDERED, +- 0)) { ++ 
EXTENT_LOCKED | EXTENT_WRITEBACK, ++ 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); +@@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int uptodate; + unsigned long index; + +- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); ++ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + while (start <= end) { +@@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1); ++ EXTENT_UPTODATE, 1, NULL); + if (ret) + return ret; + +@@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + +diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h +index 5bc20ab..36de250 100644 +--- a/fs/btrfs/extent_io.h ++++ b/fs/btrfs/extent_io.h +@@ -13,10 +13,9 @@ + #define EXTENT_DEFRAG (1 << 6) + #define EXTENT_DEFRAG_DONE (1 << 7) + #define EXTENT_BUFFER_FILLED (1 << 8) +-#define EXTENT_ORDERED (1 << 9) +-#define EXTENT_ORDERED_METADATA (1 << 10) +-#define EXTENT_BOUNDARY (1 << 11) +-#define EXTENT_NODATASUM (1 << 12) ++#define EXTENT_BOUNDARY (1 << 9) ++#define EXTENT_NODATASUM (1 << 10) ++#define EXTENT_DO_ACCOUNTING (1 << 11) + #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + + /* flags for bio submission */ +@@ -27,6 +26,16 @@ + #define EXTENT_BUFFER_BLOCKING 1 + #define EXTENT_BUFFER_DIRTY 2 + ++/* these are flags for extent_clear_unlock_delalloc */ ++#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 ++#define EXTENT_CLEAR_UNLOCK 0x2 ++#define EXTENT_CLEAR_DELALLOC 0x4 ++#define EXTENT_CLEAR_DIRTY 0x8 ++#define EXTENT_SET_WRITEBACK 0x10 ++#define EXTENT_END_WRITEBACK 0x20 ++#define EXTENT_SET_PRIVATE2 0x40 ++#define EXTENT_CLEAR_ACCOUNTING 0x80 ++ + /* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. 
+@@ -62,8 +71,13 @@ struct extent_io_ops { + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); +- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits); ++ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, ++ unsigned long bits); ++ int (*merge_extent_hook)(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other); ++ int (*split_extent_hook)(struct inode *inode, ++ struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page); + }; + +@@ -81,10 +95,14 @@ struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; ++ ++ /* ADD NEW ELEMENTS AFTER THIS */ + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; ++ u64 split_start; ++ u64 split_end; + + /* for use by the FS */ + u64 private; +@@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); + int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached, gfp_t mask); + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +@@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree, + u64 max_bytes, unsigned long bits); + + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled); ++ int bits, int filled, struct extent_state *cached_state); + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask); ++ int bits, int wake, int delete, struct extent_state **cached, ++ gfp_t mask); + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, +@@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_page, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback); ++ unsigned long op); + #endif +diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c +index 30c9365..2c726b7 100644 +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -36,7 +36,7 @@ void extent_map_exit(void) + void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) + { + tree->map.rb_node = NULL; +- spin_lock_init(&tree->lock); ++ rwlock_init(&tree->lock); + } + + /** +@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) + return 0; + } + ++int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) ++{ ++ int ret = 0; ++ struct extent_map *merge = NULL; ++ struct rb_node *rb; ++ struct extent_map *em; ++ ++ write_lock(&tree->lock); ++ em = lookup_extent_mapping(tree, start, len); ++ ++ WARN_ON(em->start != start || !em); ++ ++ if (!em) ++ goto out; ++ ++ clear_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ if (em->start != 0) { ++ rb = rb_prev(&em->rb_node); ++ if (rb) ++ 
merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(merge, em)) { ++ em->start = merge->start; ++ em->len += merge->len; ++ em->block_len += merge->block_len; ++ em->block_start = merge->block_start; ++ merge->in_tree = 0; ++ rb_erase(&merge->rb_node, &tree->map); ++ free_extent_map(merge); ++ } ++ } ++ ++ rb = rb_next(&em->rb_node); ++ if (rb) ++ merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(em, merge)) { ++ em->len += merge->len; ++ em->block_len += merge->len; ++ rb_erase(&merge->rb_node, &tree->map); ++ merge->in_tree = 0; ++ free_extent_map(merge); ++ } ++ ++ free_extent_map(em); ++out: ++ write_unlock(&tree->lock); ++ return ret; ++ ++} ++ + /** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in +@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree, + ret = -EEXIST; + goto out; + } +- assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; +@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + struct rb_node *next = NULL; + u64 end = range_end(start, len); + +- assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); +@@ -319,6 +367,54 @@ out: + } + + /** ++ * search_extent_mapping - find a nearby extent map ++ * @tree: tree to lookup in ++ * @start: byte offset to start the search ++ * @len: length of the lookup range ++ * ++ * Find and return the first extent_map struct in @tree that intersects the ++ * [start, len] range. ++ * ++ * If one can't be found, any nearby extent may be returned ++ */ ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len) ++{ ++ struct extent_map *em; ++ struct rb_node *rb_node; ++ struct rb_node *prev = NULL; ++ struct rb_node *next = NULL; ++ ++ rb_node = __tree_search(&tree->map, start, &prev, &next); ++ if (!rb_node && prev) { ++ em = rb_entry(prev, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node && next) { ++ em = rb_entry(next, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node) { ++ em = NULL; ++ goto out; ++ } ++ if (IS_ERR(rb_node)) { ++ em = ERR_PTR(PTR_ERR(rb_node)); ++ goto out; ++ } ++ em = rb_entry(rb_node, struct extent_map, rb_node); ++ goto found; ++ ++ em = NULL; ++ goto out; ++ ++found: ++ atomic_inc(&em->refs); ++out: ++ return em; ++} ++ ++/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed +@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); +- assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h +index fb6eeef..ab6d74b 100644 +--- a/fs/btrfs/extent_map.h ++++ b/fs/btrfs/extent_map.h +@@ -31,7 +31,7 @@ struct extent_map { + + struct extent_map_tree { + struct rb_root map; +- spinlock_t lock; ++ rwlock_t lock; + }; + + static inline u64 extent_map_end(struct extent_map *em) +@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask); + void free_extent_map(struct extent_map *em); + int __init extent_map_init(void); + void extent_map_exit(void); ++int unpin_extent_cache(struct extent_map_tree *tree, u64 
start, u64 len); ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len); + #endif +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 4b83397..4599113 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +- u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; +@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; ++ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); ++ if (err) ++ return err; + +- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- trans = btrfs_join_transaction(root, 1); +- if (!trans) { +- err = -ENOMEM; +- goto out_unlock; +- } +- btrfs_set_trans_block_group(trans, inode); +- hint_byte = 0; +- +- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- +- /* check for reserved extents on each page, we don't want +- * to reset the delalloc bit on things that already have +- * extents reserved. +- */ +- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); +@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + * at this time. + */ + } +- err = btrfs_end_transaction(trans, root); +-out_unlock: +- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; + } + +@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { +- spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); ++ write_unlock(&em_tree->lock); + break; + } + if (start < em->start) { +@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + start = em->start + em->len; + } + free_extent_map(em); ++ write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + free_extent_map(split); + split = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); +@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_byte) ++ u64 inline_limit, u64 *hint_byte, int drop_cache) + { + u64 extent_end = 0; + u64 search_start = start; +@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + int ret; + + inline_limit = 0; +- btrfs_drop_extent_cache(inode, start, end - 1, 0); ++ if (drop_cache) ++ btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = 
btrfs_alloc_path(); + if (!path) +@@ -894,7 +878,8 @@ again: + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, +- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, ++ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); +@@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); ++ ++ /* do the reserve before the mutex lock in case we have to do some ++ * flushing. We wouldn't deadlock, but this is more polite. ++ */ ++ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (err) ++ goto out_nolock; ++ ++ mutex_lock(&inode->i_mutex); ++ + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) +- goto out_nolock; ++ goto out; ++ + if (count == 0) +- goto out_nolock; ++ goto out; + + err = file_remove_suid(file); + if (err) +- goto out_nolock; ++ goto out; ++ + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + +- mutex_lock(&inode->i_mutex); ++ /* generic_write_checks can change our pos */ ++ start_pos = pos; ++ + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; +@@ -1047,6 +1046,7 @@ out: + mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + + out_nolock: + kfree(pages); +@@ -1087,8 +1087,10 @@ out_nolock: + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); +- } else { ++ } else if (ret != BTRFS_NO_LOG_SYNC) { + btrfs_commit_transaction(trans, root); ++ } else { ++ btrfs_end_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { +@@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + int ret = 0; + struct btrfs_trans_handle *trans; + ++ ++ /* we wait first, since the writeback may change the inode */ ++ root->log_batch++; ++ /* the VFS called filemap_fdatawrite for us */ ++ btrfs_wait_ordered_range(inode, 0, (u64)-1); ++ root->log_batch++; ++ + /* + * check the transaction that last modified this inode + * and see if its already been committed +@@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + if (!BTRFS_I(inode)->last_trans) + goto out; + ++ /* ++ * if the last transaction that changed this file was before ++ * the current transaction, we can bail out now without any ++ * syncing ++ */ + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { +@@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + } + mutex_unlock(&root->fs_info->trans_mutex); + +- root->log_batch++; +- filemap_fdatawrite(inode->i_mapping); +- btrfs_wait_ordered_range(inode, 0, (u64)-1); +- root->log_batch++; +- +- if (datasync && !(inode->i_state & I_DIRTY_PAGES)) +- goto out; + /* + * ok we haven't committed the transaction yet, lets do a commit + */ +@@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + */ + mutex_unlock(&dentry->d_inode->i_mutex); + +- if (ret > 0) { +- ret = btrfs_commit_transaction(trans, root); +- } else { +- ret = btrfs_sync_log(trans, root); +- if (ret == 0) +- ret = 
btrfs_end_transaction(trans, root); +- else ++ if (ret != BTRFS_NO_LOG_SYNC) { ++ if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); ++ } else { ++ ret = btrfs_sync_log(trans, root); ++ if (ret == 0) ++ ret = btrfs_end_transaction(trans, root); ++ else ++ ret = btrfs_commit_transaction(trans, root); ++ } ++ } else { ++ ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&dentry->d_inode->i_mutex); + out: +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 5edcee3..5c2caad 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, + + static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + { +- u64 max_bytes, possible_bytes; ++ u64 max_bytes; ++ u64 bitmap_bytes; ++ u64 extent_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space +@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + +- possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + +- (sizeof(struct btrfs_free_space) * +- block_group->extents_thresh); ++ /* ++ * we want to account for 1 more bitmap than what we have so we can make ++ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as ++ * we add more bitmaps. ++ */ ++ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; + +- if (possible_bytes > max_bytes) { +- int extent_bytes = max_bytes - +- (block_group->total_bitmaps * PAGE_CACHE_SIZE); ++ if (bitmap_bytes >= max_bytes) { ++ block_group->extents_thresh = 0; ++ return; ++ } + +- if (extent_bytes <= 0) { +- block_group->extents_thresh = 0; +- return; +- } ++ /* ++ * we want the extent entry threshold to always be at most 1/2 the maxw ++ * bytes we can have, or whatever is less than that. 
++ */ ++ extent_bytes = max_bytes - bitmap_bytes; ++ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + +- block_group->extents_thresh = extent_bytes / +- (sizeof(struct btrfs_free_space)); +- } ++ block_group->extents_thresh = ++ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + } + + static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, +@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); ++ info->bytes = 0; + link_free_space(block_group, info); + block_group->total_bitmaps++; + +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index 6b627c6..72ce3c1 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { ++ if (ret == -EOVERFLOW) ++ ret = -EMLINK; + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], +@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); +- if (ret == 0 && objectid > root->highest_inode) +- root->highest_inode = objectid; + return ret; + } + +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 9abbced..c56eb59 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); +- *objectid = found_key.objectid; ++ *objectid = max_t(u64, found_key.objectid, ++ BTRFS_FIRST_FREE_OBJECTID - 1); + } else { +- *objectid = BTRFS_FIRST_FREE_OBJECTID; ++ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; + error: +@@ -53,91 +54,27 @@ error: + return ret; + } + +-/* +- * walks the btree of allocated inodes and find a hole. +- */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) + { +- struct btrfs_path *path; +- struct btrfs_key key; + int ret; +- int slot = 0; +- u64 last_ino = 0; +- int start_found; +- struct extent_buffer *l; +- struct btrfs_key search_key; +- u64 search_start = dirid; +- + mutex_lock(&root->objectid_mutex); +- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && +- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { +- *objectid = ++root->last_inode_alloc; +- mutex_unlock(&root->objectid_mutex); +- return 0; +- } +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); +- search_key.objectid = search_start; +- search_key.type = 0; +- search_key.offset = 0; +- +- start_found = 0; +- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); +- if (ret < 0) +- goto error; + +- while (1) { +- l = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(l)) { +- ret = btrfs_next_leaf(root, path); +- if (ret == 0) +- continue; +- if (ret < 0) +- goto error; +- if (!start_found) { +- *objectid = search_start; +- start_found = 1; +- goto found; +- } +- *objectid = last_ino > search_start ? 
+- last_ino : search_start; +- goto found; +- } +- btrfs_item_key_to_cpu(l, &key, slot); +- if (key.objectid >= search_start) { +- if (start_found) { +- if (last_ino < search_start) +- last_ino = search_start; +- if (key.objectid > last_ino) { +- *objectid = last_ino; +- goto found; +- } +- } else if (key.objectid > search_start) { +- *objectid = search_start; +- goto found; +- } +- } +- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) +- break; ++ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_find_highest_inode(root, &root->highest_objectid); ++ if (ret) ++ goto out; ++ } + +- start_found = 1; +- last_ino = key.objectid + 1; +- path->slots[0]++; ++ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ++ ret = -ENOSPC; ++ goto out; + } +- BUG_ON(1); +-found: +- btrfs_release_path(root, path); +- btrfs_free_path(path); +- BUG_ON(*objectid < search_start); +- mutex_unlock(&root->objectid_mutex); +- return 0; +-error: +- btrfs_release_path(root, path); +- btrfs_free_path(path); ++ ++ *objectid = ++root->highest_objectid; ++ ret = 0; ++out: + mutex_unlock(&root->objectid_mutex); + return ret; + } +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 59cba18..f69e5e0 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + } + + ret = btrfs_drop_extents(trans, root, inode, start, +- aligned_end, aligned_end, start, &hint_byte); ++ aligned_end, aligned_end, start, ++ &hint_byte, 1); + BUG_ON(ret); + + if (isize > actual_end) +@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); +- btrfs_drop_extent_cache(inode, start, aligned_end, 0); ++ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + return 0; + } + +@@ -423,9 +424,12 @@ again: + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 0, +- 0, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + ret = 0; + goto free_pages_out; + } +@@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode, + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode, + * clear dirty, set writeback and unlock the pages. 
+ */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- async_extent->start, +- async_extent->start + +- async_extent->ram_size - 1, +- NULL, 1, 1, 0, 1, 1, 0); ++ &BTRFS_I(inode)->io_tree, ++ async_extent->start, ++ async_extent->start + ++ async_extent->ram_size - 1, ++ NULL, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, +@@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 1, +- 1, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | ++ EXTENT_END_WRITEBACK); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; +@@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode, + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + ++ ++ read_lock(&BTRFS_I(inode)->extent_tree.lock); ++ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, ++ start, num_bytes); ++ if (em) { ++ alloc_hint = em->block_start; ++ free_extent_map(em); ++ } ++ read_unlock(&BTRFS_I(inode)->extent_tree.lock); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { ++ unsigned long op; ++ + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, +@@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode, + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; +- + ram_size = ins.offset; + em->len = ins.offset; + +@@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode, + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode, + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits ++ * ++ * Do set the Private2 bit so we know this page was properly ++ * setup for writepage + */ ++ op = unlock ? 
EXTENT_CLEAR_UNLOCK_PAGE : 0; ++ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2; ++ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, +- locked_page, unlock, 1, +- 1, 0, 0, 0); ++ locked_page, op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; +@@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 cur_end; + int limit = 10 * 1024 * 1042; + +- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | +- EXTENT_DELALLOC, 1, 0, GFP_NOFS); ++ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, ++ 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; +@@ -994,6 +1023,7 @@ next_slot: + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; ++ extent_type = 0; + goto out_check; + } + +@@ -1080,9 +1110,9 @@ out_check: + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -1100,8 +1130,10 @@ out_check: + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, +- cur_offset, cur_offset + num_bytes - 1, +- locked_page, 1, 1, 1, 0, 0, 0); ++ cur_offset, cur_offset + num_bytes - 1, ++ locked_page, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2); + cur_offset = extent_end; + if (cur_offset > end) + break; +@@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + return ret; + } + ++static int btrfs_split_extent_hook(struct inode *inode, ++ struct extent_state *orig, u64 split) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 size; ++ ++ if (!(orig->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ size = orig->end - orig->start + 1; ++ if (size > root->fs_info->max_extent) { ++ u64 num_extents; ++ u64 new_size; ++ ++ new_size = orig->end - split + 1; ++ num_extents = div64_u64(size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ ++ /* ++ * if we break a large extent up then leave oustanding_extents ++ * be, since we've already accounted for the large extent. ++ */ ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) < num_extents) ++ return 0; ++ } ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ ++/* ++ * extent_io.c merge_extent_hook, used to track merged delayed allocation ++ * extents so we can keep track of new extents that are just merged onto old ++ * extents, such as when we are doing sequential writes, so we can properly ++ * account for the metadata space we'll need. 
++ */ ++static int btrfs_merge_extent_hook(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 new_size, old_size; ++ u64 num_extents; ++ ++ /* not delalloc, ignore it */ ++ if (!(other->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ old_size = other->end - other->start + 1; ++ if (new->start < other->start) ++ new_size = other->end - new->start + 1; ++ else ++ new_size = new->end - other->start + 1; ++ ++ /* we're not bigger than the max, unreserve the space and go */ ++ if (new_size <= root->fs_info->max_extent) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ return 0; ++ } ++ ++ /* ++ * If we grew by another max_extent, just return, we want to keep that ++ * reserved amount. ++ */ ++ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) > num_extents) ++ return 0; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ + /* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that +@@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) + { ++ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC +@@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_delalloc_reserve_space(root, inode, end - start + 1); + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; +@@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + /* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits) ++static int btrfs_clear_bit_hook(struct inode *inode, ++ struct extent_state *state, unsigned long bits) + { + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ +- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { ++ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + ++ if (bits & EXTENT_DO_ACCOUNTING) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ } ++ + spin_lock(&root->fs_info->delalloc_lock); +- if (end - start + 1 > root->fs_info->delalloc_bytes) { ++ if (state->end - state->start + 1 > ++ root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", +- (unsigned long long)end - start + 1, 
++ (unsigned long long) ++ state->end - state->start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); +@@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + btrfs_delalloc_free_space(root, inode, +- end - start + 1); +- root->fs_info->delalloc_bytes -= end - start + 1; +- BTRFS_I(inode)->delalloc_bytes -= end - start + 1; ++ state->end - ++ state->start + 1); ++ root->fs_info->delalloc_bytes -= state->end - ++ state->start + 1; ++ BTRFS_I(inode)->delalloc_bytes -= state->end - ++ state->start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +@@ -1374,10 +1506,8 @@ again: + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? We're done */ +- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, +- EXTENT_ORDERED, 0)) { ++ if (PagePrivate2(page)) + goto out; +- } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { +@@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; +- int ret; + +- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, +- EXTENT_ORDERED, 0); +- if (ret) ++ /* this page is properly in the ordered list */ ++ if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) +@@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + BUG_ON(!path); + + path->leave_spinning = 1; ++ ++ /* ++ * we may be replacing one extent in the tree with another. ++ * The new extent is pinned in the extent map, and we don't want ++ * to drop it from the cache until it is completely in the btree. ++ * ++ * So, tell btrfs_drop_extents to leave this extent in the cache. ++ * the caller is expected to unpin it and allow it to be merged ++ * with the others. 
++ */ + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, locked_end, +- file_pos, &hint); ++ file_pos, &hint, 0); + BUG_ON(ret); + + ins.objectid = inode->i_ino; +@@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); +- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; +@@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); ++ unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ++ ordered_extent->file_offset, ++ ordered_extent->len); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, +@@ -1623,6 +1763,7 @@ nocow: + static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) + { ++ ClearPagePrivate2(page); + return btrfs_finish_ordered_io(page->mapping->host, start, end); + } + +@@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, + failrec->last_mirror = 0; + failrec->bio_flags = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); +@@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && +- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { ++ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; +@@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) + return ret; + } + ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len) ++{ ++ struct btrfs_path *path; ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u64 index; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, ++ name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, ++ objectid, root->root_key.objectid, ++ dir->i_ino, &index, name, name_len); ++ if (ret < 0) { ++ BUG_ON(ret != -ENOENT); ++ di = btrfs_search_dir_index_item(root, path, dir->i_ino, ++ name, name_len); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(root, path); ++ index = key.offset; ++ } ++ ++ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, ++ index, name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = 
btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ btrfs_i_size_write(dir, dir->i_size - name_len * 2); ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ret = btrfs_update_inode(trans, root, dir); ++ BUG_ON(ret); ++ dir->i_sb->s_dirt = 1; ++ ++ btrfs_free_path(path); ++ return 0; ++} ++ + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + { + struct inode *inode = dentry->d_inode; +@@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + +- /* +- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir +- * the root of a subvolume or snapshot +- */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || +- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -ENOTEMPTY; +- } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + ++ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ err = btrfs_unlink_subvol(trans, root, dir, ++ BTRFS_I(inode)->location.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ goto out; ++ } ++ + err = btrfs_orphan_add(trans, inode); + if (err) +- goto fail_trans; ++ goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +- +-fail_trans: ++out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); +@@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) + + if ((offset & (blocksize - 1)) == 0) + goto out; ++ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); ++ if (ret) ++ goto out; ++ ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) ++ goto out; + + ret = -ENOMEM; + again: + page = grab_cache_page(mapping, index); +- if (!page) ++ if (!page) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + goto out; ++ } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; +@@ -2864,7 +3080,16 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ goto out_unlock; ++ } ++ + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); +@@ -2877,6 +3102,9 @@ again: + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ if (ret) ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + unlock_page(page); + page_cache_release(page); + out: +@@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + u64 last_byte; + u64 cur_offset; + u64 hole_size; +- int err; ++ int err = 0; + + if (size <= hole_start) + return 0; + +- err = btrfs_check_metadata_free_space(root); ++ err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); +- + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, 
hole_start, +@@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + cur_offset, + cur_offset + hole_size, + block_end, +- cur_offset, &hint_byte); ++ cur_offset, &hint_byte, 1); + if (err) + break; ++ ++ err = btrfs_reserve_metadata_space(root, 1); ++ if (err) ++ break; ++ + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); ++ btrfs_unreserve_metadata_space(root, 1); + } + free_extent_map(em); + cur_offset = last_byte; +@@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode) + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + ++ if (inode->i_nlink > 0) { ++ BUG_ON(btrfs_root_refs(&root->root_item) != 0); ++ goto no_delete; ++ } ++ + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + +@@ -3070,29 +3307,67 @@ out_err: + * is kind of like crossing a mount point. + */ + static int fixup_tree_root_location(struct btrfs_root *root, +- struct btrfs_key *location, +- struct btrfs_root **sub_root, +- struct dentry *dentry) ++ struct inode *dir, ++ struct dentry *dentry, ++ struct btrfs_key *location, ++ struct btrfs_root **sub_root) + { +- struct btrfs_root_item *ri; ++ struct btrfs_path *path; ++ struct btrfs_root *new_root; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; ++ int ret; ++ int err = 0; + +- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) +- return 0; +- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) +- return 0; ++ path = btrfs_alloc_path(); ++ if (!path) { ++ err = -ENOMEM; ++ goto out; ++ } + +- *sub_root = btrfs_read_fs_root(root->fs_info, location, +- dentry->d_name.name, +- dentry->d_name.len); +- if (IS_ERR(*sub_root)) +- return PTR_ERR(*sub_root); ++ err = -ENOENT; ++ ret = btrfs_find_root_ref(root->fs_info->tree_root, path, ++ BTRFS_I(dir)->root->root_key.objectid, ++ location->objectid); ++ if (ret) { ++ if (ret < 0) ++ err = ret; ++ goto out; ++ } + +- ri = &(*sub_root)->root_item; +- location->objectid = btrfs_root_dirid(ri); +- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); +- location->offset = 0; ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); ++ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || ++ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) ++ goto out; + +- return 0; ++ ret = memcmp_extent_buffer(leaf, dentry->d_name.name, ++ (unsigned long)(ref + 1), ++ dentry->d_name.len); ++ if (ret) ++ goto out; ++ ++ btrfs_release_path(root->fs_info->tree_root, path); ++ ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, location); ++ if (IS_ERR(new_root)) { ++ err = PTR_ERR(new_root); ++ goto out; ++ } ++ ++ if (btrfs_root_refs(&new_root->root_item) == 0) { ++ err = -ENOENT; ++ goto out; ++ } ++ ++ *sub_root = new_root; ++ location->objectid = btrfs_root_dirid(&new_root->root_item); ++ location->type = BTRFS_INODE_ITEM_KEY; ++ location->offset = 0; ++ err = 0; ++out: ++ btrfs_free_path(path); ++ return err; + } + + static void inode_tree_add(struct inode *inode) +@@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode) + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; +- + again: + p = &root->inode_tree.rb_node; + parent = NULL; + ++ if (hlist_unhashed(&inode->i_hash)) ++ return; ++ + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; +@@ -3132,13 +3409,87 @@ again: + static void inode_tree_del(struct inode *inode) + { + struct btrfs_root 
*root = BTRFS_I(inode)->root; ++ int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ } ++ spin_unlock(&root->inode_lock); ++ ++ if (empty && btrfs_root_refs(&root->root_item) == 0) { ++ synchronize_srcu(&root->fs_info->subvol_srcu); ++ spin_lock(&root->inode_lock); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ spin_unlock(&root->inode_lock); ++ if (empty) ++ btrfs_add_dead_root(root); ++ } ++} ++ ++int btrfs_invalidate_inodes(struct btrfs_root *root) ++{ ++ struct rb_node *node; ++ struct rb_node *prev; ++ struct btrfs_inode *entry; ++ struct inode *inode; ++ u64 objectid = 0; ++ ++ WARN_ON(btrfs_root_refs(&root->root_item) != 0); ++ ++ spin_lock(&root->inode_lock); ++again: ++ node = root->inode_tree.rb_node; ++ prev = NULL; ++ while (node) { ++ prev = node; ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ ++ if (objectid < entry->vfs_inode.i_ino) ++ node = node->rb_left; ++ else if (objectid > entry->vfs_inode.i_ino) ++ node = node->rb_right; ++ else ++ break; ++ } ++ if (!node) { ++ while (prev) { ++ entry = rb_entry(prev, struct btrfs_inode, rb_node); ++ if (objectid <= entry->vfs_inode.i_ino) { ++ node = prev; ++ break; ++ } ++ prev = rb_next(prev); ++ } ++ } ++ while (node) { ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ objectid = entry->vfs_inode.i_ino + 1; ++ inode = igrab(&entry->vfs_inode); ++ if (inode) { ++ spin_unlock(&root->inode_lock); ++ if (atomic_read(&inode->i_count) > 1) ++ d_prune_aliases(inode); ++ /* ++ * btrfs_drop_inode will remove it from ++ * the inode cache when its usage count ++ * hits zero. ++ */ ++ iput(inode); ++ cond_resched(); ++ spin_lock(&root->inode_lock); ++ goto again; ++ } ++ ++ if (cond_resched_lock(&root->inode_lock)) ++ goto again; ++ ++ node = rb_next(node); + } + spin_unlock(&root->inode_lock); ++ return 0; + } + + static noinline void init_btrfs_i(struct inode *inode) +@@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode) + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; ++ bi->last_sub_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; +@@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + return inode; + } + ++static struct inode *new_simple_dir(struct super_block *s, ++ struct btrfs_key *key, ++ struct btrfs_root *root) ++{ ++ struct inode *inode = new_inode(s); ++ ++ if (!inode) ++ return ERR_PTR(-ENOMEM); ++ ++ init_btrfs_i(inode); ++ ++ BTRFS_I(inode)->root = root; ++ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); ++ BTRFS_I(inode)->dummy_inode = 1; ++ ++ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; ++ inode->i_op = &simple_dir_inode_operations; ++ inode->i_fop = &simple_dir_operations; ++ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ ++ return inode; ++} ++ + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + { + struct inode *inode; +- struct btrfs_inode *bi = BTRFS_I(dir); +- struct btrfs_root *root = bi->root; ++ struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; ++ int index; + int ret; + ++ dentry->d_op = &btrfs_dentry_operations; ++ + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +@@ 
-3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + if (ret < 0) + return ERR_PTR(ret); + +- inode = NULL; +- if (location.objectid) { +- ret = fixup_tree_root_location(root, &location, &sub_root, +- dentry); +- if (ret < 0) +- return ERR_PTR(ret); +- if (ret > 0) +- return ERR_PTR(-ENOENT); ++ if (location.objectid == 0) ++ return NULL; ++ ++ if (location.type == BTRFS_INODE_ITEM_KEY) { ++ inode = btrfs_iget(dir->i_sb, &location, root); ++ return inode; ++ } ++ ++ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); ++ ++ index = srcu_read_lock(&root->fs_info->subvol_srcu); ++ ret = fixup_tree_root_location(root, dir, dentry, ++ &location, &sub_root); ++ if (ret < 0) { ++ if (ret != -ENOENT) ++ inode = ERR_PTR(ret); ++ else ++ inode = new_simple_dir(dir->i_sb, &location, sub_root); ++ } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root); +- if (IS_ERR(inode)) +- return ERR_CAST(inode); + } ++ srcu_read_unlock(&root->fs_info->subvol_srcu, index); ++ + return inode; + } + ++static int btrfs_dentry_delete(struct dentry *dentry) ++{ ++ struct btrfs_root *root; ++ ++ if (!dentry->d_inode && !IS_ROOT(dentry)) ++ dentry = dentry->d_parent; ++ ++ if (dentry->d_inode) { ++ root = BTRFS_I(dentry->d_inode)->root; ++ if (btrfs_root_refs(&root->root_item) == 0) ++ return 1; ++ } ++ return 0; ++} ++ + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) + { + struct inode *inode; + +- if (dentry->d_name.len > BTRFS_NAME_LEN) +- return ERR_PTR(-ENAMETOOLONG); +- + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); +@@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + if (ret != 0) + goto fail; + +- if (objectid > root->highest_inode) +- root->highest_inode = objectid; +- + inode->i_uid = current_fsuid(); + + if (dir && (dir->i_mode & S_ISGID)) { +@@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) + { +- int ret; ++ int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + +- key.objectid = inode->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); +- key.offset = 0; ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); ++ } else { ++ key.objectid = inode->i_ino; ++ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.offset = 0; ++ } ++ ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, ++ key.objectid, root->root_key.objectid, ++ parent_inode->i_ino, ++ index, name, name_len); ++ } else if (add_backref) { ++ ret = btrfs_insert_inode_ref(trans, root, ++ name, name_len, inode->i_ino, ++ parent_inode->i_ino, index); ++ } + +- ret = btrfs_insert_dir_item(trans, root, name, name_len, +- parent_inode->i_ino, +- &key, btrfs_inode_type(inode), +- index); + if (ret == 0) { +- if (add_backref) { +- ret = btrfs_insert_inode_ref(trans, root, +- name, name_len, +- inode->i_ino, +- parent_inode->i_ino, +- index); +- } ++ ret = btrfs_insert_dir_item(trans, root, name, name_len, ++ parent_inode->i_ino, &key, ++ btrfs_inode_type(inode), index); ++ BUG_ON(ret); ++ + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; +@@ -3732,11 +4139,18 @@ 
static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + if (!new_valid_dev(rdev)) + return -EINVAL; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3774,6 +4188,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, + u64 objectid; + u64 index = 0; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; ++ + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3838,6 +4261,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + if (inode->i_nlink == 0) + return -ENOENT; + +- btrfs_inc_nlink(inode); +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 item for inode ref ++ * 2 items for dir items ++ */ ++ err = btrfs_reserve_metadata_space(root, 3); + if (err) +- goto fail; ++ return err; ++ ++ btrfs_inc_nlink(inode); ++ + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; +@@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + +- if (err) +- drop_inode = 1; +- +- btrfs_update_inode_block_group(trans, dir); +- err = btrfs_update_inode(trans, root, inode); +- +- if (err) ++ if (err) { + drop_inode = 1; ++ } else { ++ btrfs_update_inode_block_group(trans, dir); ++ err = btrfs_update_inode(trans, root, inode); ++ BUG_ON(err); ++ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); ++ } + + nr = trans->blocks_used; +- +- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 3); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) + u64 index = 0; + unsigned long nr = 1; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode and ref ++ * 2 items for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_unlock; ++ return err; + + trans = btrfs_start_transaction(root, 1); +- btrfs_set_trans_block_group(trans, dir); +- +- if (IS_ERR(trans)) { +- err = PTR_ERR(trans); ++ if (!trans) { ++ err = -ENOMEM; + goto out_unlock; + } ++ btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { +@@ -3967,6 +4400,7 @@ out_fail: + 
btrfs_end_transaction_throttle(trans, root); + + out_unlock: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); +@@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + int compressed; + + again: +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) +@@ -4215,6 +4649,11 @@ again: + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); ++ if (pg_offset + copy_size < PAGE_CACHE_SIZE) { ++ memset(map + pg_offset + copy_size, 0, ++ PAGE_CACHE_SIZE - pg_offset - ++ copy_size); ++ } + kunmap(page); + } + flush_dcache_page(page); +@@ -4259,7 +4698,7 @@ insert: + } + + err = 0; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that +@@ -4299,7 +4738,7 @@ insert: + err = 0; + } + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + out: + if (path) + btrfs_free_path(path); +@@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + ++ ++ /* ++ * we have the page locked, so new writeback can't start, ++ * and the dirty bit won't be cleared while we are here. ++ * ++ * Wait for IO on this page so that we can safely clear ++ * the PagePrivate2 bit and do ordered accounting ++ */ + wait_on_page_writeback(page); ++ + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } +- + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); +@@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_LOCKED, 1, 0, GFP_NOFS); +- btrfs_finish_ordered_io(page->mapping->host, +- page_start, page_end); ++ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, ++ NULL, GFP_NOFS); ++ /* ++ * whoever cleared the private bit is responsible ++ * for the finish_ordered_io ++ */ ++ if (TestClearPagePrivate2(page)) { ++ btrfs_finish_ordered_io(page->mapping->host, ++ page_start, page_end); ++ } + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_ORDERED, +- 1, 1, GFP_NOFS); ++ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); +@@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + goto out; + } + ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ + again: + lock_page(page); +@@ -4504,7 +4964,24 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ /* ++ * XXX - page_mkwrite gets called 
every time the page is dirtied, even ++ * if it was already dirty, so for space accounting reasons we need to ++ * clear any delalloc bits for the range we are fixing to save. There ++ * is probably a better way to do this, but for now keep consistent with ++ * prepare_pages in the normal write path. ++ */ ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ ret = VM_FAULT_SIGBUS; ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ goto out_unlock; ++ } + ret = 0; + + /* page is wholly or partially inside EOF */ +@@ -4521,11 +4998,17 @@ again: + } + ClearPageChecked(page); + set_page_dirty(page); ++ SetPageUptodate(page); ++ ++ BTRFS_I(inode)->last_trans = root->fs_info->generation; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + +- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ if (!ret) ++ return VM_FAULT_LOCKED; + unlock_page(page); + out: + return ret; +@@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ if (ret) ++ return; + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); +@@ -4594,11 +5079,11 @@ out: + * create a new subvolume directory/inode (helper for the ioctl). + */ + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint) + { + struct inode *inode; +- int error; ++ int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, +@@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + +- error = btrfs_update_inode(trans, new_root, inode); +- if (error) +- return error; ++ err = btrfs_update_inode(trans, new_root, inode); ++ BUG_ON(err); + +- d_instantiate(dentry, inode); ++ iput(inode); + return 0; + } + +@@ -4640,7 +5124,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + if (!ei) + return NULL; + ei->last_trans = 0; ++ ei->last_sub_trans = 0; + ei->logged_trans = 0; ++ ei->outstanding_extents = 0; ++ ei->reserved_extents = 0; ++ ei->root = NULL; ++ spin_lock_init(&ei->accounting_lock); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); +@@ -4656,6 +5145,14 @@ void btrfs_destroy_inode(struct inode *inode) + WARN_ON(inode->i_data.nrpages); + + /* ++ * This can happen where we create an inode, but somebody else also ++ * created the same inode and we need to destroy the one we already ++ * created. ++ */ ++ if (!root) ++ goto free; ++ ++ /* + * Make sure we're properly removed from the ordered operation + * lists. 
+ */ +@@ -4690,9 +5187,20 @@ void btrfs_destroy_inode(struct inode *inode) + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); ++free: + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + } + ++void btrfs_drop_inode(struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) ++ generic_delete_inode(inode); ++ else ++ generic_drop_inode(inode); ++} ++ + static void init_once(void *foo) + { + struct btrfs_inode *ei = (struct btrfs_inode *) foo; +@@ -4761,31 +5269,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; ++ struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; ++ u64 root_objectid; + int ret; + +- /* we're not allowed to rename between subvolumes */ +- if (BTRFS_I(old_inode)->root->root_key.objectid != +- BTRFS_I(new_dir)->root->root_key.objectid) ++ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) ++ return -EPERM; ++ ++ /* we only allow rename subvolume link between subvolumes */ ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + ++ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || ++ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) ++ return -ENOTEMPTY; ++ + if (S_ISDIR(old_inode->i_mode) && new_inode && +- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { ++ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; +- } + +- /* to rename a snapshot or subvolume, we need to juggle the +- * backrefs. This isn't coded yet ++ /* ++ * 2 items for dir items ++ * 1 item for orphan entry ++ * 1 item for ref + */ +- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) +- return -EXDEV; +- +- ret = btrfs_check_metadata_free_space(root); ++ ret = btrfs_reserve_metadata_space(root, 4); + if (ret) +- goto out_unlock; ++ return ret; + + /* + * we're using rename to replace one file with another. +@@ -4796,8 +5310,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + ++ /* close the racy window with snapshot create/destroy ioctl */ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ down_read(&root->fs_info->subvol_sem); ++ + trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, new_dir); ++ ++ if (dest != root) ++ btrfs_record_root_in_trans(trans, dest); ++ ++ ret = btrfs_set_inode_index(new_dir, &index); ++ if (ret) ++ goto out_fail; + ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ /* force full log commit if subvolume involved. */ ++ root->fs_info->last_trans_log_full_commit = trans->transid; ++ } else { ++ ret = btrfs_insert_inode_ref(trans, dest, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, ++ old_inode->i_ino, ++ new_dir->i_ino, index); ++ if (ret) ++ goto out_fail; ++ /* ++ * this is an ugly little race, but the rename is required ++ * to make sure that if we crash, the inode is either at the ++ * old name or the new one. pinning the log transaction lets ++ * us make sure we don't allow a log commit to come in after ++ * we unlink the name but before we add the new name back in. 
++ */ ++ btrfs_pin_log_trans(root); ++ } + /* + * make sure the inode gets flushed if it is replacing + * something. +@@ -4807,18 +5353,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + btrfs_add_ordered_operation(trans, root, old_inode); + } + +- /* +- * this is an ugly little race, but the rename is required to make +- * sure that if we crash, the inode is either at the old name +- * or the new one. pinning the log transaction lets us make sure +- * we don't allow a log commit to come in after we unlink the +- * name but before we add the new name back in. +- */ +- btrfs_pin_log_trans(root); +- +- btrfs_set_trans_block_group(trans, new_dir); +- +- btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; +@@ -4826,47 +5360,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + +- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, +- old_dentry->d_name.name, +- old_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; ++ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } else { ++ btrfs_inc_nlink(old_dentry->d_inode); ++ ret = btrfs_unlink_inode(trans, root, old_dir, ++ old_dentry->d_inode, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } ++ BUG_ON(ret); + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; +- ret = btrfs_unlink_inode(trans, root, new_dir, +- new_dentry->d_inode, +- new_dentry->d_name.name, +- new_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(new_inode->i_ino == ++ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ root_objectid = BTRFS_I(new_inode)->location.objectid; ++ ret = btrfs_unlink_subvol(trans, dest, new_dir, ++ root_objectid, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ BUG_ON(new_inode->i_nlink == 0); ++ } else { ++ ret = btrfs_unlink_inode(trans, dest, new_dir, ++ new_dentry->d_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ } ++ BUG_ON(ret); + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); +- if (ret) +- goto out_fail; ++ BUG_ON(ret); + } +- + } +- ret = btrfs_set_inode_index(new_dir, &index); +- if (ret) +- goto out_fail; + +- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, +- old_inode, new_dentry->d_name.name, +- new_dentry->d_name.len, 1, index); +- if (ret) +- goto out_fail; ++ ret = btrfs_add_link(trans, new_dir, old_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, 0, index); ++ BUG_ON(ret); + +- btrfs_log_new_name(trans, old_inode, old_dir, +- new_dentry->d_parent); ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ btrfs_log_new_name(trans, old_inode, old_dir, ++ new_dentry->d_parent); ++ btrfs_end_log_trans(root); ++ } + out_fail: +- +- /* this btrfs_end_log_trans just allows the current +- * log-sub transaction to complete +- */ +- btrfs_end_log_trans(root); + btrfs_end_transaction_throttle(trans, root); +-out_unlock: ++ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ up_read(&root->fs_info->subvol_sem); ++ ++ btrfs_unreserve_metadata_space(root, 4); + return ret; + } + +@@ -4938,11 +5485,18 @@ static int btrfs_symlink(struct inode 
*dir, struct dentry *dentry, + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode item and ref ++ * 2 items for dir items ++ * 1 item for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto out_fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -5023,6 +5577,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + out_fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -5044,6 +5599,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); ++ ++ ret = btrfs_reserve_metadata_space(root, 1); ++ if (ret) ++ goto out; ++ + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); +@@ -5058,9 +5618,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); ++ btrfs_drop_extent_cache(inode, cur_offset, ++ cur_offset + ins.offset -1, 0); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; ++ btrfs_unreserve_metadata_space(root, 1); + } + out: + if (cur_offset > start) { +@@ -5223,6 +5786,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, + }; ++ + static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, +@@ -5245,6 +5809,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, ++ .merge_extent_hook = btrfs_merge_extent_hook, ++ .split_extent_hook = btrfs_split_extent_hook, + }; + + /* +@@ -5309,3 +5875,7 @@ static struct inode_operations btrfs_symlink_inode_operations = { + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + }; ++ ++const struct dentry_operations btrfs_dentry_operations = { ++ .d_delete = btrfs_dentry_delete, ++}; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index bd88f25..cdbb054 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root, + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; +- struct btrfs_root *new_root = root; +- struct inode *dir; ++ struct btrfs_root *new_root; ++ struct inode *dir = dentry->d_parent->d_inode; + int ret; + int err; + u64 objectid; +@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root, + u64 index = 0; + unsigned long nr = 1; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) +- goto fail_commit; ++ return ret; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); +@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root, + if (ret) + goto fail; + ++ key.offset = (u64)-1; ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); ++ BUG_ON(IS_ERR(new_root)); 
++ ++ btrfs_record_root_in_trans(trans, new_root); ++ ++ ret = btrfs_create_subvol_root(trans, new_root, new_dirid, ++ BTRFS_I(dir)->block_group); + /* + * insert the directory item + */ +- key.offset = (u64)-1; +- dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + +@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root, + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- objectid, BTRFS_ROOT_BACKREF_KEY, +- root->root_key.objectid, ++ objectid, root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- root->root_key.objectid, BTRFS_ROOT_REF_KEY, +- objectid, +- dir->i_ino, index, name, namelen); +- +- BUG_ON(ret); +- +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- goto fail_commit; +- +- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); +- BUG_ON(!new_root); +- +- trans = btrfs_start_transaction(new_root, 1); +- BUG_ON(!trans); +- +- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, +- BTRFS_I(dir)->block_group); +- if (ret) +- goto fail; +- ++ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + fail: + nr = trans->blocks_used; +- err = btrfs_commit_transaction(trans, new_root); ++ err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; +-fail_commit: ++ ++ btrfs_unreserve_metadata_space(root, 6); + btrfs_btree_balance_dirty(root, nr); + return ret; + } +@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + if (!root->ref_cows) + return -EINVAL; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); +@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. 
+ */ +-static noinline int btrfs_mksubvol(struct path *parent, char *name, +- int mode, int namelen, ++static noinline int btrfs_mksubvol(struct path *parent, ++ char *name, int namelen, + struct btrfs_root *snap_src) + { ++ struct inode *dir = parent->dentry->d_inode; + struct dentry *dentry; + int error; + +- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); +@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, + if (dentry->d_inode) + goto out_dput; + +- if (!IS_POSIXACL(parent->dentry->d_inode)) +- mode &= ~current_umask(); +- + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + +- error = btrfs_may_create(parent->dentry->d_inode, dentry); ++ error = btrfs_may_create(dir, dentry); + if (error) + goto out_drop_write; + +- /* +- * Actually perform the low-level subvolume creation after all +- * this VFS fuzz. +- * +- * Eventually we want to pass in an inode under which we create this +- * subvolume, but for now all are under the filesystem root. +- * +- * Also we should pass on the mode eventually to allow creating new +- * subvolume with specific mode bits. +- */ ++ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); ++ ++ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) ++ goto out_up_read; ++ + if (snap_src) { +- struct dentry *dir = dentry->d_parent; +- struct dentry *test = dir->d_parent; +- struct btrfs_path *path = btrfs_alloc_path(); +- int ret; +- u64 test_oid; +- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; +- +- test_oid = snap_src->root_key.objectid; +- +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, parent_oid, test_oid); +- if (ret == 0) +- goto create; +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- +- /* we need to make sure we aren't creating a directory loop +- * by taking a snapshot of something that has our current +- * subvol in its directory tree. So, this loops through +- * the dentries and checks the forward refs for each subvolume +- * to see if is references the subvolume where we are +- * placing this new snapshot. 
+- */ +- while (1) { +- if (!test || +- dir == snap_src->fs_info->sb->s_root || +- test == snap_src->fs_info->sb->s_root || +- test->d_inode->i_sb != snap_src->fs_info->sb) { +- break; +- } +- if (S_ISLNK(test->d_inode->i_mode)) { +- printk(KERN_INFO "Btrfs symlink in snapshot " +- "path, failed\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- test_oid = +- BTRFS_I(test->d_inode)->root->root_key.objectid; +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, test_oid, parent_oid); +- if (ret == 0) { +- printk(KERN_INFO "Btrfs snapshot creation " +- "failed, looping\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- test = test->d_parent; +- } +-create: +- btrfs_free_path(path); +- error = create_snapshot(snap_src, dentry, name, namelen); ++ error = create_snapshot(snap_src, dentry, ++ name, namelen); + } else { +- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, +- dentry, name, namelen); ++ error = create_subvol(BTRFS_I(dir)->root, dentry, ++ name, namelen); + } +- if (error) +- goto out_drop_write; +- +- fsnotify_mkdir(parent->dentry->d_inode, dentry); ++ if (!error) ++ fsnotify_mkdir(dir, dentry); ++out_up_read: ++ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + out_drop_write: + mnt_drop_write(parent->mnt); + out_dput: + dput(dentry); + out_unlock: +- mutex_unlock(&parent->dentry->d_inode->i_mutex); ++ mutex_unlock(&dir->i_mutex); + return error; + } + +- + static int btrfs_defrag_file(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -596,9 +534,8 @@ again: + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); +- +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); +@@ -609,7 +546,8 @@ out_unlock: + return 0; + } + +-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) ++static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ++ void __user *arg) + { + u64 new_size; + u64 old_size; +@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; +- struct btrfs_dir_item *di; +- struct btrfs_path *path; + struct file *src_file; +- u64 root_dirid; + int namelen; + int ret = 0; + +@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + goto out; + } + +- path = btrfs_alloc_path(); +- if (!path) { +- ret = -ENOMEM; +- goto out; +- } +- +- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, +- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, +- path, root_dirid, +- vol_args->name, namelen, 0); +- btrfs_free_path(path); +- +- if (di && !IS_ERR(di)) { +- ret = -EEXIST; +- goto out; +- } +- +- if (IS_ERR(di)) { +- ret = PTR_ERR(di); +- goto out; +- } +- + if (subvol) { +- ret = btrfs_mksubvol(&file->f_path, vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, NULL); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); +@@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + fput(src_file); + goto out; + } +- ret = btrfs_mksubvol(&file->f_path, 
vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, BTRFS_I(src_inode)->root); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ BTRFS_I(src_inode)->root); + fput(src_file); + } +- + out: + kfree(vol_args); + return ret; + } + ++/* ++ * helper to check if the subvolume references other subvolumes ++ */ ++static noinline int may_destroy_subvol(struct btrfs_root *root) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = (u64)-1; ++ ++ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, ++ &key, path, 0, 0); ++ if (ret < 0) ++ goto out; ++ BUG_ON(ret == 0); ++ ++ ret = 0; ++ if (path->slots[0] > 0) { ++ path->slots[0]--; ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid == root->root_key.objectid && ++ key.type == BTRFS_ROOT_REF_KEY) ++ ret = -ENOTEMPTY; ++ } ++out: ++ btrfs_free_path(path); ++ return ret; ++} ++ ++static noinline int btrfs_ioctl_snap_destroy(struct file *file, ++ void __user *arg) ++{ ++ struct dentry *parent = fdentry(file); ++ struct dentry *dentry; ++ struct inode *dir = parent->d_inode; ++ struct inode *inode; ++ struct btrfs_root *root = BTRFS_I(dir)->root; ++ struct btrfs_root *dest = NULL; ++ struct btrfs_ioctl_vol_args *vol_args; ++ struct btrfs_trans_handle *trans; ++ int namelen; ++ int ret; ++ int err = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ vol_args = memdup_user(arg, sizeof(*vol_args)); ++ if (IS_ERR(vol_args)) ++ return PTR_ERR(vol_args); ++ ++ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ++ namelen = strlen(vol_args->name); ++ if (strchr(vol_args->name, '/') || ++ strncmp(vol_args->name, "..", namelen) == 0) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = mnt_want_write(file->f_path.mnt); ++ if (err) ++ goto out; ++ ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); ++ dentry = lookup_one_len(vol_args->name, parent, namelen); ++ if (IS_ERR(dentry)) { ++ err = PTR_ERR(dentry); ++ goto out_unlock_dir; ++ } ++ ++ if (!dentry->d_inode) { ++ err = -ENOENT; ++ goto out_dput; ++ } ++ ++ inode = dentry->d_inode; ++ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ err = -EINVAL; ++ goto out_dput; ++ } ++ ++ dest = BTRFS_I(inode)->root; ++ ++ mutex_lock(&inode->i_mutex); ++ err = d_invalidate(dentry); ++ if (err) ++ goto out_unlock; ++ ++ down_write(&root->fs_info->subvol_sem); ++ ++ err = may_destroy_subvol(dest); ++ if (err) ++ goto out_up_write; ++ ++ trans = btrfs_start_transaction(root, 1); ++ ret = btrfs_unlink_subvol(trans, root, dir, ++ dest->root_key.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ BUG_ON(ret); ++ ++ btrfs_record_root_in_trans(trans, dest); ++ ++ memset(&dest->root_item.drop_progress, 0, ++ sizeof(dest->root_item.drop_progress)); ++ dest->root_item.drop_level = 0; ++ btrfs_set_root_refs(&dest->root_item, 0); ++ ++ ret = btrfs_insert_orphan_item(trans, ++ root->fs_info->tree_root, ++ dest->root_key.objectid); ++ BUG_ON(ret); ++ ++ ret = btrfs_commit_transaction(trans, root); ++ BUG_ON(ret); ++ inode->i_flags |= S_DEAD; ++out_up_write: ++ up_write(&root->fs_info->subvol_sem); ++out_unlock: ++ mutex_unlock(&inode->i_mutex); ++ if (!err) { ++ shrink_dcache_sb(root->fs_info->sb); ++ btrfs_invalidate_inodes(dest); ++ d_delete(dentry); ++ } ++out_dput: ++ dput(dentry); ++out_unlock_dir: ++ mutex_unlock(&dir->i_mutex); ++ mnt_drop_write(file->f_path.mnt); ++out: ++ 
kfree(vol_args); ++ return err; ++} ++ + static int btrfs_ioctl_defrag(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) + return ret; + } + +-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, +- u64 off, u64 olen, u64 destoff) ++static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ++ u64 off, u64 olen, u64 destoff) + { + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; +@@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off + len, +- off + len, 0, &hint_byte); ++ off + len, 0, &hint_byte, 1); + + /* clone data */ + key.objectid = src->i_ino; +@@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + datao += off - key.offset; + datal -= off - key.offset; + } +- if (key.offset + datao + datal + key.offset > +- off + len) +- datal = off + len - key.offset - datao; ++ ++ if (key.offset + datal > off + len) ++ datal = off + len - key.offset; ++ + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; +@@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; ++ int ret; + ++ ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; ++ goto out; + +- if (file->private_data) { +- ret = -EINPROGRESS; ++ ret = -EINPROGRESS; ++ if (file->private_data) + goto out; +- } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) +@@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file) + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + ++ ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root, 0); +- if (trans) +- file->private_data = trans; +- else +- ret = -ENOMEM; +- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ ++ if (!trans) ++ goto out_drop; ++ ++ file->private_data = trans; ++ return 0; ++ ++out_drop: ++ mutex_lock(&root->fs_info->trans_mutex); ++ root->fs_info->open_ioctl_trans--; ++ mutex_unlock(&root->fs_info->trans_mutex); ++ mnt_drop_write(file->f_path.mnt); + out: + return ret; + } +@@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; + + trans = file->private_data; +- if (!trans) { +- ret = -EINVAL; +- goto out; +- } +- btrfs_end_transaction(trans, root); ++ if (!trans) ++ return -EINVAL; + file->private_data = NULL; + ++ btrfs_end_transaction(trans, root); ++ + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); +- +-out: +- return ret; ++ return 0; + } + + long btrfs_ioctl(struct file *file, unsigned int +@@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); ++ case BTRFS_IOC_SNAP_DESTROY: ++ return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: +diff --git a/fs/btrfs/ioctl.h 
b/fs/btrfs/ioctl.h +index b320b10..bc49914 100644 +--- a/fs/btrfs/ioctl.h ++++ b/fs/btrfs/ioctl.h +@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args { + + #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +- ++#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ ++ struct btrfs_ioctl_vol_args) + #endif +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index d6f0806..ab21c29 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * + * len is the length of the extent + * +- * This also sets the EXTENT_ORDERED bit on the range in the inode. +- * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; ++ entry->bytes_left = len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); +@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + &entry->rb_node); + BUG_ON(node); + +- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, +- entry_end(entry) - 1, GFP_NOFS); +- + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); +@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); +- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, +- GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; +@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + goto out; + } + +- ret = test_range_bit(io_tree, entry->file_offset, +- entry->file_offset + entry->len - 1, +- EXTENT_ORDERED, 0); +- if (ret == 0) ++ if (io_size > entry->bytes_left) { ++ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", ++ (unsigned long long)entry->bytes_left, ++ (unsigned long long)io_size); ++ } ++ entry->bytes_left -= io_size; ++ if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); ++ else ++ ret = 1; + out: + mutex_unlock(&tree->mutex); + return ret == 0; +@@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, ++ inode, 1); ++ + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + +@@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; ++ int found; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); +@@ -502,6 +507,7 @@ again: + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; ++ found = 0; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) +@@ -514,6 +520,7 @@ 
again: + btrfs_put_ordered_extent(ordered); + break; + } ++ found++; + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); +@@ -521,8 +528,8 @@ again: + break; + end--; + } +- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, +- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { ++ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, ++ EXTENT_DELALLOC, 0, NULL)) { + schedule_timeout(1); + goto again; + } +@@ -613,7 +620,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + goto out; + } + /* +@@ -664,7 +671,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index 3d31c88..993a7ea 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -85,6 +85,9 @@ struct btrfs_ordered_extent { + /* extent length on disk */ + u64 disk_len; + ++ /* number of bytes that still need writing */ ++ u64 bytes_left; ++ + /* flags (described above) */ + unsigned long flags; + +diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c +index 3c0d52a..79cba5f 100644 +--- a/fs/btrfs/orphan.c ++++ b/fs/btrfs/orphan.c +@@ -65,3 +65,23 @@ out: + btrfs_free_path(path); + return ret; + } ++ ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = offset; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ ++ btrfs_free_path(path); ++ return ret; ++} +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index c04f7f2..cfcc93c 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -121,6 +121,15 @@ struct inodevec { + int nr; + }; + ++#define MAX_EXTENTS 128 ++ ++struct file_extent_cluster { ++ u64 start; ++ u64 end; ++ u64 boundary[MAX_EXTENTS]; ++ unsigned int nr; ++}; ++ + struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; +@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) + { + if (test_range_bit(&rc->processed_blocks, bytenr, +- bytenr + blocksize - 1, EXTENT_DIRTY, 1)) ++ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; + } +@@ -2529,56 +2538,94 @@ out: + } + + static noinline_for_stack +-int relocate_inode_pages(struct inode *inode, u64 start, u64 len) ++int setup_extent_mapping(struct inode *inode, u64 start, u64 end, ++ u64 block_start) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; ++ struct extent_map *em; ++ int ret = 0; ++ ++ em = alloc_extent_map(GFP_NOFS); ++ if (!em) ++ return -ENOMEM; ++ ++ em->start = start; ++ em->len = end + 1 - start; ++ em->block_len = em->len; ++ em->block_start = block_start; ++ em->bdev = root->fs_info->fs_devices->latest_bdev; ++ set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, start, end, 
GFP_NOFS); ++ while (1) { ++ write_lock(&em_tree->lock); ++ ret = add_extent_mapping(em_tree, em); ++ write_unlock(&em_tree->lock); ++ if (ret != -EEXIST) { ++ free_extent_map(em); ++ break; ++ } ++ btrfs_drop_extent_cache(inode, start, end, 0); ++ } ++ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); ++ return ret; ++} ++ ++static int relocate_file_extent_cluster(struct inode *inode, ++ struct file_extent_cluster *cluster) + { + u64 page_start; + u64 page_end; +- unsigned long i; +- unsigned long first_index; ++ u64 offset = BTRFS_I(inode)->index_cnt; ++ unsigned long index; + unsigned long last_index; +- unsigned int total_read = 0; +- unsigned int total_dirty = 0; ++ unsigned int dirty_page = 0; + struct page *page; + struct file_ra_state *ra; +- struct btrfs_ordered_extent *ordered; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; ++ int nr = 0; + int ret = 0; + ++ if (!cluster->nr) ++ return 0; ++ + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + ++ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; ++ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; ++ + mutex_lock(&inode->i_mutex); +- first_index = start >> PAGE_CACHE_SHIFT; +- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + +- /* make sure the dirty trick played by the caller work */ +- while (1) { +- ret = invalidate_inode_pages2_range(inode->i_mapping, +- first_index, last_index); +- if (ret != -EBUSY) +- break; +- schedule_timeout(HZ/10); +- } ++ i_size_write(inode, cluster->end + 1 - offset); ++ ret = setup_extent_mapping(inode, cluster->start - offset, ++ cluster->end - offset, cluster->start); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + +- for (i = first_index ; i <= last_index; i++) { +- if (total_read % ra->ra_pages == 0) { +- btrfs_force_ra(inode->i_mapping, ra, NULL, i, +- min(last_index, ra->ra_pages + i - 1)); +- } +- total_read++; +-again: +- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) +- BUG_ON(1); +- page = grab_cache_page(inode->i_mapping, i); ++ WARN_ON(cluster->start != cluster->boundary[0]); ++ while (index <= last_index) { ++ page = find_lock_page(inode->i_mapping, index); + if (!page) { +- ret = -ENOMEM; +- goto out_unlock; ++ page_cache_sync_readahead(inode->i_mapping, ++ ra, NULL, index, ++ last_index + 1 - index); ++ page = grab_cache_page(inode->i_mapping, index); ++ if (!page) { ++ ret = -ENOMEM; ++ goto out_unlock; ++ } ++ } ++ ++ if (PageReadahead(page)) { ++ page_cache_async_readahead(inode->i_mapping, ++ ra, NULL, page, index, ++ last_index + 1 - index); + } ++ + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); +@@ -2589,75 +2636,79 @@ again: + goto out_unlock; + } + } +- wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; +- lock_extent(io_tree, page_start, page_end, GFP_NOFS); +- +- ordered = btrfs_lookup_ordered_extent(inode, page_start); +- if (ordered) { +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); +- unlock_page(page); +- page_cache_release(page); +- btrfs_start_ordered_extent(inode, ordered, 1); +- btrfs_put_ordered_extent(ordered); +- goto again; +- } ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); ++ + set_page_extent_mapped(page); + +- if (i == first_index) +- set_extent_bits(io_tree, page_start, page_end, ++ if (nr < cluster->nr && ++ page_start + offset == cluster->boundary[nr]) { ++ set_extent_bits(&BTRFS_I(inode)->io_tree, ++ 
page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); ++ nr++; ++ } + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); +- total_dirty++; ++ dirty_page++; + +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ unlock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); ++ ++ index++; ++ if (nr < cluster->nr && ++ page_end + 1 + offset == cluster->boundary[nr]) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); ++ dirty_page = 0; ++ } ++ } ++ if (dirty_page) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); + } ++ WARN_ON(nr != cluster->nr); + out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); +- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; + } + + static noinline_for_stack +-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) ++int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, ++ struct file_extent_cluster *cluster) + { +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +- struct extent_map *em; +- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; +- u64 end = start + extent_key->offset - 1; +- +- em = alloc_extent_map(GFP_NOFS); +- em->start = start; +- em->len = extent_key->offset; +- em->block_len = extent_key->offset; +- em->block_start = extent_key->objectid; +- em->bdev = root->fs_info->fs_devices->latest_bdev; +- set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ int ret; + +- /* setup extent map to cheat btrfs_readpage */ +- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); +- while (1) { +- int ret; +- spin_lock(&em_tree->lock); +- ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); +- if (ret != -EEXIST) { +- free_extent_map(em); +- break; +- } +- btrfs_drop_extent_cache(inode, start, end, 0); ++ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; + } +- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + +- return relocate_inode_pages(inode, start, extent_key->offset); ++ if (!cluster->nr) ++ cluster->start = extent_key->objectid; ++ else ++ BUG_ON(cluster->nr >= MAX_EXTENTS); ++ cluster->end = extent_key->objectid + extent_key->offset - 1; ++ cluster->boundary[cluster->nr] = extent_key->objectid; ++ cluster->nr++; ++ ++ if (cluster->nr >= MAX_EXTENTS) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; ++ } ++ return 0; + } + + #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags) + return 0; + } + ++ + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + { + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; ++ struct file_extent_cluster *cluster; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; +@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + int ret; + int err = 0; + ++ cluster = kzalloc(sizeof(*cluster), GFP_NOFS); ++ if (!cluster) ++ return -ENOMEM; ++ + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++ rc->extents_found = 0; ++ rc->extents_skipped = 0; ++ + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, 
(u64)-1, EXTENT_DIRTY, + GFP_NOFS); +@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + nr = trans->blocks_used; +- btrfs_end_transaction_throttle(trans, rc->extent_root); ++ btrfs_end_transaction(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; +- ret = relocate_data_extent(rc->data_inode, &key); ++ ret = relocate_data_extent(rc->data_inode, ++ &key, cluster); + if (ret < 0) { + err = ret; + break; +@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + ++ if (!err) { ++ ret = relocate_file_extent_cluster(rc->data_inode, cluster); ++ if (ret < 0) ++ err = ret; ++ } ++ ++ kfree(cluster); ++ + rc->create_reloc_root = 0; + smp_mb(); + +@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) ++ struct btrfs_root *root, u64 objectid) + { + struct btrfs_path *path; + struct btrfs_inode_item *item; +@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); ++ btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); +@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + if (err) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); ++ err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; +@@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, NULL); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); +@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + btrfs_wait_ordered_extents(fs_info->tree_root, 0); + + while (1) { +- mutex_lock(&fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(fs_info->tree_root); +- mutex_unlock(&fs_info->cleaner_mutex); +- + rc->extents_found = 0; + rc->extents_skipped = 0; + ++ mutex_lock(&fs_info->cleaner_mutex); ++ ++ btrfs_clean_old_snapshots(fs_info->tree_root); + ret = relocate_block_group(rc); ++ ++ mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + break; +@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + } + } + +- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, +- rc->block_group->key.objectid, +- rc->block_group->key.objectid + +- rc->block_group->key.offset - 1); ++ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, ++ rc->block_group->key.objectid, ++ 
rc->block_group->key.objectid + ++ rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); +@@ -3530,6 +3594,26 @@ out: + return err; + } + ++static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) ++{ ++ struct btrfs_trans_handle *trans; ++ int ret; ++ ++ trans = btrfs_start_transaction(root->fs_info->tree_root, 1); ++ ++ memset(&root->root_item.drop_progress, 0, ++ sizeof(root->root_item.drop_progress)); ++ root->root_item.drop_level = 0; ++ btrfs_set_root_refs(&root->root_item, 0); ++ ret = btrfs_update_root(trans, root->fs_info->tree_root, ++ &root->root_key, &root->root_item); ++ BUG_ON(ret); ++ ++ ret = btrfs_end_transaction(trans, root->fs_info->tree_root); ++ BUG_ON(ret); ++ return 0; ++} ++ + /* + * recover relocation interrupted by system crash. + * +@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { +- err = PTR_ERR(fs_root); +- goto out; ++ ret = PTR_ERR(fs_root); ++ if (ret != -ENOENT) { ++ err = ret; ++ goto out; ++ } ++ mark_garbage_root(reloc_root); + } + } + +@@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", +- root->fs_info->thread_pool_size); ++ root->fs_info->thread_pool_size, NULL); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); +diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c +index 0ddc6d6..9351428 100644 +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + goto out; + + BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = 1; ++ goto out; ++ } + l = path->nodes[0]; +- BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); +- if (found_key.objectid != objectid) { ++ if (found_key.objectid != objectid || ++ found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } +- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), +- sizeof(*item)); +- memcpy(key, &found_key, sizeof(found_key)); ++ if (item) ++ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), ++ sizeof(*item)); ++ if (key) ++ memcpy(key, &found_key, sizeof(found_key)); + ret = 0; + out: + btrfs_free_path(path); +@@ -249,6 +255,59 @@ err: + return ret; + } + ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int err = 0; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = 0; ++ ++ while (1) { ++ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); ++ if (ret < 0) { ++ err = ret; ++ break; ++ } ++ ++ leaf = path->nodes[0]; ++ if (path->slots[0] >= btrfs_header_nritems(leaf)) { ++ ret = btrfs_next_leaf(tree_root, path); ++ if (ret < 0) ++ err = ret; ++ if (ret != 0) ++ break; ++ leaf = path->nodes[0]; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(tree_root, path); ++ ++ if (key.objectid != BTRFS_ORPHAN_OBJECTID || ++ key.type != BTRFS_ORPHAN_ITEM_KEY) ++ break; ++ ++ ret = btrfs_find_dead_roots(tree_root, key.offset); ++ if (ret) { ++ err = ret; ++ break; ++ } ++ ++ key.offset++; ++ } ++ ++ 
btrfs_free_path(path); ++ return err; ++} ++ + /* drop the root item for 'key' from 'root' */ + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +@@ -278,31 +337,57 @@ out: + return ret; + } + +-#if 0 /* this will get used when snapshot deletion is implemented */ + int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id) ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, ++ const char *name, int name_len) ++ + { ++ struct btrfs_path *path; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; + struct btrfs_key key; ++ unsigned long ptr; ++ int err = 0; + int ret; +- struct btrfs_path *path; + + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); +- BUG_ON(ret); +- +- ret = btrfs_del_item(trans, tree_root, path); +- BUG_ON(ret); ++ BUG_ON(ret < 0); ++ if (ret == 0) { ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ ++ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); ++ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ++ ptr = (unsigned long)(ref + 1); ++ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); ++ *sequence = btrfs_root_ref_sequence(leaf, ref); ++ ++ ret = btrfs_del_item(trans, tree_root, path); ++ BUG_ON(ret); ++ } else ++ err = -ENOENT; ++ ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } + + btrfs_free_path(path); +- return ret; ++ return err; + } +-#endif + + int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, +@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + return ret; + } + +- + /* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. 
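
As an aside on the hunks around here: btrfs_del_root_ref() above (and btrfs_add_root_ref() just below) now treat each subvolume reference as a pair of items, a BTRFS_ROOT_BACKREF_KEY entry keyed by the referenced root and, via the again: label, a BTRFS_ROOT_REF_KEY entry with objectid and offset swapped. A minimal standalone sketch of that key pairing, using illustrative struct and type values rather than the real on-disk definitions:

    #include <stdio.h>
    #include <stdint.h>

    /* illustrative stand-ins, not the real btrfs on-disk constants */
    #define DEMO_ROOT_BACKREF_KEY 1
    #define DEMO_ROOT_REF_KEY     2

    struct demo_key {
            uint64_t objectid;
            uint8_t  type;
            uint64_t offset;
    };

    int main(void)
    {
            uint64_t root_id = 256, ref_id = 257;

            /* first item: backref, keyed by the root being referenced */
            struct demo_key backref = { root_id, DEMO_ROOT_BACKREF_KEY, ref_id };
            /* second item: forward ref, objectid and offset swapped */
            struct demo_key ref     = { ref_id,  DEMO_ROOT_REF_KEY,     root_id };

            printf("backref (%llu %u %llu)  ref (%llu %u %llu)\n",
                   (unsigned long long)backref.objectid, (unsigned)backref.type,
                   (unsigned long long)backref.offset,
                   (unsigned long long)ref.objectid, (unsigned)ref.type,
                   (unsigned long long)ref.offset);
            return 0;
    }

Deletion walks the same two keys in the same order, which is why the del path reuses the same goto pattern as the insert path.
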
+@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + */ + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) + { + struct btrfs_key key; +@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf; + unsigned long ptr; + +- + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); +@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } ++ + btrfs_free_path(path); +- return ret; ++ return 0; + } +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 6d6d06c..939b68f 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -66,7 +66,7 @@ enum { + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, +- Opt_tag, Opt_notag, Opt_tagid, Opt_err, ++ Opt_tag, Opt_notag, Opt_tagid, Opt_discard, Opt_err, + }; + + static match_table_t tokens = { +@@ -88,6 +89,7 @@ static match_table_t tokens = { + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, ++ {Opt_discard, "discard"}, + {Opt_tag, "tag"}, + {Opt_notag, "notag"}, + {Opt_tagid, "tagid=%u"}, +@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) + info->metadata_ratio); + } + break; ++ case Opt_discard: ++ btrfs_set_opt(info->mount_opt, DISCARD); ++ break; + #ifndef CONFIG_TAGGING_NONE + case Opt_tag: + printk(KERN_INFO "btrfs: use tagging\n"); +@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb, + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; ++#endif + + tree_root = open_ctree(sb, fs_devices, (char *)data); + +@@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb) + } + + static struct super_operations btrfs_super_ops = { ++ .drop_inode = btrfs_drop_inode, + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index cdbb502..bca82a4 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + { + if (root->ref_cows && root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); +- WARN_ON(root->root_item.refs == 0); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, +@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; + ++ if (!current->journal_info) ++ current->journal_info = h; ++ + 
root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, root); + mutex_unlock(&root->fs_info->trans_mutex); +@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); ++ ++ if (current->journal_info == trans) ++ current->journal_info = NULL; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + +@@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + /* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of +- * those extents are on disk for transaction or log commit ++ * those extents are sent to disk but does not wait on them + */ +-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages) ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) + { + int ret; + int err = 0; +@@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + page_cache_release(page); + } + } ++ if (err) ++ werr = err; ++ return werr; ++} ++ ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. This is used to make sure all of ++ * those extents are on disk for transaction or log commit. We wait ++ * on all the pages and clear them from the dirty pages state tree ++ */ ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int err = 0; ++ int werr = 0; ++ struct page *page; ++ struct inode *btree_inode = root->fs_info->btree_inode; ++ u64 start = 0; ++ u64 end; ++ unsigned long index; ++ + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); +@@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + return werr; + } + ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. 
This is used to make sure all of ++ * those extents are on disk for transaction or log commit ++ */ ++int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int ret2; ++ ++ ret = btrfs_write_marked_extents(root, dirty_pages); ++ ret2 = btrfs_wait_marked_extents(root, dirty_pages); ++ return ret || ret2; ++} ++ + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) + { +@@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; +- key.offset = 0; ++ /* record when the snapshot was created in key.offset */ ++ key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); +@@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(&pending->root_key, &key, sizeof(key)); + fail: + kfree(new_root_item); ++ btrfs_unreserve_metadata_space(root, 6); + return ret; + } + +@@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, +- BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, +- parent_root->root_key.objectid, +- BTRFS_ROOT_REF_KEY, +- pending->root_key.objectid, +- parent_inode->i_ino, index, pending->name, +- namelen); +- + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); + fail: +@@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; +- struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + int should_grow = 0; +@@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + return 0; + } + +- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); +- if (!pinned_copy) +- return -ENOMEM; +- +- extent_io_tree_init(pinned_copy, +- root->fs_info->btree_inode->i_mapping, GFP_NOFS); +- + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + if (cur_trans->list.prev != &root->fs_info->trans_list) { +@@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + ret = commit_cowonly_roots(trans, root); + BUG_ON(ret); + ++ btrfs_prepare_extent_commit(trans, root); ++ + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; +@@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + +- btrfs_copy_pinned(root, pinned_copy); +- + trans->transaction->blocked = 0; + + wake_up(&root->fs_info->transaction_wait); +@@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + +- btrfs_finish_extent_commit(trans, root, pinned_copy); +- kfree(pinned_copy); ++ 
btrfs_finish_extent_commit(trans, root); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); +@@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + + mutex_unlock(&root->fs_info->trans_mutex); + ++ if (current->journal_info == trans) ++ current->journal_info = NULL; ++ + kmem_cache_free(btrfs_trans_handle_cachep, trans); + return ret; + } +@@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) + + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); +- list_del_init(&root->root_list); +- btrfs_drop_snapshot(root, 0); ++ list_del(&root->root_list); ++ ++ if (btrfs_header_backref_rev(root->node) < ++ BTRFS_MIXED_BACKREF_REV) ++ btrfs_drop_snapshot(root, 0); ++ else ++ btrfs_drop_snapshot(root, 1); + } + return 0; + } +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index 663c674..d4e3e7a 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) + { + BTRFS_I(inode)->last_trans = trans->transaction->transid; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + } + + int btrfs_end_transaction(struct btrfs_trans_handle *trans, +@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); + int btrfs_transaction_in_commit(struct btrfs_fs_info *info); + #endif +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index d91b0de..f51bf13 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, + + mutex_lock(&root->log_mutex); + if (root->log_root) { ++ if (!root->log_start_pid) { ++ root->log_start_pid = current->pid; ++ root->log_multiple_pids = false; ++ } else if (root->log_start_pid != current->pid) { ++ root->log_multiple_pids = true; ++ } ++ + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } ++ root->log_multiple_pids = false; ++ root->log_start_pid = current->pid; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); +@@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log, + struct walk_control *wc, u64 gen) + { + if (wc->pin) +- btrfs_update_pinned_extents(log->fs_info->extent_root, +- eb->start, eb->len, 1); ++ btrfs_pin_extent(log->fs_info->extent_root, ++ eb->start, eb->len, 0); + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) +@@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, +- start, extent_end, extent_end, start, &alloc_hint); ++ start, extent_end, extent_end, start, &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || +@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root 
*log_root_tree = root->fs_info->log_root_tree; ++ u64 log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; +@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + while (1) { + unsigned long batch = root->log_batch; +- mutex_unlock(&root->log_mutex); +- schedule_timeout_uninterruptible(1); +- mutex_lock(&root->log_mutex); +- ++ if (root->log_multiple_pids) { ++ mutex_unlock(&root->log_mutex); ++ schedule_timeout_uninterruptible(1); ++ mutex_lock(&root->log_mutex); ++ } + wait_for_writer(trans, root); + if (batch == root->log_batch) + break; +@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + goto out; + } + +- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); ++ /* we start IO on all the marked extents here, but we don't actually ++ * wait for them until later. ++ */ ++ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; ++ log_transid = root->log_transid; + root->log_transid++; + log->log_transid = root->log_transid; ++ root->log_start_pid = 0; + smp_mb(); + /* + * log tree has been flushed to disk, new modifications of +@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); +@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; +@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages); + BUG_ON(ret); ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); +@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * the running transaction open, so a full commit can't hop + * in and cause problems either. 
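
The btrfs_sync_log() changes above start IO on the dirty log extents with btrfs_write_marked_extents() and only call btrfs_wait_marked_extents() at the points where the pages must be on disk, while the combined helper simply chains the two passes. A rough userspace sketch of that composition, with stub pass functions standing in for the kernel routines:

    #include <stdio.h>

    /* stubs standing in for the write pass (submit IO) and the wait pass */
    static int write_pass(void) { return 0; }
    static int wait_pass(void)  { return 0; }

    /* the combined helper runs both passes and reports a failure from either */
    static int write_and_wait(void)
    {
            int ret  = write_pass();
            int ret2 = wait_pass();
            return ret || ret2;
    }

    int main(void)
    {
            printf("write_and_wait() -> %d\n", write_and_wait());
            return 0;
    }
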
+ */ +- write_ctree_super(trans, root->fs_info->tree_root, 2); ++ write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = 0; + ++ mutex_lock(&root->log_mutex); ++ if (root->last_log_commit < log_transid) ++ root->last_log_commit = log_transid; ++ mutex_unlock(&root->log_mutex); ++ + out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); +@@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2852,6 +2876,21 @@ out: + return ret; + } + ++static int inode_in_log(struct btrfs_trans_handle *trans, ++ struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ int ret = 0; ++ ++ mutex_lock(&root->log_mutex); ++ if (BTRFS_I(inode)->logged_trans == trans->transid && ++ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) ++ ret = 1; ++ mutex_unlock(&root->log_mutex); ++ return ret; ++} ++ ++ + /* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref +@@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + goto end_no_trans; + } + ++ if (root != BTRFS_I(inode)->root || ++ btrfs_root_refs(&root->root_item) == 0) { ++ ret = 1; ++ goto end_no_trans; ++ } ++ + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + ++ if (inode_in_log(trans, inode)) { ++ ret = BTRFS_NO_LOG_SYNC; ++ goto end_no_trans; ++ } ++ + start_log_trans(trans, root); + + ret = btrfs_log_inode(trans, root, inode, inode_only); +@@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + break; + + inode = parent->d_inode; ++ if (root != BTRFS_I(inode)->root) ++ break; ++ + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; +- u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, +@@ -3010,11 +3062,6 @@ again: + path); + BUG_ON(ret); + } +- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); +- if (ret == 0) { +- wc.replay_dest->highest_inode = highest_inode; +- wc.replay_dest->last_inode_alloc = highest_inode; +- } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index d09c760..0776eac 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -19,6 +19,9 @@ + #ifndef __TREE_LOG_ + #define __TREE_LOG_ + ++/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ ++#define BTRFS_NO_LOG_SYNC 256 ++ + int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 5dbefd1..20cbd2e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -276,7 +276,7 @@ loop_lock: + * is now congested. 
Back off and let other work structs + * run instead + */ +- if (pending && bdi_write_congested(bdi) && batch_run > 32 && ++ if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + +@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); +- if (!device->name) ++ if (!device->name) { ++ kfree(device); + goto error; ++ } + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; +@@ -719,10 +721,9 @@ error: + * called very infrequently and that a given device has a small number + * of extents + */ +-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, +- struct btrfs_device *device, +- u64 num_bytes, u64 *start, +- u64 *max_avail) ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail) + { + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; +@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + ++ ret = btrfs_can_relocate(extent_root, chunk_offset); ++ if (ret) ++ return -ENOSPC; ++ + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); +@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + * step two, delete the device extents and the + * chunk tree entries + */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); +@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; +@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; ++ bool retried = false; ++ int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; +@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ if (ret == -ENOSPC) ++ failed++; ++ else if (ret) ++ BUG(); + } + + if (found_key.offset == 0) +@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + key.offset = found_key.offset - 1; + } + ret = 0; ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ WARN_ON(1); ++ ret = -ENOSPC; ++ } + error: + btrfs_free_path(path); + return ret; +@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); ++ if (ret == -ENOSPC) ++ break; + BUG_ON(ret); + + trans = 
btrfs_start_transaction(dev_root, 1); +@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); +- key.offset = found_key.offset; + /* chunk zero is special */ +- if (key.offset == 0) ++ if (found_key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); +@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ BUG_ON(ret && ret != -ENOSPC); ++ key.offset = found_key.offset - 1; + } + ret = 0; + error: +@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + u64 chunk_offset; + int ret; + int slot; ++ int failed = 0; ++ bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); ++ u64 old_size = device->total_bytes; + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) +@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (!path) + return -ENOMEM; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) { +- ret = -ENOMEM; +- goto done; +- } +- + path->reada = 2; + + lock_chunks(root); +@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + unlock_chunks(root); +- btrfs_end_transaction(trans, root); + ++again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; +@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + goto done; + if (ret) { + ret = 0; ++ btrfs_release_path(root, path); + break; + } + +@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + +- if (key.objectid != device->devid) ++ if (key.objectid != device->devid) { ++ btrfs_release_path(root, path); + break; ++ } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + +- if (key.offset + length <= new_size) ++ if (key.offset + length <= new_size) { ++ btrfs_release_path(root, path); + break; ++ } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); +@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); +- if (ret) ++ if (ret && ret != -ENOSPC) + goto done; ++ if (ret == -ENOSPC) ++ failed++; ++ key.offset -= 1; ++ } ++ ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ ret = -ENOSPC; ++ lock_chunks(root); ++ ++ device->total_bytes = old_size; ++ if (device->writeable) ++ device->fs_devices->total_rw_bytes += diff; ++ unlock_chunks(root); ++ goto done; + } + + /* Shrinking succeeded, else we would be at "done". 
*/ +@@ -2294,9 +2335,9 @@ again: + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + +@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) + int readonly = 0; + int i; + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + +@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) + struct extent_map *em; + + while (1) { +- spin_lock(&tree->map_tree.lock); ++ write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); +- spin_unlock(&tree->map_tree.lock); ++ write_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); +@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); +@@ -2604,9 +2645,9 @@ again: + atomic_set(&multi->error, 0); + } + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; +@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 stripe_nr; + int i, j, nr = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; +@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + + /* already mapped? 
*/ + if (em && em->start <= logical && em->start + em->len > logical) { +@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + map->stripes[i].dev->in_fs_metadata = 1; + } + +- spin_lock(&map_tree->map_tree.lock); ++ write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); +- spin_unlock(&map_tree->map_tree.lock); ++ write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 5139a83..31b0fab 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root); + void btrfs_unlock_volumes(void); + void btrfs_lock_volumes(void); + int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail); + #endif +diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c +index a9d3bf4..b6dd596 100644 +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -260,7 +260,7 @@ err: + * attributes are handled directly. + */ + struct xattr_handler *btrfs_xattr_handlers[] = { +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, + #endif diff --git a/linux-2.6-debug-vm-would-have-oomkilled.patch b/linux-2.6-debug-vm-would-have-oomkilled.patch new file mode 100644 index 000000000..bcad97e35 --- /dev/null +++ b/linux-2.6-debug-vm-would-have-oomkilled.patch @@ -0,0 +1,65 @@ +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b2a2d68..3b132ee 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -67,6 +67,7 @@ extern int sysctl_overcommit_ratio; + extern int sysctl_panic_on_oom; + extern int sysctl_oom_kill_allocating_task; + extern int sysctl_oom_dump_tasks; ++extern int sysctl_would_have_oomkilled; + extern int max_threads; + extern int core_uses_pid; + extern int suid_dumpable; +@@ -861,6 +862,14 @@ static struct ctl_table vm_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "would_have_oomkilled", ++ .data = &sysctl_would_have_oomkilled, ++ .maxlen = sizeof(sysctl_would_have_oomkilled), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = VM_OVERCOMMIT_RATIO, + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index f255eda..3335a94 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -31,6 +31,7 @@ + int sysctl_panic_on_oom; + int sysctl_oom_kill_allocating_task; + int sysctl_oom_dump_tasks; ++int sysctl_would_have_oomkilled; + static DEFINE_SPINLOCK(zone_scan_lock); + /* #define DEBUG */ + +@@ -321,6 +322,12 @@ static void __oom_kill_task(struct task_struct *p, int verbose) + return; + } + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return; ++ } ++ + if (verbose) + printk(KERN_ERR "Killed process %s(%d:#%u)\n", + p->comm, task_pid_nr(p), p->xid); +@@ -363,6 +370,12 @@ static int oom_kill_task(struct task_struct *p) + return 1; + } while_each_thread(g, q); + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). 
But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return 1; ++ } ++ + __oom_kill_task(p, 1); + + /* diff --git a/linux-2.6-execshield.patch b/linux-2.6-execshield.patch new file mode 100644 index 000000000..36ee866aa --- /dev/null +++ b/linux-2.6-execshield.patch @@ -0,0 +1,1013 @@ +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index c45f415..3a6dbad 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +@@ -94,6 +95,9 @@ static inline int desc_empty(const void *ptr) + + #define load_TLS(t, cpu) native_load_tls(t, cpu) + #define set_ldt native_set_ldt ++#ifdef CONFIG_X86_32 ++#define load_user_cs_desc native_load_user_cs_desc ++#endif /*CONFIG_X86_32*/ + + #define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +@@ -380,4 +384,25 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); + } + ++#ifdef CONFIG_X86_32 ++static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) ++{ ++ limit = (limit - 1) / PAGE_SIZE; ++ desc->a = limit & 0xffff; ++ desc->b = (limit & 0xf0000) | 0x00c0fb00; ++} ++ ++static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; ++} ++ ++#define arch_add_exec_range arch_add_exec_range ++#define arch_remove_exec_range arch_remove_exec_range ++#define arch_flush_exec_range arch_flush_exec_range ++extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_flush_exec_range(struct mm_struct *mm); ++#endif /* CONFIG_X86_32 */ ++ + #endif /* _ASM_X86_DESC_H */ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 80a1dee..8314c66 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -7,12 +7,19 @@ + /* + * The x86 doesn't have a mmu context, but + * we put the segment information here. ++ * ++ * exec_limit is used to track the range PROT_EXEC ++ * mappings span. 
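
set_user_cs() above encodes the byte limit as a page-granular segment limit split across the two descriptor words. A small standalone sketch of the same arithmetic, assuming a 4096-byte page; the printed values are just the two 32-bit words the patch computes:

    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_PAGE_SIZE 4096UL

    /* same math as set_user_cs(): page-granular limit, low 16 bits in word a,
     * bits 16-19 in word b together with the fixed access/flag bits */
    static void encode_user_cs(unsigned long byte_limit, uint32_t *a, uint32_t *b)
    {
            unsigned long limit = (byte_limit - 1) / DEMO_PAGE_SIZE;

            *a = limit & 0xffff;
            *b = (limit & 0xf0000) | 0x00c0fb00;
    }

    int main(void)
    {
            uint32_t a, b;

            encode_user_cs(0x08048000UL, &a, &b);   /* arbitrary example limit */
            printf("a=%08x b=%08x\n", (unsigned)a, (unsigned)b);
            return 0;
    }

For the example limit this prints a=00008047 b=00c0fb00, i.e. a limit expressed in pages with the granularity and access bits folded into the high word.
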
+ */ + typedef struct { + void *ldt; + int size; + struct mutex lock; + void *vdso; ++#ifdef CONFIG_X86_32 ++ struct desc_struct user_cs; ++ unsigned long exec_limit; ++#endif + } mm_context_t; + + #ifdef CONFIG_SMP +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 4fb37c8..d5cc31c 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -139,6 +139,9 @@ struct pv_cpu_ops { + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); ++#ifdef CONFIG_X86_32 ++ void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); ++#endif /*CONFIG_X86_32*/ + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); + #ifdef CONFIG_X86_64 +@@ -955,6 +958,12 @@ static inline void set_ldt(const void *addr, unsigned entries) + { + PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); + } ++#ifdef CONFIG_X86_32 ++static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) ++{ ++ PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); ++} ++#endif /*CONFIG_X86_32*/ + static inline void store_gdt(struct desc_ptr *dtr) + { + PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index c776826..fb6b579 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -160,6 +160,9 @@ static inline int hlt_works(int cpu) + + #define cache_line_size() (boot_cpu_data.x86_cache_alignment) + ++#define __HAVE_ARCH_ALIGN_STACK ++extern unsigned long arch_align_stack(unsigned long sp); ++ + extern void cpu_detect(struct cpuinfo_x86 *c); + + extern struct pt_regs *idle_regs(struct pt_regs *); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 3ffdcfa..62cba96 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -804,6 +804,20 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + ++ /* ++ * emulation of NX with segment limits unfortunately means ++ * we have to disable the fast system calls, due to the way that ++ * sysexit clears the segment limits on return. ++ * If we have either disabled exec-shield on the boot command line, ++ * or we have NX, then we don't need to do this. ++ */ ++ if (exec_shield != 0) { ++#ifdef CONFIG_X86_PAE ++ if (!test_cpu_cap(c, X86_FEATURE_NX)) ++#endif ++ clear_cpu_cap(c, X86_FEATURE_SEP); ++ } ++ + /* If the model name is still unset, do table lookup. 
*/ + if (!c->x86_model_id[0]) { + const char *p; +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index 70ec9b9..d956b8c 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -369,6 +369,9 @@ struct pv_cpu_ops pv_cpu_ops = { + .read_tscp = native_read_tscp, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = native_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 59f4524..068e286 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -299,7 +299,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, + void + start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + { ++ int cpu; ++ + set_user_gs(regs, 0); ++ + regs->fs = 0; + set_fs(USER_DS); + regs->ds = __USER_DS; +@@ -308,6 +311,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, current->mm); ++ put_cpu(); ++ + /* + * Free the old FP and other extended state + */ +@@ -354,7 +362,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + __unlazy_fpu(prev_p); +- ++ if (next_p->mm) ++ load_user_cs_desc(cpu, next_p->mm); + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) +@@ -495,3 +504,40 @@ unsigned long get_wchan(struct task_struct *p) + return 0; + } + ++static void modify_cs(struct mm_struct *mm, unsigned long limit) ++{ ++ mm->context.exec_limit = limit; ++ set_user_cs(&mm->context.user_cs, limit); ++ if (mm == current->mm) { ++ int cpu; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, mm); ++ put_cpu(); ++ } ++} ++ ++void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) ++{ ++ if (limit > mm->context.exec_limit) ++ modify_cs(mm, limit); ++} ++ ++void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) ++{ ++ struct vm_area_struct *vma; ++ unsigned long limit = PAGE_SIZE; ++ ++ if (old_end == mm->context.exec_limit) { ++ for (vma = mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ modify_cs(mm, limit); ++ } ++} ++ ++void arch_flush_exec_range(struct mm_struct *mm) ++{ ++ mm->context.exec_limit = 0; ++ set_user_cs(&mm->context.user_cs, 0); ++} +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 07d60c8..41e9129 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -118,6 +118,76 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) + if (!user_mode_vm(regs)) + die(str, regs, err); + } ++ ++static inline int ++__compare_user_cs_desc(const struct desc_struct *desc1, ++ const struct desc_struct *desc2) ++{ ++ return ((desc1->limit0 != desc2->limit0) || ++ (desc1->limit != desc2->limit) || ++ (desc1->base0 != desc2->base0) || ++ (desc1->base1 != desc2->base1) || ++ (desc1->base2 != desc2->base2)); ++} ++ ++/* ++ * lazy-check for CS validity on exec-shield binaries: ++ * ++ * the original non-exec stack patch was written by ++ * Solar Designer . Thanks! 
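
arch_remove_exec_range() above and check_lazy_exec_limit() below both recompute the limit the same way: walk the mapping list for the highest end address of an executable VMA, starting from PAGE_SIZE when none is found. A standalone sketch of that scan over a plain array of ranges (the vma struct and flag value are simplified stand-ins):

    #include <stdio.h>

    #define DEMO_PAGE_SIZE 4096UL
    #define DEMO_VM_EXEC   0x4UL    /* illustrative flag value */

    struct demo_vma {
            unsigned long start, end, flags;
    };

    /* highest end of any executable range, or PAGE_SIZE if there is none */
    static unsigned long exec_limit(const struct demo_vma *vmas, int n)
    {
            unsigned long limit = DEMO_PAGE_SIZE;

            for (int i = 0; i < n; i++)
                    if ((vmas[i].flags & DEMO_VM_EXEC) && vmas[i].end > limit)
                            limit = vmas[i].end;
            return limit;
    }

    int main(void)
    {
            struct demo_vma vmas[] = {
                    { 0x08048000, 0x08100000, DEMO_VM_EXEC },  /* text mapping */
                    { 0x40000000, 0x40020000, 0            },  /* data, no exec */
            };

            printf("exec limit: %#lx\n", exec_limit(vmas, 2));
            return 0;
    }
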
++ */ ++static int ++check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) ++{ ++ struct desc_struct *desc1, *desc2; ++ struct vm_area_struct *vma; ++ unsigned long limit; ++ ++ if (current->mm == NULL) ++ return 0; ++ ++ limit = -1UL; ++ if (current->mm->context.exec_limit != -1UL) { ++ limit = PAGE_SIZE; ++ spin_lock(¤t->mm->page_table_lock); ++ for (vma = current->mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ vma = get_gate_vma(current); ++ if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ spin_unlock(¤t->mm->page_table_lock); ++ if (limit >= TASK_SIZE) ++ limit = -1UL; ++ current->mm->context.exec_limit = limit; ++ } ++ set_user_cs(¤t->mm->context.user_cs, limit); ++ ++ desc1 = ¤t->mm->context.user_cs; ++ desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; ++ ++ if (__compare_user_cs_desc(desc1, desc2)) { ++ /* ++ * The CS was not in sync - reload it and retry the ++ * instruction. If the instruction still faults then ++ * we won't hit this branch next time around. ++ */ ++ if (print_fatal_signals >= 2) { ++ printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, ++ smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ desc1->a, desc1->b, desc2->a, desc2->b); ++ } ++ ++ load_user_cs_desc(cpu, current->mm); ++ ++ return 1; ++ } ++ ++ return 0; ++} + #endif + + static void __kprobes +@@ -276,6 +346,29 @@ do_general_protection(struct pt_regs *regs, long error_code) + if (!user_mode(regs)) + goto gp_in_kernel; + ++#ifdef CONFIG_X86_32 ++{ ++ int cpu; ++ int ok; ++ ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (ok) ++ return; ++ ++ if (print_fatal_signals) { ++ printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ current->mm->context.user_cs.a, ++ current->mm->context.user_cs.b); ++ } ++} ++#endif /*CONFIG_X86_32*/ ++ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + +@@ -885,19 +978,37 @@ do_device_not_available(struct pt_regs *regs, long error_code) + } + + #ifdef CONFIG_X86_32 ++/* ++ * The fixup code for errors in iret jumps to here (iret_exc). It loses ++ * the original trap number and erorr code. The bogus trap 32 and error ++ * code 0 are what the vanilla kernel delivers via: ++ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) ++ * ++ * NOTE: Because of the final "1" in the macro we need to enable interrupts. ++ * ++ * In case of a general protection fault in the iret instruction, we ++ * need to check for a lazy CS update for exec-shield. 
++ */ + dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) + { +- siginfo_t info; ++ int ok; ++ int cpu; ++ + local_irq_enable(); + +- info.si_signo = SIGILL; +- info.si_errno = 0; +- info.si_code = ILL_BADSTK; +- info.si_addr = NULL; +- if (notify_die(DIE_TRAP, "iret exception", +- regs, error_code, 32, SIGILL) == NOTIFY_STOP) +- return; +- do_trap(32, SIGILL, "iret exception", regs, error_code, &info); ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (!ok && notify_die(DIE_TRAP, "iret exception", regs, ++ error_code, 32, SIGSEGV) != NOTIFY_STOP) { ++ siginfo_t info; ++ info.si_signo = SIGSEGV; ++ info.si_errno = 0; ++ info.si_code = ILL_BADSTK; ++ info.si_addr = 0; ++ do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info); ++ } + } + #endif + +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 34c1bfb..32c3d8d 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -228,6 +228,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); ++#ifdef CONFIG_X86_32 ++ else ++ if (exec_shield) ++ printk(KERN_INFO "Using x86 segment limits to approximate " ++ "NX protection\n"); ++#endif + + /* Enable PSE if available */ + if (cpu_has_pse) +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 949708d..c1373b6 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -587,6 +587,54 @@ void zap_low_mappings(void) + pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); + EXPORT_SYMBOL_GPL(__supported_pte_mask); + ++#ifdef CONFIG_X86_PAE ++ ++static int disable_nx __initdata; ++ ++/* ++ * noexec = on|off ++ * ++ * Control non executable mappings. 
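
The noexec handling documented just below accepts only "on" (or a missing argument) and "off", and "off" also clears exec_shield. A minimal sketch of that string handling, using plain flags in place of the kernel's CPU-capability checks, MSR writes and globals:

    #include <stdio.h>
    #include <string.h>

    static int demo_nx_enabled  = 1;   /* stand-ins for the kernel flags */
    static int demo_exec_shield = 1;

    /* mirrors the accepted values of the boot parameter: on | off */
    static int parse_noexec(const char *str)
    {
            if (!str || !strcmp(str, "on")) {
                    demo_nx_enabled = 1;
            } else if (!strcmp(str, "off")) {
                    demo_nx_enabled  = 0;
                    demo_exec_shield = 0;   /* noexec=off disables exec-shield too */
            } else {
                    return -1;              /* stands in for -EINVAL */
            }
            return 0;
    }

    int main(void)
    {
            parse_noexec("off");
            printf("nx=%d exec_shield=%d\n", demo_nx_enabled, demo_exec_shield);
            return 0;
    }
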
++ * ++ * on Enable ++ * off Disable (disables exec-shield too) ++ */ ++static int __init noexec_setup(char *str) ++{ ++ if (!str || !strcmp(str, "on")) { ++ if (cpu_has_nx) { ++ __supported_pte_mask |= _PAGE_NX; ++ disable_nx = 0; ++ } ++ } else if (!strcmp(str, "off")) { ++ disable_nx = 1; ++ __supported_pte_mask &= ~_PAGE_NX; ++ exec_shield = 0; ++ } else ++ return -EINVAL; ++ ++ return 0; ++} ++early_param("noexec", noexec_setup); ++ ++void __init set_nx(void) ++{ ++ unsigned int v[4], l, h; ++ ++ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { ++ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); ++ ++ if ((v[3] & (1 << 20)) && !disable_nx) { ++ rdmsr(MSR_EFER, l, h); ++ l |= EFER_NX; ++ wrmsr(MSR_EFER, l, h); ++ nx_enabled = 1; ++ __supported_pte_mask |= _PAGE_NX; ++ } ++ } ++} ++#endif ++ + /* user-defined highmem size */ + static unsigned int highmem_pages = -1; + +diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c +index 1658296..72056cf 100644 +--- a/arch/x86/mm/mmap.c ++++ b/arch/x86/mm/mmap.c +@@ -111,13 +111,16 @@ static unsigned long mmap_legacy_base(void) + */ + void arch_pick_mmap_layout(struct mm_struct *mm) + { +- if (mmap_is_legacy()) { ++ if (!(2 & exec_shield) && mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; ++ if (!(current->personality & READ_IMPLIES_EXEC) ++ && mmap_is_ia32()) ++ mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; + mm->unmap_area = arch_unmap_area_topdown; + } + } +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 821e970..ea5a4c3 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,6 +6,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -129,6 +130,12 @@ void smp_invalidate_interrupt(struct pt_regs *regs) + union smp_flush_state *f; + + cpu = smp_processor_id(); ++ ++#ifdef CONFIG_X86_32 ++ if (current->active_mm) ++ load_user_cs_desc(cpu, current->active_mm); ++#endif ++ + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. 
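The hunks above are where the exec-shield policy bits actually take effect on 32-bit: set_nx() only enables hardware NX when CPUID leaf 0x80000001 advertises it (EDX bit 20), and arch_pick_mmap_layout() drops the legacy bottom-up layout whenever bit 1 of the exec_shield bitmask is set. As an illustration only, not part of the patch, the small userspace program below reads the kernel.exec-shield sysctl that this patch registers in kernel/sysctl.c and decodes those bits; the /proc path exists only on a kernel carrying this patch.

/*
 * Illustrative sketch, not part of the patch: decode the exec-shield
 * bitmask exposed by a patched kernel as /proc/sys/kernel/exec-shield.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/exec-shield", "r");
	int val;

	if (!f) {
		perror("/proc/sys/kernel/exec-shield (kernel not patched?)");
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1) {
		fprintf(stderr, "unexpected sysctl contents\n");
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("exec-shield = %d\n", val);
	printf("  enabled (CS-limit approximation of NX): %s\n",
	       val != 0 ? "yes" : "no");
	printf("  bit 1 (force non-exec stack, always top-down mmap): %s\n",
	       (val & 2) ? "set" : "clear");
	return 0;
}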
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c +index 58bc00f..1fdafb5 100644 +--- a/arch/x86/vdso/vdso32-setup.c ++++ b/arch/x86/vdso/vdso32-setup.c +@@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) + if (compat) + addr = VDSO_HIGH_BASE; + else { +- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); ++ addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index 0a1700a..37b8744 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -321,6 +321,24 @@ static void xen_set_ldt(const void *addr, unsigned entries) + xen_mc_issue(PARAVIRT_LAZY_CPU); + } + ++#ifdef CONFIG_X86_32 ++static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ void *gdt; ++ xmaddr_t mgdt; ++ u64 descriptor; ++ struct desc_struct user_cs; ++ ++ gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]; ++ mgdt = virt_to_machine(gdt); ++ ++ user_cs = mm->context.user_cs; ++ descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32; ++ ++ HYPERVISOR_update_descriptor(mgdt.maddr, descriptor); ++} ++#endif /*CONFIG_X86_32*/ ++ + static void xen_load_gdt(const struct desc_ptr *dtr) + { + unsigned long va = dtr->address; +@@ -886,6 +904,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = xen_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, +diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c +index 40381df..f856fab 100644 +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -73,7 +73,7 @@ static struct linux_binfmt elf_format = { + .hasvdso = 1 + }; + +-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) ++#define BAD_ADDR(x) IS_ERR_VALUE(x) + + static int set_brk(unsigned long start, unsigned long end) + { +@@ -721,6 +721,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + break; + } + ++ if (current->personality == PER_LINUX && (exec_shield & 2)) { ++ executable_stack = EXSTACK_DISABLE_X; ++ current->flags |= PF_RANDOMIZE; ++ } ++ + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + retval = -ELIBBAD; +@@ -740,6 +745,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + if (retval) + goto out_free_dentry; + ++#ifdef CONFIG_X86_32 ++ /* ++ * Turn off the CS limit completely if exec-shield disabled or ++ * NX active: ++ */ ++ if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled) ++ arch_add_exec_range(current->mm, -1); ++#endif ++ + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->mm->def_flags = def_flags; +@@ -747,7 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. 
*/ + SET_PERSONALITY(loc->elf_ex); +- if (elf_read_implies_exec(loc->elf_ex, executable_stack)) ++ if (!(exec_shield & 2) && ++ elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) +@@ -912,7 +927,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + interpreter, + &interp_map_addr, + load_bias); +- if (!IS_ERR((void *)elf_entry)) { ++ if (!BAD_ADDR(elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment +diff --git a/include/linux/mm.h b/include/linux/mm.h +index ad613ed..08f08d0 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1135,7 +1135,13 @@ extern int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long flags, struct page **pages); + +-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); ++extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int); ++ ++static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, ++ unsigned long len, unsigned long pgoff, unsigned long flags) ++{ ++ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); ++} + + extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 0e80e26..af904ea 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -198,6 +198,9 @@ struct mm_struct { + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); ++ unsigned long (*get_unmapped_exec_area) (struct file *filp, ++ unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags); + void (*unmap_area) (struct mm_struct *mm, unsigned long addr); + unsigned long mmap_base; /* base of mmap area */ + unsigned long task_size; /* size of task vm space */ +diff --git a/include/linux/resource.h b/include/linux/resource.h +index 40fc7e6..68c2549 100644 +--- a/include/linux/resource.h ++++ b/include/linux/resource.h +@@ -55,8 +55,11 @@ struct rlimit { + /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. ++ * ++ * (2MB more to cover randomization effects.) + */ +-#define _STK_LIM (8*1024*1024) ++#define _STK_LIM (10*1024*1024) ++#define EXEC_STACK_BIAS (2*1024*1024) + + /* + * GPG2 wants 64kB of mlocked memory, to make sure pass phrases +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4896fdf..3513e03 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -101,6 +101,9 @@ struct fs_struct; + struct bts_context; + struct perf_counter_context; + ++extern int exec_shield; ++extern int print_fatal_signals; ++ + /* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. 
+@@ -359,6 +362,10 @@ extern int sysctl_max_map_count; + extern unsigned long + arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); ++ ++extern unsigned long ++arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, ++ unsigned long, unsigned long); + extern unsigned long + arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index ce664f9..1905e22 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -87,6 +87,26 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; + #ifndef CONFIG_MMU + extern int sysctl_nr_trim_pages; + #endif ++ ++int exec_shield = (1<<0); ++/* exec_shield is a bitmask: ++ * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE ++ * (1<<0) 1: on [also on if !=0] ++ * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK ++ * The old settings ++ * (1<<2) 4: vdso just below .text of main (unless too low) ++ * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low) ++ * are ignored because the vdso is placed completely randomly ++ */ ++ ++static int __init setup_exec_shield(char *str) ++{ ++ get_option(&str, &exec_shield); ++ ++ return 1; ++} ++__setup("exec-shield=", setup_exec_shield); ++ + #ifdef CONFIG_RCU_TORTURE_TEST + extern int rcutorture_runnable; + #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +@@ -382,6 +402,14 @@ static struct ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "exec-shield", ++ .data = &exec_shield, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, +diff --git a/mm/mmap.c b/mm/mmap.c +index 34579b2..260bb3c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -45,6 +46,18 @@ + #define arch_rebalance_pgtables(addr, len) (addr) + #endif + ++/* No sane architecture will #define these to anything else */ ++#ifndef arch_add_exec_range ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#endif ++#ifndef arch_flush_exec_range ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#endif ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ ++ + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); +@@ -392,6 +405,8 @@ static inline void + __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) + { ++ if (vma->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, vma->vm_end); + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; +@@ -494,6 +509,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, vma->vm_end); + } + + /* +@@ -803,6 +820,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ if (prev->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, prev->vm_end); + return prev; + } + +@@ -957,7 +976,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + /* Obtain the address to map to. 
we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ +- addr = get_unmapped_area(file, addr, len, pgoff, flags); ++ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, ++ prot & PROT_EXEC); + if (addr & ~PAGE_MASK) + return addr; + +@@ -1442,13 +1462,17 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) + } + + unsigned long +-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, +- unsigned long pgoff, unsigned long flags) ++get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags, int exec) + { + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + +- get_area = current->mm->get_unmapped_area; ++ if (exec && current->mm->get_unmapped_exec_area) ++ get_area = current->mm->get_unmapped_exec_area; ++ else ++ get_area = current->mm->get_unmapped_area; ++ + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); +@@ -1462,8 +1486,76 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + + return arch_rebalance_pgtables(addr, len); + } ++EXPORT_SYMBOL(get_unmapped_area_prot); ++ ++#define SHLIB_BASE 0x00110000 ++ ++unsigned long ++arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0, ++ unsigned long len0, unsigned long pgoff, unsigned long flags) ++{ ++ unsigned long addr = addr0, len = len0; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long tmp; ++ ++ if (len > TASK_SIZE) ++ return -ENOMEM; ++ ++ if (flags & MAP_FIXED) ++ return addr; ++ ++ if (!addr) ++ addr = randomize_range(SHLIB_BASE, 0x01000000, len); ++ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (TASK_SIZE - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) ++ return addr; ++ } ++ ++ addr = SHLIB_BASE; ++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { ++ /* At this point: (!vma || addr < vma->vm_end). */ ++ if (TASK_SIZE - len < addr) ++ return -ENOMEM; ++ ++ if (!vma || addr + len <= vma->vm_start) { ++ /* ++ * Must not let a PROT_EXEC mapping get into the ++ * brk area: ++ */ ++ if (addr + len > mm->brk) ++ goto failed; ++ ++ /* ++ * Up until the brk area we randomize addresses ++ * as much as possible: ++ */ ++ if (addr >= 0x01000000) { ++ tmp = randomize_range(0x01000000, ++ PAGE_ALIGN(max(mm->start_brk, ++ (unsigned long)0x08000000)), len); ++ vma = find_vma(mm, tmp); ++ if (TASK_SIZE - len >= tmp && ++ (!vma || tmp + len <= vma->vm_start)) ++ return tmp; ++ } ++ /* ++ * Ok, randomization didnt work out - return ++ * the result of the linear search: ++ */ ++ return addr; ++ } ++ addr = vma->vm_end; ++ } ++ ++failed: ++ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags); ++} + +-EXPORT_SYMBOL(get_unmapped_area); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +@@ -1538,6 +1630,14 @@ out: + return prev ? prev->vm_next : vma; + } + ++static int over_stack_limit(unsigned long sz) ++{ ++ if (sz < EXEC_STACK_BIAS) ++ return 0; ++ return (sz - EXEC_STACK_BIAS) > ++ current->signal->rlim[RLIMIT_STACK].rlim_cur; ++} ++ + /* + * Verify that the stack growth is acceptable and + * update accounting. 
This is shared with both the +@@ -1554,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns + return -ENOMEM; + + /* Stack limit test */ +- if (size > rlim[RLIMIT_STACK].rlim_cur) ++ if (over_stack_limit(size)) + return -ENOMEM; + + /* mlock limit tests */ +@@ -1864,10 +1964,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- if (new_below) ++ if (new_below) { ++ unsigned long old_end = vma->vm_end; ++ + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); +- else ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, old_end); ++ } else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +@@ -2116,6 +2220,7 @@ void exit_mmap(struct mm_struct *mm) + vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(tlb, 0, end); ++ arch_flush_exec_range(mm); + } + + /* +diff --git a/mm/mprotect.c b/mm/mprotect.c +index d80311b..032423d 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -26,9 +26,14 @@ + #include + #include + #include ++#include + #include + #include + ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ + #ifndef pgprot_modify + static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) + { +@@ -139,7 +144,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; +- unsigned long charged = 0; ++ unsigned long charged = 0, old_end = vma->vm_end; + pgoff_t pgoff; + int error; + int dirty_accountable = 0; +@@ -204,6 +209,9 @@ success: + dirty_accountable = 1; + } + ++ if (oldflags & VM_EXEC) ++ arch_remove_exec_range(current->mm, old_end); ++ + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); +diff --git a/mm/mremap.c b/mm/mremap.c +index a39b7b9..6bebfde 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -400,8 +400,8 @@ unsigned long do_mremap(unsigned long addr, + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + +- new_addr = get_unmapped_area(vma->vm_file, 0, new_len, +- vma->vm_pgoff, map_flags); ++ new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, ++ vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; diff --git a/linux-2.6-utrace.patch b/linux-2.6-utrace.patch new file mode 100644 index 000000000..ebb318bf6 --- /dev/null +++ b/linux-2.6-utrace.patch @@ -0,0 +1,4102 @@ +diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile +index 9632444..bf4b9e8 100644 +--- a/Documentation/DocBook/Makefile ++++ b/Documentation/DocBook/Makefile +@@ -9,7 +9,7 @@ + DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ + kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ + procfs-guide.xml writing_usb_driver.xml networking.xml \ +- kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ ++ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml utrace.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ + genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ + mac80211.xml debugobjects.xml sh.xml regulator.xml \ +diff --git a/Documentation/DocBook/utrace.tmpl b/Documentation/DocBook/utrace.tmpl +new file mode 
100644 +index 0000000..6cc58a1 +--- /dev/null ++++ b/Documentation/DocBook/utrace.tmpl +@@ -0,0 +1,590 @@ ++ ++ ++ ++ ++ ++ The utrace User Debugging Infrastructure ++ ++ ++ ++ ++ utrace concepts ++ ++ Introduction ++ ++ ++ utrace is infrastructure code for tracing ++ and controlling user threads. This is the foundation for writing ++ tracing engines, which can be loadable kernel modules. ++ ++ ++ ++ The basic actors in utrace are the thread ++ and the tracing engine. A tracing engine is some body of code that ++ calls into the <linux/utrace.h> ++ interfaces, represented by a struct ++ utrace_engine_ops. (Usually it's a kernel module, ++ though the legacy ptrace support is a tracing ++ engine that is not in a kernel module.) The interface operates on ++ individual threads (struct task_struct). ++ If an engine wants to treat several threads as a group, that is up ++ to its higher-level code. ++ ++ ++ ++ Tracing begins by attaching an engine to a thread, using ++ utrace_attach_task or ++ utrace_attach_pid. If successful, it returns a ++ pointer that is the handle used in all other calls. ++ ++ ++ ++ ++ Events and Callbacks ++ ++ ++ An attached engine does nothing by default. An engine makes something ++ happen by requesting callbacks via utrace_set_events ++ and poking the thread with utrace_control. ++ The synchronization issues related to these two calls ++ are discussed further below in . ++ ++ ++ ++ Events are specified using the macro ++ UTRACE_EVENT(type). ++ Each event type is associated with a callback in struct ++ utrace_engine_ops. A tracing engine can leave unused ++ callbacks NULL. The only callbacks required ++ are those used by the event flags it sets. ++ ++ ++ ++ Many engines can be attached to each thread. When a thread has an ++ event, each engine gets a callback if it has set the event flag for ++ that event type. For most events, engines are called in the order they ++ attached. Engines that attach after the event has occurred do not get ++ callbacks for that event. This includes any new engines just attached ++ by an existing engine's callback function. Once the sequence of ++ callbacks for that one event has completed, such new engines are then ++ eligible in the next sequence that starts when there is another event. ++ ++ ++ ++ Event reporting callbacks have details particular to the event type, ++ but are all called in similar environments and have the same ++ constraints. Callbacks are made from safe points, where no locks ++ are held, no special resources are pinned (usually), and the ++ user-mode state of the thread is accessible. So, callback code has ++ a pretty free hand. But to be a good citizen, callback code should ++ never block for long periods. It is fine to block in ++ kmalloc and the like, but never wait for i/o or ++ for user mode to do something. If you need the thread to wait, use ++ UTRACE_STOP and return from the callback ++ quickly. When your i/o finishes or whatever, you can use ++ utrace_control to resume the thread. ++ ++ ++ ++ The UTRACE_EVENT(SYSCALL_ENTRY) event is a special ++ case. While other events happen in the kernel when it will return to ++ user mode soon, this event happens when entering the kernel before it ++ will proceed with the work requested from user mode. Because of this ++ difference, the report_syscall_entry callback is ++ special in two ways. For this event, engines are called in reverse of ++ the normal order (this includes the report_quiesce ++ call that precedes a report_syscall_entry call). 
++ This preserves the semantics that the last engine to attach is called ++ "closest to user mode"--the engine that is first to see a thread's user ++ state when it enters the kernel is also the last to see that state when ++ the thread returns to user mode. For the same reason, if these ++ callbacks use UTRACE_STOP (see the next section), ++ the thread stops immediately after callbacks rather than only when it's ++ ready to return to user mode; when allowed to resume, it will actually ++ attempt the system call indicated by the register values at that time. ++ ++ ++ ++ ++ Stopping Safely ++ ++ Writing well-behaved callbacks ++ ++ ++ Well-behaved callbacks are important to maintain two essential ++ properties of the interface. The first of these is that unrelated ++ tracing engines should not interfere with each other. If your engine's ++ event callback does not return quickly, then another engine won't get ++ the event notification in a timely manner. The second important ++ property is that tracing should be as noninvasive as possible to the ++ normal operation of the system overall and of the traced thread in ++ particular. That is, attached tracing engines should not perturb a ++ thread's behavior, except to the extent that changing its user-visible ++ state is explicitly what you want to do. (Obviously some perturbation ++ is unavoidable, primarily timing changes, ranging from small delays due ++ to the overhead of tracing, to arbitrary pauses in user code execution ++ when a user stops a thread with a debugger for examination.) Even when ++ you explicitly want the perturbation of making the traced thread block, ++ just blocking directly in your callback has more unwanted effects. For ++ example, the CLONE event callbacks are called when ++ the new child thread has been created but not yet started running; the ++ child can never be scheduled until the CLONE ++ tracing callbacks return. (This allows engines tracing the parent to ++ attach to the child.) If a CLONE event callback ++ blocks the parent thread, it also prevents the child thread from ++ running (even to process a SIGKILL). If what you ++ want is to make both the parent and child block, then use ++ utrace_attach_task on the child and then use ++ UTRACE_STOP on both threads. A more crucial ++ problem with blocking in callbacks is that it can prevent ++ SIGKILL from working. A thread that is blocking ++ due to UTRACE_STOP will still wake up and die ++ immediately when sent a SIGKILL, as all threads ++ should. Relying on the utrace ++ infrastructure rather than on private synchronization calls in event ++ callbacks is an important way to help keep tracing robustly ++ noninvasive. ++ ++ ++ ++ ++ Using <constant>UTRACE_STOP</constant> ++ ++ ++ To control another thread and access its state, it must be stopped ++ with UTRACE_STOP. This means that it is ++ stopped and won't start running again while we access it. When a ++ thread is not already stopped, utrace_control ++ returns -EINPROGRESS and an engine must wait ++ for an event callback when the thread is ready to stop. The thread ++ may be running on another CPU or may be blocked. When it is ready ++ to be examined, it will make callbacks to engines that set the ++ UTRACE_EVENT(QUIESCE) event bit. To wake up an ++ interruptible wait, use UTRACE_INTERRUPT. ++ ++ ++ ++ As long as some engine has used UTRACE_STOP and ++ not called utrace_control to resume the thread, ++ then the thread will remain stopped. SIGKILL ++ will wake it up, but it will not run user code. 
When the stop is ++ cleared with utrace_control or a callback ++ return value, the thread starts running again. ++ (See also .) ++ ++ ++ ++ ++ ++ ++ Tear-down Races ++ ++ Primacy of <constant>SIGKILL</constant> ++ ++ Ordinarily synchronization issues for tracing engines are kept fairly ++ straightforward by using UTRACE_STOP. You ask a ++ thread to stop, and then once it makes the ++ report_quiesce callback it cannot do anything else ++ that would result in another callback, until you let it with a ++ utrace_control call. This simple arrangement ++ avoids complex and error-prone code in each one of a tracing engine's ++ event callbacks to keep them serialized with the engine's other ++ operations done on that thread from another thread of control. ++ However, giving tracing engines complete power to keep a traced thread ++ stuck in place runs afoul of a more important kind of simplicity that ++ the kernel overall guarantees: nothing can prevent or delay ++ SIGKILL from making a thread die and release its ++ resources. To preserve this important property of ++ SIGKILL, it as a special case can break ++ UTRACE_STOP like nothing else normally can. This ++ includes both explicit SIGKILL signals and the ++ implicit SIGKILL sent to each other thread in the ++ same thread group by a thread doing an exec, or processing a fatal ++ signal, or making an exit_group system call. A ++ tracing engine can prevent a thread from beginning the exit or exec or ++ dying by signal (other than SIGKILL) if it is ++ attached to that thread, but once the operation begins, no tracing ++ engine can prevent or delay all other threads in the same thread group ++ dying. ++ ++ ++ ++ Final callbacks ++ ++ The report_reap callback is always the final event ++ in the life cycle of a traced thread. Tracing engines can use this as ++ the trigger to clean up their own data structures. The ++ report_death callback is always the penultimate ++ event a tracing engine might see; it's seen unless the thread was ++ already in the midst of dying when the engine attached. Many tracing ++ engines will have no interest in when a parent reaps a dead process, ++ and nothing they want to do with a zombie thread once it dies; for ++ them, the report_death callback is the natural ++ place to clean up data structures and detach. To facilitate writing ++ such engines robustly, given the asynchrony of ++ SIGKILL, and without error-prone manual ++ implementation of synchronization schemes, the ++ utrace infrastructure provides some special ++ guarantees about the report_death and ++ report_reap callbacks. It still takes some care ++ to be sure your tracing engine is robust to tear-down races, but these ++ rules make it reasonably straightforward and concise to handle a lot of ++ corner cases correctly. ++ ++ ++ ++ Engine and task pointers ++ ++ The first sort of guarantee concerns the core data structures ++ themselves. struct utrace_engine is ++ a reference-counted data structure. While you hold a reference, an ++ engine pointer will always stay valid so that you can safely pass it to ++ any utrace call. Each call to ++ utrace_attach_task or ++ utrace_attach_pid returns an engine pointer with a ++ reference belonging to the caller. You own that reference until you ++ drop it using utrace_engine_put. There is an ++ implicit reference on the engine while it is attached. 
So if you drop ++ your only reference, and then use ++ utrace_attach_task without ++ UTRACE_ATTACH_CREATE to look up that same engine, ++ you will get the same pointer with a new reference to replace the one ++ you dropped, just like calling utrace_engine_get. ++ When an engine has been detached, either explicitly with ++ UTRACE_DETACH or implicitly after ++ report_reap, then any references you hold are all ++ that keep the old engine pointer alive. ++ ++ ++ ++ There is nothing a kernel module can do to keep a struct ++ task_struct alive outside of ++ rcu_read_lock. When the task dies and is reaped ++ by its parent (or itself), that structure can be freed so that any ++ dangling pointers you have stored become invalid. ++ utrace will not prevent this, but it can ++ help you detect it safely. By definition, a task that has been reaped ++ has had all its engines detached. All ++ utrace calls can be safely called on a ++ detached engine if the caller holds a reference on that engine pointer, ++ even if the task pointer passed in the call is invalid. All calls ++ return -ESRCH for a detached engine, which tells ++ you that the task pointer you passed could be invalid now. Since ++ utrace_control and ++ utrace_set_events do not block, you can call those ++ inside a rcu_read_lock section and be sure after ++ they don't return -ESRCH that the task pointer is ++ still valid until rcu_read_unlock. The ++ infrastructure never holds task references of its own. Though neither ++ rcu_read_lock nor any other lock is held while ++ making a callback, it's always guaranteed that the struct ++ task_struct and the struct ++ utrace_engine passed as arguments remain valid ++ until the callback function returns. ++ ++ ++ ++ The common means for safely holding task pointers that is available to ++ kernel modules is to use struct pid, which ++ permits put_pid from kernel modules. When using ++ that, the calls utrace_attach_pid, ++ utrace_control_pid, ++ utrace_set_events_pid, and ++ utrace_barrier_pid are available. ++ ++ ++ ++ ++ ++ Serialization of <constant>DEATH</constant> and <constant>REAP</constant> ++ ++ ++ The second guarantee is the serialization of ++ DEATH and REAP event ++ callbacks for a given thread. The actual reaping by the parent ++ (release_task call) can occur simultaneously ++ while the thread is still doing the final steps of dying, including ++ the report_death callback. If a tracing engine ++ has requested both DEATH and ++ REAP event reports, it's guaranteed that the ++ report_reap callback will not be made until ++ after the report_death callback has returned. ++ If the report_death callback itself detaches ++ from the thread, then the report_reap callback ++ will never be made. Thus it is safe for a ++ report_death callback to clean up data ++ structures and detach. ++ ++ ++ ++ Interlock with final callbacks ++ ++ The final sort of guarantee is that a tracing engine will know for sure ++ whether or not the report_death and/or ++ report_reap callbacks will be made for a certain ++ thread. These tear-down races are disambiguated by the error return ++ values of utrace_set_events and ++ utrace_control. Normally ++ utrace_control called with ++ UTRACE_DETACH returns zero, and this means that no ++ more callbacks will be made. 
If the thread is in the midst of dying, ++ it returns -EALREADY to indicate that the ++ report_death callback may already be in progress; ++ when you get this error, you know that any cleanup your ++ report_death callback does is about to happen or ++ has just happened--note that if the report_death ++ callback does not detach, the engine remains attached until the thread ++ gets reaped. If the thread is in the midst of being reaped, ++ utrace_control returns -ESRCH ++ to indicate that the report_reap callback may ++ already be in progress; this means the engine is implicitly detached ++ when the callback completes. This makes it possible for a tracing ++ engine that has decided asynchronously to detach from a thread to ++ safely clean up its data structures, knowing that no ++ report_death or report_reap ++ callback will try to do the same. utrace_detach ++ returns -ESRCH when the struct ++ utrace_engine has already been detached, but is ++ still a valid pointer because of its reference count. A tracing engine ++ can use this to safely synchronize its own independent multiple threads ++ of control with each other and with its event callbacks that detach. ++ ++ ++ ++ In the same vein, utrace_set_events normally ++ returns zero; if the target thread was stopped before the call, then ++ after a successful call, no event callbacks not requested in the new ++ flags will be made. It fails with -EALREADY if ++ you try to clear UTRACE_EVENT(DEATH) when the ++ report_death callback may already have begun, if ++ you try to clear UTRACE_EVENT(REAP) when the ++ report_reap callback may already have begun, or if ++ you try to newly set UTRACE_EVENT(DEATH) or ++ UTRACE_EVENT(QUIESCE) when the target is already ++ dead or dying. Like utrace_control, it returns ++ -ESRCH when the thread has already been detached ++ (including forcible detach on reaping). This lets the tracing engine ++ know for sure which event callbacks it will or won't see after ++ utrace_set_events has returned. By checking for ++ errors, it can know whether to clean up its data structures immediately ++ or to let its callbacks do the work. ++ ++ ++ ++ Using <function>utrace_barrier</function> ++ ++ When a thread is safely stopped, calling ++ utrace_control with UTRACE_DETACH ++ or calling utrace_set_events to disable some events ++ ensures synchronously that your engine won't get any more of the callbacks ++ that have been disabled (none at all when detaching). But these can also ++ be used while the thread is not stopped, when it might be simultaneously ++ making a callback to your engine. For this situation, these calls return ++ -EINPROGRESS when it's possible a callback is in ++ progress. If you are not prepared to have your old callbacks still run, ++ then you can synchronize to be sure all the old callbacks are finished, ++ using utrace_barrier. This is necessary if the ++ kernel module containing your callback code is going to be unloaded. ++ ++ ++ After using UTRACE_DETACH once, further calls to ++ utrace_control with the same engine pointer will ++ return -ESRCH. In contrast, after getting ++ -EINPROGRESS from ++ utrace_set_events, you can call ++ utrace_set_events again later and if it returns zero ++ then know the old callbacks have finished. ++ ++ ++ Unlike all other calls, utrace_barrier (and ++ utrace_barrier_pid) will accept any engine pointer you ++ hold a reference on, even if UTRACE_DETACH has already ++ been used. 
After any utrace_control or ++ utrace_set_events call (these do not block), you can ++ call utrace_barrier to block until callbacks have ++ finished. This returns -ESRCH only if the engine is ++ completely detached (finished all callbacks). Otherwise it waits ++ until the thread is definitely not in the midst of a callback to this ++ engine and then returns zero, but can return ++ -ERESTARTSYS if its wait is interrupted. ++ ++ ++ ++ ++ ++ ++ ++utrace core API ++ ++ ++ The utrace API is declared in <linux/utrace.h>. ++ ++ ++!Iinclude/linux/utrace.h ++!Ekernel/utrace.c ++ ++ ++ ++Machine State ++ ++ ++ The task_current_syscall function can be used on any ++ valid struct task_struct at any time, and does ++ not even require that utrace_attach_task was used at all. ++ ++ ++ ++ The other ways to access the registers and other machine-dependent state of ++ a task can only be used on a task that is at a known safe point. The safe ++ points are all the places where utrace_set_events can ++ request callbacks (except for the DEATH and ++ REAP events). So at any event callback, it is safe to ++ examine current. ++ ++ ++ ++ One task can examine another only after a callback in the target task that ++ returns UTRACE_STOP so that task will not return to user ++ mode after the safe point. This guarantees that the task will not resume ++ until the same engine uses utrace_control, unless the ++ task dies suddenly. To examine safely, one must use a pair of calls to ++ utrace_prepare_examine and ++ utrace_finish_examine surrounding the calls to ++ struct user_regset functions or direct examination ++ of task data structures. utrace_prepare_examine returns ++ an error if the task is not properly stopped and not dead. After a ++ successful examination, the paired utrace_finish_examine ++ call returns an error if the task ever woke up during the examination. If ++ so, any data gathered may be scrambled and should be discarded. This means ++ there was a spurious wake-up (which should not happen), or a sudden death. ++ ++ ++<structname>struct user_regset</structname> ++ ++ ++ The struct user_regset API ++ is declared in <linux/regset.h>. ++ ++ ++!Finclude/linux/regset.h ++ ++ ++ ++ ++ <filename>System Call Information</filename> ++ ++ ++ This function is declared in <linux/ptrace.h>. ++ ++ ++!Elib/syscall.c ++ ++ ++ ++<filename>System Call Tracing</filename> ++ ++ ++ The arch API for system call information is declared in ++ <asm/syscall.h>. ++ Each of these calls can be used only at system call entry tracing, ++ or can be used only at system call exit and the subsequent safe points ++ before returning to user mode. ++ At system call entry tracing means either during a ++ report_syscall_entry callback, ++ or any time after that callback has returned UTRACE_STOP. ++ ++ ++!Finclude/asm-generic/syscall.h ++ ++ ++ ++ ++ ++Kernel Internals ++ ++ ++ This chapter covers the interface to the tracing infrastructure ++ from the core of the kernel and the architecture-specific code. ++ This is for maintainers of the kernel and arch code, and not relevant ++ to using the tracing facilities described in preceding chapters. ++ ++ ++Core Calls In ++ ++ ++ These calls are declared in <linux/tracehook.h>. ++ The core kernel calls these functions at various important places. ++ ++ ++!Finclude/linux/tracehook.h ++ ++ ++ ++Architecture Calls Out ++ ++ ++ An arch that has done all these things sets ++ CONFIG_HAVE_ARCH_TRACEHOOK. ++ This is required to enable the utrace code. 
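The engine-facing calls documented in the chapters above (utrace_attach_task(), utrace_set_events(), utrace_engine_put() and the report_* callbacks) are easiest to see together in a small example. The sketch below is illustrative only and is not part of this patch; the callback and function signatures, and the ERR_PTR() error convention for utrace_attach_task(), are inferred from the kerneldoc this patch adds to include/linux/utrace.h, so treat them as assumptions and defer to the header itself.

/*
 * Illustrative sketch, not part of the patch: a minimal tracing engine
 * that logs every successful exec of one task and otherwise leaves it
 * alone.  Signatures are inferred from the utrace kerneldoc above.
 */
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/binfmts.h>
#include <linux/utrace.h>

static u32 demo_report_exec(enum utrace_resume_action action,
			    struct utrace_engine *engine,
			    struct task_struct *task,
			    const struct linux_binfmt *fmt,
			    const struct linux_binprm *bprm,
			    struct pt_regs *regs)
{
	pr_info("utrace demo: pid %d exec'd %s\n",
		task_pid_nr(task), bprm->filename);
	return UTRACE_RESUME;		/* do not perturb the thread */
}

static const struct utrace_engine_ops demo_ops = {
	.report_exec	= demo_report_exec,
};

/* Attach a new engine to @task and request only EXEC reports. */
static int demo_attach(struct task_struct *task)
{
	struct utrace_engine *engine;
	int ret;

	engine = utrace_attach_task(task, UTRACE_ATTACH_CREATE,
				    &demo_ops, NULL);
	if (IS_ERR(engine))
		return PTR_ERR(engine);

	ret = utrace_set_events(task, engine, UTRACE_EVENT(EXEC));
	utrace_engine_put(engine);	/* attachment holds its own reference */
	return ret;
}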
++ ++ ++<filename><asm/ptrace.h></filename> ++ ++ ++ An arch defines these in <asm/ptrace.h> ++ if it supports hardware single-step or block-step features. ++ ++ ++!Finclude/linux/ptrace.h arch_has_single_step arch_has_block_step ++!Finclude/linux/ptrace.h user_enable_single_step user_enable_block_step ++!Finclude/linux/ptrace.h user_disable_single_step ++ ++ ++ ++ ++ <filename><asm/syscall.h></filename> ++ ++ ++ An arch provides <asm/syscall.h> that ++ defines these as inlines, or declares them as exported functions. ++ These interfaces are described in . ++ ++ ++ ++ ++ ++ <filename><linux/tracehook.h></filename> ++ ++ ++ An arch must define TIF_NOTIFY_RESUME ++ and TIF_SYSCALL_TRACE ++ in its <asm/thread_info.h>. ++ The arch code must call the following functions, all declared ++ in <linux/tracehook.h> and ++ described in : ++ ++ ++ ++ tracehook_notify_resume ++ ++ ++ tracehook_report_syscall_entry ++ ++ ++ tracehook_report_syscall_exit ++ ++ ++ tracehook_signal_handler ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 725a650..e299a63 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -82,6 +82,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -188,6 +189,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + ++ task_utrace_proc_status(m, p); ++ + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 5368fbd..aecd24e 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -167,6 +167,7 @@ extern struct cred init_cred; + [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ + }, \ + .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ ++ INIT_UTRACE(tsk) \ + INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ + INIT_TRACE_IRQFLAGS \ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4d07542..2060aa1 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -59,6 +59,7 @@ struct sched_param { + #include + #include + #include ++#include + + #include + #include +@@ -1314,6 +1315,11 @@ struct task_struct { + #endif + seccomp_t seccomp; + ++#ifdef CONFIG_UTRACE ++ struct utrace utrace; ++ unsigned long utrace_flags; ++#endif ++ + /* vserver context data */ + struct vx_info *vx_info; + struct nx_info *nx_info; +diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h +index 7c2bfd9..a91d9a4 100644 +--- a/include/linux/tracehook.h ++++ b/include/linux/tracehook.h +@@ -49,6 +49,7 @@ + #include + #include + #include ++#include + struct linux_binprm; + + /** +@@ -63,6 +64,8 @@ struct linux_binprm; + */ + static inline int tracehook_expect_breakpoints(struct task_struct *task) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_CORE))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -111,6 +114,9 @@ static inline void ptrace_report_syscall(struct pt_regs *regs) + static inline __must_check int tracehook_report_syscall_entry( + struct pt_regs *regs) + { ++ if ((task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_ENTRY)) && ++ utrace_report_syscall_entry(regs)) ++ return 1; + ptrace_report_syscall(regs); + return 0; + } +@@ -134,6 +140,8 @@ static inline __must_check int tracehook_report_syscall_entry( + */ + static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_EXIT)) 
++ utrace_report_syscall_exit(regs); + ptrace_report_syscall(regs); + } + +@@ -194,6 +202,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + struct linux_binprm *bprm, + struct pt_regs *regs) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXEC))) ++ utrace_report_exec(fmt, bprm, regs); + if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) && + unlikely(task_ptrace(current) & PT_PTRACED)) + send_sig(SIGTRAP, current, 0); +@@ -211,6 +221,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + */ + static inline void tracehook_report_exit(long *exit_code) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXIT))) ++ utrace_report_exit(exit_code); + ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code); + } + +@@ -254,6 +266,7 @@ static inline int tracehook_prepare_clone(unsigned clone_flags) + static inline void tracehook_finish_clone(struct task_struct *child, + unsigned long clone_flags, int trace) + { ++ utrace_init_task(child); + ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace); + } + +@@ -278,6 +291,8 @@ static inline void tracehook_report_clone(struct pt_regs *regs, + unsigned long clone_flags, + pid_t pid, struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE))) ++ utrace_report_clone(clone_flags, child); + if (unlikely(task_ptrace(child))) { + /* + * It doesn't matter who attached/attaching to this +@@ -310,6 +325,9 @@ static inline void tracehook_report_clone_complete(int trace, + pid_t pid, + struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE)) && ++ (clone_flags & CLONE_VFORK)) ++ utrace_finish_vfork(current); + if (unlikely(trace)) + ptrace_event(0, trace, pid); + } +@@ -344,6 +362,7 @@ static inline void tracehook_report_vfork_done(struct task_struct *child, + */ + static inline void tracehook_prepare_release_task(struct task_struct *task) + { ++ utrace_release_task(task); + } + + /** +@@ -358,6 +377,7 @@ static inline void tracehook_prepare_release_task(struct task_struct *task) + static inline void tracehook_finish_release_task(struct task_struct *task) + { + ptrace_release_task(task); ++ BUG_ON(task->exit_state != EXIT_DEAD); + } + + /** +@@ -379,6 +399,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + const struct k_sigaction *ka, + struct pt_regs *regs, int stepping) + { ++ if (task_utrace_flags(current)) ++ utrace_signal_handler(current, stepping); + if (stepping) + ptrace_notify(SIGTRAP); + } +@@ -396,6 +418,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + static inline int tracehook_consider_ignored_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_IGN))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -415,6 +439,9 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, + static inline int tracehook_consider_fatal_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & (UTRACE_EVENT(SIGNAL_TERM) | ++ UTRACE_EVENT(SIGNAL_CORE)))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -429,6 +456,8 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task, + */ + static inline int tracehook_force_sigpending(void) + { ++ if (unlikely(task_utrace_flags(current))) ++ return utrace_interrupt_pending(); + return 0; + } + +@@ -458,6 +487,8 @@ static inline int 
tracehook_get_signal(struct task_struct *task, + siginfo_t *info, + struct k_sigaction *return_ka) + { ++ if (unlikely(task_utrace_flags(task))) ++ return utrace_get_signal(task, regs, info, return_ka); + return 0; + } + +@@ -485,6 +516,8 @@ static inline int tracehook_get_signal(struct task_struct *task, + */ + static inline int tracehook_notify_jctl(int notify, int why) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(JCTL)) ++ utrace_report_jctl(notify, why); + return notify ?: (current->ptrace & PT_PTRACED) ? why : 0; + } + +@@ -508,6 +541,8 @@ static inline int tracehook_notify_jctl(int notify, int why) + static inline int tracehook_notify_death(struct task_struct *task, + void **death_cookie, int group_dead) + { ++ *death_cookie = task_utrace_struct(task); ++ + if (task_detached(task)) + return task->ptrace ? SIGCHLD : DEATH_REAP; + +@@ -544,6 +579,20 @@ static inline void tracehook_report_death(struct task_struct *task, + int signal, void *death_cookie, + int group_dead) + { ++ /* ++ * This barrier ensures that our caller's setting of ++ * @task->exit_state precedes checking @task->utrace_flags here. ++ * If utrace_set_events() was just called to enable ++ * UTRACE_EVENT(DEATH), then we are obliged to call ++ * utrace_report_death() and not miss it. utrace_set_events() ++ * uses tasklist_lock to synchronize enabling the bit with the ++ * actual change to @task->exit_state, but we need this barrier ++ * to be sure we see a flags change made just before our caller ++ * took the tasklist_lock. ++ */ ++ smp_mb(); ++ if (task_utrace_flags(task) & _UTRACE_DEATH_EVENTS) ++ utrace_report_death(task, death_cookie, group_dead, signal); + } + + #ifdef TIF_NOTIFY_RESUME +@@ -573,10 +622,20 @@ static inline void set_notify_resume(struct task_struct *task) + * asynchronously, this will be called again before we return to + * user mode. + * +- * Called without locks. ++ * Called without locks. However, on some machines this may be ++ * called with interrupts disabled. + */ + static inline void tracehook_notify_resume(struct pt_regs *regs) + { ++ struct task_struct *task = current; ++ /* ++ * This pairs with the barrier implicit in set_notify_resume(). ++ * It ensures that we read the nonzero utrace_flags set before ++ * set_notify_resume() was called by utrace setup. ++ */ ++ smp_rmb(); ++ if (task_utrace_flags(task)) ++ utrace_resume(task, regs); + } + #endif /* TIF_NOTIFY_RESUME */ + +diff --git a/include/linux/utrace.h b/include/linux/utrace.h +new file mode 100644 +index 0000000..f877ec6 +--- /dev/null ++++ b/include/linux/utrace.h +@@ -0,0 +1,692 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ * ++ * This interface allows for notification of interesting events in a ++ * thread. It also mediates access to thread state such as registers. ++ * Multiple unrelated users can be associated with a single thread. ++ * We call each of these a tracing engine. ++ * ++ * A tracing engine starts by calling utrace_attach_task() or ++ * utrace_attach_pid() on the chosen thread, passing in a set of hooks ++ * (&struct utrace_engine_ops), and some associated data. This produces a ++ * &struct utrace_engine, which is the handle used for all other ++ * operations. 
An attached engine has its ops vector, its data, and an ++ * event mask controlled by utrace_set_events(). ++ * ++ * For each event bit that is set, that engine will get the ++ * appropriate ops->report_*() callback when the event occurs. The ++ * &struct utrace_engine_ops need not provide callbacks for an event ++ * unless the engine sets one of the associated event bits. ++ */ ++ ++#ifndef _LINUX_UTRACE_H ++#define _LINUX_UTRACE_H 1 ++ ++#include ++#include ++#include ++#include ++ ++struct linux_binprm; ++struct pt_regs; ++struct utrace; ++struct user_regset; ++struct user_regset_view; ++ ++/* ++ * Event bits passed to utrace_set_events(). ++ * These appear in &struct task_struct.@utrace_flags ++ * and &struct utrace_engine.@flags. ++ */ ++enum utrace_events { ++ _UTRACE_EVENT_QUIESCE, /* Thread is available for examination. */ ++ _UTRACE_EVENT_REAP, /* Zombie reaped, no more tracing possible. */ ++ _UTRACE_EVENT_CLONE, /* Successful clone/fork/vfork just done. */ ++ _UTRACE_EVENT_EXEC, /* Successful execve just completed. */ ++ _UTRACE_EVENT_EXIT, /* Thread exit in progress. */ ++ _UTRACE_EVENT_DEATH, /* Thread has died. */ ++ _UTRACE_EVENT_SYSCALL_ENTRY, /* User entered kernel for system call. */ ++ _UTRACE_EVENT_SYSCALL_EXIT, /* Returning to user after system call. */ ++ _UTRACE_EVENT_SIGNAL, /* Signal delivery will run a user handler. */ ++ _UTRACE_EVENT_SIGNAL_IGN, /* No-op signal to be delivered. */ ++ _UTRACE_EVENT_SIGNAL_STOP, /* Signal delivery will suspend. */ ++ _UTRACE_EVENT_SIGNAL_TERM, /* Signal delivery will terminate. */ ++ _UTRACE_EVENT_SIGNAL_CORE, /* Signal delivery will dump core. */ ++ _UTRACE_EVENT_JCTL, /* Job control stop or continue completed. */ ++ _UTRACE_NEVENTS ++}; ++#define UTRACE_EVENT(type) (1UL << _UTRACE_EVENT_##type) ++ ++/* ++ * All the kinds of signal events. ++ * These all use the @report_signal() callback. ++ */ ++#define UTRACE_EVENT_SIGNAL_ALL (UTRACE_EVENT(SIGNAL) \ ++ | UTRACE_EVENT(SIGNAL_IGN) \ ++ | UTRACE_EVENT(SIGNAL_STOP) \ ++ | UTRACE_EVENT(SIGNAL_TERM) \ ++ | UTRACE_EVENT(SIGNAL_CORE)) ++/* ++ * Both kinds of syscall events; these call the @report_syscall_entry() ++ * and @report_syscall_exit() callbacks, respectively. ++ */ ++#define UTRACE_EVENT_SYSCALL \ ++ (UTRACE_EVENT(SYSCALL_ENTRY) | UTRACE_EVENT(SYSCALL_EXIT)) ++ ++/* ++ * The event reports triggered synchronously by task death. ++ */ ++#define _UTRACE_DEATH_EVENTS (UTRACE_EVENT(DEATH) | UTRACE_EVENT(QUIESCE)) ++ ++/* ++ * Hooks in call these entry points to the ++ * utrace dispatch. They are weak references here only so ++ * tracehook.h doesn't need to #ifndef CONFIG_UTRACE them to ++ * avoid external references in case of unoptimized compilation. 
++ */ ++bool utrace_interrupt_pending(void) ++ __attribute__((weak)); ++void utrace_resume(struct task_struct *, struct pt_regs *) ++ __attribute__((weak)); ++int utrace_get_signal(struct task_struct *, struct pt_regs *, ++ siginfo_t *, struct k_sigaction *) ++ __attribute__((weak)); ++void utrace_report_clone(unsigned long, struct task_struct *) ++ __attribute__((weak)); ++void utrace_finish_vfork(struct task_struct *) ++ __attribute__((weak)); ++void utrace_report_exit(long *exit_code) ++ __attribute__((weak)); ++void utrace_report_death(struct task_struct *, struct utrace *, bool, int) ++ __attribute__((weak)); ++void utrace_report_jctl(int notify, int type) ++ __attribute__((weak)); ++void utrace_report_exec(struct linux_binfmt *, struct linux_binprm *, ++ struct pt_regs *regs) ++ __attribute__((weak)); ++bool utrace_report_syscall_entry(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_report_syscall_exit(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_signal_handler(struct task_struct *, int) ++ __attribute__((weak)); ++ ++#ifndef CONFIG_UTRACE ++ ++/* ++ * uses these accessors to avoid #ifdef CONFIG_UTRACE. ++ */ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return 0; ++} ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return NULL; ++} ++static inline void utrace_init_task(struct task_struct *child) ++{ ++} ++static inline void utrace_release_task(struct task_struct *task) ++{ ++} ++ ++static inline void task_utrace_proc_status(struct seq_file *m, ++ struct task_struct *p) ++{ ++} ++ ++#else /* CONFIG_UTRACE */ ++ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return task->utrace_flags; ++} ++ ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return &task->utrace; ++} ++ ++static inline void utrace_init_task(struct task_struct *task) ++{ ++ task->utrace_flags = 0; ++ memset(&task->utrace, 0, sizeof(task->utrace)); ++ INIT_LIST_HEAD(&task->utrace.attached); ++ INIT_LIST_HEAD(&task->utrace.attaching); ++ spin_lock_init(&task->utrace.lock); ++} ++ ++void utrace_release_task(struct task_struct *); ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p); ++ ++ ++/* ++ * Version number of the API defined in this file. This will change ++ * whenever a tracing engine's code would need some updates to keep ++ * working. We maintain this here for the benefit of tracing engine code ++ * that is developed concurrently with utrace API improvements before they ++ * are merged into the kernel, making LINUX_VERSION_CODE checks unwieldy. ++ */ ++#define UTRACE_API_VERSION 20090416 ++ ++/** ++ * enum utrace_resume_action - engine's choice of action for a traced task ++ * @UTRACE_STOP: Stay quiescent after callbacks. ++ * @UTRACE_REPORT: Make some callback soon. ++ * @UTRACE_INTERRUPT: Make @report_signal() callback soon. ++ * @UTRACE_SINGLESTEP: Resume in user mode for one instruction. ++ * @UTRACE_BLOCKSTEP: Resume in user mode until next branch. ++ * @UTRACE_RESUME: Resume normally in user mode. ++ * @UTRACE_DETACH: Detach my engine (implies %UTRACE_RESUME). ++ * ++ * See utrace_control() for detailed descriptions of each action. This is ++ * encoded in the @action argument and the return value for every callback ++ * with a &u32 return value. ++ * ++ * The order of these is important. When there is more than one engine, ++ * each supplies its choice and the smallest value prevails. 
++ */ ++enum utrace_resume_action { ++ UTRACE_STOP, ++ UTRACE_REPORT, ++ UTRACE_INTERRUPT, ++ UTRACE_SINGLESTEP, ++ UTRACE_BLOCKSTEP, ++ UTRACE_RESUME, ++ UTRACE_DETACH ++}; ++#define UTRACE_RESUME_MASK 0x0f ++ ++/** ++ * utrace_resume_action - &enum utrace_resume_action from callback action ++ * @action: &u32 callback @action argument or return value ++ * ++ * This extracts the &enum utrace_resume_action from @action, ++ * which is the @action argument to a &struct utrace_engine_ops ++ * callback or the return value from one. ++ */ ++static inline enum utrace_resume_action utrace_resume_action(u32 action) ++{ ++ return action & UTRACE_RESUME_MASK; ++} ++ ++/** ++ * enum utrace_signal_action - disposition of signal ++ * @UTRACE_SIGNAL_DELIVER: Deliver according to sigaction. ++ * @UTRACE_SIGNAL_IGN: Ignore the signal. ++ * @UTRACE_SIGNAL_TERM: Terminate the process. ++ * @UTRACE_SIGNAL_CORE: Terminate with core dump. ++ * @UTRACE_SIGNAL_STOP: Deliver as absolute stop. ++ * @UTRACE_SIGNAL_TSTP: Deliver as job control stop. ++ * @UTRACE_SIGNAL_REPORT: Reporting before pending signals. ++ * @UTRACE_SIGNAL_HANDLER: Reporting after signal handler setup. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_signal() callback. It says what will happen to the ++ * signal described by the &siginfo_t parameter to the callback. ++ * ++ * The %UTRACE_SIGNAL_REPORT value is used in an @action argument when ++ * a tracing report is being made before dequeuing any pending signal. ++ * If this is immediately after a signal handler has been set up, then ++ * %UTRACE_SIGNAL_HANDLER is used instead. A @report_signal callback ++ * that uses %UTRACE_SIGNAL_DELIVER|%UTRACE_SINGLESTEP will ensure ++ * it sees a %UTRACE_SIGNAL_HANDLER report. ++ */ ++enum utrace_signal_action { ++ UTRACE_SIGNAL_DELIVER = 0x00, ++ UTRACE_SIGNAL_IGN = 0x10, ++ UTRACE_SIGNAL_TERM = 0x20, ++ UTRACE_SIGNAL_CORE = 0x30, ++ UTRACE_SIGNAL_STOP = 0x40, ++ UTRACE_SIGNAL_TSTP = 0x50, ++ UTRACE_SIGNAL_REPORT = 0x60, ++ UTRACE_SIGNAL_HANDLER = 0x70 ++}; ++#define UTRACE_SIGNAL_MASK 0xf0 ++#define UTRACE_SIGNAL_HOLD 0x100 /* Flag, push signal back on queue. */ ++ ++/** ++ * utrace_signal_action - &enum utrace_signal_action from callback action ++ * @action: @report_signal callback @action argument or return value ++ * ++ * This extracts the &enum utrace_signal_action from @action, which ++ * is the @action argument to a @report_signal callback or the ++ * return value from one. ++ */ ++static inline enum utrace_signal_action utrace_signal_action(u32 action) ++{ ++ return action & UTRACE_SIGNAL_MASK; ++} ++ ++/** ++ * enum utrace_syscall_action - disposition of system call attempt ++ * @UTRACE_SYSCALL_RUN: Run the system call. ++ * @UTRACE_SYSCALL_ABORT: Don't run the system call. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_syscall_entry callback. ++ */ ++enum utrace_syscall_action { ++ UTRACE_SYSCALL_RUN = 0x00, ++ UTRACE_SYSCALL_ABORT = 0x10 ++}; ++#define UTRACE_SYSCALL_MASK 0xf0 ++ ++/** ++ * utrace_syscall_action - &enum utrace_syscall_action from callback action ++ * @action: @report_syscall_entry callback @action or return value ++ * ++ * This extracts the &enum utrace_syscall_action from @action, which ++ * is the @action argument to a @report_syscall_entry callback or the ++ * return value from one. 
++ */ ++static inline enum utrace_syscall_action utrace_syscall_action(u32 action) ++{ ++ return action & UTRACE_SYSCALL_MASK; ++} ++ ++/* ++ * Flags for utrace_attach_task() and utrace_attach_pid(). ++ */ ++#define UTRACE_ATTACH_CREATE 0x0010 /* Attach a new engine. */ ++#define UTRACE_ATTACH_EXCLUSIVE 0x0020 /* Refuse if existing match. */ ++#define UTRACE_ATTACH_MATCH_OPS 0x0001 /* Match engines on ops. */ ++#define UTRACE_ATTACH_MATCH_DATA 0x0002 /* Match engines on data. */ ++#define UTRACE_ATTACH_MATCH_MASK 0x000f ++ ++/** ++ * struct utrace_engine - per-engine structure ++ * @ops: &struct utrace_engine_ops pointer passed to utrace_attach_task() ++ * @data: engine-private &void * passed to utrace_attach_task() ++ * @flags: event mask set by utrace_set_events() plus internal flag bits ++ * ++ * The task itself never has to worry about engines detaching while ++ * it's doing event callbacks. These structures are removed from the ++ * task's active list only when it's stopped, or by the task itself. ++ * ++ * utrace_engine_get() and utrace_engine_put() maintain a reference count. ++ * When it drops to zero, the structure is freed. One reference is held ++ * implicitly while the engine is attached to its task. ++ */ ++struct utrace_engine { ++/* private: */ ++ struct kref kref; ++ struct list_head entry; ++ ++/* public: */ ++ const struct utrace_engine_ops *ops; ++ void *data; ++ ++ unsigned long flags; ++}; ++ ++/** ++ * utrace_engine_get - acquire a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you get another. ++ */ ++static inline void utrace_engine_get(struct utrace_engine *engine) ++{ ++ kref_get(&engine->kref); ++} ++ ++void __utrace_engine_release(struct kref *); ++ ++/** ++ * utrace_engine_put - release a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you lose that reference. ++ * If it was the last one, @engine becomes an invalid pointer. ++ */ ++static inline void utrace_engine_put(struct utrace_engine *engine) ++{ ++ kref_put(&engine->kref, __utrace_engine_release); ++} ++ ++/** ++ * struct utrace_engine_ops - tracing engine callbacks ++ * ++ * Each @report_*() callback corresponds to an %UTRACE_EVENT(*) bit. ++ * utrace_set_events() calls on @engine choose which callbacks will be made ++ * to @engine from @task. ++ * ++ * Most callbacks take an @action argument, giving the resume action ++ * chosen by other tracing engines. All callbacks take an @engine ++ * argument, and a @task argument, which is always equal to @current. ++ * For some calls, @action also includes bits specific to that event ++ * and utrace_resume_action() is used to extract the resume action. ++ * This shows what would happen if @engine wasn't there, or will if ++ * the callback's return value uses %UTRACE_RESUME. This always ++ * starts as %UTRACE_RESUME when no other tracing is being done on ++ * this task. ++ * ++ * All return values contain &enum utrace_resume_action bits. For ++ * some calls, other bits specific to that kind of event are added to ++ * the resume action bits with OR. These are the same bits used in ++ * the @action argument. The resume action returned by a callback ++ * does not override previous engines' choices, it only says what ++ * @engine wants done. What @task actually does is the action that's ++ * most constrained among the choices made by all attached engines. 
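++ *
++ * For illustration (a sketch, not a requirement): a @report_signal
++ * callback that returns
++ *
++ *	UTRACE_STOP | utrace_signal_action(action)
++ *
++ * keeps whatever signal disposition earlier engines chose while asking
++ * that @task stay quiescent; %UTRACE_STOP then prevails because it is
++ * the most constrained choice.
++ *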
++ * See utrace_control() for more information on the actions.
++ *
++ * When %UTRACE_STOP is used in @report_syscall_entry, then @task
++ * stops before attempting the system call.  In other cases, the
++ * resume action does not take effect until @task is ready to check
++ * for signals and return to user mode.  If there are more callbacks
++ * to be made, the last round of calls determines the final action.
++ * A @report_quiesce callback with @event zero, or a @report_signal
++ * callback, will always be the last one made before @task resumes.
++ * Only %UTRACE_STOP is "sticky"--if @engine returned %UTRACE_STOP
++ * then @task stays stopped unless @engine returns a different action
++ * from a following callback.
++ *
++ * The report_death() and report_reap() callbacks do not take @action
++ * arguments, and only %UTRACE_DETACH is meaningful in the return value
++ * from a report_death() callback.  None of the resume actions applies
++ * to a dead thread.
++ *
++ * All @report_*() hooks are called with no locks held, in a generally
++ * safe environment when we will be returning to user mode soon (or just
++ * entered the kernel).  It is fine to block for memory allocation and
++ * the like, but all hooks are asynchronous and must not block on
++ * external events!  If you want the thread to block, use %UTRACE_STOP
++ * in your hook's return value; then later wake it up with utrace_control().
++ *
++ * @report_quiesce:
++ *	Requested by %UTRACE_EVENT(%QUIESCE).
++ *	This does not indicate any event, but just that @task (the current
++ *	thread) is in a safe place for examination.  This call is made
++ *	before each specific event callback, except for @report_reap.
++ *	The @event argument gives the %UTRACE_EVENT(@which) value for
++ *	the event occurring.  This callback might be made for events @engine
++ *	has not requested, if some other engine is tracing the event;
++ *	a utrace_set_events() call here can request the immediate
++ *	callback for this occurrence of @event.  @event is zero when there
++ *	is no other event, @task is now ready to check for signals and
++ *	return to user mode, and some engine has used %UTRACE_REPORT or
++ *	%UTRACE_INTERRUPT to request this callback.  For this case,
++ *	if @report_signal is not %NULL, the @report_quiesce callback
++ *	may be replaced with a @report_signal callback passing
++ *	%UTRACE_SIGNAL_REPORT in its @action argument, whenever @task is
++ *	entering the signal-check path anyway.
++ *
++ * @report_signal:
++ *	Requested by %UTRACE_EVENT(%SIGNAL_*) or %UTRACE_EVENT(%QUIESCE).
++ *	Use utrace_signal_action() and utrace_resume_action() on @action.
++ *	The signal action is %UTRACE_SIGNAL_REPORT when some engine has
++ *	used %UTRACE_REPORT or %UTRACE_INTERRUPT; the callback can choose
++ *	to stop or to deliver an artificial signal, before pending signals.
++ *	It's %UTRACE_SIGNAL_HANDLER instead when signal handler setup just
++ *	finished (after a previous %UTRACE_SIGNAL_DELIVER return); this
++ *	serves in lieu of any %UTRACE_SIGNAL_REPORT callback requested by
++ *	%UTRACE_REPORT or %UTRACE_INTERRUPT, and is also implicitly
++ *	requested by %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP into the
++ *	signal delivery.  The other signal actions indicate a signal about
++ *	to be delivered; the previous engine's return value sets the signal
++ *	action seen by the following engine's callback.  The @info data
++ *	can be changed at will, including @info->si_signo.  The settings in
++ *	@return_ka determine what %UTRACE_SIGNAL_DELIVER does.  @orig_ka
++ *	is what was in force before other tracing engines intervened, and
++ *	it's %NULL when this report began as %UTRACE_SIGNAL_REPORT or
++ *	%UTRACE_SIGNAL_HANDLER.  For a report without a new signal, @info
++ *	is left uninitialized and must be set completely by an engine that
++ *	chooses to deliver a signal; if there was a previous @report_signal
++ *	callback ending in %UTRACE_STOP and it was just resumed using
++ *	%UTRACE_REPORT or %UTRACE_INTERRUPT, then @info is left unchanged
++ *	from the previous callback.  In this way, the original signal can
++ *	be left in @info while returning %UTRACE_STOP|%UTRACE_SIGNAL_IGN
++ *	and then found again when resuming @task with %UTRACE_INTERRUPT.
++ *	The %UTRACE_SIGNAL_HOLD flag bit can be OR'd into the return value,
++ *	and might be in @action if the previous engine returned it.  This
++ *	flag asks that the signal in @info be pushed back on @task's queue
++ *	so that it will be seen again after whatever action is taken now.
++ *
++ * @report_clone:
++ *	Requested by %UTRACE_EVENT(%CLONE).
++ *	Event reported for parent, before the new task @child might run.
++ *	@clone_flags gives the flags used in the clone system call,
++ *	or equivalent flags for a fork() or vfork() system call.
++ *	This function can use utrace_attach_task() on @child.  It's guaranteed
++ *	that asynchronous utrace_attach_task() calls will be ordered after
++ *	any calls in @report_clone callbacks for the parent.  Thus
++ *	when using %UTRACE_ATTACH_EXCLUSIVE in the asynchronous calls,
++ *	you can be sure that the parent's @report_clone callback has
++ *	already attached to @child or chosen not to.  Passing %UTRACE_STOP
++ *	to utrace_control() on @child here keeps the child stopped before
++ *	it ever runs in user mode; %UTRACE_REPORT or %UTRACE_INTERRUPT
++ *	ensures a callback from @child before it starts in user mode.
++ *
++ * @report_jctl:
++ *	Requested by %UTRACE_EVENT(%JCTL).
++ *	Job control event; @type is %CLD_STOPPED or %CLD_CONTINUED,
++ *	indicating whether we are stopping or resuming now.  If @notify
++ *	is nonzero, @task is the last thread to stop and so will send
++ *	%SIGCHLD to its parent after this callback; @notify reflects
++ *	what the parent's %SIGCHLD has in @si_code, which can sometimes
++ *	be %CLD_STOPPED even when @type is %CLD_CONTINUED.
++ *
++ * @report_exec:
++ *	Requested by %UTRACE_EVENT(%EXEC).
++ *	An execve system call has succeeded and the new program is about to
++ *	start running.  The initial user register state can be tweaked
++ *	directly in @regs.  @fmt and @bprm give the details of this exec.
++ *
++ * @report_syscall_entry:
++ *	Requested by %UTRACE_EVENT(%SYSCALL_ENTRY).
++ *	Thread has entered the kernel to request a system call.
++ *	The user register state can be tweaked directly in @regs.
++ *	The @action argument contains an &enum utrace_syscall_action;
++ *	use utrace_syscall_action() to extract it.  The return value
++ *	overrides the last engine's action for the system call.
++ *	If the final action is %UTRACE_SYSCALL_ABORT, no system call
++ *	is made.  The details of the system call being attempted can
++ *	be fetched here with syscall_get_nr() and syscall_get_arguments().
++ *	The parameter registers can be changed with syscall_set_arguments().
++ *
++ * @report_syscall_exit:
++ *	Requested by %UTRACE_EVENT(%SYSCALL_EXIT).
++ *	Thread is about to leave the kernel after a system call request.
++ *	The user register state can be tweaked directly in @regs.
++ * The results of the system call attempt can be examined here using ++ * syscall_get_error() and syscall_get_return_value(). It is safe ++ * here to call syscall_set_return_value() or syscall_rollback(). ++ * ++ * @report_exit: ++ * Requested by %UTRACE_EVENT(%EXIT). ++ * Thread is exiting and cannot be prevented from doing so, ++ * but all its state is still live. The @code value will be ++ * the wait result seen by the parent, and can be changed by ++ * this engine or others. The @orig_code value is the real ++ * status, not changed by any tracing engine. Returning %UTRACE_STOP ++ * here keeps @task stopped before it cleans up its state and dies, ++ * so it can be examined by other processes. When @task is allowed ++ * to run, it will die and get to the @report_death callback. ++ * ++ * @report_death: ++ * Requested by %UTRACE_EVENT(%DEATH). ++ * Thread is really dead now. It might be reaped by its parent at ++ * any time, or self-reap immediately. Though the actual reaping ++ * may happen in parallel, a report_reap() callback will always be ++ * ordered after a report_death() callback. ++ * ++ * @report_reap: ++ * Requested by %UTRACE_EVENT(%REAP). ++ * Called when someone reaps the dead task (parent, init, or self). ++ * This means the parent called wait, or else this was a detached ++ * thread or a process whose parent ignores SIGCHLD. ++ * No more callbacks are made after this one. ++ * The engine is always detached. ++ * There is nothing more a tracing engine can do about this thread. ++ * After this callback, the @engine pointer will become invalid. ++ * The @task pointer may become invalid if get_task_struct() hasn't ++ * been used to keep it alive. ++ * An engine should always request this callback if it stores the ++ * @engine pointer or stores any pointer in @engine->data, so it ++ * can clean up its data structures. ++ * Unlike other callbacks, this can be called from the parent's context ++ * rather than from the traced thread itself--it must not delay the ++ * parent by blocking. 
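++ *
++ * As a sketch (the callback names are hypothetical, shown only for
++ * illustration), an engine normally fills in just the callbacks it
++ * needs and leaves the rest %NULL:
++ *
++ *	static const struct utrace_engine_ops my_engine_ops = {
++ *		.report_syscall_entry	= my_syscall_entry,
++ *		.report_exit		= my_exit,
++ *		.report_reap		= my_reap,
++ *	};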
++ */ ++struct utrace_engine_ops { ++ u32 (*report_quiesce)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event); ++ u32 (*report_signal)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs, ++ siginfo_t *info, ++ const struct k_sigaction *orig_ka, ++ struct k_sigaction *return_ka); ++ u32 (*report_clone)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *parent, ++ unsigned long clone_flags, ++ struct task_struct *child); ++ u32 (*report_jctl)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ int type, int notify); ++ u32 (*report_exec)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ const struct linux_binfmt *fmt, ++ const struct linux_binprm *bprm, ++ struct pt_regs *regs); ++ u32 (*report_syscall_entry)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_syscall_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ long orig_code, long *code); ++ u32 (*report_death)(struct utrace_engine *engine, ++ struct task_struct *task, ++ bool group_dead, int signal); ++ void (*report_reap)(struct utrace_engine *engine, ++ struct task_struct *task); ++}; ++ ++/** ++ * struct utrace_examiner - private state for using utrace_prepare_examine() ++ * ++ * The members of &struct utrace_examiner are private to the implementation. ++ * This data type holds the state from a call to utrace_prepare_examine() ++ * to be used by a call to utrace_finish_examine(). ++ */ ++struct utrace_examiner { ++/* private: */ ++ long state; ++ unsigned long ncsw; ++}; ++ ++/* ++ * These are the exported entry points for tracing engines to use. ++ * See kernel/utrace.c for their kerneldoc comments with interface details. ++ */ ++struct utrace_engine *utrace_attach_task(struct task_struct *, int, ++ const struct utrace_engine_ops *, ++ void *); ++struct utrace_engine *utrace_attach_pid(struct pid *, int, ++ const struct utrace_engine_ops *, ++ void *); ++int __must_check utrace_control(struct task_struct *, ++ struct utrace_engine *, ++ enum utrace_resume_action); ++int __must_check utrace_set_events(struct task_struct *, ++ struct utrace_engine *, ++ unsigned long eventmask); ++int __must_check utrace_barrier(struct task_struct *, ++ struct utrace_engine *); ++int __must_check utrace_prepare_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++int __must_check utrace_finish_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++ ++/** ++ * utrace_control_pid - control a thread being traced by a tracing engine ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is the same as utrace_control(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. 
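++ *
++ * A sketch of typical use (hypothetical caller; my_engine_ops is the
++ * kind of ops table shown above, and error handling is abbreviated):
++ *
++ *	engine = utrace_attach_pid(pid, UTRACE_ATTACH_CREATE,
++ *				   &my_engine_ops, NULL);
++ *	if (!IS_ERR(engine))
++ *		ret = utrace_control_pid(pid, engine, UTRACE_REPORT);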
++ */ ++static inline __must_check int utrace_control_pid( ++ struct pid *pid, struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ /* ++ * We don't bother with rcu_read_lock() here to protect the ++ * task_struct pointer, because utrace_control will return ++ * -ESRCH without looking at that pointer if the engine is ++ * already detached. A task_struct pointer can't die before ++ * all the engines are detached in release_task() first. ++ */ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_control(task, engine, action); ++} ++ ++/** ++ * utrace_set_events_pid - choose which event reports a tracing engine gets ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @eventmask: new event mask ++ * ++ * This is the same as utrace_set_events(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_set_events_pid( ++ struct pid *pid, struct utrace_engine *engine, unsigned long eventmask) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : ++ utrace_set_events(task, engine, eventmask); ++} ++ ++/** ++ * utrace_barrier_pid - synchronize with simultaneous tracing callbacks ++ * @pid: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This is the same as utrace_barrier(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_barrier_pid(struct pid *pid, ++ struct utrace_engine *engine) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_barrier(task, engine); ++} ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace.h */ +diff --git a/include/linux/utrace_struct.h b/include/linux/utrace_struct.h +new file mode 100644 +index 0000000..aba7e09 +--- /dev/null ++++ b/include/linux/utrace_struct.h +@@ -0,0 +1,58 @@ ++/* ++ * 'struct utrace' data structure for kernel/utrace.c private use. ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ */ ++ ++#ifndef _LINUX_UTRACE_STRUCT_H ++#define _LINUX_UTRACE_STRUCT_H 1 ++ ++#ifdef CONFIG_UTRACE ++ ++#include ++#include ++ ++/* ++ * Per-thread structure private to utrace implementation. This properly ++ * belongs in kernel/utrace.c and its use is entirely private to the code ++ * there. It is only defined in a header file so that it can be embedded ++ * in the struct task_struct layout. It is here rather than in utrace.h ++ * to avoid header nesting order issues getting too complex. 
++ * ++ */ ++struct utrace { ++ struct task_struct *cloning; ++ ++ struct list_head attached, attaching; ++ spinlock_t lock; ++ ++ struct utrace_engine *reporting; ++ ++ unsigned int stopped:1; ++ unsigned int report:1; ++ unsigned int interrupt:1; ++ unsigned int signal_handler:1; ++ unsigned int vfork_stop:1; /* need utrace_stop() before vfork wait */ ++ unsigned int death:1; /* in utrace_report_death() now */ ++ unsigned int reap:1; /* release_task() has run */ ++}; ++ ++# define INIT_UTRACE(tsk) \ ++ .utrace_flags = 0, \ ++ .utrace = { \ ++ .lock = __SPIN_LOCK_UNLOCKED(tsk.utrace.lock), \ ++ .attached = LIST_HEAD_INIT(tsk.utrace.attached), \ ++ .attaching = LIST_HEAD_INIT(tsk.utrace.attaching), \ ++ }, ++ ++#else ++ ++# define INIT_UTRACE(tsk) /* Nothing. */ ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace_struct.h */ +diff --git a/init/Kconfig b/init/Kconfig +index 1ce05a4..f720929 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1191,6 +1191,15 @@ config STOP_MACHINE + help + Need stop_machine() primitive. + ++menuconfig UTRACE ++ bool "Infrastructure for tracing and debugging user processes" ++ depends on EXPERIMENTAL ++ depends on HAVE_ARCH_TRACEHOOK ++ help ++ Enable the utrace process tracing interface. This is an internal ++ kernel interface exported to kernel modules, to track events in ++ user threads, extract and change user thread state. ++ + source "block/Kconfig" + + config PREEMPT_NOTIFIERS +diff --git a/kernel/Makefile b/kernel/Makefile +index 780c8dc..cd16d49 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -69,6 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o + obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o + obj-$(CONFIG_STOP_MACHINE) += stop_machine.o + obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o ++obj-$(CONFIG_UTRACE) += utrace.o + obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o + obj-$(CONFIG_AUDITSYSCALL) += auditsc.o + obj-$(CONFIG_GCOV_KERNEL) += gcov/ +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 61c78b2..935eeee 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -164,6 +165,14 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) + return !err; + } + ++/* ++ * For experimental use of utrace, exclude ptrace on the same task. ++ */ ++static inline bool exclude_ptrace(struct task_struct *task) ++{ ++ return unlikely(!!task_utrace_flags(task)); ++} ++ + int ptrace_attach(struct task_struct *task) + { + int retval; +@@ -186,6 +195,13 @@ int ptrace_attach(struct task_struct *task) + goto out; + + task_lock(task); ++ ++ if (exclude_ptrace(task)) { ++ retval = -EBUSY; ++ task_unlock(task); ++ goto unlock_creds; ++ } ++ + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + task_unlock(task); + if (retval) +@@ -226,7 +242,9 @@ int ptrace_traceme(void) + + write_lock_irq(&tasklist_lock); + /* Are we already being traced? 
*/ +- if (!current->ptrace) { ++ if (exclude_ptrace(current)) { ++ ret = -EBUSY; ++ } else if (!current->ptrace) { + ret = security_ptrace_traceme(current->parent); + /* + * Check PF_EXITING to ensure ->real_parent has not passed +@@ -577,7 +595,17 @@ int ptrace_request(struct task_struct *child, long request, + return ret; + } + +-static struct task_struct *ptrace_get_task_struct(pid_t pid) ++/** ++ * ptrace_get_task_struct -- grab a task struct reference for ptrace ++ * @pid: process id to grab a task_struct reference of ++ * ++ * This function is a helper for ptrace implementations. It checks ++ * permissions and then grabs a task struct for use of the actual ++ * ptrace implementation. ++ * ++ * Returns the task_struct for @pid or an ERR_PTR() on failure. ++ */ ++struct task_struct *ptrace_get_task_struct(pid_t pid) + { + struct task_struct *child; + +diff --git a/kernel/utrace.c b/kernel/utrace.c +new file mode 100644 +index 0000000..74b5fc5 +--- /dev/null ++++ b/kernel/utrace.c +@@ -0,0 +1,2357 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Rules for 'struct utrace', defined in ++ * but used entirely privately in this file. ++ * ++ * The common event reporting loops are done by the task making the ++ * report without ever taking any locks. To facilitate this, the two ++ * lists @attached and @attaching work together for smooth asynchronous ++ * attaching with low overhead. Modifying either list requires @lock. ++ * The @attaching list can be modified any time while holding @lock. ++ * New engines being attached always go on this list. ++ * ++ * The @attached list is what the task itself uses for its reporting ++ * loops. When the task itself is not quiescent, it can use the ++ * @attached list without taking any lock. Nobody may modify the list ++ * when the task is not quiescent. When it is quiescent, that means ++ * that it won't run again without taking @lock itself before using ++ * the list. ++ * ++ * At each place where we know the task is quiescent (or it's current), ++ * while holding @lock, we call splice_attaching(), below. This moves ++ * the @attaching list members on to the end of the @attached list. ++ * Since this happens at the start of any reporting pass, any new ++ * engines attached asynchronously go on the stable @attached list ++ * in time to have their callbacks seen. ++ */ ++ ++static struct kmem_cache *utrace_engine_cachep; ++static const struct utrace_engine_ops utrace_detached_ops; /* forward decl */ ++ ++static int __init utrace_init(void) ++{ ++ utrace_engine_cachep = KMEM_CACHE(utrace_engine, SLAB_PANIC); ++ return 0; ++} ++module_init(utrace_init); ++ ++/* ++ * This is called with @utrace->lock held when the task is safely ++ * quiescent, i.e. it won't consult utrace->attached without the lock. ++ * Move any engines attached asynchronously from @utrace->attaching ++ * onto the @utrace->attached list. 
++ */ ++static void splice_attaching(struct utrace *utrace) ++{ ++ list_splice_tail_init(&utrace->attaching, &utrace->attached); ++} ++ ++/* ++ * This is the exported function used by the utrace_engine_put() inline. ++ */ ++void __utrace_engine_release(struct kref *kref) ++{ ++ struct utrace_engine *engine = container_of(kref, struct utrace_engine, ++ kref); ++ BUG_ON(!list_empty(&engine->entry)); ++ kmem_cache_free(utrace_engine_cachep, engine); ++} ++EXPORT_SYMBOL_GPL(__utrace_engine_release); ++ ++static bool engine_matches(struct utrace_engine *engine, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ if ((flags & UTRACE_ATTACH_MATCH_OPS) && engine->ops != ops) ++ return false; ++ if ((flags & UTRACE_ATTACH_MATCH_DATA) && engine->data != data) ++ return false; ++ return engine->ops && engine->ops != &utrace_detached_ops; ++} ++ ++static struct utrace_engine *matching_engine( ++ struct utrace *utrace, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine; ++ list_for_each_entry(engine, &utrace->attached, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ list_for_each_entry(engine, &utrace->attaching, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ return NULL; ++} ++ ++/* ++ * For experimental use, utrace attach is mutually exclusive with ptrace. ++ */ ++static inline bool exclude_utrace(struct task_struct *task) ++{ ++ return unlikely(!!task->ptrace); ++} ++ ++/* ++ * Called without locks, when we might be the first utrace engine to attach. ++ * If this is a newborn thread and we are not the creator, we have to wait ++ * for it. The creator gets the first chance to attach. The PF_STARTING ++ * flag is cleared after its report_clone hook has had a chance to run. ++ */ ++static inline int utrace_attach_delay(struct task_struct *target) ++{ ++ if ((target->flags & PF_STARTING) && ++ current->utrace.cloning != target) ++ do { ++ schedule_timeout_interruptible(1); ++ if (signal_pending(current)) ++ return -ERESTARTNOINTR; ++ } while (target->flags & PF_STARTING); ++ ++ return 0; ++} ++ ++/* ++ * Enqueue @engine, or maybe don't if UTRACE_ATTACH_EXCLUSIVE. ++ */ ++static int utrace_add_engine(struct task_struct *target, ++ struct utrace *utrace, ++ struct utrace_engine *engine, ++ int flags, ++ const struct utrace_engine_ops *ops, ++ void *data) ++{ ++ int ret; ++ ++ spin_lock(&utrace->lock); ++ ++ if (utrace->reap) { ++ /* ++ * Already entered utrace_release_task(), cannot attach now. ++ */ ++ ret = -ESRCH; ++ } else if ((flags & UTRACE_ATTACH_EXCLUSIVE) && ++ unlikely(matching_engine(utrace, flags, ops, data))) { ++ ret = -EEXIST; ++ } else { ++ /* ++ * Put the new engine on the pending ->attaching list. ++ * Make sure it gets onto the ->attached list by the next ++ * time it's examined. ++ * ++ * When target == current, it would be safe just to call ++ * splice_attaching() right here. But if we're inside a ++ * callback, that would mean the new engine also gets ++ * notified about the event that precipitated its own ++ * creation. This is not what the user wants. ++ * ++ * Setting ->report ensures that start_report() takes the ++ * lock and does it next time. Whenever setting ->report, ++ * we must maintain the invariant that TIF_NOTIFY_RESUME is ++ * also set. Otherwise utrace_control() or utrace_do_stop() ++ * might skip setting TIF_NOTIFY_RESUME upon seeing ->report ++ * already set, and we'd miss a necessary callback. 
++ * ++ * In case we had no engines before, make sure that ++ * utrace_flags is not zero when tracehook_notify_resume() ++ * checks. That would bypass utrace reporting clearing ++ * TIF_NOTIFY_RESUME, and thus violate the same invariant. ++ */ ++ target->utrace_flags |= UTRACE_EVENT(REAP); ++ list_add_tail(&engine->entry, &utrace->attaching); ++ utrace->report = 1; ++ set_notify_resume(target); ++ ++ ret = 0; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++ ++/** ++ * utrace_attach_task - attach new engine, or look up an attached engine ++ * @target: thread to attach to ++ * @flags: flag bits combined with OR, see below ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * The caller must ensure that the @target thread does not get freed, ++ * i.e. hold a ref or be its parent. It is always safe to call this ++ * on @current, or on the @child pointer in a @report_clone callback. ++ * For most other cases, it's easier to use utrace_attach_pid() instead. ++ * ++ * UTRACE_ATTACH_CREATE: ++ * Create a new engine. If %UTRACE_ATTACH_CREATE is not specified, you ++ * only look up an existing engine already attached to the thread. ++ * ++ * UTRACE_ATTACH_EXCLUSIVE: ++ * Attempting to attach a second (matching) engine fails with -%EEXIST. ++ * ++ * UTRACE_ATTACH_MATCH_OPS: Only consider engines matching @ops. ++ * UTRACE_ATTACH_MATCH_DATA: Only consider engines matching @data. ++ * ++ * Calls with neither %UTRACE_ATTACH_MATCH_OPS nor %UTRACE_ATTACH_MATCH_DATA ++ * match the first among any engines attached to @target. That means that ++ * %UTRACE_ATTACH_EXCLUSIVE in such a call fails with -%EEXIST if there ++ * are any engines on @target at all. ++ */ ++struct utrace_engine *utrace_attach_task( ++ struct task_struct *target, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace *utrace; ++ struct utrace_engine *engine; ++ int ret; ++ ++ utrace = &target->utrace; ++ ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * The target has already been reaped. ++ * Check this early, though it's not synchronized. ++ * utrace_add_engine() will do the final check. ++ */ ++ if (!(flags & UTRACE_ATTACH_CREATE)) ++ return ERR_PTR(-ENOENT); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (!(flags & UTRACE_ATTACH_CREATE)) { ++ spin_lock(&utrace->lock); ++ engine = matching_engine(utrace, flags, ops, data); ++ if (engine) ++ utrace_engine_get(engine); ++ spin_unlock(&utrace->lock); ++ return engine ?: ERR_PTR(-ENOENT); ++ } ++ ++ if (unlikely(!ops) || unlikely(ops == &utrace_detached_ops)) ++ return ERR_PTR(-EINVAL); ++ ++ if (unlikely(target->flags & PF_KTHREAD)) ++ /* ++ * Silly kernel, utrace is for users! ++ */ ++ return ERR_PTR(-EPERM); ++ ++ engine = kmem_cache_alloc(utrace_engine_cachep, GFP_KERNEL); ++ if (unlikely(!engine)) ++ return ERR_PTR(-ENOMEM); ++ ++ /* ++ * Initialize the new engine structure. It starts out with two ++ * refs: one ref to return, and one ref for being attached. 
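++ * (For clarity: the caller is expected to drop its ref later with
++ * utrace_engine_put(); the attachment ref is dropped when a detached
++ * engine is finally removed from the list.)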
++ */ ++ kref_set(&engine->kref, 2); ++ engine->flags = 0; ++ engine->ops = ops; ++ engine->data = data; ++ ++ ret = utrace_attach_delay(target); ++ if (likely(!ret)) ++ ret = utrace_add_engine(target, utrace, engine, ++ flags, ops, data); ++ ++ if (unlikely(ret)) { ++ kmem_cache_free(utrace_engine_cachep, engine); ++ engine = ERR_PTR(ret); ++ } ++ ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_task); ++ ++/** ++ * utrace_attach_pid - attach new engine, or look up an attached engine ++ * @pid: &struct pid pointer representing thread to attach to ++ * @flags: flag bits combined with OR, see utrace_attach_task() ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * This is the same as utrace_attach_task(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++struct utrace_engine *utrace_attach_pid( ++ struct pid *pid, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine = ERR_PTR(-ESRCH); ++ struct task_struct *task = get_pid_task(pid, PIDTYPE_PID); ++ if (task) { ++ engine = utrace_attach_task(task, flags, ops, data); ++ put_task_struct(task); ++ } ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_pid); ++ ++/* ++ * When an engine is detached, the target thread may still see it and ++ * make callbacks until it quiesces. We install a special ops vector ++ * with these two callbacks. When the target thread quiesces, it can ++ * safely free the engine itself. For any event we will always get ++ * the report_quiesce() callback first, so we only need this one ++ * pointer to be set. The only exception is report_reap(), so we ++ * supply that callback too. ++ */ ++static u32 utrace_detached_quiesce(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event) ++{ ++ return UTRACE_DETACH; ++} ++ ++static void utrace_detached_reap(struct utrace_engine *engine, ++ struct task_struct *task) ++{ ++} ++ ++static const struct utrace_engine_ops utrace_detached_ops = { ++ .report_quiesce = &utrace_detached_quiesce, ++ .report_reap = &utrace_detached_reap ++}; ++ ++/* ++ * After waking up from TASK_TRACED, clear bookkeeping in @utrace. ++ * Returns true if we were woken up prematurely by SIGKILL. ++ */ ++static inline bool finish_utrace_stop(struct task_struct *task, ++ struct utrace *utrace) ++{ ++ bool killed = false; ++ ++ /* ++ * utrace_wakeup() clears @utrace->stopped before waking us up. ++ * We're officially awake if it's clear. ++ */ ++ spin_lock(&utrace->lock); ++ if (unlikely(utrace->stopped)) { ++ /* ++ * If we're here with it still set, it must have been ++ * signal_wake_up() instead, waking us up for a SIGKILL. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ WARN_ON(!sigismember(&task->pending.signal, SIGKILL)); ++ spin_unlock_irq(&task->sighand->siglock); ++ utrace->stopped = 0; ++ killed = true; ++ } ++ spin_unlock(&utrace->lock); ++ ++ return killed; ++} ++ ++/* ++ * Perform %UTRACE_STOP, i.e. block in TASK_TRACED until woken up. ++ * @task == current, @utrace == current->utrace, which is not locked. ++ * Return true if we were woken up by SIGKILL even though some utrace ++ * engine may still want us to stay stopped. 
++ */ ++static bool utrace_stop(struct task_struct *task, struct utrace *utrace, ++ bool report) ++{ ++ bool killed; ++ ++ /* ++ * @utrace->stopped is the flag that says we are safely ++ * inside this function. It should never be set on entry. ++ */ ++ BUG_ON(utrace->stopped); ++ ++ /* ++ * The siglock protects us against signals. As well as SIGKILL ++ * waking us up, we must synchronize with the signal bookkeeping ++ * for stop signals and SIGCONT. ++ */ ++ spin_lock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (unlikely(sigismember(&task->pending.signal, SIGKILL))) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ return true; ++ } ++ ++ if (report) { ++ /* ++ * Ensure a reporting pass when we're resumed. ++ */ ++ utrace->report = 1; ++ set_thread_flag(TIF_NOTIFY_RESUME); ++ } ++ ++ utrace->stopped = 1; ++ __set_current_state(TASK_TRACED); ++ ++ /* ++ * If there is a group stop in progress, ++ * we must participate in the bookkeeping. ++ */ ++ if (task->signal->group_stop_count > 0) ++ --task->signal->group_stop_count; ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ ++ schedule(); ++ ++ /* ++ * While in TASK_TRACED, we were considered "frozen enough". ++ * Now that we woke up, it's crucial if we're supposed to be ++ * frozen that we freeze now before running anything substantial. ++ */ ++ try_to_freeze(); ++ ++ killed = finish_utrace_stop(task, utrace); ++ ++ /* ++ * While we were in TASK_TRACED, complete_signal() considered ++ * us "uninterested" in signal wakeups. Now make sure our ++ * TIF_SIGPENDING state is correct for normal running. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ return killed; ++} ++ ++/* ++ * The caller has to hold a ref on the engine. If the attached flag is ++ * true (all but utrace_barrier() calls), the engine is supposed to be ++ * attached. If the attached flag is false (utrace_barrier() only), ++ * then return -ERESTARTSYS for an engine marked for detach but not yet ++ * fully detached. The task pointer can be invalid if the engine is ++ * detached. ++ * ++ * Get the utrace lock for the target task. ++ * Returns the struct if locked, or ERR_PTR(-errno). ++ * ++ * This has to be robust against races with: ++ * utrace_control(target, UTRACE_DETACH) calls ++ * UTRACE_DETACH after reports ++ * utrace_report_death ++ * utrace_release_task ++ */ ++static struct utrace *get_utrace_lock(struct task_struct *target, ++ struct utrace_engine *engine, ++ bool attached) ++ __acquires(utrace->lock) ++{ ++ struct utrace *utrace; ++ ++ rcu_read_lock(); ++ ++ /* ++ * If this engine was already detached, bail out before we look at ++ * the task_struct pointer at all. If it's detached after this ++ * check, then RCU is still keeping this task_struct pointer valid. ++ * ++ * The ops pointer is NULL when the engine is fully detached. ++ * It's &utrace_detached_ops when it's marked detached but still ++ * on the list. In the latter case, utrace_barrier() still works, ++ * since the target might be in the middle of an old callback. ++ */ ++ if (unlikely(!engine->ops)) { ++ rcu_read_unlock(); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (unlikely(engine->ops == &utrace_detached_ops)) { ++ rcu_read_unlock(); ++ return attached ? ERR_PTR(-ESRCH) : ERR_PTR(-ERESTARTSYS); ++ } ++ ++ utrace = &target->utrace; ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * If all engines detached already, utrace is clear. 
++ * Otherwise, we're called after utrace_release_task might ++ * have started. A call to this engine's report_reap ++ * callback might already be in progress. ++ */ ++ utrace = ERR_PTR(-ESRCH); ++ } else { ++ spin_lock(&utrace->lock); ++ if (unlikely(!engine->ops) || ++ unlikely(engine->ops == &utrace_detached_ops)) { ++ /* ++ * By the time we got the utrace lock, ++ * it had been reaped or detached already. ++ */ ++ spin_unlock(&utrace->lock); ++ utrace = ERR_PTR(-ESRCH); ++ if (!attached && engine->ops == &utrace_detached_ops) ++ utrace = ERR_PTR(-ERESTARTSYS); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return utrace; ++} ++ ++/* ++ * Now that we don't hold any locks, run through any ++ * detached engines and free their references. Each ++ * engine had one implicit ref while it was attached. ++ */ ++static void put_detached_list(struct list_head *list) ++{ ++ struct utrace_engine *engine, *next; ++ list_for_each_entry_safe(engine, next, list, entry) { ++ list_del_init(&engine->entry); ++ utrace_engine_put(engine); ++ } ++} ++ ++/* ++ * Called with utrace->lock held. ++ * Notify and clean up all engines, then free utrace. ++ */ ++static void utrace_reap(struct task_struct *target, struct utrace *utrace) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ const struct utrace_engine_ops *ops; ++ LIST_HEAD(detached); ++ ++restart: ++ splice_attaching(utrace); ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ ops = engine->ops; ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ ++ /* ++ * If it didn't need a callback, we don't need to drop ++ * the lock. Now nothing else refers to this engine. ++ */ ++ if (!(engine->flags & UTRACE_EVENT(REAP))) ++ continue; ++ ++ /* ++ * This synchronizes with utrace_barrier(). Since we ++ * need the utrace->lock here anyway (unlike the other ++ * reporting loops), we don't need any memory barrier ++ * as utrace_barrier() holds the lock. ++ */ ++ utrace->reporting = engine; ++ spin_unlock(&utrace->lock); ++ ++ (*ops->report_reap)(engine, target); ++ ++ utrace->reporting = NULL; ++ ++ put_detached_list(&detached); ++ ++ spin_lock(&utrace->lock); ++ goto restart; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * Called by release_task. After this, target->utrace must be cleared. ++ */ ++void utrace_release_task(struct task_struct *target) ++{ ++ struct utrace *utrace; ++ ++ utrace = &target->utrace; ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->reap = 1; ++ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) { ++ utrace_reap(target, utrace); /* Unlocks and frees. */ ++ return; ++ } ++ ++ /* ++ * The target will do some final callbacks but hasn't ++ * finished them yet. We know because it clears these ++ * event bits after it's done. Instead of cleaning up here ++ * and requiring utrace_report_death to cope with it, we ++ * delay the REAP report and the teardown until after the ++ * target finishes its death reports. ++ */ ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/* ++ * We use an extra bit in utrace_engine.flags past the event bits, ++ * to record whether the engine is keeping the target thread stopped. 
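++ * (For illustration: the low bits are the UTRACE_EVENT() values an
++ * engine passes to utrace_set_events(), e.g. a mask like
++ * UTRACE_EVENT(SYSCALL_ENTRY) | UTRACE_EVENT(EXIT), while ENGINE_STOP
++ * sits just above them at bit _UTRACE_NEVENTS.)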
++ */ ++#define ENGINE_STOP (1UL << _UTRACE_NEVENTS) ++ ++static void mark_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags |= ENGINE_STOP; ++} ++ ++static void clear_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags &= ~ENGINE_STOP; ++} ++ ++static bool engine_wants_stop(struct utrace_engine *engine) ++{ ++ return (engine->flags & ENGINE_STOP) != 0; ++} ++ ++/** ++ * utrace_set_events - choose which event reports a tracing engine gets ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @events: new event mask ++ * ++ * This changes the set of events for which @engine wants callbacks made. ++ * ++ * This fails with -%EALREADY and does nothing if you try to clear ++ * %UTRACE_EVENT(%DEATH) when the @report_death callback may already have ++ * begun, if you try to clear %UTRACE_EVENT(%REAP) when the @report_reap ++ * callback may already have begun, or if you try to newly set ++ * %UTRACE_EVENT(%DEATH) or %UTRACE_EVENT(%QUIESCE) when @target is ++ * already dead or dying. ++ * ++ * This can fail with -%ESRCH when @target has already been detached, ++ * including forcible detach on reaping. ++ * ++ * If @target was stopped before the call, then after a successful call, ++ * no event callbacks not requested in @events will be made; if ++ * %UTRACE_EVENT(%QUIESCE) is included in @events, then a @report_quiesce ++ * callback will be made when @target resumes. If @target was not stopped, ++ * and was about to make a callback to @engine, this returns -%EINPROGRESS. ++ * In this case, the callback in progress might be one excluded from the ++ * new @events setting. When this returns zero, you can be sure that no ++ * event callbacks you've disabled in @events can be made. ++ * ++ * To synchronize after an -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * When @target is @current, -%EINPROGRESS is not returned. But ++ * note that a newly-created engine will not receive any callbacks ++ * related to an event notification already in progress. This call ++ * enables @events callbacks to be made as soon as @engine becomes ++ * eligible for any callbacks, see utrace_attach_task(). ++ * ++ * These rules provide for coherent synchronization based on %UTRACE_STOP, ++ * even when %SIGKILL is breaking its normal simple rules. ++ */ ++int utrace_set_events(struct task_struct *target, ++ struct utrace_engine *engine, ++ unsigned long events) ++{ ++ struct utrace *utrace; ++ unsigned long old_flags, old_utrace_flags, set_utrace_flags; ++ int ret; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ old_utrace_flags = target->utrace_flags; ++ set_utrace_flags = events; ++ old_flags = engine->flags; ++ ++ if (target->exit_state && ++ (((events & ~old_flags) & _UTRACE_DEATH_EVENTS) || ++ (utrace->death && ++ ((old_flags & ~events) & _UTRACE_DEATH_EVENTS)) || ++ (utrace->reap && ((old_flags & ~events) & UTRACE_EVENT(REAP))))) { ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ ++ /* ++ * When setting these flags, it's essential that we really ++ * synchronize with exit_notify(). They cannot be set after ++ * exit_notify() takes the tasklist_lock. By holding the read ++ * lock here while setting the flags, we ensure that the calls ++ * to tracehook_notify_death() and tracehook_report_death() will ++ * see the new flags. This ensures that utrace_release_task() ++ * knows positively that utrace_report_death() will be called or ++ * that it won't. 
++ */ ++ if ((set_utrace_flags & ~old_utrace_flags) & _UTRACE_DEATH_EVENTS) { ++ read_lock(&tasklist_lock); ++ if (unlikely(target->exit_state)) { ++ read_unlock(&tasklist_lock); ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ target->utrace_flags |= set_utrace_flags; ++ read_unlock(&tasklist_lock); ++ } ++ ++ engine->flags = events | (engine->flags & ENGINE_STOP); ++ target->utrace_flags |= set_utrace_flags; ++ ++ if ((set_utrace_flags & UTRACE_EVENT_SYSCALL) && ++ !(old_utrace_flags & UTRACE_EVENT_SYSCALL)) ++ set_tsk_thread_flag(target, TIF_SYSCALL_TRACE); ++ ++ ret = 0; ++ if (!utrace->stopped && target != current) { ++ /* ++ * This barrier ensures that our engine->flags changes ++ * have hit before we examine utrace->reporting, ++ * pairing with the barrier in start_callback(). If ++ * @target has not yet hit finish_callback() to clear ++ * utrace->reporting, we might be in the middle of a ++ * callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_set_events); ++ ++/* ++ * Asynchronously mark an engine as being detached. ++ * ++ * This must work while the target thread races with us doing ++ * start_callback(), defined below. It uses smp_rmb() between checking ++ * @engine->flags and using @engine->ops. Here we change @engine->ops ++ * first, then use smp_wmb() before changing @engine->flags. This ensures ++ * it can check the old flags before using the old ops, or check the old ++ * flags before using the new ops, or check the new flags before using the ++ * new ops, but can never check the new flags before using the old ops. ++ * Hence, utrace_detached_ops might be used with any old flags in place. ++ * It has report_quiesce() and report_reap() callbacks to handle all cases. ++ */ ++static void mark_engine_detached(struct utrace_engine *engine) ++{ ++ engine->ops = &utrace_detached_ops; ++ smp_wmb(); ++ engine->flags = UTRACE_EVENT(QUIESCE); ++} ++ ++/* ++ * Get @target to stop and return true if it is already stopped now. ++ * If we return false, it will make some event callback soonish. ++ * Called with @utrace locked. ++ */ ++static bool utrace_do_stop(struct task_struct *target, struct utrace *utrace) ++{ ++ bool stopped = false; ++ ++ spin_lock_irq(&target->sighand->siglock); ++ if (unlikely(target->exit_state)) { ++ /* ++ * On the exit path, it's only truly quiescent ++ * if it has already been through ++ * utrace_report_death(), or never will. ++ */ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) ++ utrace->stopped = stopped = true; ++ } else if (task_is_stopped(target)) { ++ /* ++ * Stopped is considered quiescent; when it wakes up, it will ++ * go through utrace_get_signal() before doing anything else. ++ */ ++ utrace->stopped = stopped = true; ++ } else if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ spin_unlock_irq(&target->sighand->siglock); ++ ++ return stopped; ++} ++ ++/* ++ * If the target is not dead it should not be in tracing ++ * stop any more. Wake it unless it's in job control stop. ++ * ++ * Called with @utrace->lock held and @utrace->stopped set. 
++ */ ++static void utrace_wakeup(struct task_struct *target, struct utrace *utrace) ++{ ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ ++ utrace->stopped = 0; ++ ++ sighand = lock_task_sighand(target, &irqflags); ++ if (unlikely(!sighand)) ++ return; ++ ++ if (likely(task_is_stopped_or_traced(target))) { ++ if (target->signal->flags & SIGNAL_STOP_STOPPED) ++ target->state = TASK_STOPPED; ++ else ++ wake_up_state(target, __TASK_STOPPED | __TASK_TRACED); ++ } ++ ++ unlock_task_sighand(target, &irqflags); ++} ++ ++/* ++ * This is called when there might be some detached engines on the list or ++ * some stale bits in @task->utrace_flags. Clean them up and recompute the ++ * flags. ++ * ++ * @action is NULL when @task is stopped and @utrace->stopped is set; wake ++ * it up if it should not be. @action is set when @task is current; if ++ * we're fully detached, reset *@action to UTRACE_RESUME. ++ * ++ * Called with @utrace->lock held, returns with it released. ++ * After this returns, @utrace might be freed if everything detached. ++ */ ++static void utrace_reset(struct task_struct *task, struct utrace *utrace, ++ enum utrace_resume_action *action) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ unsigned long flags = 0; ++ LIST_HEAD(detached); ++ bool wake = !action; ++ BUG_ON(wake != (task != current)); ++ ++ splice_attaching(utrace); ++ ++ /* ++ * Update the set of events of interest from the union ++ * of the interests of the remaining tracing engines. ++ * For any engine marked detached, remove it from the list. ++ * We'll collect them on the detached list. ++ */ ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ if (engine->ops == &utrace_detached_ops) { ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ } else { ++ flags |= engine->flags | UTRACE_EVENT(REAP); ++ wake = wake && !engine_wants_stop(engine); ++ } ++ } ++ ++ if (task->exit_state) { ++ /* ++ * Once it's already dead, we never install any flags ++ * except REAP. When ->exit_state is set and events ++ * like DEATH are not set, then they never can be set. ++ * This ensures that utrace_release_task() knows ++ * positively that utrace_report_death() can never run. ++ */ ++ BUG_ON(utrace->death); ++ flags &= UTRACE_EVENT(REAP); ++ wake = false; ++ } else if (!(flags & UTRACE_EVENT_SYSCALL) && ++ test_tsk_thread_flag(task, TIF_SYSCALL_TRACE)) { ++ clear_tsk_thread_flag(task, TIF_SYSCALL_TRACE); ++ } ++ ++ task->utrace_flags = flags; ++ ++ if (wake) ++ utrace_wakeup(task, utrace); ++ ++ /* ++ * If any engines are left, we're done. ++ */ ++ spin_unlock(&utrace->lock); ++ if (!flags) { ++ /* ++ * No more engines, cleared out the utrace. ++ */ ++ ++ if (action) ++ *action = UTRACE_RESUME; ++ } ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * You can't do anything to a dead task but detach it. ++ * If release_task() has been called, you can't do that. ++ * ++ * On the exit path, DEATH and QUIESCE event bits are set only ++ * before utrace_report_death() has taken the lock. At that point, ++ * the death report will come soon, so disallow detach until it's ++ * done. This prevents us from racing with it detaching itself. ++ * ++ * Called with utrace->lock held, when @target->exit_state is nonzero. 
++ */ ++static inline int utrace_control_dead(struct task_struct *target, ++ struct utrace *utrace, ++ enum utrace_resume_action action) ++{ ++ if (action != UTRACE_DETACH || unlikely(utrace->reap)) ++ return -ESRCH; ++ ++ if (unlikely(utrace->death)) ++ /* ++ * We have already started the death report. We can't ++ * prevent the report_death and report_reap callbacks, ++ * so tell the caller they will happen. ++ */ ++ return -EALREADY; ++ ++ return 0; ++} ++ ++/** ++ * utrace_control - control a thread being traced by a tracing engine ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is how a tracing engine asks a traced thread to do something. ++ * This call is controlled by the @action argument, which has the ++ * same meaning as the &enum utrace_resume_action value returned by ++ * event reporting callbacks. ++ * ++ * If @target is already dead (@target->exit_state nonzero), ++ * all actions except %UTRACE_DETACH fail with -%ESRCH. ++ * ++ * The following sections describe each option for the @action argument. ++ * ++ * UTRACE_DETACH: ++ * ++ * After this, the @engine data structure is no longer accessible, ++ * and the thread might be reaped. The thread will start running ++ * again if it was stopped and no longer has any attached engines ++ * that want it stopped. ++ * ++ * If the @report_reap callback may already have begun, this fails ++ * with -%ESRCH. If the @report_death callback may already have ++ * begun, this fails with -%EALREADY. ++ * ++ * If @target is not already stopped, then a callback to this engine ++ * might be in progress or about to start on another CPU. If so, ++ * then this returns -%EINPROGRESS; the detach happens as soon as ++ * the pending callback is finished. To synchronize after an ++ * -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * If @target is properly stopped before utrace_control() is called, ++ * then after successful return it's guaranteed that no more callbacks ++ * to the @engine->ops vector will be made. ++ * ++ * The only exception is %SIGKILL (and exec or group-exit by another ++ * thread in the group), which can cause asynchronous @report_death ++ * and/or @report_reap callbacks even when %UTRACE_STOP was used. ++ * (In that event, this fails with -%ESRCH or -%EALREADY, see above.) ++ * ++ * UTRACE_STOP: ++ * This asks that @target stop running. This returns 0 only if ++ * @target is already stopped, either for tracing or for job ++ * control. Then @target will remain stopped until another ++ * utrace_control() call is made on @engine; @target can be woken ++ * only by %SIGKILL (or equivalent, such as exec or termination by ++ * another thread in the same thread group). ++ * ++ * This returns -%EINPROGRESS if @target is not already stopped. ++ * Then the effect is like %UTRACE_REPORT. A @report_quiesce or ++ * @report_signal callback will be made soon. Your callback can ++ * then return %UTRACE_STOP to keep @target stopped. ++ * ++ * This does not interrupt system calls in progress, including ones ++ * that sleep for a long time. For that, use %UTRACE_INTERRUPT. ++ * To interrupt system calls and then keep @target stopped, your ++ * @report_signal callback can return %UTRACE_STOP. ++ * ++ * UTRACE_RESUME: ++ * ++ * Just let @target continue running normally, reversing the effect ++ * of a previous %UTRACE_STOP. If another engine is keeping @target ++ * stopped, then it remains stopped until all engines let it resume. 
++ * If @target was not stopped, this has no effect. ++ * ++ * UTRACE_REPORT: ++ * ++ * This is like %UTRACE_RESUME, but also ensures that there will be ++ * a @report_quiesce or @report_signal callback made soon. If ++ * @target had been stopped, then there will be a callback before it ++ * resumes running normally. If another engine is keeping @target ++ * stopped, then there might be no callbacks until all engines let ++ * it resume. ++ * ++ * UTRACE_INTERRUPT: ++ * ++ * This is like %UTRACE_REPORT, but ensures that @target will make a ++ * @report_signal callback before it resumes or delivers signals. ++ * If @target was in a system call or about to enter one, work in ++ * progress will be interrupted as if by %SIGSTOP. If another ++ * engine is keeping @target stopped, then there might be no ++ * callbacks until all engines let it resume. ++ * ++ * This gives @engine an opportunity to introduce a forced signal ++ * disposition via its @report_signal callback. ++ * ++ * UTRACE_SINGLESTEP: ++ * ++ * It's invalid to use this unless arch_has_single_step() returned true. ++ * This is like %UTRACE_RESUME, but resumes for one user instruction ++ * only. It's invalid to use this in utrace_control() unless @target ++ * had been stopped by @engine previously. ++ * ++ * Note that passing %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP to ++ * utrace_control() or returning it from an event callback alone does ++ * not necessarily ensure that stepping will be enabled. If there are ++ * more callbacks made to any engine before returning to user mode, ++ * then the resume action is chosen only by the last set of callbacks. ++ * To be sure, enable %UTRACE_EVENT(%QUIESCE) and look for the ++ * @report_quiesce callback with a zero event mask, or the ++ * @report_signal callback with %UTRACE_SIGNAL_REPORT. ++ * ++ * UTRACE_BLOCKSTEP: ++ * ++ * It's invalid to use this unless arch_has_block_step() returned true. ++ * This is like %UTRACE_SINGLESTEP, but resumes for one whole basic ++ * block of user instructions. ++ * ++ * %UTRACE_BLOCKSTEP devolves to %UTRACE_SINGLESTEP when another ++ * tracing engine is using %UTRACE_SINGLESTEP at the same time. ++ */ ++int utrace_control(struct task_struct *target, ++ struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ struct utrace *utrace; ++ bool resume; ++ int ret; ++ ++ if (unlikely(action > UTRACE_DETACH)) ++ return -EINVAL; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ if (target->exit_state) { ++ ret = utrace_control_dead(target, utrace, action); ++ if (ret) { ++ spin_unlock(&utrace->lock); ++ return ret; ++ } ++ } ++ ++ resume = utrace->stopped; ++ ret = 0; ++ ++ clear_engine_wants_stop(engine); ++ switch (action) { ++ case UTRACE_STOP: ++ mark_engine_wants_stop(engine); ++ if (!resume && !utrace_do_stop(target, utrace)) ++ ret = -EINPROGRESS; ++ resume = false; ++ break; ++ ++ case UTRACE_DETACH: ++ mark_engine_detached(engine); ++ resume = resume || utrace_do_stop(target, utrace); ++ if (!resume) { ++ /* ++ * As in utrace_set_events(), this barrier ensures ++ * that our engine->flags changes have hit before we ++ * examine utrace->reporting, pairing with the barrier ++ * in start_callback(). If @target has not yet hit ++ * finish_callback() to clear utrace->reporting, we ++ * might be in the middle of a callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ break; ++ } ++ /* Fall through. 
*/ ++ ++ case UTRACE_RESUME: ++ /* ++ * This and all other cases imply resuming if stopped. ++ * There might not be another report before it just ++ * resumes, so make sure single-step is not left set. ++ */ ++ if (likely(resume)) ++ user_disable_single_step(target); ++ break; ++ ++ case UTRACE_REPORT: ++ /* ++ * Make the thread call tracehook_notify_resume() soon. ++ * But don't bother if it's already been interrupted. ++ * In that case, utrace_get_signal() will be reporting soon. ++ */ ++ if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ break; ++ ++ case UTRACE_INTERRUPT: ++ /* ++ * Make the thread call tracehook_get_signal() soon. ++ */ ++ if (utrace->interrupt) ++ break; ++ utrace->interrupt = 1; ++ ++ /* ++ * If it's not already stopped, interrupt it now. ++ * We need the siglock here in case it calls ++ * recalc_sigpending() and clears its own ++ * TIF_SIGPENDING. By taking the lock, we've ++ * serialized any later recalc_sigpending() after ++ * our setting of utrace->interrupt to force it on. ++ */ ++ if (resume) { ++ /* ++ * This is really just to keep the invariant ++ * that TIF_SIGPENDING is set with utrace->interrupt. ++ * When it's stopped, we know it's always going ++ * through utrace_get_signal and will recalculate. ++ */ ++ set_tsk_thread_flag(target, TIF_SIGPENDING); ++ } else { ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ sighand = lock_task_sighand(target, &irqflags); ++ if (likely(sighand)) { ++ signal_wake_up(target, 0); ++ unlock_task_sighand(target, &irqflags); ++ } ++ } ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ /* ++ * Resume from stopped, step one block. ++ */ ++ if (unlikely(!arch_has_block_step())) { ++ WARN_ON(1); ++ /* Fall through to treat it as SINGLESTEP. */ ++ } else if (likely(resume)) { ++ user_enable_block_step(target); ++ break; ++ } ++ ++ case UTRACE_SINGLESTEP: ++ /* ++ * Resume from stopped, step one instruction. ++ */ ++ if (unlikely(!arch_has_single_step())) { ++ WARN_ON(1); ++ resume = false; ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (likely(resume)) ++ user_enable_single_step(target); ++ else ++ /* ++ * You were supposed to stop it before asking ++ * it to step. ++ */ ++ ret = -EAGAIN; ++ break; ++ } ++ ++ /* ++ * Let the thread resume running. If it's not stopped now, ++ * there is nothing more we need to do. ++ */ ++ if (resume) ++ utrace_reset(target, utrace, NULL); ++ else ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_control); ++ ++/** ++ * utrace_barrier - synchronize with simultaneous tracing callbacks ++ * @target: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This blocks while @target might be in the midst of making a callback to ++ * @engine. It can be interrupted by signals and will return -%ERESTARTSYS. ++ * A return value of zero means no callback from @target to @engine was ++ * in progress. Any effect of its return value (such as %UTRACE_STOP) has ++ * already been applied to @engine. ++ * ++ * It's not necessary to keep the @target pointer alive for this call. ++ * It's only necessary to hold a ref on @engine. This will return ++ * safely even if @target has been reaped and has no task refs. ++ * ++ * A successful return from utrace_barrier() guarantees its ordering ++ * with respect to utrace_set_events() and utrace_control() calls. 
If ++ * @target was not properly stopped, event callbacks just disabled might ++ * still be in progress; utrace_barrier() waits until there is no chance ++ * an unwanted callback can be in progress. ++ */ ++int utrace_barrier(struct task_struct *target, struct utrace_engine *engine) ++{ ++ struct utrace *utrace; ++ int ret = -ERESTARTSYS; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ do { ++ utrace = get_utrace_lock(target, engine, false); ++ if (unlikely(IS_ERR(utrace))) { ++ ret = PTR_ERR(utrace); ++ if (ret != -ERESTARTSYS) ++ break; ++ } else { ++ /* ++ * All engine state changes are done while ++ * holding the lock, i.e. before we get here. ++ * Since we have the lock, we only need to ++ * worry about @target making a callback. ++ * When it has entered start_callback() but ++ * not yet gotten to finish_callback(), we ++ * will see utrace->reporting == @engine. ++ * When @target doesn't take the lock, it uses ++ * barriers to order setting utrace->reporting ++ * before it examines the engine state. ++ */ ++ if (utrace->reporting != engine) ++ ret = 0; ++ spin_unlock(&utrace->lock); ++ if (!ret) ++ break; ++ } ++ schedule_timeout_interruptible(1); ++ } while (!signal_pending(current)); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_barrier); ++ ++/* ++ * This is local state used for reporting loops, perhaps optimized away. ++ */ ++struct utrace_report { ++ enum utrace_resume_action action; ++ u32 result; ++ bool detaches; ++ bool reports; ++ bool takers; ++ bool killed; ++}; ++ ++#define INIT_REPORT(var) \ ++ struct utrace_report var = { UTRACE_RESUME, 0, \ ++ false, false, false, false } ++ ++/* ++ * We are now making the report, so clear the flag saying we need one. ++ */ ++static void start_report(struct utrace *utrace) ++{ ++ BUG_ON(utrace->stopped); ++ if (utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * Complete a normal reporting pass, pairing with a start_report() call. ++ * This handles any UTRACE_DETACH or UTRACE_REPORT or UTRACE_INTERRUPT ++ * returns from engine callbacks. If any engine's last callback used ++ * UTRACE_STOP, we do UTRACE_REPORT here to ensure we stop before user ++ * mode. If there were no callbacks made, it will recompute ++ * @task->utrace_flags to avoid another false-positive. ++ */ ++static void finish_report(struct utrace_report *report, ++ struct task_struct *task, struct utrace *utrace) ++{ ++ bool clean = (report->takers && !report->detaches); ++ ++ if (report->action <= UTRACE_REPORT && !utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 1; ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } else if (report->action == UTRACE_INTERRUPT && !utrace->interrupt) { ++ spin_lock(&utrace->lock); ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else if (clean) { ++ return; ++ } else { ++ spin_lock(&utrace->lock); ++ } ++ ++ if (clean) ++ spin_unlock(&utrace->lock); ++ else ++ utrace_reset(task, utrace, &report->action); ++} ++ ++/* ++ * Apply the return value of one engine callback to @report. ++ * Returns true if @engine detached and should not get any more callbacks. ++ */ ++static bool finish_callback(struct utrace *utrace, ++ struct utrace_report *report, ++ struct utrace_engine *engine, ++ u32 ret) ++{ ++ enum utrace_resume_action action = utrace_resume_action(ret); ++ ++ report->result = ret & ~UTRACE_RESUME_MASK; ++ ++ /* ++ * If utrace_control() was used, treat that like UTRACE_DETACH here. 
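For illustration only (this helper is not part of the patch), the detach pattern spelled out in the utrace_control() and utrace_barrier() kernel-doc above looks like the sketch below from a tracing engine's point of view. It uses only the calls documented above; the engine pointer is assumed to have come from utrace_attach_task(), the include path assumes the utrace header that accompanies this code, and the example_ name is invented.

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/utrace.h>

/* Hypothetical engine-side helper, illustrative only. */
static int example_detach(struct task_struct *target,
                          struct utrace_engine *engine)
{
        int ret = utrace_control(target, engine, UTRACE_DETACH);

        switch (ret) {
        case -EINPROGRESS:
                /*
                 * A callback to this engine may be in progress on another
                 * CPU; the detach happens as soon as it finishes.  Wait
                 * for that here (utrace_barrier() can return -ERESTARTSYS
                 * if we are interrupted by a signal).
                 */
                ret = utrace_barrier(target, engine);
                break;
        case -ESRCH:            /* report_reap may already have begun */
        case -EALREADY:         /* report_death may already have begun */
                /* Those callbacks will still be delivered. */
                break;
        }

        return ret;
}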
++ */ ++ if (action == UTRACE_DETACH || engine->ops == &utrace_detached_ops) { ++ engine->ops = &utrace_detached_ops; ++ report->detaches = true; ++ } else { ++ if (action < report->action) ++ report->action = action; ++ ++ if (action == UTRACE_STOP) { ++ if (!engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ mark_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } else { ++ if (action == UTRACE_REPORT) ++ report->reports = true; ++ ++ if (engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ clear_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } ++ } ++ ++ /* ++ * Now that we have applied the effect of the return value, ++ * clear this so that utrace_barrier() can stop waiting. ++ * A subsequent utrace_control() can stop or resume @engine ++ * and know this was ordered after its callback's action. ++ * ++ * We don't need any barriers here because utrace_barrier() ++ * takes utrace->lock. If we touched engine->flags above, ++ * the lock guaranteed this change was before utrace_barrier() ++ * examined utrace->reporting. ++ */ ++ utrace->reporting = NULL; ++ ++ /* ++ * This is a good place to make sure tracing engines don't ++ * introduce too much latency under voluntary preemption. ++ */ ++ if (need_resched()) ++ cond_resched(); ++ ++ return engine->ops == &utrace_detached_ops; ++} ++ ++/* ++ * Start the callbacks for @engine to consider @event (a bit mask). ++ * This makes the report_quiesce() callback first. If @engine wants ++ * a specific callback for @event, we return the ops vector to use. ++ * If not, we return NULL. The return value from the ops->callback ++ * function called should be passed to finish_callback(). ++ */ ++static const struct utrace_engine_ops *start_callback( ++ struct utrace *utrace, struct utrace_report *report, ++ struct utrace_engine *engine, struct task_struct *task, ++ unsigned long event) ++{ ++ const struct utrace_engine_ops *ops; ++ unsigned long want; ++ ++ /* ++ * This barrier ensures that we've set utrace->reporting before ++ * we examine engine->flags or engine->ops. utrace_barrier() ++ * relies on this ordering to indicate that the effect of any ++ * utrace_control() and utrace_set_events() calls is in place ++ * by the time utrace->reporting can be seen to be NULL. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(). ++ * It makes sure that we never see the old ops vector with ++ * the new flags, in case the original vector had no report_quiesce. ++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if (want & UTRACE_EVENT(QUIESCE)) { ++ if (finish_callback(utrace, report, engine, ++ (*ops->report_quiesce)(report->action, ++ engine, task, ++ event))) ++ return NULL; ++ ++ /* ++ * finish_callback() reset utrace->reporting after the ++ * quiesce callback. Now we set it again (as above) ++ * before re-examining engine->flags, which could have ++ * been changed synchronously by ->report_quiesce or ++ * asynchronously by utrace_control() or utrace_set_events(). ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ want = engine->flags; ++ } ++ ++ if (want & ENGINE_STOP) ++ report->action = UTRACE_STOP; ++ ++ if (want & event) { ++ report->takers = true; ++ return ops; ++ } ++ ++ utrace->reporting = NULL; ++ return NULL; ++} ++ ++/* ++ * Do a normal reporting pass for engines interested in @event. 
++ * @callback is the name of the member in the ops vector, and remaining ++ * args are the extras it takes after the standard three args. ++ */ ++#define REPORT(task, utrace, report, event, callback, ...) \ ++ do { \ ++ start_report(utrace); \ ++ REPORT_CALLBACKS(, task, utrace, report, event, callback, \ ++ (report)->action, engine, current, \ ++ ## __VA_ARGS__); \ ++ finish_report(report, task, utrace); \ ++ } while (0) ++#define REPORT_CALLBACKS(rev, task, utrace, report, event, callback, ...) \ ++ do { \ ++ struct utrace_engine *engine; \ ++ const struct utrace_engine_ops *ops; \ ++ list_for_each_entry##rev(engine, &utrace->attached, entry) { \ ++ ops = start_callback(utrace, report, engine, task, \ ++ event); \ ++ if (!ops) \ ++ continue; \ ++ finish_callback(utrace, report, engine, \ ++ (*ops->callback)(__VA_ARGS__)); \ ++ } \ ++ } while (0) ++ ++/* ++ * Called iff UTRACE_EVENT(EXEC) flag is set. ++ */ ++void utrace_report_exec(struct linux_binfmt *fmt, struct linux_binprm *bprm, ++ struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXEC), ++ report_exec, fmt, bprm, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_ENTRY) flag is set. ++ * Return true to prevent the system call. ++ */ ++bool utrace_report_syscall_entry(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ start_report(utrace); ++ REPORT_CALLBACKS(_reverse, task, utrace, &report, ++ UTRACE_EVENT(SYSCALL_ENTRY), report_syscall_entry, ++ report.result | report.action, engine, current, regs); ++ finish_report(&report, task, utrace); ++ ++ if (report.action == UTRACE_STOP && ++ unlikely(utrace_stop(task, utrace, false))) ++ /* ++ * We are continuing despite UTRACE_STOP because of a ++ * SIGKILL. Don't let the system call actually proceed. ++ */ ++ return true; ++ ++ return report.result == UTRACE_SYSCALL_ABORT; ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_EXIT) flag is set. ++ */ ++void utrace_report_syscall_exit(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(SYSCALL_EXIT), ++ report_syscall_exit, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(CLONE) flag is set. ++ * This notification call blocks the wake_up_new_task call on the child. ++ * So we must not quiesce here. tracehook_report_clone_complete will do ++ * a quiescence check momentarily. ++ */ ++void utrace_report_clone(unsigned long clone_flags, struct task_struct *child) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ /* ++ * We don't use the REPORT() macro here, because we need ++ * to clear utrace->cloning before finish_report(). ++ * After finish_report(), utrace can be a stale pointer ++ * in cases when report.action is still UTRACE_RESUME. ++ */ ++ start_report(utrace); ++ utrace->cloning = child; ++ ++ REPORT_CALLBACKS(, task, utrace, &report, ++ UTRACE_EVENT(CLONE), report_clone, ++ report.action, engine, task, clone_flags, child); ++ ++ utrace->cloning = NULL; ++ finish_report(&report, task, utrace); ++ ++ /* ++ * For a vfork, we will go into an uninterruptible block waiting ++ * for the child. We need UTRACE_STOP to happen before this, not ++ * after. 
For CLONE_VFORK, utrace_finish_vfork() will be called. ++ */ ++ if (report.action == UTRACE_STOP && (clone_flags & CLONE_VFORK)) { ++ spin_lock(&utrace->lock); ++ utrace->vfork_stop = 1; ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * We're called after utrace_report_clone() for a CLONE_VFORK. ++ * If UTRACE_STOP was left from the clone report, we stop here. ++ * After this, we'll enter the uninterruptible wait_for_completion() ++ * waiting for the child. ++ */ ++void utrace_finish_vfork(struct task_struct *task) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ if (!utrace->vfork_stop) ++ spin_unlock(&utrace->lock); ++ else { ++ utrace->vfork_stop = 0; ++ spin_unlock(&utrace->lock); ++ utrace_stop(task, utrace, false); ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(JCTL) flag is set. ++ * ++ * Called with siglock held. ++ */ ++void utrace_report_jctl(int notify, int what) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ bool stop = task_is_stopped(task); ++ ++ /* ++ * We have to come out of TASK_STOPPED in case the event report ++ * hooks might block. Since we held the siglock throughout, it's ++ * as if we were never in TASK_STOPPED yet at all. ++ */ ++ if (stop) { ++ __set_current_state(TASK_RUNNING); ++ task->signal->flags &= ~SIGNAL_STOP_STOPPED; ++ ++task->signal->group_stop_count; ++ } ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ /* ++ * We get here with CLD_STOPPED when we've just entered ++ * TASK_STOPPED, or with CLD_CONTINUED when we've just come ++ * out but not yet been through utrace_get_signal() again. ++ * ++ * While in TASK_STOPPED, we can be considered safely ++ * stopped by utrace_do_stop() and detached asynchronously. ++ * If we woke up and checked task->utrace_flags before that ++ * was finished, we might be here with utrace already ++ * removed or in the middle of being removed. ++ * ++ * If we are indeed attached, then make sure we are no ++ * longer considered stopped while we run callbacks. ++ */ ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ /* ++ * Do start_report()'s work too since we already have the lock anyway. ++ */ ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(JCTL), ++ report_jctl, what, notify); ++ ++ /* ++ * Retake the lock, and go back into TASK_STOPPED ++ * unless the stop was just cleared. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ if (stop && task->signal->group_stop_count > 0) { ++ __set_current_state(TASK_STOPPED); ++ if (--task->signal->group_stop_count == 0) ++ task->signal->flags |= SIGNAL_STOP_STOPPED; ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(EXIT) flag is set. ++ */ ++void utrace_report_exit(long *exit_code) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ long orig_code = *exit_code; ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXIT), ++ report_exit, orig_code, exit_code); ++ ++ if (report.action == UTRACE_STOP) ++ utrace_stop(task, utrace, false); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(DEATH) or UTRACE_EVENT(QUIESCE) flag is set. ++ * ++ * It is always possible that we are racing with utrace_release_task here. ++ * For this reason, utrace_release_task checks for the event bits that get ++ * us here, and delays its cleanup for us to do. 
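The CLONE_VFORK handling above is driven entirely by an engine returning UTRACE_STOP from its clone report. As a hypothetical illustration (not part of the patch), such a callback could look like the sketch below; the report_clone member name and argument order follow the REPORT_CALLBACKS() invocation in utrace_report_clone() above, and everything prefixed example_ is invented.

#include <linux/sched.h>
#include <linux/utrace.h>

/*
 * Illustrative only: ask to stop the parent after a vfork-style clone,
 * so it stops in utrace_finish_vfork() before blocking on the child.
 */
static u32 example_report_clone(enum utrace_resume_action action,
                                struct utrace_engine *engine,
                                struct task_struct *parent,
                                unsigned long clone_flags,
                                struct task_struct *child)
{
        if (clone_flags & CLONE_VFORK)
                return UTRACE_STOP;

        return UTRACE_RESUME;
}

static const struct utrace_engine_ops example_clone_ops = {
        .report_clone = example_report_clone,
};

To receive this callback at all, the engine must have UTRACE_EVENT(CLONE) set in its event mask via utrace_set_events(), since utrace_report_clone() is called only when that flag is set.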
++ */ ++void utrace_report_death(struct task_struct *task, struct utrace *utrace, ++ bool group_dead, int signal) ++{ ++ INIT_REPORT(report); ++ ++ BUG_ON(!task->exit_state); ++ ++ /* ++ * We are presently considered "quiescent"--which is accurate ++ * inasmuch as we won't run any more user instructions ever again. ++ * But for utrace_control and utrace_set_events to be robust, they ++ * must be sure whether or not we will run any more callbacks. If ++ * a call comes in before we do, taking the lock here synchronizes ++ * us so we don't run any callbacks just disabled. Calls that come ++ * in while we're running the callbacks will see the exit.death ++ * flag and know that we are not yet fully quiescent for purposes ++ * of detach bookkeeping. ++ */ ++ spin_lock(&utrace->lock); ++ BUG_ON(utrace->death); ++ utrace->death = 1; ++ utrace->report = 0; ++ utrace->interrupt = 0; ++ spin_unlock(&utrace->lock); ++ ++ REPORT_CALLBACKS(, task, utrace, &report, UTRACE_EVENT(DEATH), ++ report_death, engine, task, group_dead, signal); ++ ++ spin_lock(&utrace->lock); ++ ++ /* ++ * After we unlock (possibly inside utrace_reap for callbacks) with ++ * this flag clear, competing utrace_control/utrace_set_events calls ++ * know that we've finished our callbacks and any detach bookkeeping. ++ */ ++ utrace->death = 0; ++ ++ if (utrace->reap) ++ /* ++ * utrace_release_task() was already called in parallel. ++ * We must complete its work now. ++ */ ++ utrace_reap(task, utrace); ++ else ++ utrace_reset(task, utrace, &report.action); ++} ++ ++/* ++ * Finish the last reporting pass before returning to user mode. ++ */ ++static void finish_resume_report(struct utrace_report *report, ++ struct task_struct *task, ++ struct utrace *utrace) ++{ ++ if (report->detaches || !report->takers) { ++ spin_lock(&utrace->lock); ++ utrace_reset(task, utrace, &report->action); ++ } ++ ++ switch (report->action) { ++ case UTRACE_STOP: ++ report->killed = utrace_stop(task, utrace, report->reports); ++ break; ++ ++ case UTRACE_INTERRUPT: ++ if (!signal_pending(task)) ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ if (likely(arch_has_block_step())) { ++ user_enable_block_step(task); ++ break; ++ } ++ ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_block_step() itself. Warn and ++ * then fall through to treat it as SINGLESTEP. ++ */ ++ WARN_ON(1); ++ ++ case UTRACE_SINGLESTEP: ++ if (likely(arch_has_single_step())) ++ user_enable_single_step(task); ++ else ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_single_step() itself. Spew ++ * about it so the loser will fix his module. ++ */ ++ WARN_ON(1); ++ break; ++ ++ case UTRACE_REPORT: ++ case UTRACE_RESUME: ++ default: ++ user_disable_single_step(task); ++ break; ++ } ++} ++ ++/* ++ * This is called when TIF_NOTIFY_RESUME had been set (and is now clear). ++ * We are close to user mode, and this is the place to report or stop. ++ * When we return, we're going to user mode or into the signals code. ++ */ ++void utrace_resume(struct task_struct *task, struct pt_regs *regs) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ ++ /* ++ * Some machines get here with interrupts disabled. The same arch ++ * code path leads to calling into get_signal_to_deliver(), which ++ * implicitly reenables them by virtue of spin_unlock_irq. 
++ */ ++ local_irq_enable(); ++ ++ /* ++ * If this flag is still set it's because there was a signal ++ * handler setup done but no report_signal following it. Clear ++ * the flag before we get to user so it doesn't confuse us later. ++ */ ++ if (unlikely(utrace->signal_handler)) { ++ int skip; ++ spin_lock(&utrace->lock); ++ utrace->signal_handler = 0; ++ skip = !utrace->report; ++ spin_unlock(&utrace->lock); ++ if (skip) ++ return; ++ } ++ ++ /* ++ * If UTRACE_INTERRUPT was just used, we don't bother with a report ++ * here. We will report and stop in utrace_get_signal(). In case ++ * of a race with utrace_control(), make sure we don't momentarily ++ * return to user mode because TIF_SIGPENDING was not set yet. ++ */ ++ if (unlikely(utrace->interrupt)) { ++ set_thread_flag(TIF_SIGPENDING); ++ return; ++ } ++ ++ /* ++ * Do a simple reporting pass, with no callback after report_quiesce. ++ */ ++ start_report(utrace); ++ ++ list_for_each_entry(engine, &utrace->attached, entry) ++ start_callback(utrace, &report, engine, task, 0); ++ ++ /* ++ * Finish the report and either stop or get ready to resume. ++ */ ++ finish_resume_report(&report, task, utrace); ++} ++ ++/* ++ * Return true if current has forced signal_pending(). ++ * ++ * This is called only when current->utrace_flags is nonzero, so we know ++ * that current->utrace must be set. It's not inlined in tracehook.h ++ * just so that struct utrace can stay opaque outside this file. ++ */ ++bool utrace_interrupt_pending(void) ++{ ++ return task_utrace_struct(current)->interrupt; ++} ++ ++/* ++ * Take the siglock and push @info back on our queue. ++ * Returns with @task->sighand->siglock held. ++ */ ++static void push_back_signal(struct task_struct *task, siginfo_t *info) ++ __acquires(task->sighand->siglock) ++{ ++ struct sigqueue *q; ++ ++ if (unlikely(!info->si_signo)) { /* Oh, a wise guy! */ ++ spin_lock_irq(&task->sighand->siglock); ++ return; ++ } ++ ++ q = sigqueue_alloc(); ++ if (likely(q)) { ++ q->flags = 0; ++ copy_siginfo(&q->info, info); ++ } ++ ++ spin_lock_irq(&task->sighand->siglock); ++ ++ sigaddset(&task->pending.signal, info->si_signo); ++ if (likely(q)) ++ list_add(&q->list, &task->pending.list); ++ ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++} ++ ++/* ++ * This is the hook from the signals code, called with the siglock held. ++ * Here is the ideal place to stop. We also dequeue and intercept signals. ++ */ ++int utrace_get_signal(struct task_struct *task, struct pt_regs *regs, ++ siginfo_t *info, struct k_sigaction *return_ka) ++ __releases(task->sighand->siglock) ++ __acquires(task->sighand->siglock) ++{ ++ struct utrace *utrace; ++ struct k_sigaction *ka; ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ const struct utrace_engine_ops *ops; ++ unsigned long event, want; ++ u32 ret; ++ int signr; ++ ++ utrace = &task->utrace; ++ if (utrace->interrupt || utrace->report || utrace->signal_handler) { ++ /* ++ * We've been asked for an explicit report before we ++ * even check for pending signals. ++ */ ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ spin_lock(&utrace->lock); ++ ++ splice_attaching(utrace); ++ ++ if (unlikely(!utrace->interrupt) && unlikely(!utrace->report)) ++ report.result = UTRACE_SIGNAL_IGN; ++ else if (utrace->signal_handler) ++ report.result = UTRACE_SIGNAL_HANDLER; ++ else ++ report.result = UTRACE_SIGNAL_REPORT; ++ ++ /* ++ * We are now making the report and it's on the ++ * interrupt path, so clear the flags asking for those. 
++ */ ++ utrace->interrupt = utrace->report = utrace->signal_handler = 0; ++ utrace->stopped = 0; ++ ++ /* ++ * Make sure signal_pending() only returns true ++ * if there are real signals pending. ++ */ ++ if (signal_pending(task)) { ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ if (unlikely(report.result == UTRACE_SIGNAL_IGN)) ++ /* ++ * We only got here to clear utrace->signal_handler. ++ */ ++ return -1; ++ ++ /* ++ * Do a reporting pass for no signal, just for EVENT(QUIESCE). ++ * The engine callbacks can fill in *info and *return_ka. ++ * We'll pass NULL for the @orig_ka argument to indicate ++ * that there was no original signal. ++ */ ++ event = 0; ++ ka = NULL; ++ memset(return_ka, 0, sizeof *return_ka); ++ } else if ((task->utrace_flags & UTRACE_EVENT_SIGNAL_ALL) == 0 && ++ !utrace->stopped) { ++ /* ++ * If no engine is interested in intercepting signals, ++ * let the caller just dequeue them normally. ++ */ ++ return 0; ++ } else { ++ if (unlikely(utrace->stopped)) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ spin_unlock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * Steal the next signal so we can let tracing engines ++ * examine it. From the signal number and sigaction, ++ * determine what normal delivery would do. If no ++ * engine perturbs it, we'll do that by returning the ++ * signal number after setting *return_ka. ++ */ ++ signr = dequeue_signal(task, &task->blocked, info); ++ if (signr == 0) ++ return signr; ++ BUG_ON(signr != info->si_signo); ++ ++ ka = &task->sighand->action[signr - 1]; ++ *return_ka = *ka; ++ ++ /* ++ * We are never allowed to interfere with SIGKILL. ++ * Just punt after filling in *return_ka for our caller. ++ */ ++ if (signr == SIGKILL) ++ return signr; ++ ++ if (ka->sa.sa_handler == SIG_IGN) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (ka->sa.sa_handler != SIG_DFL) { ++ event = UTRACE_EVENT(SIGNAL); ++ report.result = UTRACE_SIGNAL_DELIVER; ++ } else if (sig_kernel_coredump(signr)) { ++ event = UTRACE_EVENT(SIGNAL_CORE); ++ report.result = UTRACE_SIGNAL_CORE; ++ } else if (sig_kernel_ignore(signr)) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (signr == SIGSTOP) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_STOP; ++ } else if (sig_kernel_stop(signr)) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_TSTP; ++ } else { ++ event = UTRACE_EVENT(SIGNAL_TERM); ++ report.result = UTRACE_SIGNAL_TERM; ++ } ++ ++ /* ++ * Now that we know what event type this signal is, we ++ * can short-circuit if no engines care about those. ++ */ ++ if ((task->utrace_flags & (event | UTRACE_EVENT(QUIESCE))) == 0) ++ return signr; ++ ++ /* ++ * We have some interested engines, so tell them about ++ * the signal and let them change its disposition. ++ */ ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * This reporting pass chooses what signal disposition we'll act on. ++ */ ++ list_for_each_entry(engine, &utrace->attached, entry) { ++ /* ++ * See start_callback() comment about this barrier. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(), ++ * see start_callback() comments. 
++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if ((want & (event | UTRACE_EVENT(QUIESCE))) == 0) { ++ utrace->reporting = NULL; ++ continue; ++ } ++ ++ if (ops->report_signal) ++ ret = (*ops->report_signal)( ++ report.result | report.action, engine, task, ++ regs, info, ka, return_ka); ++ else ++ ret = (report.result | (*ops->report_quiesce)( ++ report.action, engine, task, event)); ++ ++ /* ++ * Avoid a tight loop reporting again and again if some ++ * engine is too stupid. ++ */ ++ switch (utrace_resume_action(ret)) { ++ default: ++ break; ++ case UTRACE_INTERRUPT: ++ case UTRACE_REPORT: ++ ret = (ret & ~UTRACE_RESUME_MASK) | UTRACE_RESUME; ++ break; ++ } ++ ++ finish_callback(utrace, &report, engine, ret); ++ } ++ ++ /* ++ * We express the chosen action to the signals code in terms ++ * of a representative signal whose default action does it. ++ * Our caller uses our return value (signr) to decide what to ++ * do, but uses info->si_signo as the signal number to report. ++ */ ++ switch (utrace_signal_action(report.result)) { ++ case UTRACE_SIGNAL_TERM: ++ signr = SIGTERM; ++ break; ++ ++ case UTRACE_SIGNAL_CORE: ++ signr = SIGQUIT; ++ break; ++ ++ case UTRACE_SIGNAL_STOP: ++ signr = SIGSTOP; ++ break; ++ ++ case UTRACE_SIGNAL_TSTP: ++ signr = SIGTSTP; ++ break; ++ ++ case UTRACE_SIGNAL_DELIVER: ++ signr = info->si_signo; ++ ++ if (return_ka->sa.sa_handler == SIG_DFL) { ++ /* ++ * We'll do signr's normal default action. ++ * For ignore, we'll fall through below. ++ * For stop/death, break locks and returns it. ++ */ ++ if (likely(signr) && !sig_kernel_ignore(signr)) ++ break; ++ } else if (return_ka->sa.sa_handler != SIG_IGN && ++ likely(signr)) { ++ /* ++ * Complete the bookkeeping after the report. ++ * The handler will run. If an engine wanted to ++ * stop or step, then make sure we do another ++ * report after signal handler setup. ++ */ ++ if (report.action != UTRACE_RESUME) ++ report.action = UTRACE_INTERRUPT; ++ finish_report(&report, task, utrace); ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ /* ++ * We do the SA_ONESHOT work here since the ++ * normal path will only touch *return_ka now. ++ */ ++ if (unlikely(return_ka->sa.sa_flags & SA_ONESHOT)) { ++ return_ka->sa.sa_flags &= ~SA_ONESHOT; ++ if (likely(valid_signal(signr))) { ++ ka = &task->sighand->action[signr - 1]; ++ ka->sa.sa_handler = SIG_DFL; ++ } ++ } ++ ++ return signr; ++ } ++ ++ /* Fall through for an ignored signal. */ ++ ++ case UTRACE_SIGNAL_IGN: ++ case UTRACE_SIGNAL_REPORT: ++ default: ++ /* ++ * If the signal is being ignored, then we are on the way ++ * directly back to user mode. We can stop here, or step, ++ * as in utrace_resume(), above. After we've dealt with that, ++ * our caller will relock and come back through here. ++ */ ++ finish_resume_report(&report, task, utrace); ++ ++ if (unlikely(report.killed)) { ++ /* ++ * The only reason we woke up now was because of a ++ * SIGKILL. Don't do normal dequeuing in case it ++ * might get a signal other than SIGKILL. That would ++ * perturb the death state so it might differ from ++ * what the debugger would have allowed to happen. ++ * Instead, pluck out just the SIGKILL to be sure ++ * we'll die immediately with nothing else different ++ * from the quiescent state the debugger wanted us in. 
++ */ ++ sigset_t sigkill_only; ++ siginitsetinv(&sigkill_only, sigmask(SIGKILL)); ++ spin_lock_irq(&task->sighand->siglock); ++ signr = dequeue_signal(task, &sigkill_only, info); ++ BUG_ON(signr != SIGKILL); ++ *return_ka = task->sighand->action[SIGKILL - 1]; ++ return signr; ++ } ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) { ++ push_back_signal(task, info); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ return -1; ++ } ++ ++ /* ++ * Complete the bookkeeping after the report. ++ * This sets utrace->report if UTRACE_STOP was used. ++ */ ++ finish_report(&report, task, utrace); ++ ++ return_ka->sa.sa_handler = SIG_DFL; ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (sig_kernel_stop(signr)) ++ task->signal->flags |= SIGNAL_STOP_DEQUEUED; ++ ++ return signr; ++} ++ ++/* ++ * This gets called after a signal handler has been set up. ++ * We set a flag so the next report knows it happened. ++ * If we're already stepping, make sure we do a report_signal. ++ * If not, make sure we get into utrace_resume() where we can ++ * clear the signal_handler flag before resuming. ++ */ ++void utrace_signal_handler(struct task_struct *task, int stepping) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->signal_handler = 1; ++ if (stepping) { ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else { ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/** ++ * utrace_prepare_examine - prepare to examine thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: temporary state, a &struct utrace_examiner pointer ++ * ++ * This call prepares to safely examine the thread @target using ++ * &struct user_regset calls, or direct access to thread-synchronous fields. ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, it must held stopped via %UTRACE_STOP by @engine. ++ * ++ * This call may block the caller until @target stays stopped, so it must ++ * be called only after the caller is sure @target is about to unschedule. ++ * This means a zero return from a utrace_control() call on @engine giving ++ * %UTRACE_STOP, or a report_quiesce() or report_signal() callback to ++ * @engine that used %UTRACE_STOP in its return value. ++ * ++ * Returns -%ESRCH if @target is dead or -%EINVAL if %UTRACE_STOP was ++ * not used. If @target has started running again despite %UTRACE_STOP ++ * (for %SIGKILL or a spurious wakeup), this call returns -%EAGAIN. ++ * ++ * When this call returns zero, it's safe to use &struct user_regset ++ * calls and task_user_regset_view() on @target and to examine some of ++ * its fields directly. When the examination is complete, a ++ * utrace_finish_examine() call must follow to check whether it was ++ * completed safely. 
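Taken together with utrace_finish_examine() below, the intended call sequence is short. A minimal sketch, assuming the engine already holds the target stopped via UTRACE_STOP as required above (the example_ name is hypothetical, not part of the patch):

#include <linux/sched.h>
#include <linux/utrace.h>

static int example_read_stopped_thread(struct task_struct *target,
                                       struct utrace_engine *engine)
{
        struct utrace_examiner exam;
        int ret;

        ret = utrace_prepare_examine(target, engine, &exam);
        if (ret)
                return ret;     /* -ESRCH, -EINVAL or -EAGAIN, see above */

        /*
         * Between the two calls it is safe to use task_user_regset_view()
         * and struct user_regset calls on @target, or to read its
         * thread-synchronous fields directly.
         */

        ret = utrace_finish_examine(target, engine, &exam);
        /*
         * On -EAGAIN the target ran in the meantime; discard whatever was
         * read above and retry the whole sequence.
         */
        return ret;
}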
++ */ ++int utrace_prepare_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->exit_state)) ++ ret = -ESRCH; ++ else { ++ exam->state = target->state; ++ if (unlikely(exam->state == TASK_RUNNING)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ } ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ exam->ncsw = wait_task_inactive(target, exam->state); ++ put_task_struct(target); ++ if (unlikely(!exam->ncsw)) ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_prepare_examine); ++ ++/** ++ * utrace_finish_examine - complete an examination of thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: pointer passed to utrace_prepare_examine() call ++ * ++ * This call completes an examination on the thread @target begun by a ++ * paired utrace_prepare_examine() call with the same arguments that ++ * returned success (zero). ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, this returns zero if @target has remained unscheduled ++ * since the paired utrace_prepare_examine() call returned zero. ++ * ++ * When this returns an error, any examination done since the paired ++ * utrace_prepare_examine() call is unreliable and the data extracted ++ * should be discarded. The error is -%EINVAL if @engine is not ++ * keeping @target stopped, or -%EAGAIN if @target woke up unexpectedly. ++ */ ++int utrace_finish_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->state != exam->state)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ unsigned long ncsw = wait_task_inactive(target, exam->state); ++ if (unlikely(ncsw != exam->ncsw)) ++ ret = -EAGAIN; ++ put_task_struct(target); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_finish_examine); ++ ++/* ++ * This is declared in linux/regset.h and defined in machine-dependent ++ * code. We put the export here to ensure no machine forgets it. ++ */ ++EXPORT_SYMBOL_GPL(task_user_regset_view); ++ ++/* ++ * Called with rcu_read_lock() held. ++ */ ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p) ++{ ++ struct utrace *utrace = &p->utrace; ++ seq_printf(m, "Utrace:\t%lx%s%s%s\n", ++ p->utrace_flags, ++ utrace->stopped ? " (stopped)" : "", ++ utrace->report ? " (report)" : "", ++ utrace->interrupt ? " (interrupt)" : ""); ++} diff --git a/original/kernel.spec b/original/kernel.spec new file mode 100644 index 000000000..5c1605474 --- /dev/null +++ b/original/kernel.spec @@ -0,0 +1,3886 @@ +# We have to override the new %%install behavior because, well... the kernel is special. +%global __spec_install_pre %{___build_pre} + +Summary: The Linux kernel + +# For a stable, released kernel, released_kernel should be 1. For rawhide +# and/or a kernel built from an rc or git snapshot, released_kernel should +# be 0. 
+%global released_kernel 1 + +# Versions of various parts + +# Polite request for people who spin their own kernel rpms: +# please modify the "buildid" define in a way that identifies +# that the kernel isn't the stock distribution kernel, for example, +# by setting the define to ".local" or ".bz123456" +# +# % define buildid .local + +# fedora_build defines which build revision of this kernel version we're +# building. Rather than incrementing forever, as with the prior versioning +# setup, we set fedora_cvs_origin to the current cvs revision s/1.// of the +# kernel spec when the kernel is rebased, so fedora_build automatically +# works out to the offset from the rebase, so it doesn't get too ginormous. +# +# If you're building on a branch, the RCS revision will be something like +# 1.1205.1.1. In this case we drop the initial 1, subtract fedora_cvs_origin +# from the second number, and then append the rest of the RCS string as is. +# Don't stare at the awk too long, you'll go blind. +%define fedora_cvs_origin 1786 +%define fedora_cvs_revision() %2 +%global fedora_build %(echo %{fedora_cvs_origin}.%{fedora_cvs_revision $Revision: 1.1948 $} | awk -F . '{ OFS = "."; ORS = ""; print $3 - $1 ; i = 4 ; OFS = ""; while (i <= NF) { print ".", $i ; i++} }') + +# base_sublevel is the kernel version we're starting with and patching +# on top of -- for example, 2.6.22-rc7-git1 starts with a 2.6.21 base, +# which yields a base_sublevel of 21. +%define base_sublevel 31 + +## If this is a released kernel ## +%if 0%{?released_kernel} + +# Do we have a -stable update to apply? +%define stable_update 6 +# Is it a -stable RC? +%define stable_rc 0 +# Set rpm version accordingly +%if 0%{?stable_update} +%define stablerev .%{stable_update} +%define stable_base %{stable_update} +%if 0%{?stable_rc} +# stable RCs are incremental patches, so we need the previous stable patch +%define stable_base %(echo $((%{stable_update} - 1))) +%endif +%endif +%define rpmversion 2.6.%{base_sublevel}%{?stablerev} + +## The not-released-kernel case ## +%else +# The next upstream release sublevel (base_sublevel+1) +%define upstream_sublevel %(echo $((%{base_sublevel} + 1))) +# The rc snapshot level +%define rcrev 9 +# The git snapshot level +%define gitrev 2 +# Set rpm version accordingly +%define rpmversion 2.6.%{upstream_sublevel} +%endif +# Nb: The above rcrev and gitrev values automagically define Patch00 and Patch01 below. + +# What parts do we want to build? We must build at least one kernel. +# These are the kernels that are built IF the architecture allows it. +# All should default to 1 (enabled) and be flipped to 0 (disabled) +# by later arch-specific checks. + +# The following build options are enabled by default. +# Use either --without in your rpmbuild command or force values +# to 0 in here to disable them. 
+# +# standard kernel +%define with_up %{?_without_up: 0} %{?!_without_up: 1} +# kernel-smp (only valid for ppc 32-bit) +%define with_smp %{?_without_smp: 0} %{?!_without_smp: 1} +# kernel-kdump +%define with_kdump %{?_without_kdump: 0} %{?!_without_kdump: 1} +# kernel-debug +%define with_debug %{?_without_debug: 0} %{?!_without_debug: 1} +# kernel-doc +%define with_doc %{?_without_doc: 0} %{?!_without_doc: 1} +# kernel-headers +%define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} +# kernel-firmware +%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 0} +# tools/perf +%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} +# perf noarch subpkg +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} +# kernel-debuginfo +%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1} +# kernel-bootwrapper (for creating zImages from kernel + initrd) +%define with_bootwrapper %{?_without_bootwrapper: 0} %{?!_without_bootwrapper: 1} +# Want to build a the vsdo directories installed +%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 1} +# Use dracut instead of mkinitrd for initrd image generation +%define with_dracut %{?_without_dracut: 0} %{?!_without_dracut: 1} + +# Build the kernel-doc package, but don't fail the build if it botches. +# Here "true" means "continue" and "false" means "fail the build". +%if 0%{?released_kernel} +%define doc_build_fail false +%else +%define doc_build_fail true +%endif + +%define rawhide_skip_docs 0 +%if 0%{?rawhide_skip_docs} +%define with_doc 0 +%endif + +# Additional options for user-friendly one-off kernel building: +# +# Only build the base kernel (--with baseonly): +%define with_baseonly %{?_with_baseonly: 1} %{?!_with_baseonly: 0} +# Only build the smp kernel (--with smponly): +%define with_smponly %{?_with_smponly: 1} %{?!_with_smponly: 0} +# Only build the debug kernel (--with dbgonly): +%define with_dbgonly %{?_with_dbgonly: 1} %{?!_with_dbgonly: 0} + +# should we do C=1 builds with sparse +%define with_sparse %{?_with_sparse: 1} %{?!_with_sparse: 0} + +# Set debugbuildsenabled to 1 for production (build separate debug kernels) +# and 0 for rawhide (all kernels are debug kernels). +# See also 'make debug' and 'make release'. +%define debugbuildsenabled 1 + +# Want to build a vanilla kernel build without any non-upstream patches? +# (well, almost none, we need nonintconfig for build purposes). Default to 0 (off). 
+%define with_vanilla %{?_with_vanilla: 1} %{?!_with_vanilla: 0} + +# pkg_release is what we'll fill in for the rpm Release: field +%if 0%{?released_kernel} + +%if 0%{?stable_rc} +%define stable_rctag .rc%{stable_rc} +%endif +%define pkg_release %{fedora_build}%{?stable_rctag}%{?buildid}%{?dist} + +%else + +# non-released_kernel +%if 0%{?rcrev} +%define rctag .rc%rcrev +%endif +%if 0%{?gitrev} +%define gittag .git%gitrev +%if !0%{?rcrev} +%define rctag .rc0 +%endif +%endif +%define pkg_release 0.%{fedora_build}%{?rctag}%{?gittag}%{?buildid}%{?dist} + +%endif + +# The kernel tarball/base version +%define kversion 2.6.%{base_sublevel} + +%define make_target bzImage + +%define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE}.%{_target_cpu} +%define hdrarch %_target_cpu +%define asmarch %_target_cpu + +%if 0%{!?nopatches:1} +%define nopatches 0 +%endif + +%if %{with_vanilla} +%define nopatches 1 +%endif + +%if %{nopatches} +%define with_bootwrapper 0 +%define variant -vanilla +%else +%define variant_fedora -fedora +%endif + +%define using_upstream_branch 0 +%if 0%{?upstream_branch:1} +%define stable_update 0 +%define using_upstream_branch 1 +%define variant -%{upstream_branch}%{?variant_fedora} +%define pkg_release 0.%{fedora_build}%{upstream_branch_tag}%{?buildid}%{?dist} +%endif + +%if !%{debugbuildsenabled} +%define with_debug 0 +%endif + +%if !%{with_debuginfo} +%define _enable_debug_packages 0 +%endif +%define debuginfodir /usr/lib/debug + +# kernel-PAE is only built on i686. +%ifarch i686 +%define with_pae 1 +%else +%define with_pae 0 +%endif + +# if requested, only build base kernel +%if %{with_baseonly} +%define with_smp 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build smp kernel +%if %{with_smponly} +%define with_up 0 +%define with_kdump 0 +%define with_debug 0 +%endif + +# if requested, only build debug kernel +%if %{with_dbgonly} +%if %{debugbuildsenabled} +%define with_up 0 +%endif +%define with_smp 0 +%define with_pae 0 +%define with_xen 0 +%define with_kdump 0 +%define with_perftool 0 +%endif + +%define all_x86 i386 i686 + +%if %{with_vdso_install} +# These arches install vdso/ directories. +%define vdso_arches %{all_x86} x86_64 ppc ppc64 +%endif + +# Overrides for generic default options + +# only ppc and alphav56 need separate smp kernels +%ifnarch ppc alphaev56 +%define with_smp 0 +%endif + +# only build kernel-kdump on ppc64 +# (no relocatable kernel support upstream yet) +#FIXME: Temporarily disabled to speed up builds. 
+#ifnarch ppc64 +%define with_kdump 0 +#endif + +# don't do debug builds on anything but i686 and x86_64 +%ifnarch i686 x86_64 +%define with_debug 0 +%endif + +# only package docs noarch +%ifnarch noarch +%define with_doc 0 +%define with_perf 0 +%endif + +# don't build noarch kernels or headers (duh) +%ifarch noarch +%define with_up 0 +%define with_headers 0 +%define all_arch_configs kernel-%{version}-*.config +%define with_firmware %{?_without_firmware: 0} %{?!_without_firmware: 1} +%endif + +# bootwrapper is only on ppc +%ifnarch ppc ppc64 +%define with_bootwrapper 0 +%endif + +# sparse blows up on ppc64 alpha and sparc64 +%ifarch ppc64 ppc alpha sparc64 +%define with_sparse 0 +%endif + +# Per-arch tweaks + +%ifarch %{all_x86} +%define asmarch x86 +%define hdrarch i386 +%define all_arch_configs kernel-%{version}-i?86*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch x86_64 +%define asmarch x86 +%define all_arch_configs kernel-%{version}-x86_64*.config +%define image_install_path boot +%define kernel_image arch/x86/boot/bzImage +%endif + +%ifarch ppc64 +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc64*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch s390x +%define asmarch s390 +%define hdrarch s390 +%define all_arch_configs kernel-%{version}-s390x.config +%define image_install_path boot +%define make_target image +%define kernel_image arch/s390/boot/image +%endif + +%ifarch sparc +# We only build sparc headers since we dont support sparc32 hardware +%endif + +%ifarch sparc64 +%define asmarch sparc +%define all_arch_configs kernel-%{version}-sparc64*.config +%define make_target image +%define kernel_image arch/sparc/boot/image +%define image_install_path boot +%define with_perftool 0 +%endif + +%ifarch ppc +%define asmarch powerpc +%define hdrarch powerpc +%define all_arch_configs kernel-%{version}-ppc{-,.}*config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%define kernel_image_elf 1 +%endif + +%ifarch ia64 +%define all_arch_configs kernel-%{version}-ia64*.config +%define image_install_path boot/efi/EFI/redhat +%define make_target compressed +%define kernel_image vmlinux.gz +%endif + +%ifarch alpha alphaev56 +%define all_arch_configs kernel-%{version}-alpha*.config +%define image_install_path boot +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%ifarch %{arm} +%define all_arch_configs kernel-%{version}-arm*.config +%define image_install_path boot +%define hdrarch arm +%define make_target vmlinux +%define kernel_image vmlinux +%endif + +%if %{nopatches} +# XXX temporary until last vdso patches are upstream +%define vdso_arches ppc ppc64 +%endif + +%if %{nopatches}%{using_upstream_branch} +# Ignore unknown options in our config-* files. +# Some options go with patches we're not applying. +%define oldconfig_target loose_nonint_oldconfig +%else +%define oldconfig_target nonint_oldconfig +%endif + +# To temporarily exclude an architecture from being built, add it to +# %nobuildarches. Do _NOT_ use the ExclusiveArch: line, because if we +# don't build kernel-headers then the new build system will no longer let +# us use the previous build of that package -- it'll just be completely AWOL. +# Which is a BadThing(tm). + +# We don't build a kernel on i386; we only do kernel-headers there, +# and we no longer build for 31bit S390. 
Same for 32bit sparc and arm. +%define nobuildarches i386 s390 sparc %{arm} + +%ifarch %nobuildarches +%define with_up 0 +%define with_smp 0 +%define with_pae 0 +%define with_kdump 0 +%define with_debuginfo 0 +%define with_perftool 0 +%define _enable_debug_packages 0 +%endif + +%define with_pae_debug 0 +%if %{with_pae} +%define with_pae_debug %{with_debug} +%endif + +# +# Three sets of minimum package version requirements in the form of Conflicts: +# to versions below the minimum +# + +# +# First the general kernel 2.6 required versions as per +# Documentation/Changes +# +%define kernel_dot_org_conflicts ppp < 2.4.3-3, isdn4k-utils < 3.2-32, nfs-utils < 1.0.7-12, e2fsprogs < 1.37-4, util-linux < 2.12, jfsutils < 1.1.7-2, reiserfs-utils < 3.6.19-2, xfsprogs < 2.6.13-4, procps < 3.2.5-6.3, oprofile < 0.9.1-2 + +# +# Then a series of requirements that are distribution specific, either +# because we add patches for something, or the older versions have +# problems with the newer kernel or lack certain things that make +# integration in the distro harder than needed. +# +%define package_conflicts initscripts < 7.23, udev < 063-6, iptables < 1.3.2-1, ipw2200-firmware < 2.4, iwl4965-firmware < 228.57.2, selinux-policy-targeted < 1.25.3-14, squashfs-tools < 4.0, wireless-tools < 29-3 + +# +# The ld.so.conf.d file we install uses syntax older ldconfig's don't grok. +# +%define kernel_xen_conflicts glibc < 2.3.5-1, xen < 3.0.1 + +%define kernel_PAE_obsoletes kernel-smp < 2.6.17, kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_PAE_provides kernel-xen = %{rpmversion}-%{pkg_release} + +%ifarch x86_64 +%define kernel_obsoletes kernel-xen <= 2.6.27-0.2.rc0.git6.fc10 +%define kernel_provides kernel-xen = %{rpmversion}-%{pkg_release} +%endif + +# We moved the drm include files into kernel-headers, make sure there's +# a recent enough libdrm-devel on the system that doesn't have those. +%define kernel_headers_conflicts libdrm-devel < 2.4.0-0.15 + +# +# Packages that need to be installed before the kernel is, because the %post +# scripts use them. +# +%define kernel_prereq fileutils, module-init-tools, initscripts >= 8.11.1-1, kernel-firmware >= %{rpmversion}-%{pkg_release}, grubby >= 7.0.4-1 +%if %{with_dracut} +%define initrd_prereq dracut >= 001-7 +%else +%define initrd_prereq mkinitrd >= 6.0.61-1 +%endif + +# +# This macro does requires, provides, conflicts, obsoletes for a kernel package. +# %%kernel_reqprovconf +# It uses any kernel__conflicts and kernel__obsoletes +# macros defined above. 
+# +%define kernel_reqprovconf \ +Provides: kernel = %{rpmversion}-%{pkg_release}\ +Provides: kernel-%{_target_cpu} = %{rpmversion}-%{pkg_release}%{?1:.%{1}}\ +Provides: kernel-drm = 4.3.0\ +Provides: kernel-drm-nouveau = 15\ +Provides: kernel-modeset = 1\ +Provides: kernel-uname-r = %{KVERREL}%{?1:.%{1}}\ +Requires(pre): %{kernel_prereq}\ +Requires(pre): %{initrd_prereq}\ +Requires(post): /sbin/new-kernel-pkg\ +Requires(preun): /sbin/new-kernel-pkg\ +Conflicts: %{kernel_dot_org_conflicts}\ +Conflicts: %{package_conflicts}\ +%{expand:%%{?kernel%{?1:_%{1}}_conflicts:Conflicts: %%{kernel%{?1:_%{1}}_conflicts}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_obsoletes:Obsoletes: %%{kernel%{?1:_%{1}}_obsoletes}}}\ +%{expand:%%{?kernel%{?1:_%{1}}_provides:Provides: %%{kernel%{?1:_%{1}}_provides}}}\ +# We can't let RPM do the dependencies automatic because it'll then pick up\ +# a correct but undesirable perl dependency from the module headers which\ +# isn't required for the kernel proper to function\ +AutoReq: no\ +AutoProv: yes\ +%{nil} + +Name: kernel%{?variant} +Group: System Environment/Kernel +License: GPLv2 +URL: http://www.kernel.org/ +Version: %{rpmversion} +Release: %{pkg_release} +# DO NOT CHANGE THE 'ExclusiveArch' LINE TO TEMPORARILY EXCLUDE AN ARCHITECTURE BUILD. +# SET %%nobuildarches (ABOVE) INSTEAD +ExclusiveArch: noarch %{all_x86} x86_64 ppc ppc64 ia64 sparc sparc64 s390x alpha alphaev56 %{arm} +ExclusiveOS: Linux + +%kernel_reqprovconf +%ifarch x86_64 sparc64 +Obsoletes: kernel-smp +%endif + + +# +# List the packages used during the kernel build +# +BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, sh-utils, tar +BuildRequires: bzip2, findutils, gzip, m4, perl, make >= 3.78, diffutils, gawk +BuildRequires: gcc >= 3.4.2, binutils >= 2.12, redhat-rpm-config +BuildRequires: net-tools +BuildRequires: xmlto, asciidoc +%if %{with_sparse} +BuildRequires: sparse >= 0.4.1 +%endif +%if %{with_perftool} +BuildRequires: elfutils-libelf-devel zlib-devel binutils-devel +%endif +BuildConflicts: rhbuildsys(DiskFree) < 500Mb + +%define fancy_debuginfo 0 +%if %{with_debuginfo} +%if 0%{?fedora} >= 8 || 0%{?rhel} >= 6 +%define fancy_debuginfo 1 +%endif +%endif + +%if %{fancy_debuginfo} +# Fancy new debuginfo generation introduced in Fedora 8. +BuildRequires: rpm-build >= 4.4.2.1-4 +%define debuginfo_args --strict-build-id +%endif + +Source0: ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-%{kversion}.tar.bz2 + +Source11: genkey +Source14: find-provides +Source15: merge.pl + +Source20: Makefile.config +Source21: config-debug +Source22: config-nodebug +Source23: config-generic +Source24: config-rhel-generic + +Source30: config-x86-generic +Source31: config-i686-PAE + +Source40: config-x86_64-generic + +Source50: config-powerpc-generic +Source51: config-powerpc32-generic +Source52: config-powerpc32-smp +Source53: config-powerpc64 + +Source60: config-ia64-generic + +Source70: config-s390x + +Source90: config-sparc64-generic + +Source100: config-arm + +Source200: perf + +# Here should be only the patches up to the upstream canonical Linus tree. 
+ +# For a stable release kernel +%if 0%{?stable_update} +%if 0%{?stable_base} +%define stable_patch_00 patch-2.6.%{base_sublevel}.%{stable_base}.bz2 +Patch00: %{stable_patch_00} +%endif +%if 0%{?stable_rc} +%define stable_patch_01 patch-2.6.%{base_sublevel}.%{stable_update}-rc%{stable_rc}.bz2 +Patch01: %{stable_patch_01} +%endif + +# non-released_kernel case +# These are automagically defined by the rcrev and gitrev values set up +# near the top of this spec file. +%else +%if 0%{?rcrev} +Patch00: patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} +Patch01: patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +Patch00: patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif +%endif + +%if %{using_upstream_branch} +### BRANCH PATCH ### +%endif + +Patch02: git-linus.diff + +# we always need nonintconfig, even for -vanilla kernels +Patch03: linux-2.6-build-nonintconfig.patch + +# we also need compile fixes for -vanilla +Patch04: linux-2.6-compile-fixes.patch + +# build tweak for build ID magic, even for -vanilla +Patch05: linux-2.6-makefile-after_link.patch + +%if !%{nopatches} + +# revert upstream patches we get via other methods +Patch09: linux-2.6-upstream-reverts.patch +# Git trees. +Patch10: git-cpufreq.patch +Patch11: git-bluetooth.patch + +# Standalone patches +Patch20: linux-2.6-hotfixes.patch + +Patch21: linux-2.6-tracehook.patch +Patch22: linux-2.6-utrace.patch + +Patch30: sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +Patch31: disable-stackprotector-all.patch + +# Intel IOMMU fixes/workarounds +Patch100: linux-2.6-die-closed-source-bios-muppets-die.patch +Patch101: linux-2.6-intel-iommu-updates.patch +Patch102: linux-2.6-iommu-at-zero.patch +Patch103: linux-2.6-iommu-dmar-all-1s.patch +Patch104: linux-2.6-iommu-another-hp-screwup.patch +Patch105: linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +Patch106: linux-2.6-iommu-hp-cantiga-resume.patch + +Patch141: linux-2.6-ps3-storage-alias.patch +Patch143: linux-2.6-g5-therm-shutdown.patch +Patch144: linux-2.6-vio-modalias.patch +Patch147: linux-2.6-imac-transparent-bridge.patch + +Patch150: linux-2.6.29-sparc-IOC_TYPECHECK.patch + +Patch160: linux-2.6-execshield.patch + +Patch250: linux-2.6-debug-sizeof-structs.patch +Patch260: linux-2.6-debug-nmi-timeout.patch +Patch270: linux-2.6-debug-taint-vm.patch +Patch280: linux-2.6-debug-spinlock-taint.patch +Patch300: linux-2.6-driver-level-usb-autosuspend.diff +Patch302: linux-2.6-qcserial-autosuspend.diff +Patch303: linux-2.6-bluetooth-autosuspend.diff +Patch304: linux-2.6-usb-uvc-autosuspend.diff +Patch340: linux-2.6-debug-vm-would-have-oomkilled.patch +Patch360: linux-2.6-debug-always-inline-kzalloc.patch +Patch380: linux-2.6-defaults-pci_no_msi.patch +Patch381: linux-2.6-pciehp-update.patch +Patch382: linux-2.6-defaults-pciehp.patch +Patch383: linux-2.6-defaults-aspm.patch +Patch390: linux-2.6-defaults-acpi-video.patch +Patch391: linux-2.6-acpi-video-dos.patch +Patch450: linux-2.6-input-kill-stupid-messages.patch +Patch451: linux-2.6-input-fix-toshiba-hotkeys.patch +Patch452: linux-2.6.30-no-pcspkr-modalias.patch + +Patch460: linux-2.6-serial-460800.patch + +Patch470: die-floppy-die.patch + +Patch500: linux-2.6.31-copy_from_user-bounds.patch + +Patch510: linux-2.6-silence-noise.patch +Patch520: linux-2.6.30-hush-rom-warning.patch +Patch530: linux-2.6-silence-fbcon-logo.patch +Patch570: linux-2.6-selinux-mprotect-checks.patch +Patch580: 
linux-2.6-sparc-selinux-mprotect-checks.patch + +Patch600: linux-2.6-defaults-alsa-hda-beep-off.patch +Patch601: linux-2.6-alsa-improve-hda-powerdown.patch +Patch610: hda_intel-prealloc-4mb-dmabuffer.patch +Patch611: alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +Patch670: linux-2.6-ata-quirk.patch +Patch671: linux-2.6-ahci-export-capabilities.patch + +Patch680: prism54-remove-pci-dev-table.patch +Patch681: linux-2.6-ath9k-fixes.patch + +Patch800: linux-2.6-crash-driver.patch + +Patch900: linux-2.6-pci-cacheline-sizing.patch + +# ACPI +Patch1100: linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +Patch1110: acpi-ec-merge-irq-and-poll-modes.patch +Patch1120: acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +Patch1130: acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +Patch1515: lirc-2.6.31.patch +Patch1517: hdpvr-ir-enable.patch +Patch1518: hid-ignore-all-recent-imon-devices.patch + +# virt + ksm patches +Patch1550: linux-2.6-ksm.patch +Patch1551: linux-2.6-ksm-kvm.patch +Patch1552: linux-2.6-ksm-updates.patch +Patch1553: linux-2.6-ksm-fix-munlock.patch +Patch1554: linux-2.6-ksm-updates-from-32.patch +Patch1579: linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +Patch1583: linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +Patch1584: linux-2.6-xen-improvement-to-wait_for_devices.patch +Patch1585: linux-2.6-xen-increase-device-connection-timeout.patch +Patch1586: linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# nouveau + drm fixes +Patch1810: kms-offb-handoff.patch +Patch1812: drm-next-b390f944.patch +Patch1813: drm-radeon-pm.patch +Patch1814: drm-nouveau.patch +Patch1818: drm-i915-resume-force-mode.patch +# intel drm is all merged upstream +Patch1824: drm-intel-next.patch +Patch1825: drm-intel-pm.patch +Patch1826: drm-intel-no-tv-hotplug.patch +Patch1827: drm-i915-fix-tvmode-oops.patch +Patch1831: drm-conservative-fallback-modes.patch +Patch1832: drm-edid-retry.patch +Patch1834: drm-edid-header-fixup.patch +Patch1835: drm-default-mode.patch +Patch1837: drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +Patch1839: drm-radeon-misc-fixes.patch +Patch1840: drm-radeon-rv410-test-fix.patch + +# vga arb +Patch1900: linux-2.6-vga-arb.patch +Patch1901: drm-vga-arb.patch +Patch1902: drm-radeon-kms-arbiter-return-ignore.patch + +# make harmless fbcon debug less loud +Patch1903: fbcon-lower-debug.patch + +# kludge to make ich9 e1000 work +Patch2000: linux-2.6-e1000-ich9.patch + +# linux1394 git patches +Patch2200: linux-2.6-firewire-git-update.patch +Patch2201: linux-2.6-firewire-git-pending.patch + +# Quiet boot fixes +# silence the ACPI blacklist code +Patch2802: linux-2.6-silence-acpi-blacklist.patch + +Patch2899: linux-2.6-v4l-dvb-fixes.patch +Patch2900: linux-2.6-v4l-dvb-update.patch +Patch2901: linux-2.6-v4l-dvb-experimental.patch +Patch2904: v4l-dvb-fix-cx25840-firmware-loading.patch + +# fs fixes + +#btrfs +Patch3000: linux-2.6-btrfs-upstream.patch + +# NFSv4 +Patch3050: linux-2.6-nfsd4-proots.patch +Patch3060: linux-2.6-nfs4-ver4opt.patch +Patch3061: linux-2.6-nfs4-callback-hidden.patch + +# VIA Nano / VX8xx updates +Patch11010: via-hwmon-temp-sensor.patch + +# patches headed upstream +Patch12010: linux-2.6-dell-laptop-rfkill-fix.patch +Patch12011: linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch +Patch12012: linux-2.6-rtc-show-hctosys.patch +Patch12013: linux-2.6-rfkill-all.patch +Patch12014: linux-2.6-selinux-module-load-perms.patch + +# sched fixes cherry-picked from 
2.6.32 +Patch13100: sched-deal-with-low-load-in-wake-affine.patch +Patch13101: sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +Patch13102: sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +Patch13110: sched-retune-scheduler-latency-defaults.patch +# Fix huge wakeup latencies +Patch13120: sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +Patch14010: perf-make-perf-counter-h-available-to-userspace.patch + +# fix resource counter issues on *big* machines +Patch14101: improve-resource-counter-scalability.patch + +# fix perf for sysprof +Patch14420: perf-events-fix-swevent-hrtimer-sampling.patch +Patch14421: perf-events-dont-generate-events-for-the-idle-task.patch + +Patch14430: crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +Patch14451: tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +Patch14452: tg3-02-fix-tso-test-against-wrong-flags-var.patch +Patch14453: tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +Patch14454: tg3-04-prevent-tx-bd-corruption.patch +Patch14455: tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +Patch14456: tg3-06-fix-5906-transmit-hangs.patch + +Patch14460: highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +Patch14461: highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +Patch14462: highmem-fix-arm-powerpc-kmap_types.patch + +Patch14463: dlm-fix-connection-close-handling.patch + +# rhbz#544144 [bbf31bf18d34caa87dd01f08bf713635593697f2] +Patch14464: ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +%endif + +BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root + +%description +The kernel package contains the Linux kernel (vmlinuz), the core of any +Linux operating system. The kernel handles the basic functions +of the operating system: memory allocation, process allocation, device +input and output, etc. + + +%package doc +Summary: Various documentation bits found in the kernel source +Group: Documentation +%description doc +This package contains documentation files from the kernel +source. Various bits of information about the Linux kernel and the +device drivers shipped with it are documented in these files. + +You'll want to install this package if you need a reference to the +options that can be passed to Linux kernel modules at load time. + + +%package headers +Summary: Header files for the Linux kernel for use by glibc +Group: Development/System +Obsoletes: glibc-kernheaders +Provides: glibc-kernheaders = 3.0-46 +%description headers +Kernel-headers includes the C header files that specify the interface +between the Linux kernel and userspace libraries and programs. The +header files define structures and constants that are needed for +building most standard programs and are also needed for rebuilding the +glibc package. + +%package firmware +Summary: Firmware files used by the Linux kernel +Group: Development/System +# This is... complicated. +# Look at the WHENCE file. +License: GPL+ and GPLv2+ and MIT and Redistributable, no modification permitted +%if "x%{?variant}" != "x" +Provides: kernel-firmware = %{rpmversion}-%{pkg_release} +%endif +%description firmware +Kernel-firmware includes firmware files required for some devices to +operate. 
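+# A quick, illustrative way to sanity-check what these subpackages end up
+# carrying once built (file names below are placeholders, not real output):
+#   rpm -qlp kernel-firmware-<version>-<release>.rpm | head
+#   rpm -qp --provides kernel-headers-<version>-<release>.rpm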
+ +%package bootwrapper +Summary: Boot wrapper files for generating combined kernel + initrd images +Group: Development/System +Requires: gzip +%description bootwrapper +Kernel-bootwrapper contains the wrapper code which makes bootable "zImage" +files combining both kernel and initial ramdisk. + +%package debuginfo-common-%{_target_cpu} +Summary: Kernel source files used by %{name}-debuginfo packages +Group: Development/Debug +%description debuginfo-common-%{_target_cpu} +This package is required by %{name}-debuginfo subpackages. +It provides the kernel source files common to all builds. + +%package -n perf +Summary: Performance monitoring for the Linux kernel +Group: Development/System +License: GPLv2 +%description -n perf +This package provides the supporting documentation for the perf tool +shipped in each kernel image subpackage. + +# +# This macro creates a kernel--debuginfo package. +# %%kernel_debuginfo_package +# +%define kernel_debuginfo_package() \ +%package %{?1:%{1}-}debuginfo\ +Summary: Debug information for package %{name}%{?1:-%{1}}\ +Group: Development/Debug\ +Requires: %{name}-debuginfo-common-%{_target_cpu} = %{version}-%{release}\ +Provides: %{name}%{?1:-%{1}}-debuginfo-%{_target_cpu} = %{version}-%{release}\ +AutoReqProv: no\ +%description -n %{name}%{?1:-%{1}}-debuginfo\ +This package provides debug information for package %{name}%{?1:-%{1}}.\ +This is required to use SystemTap with %{name}%{?1:-%{1}}-%{KVERREL}.\ +%{expand:%%global debuginfo_args %{?debuginfo_args} -p '/.*/%%{KVERREL}%{?1:\.%{1}}/.*|/.*%%{KVERREL}%{?1:\.%{1}}(\.debug)?' -o debuginfo%{?1}.list}\ +%{nil} + +# +# This macro creates a kernel--devel package. +# %%kernel_devel_package +# +%define kernel_devel_package() \ +%package %{?1:%{1}-}devel\ +Summary: Development package for building kernel modules to match the %{?2:%{2} }kernel\ +Group: System Environment/Kernel\ +Provides: kernel%{?1:-%{1}}-devel-%{_target_cpu} = %{version}-%{release}\ +Provides: kernel-devel-%{_target_cpu} = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel = %{version}-%{release}%{?1:.%{1}}\ +Provides: kernel-devel-uname-r = %{KVERREL}%{?1:.%{1}}\ +AutoReqProv: no\ +Requires(pre): /usr/bin/find\ +%description -n kernel%{?variant}%{?1:-%{1}}-devel\ +This package provides kernel headers and makefiles sufficient to build modules\ +against the %{?2:%{2} }kernel package.\ +%{nil} + +# +# This macro creates a kernel- and its -devel and -debuginfo too. +# %%define variant_summary The Linux kernel compiled for +# %%kernel_variant_package [-n ] +# +%define kernel_variant_package(n:) \ +%package %1\ +Summary: %{variant_summary}\ +Group: System Environment/Kernel\ +%kernel_reqprovconf\ +%{expand:%%kernel_devel_package %1 %{!?-n:%1}%{?-n:%{-n*}}}\ +%{expand:%%kernel_debuginfo_package %1}\ +%{nil} + + +# First the auxiliary packages of the main kernel package. +%kernel_devel_package +%kernel_debuginfo_package + + +# Now, each variant package. + +%define variant_summary The Linux kernel compiled for SMP machines +%kernel_variant_package -n SMP smp +%description smp +This package includes a SMP version of the Linux kernel. It is +required only on machines with two or more CPUs as well as machines with +hyperthreading technology. + +Install the kernel-smp package if your machine uses two or more CPUs. + + +%define variant_summary The Linux kernel compiled for PAE capable machines +%kernel_variant_package PAE +%description PAE +This package includes a version of the Linux kernel with support for up to +64GB of high memory. 
It requires a CPU with Physical Address Extensions (PAE).
+The non-PAE kernel can only address up to 4GB of memory.
+Install the kernel-PAE package if your machine has more than 4GB of memory.
+
+
+%define variant_summary The Linux kernel compiled with extra debugging enabled for PAE capable machines
+%kernel_variant_package PAEdebug
+Obsoletes: kernel-PAE-debug
+%description PAEdebug
+This package includes a version of the Linux kernel with support for up to
+64GB of high memory. It requires a CPU with Physical Address Extensions (PAE).
+The non-PAE kernel can only address up to 4GB of memory.
+Install the kernel-PAE package if your machine has more than 4GB of memory.
+
+This variant of the kernel has numerous debugging options enabled.
+It should only be installed when trying to gather additional information
+on kernel bugs, as some of these options impact performance noticeably.
+
+
+%define variant_summary The Linux kernel compiled with extra debugging enabled
+%kernel_variant_package debug
+%description debug
+The kernel package contains the Linux kernel (vmlinuz), the core of any
+Linux operating system. The kernel handles the basic functions
+of the operating system: memory allocation, process allocation, device
+input and output, etc.
+
+This variant of the kernel has numerous debugging options enabled.
+It should only be installed when trying to gather additional information
+on kernel bugs, as some of these options impact performance noticeably.
+
+
+%define variant_summary A minimal Linux kernel compiled for crash dumps
+%kernel_variant_package kdump
+%description kdump
+This package includes a kdump version of the Linux kernel. It is
+required only on machines which will use the kexec-based kernel crash dump
+mechanism.
+
+
+%prep
+# do a few sanity-checks for --with *only builds
+%if %{with_baseonly}
+%if !%{with_up}%{with_pae}
+echo "Cannot build --with baseonly, up build is disabled"
+exit 1
+%endif
+%endif
+
+%if %{with_smponly}
+%if !%{with_smp}
+echo "Cannot build --with smponly, smp build is disabled"
+exit 1
+%endif
+%endif
+
+# more sanity checking; do it quietly
+if [ "%{patches}" != "%%{patches}" ] ; then
+  for patch in %{patches} ; do
+    if [ ! -f $patch ] ; then
+      echo "ERROR: Patch ${patch##/*/} listed in specfile but is missing"
+      exit 1
+    fi
+  done
+fi 2>/dev/null
+
+patch_command='patch -p1 -F1 -s'
+ApplyPatch()
+{
+  local patch=$1
+  shift
+  if [ ! -f $RPM_SOURCE_DIR/$patch ]; then
+    exit 1
+  fi
+  if ! egrep "^Patch[0-9]+: $patch\$" %{_specdir}/${RPM_PACKAGE_NAME%%%%%{?variant}}.spec ; then
+    if [ "${patch:0:10}" != "patch-2.6." ] ; then
+      echo "ERROR: Patch $patch not listed as a source patch in specfile"
+      exit 1
+    fi
+  fi 2>/dev/null
+  case "$patch" in
+  *.bz2) bunzip2 < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;;
+  *.gz) gunzip < "$RPM_SOURCE_DIR/$patch" | $patch_command ${1+"$@"} ;;
+  *) $patch_command ${1+"$@"} < "$RPM_SOURCE_DIR/$patch" ;;
+  esac
+}
+
+# don't apply a patch if it's empty (or nearly so: fewer than 10 lines)
+ApplyOptionalPatch()
+{
+  local patch=$1
+  shift
+  if [ ! -f $RPM_SOURCE_DIR/$patch ]; then
+    exit 1
+  fi
+  local C=$(wc -l $RPM_SOURCE_DIR/$patch | awk '{print $1}')
+  if [ "$C" -gt 9 ]; then
+    ApplyPatch $patch ${1+"$@"}
+  fi
+}
+
+# we don't want a .config file when building firmware: it just confuses the build system
+%define build_firmware \
+    mv .config .config.firmware_save \
+    make INSTALL_FW_PATH=$RPM_BUILD_ROOT/lib/firmware firmware_install \
+    mv .config.firmware_save .config
+
+# First we unpack the kernel tarball.
+# If this isn't the first make prep, we use links to the existing clean tarball +# which speeds things up quite a bit. + +# Update to latest upstream. +%if 0%{?released_kernel} +%define vanillaversion 2.6.%{base_sublevel} +# non-released_kernel case +%else +%if 0%{?rcrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev} +%if 0%{?gitrev} +%define vanillaversion 2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev} +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} +%define vanillaversion 2.6.%{base_sublevel}-git%{gitrev} +%endif +%endif +%endif + +# We can share hardlinked source trees by putting a list of +# directory names of the CVS checkouts that we want to share +# with in .shared-srctree. (Full pathnames are required.) +[ -f .shared-srctree ] && sharedirs=$(cat .shared-srctree) + +if [ ! -d kernel-%{kversion}/vanilla-%{vanillaversion} ]; then + + if [ -d kernel-%{kversion}/vanilla-%{kversion} ]; then + + cd kernel-%{kversion} + + # Any vanilla-* directories other than the base one are stale. + for dir in vanilla-*; do + [ "$dir" = vanilla-%{kversion} ] || rm -rf $dir & + done + + else + + # Ok, first time we do a make prep. + rm -f pax_global_header + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{kversion} ]] ; then +%setup -q -n kernel-%{kversion} -c -T + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{kversion} . + else +%setup -q -n kernel-%{kversion} -c + mv linux-%{kversion} vanilla-%{kversion} + fi + + fi + +%if "%{kversion}" != "%{vanillaversion}" + + for sharedir in $sharedirs ; do + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + break + fi + done + if [[ ! -z $sharedir && -d $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} ]] ; then + + cp -rl $sharedir/kernel-%{kversion}/vanilla-%{vanillaversion} . + + else + + cp -rl vanilla-%{kversion} vanilla-%{vanillaversion} + cd vanilla-%{vanillaversion} + +# Update vanilla to the latest upstream. +# (non-released_kernel case only) +%if 0%{?rcrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}.bz2 +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{upstream_sublevel}-rc%{rcrev}-git%{gitrev}.bz2 +%endif +%else +# pre-{base_sublevel+1}-rc1 case +%if 0%{?gitrev} + ApplyPatch patch-2.6.%{base_sublevel}-git%{gitrev}.bz2 +%endif +%endif + + cd .. + + fi + +%endif + +else + # We already have a vanilla dir. + cd kernel-%{kversion} +fi + +if [ -d linux-%{kversion}.%{_target_cpu} ]; then + # Just in case we ctrl-c'd a prep already + rm -rf deleteme.%{_target_cpu} + # Move away the stale away, and delete in background. + mv linux-%{kversion}.%{_target_cpu} deleteme.%{_target_cpu} + rm -rf deleteme.%{_target_cpu} & +fi + +cp -rl vanilla-%{vanillaversion} linux-%{kversion}.%{_target_cpu} + +cd linux-%{kversion}.%{_target_cpu} + +# released_kernel with possible stable updates +%if 0%{?stable_base} +ApplyPatch %{stable_patch_00} +%endif +%if 0%{?stable_rc} +ApplyPatch %{stable_patch_01} +%endif + +%if %{using_upstream_branch} +### BRANCH APPLY ### +%endif + +# Drop some necessary files from the source dir into the buildroot +cp $RPM_SOURCE_DIR/config-* . +cp %{SOURCE15} . 
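+# The .shared-srctree handling above can be exercised roughly like this
+# (the path is only an example, not something this spec requires):
+#   echo /home/builder/other-kernel-checkout > .shared-srctree
+# If that checkout already holds kernel-%{kversion}/vanilla-%{kversion},
+# the tree is reused via "cp -rl", i.e. hardlinked rather than re-extracted,
+# which is what keeps repeated "make prep" runs fast.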
+ +# Dynamically generate kernel .config files from config-* files +make -f %{SOURCE20} VERSION=%{version} configs + +#if a rhel kernel, apply the rhel config options +%if 0%{?rhel} + for i in %{all_arch_configs} + do + mv $i $i.tmp + ./merge.pl config-rhel-generic $i.tmp > $i + rm $i.tmp + done +%endif + +#ApplyOptionalPatch git-linus.diff + +# This patch adds a "make nonint_oldconfig" which is non-interactive and +# also gives a list of missing options at the end. Useful for automated +# builds (as used in the buildsystem). +ApplyPatch linux-2.6-build-nonintconfig.patch + +ApplyPatch linux-2.6-makefile-after_link.patch + +# +# misc small stuff to make things compile +# +ApplyOptionalPatch linux-2.6-compile-fixes.patch + +%if !%{nopatches} + +# revert patches from upstream that conflict or that we get via other means +ApplyOptionalPatch linux-2.6-upstream-reverts.patch -R + +ApplyOptionalPatch git-cpufreq.patch +#ApplyOptionalPatch git-bluetooth.patch + +ApplyPatch linux-2.6-hotfixes.patch + +# Roland's utrace ptrace replacement. +ApplyPatch linux-2.6-tracehook.patch +ApplyPatch linux-2.6-utrace.patch + +ApplyPatch sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch + +ApplyPatch disable-stackprotector-all.patch + +# Architecture patches +# x86(-64) +ApplyPatch via-hwmon-temp-sensor.patch +ApplyPatch linux-2.6-dell-laptop-rfkill-fix.patch + +# +# Intel IOMMU +# +# Quiesce USB host controllers before setting up the IOMMU +ApplyPatch linux-2.6-die-closed-source-bios-muppets-die.patch +# Some performance fixes, unify hardware/software passthrough support, and +# most importantly: notice when the BIOS points us to a region that returns +# all 0xFF, and claims that there's an IOMMU there. +ApplyPatch linux-2.6-intel-iommu-updates.patch +ApplyPatch linux-2.6-iommu-at-zero.patch +ApplyPatch linux-2.6-iommu-dmar-all-1s.patch +# Check for RMRRs which end before they start +ApplyPatch linux-2.6-iommu-another-hp-screwup.patch +# Apply the 'at zero' and 'all 0xFF' sanity checks for intr_remap too +ApplyPatch linux-2.6-iommu-sanity-checks-for-intr-remap-too.patch +# Fix up MMIO BAR for integrated graphics on HP laptops on resume (#536675) +ApplyPatch linux-2.6-iommu-hp-cantiga-resume.patch + +# +# PowerPC +# +### NOT (YET) UPSTREAM: +# The storage alias patch is Fedora-local, and allows the old 'ps3_storage' +# module name to work on upgrades. 
Otherwise, I believe mkinitrd will fail +# to pull the module in, +ApplyPatch linux-2.6-ps3-storage-alias.patch +# Alleviate G5 thermal shutdown problems +ApplyPatch linux-2.6-g5-therm-shutdown.patch +# Provide modalias in sysfs for vio devices +ApplyPatch linux-2.6-vio-modalias.patch +# Work around PCIe bridge setup on iSight +ApplyPatch linux-2.6-imac-transparent-bridge.patch + +# +# SPARC64 +# +ApplyPatch linux-2.6.29-sparc-IOC_TYPECHECK.patch + +# +# Exec shield +# +ApplyPatch linux-2.6-execshield.patch + +# +# bugfixes to drivers and filesystems +# + +# ext4 + +# xfs + +# btrfs +ApplyPatch linux-2.6-btrfs-upstream.patch + +# eCryptfs + +# NFSv4 +ApplyPatch linux-2.6-nfsd4-proots.patch +ApplyPatch linux-2.6-nfs4-ver4opt.patch +ApplyPatch linux-2.6-nfs4-callback-hidden.patch + +# USB +ApplyPatch linux-2.6-driver-level-usb-autosuspend.diff +ApplyPatch linux-2.6-qcserial-autosuspend.diff +ApplyPatch linux-2.6-bluetooth-autosuspend.diff +ApplyPatch linux-2.6-usb-uvc-autosuspend.diff + +# ACPI +ApplyPatch linux-2.6-defaults-acpi-video.patch +ApplyPatch linux-2.6-acpi-video-dos.patch +# cpuidle: Fix the menu governor to boost IO performance +ApplyPatch linux-2.6.31-cpuidle-faster-io.patch +# EC fixes from 2.6.32 (#492699, #525681) +ApplyPatch acpi-ec-merge-irq-and-poll-modes.patch +ApplyPatch acpi-ec-use-burst-mode-only-for-msi-notebooks.patch +ApplyPatch acpi-ec-restart-command-even-if-no-interrupts-from-ec.patch + +# Various low-impact patches to aid debugging. +ApplyPatch linux-2.6-debug-sizeof-structs.patch +ApplyPatch linux-2.6-debug-nmi-timeout.patch +ApplyPatch linux-2.6-debug-taint-vm.patch +ApplyPatch linux-2.6-debug-spinlock-taint.patch +ApplyPatch linux-2.6-debug-vm-would-have-oomkilled.patch +ApplyPatch linux-2.6-debug-always-inline-kzalloc.patch + +# +# PCI +# +# disable message signaled interrupts +ApplyPatch linux-2.6-defaults-pci_no_msi.patch +# update the pciehp driver +#ApplyPatch linux-2.6-pciehp-update.patch +# default to enabling passively listening for hotplug events +#ApplyPatch linux-2.6-defaults-pciehp.patch +# enable ASPM by default on hardware we expect to work +ApplyPatch linux-2.6-defaults-aspm.patch + +# +# SCSI Bits. +# + +# ALSA +# squelch hda_beep by default +ApplyPatch linux-2.6-defaults-alsa-hda-beep-off.patch +ApplyPatch linux-2.6-alsa-improve-hda-powerdown.patch +ApplyPatch hda_intel-prealloc-4mb-dmabuffer.patch +ApplyPatch alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch + +# Networking + +# Misc fixes +# The input layer spews crap no-one cares about. +ApplyPatch linux-2.6-input-kill-stupid-messages.patch + +# stop floppy.ko from autoloading during udev... 
+ApplyPatch die-floppy-die.patch + +# make copy_from_user to a stack slot provable right +# hosed stuff, just drop this close to beta +#ApplyPatch linux-2.6.31-copy_from_user-bounds.patch + +# Get away from having to poll Toshibas +#ApplyPatch linux-2.6-input-fix-toshiba-hotkeys.patch + +ApplyPatch linux-2.6.30-no-pcspkr-modalias.patch + +# Allow to use 480600 baud on 16C950 UARTs +ApplyPatch linux-2.6-serial-460800.patch + +# Silence some useless messages that still get printed with 'quiet' +ApplyPatch linux-2.6-silence-noise.patch +ApplyPatch linux-2.6.30-hush-rom-warning.patch + +# Make fbcon not show the penguins with 'quiet' +ApplyPatch linux-2.6-silence-fbcon-logo.patch + +# Fix the SELinux mprotect checks on executable mappings +#ApplyPatch linux-2.6-selinux-mprotect-checks.patch +# Fix SELinux for sparc +#ApplyPatch linux-2.6-sparc-selinux-mprotect-checks.patch + +# Changes to upstream defaults. + + +# ia64 ata quirk +ApplyPatch linux-2.6-ata-quirk.patch + +# Make it possible to identify non-hotplug SATA ports +ApplyPatch linux-2.6-ahci-export-capabilities.patch + +# prism54: remove pci modinfo device table +ApplyPatch prism54-remove-pci-dev-table.patch + +# ath9k: add fixes suggested by upstream maintainer +ApplyPatch linux-2.6-ath9k-fixes.patch + +# /dev/crash driver. +ApplyPatch linux-2.6-crash-driver.patch + +# Determine cacheline sizes in a generic manner. +ApplyPatch linux-2.6-pci-cacheline-sizing.patch + +# http://www.lirc.org/ +ApplyPatch lirc-2.6.31.patch +# enable IR receiver on Hauppauge HD PVR (v4l-dvb merge pending) +ApplyPatch hdpvr-ir-enable.patch +# tell usbhid to ignore all imon devices (sent upstream 2009.07.31) +ApplyPatch hid-ignore-all-recent-imon-devices.patch + +# Add kernel KSM support +ApplyPatch linux-2.6-ksm.patch +ApplyPatch linux-2.6-ksm-updates.patch +ApplyPatch linux-2.6-ksm-fix-munlock.patch +ApplyPatch linux-2.6-ksm-updates-from-32.patch +# Optimize KVM for KSM support +ApplyPatch linux-2.6-ksm-kvm.patch + +# Assorted Virt Fixes +ApplyPatch linux-2.6-virtio_blk-revert-QUEUE_FLAG_VIRT-addition.patch +ApplyPatch linux-2.6-xen-fix-is_disconnected_device-exists_disconnected_device.patch +ApplyPatch linux-2.6-xen-improvement-to-wait_for_devices.patch +ApplyPatch linux-2.6-xen-increase-device-connection-timeout.patch +ApplyPatch linux-2.6-virtio_blk-add-support-for-cache-flush.patch + +# Fix block I/O errors in KVM +ApplyPatch linux-2.6-block-silently-error-unsupported-empty-barriers-too.patch + +ApplyPatch linux-2.6-e1000-ich9.patch + +# Nouveau DRM + drm fixes +ApplyPatch kms-offb-handoff.patch +ApplyPatch drm-next-b390f944.patch +ApplyPatch drm-radeon-misc-fixes.patch +ApplyPatch drm-radeon-rv410-test-fix.patch +ApplyPatch drm-conservative-fallback-modes.patch +ApplyPatch drm-edid-retry.patch +ApplyPatch drm-edid-header-fixup.patch +ApplyPatch drm-default-mode.patch + +ApplyPatch drm-nouveau.patch +# pm broken on my thinkpad t60p - airlied +#ApplyPatch drm-radeon-pm.patch +ApplyPatch drm-i915-resume-force-mode.patch +ApplyOptionalPatch drm-intel-next.patch +#this appears to be upstream - mjg59? 
+#ApplyPatch drm-intel-pm.patch +ApplyPatch drm-intel-no-tv-hotplug.patch +ApplyPatch drm-i915-fix-tvmode-oops.patch +ApplyPatch drm-i915-fix-sync-to-vbl-when-vga-is-off.patch +#ApplyPatch drm-disable-r600-aspm.patch + +# VGA arb + drm +ApplyPatch linux-2.6-vga-arb.patch +ApplyPatch drm-vga-arb.patch +ApplyPatch drm-radeon-kms-arbiter-return-ignore.patch + +# Lower debug level of fbcon handover messages (rh#538526) +ApplyPatch fbcon-lower-debug.patch + +# linux1394 git patches +# apply if non-empty +ApplyOptionalPatch linux-2.6-firewire-git-update.patch +ApplyOptionalPatch linux-2.6-firewire-git-pending.patch + +# silence the ACPI blacklist code +ApplyPatch linux-2.6-silence-acpi-blacklist.patch + +# V4L/DVB updates/fixes/experimental drivers +# apply if non-empty +ApplyOptionalPatch linux-2.6-v4l-dvb-fixes.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-update.patch +ApplyOptionalPatch linux-2.6-v4l-dvb-experimental.patch + +ApplyPatch v4l-dvb-fix-cx25840-firmware-loading.patch + +# Patches headed upstream +ApplyPatch linux-2.6-rtc-show-hctosys.patch +ApplyPatch linux-2.6-rfkill-all.patch +ApplyPatch linux-2.6-selinux-module-load-perms.patch + +# patches headed for -stable + +# make perf counter API available to userspace (#527264) +ApplyPatch perf-make-perf-counter-h-available-to-userspace.patch + +ApplyPatch improve-resource-counter-scalability.patch + +# fix perf for sysprof +ApplyPatch perf-events-fix-swevent-hrtimer-sampling.patch +ApplyPatch perf-events-dont-generate-events-for-the-idle-task.patch + +# Fix oops in padlock +ApplyPatch crypto-via-padlock-fix-nano-aes.patch + +# tg3 fixes (#527209) +ApplyPatch tg3-01-delay-mdio-bus-init-until-fw-finishes.patch +ApplyPatch tg3-02-fix-tso-test-against-wrong-flags-var.patch +ApplyPatch tg3-03-fix-57780-asic-rev-pcie-link-receiver-errors.patch +ApplyPatch tg3-04-prevent-tx-bd-corruption.patch +ApplyPatch tg3-05-assign-flags-to-fixes-in-start_xmit_dma_bug.patch +ApplyPatch tg3-06-fix-5906-transmit-hangs.patch + +# sched fixes cherry-picked from 2.6.32 +ApplyPatch sched-deal-with-low-load-in-wake-affine.patch +ApplyPatch sched-ensure-child-cant-gain-time-over-its-parent-after-fork.patch +ApplyPatch sched-remove-shortcut-from-select-task-rq-fair.patch +# latency defaults from 2.6.32 +ApplyPatch sched-retune-scheduler-latency-defaults.patch +# fix wakeup latency +ApplyPatch sched-update-the-clock-of-runqueue-select-task-rq-selected.patch + +ApplyPatch highmem-Fix-debug_kmap_atomic-to-also-handle-KM_IRQ_.patch +ApplyPatch highmem-Fix-race-in-debug_kmap_atomic-which-could-ca.patch +ApplyPatch highmem-fix-arm-powerpc-kmap_types.patch + +ApplyPatch dlm-fix-connection-close-handling.patch + +# rhbz#544144 +ApplyPatch ipv4-fix-null-ptr-deref-in-ip_fragment.patch + +# END OF PATCH APPLICATIONS + +%endif + +# Any further pre-build tree manipulations happen here. 
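+# For reference, the per-arch config handling below relies on each generated
+# kernel-%{version}-<arch>[-<variant>].config starting with a comment line
+# that names its arch, e.g. (hypothetical snippet):
+#   # x86_64
+#   CONFIG_64BIT=y
+# "head -1 .config | cut -b 3-" recovers that arch for "make oldconfig", and
+# the same "# <arch>" header is re-emitted at the top of the copy saved
+# under configs/.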
+ +chmod +x scripts/checkpatch.pl + +# only deal with configs if we are going to build for the arch +%ifnarch %nobuildarches + +mkdir configs + +# Remove configs not for the buildarch +for cfg in kernel-%{version}-*.config; do + if [ `echo %{all_arch_configs} | grep -c $cfg` -eq 0 ]; then + rm -f $cfg + fi +done + +%if !%{debugbuildsenabled} +rm -f kernel-%{version}-*debug.config +%endif + +# now run oldconfig over all the config files +for i in *.config +do + mv $i .config + Arch=`head -1 .config | cut -b 3-` + make ARCH=$Arch %{oldconfig_target} + echo "# $Arch" > configs/$i + cat .config >> configs/$i +done +# end of kernel config +%endif + +# get rid of unwanted files resulting from patch fuzz +find . \( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null + +cd .. + +### +### build +### +%build + +%if %{with_sparse} +%define sparse_mflags C=1 +%endif + +%if %{fancy_debuginfo} +# This override tweaks the kernel makefiles so that we run debugedit on an +# object before embedding it. When we later run find-debuginfo.sh, it will +# run debugedit again. The edits it does change the build ID bits embedded +# in the stripped object, but repeating debugedit is a no-op. We do it +# beforehand to get the proper final build ID bits into the embedded image. +# This affects the vDSO images in vmlinux, and the vmlinux image in bzImage. +export AFTER_LINK=\ +'sh -xc "/usr/lib/rpm/debugedit -b $$RPM_BUILD_DIR -d /usr/src/debug -i $@"' +%endif + +cp_vmlinux() +{ + eu-strip --remove-comment -o "$2" "$1" +} + +BuildKernel() { + MakeTarget=$1 + KernelImage=$2 + Flavour=$3 + InstallName=${4:-vmlinuz} + + # Pick the right config file for the kernel we're building + Config=kernel-%{version}-%{_target_cpu}${Flavour:+-${Flavour}}.config + DevelDir=/usr/src/kernels/%{KVERREL}${Flavour:+.${Flavour}} + + # When the bootable image is just the ELF kernel, strip it. + # We already copy the unstripped file into the debuginfo package. + if [ "$KernelImage" = vmlinux ]; then + CopyKernel=cp_vmlinux + else + CopyKernel=cp + fi + + KernelVer=%{version}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}} + echo BUILDING A KERNEL FOR ${Flavour} %{_target_cpu}... + + # make sure EXTRAVERSION says what we want it to say + perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{?stablerev}-%{release}.%{_target_cpu}${Flavour:+.${Flavour}}/" Makefile + + # if pre-rc1 devel kernel, must fix up SUBLEVEL for our versioning scheme + %if !0%{?rcrev} + %if 0%{?gitrev} + perl -p -i -e 's/^SUBLEVEL.*/SUBLEVEL = %{upstream_sublevel}/' Makefile + %endif + %endif + + # and now to start the build process + + make -s mrproper + cp configs/$Config .config + + Arch=`head -1 .config | cut -b 3-` + echo USING ARCH=$Arch + + make -s ARCH=$Arch %{oldconfig_target} > /dev/null + make -s ARCH=$Arch V=1 %{?_smp_mflags} $MakeTarget %{?sparse_mflags} + make -s ARCH=$Arch V=1 %{?_smp_mflags} modules %{?sparse_mflags} || exit 1 + +%if %{with_perftool} + pushd tools/perf +# make sure the scripts are executable... 
won't be in tarball until 2.6.31 :/ + chmod +x util/generate-cmdlist.sh util/PERF-VERSION-GEN + make -s V=1 %{?_smp_mflags} perf + mkdir -p $RPM_BUILD_ROOT/usr/libexec/ + install -m 755 perf $RPM_BUILD_ROOT/usr/libexec/perf.$KernelVer + popd +%endif + + # Start installing the results +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/boot + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/%{image_install_path} +%endif + mkdir -p $RPM_BUILD_ROOT/%{image_install_path} + install -m 644 .config $RPM_BUILD_ROOT/boot/config-$KernelVer + install -m 644 System.map $RPM_BUILD_ROOT/boot/System.map-$KernelVer +%if %{with_dracut} + # We estimate the size of the initramfs because rpm needs to take this size + # into consideration when performing disk space calculations. (See bz #530778) + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initramfs-$KernelVer.img bs=1M count=20 +%else + dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initrd-$KernelVer.img bs=1M count=5 +%endif + if [ -f arch/$Arch/boot/zImage.stub ]; then + cp arch/$Arch/boot/zImage.stub $RPM_BUILD_ROOT/%{image_install_path}/zImage.stub-$KernelVer || : + fi + $CopyKernel $KernelImage \ + $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + chmod 755 $RPM_BUILD_ROOT/%{image_install_path}/$InstallName-$KernelVer + + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer + # Override $(mod-fw) because we don't want it to install any firmware + # We'll do that ourselves with 'make firmware_install' + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT modules_install KERNELRELEASE=$KernelVer mod-fw= +%ifarch %{vdso_arches} + make -s ARCH=$Arch INSTALL_MOD_PATH=$RPM_BUILD_ROOT vdso_install KERNELRELEASE=$KernelVer + if grep '^CONFIG_XEN=y$' .config >/dev/null; then + echo > ldconfig-kernel.conf "\ +# This directive teaches ldconfig to search in nosegneg subdirectories +# and cache the DSOs there with extra bit 0 set in their hwcap match +# fields. In Xen guest kernels, the vDSO tells the dynamic linker to +# search in nosegneg subdirectories and to match this extra hwcap bit +# in the ld.so.cache file. +hwcap 0 nosegneg" + fi + if [ ! -s ldconfig-kernel.conf ]; then + echo > ldconfig-kernel.conf "\ +# Placeholder file, no vDSO hwcap entries used in this kernel." 
+ fi + %{__install} -D -m 444 ldconfig-kernel.conf \ + $RPM_BUILD_ROOT/etc/ld.so.conf.d/kernel-$KernelVer.conf +%endif + + # And save the headers/makefiles etc for building modules against + # + # This all looks scary, but the end result is supposed to be: + # * all arch relevant include/ files + # * all Makefile/Kconfig files + # * all script/ files + + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/source + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + (cd $RPM_BUILD_ROOT/lib/modules/$KernelVer ; ln -s build source) + # dirs for additional modules per module-init-tools, kbuild/modules.txt + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/extra + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/updates + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/weak-updates + # first copy everything + cp --parents `find -type f -name "Makefile*" -o -name "Kconfig*"` $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp Module.symvers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp System.map $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -s Module.markers ]; then + cp Module.markers $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + fi + # then drop all but the needed Makefiles/Kconfig files + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Documentation + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts + rm -rf $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cp .config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + cp -a scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build + if [ -d arch/$Arch/scripts ]; then + cp -a arch/$Arch/scripts $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch} || : + fi + if [ -f arch/$Arch/*lds ]; then + cp -a arch/$Arch/*lds $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/arch/%{_arch}/ || : + fi + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*.o + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/scripts/*/*.o +%ifarch ppc + cp -a --parents arch/powerpc/lib/crtsavres.[So] $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ +%endif + if [ -d arch/%{asmarch}/include ]; then + cp -a --parents arch/%{asmarch}/include $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/ + fi + mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + cd include + cp -a acpi config crypto keys linux math-emu media mtd net pcmcia rdma rxrpc scsi sound trace video drm asm-generic $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + asmdir=$(readlink asm) + cp -a $asmdir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/ + pushd $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include + ln -s $asmdir asm + popd + # Make sure the Makefile and version.h have a matching timestamp so that + # external modules can be built + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/Makefile $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/version.h + touch -r $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/linux/autoconf.h + # Copy .config to include/config/auto.conf so "make prepare" is unnecessary. + cp $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/.config $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/config/auto.conf + cd .. 
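+  # With the build/ tree assembled above, an external module would typically
+  # be compiled against it roughly like this (illustration only, nothing in
+  # this spec runs it):
+  #   make -C /lib/modules/$KernelVer/build M=/path/to/module modules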
+ + # + # save the vmlinux file for kernel debugging into the kernel-debuginfo rpm + # +%if %{with_debuginfo} + mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer + cp vmlinux $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/$KernelVer +%endif + + find $RPM_BUILD_ROOT/lib/modules/$KernelVer -name "*.ko" -type f >modnames + + # mark modules executable so that strip-to-file can strip them + xargs --no-run-if-empty chmod u+x < modnames + + # Generate a list of modules for block and networking. + + fgrep /drivers/ modnames | xargs --no-run-if-empty nm -upA | + sed -n 's,^.*/\([^/]*\.ko\): *U \(.*\)$,\1 \2,p' > drivers.undef + + collect_modules_list() + { + sed -r -n -e "s/^([^ ]+) \\.?($2)\$/\\1/p" drivers.undef | + LC_ALL=C sort -u > $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$1 + } + + collect_modules_list networking \ + 'register_netdev|ieee80211_register_hw|usbnet_probe' + collect_modules_list block \ + 'ata_scsi_ioctl|scsi_add_host|blk_init_queue|register_mtd_blktrans|scsi_esp_register|scsi_register_device_handler' + collect_modules_list drm \ + 'drm_open|drm_init' + collect_modules_list modesetting \ + 'drm_crtc_init' + + # detect missing or incorrect license tags + rm -f modinfo + while read i + do + echo -n "${i#$RPM_BUILD_ROOT/lib/modules/$KernelVer/} " >> modinfo + /sbin/modinfo -l $i >> modinfo + done < modnames + + egrep -v \ + 'GPL( v2)?$|Dual BSD/GPL$|Dual MPL/GPL$|GPL and additional rights$' \ + modinfo && exit 1 + + rm -f modinfo modnames + + # remove files that will be auto generated by depmod at rpm -i time + for i in alias alias.bin ccwmap dep dep.bin ieee1394map inputmap isapnpmap ofmap pcimap seriomap symbols symbols.bin usbmap + do + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$i + done + + # Move the devel headers out of the root file system + mkdir -p $RPM_BUILD_ROOT/usr/src/kernels + mv $RPM_BUILD_ROOT/lib/modules/$KernelVer/build $RPM_BUILD_ROOT/$DevelDir + ln -sf ../../..$DevelDir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build +} + +### +# DO it... +### + +# prepare directories +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT/boot + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_debug} +BuildKernel %make_target %kernel_image debug +%endif + +%if %{with_pae_debug} +BuildKernel %make_target %kernel_image PAEdebug +%endif + +%if %{with_pae} +BuildKernel %make_target %kernel_image PAE +%endif + +%if %{with_up} +BuildKernel %make_target %kernel_image +%endif + +%if %{with_smp} +BuildKernel %make_target %kernel_image smp +%endif + +%if %{with_kdump} +BuildKernel vmlinux vmlinux kdump vmlinux +%endif + +%if %{with_doc} +# Make the HTML and man pages. +# XXX nix %{?_smp_mflags} here, buggy Documentation/*/Makefile! +make htmldocs mandocs || %{doc_build_fail} + +# sometimes non-world-readable files sneak into the kernel source tree +chmod -R a=rX Documentation +find Documentation -type d | xargs chmod u+w +%endif + +%if %{with_perf} +pushd tools/perf +make %{?_smp_mflags} man || %{doc_build_fail} +popd +%endif + +### +### Special hacks for debuginfo subpackages. +### + +# This macro is used by %%install, so we must redefine it before that. 
+%define debug_package %{nil} + +%if %{fancy_debuginfo} +%define __debug_install_post \ + /usr/lib/rpm/find-debuginfo.sh %{debuginfo_args} %{_builddir}/%{?buildsubdir}\ +%{nil} +%endif + +%if %{with_debuginfo} +%ifnarch noarch +%global __debug_package 1 +%files -f debugfiles.list debuginfo-common-%{_target_cpu} +%defattr(-,root,root) +%endif +%endif + +### +### install +### + +%install + +cd linux-%{kversion}.%{_target_cpu} + +%if %{with_doc} +docdir=$RPM_BUILD_ROOT%{_datadir}/doc/kernel-doc-%{rpmversion} +man9dir=$RPM_BUILD_ROOT%{_datadir}/man/man9 + +# copy the source over +mkdir -p $docdir +tar -f - --exclude=man --exclude='.*' -c Documentation | tar xf - -C $docdir + +# Install man pages for the kernel API. +mkdir -p $man9dir +find Documentation/DocBook/man -name '*.9.gz' -print0 | +xargs -0 --no-run-if-empty %{__install} -m 444 -t $man9dir $m +ls $man9dir | grep -q '' || > $man9dir/BROKEN +%endif # with_doc + +# perf docs +%if %{with_perf} +mandir=$RPM_BUILD_ROOT%{_datadir}/man +man1dir=$mandir/man1 +pushd tools/perf/Documentation +make install-man mandir=$mandir +popd + +pushd $man1dir +for d in *.1; do + gzip $d; +done +popd +%endif # with_perf + +# perf shell wrapper +%if %{with_perf} +mkdir -p $RPM_BUILD_ROOT/usr/sbin/ +cp $RPM_SOURCE_DIR/perf $RPM_BUILD_ROOT/usr/sbin/perf +chmod 0755 $RPM_BUILD_ROOT/usr/sbin/perf +mkdir -p $RPM_BUILD_ROOT%{_datadir}/doc/perf +%endif + +%if %{with_headers} +# Install kernel headers +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_install + +# Do headers_check but don't die if it fails. +make ARCH=%{hdrarch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr headers_check \ + > hdrwarnings.txt || : +if grep -q exist hdrwarnings.txt; then + sed s:^$RPM_BUILD_ROOT/usr/include/:: hdrwarnings.txt + # Temporarily cause a build failure if header inconsistencies. + # exit 1 +fi + +find $RPM_BUILD_ROOT/usr/include \ + \( -name .install -o -name .check -o \ + -name ..install.cmd -o -name ..check.cmd \) | xargs rm -f + +# glibc provides scsi headers for itself, for now +rm -rf $RPM_BUILD_ROOT/usr/include/scsi +rm -f $RPM_BUILD_ROOT/usr/include/asm*/atomic.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/io.h +rm -f $RPM_BUILD_ROOT/usr/include/asm*/irq.h +%endif + +%if %{with_firmware} +%{build_firmware} +%endif + +%if %{with_bootwrapper} +make DESTDIR=$RPM_BUILD_ROOT bootwrapper_install WRAPPER_OBJDIR=%{_libdir}/kernel-wrapper WRAPPER_DTSDIR=%{_libdir}/kernel-wrapper/dts +%endif + + +### +### clean +### + +%clean +rm -rf $RPM_BUILD_ROOT + +### +### scripts +### + +# +# This macro defines a %%post script for a kernel*-devel package. +# %%kernel_devel_post [] +# +%define kernel_devel_post() \ +%{expand:%%post %{?1:%{1}-}devel}\ +if [ -f /etc/sysconfig/kernel ]\ +then\ + . /etc/sysconfig/kernel || exit $?\ +fi\ +if [ "$HARDLINK" != "no" -a -x /usr/sbin/hardlink ]\ +then\ + (cd /usr/src/kernels/%{KVERREL}%{?1:.%{1}} &&\ + /usr/bin/find . -type f | while read f; do\ + hardlink -c /usr/src/kernels/*.fc*.*/$f $f\ + done)\ +fi\ +%{nil} + +# This macro defines a %%posttrans script for a kernel package. +# %%kernel_variant_posttrans [] +# More text can follow to go at the end of this variant's %%post. +# +%define kernel_variant_posttrans() \ +%{expand:%%posttrans %{?1}}\ +/sbin/new-kernel-pkg --package kernel%{?1:-%{1}} --rpmposttrans %{KVERREL}%{?1:.%{1}} || exit $?\ +%{nil} + +# +# This macro defines a %%post script for a kernel package and its devel package. +# %%kernel_variant_post [-v ] [-r ] +# More text can follow to go at the end of this variant's %%post. 
+# +%define kernel_variant_post(v:r:) \ +%{expand:%%kernel_devel_post %{?-v*}}\ +%{expand:%%kernel_variant_posttrans %{?-v*}}\ +%{expand:%%post %{?-v*}}\ +%{-r:\ +if [ `uname -i` == "x86_64" -o `uname -i` == "i386" ] &&\ + [ -f /etc/sysconfig/kernel ]; then\ + /bin/sed -r -i -e 's/^DEFAULTKERNEL=%{-r*}$/DEFAULTKERNEL=kernel%{?-v:-%{-v*}}/' /etc/sysconfig/kernel || exit $?\ +fi}\ +%{expand:\ +%if %{with_dracut}\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --dracut --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%else\ +/sbin/new-kernel-pkg --package kernel%{?-v:-%{-v*}} --mkinitrd --depmod --install %{KVERREL}%{?-v:.%{-v*}} || exit $?\ +%endif}\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --add-kernel %{KVERREL}%{?-v*} || exit $?\ +#fi\ +%{nil} + +# +# This macro defines a %%preun script for a kernel package. +# %%kernel_variant_preun +# +%define kernel_variant_preun() \ +%{expand:%%preun %{?1}}\ +/sbin/new-kernel-pkg --rminitrd --rmmoddep --remove %{KVERREL}%{?1:.%{1}} || exit $?\ +#if [ -x /sbin/weak-modules ]\ +#then\ +# /sbin/weak-modules --remove-kernel %{KVERREL}%{?1} || exit $?\ +#fi\ +%{nil} + +%kernel_variant_preun +%ifarch x86_64 +%kernel_variant_post -r (kernel-smp|kernel-xen) +%else +%kernel_variant_post -r kernel-smp +%endif + +%kernel_variant_preun smp +%kernel_variant_post -v smp + +%kernel_variant_preun PAE +%kernel_variant_post -v PAE -r (kernel|kernel-smp|kernel-xen) + +%kernel_variant_preun debug +%kernel_variant_post -v debug + +%kernel_variant_post -v PAEdebug -r (kernel|kernel-smp|kernel-xen) +%kernel_variant_preun PAEdebug + +if [ -x /sbin/ldconfig ] +then + /sbin/ldconfig -X || exit $? +fi + +### +### file lists +### + +%if %{with_headers} +%files headers +%defattr(-,root,root) +/usr/include/* +%endif + +%if %{with_firmware} +%files firmware +%defattr(-,root,root) +/lib/firmware/* +%doc linux-%{kversion}.%{_target_cpu}/firmware/WHENCE +%endif + +%if %{with_bootwrapper} +%files bootwrapper +%defattr(-,root,root) +/usr/sbin/* +%{_libdir}/kernel-wrapper +%endif + +# only some architecture builds need kernel-doc +%if %{with_doc} +%files doc +%defattr(-,root,root) +%{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation/* +%dir %{_datadir}/doc/kernel-doc-%{rpmversion}/Documentation +%dir %{_datadir}/doc/kernel-doc-%{rpmversion} +%{_datadir}/man/man9/* +%endif + +%if %{with_perf} +%files -n perf +%defattr(-,root,root) +%{_datadir}/doc/perf +/usr/sbin/perf +%{_datadir}/man/man1/* +%endif + +# This is %{image_install_path} on an arch where that includes ELF files, +# or empty otherwise. +%define elf_image_install_path %{?kernel_image_elf:%{image_install_path}} + +# +# This macro defines the %%files sections for a kernel package +# and its devel and debuginfo packages. 
+# %%kernel_variant_files [-k vmlinux] +# +%define kernel_variant_files(k:) \ +%if %{1}\ +%{expand:%%files %{?2}}\ +%defattr(-,root,root)\ +/%{image_install_path}/%{?-k:%{-k*}}%{!?-k:vmlinuz}-%{KVERREL}%{?2:.%{2}}\ +/boot/System.map-%{KVERREL}%{?2:.%{2}}\ +%if %{with_perftool}\ +/usr/libexec/perf.%{KVERREL}%{?2:.%{2}}\ +%endif\ +#/boot/symvers-%{KVERREL}%{?2:.%{2}}.gz\ +/boot/config-%{KVERREL}%{?2:.%{2}}\ +%dir /lib/modules/%{KVERREL}%{?2:.%{2}}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/kernel\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/build\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/source\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/extra\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/updates\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/weak-updates\ +%ifarch %{vdso_arches}\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/vdso\ +/etc/ld.so.conf.d/kernel-%{KVERREL}%{?2:.%{2}}.conf\ +%endif\ +/lib/modules/%{KVERREL}%{?2:.%{2}}/modules.*\ +%if %{with_dracut}\ +/boot/initramfs-%{KVERREL}%{?2:.%{2}}.img\ +%else\ +/boot/initrd-%{KVERREL}%{?2:.%{2}}.img\ +%endif\ +%{expand:%%files %{?2:%{2}-}devel}\ +%defattr(-,root,root)\ +%dir /usr/src/kernels\ +%verify(not mtime) /usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%if %{with_debuginfo}\ +%ifnarch noarch\ +%if %{fancy_debuginfo}\ +%{expand:%%files -f debuginfo%{?2}.list %{?2:%{2}-}debuginfo}\ +%else\ +%{expand:%%files %{?2:%{2}-}debuginfo}\ +%endif\ +%defattr(-,root,root)\ +%if !%{fancy_debuginfo}\ +%if "%{elf_image_install_path}" != ""\ +%{debuginfodir}/%{elf_image_install_path}/*-%{KVERREL}%{?2:.%{2}}.debug\ +%endif\ +%{debuginfodir}/lib/modules/%{KVERREL}%{?2:.%{2}}\ +%{debuginfodir}/usr/src/kernels/%{KVERREL}%{?2:.%{2}}\ +%endif\ +%endif\ +%endif\ +%endif\ +%{nil} + + +%kernel_variant_files %{with_up} +%kernel_variant_files %{with_smp} smp +%kernel_variant_files %{with_debug} debug +%kernel_variant_files %{with_pae} PAE +%kernel_variant_files %{with_pae_debug} PAEdebug +%kernel_variant_files -k vmlinux %{with_kdump} kdump + +# plz don't put in a version string unless you're going to tag +# and build. + +%changelog +* Thu Dec 03 2009 Kyle McMartin 2.6.31.6-162 +- ipv4-fix-null-ptr-deref-in-ip_fragment.patch: null ptr deref + bug fix. + +* Thu Dec 03 2009 Dave Airlie 2.6.31.6-161 +- rv410 LVDS on resume test fix from AMD (#541562) + +* Wed Dec 02 2009 John W. Linville 2.6.31.6-160 +- ath9k: add fixes suggested by upstream maintainer + +* Wed Dec 02 2009 Dave Airlie 2.6.31.6-159 +- drm-radeon-misc-fixes.patch: r400 LVDS, r600 digital dpms, cursor fix, tv property + +* Wed Dec 02 2009 Ben Skeggs 2.6.31.6-158 +- nouveau: more complete lvds script selection on >=G80 (rh#522690, rh#529859) +- nouveau: more complete tmds script selection on >=G80 (rh#537853) +- nouveau: TV detection fixes + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-157 +- div/0 fix harder (#540593) - also ignore unposted GPUs with no BIOS + +* Tue Dec 01 2009 Dave Airlie 2.6.31.6-156 +- drm-next: fixes LVDS resume on r4xx, div/0 on no bios (#540593) + lockup on tv-out only startup. + +* Mon Nov 30 2009 Kyle McMartin +- drm-i915-fix-sync-to-vbl-when-vga-is-off.patch: add (rhbz#541670) + +* Sun Nov 29 2009 Kyle McMartin +- Drop linux-2.6-sysrq-c.patch, made consistent upstream. 
+ +* Fri Nov 27 2009 Jarod Wilson 2.6.31.6-153 +- add device name to lirc_zilog, fixes issues w/multiple target devices +- add lirc_imon pure input mode support for onboard decode devices + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-152 +- Fix intel_tv_mode_set oops (#540218) + +* Wed Nov 26 2009 David Woodhouse 2.6.31.6-151 +- VT-d: Work around yet more HP BIOS brokenness (#536675) + +* Wed Nov 25 2009 Kyle McMartin +- dlm: fix connection close handling. + Fix by lmb, requested by fabio. + +* Wed Nov 25 2009 David Woodhouse 2.6.31.6-149 +- VT-d: Work around more HP BIOS brokenness. + +* Tue Nov 24 2009 Dave Airlie 2.6.31.6-148 +- radeon: flush HDP cache on rendering wait - fixes r600 rendercheck failure + +* Mon Nov 23 2009 Adam Jackson +- drm-default-mode.patch: Default to 1024x768 to match UMS. (#538761) + +* Mon Nov 23 2009 Roland McGrath 2.6.31.6-146 +- Fix oops in x86-32 kernel's iret handling for bogus user %cs. (#540580) + +* Fri Nov 21 2009 Kyle McMartin +- Fix up ssp' highmem fixes with fixes for arm & ppc. + +* Thu Nov 20 2009 Chris Wright 2.6.31.6-144 +- VT-d: another fallback for another BIOS bug (#524808) + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-142 +- Oops, add new patch to spec file + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-141 +- Lower debug level of fbcon handover messages (rh#538526) + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-140 +- drm-next-44c83571.patch: oops pulled the wrong tree into my f12 tree + +* Thu Nov 19 2009 Ben Skeggs 2.6.31.6-139 +- nouveau: s/r fixes on chipsets using bios opcode 0x87 +- nouveau: fixes to bios opcode 0x8e +- nouveau: hopefully fix nv1x context switching issues (rh#526577) +- nouveau: support for NVA5 (GeForce G220) +- nouveau: fixes for NVAA support + +* Thu Nov 19 2009 Dave Airlie 2.6.31.6-138 +- drm-next-d56672a9.patch: fix some rn50 cloning issues + +* Wed Nov 18 2009 David Woodhouse 2.6.31.6-137 +- Actually force the IOMMU not to be used when we detect the HP/Acer bug. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-136 +- ACPI embedded controller fixes from Fedora 11. + +* Tue Nov 17 2009 Chuck Ebbert 2.6.31.6-135 +- Scheduler fixes and latency tuning patches from F-11. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-134 +- glad to see edid retry patch was compiled. + +* Tue Nov 17 2009 Dave Airlie 2.6.31.6-133 +- drm-next-984d1f3c.patch: rebase with upstream fixes - drop all merged + +* Thu Nov 12 2009 Adam Jackson +- Actually apply the EDID retry patch +- drm-edid-header-fixup.patch: Fix up some broken EDID headers (#534120) + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-130 +- Use ApplyOptionalPatch for v4l and firewire updates. +- Drop unused v4l ABI fix. + +* Thu Nov 12 2009 Chuck Ebbert 2.6.31.6-129 +- Linux 2.6.31.6 +- Drop merged patches: + linux-2.6-iwlwifi-reduce-noise-when-skb-allocation-fails.patch + linux-2.6-libertas-crash.patch + pci-increase-alignment-to-make-more-space.patch + acpi-revert-attach-device-to-handle-early.patch + ahci-revert-restore-sb600-sata-controller-64-bit-dma.patch + acpi-pci-fix-null-pointer-dereference-in-acpi-get-pci-dev.patch + af_unix-fix-deadlock-connecting-to-shutdown-socket.patch + keys-get_instantiation_keyring-should-inc-the-keyring-refcount.patch + netlink-fix-typo-in-initialization.patch + fs-pipe-null-ptr-deref-fix.patch + +* Wed Nov 11 2009 Justin M. Forbes 2.6.31.5-128 +- Fix KSM for i686 users. (#532215) +- Add KSM fixes from 2.6.32 + +* Sun Nov 08 2009 David Woodhouse 2.6.31.5-127 +- Apply fix for fallback when HP/Acer BIOS bug detected (#524808) +- Re-enable DMAR. 
+- Fix libertas crash due to skb pointer bug + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-126 +- Re-enable linux-2.6-die-closed-source-bios-muppets-die.patch, DMAR + still defaulting to off. + +* Sat Nov 07 2009 Kyle McMartin 2.6.31.5-125 +- Disable linux-2.6-die-closed-source-bios-muppets-die.patch and + default DMAR to off (can be re-enabled with intel_iommu=on on the + command line due to last minute issues and reversion upstream.) + +* Thu Nov 05 2009 Jarod Wilson +- Add --with dbgonly rpmbuild option to build only debug kernels + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-122 +- comment out kmap atomic for now, it breaks ppc build + +* Thu Nov 05 2009 Dave Airlie 2.6.31.5-121 +- drm-radeon-fix-agp-resume.patch (#531825) + +* Thu Nov 05 2009 Kyle McMartin +- Add two patches from Soren from mingo/linux-2.6-x86.git to fix + debug_kmap_atomic prints. + +* Thu Nov 05 2009 Ben Skeggs +- nouveau: fix rh#532924 + +* Wed Nov 04 2009 Kyle McMartin +- Make JBD2_DEBUG a toggleable debug setting. Leave it the way it was. + (Double checked resulting configs, don't fret.) + +* Wed Nov 04 2009 Adam Jackson 2.6.31.5-117 +- drm-edid-retry.patch: Try DDC up to four times, like X. (#532957) + +* Wed Nov 04 2009 Chuck Ebbert 2.6.31.5-116 +- tg3 bug fixes (#527209) + +* Wed Nov 04 2009 Kyle McMartin 2.6.31.5-115 +- fs/pipe.c: fix null pointer dereference (CVE-2009-3547) + +* Wed Nov 04 2009 Ben Skeggs 2.6.31.5-114 +- nouveau: provide info userspace needs to handle low memory situations +- nouveau: fix for rh#532711 +- nouveau: add option to provide more debug info for rh#532579 +- patch only so large because of included register rename + +* Tue Nov 03 2009 Adam Jackson 2.6.31.5-113 +- drm-conservative-fallback-modes.patch: When an output is connected but + fails EDID, only add modes with refresh rates <= 60 (#514600) + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-112 +- drm-r600-lenovo-w500-fix.patch: add second patch from upstream fix + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-111 +- drm-r600-lenovo-w500-fix.patch: fix lenovo w500 acpi video kill laptop dead +- drop aspm r600 patch as correct fix should be in 110 + +* Tue Nov 03 2009 Dave Airlie 2.6.31.5-110 +- r600: fix for ring setup RMW issue. + +* Mon Nov 02 2009 John W. Linville 2.6.31.5-109 +- prism54: remove pci modinfo device table (#447047) + +* Mon Nov 02 2009 Chuck Ebbert 2.6.31.5-108 +- Enable acerhdf driver for fan speed control on Acer Aspire One notebook (#532463) + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-107 +- r600: back that out, thanks to yaneti for testing. + +* Mon Nov 02 2009 Dave Airlie 2.6.31.5-106 +- r600: ring size guesswork fix. + +* Fri Oct 30 2009 Dave Airlie 2.6.31.5-105 +- drm-radeon-agp-font-fix.patch: hopefully fix AGP coherency issue + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-104 +- drm-next-ea1495a6.patch: fix rs400 resume on my test box + +* Wed Oct 28 2009 Dave Airlie 2.6.31.5-103 +- drm-next-fc7f7119.patch: fix oops in SS code, fix multi-card, dvo. +- drm-radeon-kms-arbiter-return-ignore.patch: fix arbiter for non-VGA display + +* Tue Oct 27 2009 Chuck Ebbert +- Fix oops in VIA padlock-aes code. + +* Tue Oct 27 2009 Dave Airlie +- kms: add offb handoff patch for ppc to work + +* Tue Oct 27 2009 Ben Skeggs +- drm-nouveau.patch: misc fixes, very initial NVA8 work + +* Tue Oct 27 2009 Dave Airlie +- fix dd command lines + +* Mon Oct 26 2009 Dave Jones +- Make a 20MB initramfs file so rpm gets its diskspace calculations right. 
(#530778) + +* Mon Oct 26 2009 Dave Airlie 2.6.31.5-97 +- drm: rebase to drm-next, drop palette fix, merged upstream +- drm-intel-big-hammer.patch: drop, proper fix in 2.6.31.5 +- drm-disable-r600-aspm.patch: test patch to disable aspm on r600/r700 for now + +* Fri Oct 23 2009 Kyle McMartin 2.6.31.5-96 +- Bump NR_CPUS to 256 on x86_64. +- Add two backports (ugh, just had to go renaming perf counters to events...) + for fixing sysprof with perf. + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-95 +- re enable MSI + +* Fri Oct 23 2009 Dave Airlie 2.6.31.5-94 +- disable debug + stackprotector + +* Fri Oct 23 2009 Chuck Ebbert +- Linux 2.6.31.5 + +* Thu Oct 22 2009 Chuck Ebbert +- Fix exploitable OOPS in keyring code. (CVE-2009-3624) +- Fix kernel memory leak to userspace. (CVE-2009-3612) + +* Thu Oct 22 2009 Dave Airlie 2.6.31.5-91.rc1 +- kms: fix palette + +* Wed Oct 21 2009 Chuck Ebbert +- Disable powersave by default for AC97 audio devices. (#524414) + +* Wed Oct 21 2009 Chuck Ebbert +- Linux 2.6.31.5-rc1 +- Remove the merged HP DC7900 workaround from iommu-updates patch. +- Drop merged patch: + linux-2.6-raidlockdep.patch + +* Mon Oct 19 2009 Kyle McMartin +- af_unix-fix-deadlock-connecting-to-shutdown-socket.patch: fix for + rhbz#529626. + +* Sat Oct 17 2009 Chuck Ebbert +- Replace linux-2.6-bluetooth-autosuspend.diff with upstream version. + +* Fri Oct 16 2009 Josef Bacik +- Update btrfs to latest upstream + +* Fri Oct 16 2009 Chuck Ebbert 2.6.31.4-85 +- Fix another ACPI boot hang (#513680) + +* Fri Oct 16 2009 Ben Skeggs 2.6.31.4-84 +- nouveau: more vbios opcodes, minor fixes, hopeful fix for rh#529292 + +* Wed Oct 14 2009 Roland McGrath 2.6.31.4-83 +- Remove work-around for gcc bug #521991, now fixed. +- Build *docs non-parallel, working around kernel's makefile bugs. + +* Wed Oct 14 2009 Peter Jones +- Add scsi_register_device_handler to modules.block's symbol list so + we'll have scsi device handlers in installer images. + +* Tue Oct 13 2009 Steve Dickson 2.6.31.4-81 +- Fixed hang during NFS installs (bz 528537) + +* Tue Oct 13 2009 Chuck Ebbert 2.6.31.4-80 +- Disable 64-bit DMA on SB600 SATA controllers. + +* Tue Oct 13 2009 Kyle McMartin +- Always build perf docs, regardless of whether we build kernel-doc. + Seems rather unfair to not ship the manpages half the time. + Also, drop BuildRequires %if when not with_doc, the rules about %if + there are f*!&^ing complicated. + +* Mon Oct 12 2009 Kyle McMartin +- Build the perf manpages properly. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-77 +- Fix boot hang with ACPI on some systems. + +* Mon Oct 12 2009 Chuck Ebbert 2.6.31.4-76 +- Linux 2.6.31.4 + +* Mon Oct 12 2009 Kyle McMartin 2.6.31.4-75.rc2 +- improve-resource-counter-scalability.patch: Fix scalability issues + on big machines, requested by prarit. + +* Mon Oct 12 2009 Jarod Wilson +- Fix irq status check bugs in lirc_ene0100 + +* Mon Oct 12 2009 Chuck Ebbert +- Fix 2.6.31 regression that caused device failures with ACPI enabled. + +* Sun Oct 11 2009 Chuck Ebbert +- Linux 2.6.31.4-rc2 +- Drop merged patch: linux-2.6-frace-fixes.patch + +* Sat Oct 10 2009 Chuck Ebbert +- Make performance counter API available to userspace programs (#527264) + +* Sat Oct 10 2009 Dave Jones +- Drop the NX kernel data patch for now. Causes no-boot on some systems. + +* Fri Oct 09 2009 Dave Jones +- Backport two critical ftrace fixes. 
+  ftrace: check for failure for all conversions
+  tracing: correct module boundaries for ftrace_release
+
+* Fri Oct 09 2009 Jarod Wilson
+- Build docs sub-package again
+
+* Thu Oct 08 2009 Kyle McMartin 2.6.31.3-67
+- Linux 2.6.31.3
+- rebase drm-next trivially.
+- dropped merged upstream patches,
+  - linux-2.6-fix-usb-serial-autosuspend.diff
+  - linux-2.6-iwlagn-modify-digital-SVR-for-1000.patch
+  - linux-2.6-iwlwifi-Handle-new-firmware-file-with-ucode-build-number-in-header.patch
+  - linux-2.6-iwlwifi-fix-debugfs-buffer-handling.patch
+  - linux-2.6-iwlwifi-fix-unloading-driver-while-scanning.patch
+  - linux-2.6-iwlwifi-remove-deprecated-6000-series-adapters.patch
+  - linux-2.6-iwlwifi-traverse-linklist-to-find-the-valid-OTP-block.patch
+  - linux-2.6-iwlwifi-update-1000-series-API-version-to-match-firmware.patch
+  - linux-2.6-xen-check-efer-fix.patch
+  - linux-2.6-xen-spinlock-enable-interrupts-only-when-blocking.patch
+  - linux-2.6-xen-spinlock-stronger-barrier.patch
+  - linux-2.6-xen-stack-protector-fix.patch
+  - linux-2.6.31-cpufreq-powernow-k8-oops.patch
+
+* Thu Oct 08 2009 Ben Skeggs
+- ppc: compile nvidiafb as a module only, nvidiafb+nouveau = bang! (rh#491308)
+
+* Thu Oct 08 2009 Ben Skeggs 2.6.31.1-65
+- nouveau: {drm-next,context,fbcon,misc} fixes, connector forcing
+
+* Thu Oct 08 2009 Dave Airlie 2.6.31.1-64
+- rebase latest drm-next, fixes many s/r and r600 problems
+
+* Wed Oct 07 2009 Dave Jones
+- Don't mark the initramfs file as a ghost.
+
+* Wed Oct 07 2009 Dave Jones
+- Enable FUNCTION_GRAPH_TRACER on x86-64.
+
+* Wed Oct 07 2009 Dave Jones
+- Disable CONFIG_IRQSOFF_TRACER on srostedt's recommendation.
+  (Adds unwanted overhead when not in use).
+
+* Tue Oct 6 2009 Justin M. Forbes
+- virtio_blk: add support for cache flush (#526869)
+
+* Fri Oct 2 2009 John W. Linville
+- Backport "iwlwifi: reduce noise when skb allocation fails"
+
+* Wed Sep 30 2009 David Woodhouse
+- Update IOMMU code; mostly a bunch more workarounds for broken BIOSes.
+
+* Wed Sep 30 2009 Dave Airlie 2.6.31.1-56
+- revert all the arjan patches until someone tests them.
+
+* Tue Sep 29 2009 Steve Dickson 2.6.31.1-55
+- Updated the NFS4 pseudo root code with a fix from upstream
+
+* Tue Sep 29 2009 Dave Airlie 2.6.31.1-54
+- Fix broken capabilities that stopped dbus working due to copy from user
+  fixups.
+
+* Tue Sep 29 2009 Dave Airlie 2.6.31.1-53
+- drm-next-4c57edba4.patch: fix r600 dri1 memory leak and r600 bugs
+
+* Mon Sep 28 2009 Dave Jones 2.6.31.1-52
+- Use __builtin_object_size to validate the buffer size for copy_from_user
+  + associated fixes to various copy_from_user invocations.
+
+* Mon Sep 28 2009 Justin M. Forbes 2.6.31.1-50
+- Increase timeout for xen frontend devices to connect.
+
+* Sat Sep 26 2009 Chuck Ebbert 2.6.31.1-49
+- Add Xen spinlock patches to improve scalability.
+
+* Sat Sep 26 2009 Dave Airlie 2.6.31.1-48
+- drm-next-8ef8678c8.patch: fix intel/nouveau kms
+
+* Fri Sep 25 2009 Justin M. Forbes 2.6.31.1-47
+- Fix xen guest booting when NX is disabled (#525290)
+
+* Fri Sep 25 2009 Ben Skeggs 2.6.31.1-46
+- drm-nouveau.patch: cleanups, fixes, pre-G80 s/r fixes, init rework
+
+* Fri Sep 25 2009 Dave Airlie 2.6.31.1-45
+- drm-next-adea4796c.patch: fix r600 glxgears
+
+* Fri Sep 25 2009 Dave Airlie 2.6.31.1-44
+- bump an extra one because I accidentally CVS.
+
+* Thu Sep 24 2009 Dave Airlie 2.6.31.1-42
+- drm-next update - fix r600 s/r, and command line mode picking and r600 tv
+
+* Thu Sep 24 2009 Chuck Ebbert 2.6.31.1-41
+- Linux 2.6.31.1
+- Drop patches merged upstream:
+  linux-2.6-kvm-vmx-check-cpl-before-emulating-debug-register-access.patch
+  linux-2.6-use-__pa_symbol-to-calculate-address-of-C-symbol.patch
+  linux-2.6-kvm-pvmmu-do-not-batch-pte-updates-from-interrupt-context.patch
+  linux-2.6-scsi-sd-fix-oops-during-scanning.patch
+  linux-2.6-scsi-sg-fix-oops-in-error-path.patch
+
+* Thu Sep 24 2009 Chuck Ebbert 2.6.31-40
+- Drop the modules-ro-nx patch: it's causing ftrace to be unable
+  to NOP out module function call tracking. (#524042)
+
+* Wed Sep 23 2009 Kyle McMartin 2.6.31-39
+- touch initramfs-$foo not dracut-$foo.
+
+* Wed Sep 23 2009 Adam Jackson 2.6.31-37
+- drm: Fix various buglets in EDID parsing.
+
+* Mon Sep 21 2009 Ben Skeggs
+- nouveau: more on rh#522649, added some useful info to debugfs
+- lots of coding style cleanups, which is the reason for the huge commit
+
+* Fri Sep 18 2009 Dave Jones
+- %ghost the dracut initramfs file.
+
+* Thu Sep 17 2009 Hans de Goede
+- Now that we have %%post generation of dracut images we do not need to
+  Require dracut-kernel anymore
+
+* Thu Sep 17 2009 Kyle McMartin 2.6.31-33
+- Turn off CONFIG_CC_OPTIMIZE_FOR_SIZE on ppc64 until ld decides to play nice
+  and generate the save/restore stubs.
+
+* Thu Sep 17 2009 Kristian Høgsberg
+- Drop drm page-flip patch for F12.
+
+* Thu Sep 17 2009 Dave Jones
+- cpuidle: Fix the menu governor to boost IO performance.
+
+* Wed Sep 16 2009 John W. Linville
+- Add a few more iwl1000 support patches.
+- Remove support for deprecated iwl6000 parts.
+
+* Wed Sep 16 2009 Eric Paris
+- Do not check CAP_SYS_MODULE when networking tries to autoload a module
+
+* Wed Sep 16 2009 John W. Linville
+- Add iwl1000 support patches.
+
+* Wed Sep 16 2009 Adam Jackson
+- Disable hotplug interrupts on TV connectors on i915.
+
+* Wed Sep 16 2009 Dave Jones
+- Fix NULL deref in powernow-k8 driver.
(korg #13780)
+
+* Wed Sep 16 2009 Hans de Goede
+- Fix lockdep warning (and potential real deadlock) in mdraid10 code,
+  requested for -stable, rh#515471
+
+* Wed Sep 16 2009 Ben Skeggs 2.6.31-17
+- nouveau: potential fix for rh#522649 + misc other fixes
+
+* Tue Sep 15 2009 Chuck Ebbert
+- Add unused-kernel-patches Make target, change some patches to
+  use ApplyOptionalPatch
+
+* Tue Sep 15 2009 Ben Skeggs
+- nouveau: misc fixes to context-related issues, fixes some severe nv4x bugs
+
+* Tue Sep 15 2009 Ben Skeggs
+- nouveau: temporarily disable fbcon accel, it's racing with ttm
+
+* Mon Sep 14 2009 Steve Dickson
+- Added support for -o v4 mount parsing
+
+* Mon Sep 14 2009 Ben Skeggs
+- nouveau: avoid PFIFO IRQ hardlock, misc LVDS mode fixes, nv5x RAMFC cleanup
+
+* Sun Sep 13 2009 Chuck Ebbert
+- SCSI oops fixes requested for -stable
+
+* Fri Sep 11 2009 Dave Jones
+- Apply NX/RO to modules
+
+* Fri Sep 11 2009 Dave Jones
+- Mark kernel data section as NX
+
+* Fri Sep 11 2009 Ben Skeggs
+- nouveau: bring in Matthew Garrett's initial switchable graphics support
+
+* Fri Sep 11 2009 Ben Skeggs
+- nouveau: fixed use of strap-based panel mode when required (rh#522649)
+- nouveau: temporarily block accel on NVAC chipsets (rh#522361, rh#522575)
+
+* Thu Sep 10 2009 Matthew Garrett
+- linux-2.6-ahci-export-capabilities.patch: Backport from upstream
+- linux-2.6-rtc-show-hctosys.patch: Export the hctosys state of an rtc
+- linux-2.6-rfkill-all.patch: Support for keys that toggle all rfkill state
+
+* Thu Sep 10 2009 Ben Skeggs
+- drm-nouveau.patch: add some scaler-only modes for LVDS, GEM/TTM fixes
+
+* Wed Sep 09 2009 Dennis Gilmore 2.6.31-2
+- touch the dracut initrd file when using %%{with_dracut}
+
+* Wed Sep 09 2009 Chuck Ebbert 2.6.31-1
+- Linux 2.6.31
+
+* Wed Sep 09 2009 Chuck Ebbert
+- Enable VXpocket and PDaudioCF PCMCIA sound drivers.
+
+* Wed Sep 09 2009 Hans de Goede
+- Move to %%post generation of dracut initrd, because of GPL issues surrounding
+  shipping a prebuilt initrd
+- Require grubby >= 7.0.4-1, for %%post generation
+
+* Wed Sep 9 2009 Steve Dickson
+- Updated the NFS4 pseudo root code to the latest release.
+
+* Wed Sep 09 2009 Justin M. Forbes
+- Revert virtio_blk to rotational mode.
(#509383)
+
+* Wed Sep 09 2009 Dave Airlie 2.6.31-0.219.rc9.git
+- uggh lost nouveau bits in page flip
+
+* Wed Sep 09 2009 Dave Airlie 2.6.31-0.218.rc9.git2
+- fix r600 oops with page flip patch (#520766)
+
+* Wed Sep 09 2009 Ben Skeggs
+- drm-nouveau.patch: fix display resume on pre-G8x chips
+
+* Wed Sep 09 2009 Ben Skeggs
+- drm-nouveau.patch: add getparam to know using tile_flags is ok for scanout
+
+* Wed Sep 09 2009 Chuck Ebbert
+- 2.6.31-rc9-git2
+
+* Wed Sep 9 2009 Roland McGrath 2.6.31-0.214.rc9.git1
+- compile with -fno-var-tracking-assignments, work around gcc bug #521991
+
+* Wed Sep 09 2009 Dave Airlie 2.6.31-0.213.rc9.git1
+- fix two bugs in r600 kms, fencing + mobile lvds
+
+* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.212.rc9.git1
+- drm-nouveau.patch: fix ppc build
+
+* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.211.rc9.git1
+- drm-nouveau.patch: more misc fixes
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.210.rc9.git1
+- drm-page-flip.patch: rebase again
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.209.rc9.git1
+- drm-next.patch: fix r600 signal interruption return value
+
+* Tue Sep 08 2009 Ben Skeggs 2.6.31-0.208.rc9.git1
+- drm-nouveau.patch: latest upstream + rebase onto drm-next
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.207.rc9.git1
+- drm-vga-arb.patch: update to avoid lockdep + add r600 support
+
+* Tue Sep 08 2009 Dave Airlie 2.6.31-0.206.rc9.git1
+- drm: rebase to drm-next - r600 accel + kms should start working now
+
+* Mon Sep 07 2009 Chuck Ebbert 2.6.31-0.205.rc9.git1
+- 2.6.31-rc9-git1
+- Temporarily hack the drm-next patch so it still applies; the result
+  should still be safe to build.
+
+* Sat Sep 05 2009 Chuck Ebbert 2.6.31-0.204.rc9
+- 2.6.31-rc9
+
+* Fri Sep 04 2009 Chuck Ebbert 2.6.31-0.203.rc8.git2
+- Fix kernel build errors when building firmware by removing the
+  .config file before that step and restoring it afterward.
+
+* Thu Sep 03 2009 Adam Jackson
+- drm-ddc-caching-bug.patch: Empty the connector's mode list when it's
+  disconnected.
+
+* Thu Sep 03 2009 Jarod Wilson
+- Update hdpvr and lirc_zilog drivers for 2.6.31 i2c
+
+* Thu Sep 03 2009 Justin M. Forbes
+- Fix xen guest with stack protector. (#508120)
+- Small kvm fixes.
+
+* Wed Sep 02 2009 Adam Jackson 2.6.31-0.199.rc8.git2
+- drm-intel-pm.patch: Disable by default, too flickery on too many machines.
+  Enable with i915.powersave=1.
+
+* Wed Sep 02 2009 Dave Jones
+- Add missing scriptlet dependency. (#520788)
+
+* Tue Sep 01 2009 Adam Jackson
+- Make DRM less chatty about EDID failures. No one cares.
+
+* Tue Sep 01 2009 Chuck Ebbert
+- 2.6.31-rc8-git2
+- Blank out drm-intel-next: entire contents are now upstream.
+
+* Tue Sep 01 2009 Dave Jones
+- Make firmware buildarch noarch. (Suggested by drago01 on irc)
+
+* Tue Sep 01 2009 Jarod Wilson
+- Fix up lirc_zilog to enable functional IR transmit and receive
+  on the Hauppauge HD PVR
+- Fix audio on PVR-500 when used in same system as HVR-1800 (#480728)
+
+* Sun Aug 30 2009 Chuck Ebbert
+- 2.6.31-rc8-git1
+- Drop linux-2.6-inotify-accounting.patch, merged upstream.
+ +* Sun Aug 30 2009 Jarod Wilson +- fix lirc_imon oops on older devices w/o tx ctrl ep (#520008) + +* Fri Aug 28 2009 Eric Paris 2.6.31-0.190.rc8 +- fix inotify length accounting and send inotify events + +* Fri Aug 28 2009 David Woodhouse +- Enable Solos DSL driver + +* Fri Aug 28 2009 Chuck Ebbert +- 2.6.31-rc8 + +* Thu Aug 27 2009 Chuck Ebbert 2.6.31-0.185.rc7.git6 +- 2.6.31-rc7-git6 +- Drop patch merged upstream: + xen-fb-probe-fix.patch + +* Thu Aug 27 2009 Adam Jackson +- drm-rv710-ucode-fix.patch: Treat successful microcode load on RV710 as, + you know, success. (#519718) + +* Thu Aug 27 2009 Chuck Ebbert +- 2.6.31-rc7-git5 +- Drop patch linux-2.6-ima-leak.patch, now merged upstream. + +* Wed Aug 26 2009 Jarod Wilson +- Fix up hdpvr ir enable patch for use w/modular i2c (David Engel) + +* Wed Aug 26 2009 Eric Paris +- fix iint_cache leak in IMA code + drop the ima=0 patch + +* Wed Aug 26 2009 Justin M. Forbes +- Fix munlock with KSM (#516909) +- Re-enable KSM + +* Wed Aug 26 2009 Chuck Ebbert +- 2.6.31-rc7-git4 +- Drop patches merged upstream: + xen-x86-fix-stackprotect.patch + xen-x86-no-stackprotect.patch + +* Wed Aug 26 2009 Adam Jackson +- drm-intel-next.patch: Update, various output setup fixes. + +* Wed Aug 26 2009 David Woodhouse +- Make WiMAX modular (#512070) + +* Tue Aug 25 2009 Kyle McMartin +- allow-disabling-ima.diff: debugging patch... adds ima=0 kernel + param to disable initialization of IMA. + +* Tue Aug 25 2009 Ben Skeggs 2.6.31-0.174.rc7.git2 +- drm-nouveau.patch: upstream update, pre-nv50 tv-out + misc fixes + +* Tue Aug 25 2009 Chuck Ebbert 2.6.31-0.173.rc7.git2 +- Fix Xen boot (#508120) + +* Tue Aug 25 2009 Dave Airlie +- pull in drm-next tree + rebase around it + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git2 + +* Mon Aug 24 2009 Chuck Ebbert +- 2.6.31-rc7-git1 + +* Sat Aug 22 2009 Chuck Ebbert +- 2.6.31-rc7 + +* Thu Aug 20 2009 Mark McLoughlin +- Disable LZMA for xen (#515831) + +* Thu Aug 20 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Fix up drm-r600-kms.patch +- Drop fix-perf-make-man-failure.patch + +* Wed Aug 19 2009 Chuck Ebbert +- 2.6.31-rc6-git5 +- Revert linux-2.6-debug-vm-would-have-oomkilled.patch to v1.2 + because upstream changes to oom-kill.c were all reverted. + +* Tue Aug 18 2009 Kyle McMartin +- Fix up perf so that it builds docs now that they are fixed. +- with_docs disables perf docs too. be warned. (logic is that the + build deps are (mostly) the same, so if you don't want one, odds are...) + +* Tue Aug 18 2009 Dave Jones +- 2.6.31-rc6-git3 + +* Mon Aug 17 2009 Dave Jones 2.6.31-0.161.rc6.git2 +- 2.6.31-rc6-git2 + +* Mon Aug 17 2009 Chuck Ebbert +- Stop generating the (unused) ppc64-kdump.config file. + +* Mon Aug 17 2009 Jarod Wilson +- Add new lirc driver for built-in ENE0100 device on some laptops + +* Sun Aug 16 2009 Kyle McMartin 2.6.31-0.158.rc6 +- Improve the perf script so it prints something helpful if the + perf binary doesn't exist. + +* Sat Aug 15 2009 Dave Jones 2.6.31-0.157.rc6 +- Disable KSM patches on a hunch. Chasing the "encrypted VGs don't work" bug. + +* Fri Aug 14 2009 Dave Jones 2.6.31-0.155.rc6 +- 2.6.31-rc6 + +* Wed Aug 12 2009 Kyle McMartin +- fix perf. +- move perf to perf.$ver instead of perf-$ver... + +* Wed Aug 12 2009 Dennis Gilmore +- Obsolete kernel-smp on sparc64 +- Require grubby >= 7.0.2-1 since thats what introduces the dracut options we use + +* Wed Aug 12 2009 Kristian Høgsberg +- Fix drm-page-flip.patch to not break radeon kms and to not reset + crtc offset into fb on flip. 
+
+* Wed Aug 12 2009 Adam Jackson
+- Update drm-intel-next patch
+
+* Tue Aug 11 2009 Dennis Gilmore - 2.6.31-0.149.rc5.git3
+- disable building the -smp kernel on sparc64
+- disable building kernel-perf on sparc64, syscalls not supported
+
+* Tue Aug 11 2009 Eric Paris
+- Enable config IMA
+
+* Tue Aug 11 2009 Ben Skeggs
+- nouveau: various cleanups and fixes + more sanity checking in dma paths
+
+* Mon Aug 10 2009 Jarod Wilson
+- Add new device ID to lirc_mceusb (#512483)
+- Fix some lockdep false positives
+- Add support for setting and enabling iMON clock via sysfs
+- Add tunable pad threshold support to lirc_imon
+- Add new pseudo-IR protocol to lirc_imon for universals w/o a pad
+- Fix mouse device support on older iMON devices
+
+* Mon Aug 10 2009 David Woodhouse 2.6.31-0.145.rc5.git3
+- Merge latest Intel IOMMU fixes and BIOS workarounds, re-enable by default.
+
+* Sun Aug 09 2009 Kyle McMartin
+- btusb autosuspend: fix build on !CONFIG_PM by stubbing out
+  suspend/resume methods.
+
+* Sat Aug 08 2009 Dennis Gilmore 2.6.31-0.141.rc5.git3
+- disable kgdb on sparc64 uni-processor kernel
+- set max cpus to 256 on sparc64
+- enable AT keyboard on sparc64
+
+* Fri Aug 07 2009 Justin M. Forbes
+- Apply KSM updates from upstream
+
+* Fri Aug 07 2009 Hans de Goede
+- When building a dracut generic initrd tell new-kernel-pkg to use that
+  instead of running mkinitrd
+
+* Fri Aug 07 2009 Dave Airlie 2.6.31-0.139.rc5.git3
+- drm-r600-kms.patch - update r600 KMS
+- drm-radeon-fixes.patch - patches for queue to Linus
+
+* Thu Aug 06 2009 Justin M. Forbes 2.6.31-0.138.rc5.git3
+- Fix kvm virtio_blk errors (#514901)
+
+* Thu Aug 06 2009 Adam Jackson
+- Hush DRM vblank warnings, they're constant (and harmless) under DRI2.
+
+* Thu Aug 06 2009 Dave Airlie 2.6.31.0.134.rc5.git3
+- fixup vga arb warning at startup and handover between gpus
+
+* Thu Aug 06 2009 Kyle McMartin 2.6.31.0.133.rc5.git3
+- die-floppy-die.patch: it's the 21st century, let's not rely on
+  steam powered technology.
+
+* Wed Aug 05 2009 Dave Airlie 2.6.31.0.132.rc5.git3
+- revert-ftrace-powerpc-snafu.patch - fix ppc build
+
+* Wed Aug 05 2009 Ben Skeggs
+- nouveau: respect nomodeset
+
+* Wed Aug 05 2009 Chuck Ebbert
+- Fix /usr/sbin/perf script. (#515494)
+
+* Wed Aug 05 2009 Dave Jones
+- Fix shift in pci cacheline size printk.
+
+* Wed Aug 05 2009 Dave Airlie 2.6.31.0.128.rc5.git3
+- 2.6.31-rc5-git3
+- drop cpufreq + set memory fixes
+
+* Wed Aug 05 2009 Dave Airlie
+- Add Jerome's initial r600 kms work.
+- rebase arb patch
+
+* Tue Aug 04 2009 Kyle McMartin
+- alsa-tell-user-that-stream-to-be-rewound-is-suspended.patch: apply patch
+  destined for 2.6.32, requested by Lennart.
+
+* Tue Aug 04 2009 Ben Skeggs
+- nouveau: more code share between nv50/
+- update VGA arb patches again
+
+* Mon Aug 03 2009 Adam Jackson
+- Update intel drm from anholt's tree
+- Rebase drm-intel-pm.patch to match
+- Drop gen3 fb hack, merged
+- Drop previous watermark setup change
+
+* Mon Aug 03 2009 Dave Jones 2.6.31-0.122.rc5.git2
+- 2.6.31-rc5-git2
+
+* Mon Aug 03 2009 Adam Jackson
+- (Attempt to) fix watermark setup on Intel 9xx parts.
+ +* Mon Aug 03 2009 Jarod Wilson +- make usbhid driver ignore all recent SoundGraph iMON devices, so the + lirc_imon driver can grab them instead + +* Mon Aug 03 2009 Dave Airlie +- update VGA arb patches + +* Sat Aug 01 2009 David Woodhouse 2.6.31-0.118.rc5 +- Fix boot failures on ppc32 (#514010, #505071) + +* Fri Jul 31 2009 Kyle McMartin 2.6.31-0.117.rc5 +- Linux 2.6.31-rc5 + +* Fri Jul 31 2009 Matthew Garrett +- linux-2.6-dell-laptop-rfkill-fix.patch: Fix up Dell rfkill + +* Fri Jul 31 2009 Ben Skeggs +- nouveau: build against 2.6.31-rc4-git6, fix script parsing on some G8x chips + +* Thu Jul 30 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git6 + New config item: CONFIG_BATTERY_DS2782 is not set +- Add last-minute set_memory_wc() fix from LKML. + +* Thu Jul 30 2009 Matthew Garrett +- drm-intel-pm.patch: Don't reclock external outputs. Increase the reduced + clock slightly to avoid upsetting some hardware. Disable renderclock + adjustment for the moment - it's breaking on some hardware. + +* Thu Jul 30 2009 Ben Skeggs +- nouveau: another DCB 1.5 entry, G80 corruption fixes, small +- fix VGA ARB + kms + +* Wed Jul 29 2009 Dave Jones +- Add support for dracut. (Harald Hoyer) + +* Wed Jul 29 2009 Ben Skeggs +- drm-nouveau.patch: nv50/nva0 tiled scanout fixes, nv40 kms fixes + +* Wed Jul 29 2009 Chuck Ebbert +- Linux 2.6.31-rc4-git3 +- Drop linux-2.6-ecryptfs-overflow-fixes.patch, merged upstream now. + +* Wed Jul 29 2009 Dave Airlie +- update VGA arb patches + +* Tue Jul 28 2009 Adam Jackson +- Remove the pcspkr modalias. If you're still living in 1994, load it + by hand. + +* Tue Jul 28 2009 Eric Sandeen 2.6.31-0.102.rc4.git2 +- Fix eCryptfs overflow issues (CVE-2009-2406, CVE-2009-2407) + +* Tue Jul 28 2009 Kyle McMartin 2.6.31-0.101.rc4.git2 +- 2.6.31-rc4-git2 +- rebase linux-2.6-fix-usb-serial-autosuspend.diff +- config changes: + - USB_GSPCA_SN9C20X=m (_EVDEV=y) + +* Tue Jul 28 2009 Ben Skeggs +- drm-nouveau.patch: cleanup userspace API, various bugfixes. + Looks worse than it is, register macros got cleaned up, which + touches pretty much everywhere.. + +* Mon Jul 27 2009 Adam Jackson +- Warn quieter about not finding PCI bus parents for ROM BARs, they're + not usually needed and there's nothing you can do about it anyway. + +* Mon Jul 27 2009 Matthew Garrett +- linux-2.6-alsa-improve-hda-powerdown.patch - attempt to reduce audio glitches + caused by HDA powerdown +- disable CONFIG_DEBUG_KOBJECT again for now, since it produces huge dmesg spew + +* Mon Jul 27 2009 Dave Airlie +- update vga arb code + +* Mon Jul 27 2009 Matthew Garrett +- drm-intel-pm.patch - Add runtime PM for Intel graphics + +* Fri Jul 24 2009 Kristian Høgsberg +- Add drm-page-flip.patch to support vsynced page flipping on intel + chipsets. +- Really add patch. +- Fix patch to not break nouveau. + +* Fri Jul 24 2009 Chuck Ebbert +- Enable CONFIG_DEBUG_KOBJECT in debug kernels. (#513606) + +* Thu Jul 23 2009 Kyle McMartin +- perf BuildRequires binutils-devel now. + +* Thu Jul 23 2009 Justin M. Forbes +- Add KSM support + +* Thu Jul 23 2009 Kyle McMartin 2.6.31-0.87.rc4 +- Linux 2.6.31-rc4 +- config changes: + - USB_CDC_PHONET=m [all] + - EVENT_PROFILE=y [i386, x86_64, powerpc, s390] + +* Wed Jul 22 2009 Tom "spot" Callaway +- We have to override the new %%install behavior because, well... the kernel is special. 
+ +* Wed Jul 22 2009 Dave Jones +- 2.6.31-rc3-git5 + +* Wed Jul 22 2009 Ben Skeggs 2.6.31-0.82.rc3.git4 +- Enable KMS for nouveau + +* Wed Jul 22 2009 Ben Skeggs +- Update nouveau from upstream (initial suspend/resume + misc bugfixes) + +* Mon Jul 20 2009 Adam Jackson +- Disable VGA arbiter patches for a moment + +* Mon Jul 20 2009 Adam Jackson +- Revive 4k framebuffers for intel gen3 + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.78.rc3.git4 +- Enable CONFIG_RTC_HCTOSYS (#489494) + +* Mon Jul 20 2009 Dave Jones 2.6.31-0.77.rc3.git4 +- Don't build 586 kernels any more. + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.75.rc3.git4 +- build a 'full' package on i686 (Bill Nottingham) + +* Sun Jul 19 2009 Dave Jones 2.6.31-0.74.rc3.git4 +- 2.6.31-rc3-git4 + +* Sat Jul 18 2009 Matthew Garrett +- linux-2.6-driver-level-usb-autosuspend.diff - allow drivers to enable autopm +- linux-2.6-fix-usb-serial-autosuspend.diff - fix generic usb-serial autopm +- linux-2.6-qcserial-autosuspend.diff - enable autopm by default on qcserial +- linux-2.6-bluetooth-autosuspend.diff - enable autopm by default on btusb +- linux-2.6-usb-uvc-autosuspend.diff - enable autopm by default on uvc + +* Thu Jul 16 2009 Chuck Ebbert +- 2.6.31-rc3-git3 + +* Thu Jul 16 2009 Matthew Garrett +- linux-2.6-defaults-aspm.patch - default ASPM to on for PCIe >= 1.1 hardware + +* Thu Jul 16 2009 Dave Airlie 2.6.31-0.69.rc3 +- linux-2.6-vga-arb.patch - add VGA arbiter. +- drm-vga-arb.patch - add VGA arbiter support to drm + +* Tue Jul 14 2009 Kyle McMartin 2.6.31-0.68-rc3 +- 2.6.31-rc3 +- config changes: + - RTL8192SU is not set, (staging) + +* Mon Jul 13 2009 Kyle McMartin 2.6.31-0.67.rc2.git9 +- 2.6.31-rc2-git9 +- config changes: + - BLK_DEV_OSD=m + +* Mon Jul 13 2009 Ben Skeggs +- drm-nouveau.patch: update from upstream + +* Fri Jul 10 2009 Chuck Ebbert +- 2.6.31-rc2-git6 +- Drop dmadebug-spinlock patch -- merged upstream. + +* Fri Jul 10 2009 Dave Jones 2.6.31-0.64.rc2.git5 +- Don't jump through hoops that ppc powerbooks have to on sensible systems + in cpufreq_suspend. + +* Fri Jul 10 2009 Dave Jones +- 2.6.31-rc2-git5 + +* Thu Jul 09 2009 Dave Jones 2.6.31-0.62.rc2.git4 +- Use correct spinlock initialization in dma-debug + +* Thu Jul 09 2009 Chuck Ebbert 2.6.31-0.61.rc2.git4 +- 2.6.31-rc2-git4 + +* Thu Jul 09 2009 Jarod Wilson +- Enable IR receiver on the Hauppauge HD PVR +- Trim the changelog, axing everything before 2.6.29 (see cvs + if you still really want to see that far back) + +* Wed Jul 08 2009 Dave Jones +- Enable a bunch of debugging options that were missed somehow. + +* Wed Jul 08 2009 Kyle McMartin +- Bump NR_CPUS on x86_64 to 512. + +* Wed Jul 08 2009 Adam Jackson +- drm-no-gem-on-i8xx.patch: Drop, intel 2D driver requires GEM now. This + should be entertaining. + +* Wed Jul 08 2009 Kyle McMartin +- First cut of /usr/sbin/perf wrapper script and 'perf' + subpackage. + +* Wed Jul 08 2009 Kyle McMartin 2.6.31-0.54.rc2.git2 +- Rebase and re-apply all the Fedora-specific linux-2.6-debug-* + patches. +- Cull a bunch of upstreamed patches from the spec. + +* Wed Jul 08 2009 Steve Dickson +- Added NFSD v4 dynamic pseudo root patch which allows + NFS v3 exports to be mounted by v4 clients. + +* Tue Jul 07 2009 Jarod Wilson +- See if we can't make lirc_streamzap behave better... (#508952) + +* Tue Jul 07 2009 Chuck Ebbert 2.6.31-0.47.rc2.git2 +- 2.6.31-rc2-git2 + +* Tue Jul 07 2009 Jarod Wilson +- Make lirc_i2c actually work with 2.6.31 i2c + +* Mon Jul 06 2009 Chuck Ebbert +- Use LZMA for kernel compression on X86. 
+ +* Mon Jul 06 2009 Jarod Wilson +- Hack up lirc_i2c and lirc_zilog to compile with 2.6.31 i2c + changes. The drivers might not actually be functional now, but + at least they compile again. Will fix later, if need be... + +* Sat Jul 04 2009 Dave Jones 2.6.31-0.42.rc2 +- 2.6.31-rc2 + +* Sat Jul 04 2009 Chuck Ebbert +- 2.6.31-rc1-git11 + +* Fri Jul 03 2009 Hans de Goede +- Disable v4l1 ov511 and quickcam_messenger drivers (obsoleted by + v4l2 gspca subdrivers) + +* Thu Jul 02 2009 Kyle McMartin 2.6.31-0.39.rc1.git9 +- 2.6.31-rc1-git9 +- linux-2.6-dm-fix-exstore-search.patch: similar patch merged upstream. + +* Tue Jun 30 2009 Chuck Ebbert 2.6.31-0.38.rc1.git7 +- 2.6.31-rc1-git7 + +* Tue Jun 30 2009 Dave Jones 2.6.31-0.37.rc1.git5 +- Disable kmemleak. Way too noisy, and not finding any real bugs. + +* Tue Jun 30 2009 Ben Skeggs +- drm-nouveau.patch: match upstream + +* Mon Jun 29 2009 Chuck Ebbert 2.6.31-0.35.rc1.git5 +- 2.6.31-rc1-git5 +- CONFIG_LEDS_LP3944=m + +* Mon Jun 29 2009 Chuck Ebbert +- Try to fix the dm overlay bug for real (#505121) + +* Sat Jun 27 2009 Ben Skeggs 2.6.31-0.33.rc1.git2 +- drm-nouveau.patch: fix conflicts from 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.31.rc1.git2 +- Further improvements to kmemleak + +* Fri Jun 26 2009 Dave Jones 2.6.31-0.30.rc1.git2 +- 2.6.31-rc1-git2 + +* Fri Jun 26 2009 Ben Skeggs +- drm-nouveau.patch: latest upstream + reenable + +* Thu Jun 25 2009 Dave Jones 2.6.31-0.29.rc1 +- Make kmemleak scan process stacks by default. + Should reduce false positives (which does also increase false negatives, + but that's at least less noisy) + +* Wed Jun 24 2009 Kyle McMartin 2.6.31-0.28.rc1 +- 2.6.31-rc1 +- linux-2.6-utrace.patch: rebase on kernel/Makefile changes +- config changes: + - generic: + - CONFIG_DM_LOG_USERSPACE=m + - CONFIG_DM_MULTIPATH_QL=m + - CONFIG_DM_MULTIPATH_ST=m + - CONFIG_BATTERY_MAX17040=m + - CONFIG_I2C_DESIGNWARE is off (depends on clk.h) + +* Wed Jun 24 2009 Kyle McMartin +- Move perf to /usr/libexec/perf-$KernelVer. + +* Wed Jun 24 2009 Kyle McMartin +- config changes: + - generic: + - CONFIG_SCSI_DEBUG=m (was off, requested by davidz) + +* Wed Jun 24 2009 Dave Jones 2.6.31-0.22.rc0.git22 +- 2.6.30-git22 + +* Tue Jun 23 2009 Dave Jones 2.6.31-0.22.rc0.git20 +- 2.6.30-git20 + +* Mon Jun 22 2009 Kyle McMartin 2.6.31-0.24.rc0.git18 +- Enable tools/perf, installed as /bin/perf-$KernelVer. Docs and a /bin/perf + wrapper come next if this builds ok. + +* Mon Jun 22 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: pull in + two fixes from Mike Galbraith from tip.git + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.21.rc0.git18 +- Add patch to possibly fix the pktlen problem on via-velocity. + +* Sun Jun 21 2009 Dave Jones 2.6.31-0.20.rc0.git18 +- 2.6.30-git18 + VIA crypto & mmc patches now upstream. + +* Sun Jun 21 2009 Dave Jones +- Determine cacheline sizes in a generic manner. + +* Sun Jun 21 2009 Chuck Ebbert 2.6.31-0.18.rc0.git17 +- 2.6.30-git17 +- Config changes: + - powerpc32-generic + CONFIG_PERF_COUNTERS=y + - generic + CONFIG_KEYBOARD_LM8323 is not set + CONFIG_MOUSE_SYNAPTICS_I2C=m + CONFIG_TOUCHSCREEN_EETI=m + CONFIG_TOUCHSCREEN_W90X900=m +- Dropped agp-set_memory_ucwb.patch, all fixed upstream now. + +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.17.rc0.git15 +- config changes: + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR=y (switched... chrp fails otherwise, stack + frame size.) 
+ +* Sat Jun 20 2009 Kyle McMartin 2.6.31.0.16.rc0.git15 +- 2.6.30-git15 +- config changes: + - generic: + - CONFIG_LBDAF=y + - staging: + - CONFIG_USB_SERIAL_QUATECH2 is not set + - CONFIG_VT6655 is not set + - CONFIG_USB_CPC is not set + - CONFIG_RDC_17F3101X is not set + - CONFIG_FB_UDL is not set + - ppc32: + - CONFIG_KMETER1=y + - ppc generic: + - CONFIG_PPC_DISABLE_WERROR is not set +- lirc disabled due to i2c detach_client removal. + +* Sat Jun 20 2009 Kyle McMartin +- sched-introduce-SCHED_RESET_ON_FORK-scheduling-policy-flag.patch: add, + queued in tip/sched/core (ca94c442535a44d508c99a77e54f21a59f4fc462) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31.0.15.rc0.git14 +- Fix up ptrace, hopefully. Builds on x86_64 at least. + +* Fri Jun 19 2009 Chuck Ebbert +- linux-2.6-tip.git-203abd67b75f7714ce98ab0cdbd6cfd7ad79dec4.patch + Fixes oops on boot with qemu (#507007) + +* Fri Jun 19 2009 Kyle McMartin 2.6.31-0.13.rc0.git14 +- 2.6.30-git14 + +* Fri Jun 19 2009 Chuck Ebbert +- Fix up the via-sdmmc and via-hwmon-temp-sensor patches. +- Drop VIA Padlock patches merged upstream: + via-rng-enable-64bit.patch + via-padlock-10-enable-64bit.patch + via-padlock-20-add-x86-dependency.patch + +* Thu Jun 18 2009 Kyle McMartin 2.6.31-0.11.rc0.git13 +- 2.6.30-git13 +- config changes: + - arm: + - CONFIG_UACCESS_WITH_MEMCPY is not set + - i686-PAE: + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - ia64: + - CONFIG_RCU_FANOUT=64 + - nodebug: + - CONFIG_DEBUG_KMEMLEAK is not set + - CONFIG_DEBUG_KMEMLEAK_TEST=m + - powerpc: + - CONFIG_CAN_SJA1000_OF_PLATFORM=m + - CONFIG_PPC_EMULATED_STATS=y + - CONFIG_SWIOTLB=y + - CONFIG_RDS is not set (broken on ppc32) + - powerpc32: + - CONFIG_RCU_FANOUT=32 + - powerpc64: + - CONFIG_RCU_FANOUT=64 + - CONFIG_PERF_COUNTERS=y + - s390x: + - CONFIG_RCU_FANOUT=64 + - CONFIG_SECCOMP=y + - CONFIG_PM=y + - CONFIG_HIBERNATION=y + - CONFIG_PM_STD_PARTITION="/dev/jokes" + - sparc64: + - CONFIG_RCU_FANOUT=64 + - x86: + - CONFIG_RCU_FANOUT=32 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_OLD_MCE is not set + - CONFIG_X86_MCE_INTEL=y + - CONFIG_X86_MCE_AMD=y + - CONFIG_X86_ANCIENT_MCE is not set + - CONFIG_X86_MCE_INJECT is not set + - x86_64: + - CONFIG_EDAC_AMD64=m + - CONFIG_EDAC_AMD64_ERROR_INJECTION is not set + - CONFIG_XEN_DEV_EVTCHN=m + - CONFIG_XEN_SYS_HYPERVISOR=y + - CONFIG_RCU_FANOUT=64 + - CONFIG_IOMMU_STRESS is not set + - CONFIG_PERF_COUNTERS=y + - CONFIG_X86_MCE_INJECT is not set + - generic: + - CONFIG_RCU_FANOUT=32 + - CONFIG_MMC_SDHCI_PLTFM=m + - CONFIG_MMC_CB710=m + - CONFIG_CB710_CORE=m + - CONFIG_CB710_DEBUG is not set + - CONFIG_SCSI_MVSAS_DEBUG is not set + - CONFIG_SCSI_BNX2_ISCSI=m + - CONFIG_NETFILTER_XT_MATCH_OSF=m + - CONFIG_RFKILL_INPUT=y (used to be =m, which was invalid) + - CONFIG_DE2104X_DSL=0 + - CONFIG_KS8842 is not set + - CONFIG_CFG80211_DEBUGFS=y + - CONFIG_MAC80211_DEFAULT_PS=y + - CONFIG_IWM=m + - CONFIG_IWM_DEBUG is not set + - CONFIG_RT2800USB=m + - CONFIG_CAN_DEV=m + - CONFIG_CAN_CALC_BITTIMING=y + - CONFIG_CAN_SJA1000=m + - CONFIG_CAN_SJA1000_PLATFORM=m + - CONFIG_CAN_EMS_PCI=m + - CONFIG_CAN_KVASER_PCI=m + - CONFIG_EEPROM_MAX6875=m + - CONFIG_SENSORS_TMP401=m + - CONFIG_MEDIA_SUPPORT=m + - CONFIG_SND_CTXFI=m + - CONFIG_SND_LX6464ES=m + - CONFIG_SND_HDA_CODEC_CA0110=y + - CONFIG_USB_XHCI_HCD=m + - CONFIG_USB_XHCI_HCD_DEBUGGING is not set + - CONFIG_DRAGONRISE_FF=y (used to be =m) + - CONFIG_GREENASIA_FF=y (used to be =m) + - CONFIG_SMARTJOYPLUS_FF=y (used to be =m) + - CONFIG_USB_NET_INT51X1=m + - 
CONFIG_CUSE=m + - CONFIG_FUNCTION_PROFILER=y + - CONFIG_RING_BUFFER_BENCHMARK=m + - CONFIG_REGULATOR_USERSPACE_CONSUMER=m + - CONFIG_REGULATOR_MAX1586=m + - CONFIG_REGULATOR_LP3971=m + - CONFIG_RCU_FANOUT_EXACT is not set + - CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 + - CONFIG_FSNOTIFY=y + - CONFIG_IEEE802154=m + - CONFIG_IEEE802154_DRIVERS=m + - CONFIG_IEEE802154_FAKEHARD=m + - CONFIG_CNIC=m + +* Wed Jun 17 2009 Jarod Wilson +- New lirc_imon hotness, update 2: + * support dual-interface devices with a single lirc device + * directional pad functions as an input device mouse + * touchscreen devices finally properly supported + * support for using MCE/RC-6 protocol remotes + * fix oops in RF remote association code (F10 bug #475496) + * fix re-enabling case/panel buttons and/or knobs +- Add some misc additional lirc_mceusb2 transceiver IDs +- Add missing unregister_chrdev_region() call to lirc_dev exit +- Add it8720 support to lirc_it87 + +* Tue Jun 16 2009 Chuck Ebbert +- Update via-sdmmc driver + +* Mon Jun 15 2009 Jarod Wilson +- Update lirc patches w/new imon hotness + +* Fri Jun 12 2009 Chuck Ebbert +- Update VIA temp sensor and mmc drivers. + +* Fri Jun 12 2009 John W. Linville 2.6.30-6 +- neigh: fix state transition INCOMPLETE->FAILED via Netlink request +- enable CONFIG_ARPD (used by OpenNHRP) + +* Wed Jun 10 2009 Chuck Ebbert +- VIA Nano updates: + Enable Padlock AES encryption and random number generator on x86-64 + Add via-sdmmc and via-cputemp drivers + +* Wed Jun 10 2009 Kyle McMartin 2.6.30-1 +- Linux 2.6.30 rebase. + +* Tue Jun 09 2009 John W. Linville +- Clean-up some wireless bits in config-generic + +* Tue Jun 09 2009 Chuck Ebbert +- Add support for ACPI P-states on VIA processors. +- Disable the e_powersaver driver. + +* Tue Jun 09 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git6 + +* Fri Jun 05 2009 Chuck Ebbert +- Linux 2.6.30-rc8-git1 + +* Wed Jun 03 2009 Kyle McMartin +- Linux 2.6.30-rc8 + +* Tue Jun 2 2009 Roland McGrath +- utrace update (fixes stap PR10185) + +* Tue Jun 02 2009 Dave Jones +- For reasons unknown, RT2X00 driver was being built-in. + Make it modular. + +* Tue Jun 02 2009 Dave Jones +- 2.6.30-rc7-git5 + +* Sat May 30 2009 Dave Jones +- 2.6.30-rc7-git4 + +* Thu May 28 2009 Dave Jones +- 2.6.30-rc7-git2 + +* Tue May 26 2009 Dave Jones +- Various cpufreq patches from git. + +* Tue May 26 2009 Dave Jones +- 2.6.30-rc7-git1 + +* Mon May 25 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: drop patch, issue is fixed upstream. + +* Sat May 23 2009 Dave Jones +- 2.6.30-rc7 + +* Thu May 21 2009 Dave Jones +- 2.6.30-rc6-git6 + +* Wed May 20 2009 Chuck Ebbert +- Enable Divas (formerly Eicon) ISDN drivers on x86_64. (#480837) + +* Wed May 20 2009 Dave Jones +- 2.6.30-rc6-git5 + +* Mon May 18 2009 Dave Jones +- 2.6.30-rc6-git3 + +* Sun May 17 2009 Dave Jones +- 2.6.30-rc6-git2 + +* Sat May 16 2009 Dave Jones +- 2.6.30-rc6 + +* Mon May 11 2009 Kyle McMartin +- Linux 2.6.30-rc5-git1 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc5 + +* Fri May 08 2009 Kyle McMartin +- Linux 2.6.30-rc4-git4 + +* Wed May 06 2009 Kyle McMartin +- Linux 2.6.30-rc4-git3 +- linux-2.6-cdrom-door-status.patch: merged upstream. +- linux-2.6-iwl3945-remove-useless-exports.patch: merged upstream. 
+- linux-2.6-utrace.patch: rebase against changes to fs/proc/array.c +- USB_NET_CDC_EEM=m + +* Fri May 01 2009 Eric Sandeen +- Fix ext4 corruption on partial write into prealloc block + +* Thu Apr 30 2009 Kyle McMartin +- 2.6.30-rc4 + +* Wed Apr 29 2009 Dave Jones +- 2.6.30-rc3-git6 + +* Tue Apr 28 2009 Dave Jones +- 2.6.30-rc3-git4 + +* Tue Apr 28 2009 Chuck Ebbert +- Make the kernel-vanilla package buildable again. +- Allow building with older versions of RPM. + +* Tue Apr 28 2009 Neil Horman +- Backport missing snmp stats (bz 492391) + +* Tue Apr 28 2009 Chuck Ebbert 2.6.30-0.72.rc3.git3 +- Drop unused exports from the iwl3945 driver. + +* Tue Apr 28 2009 Chuck Ebbert +- Linux 2.6.30-rc3-git3 + +* Mon Apr 27 2009 Dave Jones +- 2.6.30-rc3-git2 + +* Sun Apr 26 2009 Chuck Ebbert 2.6.30-0.68.rc3.git1 +- Linux 2.6.30-rc3-git1 + +* Wed Apr 22 2009 Dave Jones 2.6.30-0.67.rc3 +- Disable SYSFS_DEPRECATED on ia64 + +* Wed Apr 22 2009 Kyle McMartin +- Linux 2.6.30-rc3 +- PROC_VMCORE=y: Exports the dump image of crashed + kernel in ELF format + +* Wed Apr 22 2009 Neil Horman +- Enable RELOCATABLE and CRASH_DUMP for powerpc64 +- With this we can remove the -kdump build variant +- for the ppc64 arch + +* Tue Apr 21 2009 Chuck Ebbert +- Don't include the modules.*.bin files in the RPM package. + +* Tue Apr 21 2009 Dave Jones +- 2.6.30-rc2-git7 + +* Mon Apr 20 2009 Dave Jones +- Various s390x config tweaks. (#496596, #496601, #496605, #496607) + +* Mon Apr 20 2009 Dave Jones +- 2.6.30-rc2-git6 + +* Sat Apr 18 2009 Chuck Ebbert +- Set CONFIG_UEVENT_HELPER_PATH to the empty string (#496296) + +* Fri Apr 17 2009 Dave Jones +- 2.6.30-rc2-git3 + +* Thu Apr 16 2009 Kyle McMartin 2.6.30-0.58.rc2.git1 +- 2.6.30-rc2-git1 + +* Wed Apr 15 2009 Kyle McMartin 2.6.30-0.57.rc2 +- 2.6.30-rc2 + +* Tue Apr 14 2009 Kyle McMartin +- 2.6.30-rc1-git7 +- CONFIG_TOUCHSCREEN_AD7879_I2C=m +- CONFIG_STRIP_ASM_SYMS=y, off for -debug + +* Mon Apr 13 2009 Kyle McMartin +- ppc-fix-parport_pc.patch: add from linuxppc-dev@ + +* Mon Apr 13 2009 Kyle McMartin +- execshield: fix build (load_user_cs_desc is 32-bit only in tlb.c) + +* Sun Apr 12 2009 Kyle McMartin +- 2.6.30-rc1-git5 +- revert-fix-modules_install-via-nfs.patch: reverted upstream + +* Thu Apr 09 2009 Kyle McMartin +- actually drop utrace-ftrace from srpm. + +* Thu Apr 09 2009 Kyle McMartin +- 2.6.30-rc1-git2 +- CONFIG_IGBVF=m +- CONFIG_NETFILTER_XT_TARGET_LED=m + +* Thu Apr 09 2009 Dave Jones +- Bring back the /dev/crash driver. (#492803) + +* Wed Apr 08 2009 Dave Jones +- disable MMIOTRACE in non-debug builds (#494584) + +* Wed Apr 08 2009 Kyle McMartin 2.6.30-0.44.rc1 +- 2.6.30-rc1 +- linux-2.6-hwmon-atk0110.patch: drop +- CONFIG_DETECT_HUNG_TASK=y +- # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set + +* Tue Apr 7 2009 Roland McGrath +- utrace update, drop unfinished utrace-ftrace + +* Tue Apr 07 2009 Kyle McMartin +- Linux 2.6.29-git15 +- EXT3_DEFAULTS_TO_ORDERED on for now. +- X86_X2APIC enabled. +- LEDS_LP5521, LEDS_BD2802 off... look not generally relevant. +- LIBFCOE on. 
+ +* Tue Apr 07 2009 Dave Jones +- Enable CONFIG_CIFS_STATS (#494545) + +* Mon Apr 06 2009 Kyle McMartin +- linux-2.6-execshield.patch: rebase for 2.6.30 + +* Mon Apr 06 2009 Kyle McMartin +- Linux 2.6.29-git13 +- drop patches merged upstream: + - fix-ppc-debug_kmap_atomic.patch + - fix-staging-at76.patch + - linux-2.6-acpi-video-didl-intel-outputs.patch + - linux-2.6-acpi-strict-resources.patch + - linux-2.6-sony-laptop-rfkill.patch + - linux-2.6-btrfs-fix-umount-hang.patch + - linux-2.6-fiemap-header-install.patch + - linux-2.6-debug-dma-api.patch + - dma-api-debug-fixes.patch + - linux-2.6-ext4-flush-on-close.patch + - linux-2.6-relatime-by-default.patch + - linux-2.6-pci-sysfs-remove-id.patch + - linux-2.6-scsi-cpqarray-set-master.patch + - alsa-rewrite-hw_ptr-updaters.patch + - alsa-pcm-always-reset-invalid-position.patch + - alsa-pcm-fix-delta-calc-at-overlap.patch + - alsa-pcm-safer-boundary-checks.patch + - linux-2.6-input-hid-extra-gamepad.patch + - linux-2.6-ipw2x00-age-scan-results-on-resume.patch + - linux-2.6-dropwatch-protocol.patch + - linux-2.6-net-fix-gro-bug.patch + - linux-2.6-net-fix-another-gro-bug.patch + - linux-2.6-net-xfrm-fix-spin-unlock.patch + - linux-2.6.29-pat-change-is_linear_pfn_mapping-to-not-use-vm_pgoff.patch + - linux-2.6.29-pat-pci-change-prot-for-inherit.patch + +* Thu Apr 02 2009 Josef Bacik +- linux-2.6-btrfs-fix-umount-hang.patch: fix umount hang on btrfs + +* Thu Apr 02 2009 Kyle McMartin +- fix-ppc-debug_kmap_atomic.patch: fix build failures on ppc. + +* Thu Apr 02 2009 Kyle McMartin +- Linux 2.6.29-git9 + +* Tue Mar 31 2009 Kyle McMartin +- rds-only-on-64-bit-or-x86.patch: add +- at76-netdev_ops.patch: add + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git8 +- linux-2.6-net-fix-another-gro-bug.patch: upstream. + +* Tue Mar 31 2009 Eric Sandeen +- add fiemap.h to kernel-headers +- build ext4 (and jbd2 and crc16) into the kernel + +* Tue Mar 31 2009 Kyle McMartin +- Linux 2.6.29-git7 +- fix-staging-at76.patch: pull patch from linux-wireless to fix... + +* Mon Mar 30 2009 Kyle McMartin 2.6.30-0.28.rc0.git6 +- Linux 2.6.29-git6 +- Bunch of stuff disabled, most merged, some needs rebasing. + +* Mon Mar 30 2009 Chuck Ebbert +- Make the .shared-srctree file a list so more than two checkouts + can share source files. + +* Mon Mar 30 2009 Chuck Ebbert +- Separate PAT fixes that are headed for -stable from our out-of-tree ones. + +* Mon Mar 30 2009 Dave Jones +- Make io schedulers selectable at boot time again. (#492817) + +* Mon Mar 30 2009 Dave Jones +- Add a strict-devmem=0 boot argument (#492803) + +* Mon Mar 30 2009 Adam Jackson +- linux-2.6.29-pat-fixes.patch: Fix PAT/GTT interaction + +* Mon Mar 30 2009 Mauro Carvalho Chehab +- some fixes of troubles caused by v4l2 subdev conversion + +* Mon Mar 30 2009 Mark McLoughlin 2.6.29-21 +- Fix guest->remote network stall with virtio/GSO (#490266) + +* Mon Mar 30 2009 Ben Skeggs +- drm-nouveau.patch + - rewrite nouveau PCI(E) GART functions, should fix rh#492492 + - kms: kernel option to allow dual-link dvi + - modinfo descriptions for module parameters + +* Sun Mar 29 2009 Mauro Carvalho Chehab +- more v4l/dvb updates: v4l subdev conversion and some driver improvements + +* Sun Mar 29 2009 Chuck Ebbert +- More fixes for ALSA hardware pointer updating. 
+ +* Sat Mar 28 2009 Mauro Carvalho Chehab +- linux-2.6-revert-dvb-net-kabi-change.patch: attempt to fix dvb net breakage +- update v4l fixes patch to reflect what's ready for 2.6.30 +- update v4l devel patch to reflect what will be kept on linux-next for a while + +* Fri Mar 27 2009 Chuck Ebbert 2.6.29-16 +- Fix 2.6.29 networking lockups. +- Fix locking in net/xfrm/xfrm_state.c (#489764) + +* Fri Mar 27 2009 Ben Skeggs +- drm-nouveau.patch: do nothing for dac_{prepare,commit}, it's useless + and breaks some things in strange ways. + +* Fri Mar 27 2009 Ben Skeggs +- nv50: clear 0x1900/8 on init, possible fix for rh#492240 +- forcibly disable GEM also if KMS requested where not supported +- inform the user if we disable KMS because of it not being supported + +* Thu Mar 26 2009 Matthew Garrett +- linux-2.6-relatime-by-default.patch: Backport relatime code from 2.6.30 + +* Thu Mar 26 2009 Dave Jones +- Check for modesetting enabled before forcing mode on 915. (#490336) + +* Thu Mar 26 2009 Dave Jones +- Set kernel-PAE as default in grub. (#487578) + +* Thu Mar 26 2009 Dave Jones +- Enable CONFIG_MOUSE_PS2_ELANTECH (#492163) + +* Thu Mar 26 2009 Kyle McMartin +- linux-2.6-v4l-pvrusb2-fixes.patch: fix build for uncle steve. + +* Thu Mar 26 2009 Mauro Carvalho Chehab +- Move all 2.6.30 stuff into linux-2.6-v4l-dvb-fixes.patch, in + preparation for upstream pull; +- Added two new drivers: gspca sq905c and DVB Intel ce6230 +- Updated to the latest v4l-dvb drivers. + +* Wed Mar 25 2009 Mauro Carvalho Chehab +- remove duplicated Cinergy T2 entry at config-generic + +* Wed Mar 25 2009 Neil Horman +- Add dropmonitor/dropwatch protocol from 2.6.30 + +* Wed Mar 25 2009 Kyle McMartin +- alsa-rewrite-hw_ptr-updaters.patch: snd_pcm_update_hw_ptr() tries to + detect the unexpected hwptr jumps more strictly to avoid the position + mess-up, which often results in the bad quality I/O with pulseaudio. + +* Wed Mar 25 2009 Ben Skeggs +- drm-nouveau.patch: idle channels better before destroying them + +* Tue Mar 24 2009 Kyle McMartin +- Disable DMAR by default until suspend & resume is fixed. + +* Tue Mar 24 2009 Josef Bacik +- fsync replay fixes for btrfs + +* Mon Mar 23 2009 Dave Jones +- 2.6.29 + +### +# The following Emacs magic makes C-c C-e use UTC dates. 
+# Local Variables: +# rpm-change-log-uses-utc: t +# End: +### diff --git a/original/linux-2.6-btrfs-upstream.patch b/original/linux-2.6-btrfs-upstream.patch new file mode 100644 index 000000000..46ae7fff5 --- /dev/null +++ b/original/linux-2.6-btrfs-upstream.patch @@ -0,0 +1,10829 @@ +diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c +index f128427..3616042 100644 +--- a/fs/btrfs/acl.c ++++ b/fs/btrfs/acl.c +@@ -27,7 +27,7 @@ + #include "btrfs_inode.h" + #include "xattr.h" + +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + + static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) + { +@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { + .set = btrfs_xattr_acl_access_set, + }; + +-#else /* CONFIG_FS_POSIX_ACL */ ++#else /* CONFIG_BTRFS_FS_POSIX_ACL */ + + int btrfs_acl_chmod(struct inode *inode) + { +@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) + return 0; + } + +-#endif /* CONFIG_FS_POSIX_ACL */ ++#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ +diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c +index 019e8af..c0861e7 100644 +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -48,6 +48,9 @@ struct btrfs_worker_thread { + /* number of things on the pending list */ + atomic_t num_pending; + ++ /* reference counter for this struct */ ++ atomic_t refs; ++ + unsigned long sequence; + + /* protects the pending list. */ +@@ -61,6 +64,51 @@ struct btrfs_worker_thread { + }; + + /* ++ * btrfs_start_workers uses kthread_run, which can block waiting for memory ++ * for a very long time. It will actually throttle on page writeback, ++ * and so it may not make progress until after our btrfs worker threads ++ * process all of the pending work structs in their queue ++ * ++ * This means we can't use btrfs_start_workers from inside a btrfs worker ++ * thread that is used as part of cleaning dirty memory, which pretty much ++ * involves all of the worker threads. ++ * ++ * Instead we have a helper queue who never has more than one thread ++ * where we scheduler thread start operations. This worker_start struct ++ * is used to contain the work and hold a pointer to the queue that needs ++ * another worker. ++ */ ++struct worker_start { ++ struct btrfs_work work; ++ struct btrfs_workers *queue; ++}; ++ ++static void start_new_worker_func(struct btrfs_work *work) ++{ ++ struct worker_start *start; ++ start = container_of(work, struct worker_start, work); ++ btrfs_start_workers(start->queue, 1); ++ kfree(start); ++} ++ ++static int start_new_worker(struct btrfs_workers *queue) ++{ ++ struct worker_start *start; ++ int ret; ++ ++ start = kzalloc(sizeof(*start), GFP_NOFS); ++ if (!start) ++ return -ENOMEM; ++ ++ start->work.func = start_new_worker_func; ++ start->queue = queue; ++ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); ++ if (ret) ++ kfree(start); ++ return ret; ++} ++ ++/* + * helper function to move a thread onto the idle list after it + * has finished some requests. 
+ */ +@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; +- list_move(&worker->worker_list, &worker->workers->idle_list); ++ ++ /* the list may be empty if the worker is just starting */ ++ if (!list_empty(&worker->worker_list)) { ++ list_move(&worker->worker_list, ++ &worker->workers->idle_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } +@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; +- list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ ++ if (!list_empty(&worker->worker_list)) { ++ list_move_tail(&worker->worker_list, ++ &worker->workers->worker_list); ++ } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + } + +-static noinline int run_ordered_completions(struct btrfs_workers *workers, +- struct btrfs_work *work) ++static void check_pending_worker_creates(struct btrfs_worker_thread *worker) + { ++ struct btrfs_workers *workers = worker->workers; + unsigned long flags; + ++ rmb(); ++ if (!workers->atomic_start_pending) ++ return; ++ ++ spin_lock_irqsave(&workers->lock, flags); ++ if (!workers->atomic_start_pending) ++ goto out; ++ ++ workers->atomic_start_pending = 0; ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) ++ goto out; ++ ++ workers->num_workers_starting += 1; ++ spin_unlock_irqrestore(&workers->lock, flags); ++ start_new_worker(workers); ++ return; ++ ++out: ++ spin_unlock_irqrestore(&workers->lock, flags); ++} ++ ++static noinline int run_ordered_completions(struct btrfs_workers *workers, ++ struct btrfs_work *work) ++{ + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + + while (1) { + if (!list_empty(&workers->prio_order_list)) { +@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ +- spin_lock_irqsave(&workers->lock, flags); ++ spin_lock(&workers->order_lock); + list_del(&work->order_list); + work->ordered_free(work); + } + +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + return 0; + } + ++static void put_worker(struct btrfs_worker_thread *worker) ++{ ++ if (atomic_dec_and_test(&worker->refs)) ++ kfree(worker); ++} ++ ++static int try_worker_shutdown(struct btrfs_worker_thread *worker) ++{ ++ int freeit = 0; ++ ++ spin_lock_irq(&worker->lock); ++ spin_lock(&worker->workers->lock); ++ if (worker->workers->num_workers > 1 && ++ worker->idle && ++ !worker->working && ++ !list_empty(&worker->worker_list) && ++ list_empty(&worker->prio_pending) && ++ list_empty(&worker->pending) && ++ atomic_read(&worker->num_pending) == 0) { ++ freeit = 1; ++ list_del_init(&worker->worker_list); ++ worker->workers->num_workers--; ++ } ++ spin_unlock(&worker->workers->lock); ++ spin_unlock_irq(&worker->lock); ++ ++ if (freeit) ++ put_worker(worker); ++ return freeit; ++} ++ ++static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, ++ struct list_head *prio_head, ++ struct 
list_head *head) ++{ ++ struct btrfs_work *work = NULL; ++ struct list_head *cur = NULL; ++ ++ if(!list_empty(prio_head)) ++ cur = prio_head->next; ++ ++ smp_mb(); ++ if (!list_empty(&worker->prio_pending)) ++ goto refill; ++ ++ if (!list_empty(head)) ++ cur = head->next; ++ ++ if (cur) ++ goto out; ++ ++refill: ++ spin_lock_irq(&worker->lock); ++ list_splice_tail_init(&worker->prio_pending, prio_head); ++ list_splice_tail_init(&worker->pending, head); ++ ++ if (!list_empty(prio_head)) ++ cur = prio_head->next; ++ else if (!list_empty(head)) ++ cur = head->next; ++ spin_unlock_irq(&worker->lock); ++ ++ if (!cur) ++ goto out_fail; ++ ++out: ++ work = list_entry(cur, struct btrfs_work, list); ++ ++out_fail: ++ return work; ++} ++ + /* + * main loop for servicing work items + */ + static int worker_loop(void *arg) + { + struct btrfs_worker_thread *worker = arg; +- struct list_head *cur; ++ struct list_head head; ++ struct list_head prio_head; + struct btrfs_work *work; ++ ++ INIT_LIST_HEAD(&head); ++ INIT_LIST_HEAD(&prio_head); ++ + do { +- spin_lock_irq(&worker->lock); +-again_locked: ++again: + while (1) { +- if (!list_empty(&worker->prio_pending)) +- cur = worker->prio_pending.next; +- else if (!list_empty(&worker->pending)) +- cur = worker->pending.next; +- else ++ ++ ++ work = get_next_work(worker, &prio_head, &head); ++ if (!work) + break; + +- work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; +- spin_unlock_irq(&worker->lock); + + work->func(work); + +@@ -175,9 +329,13 @@ again_locked: + */ + run_ordered_completions(worker->workers, work); + +- spin_lock_irq(&worker->lock); +- check_idle_worker(worker); ++ check_pending_worker_creates(worker); ++ + } ++ ++ spin_lock_irq(&worker->lock); ++ check_idle_worker(worker); ++ + if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); +@@ -216,8 +374,10 @@ again_locked: + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending) || +- !list_empty(&worker->prio_pending)) +- goto again_locked; ++ !list_empty(&worker->prio_pending)) { ++ spin_unlock_irq(&worker->lock); ++ goto again; ++ } + + /* + * this makes sure we get a wakeup when someone +@@ -226,8 +386,13 @@ again_locked: + worker->working = 0; + spin_unlock_irq(&worker->lock); + +- if (!kthread_should_stop()) +- schedule(); ++ if (!kthread_should_stop()) { ++ schedule_timeout(HZ * 120); ++ if (!worker->working && ++ try_worker_shutdown(worker)) { ++ return 0; ++ } ++ } + } + __set_current_state(TASK_RUNNING); + } +@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers) + { + struct list_head *cur; + struct btrfs_worker_thread *worker; ++ int can_stop; + ++ spin_lock_irq(&workers->lock); + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); +- kthread_stop(worker->task); +- list_del(&worker->worker_list); +- kfree(worker); ++ ++ atomic_inc(&worker->refs); ++ workers->num_workers -= 1; ++ if (!list_empty(&worker->worker_list)) { ++ list_del_init(&worker->worker_list); ++ put_worker(worker); ++ can_stop = 1; ++ } else ++ can_stop = 0; ++ spin_unlock_irq(&workers->lock); ++ if (can_stop) ++ kthread_stop(worker->task); ++ spin_lock_irq(&workers->lock); ++ put_worker(worker); + } ++ spin_unlock_irq(&workers->lock); + return 0; + } + + /* + * simple 
init on struct btrfs_workers + */ +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_helper) + { + workers->num_workers = 0; ++ workers->num_workers_starting = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); + spin_lock_init(&workers->lock); ++ spin_lock_init(&workers->order_lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; ++ workers->atomic_start_pending = 0; ++ workers->atomic_worker_start = async_helper; + } + + /* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++static int __btrfs_start_workers(struct btrfs_workers *workers, ++ int num_workers) + { + struct btrfs_worker_thread *worker; + int ret = 0; +@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); ++ + atomic_set(&worker->num_pending, 0); ++ atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, +@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) + kfree(worker); + goto fail; + } +- + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; ++ workers->num_workers_starting--; ++ WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + } + return 0; +@@ -316,6 +504,14 @@ fail: + return ret; + } + ++int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ++{ ++ spin_lock_irq(&workers->lock); ++ workers->num_workers_starting += num_workers; ++ spin_unlock_irq(&workers->lock); ++ return __btrfs_start_workers(workers, num_workers); ++} ++ + /* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. 
This can return null if we aren't yet at the thread +@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + struct list_head *next; +- int enforce_min = workers->num_workers < workers->max_workers; ++ int enforce_min; ++ ++ enforce_min = (workers->num_workers + workers->num_workers_starting) < ++ workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the +@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); +- atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) +@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) + { + struct btrfs_worker_thread *worker; + unsigned long flags; ++ struct list_head *fallback; + + again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); +- spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { +- spin_lock_irqsave(&workers->lock, flags); +- if (workers->num_workers >= workers->max_workers) { +- struct list_head *fallback = NULL; +- /* +- * we have failed to find any workers, just +- * return the force one +- */ +- if (!list_empty(&workers->worker_list)) +- fallback = workers->worker_list.next; +- if (!list_empty(&workers->idle_list)) +- fallback = workers->idle_list.next; +- BUG_ON(!fallback); +- worker = list_entry(fallback, +- struct btrfs_worker_thread, worker_list); +- spin_unlock_irqrestore(&workers->lock, flags); ++ if (workers->num_workers + workers->num_workers_starting >= ++ workers->max_workers) { ++ goto fallback; ++ } else if (workers->atomic_worker_start) { ++ workers->atomic_start_pending = 1; ++ goto fallback; + } else { ++ workers->num_workers_starting++; + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ +- btrfs_start_workers(workers, 1); ++ __btrfs_start_workers(workers, 1); + goto again; + } + } ++ goto found; ++ ++fallback: ++ fallback = NULL; ++ /* ++ * we have failed to find any workers, just ++ * return the first one we can find. 
++ */ ++ if (!list_empty(&workers->worker_list)) ++ fallback = workers->worker_list.next; ++ if (!list_empty(&workers->idle_list)) ++ fallback = workers->idle_list.next; ++ BUG_ON(!fallback); ++ worker = list_entry(fallback, ++ struct btrfs_worker_thread, worker_list); ++found: ++ /* ++ * this makes sure the worker doesn't exit before it is placed ++ * onto a busy/idle list ++ */ ++ atomic_inc(&worker->num_pending); ++ spin_unlock_irqrestore(&workers->lock, flags); + return worker; + } + +@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work) + spin_lock(&worker->workers->lock); + worker->idle = 0; + list_move_tail(&worker->worker_list, +- &worker->workers->worker_list); ++ &worker->workers->worker_list); + spin_unlock(&worker->workers->lock); + } + if (!worker->working) { +@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work) + worker->working = 1; + } + +- spin_unlock_irqrestore(&worker->lock, flags); + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); + out: + + return 0; +@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + + worker = find_worker(workers); + if (workers->ordered) { +- spin_lock_irqsave(&workers->lock, flags); ++ /* ++ * you're not allowed to do ordered queues from an ++ * interrupt handler ++ */ ++ spin_lock(&workers->order_lock); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } +- spin_unlock_irqrestore(&workers->lock, flags); ++ spin_unlock(&workers->order_lock); + } else { + INIT_LIST_HEAD(&work->order_list); + } +@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); +- atomic_inc(&worker->num_pending); + check_busy_worker(worker); + + /* +@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) + wake = 1; + worker->working = 1; + +- spin_unlock_irqrestore(&worker->lock, flags); +- + if (wake) + wake_up_process(worker->task); ++ spin_unlock_irqrestore(&worker->lock, flags); ++ + out: + return 0; + } +diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h +index 1b511c1..5077746 100644 +--- a/fs/btrfs/async-thread.h ++++ b/fs/btrfs/async-thread.h +@@ -64,6 +64,8 @@ struct btrfs_workers { + /* current number of running workers */ + int num_workers; + ++ int num_workers_starting; ++ + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + +@@ -73,6 +75,16 @@ struct btrfs_workers { + /* force completions in the order they were queued */ + int ordered; + ++ /* more workers required, but in an interrupt handler */ ++ int atomic_start_pending; ++ ++ /* ++ * are we allowed to sleep while starting workers or are we required ++ * to start them at a later time? If we can't sleep, this indicates ++ * which queue we need to use to schedule thread creation. ++ */ ++ struct btrfs_workers *atomic_worker_start; ++ + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. 
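
The atomic_start_pending and atomic_worker_start fields added above exist because the queue-work path can run at interrupt time, where spawning a kthread is not allowed: the queueing code only records that another worker is wanted, and a helper pool that is allowed to sleep performs the actual start later (see the find_worker() change earlier in this patch). What follows is a minimal, self-contained userspace sketch of that deferral idea only, written with POSIX threads instead of kthreads; the names (struct workers, request_worker_atomic, run_pending_starts, deferred_start.c) are hypothetical illustrations, not the btrfs implementation.

#include <pthread.h>
#include <stdio.h>

struct workers {
    pthread_mutex_t lock;           /* in the kernel this role is played by a spinlock */
    int num_workers;                /* threads actually running */
    int atomic_start_pending;       /* a start was requested from a no-sleep path */
};

static void *worker_loop(void *arg)
{
    /* a real pool would pull work items off a list here */
    (void)arg;
    return NULL;
}

/* May sleep: create the thread and account for it. */
static void start_one_worker(struct workers *w)
{
    pthread_t t;

    if (pthread_create(&t, NULL, worker_loop, w) == 0) {
        pthread_detach(t);
        pthread_mutex_lock(&w->lock);
        w->num_workers++;
        pthread_mutex_unlock(&w->lock);
    }
}

/*
 * Stand-in for a caller that must not sleep (e.g. interrupt context):
 * it only records that another worker is wanted instead of creating
 * the thread here.
 */
static void request_worker_atomic(struct workers *w)
{
    pthread_mutex_lock(&w->lock);
    w->atomic_start_pending = 1;
    pthread_mutex_unlock(&w->lock);
}

/* Run later from a context that is allowed to sleep (the helper pool). */
static void run_pending_starts(struct workers *w)
{
    int pending;

    pthread_mutex_lock(&w->lock);
    pending = w->atomic_start_pending;
    w->atomic_start_pending = 0;
    pthread_mutex_unlock(&w->lock);

    if (pending)
        start_one_worker(w);
}

int main(void)
{
    struct workers w = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

    request_worker_atomic(&w);      /* the "can't sleep" path defers */
    run_pending_starts(&w);         /* the helper catches up later */

    pthread_mutex_lock(&w.lock);
    printf("workers running: %d\n", w.num_workers);
    pthread_mutex_unlock(&w.lock);
    return 0;
}

Build with "cc -pthread deferred_start.c" and run; the point is only the split between a path that merely sets a pending flag and a later, sleep-capable path that does the real thread creation, which is the shape the generic_worker helper pool gives btrfs further down in this patch.
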
+@@ -90,6 +102,9 @@ struct btrfs_workers { + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + ++ /* lock for the ordered lists */ ++ spinlock_t order_lock; ++ + /* extra name for this worker, used for current->name */ + char *name; + }; +@@ -97,7 +112,8 @@ struct btrfs_workers { + int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); + int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); + int btrfs_stop_workers(struct btrfs_workers *workers); +-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); ++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, ++ struct btrfs_workers *async_starter); + int btrfs_requeue_work(struct btrfs_work *work); + void btrfs_set_work_high_prio(struct btrfs_work *work); + #endif +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index ea1ea0a..f6783a4 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -86,6 +86,12 @@ struct btrfs_inode { + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; ++ ++ /* ++ * log transid when this inode was last modified ++ */ ++ u64 last_sub_trans; ++ + /* + * transid that last logged this inode + */ +@@ -128,6 +134,16 @@ struct btrfs_inode { + u64 last_unlink_trans; + + /* ++ * Counters to keep track of the number of extent item's we may use due ++ * to delalloc and such. outstanding_extents is the number of extent ++ * items we think we'll end up using, and reserved_extents is the number ++ * of extent items we've reserved metadata for. ++ */ ++ spinlock_t accounting_lock; ++ int reserved_extents; ++ int outstanding_extents; ++ ++ /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the +@@ -138,6 +154,7 @@ struct btrfs_inode { + * of these. 
+ */ + unsigned ordered_data_close:1; ++ unsigned dummy_inode:1; + + struct inode vfs_inode; + }; +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index 9d8ba4d..a11a320 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || +@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 3fdcc05..ec96f3a 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, + int split; + int num_doubles = 0; + ++ l = path->nodes[0]; ++ slot = path->slots[0]; ++ if (extend && data_size + btrfs_item_size_nr(l, slot) + ++ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) ++ return -EOVERFLOW; ++ + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 837435c..e5dd628 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -114,6 +114,10 @@ struct btrfs_ordered_sum; + */ + #define BTRFS_DEV_ITEMS_OBJECTID 1ULL + ++#define BTRFS_BTREE_INODE_OBJECTID 1 ++ ++#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 ++ + /* + * we can actually store much bigger names, but lets not confuse the rest + * of linux +@@ -670,21 +674,29 @@ struct btrfs_space_info { + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ +- +- /* delalloc accounting */ +- u64 bytes_delalloc; /* number of bytes reserved for allocation, +- this space is not necessarily reserved yet +- by the allocator */ ++ u64 bytes_super; /* total bytes reserved for the super blocks */ ++ u64 bytes_root; /* the number of bytes needed to commit a ++ transaction */ + u64 bytes_may_use; /* number of bytes that may be used for +- delalloc */ ++ delalloc/allocations */ ++ u64 bytes_delalloc; /* number of bytes currently reserved for ++ delayed allocation */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ ++ int force_delalloc; /* make people start doing filemap_flush until ++ we're under a threshold */ + + struct list_head list; + ++ /* for controlling how we free up space for allocations */ ++ wait_queue_head_t allocate_wait; ++ wait_queue_head_t flush_wait; ++ int allocating_chunk; ++ int flushing; ++ + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t 
lock; +@@ -726,6 +738,15 @@ enum btrfs_caching_type { + BTRFS_CACHE_FINISHED = 2, + }; + ++struct btrfs_caching_control { ++ struct list_head list; ++ struct mutex mutex; ++ wait_queue_head_t wait; ++ struct btrfs_block_group_cache *block_group; ++ u64 progress; ++ atomic_t count; ++}; ++ + struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; +@@ -733,6 +754,7 @@ struct btrfs_block_group_cache { + spinlock_t lock; + u64 pinned; + u64 reserved; ++ u64 bytes_super; + u64 flags; + u64 sectorsize; + int extents_thresh; +@@ -742,8 +764,9 @@ struct btrfs_block_group_cache { + int dirty; + + /* cache tracking stuff */ +- wait_queue_head_t caching_q; + int cached; ++ struct btrfs_caching_control *caching_ctl; ++ u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + +@@ -782,13 +805,16 @@ struct btrfs_fs_info { + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; ++ ++ spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + +- struct extent_io_tree pinned_extents; ++ struct extent_io_tree freed_extents[2]; ++ struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; +@@ -822,11 +848,7 @@ struct btrfs_fs_info { + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; +- struct mutex drop_mutex; + struct mutex volume_mutex; +- struct mutex tree_reloc_mutex; +- struct rw_semaphore extent_commit_sem; +- + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make +@@ -835,10 +857,16 @@ struct btrfs_fs_info { + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; ++ struct rw_semaphore extent_commit_sem; ++ ++ struct rw_semaphore subvol_sem; ++ ++ struct srcu_struct subvol_srcu; + + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; ++ struct list_head caching_block_groups; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; +@@ -882,6 +910,7 @@ struct btrfs_fs_info { + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ ++ struct btrfs_workers generic_worker; + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; +@@ -889,6 +918,7 @@ struct btrfs_fs_info { + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; ++ struct btrfs_workers enospc_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. 
It happens +@@ -979,7 +1009,10 @@ struct btrfs_root { + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; ++ unsigned long last_log_commit; + unsigned long log_batch; ++ pid_t log_start_pid; ++ bool log_multiple_pids; + + u64 objectid; + u64 last_trans; +@@ -996,10 +1029,12 @@ struct btrfs_root { + u32 stripesize; + + u32 type; +- u64 highest_inode; +- u64 last_inode_alloc; ++ ++ u64 highest_objectid; + int ref_cows; + int track_dirty; ++ int in_radix; ++ + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; +@@ -1118,6 +1153,7 @@ struct btrfs_root { + #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) + #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) + #define BTRFS_MOUNT_NOSSD (1 << 9) ++#define BTRFS_MOUNT_DISCARD (1 << 10) + + #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) + #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); + int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); + int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin); ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num, int reserved); + int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); + int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, +@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset); + + int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root); + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin); ++ struct btrfs_root *root); + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); + int btrfs_free_block_groups(struct btrfs_fs_info *info); + int btrfs_read_block_groups(struct btrfs_root *root); ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); + int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, +@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); + void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); + void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +-int btrfs_check_metadata_free_space(struct btrfs_root *root); ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items); + int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); + void btrfs_free_reserved_data_space(struct btrfs_root *root, +@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode 
*inode, + u64 bytes); + void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info); + /* ctree.c */ + int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct extent_buffer *parent); + /* root-item.c */ + int btrfs_find_root_ref(struct btrfs_root *tree_root, +- struct btrfs_path *path, +- u64 root_id, u64 ref_id); ++ struct btrfs_path *path, ++ u64 root_id, u64 ref_id); + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, ++ const char *name, int name_len); ++int btrfs_del_root_ref(struct btrfs_trans_handle *trans, ++ struct btrfs_root *tree_root, ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); + int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root); + int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); + /* dir-item.c */ +@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len); + struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + + /* inode-map.c */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, +@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len); + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, +@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); + int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint); + int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); +@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait); + void btrfs_dirty_inode(struct 
inode *inode); + struct inode *btrfs_alloc_inode(struct super_block *sb); + void btrfs_destroy_inode(struct inode *inode); ++void btrfs_drop_inode(struct inode *inode); + int btrfs_init_cachep(void); + void btrfs_destroy_cachep(void); + long btrfs_ioctl_trans_end(struct file *file); +@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); + int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); + void btrfs_orphan_cleanup(struct btrfs_root *root); + int btrfs_cont_expand(struct inode *inode, loff_t size); ++int btrfs_invalidate_inodes(struct btrfs_root *root); ++extern const struct dentry_operations btrfs_dentry_operations; + + /* ioctl.c */ + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +@@ -2290,7 +2348,7 @@ extern struct file_operations btrfs_file_operations; + int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_block); ++ u64 inline_limit, u64 *hint_block, int drop_cache); + int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); + int btrfs_sync_fs(struct super_block *sb, int wait); + + /* acl.c */ +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + int btrfs_check_acl(struct inode *inode, int mask); + #else + #define btrfs_check_acl NULL +diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c +index 1d70236..f3a6075 100644 +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + return btrfs_match_dir_item_name(root, path, name, name_len); + } + ++struct btrfs_dir_item * ++btrfs_search_dir_index_item(struct btrfs_root *root, ++ struct btrfs_path *path, u64 dirid, ++ const char *name, int name_len) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u32 nritems; ++ int ret; ++ ++ key.objectid = dirid; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = 0; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ ++ while (1) { ++ if (path->slots[0] >= nritems) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ if (ret > 0) ++ break; ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ continue; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) ++ break; ++ ++ di = btrfs_match_dir_item_name(root, path, name, name_len); ++ if (di) ++ return di; ++ ++ path->slots[0]++; ++ } ++ return NULL; ++} ++ + struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index e83be2e..d4132aa 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -41,6 +41,7 @@ + + static struct extent_io_ops btree_extent_io_ops; + static void end_workqueue_fn(struct btrfs_work *work); ++static void free_fs_root(struct btrfs_root *root); + + static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); + +@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode, + struct extent_map *em; + int ret; + +- 
spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + goto out; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { +@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; +@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +@@ -828,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf) + int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) + { + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, +- buf->start, buf->start + buf->len - 1); ++ buf->start >> PAGE_CACHE_SHIFT, ++ (buf->start + buf->len - 1) >> ++ PAGE_CACHE_SHIFT); + } + + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, +@@ -895,8 +898,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; +- root->highest_inode = 0; +- root->last_inode_alloc = 0; ++ root->highest_objectid = 0; + root->name = NULL; + root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; +@@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; ++ root->last_log_commit = 0; + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + +@@ -952,14 +955,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); ++ if (ret > 0) ++ return -ENOENT; + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); +- root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); ++ root->commit_root = btrfs_root_node(root); + return 0; + } + +@@ -1085,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; ++ root->last_log_commit = 0; + return 0; + } + +@@ -1095,7 +1101,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; +- u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; +@@ -1110,7 +1115,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + kfree(root); + return ERR_PTR(ret); + } +- goto insert; ++ goto out; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, +@@ -1120,39 +1125,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, 
path, 0, 0); +- if (ret != 0) { +- if (ret > 0) +- ret = -ENOENT; +- goto out; ++ if (ret == 0) { ++ l = path->nodes[0]; ++ read_extent_buffer(l, &root->root_item, ++ btrfs_item_ptr_offset(l, path->slots[0]), ++ sizeof(root->root_item)); ++ memcpy(&root->root_key, location, sizeof(*location)); + } +- l = path->nodes[0]; +- read_extent_buffer(l, &root->root_item, +- btrfs_item_ptr_offset(l, path->slots[0]), +- sizeof(root->root_item)); +- memcpy(&root->root_key, location, sizeof(*location)); +- ret = 0; +-out: +- btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { +- kfree(root); ++ if (ret > 0) ++ ret = -ENOENT; + return ERR_PTR(ret); + } ++ + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); +-insert: +- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { ++out: ++ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + root->ref_cows = 1; +- ret = btrfs_find_highest_inode(root, &highest_inode); +- if (ret == 0) { +- root->highest_inode = highest_inode; +- root->last_inode_alloc = highest_inode; +- } +- } ++ + return root; + } + +@@ -1187,39 +1183,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; +- ++again: ++ spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); + if (root) + return root; + ++ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); ++ if (ret == 0) ++ ret = -ENOENT; ++ if (ret < 0) ++ return ERR_PTR(ret); ++ + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + ++ WARN_ON(btrfs_root_refs(&root->root_item) == 0); + set_anon_super(&root->anon_super, NULL); + ++ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); ++ if (ret) ++ goto fail; ++ ++ spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); ++ if (ret == 0) ++ root->in_radix = 1; ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ radix_tree_preload_end(); + if (ret) { +- free_extent_buffer(root->node); +- kfree(root); +- return ERR_PTR(ret); ++ if (ret == -EEXIST) { ++ free_fs_root(root); ++ goto again; ++ } ++ goto fail; + } +- if (!(fs_info->sb->s_flags & MS_RDONLY)) { +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root->root_key.objectid); +- BUG_ON(ret); ++ ++ ret = btrfs_find_dead_roots(fs_info->tree_root, ++ root->root_key.objectid); ++ WARN_ON(ret); ++ ++ if (!(fs_info->sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(root); +- } ++ + return root; ++fail: ++ free_fs_root(root); ++ return ERR_PTR(ret); + } + + struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) + { ++ return btrfs_read_fs_root_no_name(fs_info, location); ++#if 0 + struct btrfs_root *root; + int ret; + +@@ -1236,7 +1259,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#if 0 ++ + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); +@@ -1244,9 +1267,9 @@ struct btrfs_root 
*btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + kfree(root); + return ERR_PTR(ret); + } +-#endif + root->in_sysfs = 1; + return root; ++#endif + } + + static int btrfs_congested_fn(void *congested_data, int bdi_bits) +@@ -1325,9 +1348,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; +@@ -1359,8 +1382,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) + + err = bdi_register(bdi, NULL, "btrfs-%d", + atomic_inc_return(&btrfs_bdi_num)); +- if (err) ++ if (err) { ++ bdi_destroy(bdi); + return err; ++ } + + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->unplug_io_fn = btrfs_unplug_io_fn; +@@ -1450,9 +1475,12 @@ static int cleaner_kthread(void *arg) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(root); +- mutex_unlock(&root->fs_info->cleaner_mutex); ++ ++ if (!(root->fs_info->sb->s_flags & MS_RDONLY) && ++ mutex_trylock(&root->fs_info->cleaner_mutex)) { ++ btrfs_clean_old_snapshots(root); ++ mutex_unlock(&root->fs_info->cleaner_mutex); ++ } + + if (freezing(current)) { + refrigerator(); +@@ -1557,15 +1585,36 @@ struct btrfs_root *open_ctree(struct super_block *sb, + err = -ENOMEM; + goto fail; + } +- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); ++ ++ ret = init_srcu_struct(&fs_info->subvol_srcu); ++ if (ret) { ++ err = ret; ++ goto fail; ++ } ++ ++ ret = setup_bdi(fs_info, &fs_info->bdi); ++ if (ret) { ++ err = ret; ++ goto fail_srcu; ++ } ++ ++ fs_info->btree_inode = new_inode(sb); ++ if (!fs_info->btree_inode) { ++ err = -ENOMEM; ++ goto fail_bdi; ++ } ++ ++ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); ++ INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); ++ spin_lock_init(&fs_info->fs_roots_radix_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; +@@ -1584,12 +1633,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; +- if (setup_bdi(fs_info, &fs_info->bdi)) +- goto fail_bdi; +- fs_info->btree_inode = new_inode(sb); +- fs_info->btree_inode->i_ino = 1; +- fs_info->btree_inode->i_nlink = 1; +- fs_info->metadata_ratio = 8; ++ fs_info->metadata_ratio = 0; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); +@@ -1600,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + ++ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; ++ fs_info->btree_inode->i_nlink = 1; + /* + * we set the i_size on the btree inode to the max possible int. 
+ * the real end of the address space is determined by all of +@@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + ++ BTRFS_I(fs_info->btree_inode)->root = tree_root; ++ memset(&BTRFS_I(fs_info->btree_inode)->location, 0, ++ sizeof(struct btrfs_key)); ++ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; ++ insert_inode_hash(fs_info->btree_inode); ++ + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + +- extent_io_tree_init(&fs_info->pinned_extents, ++ extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping, GFP_NOFS); ++ extent_io_tree_init(&fs_info->freed_extents[1], ++ fs_info->btree_inode->i_mapping, GFP_NOFS); ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + +- BTRFS_I(fs_info->btree_inode)->root = tree_root; +- memset(&BTRFS_I(fs_info->btree_inode)->location, 0, +- sizeof(struct btrfs_key)); +- insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); +- mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); +- mutex_init(&fs_info->tree_reloc_mutex); + init_rwsem(&fs_info->extent_commit_sem); ++ init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); +@@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb, + goto fail_iput; + } + +- /* +- * we need to start all the end_io workers up front because the +- * queue work function gets called at interrupt time, and so it +- * cannot dynamically grow. 
+- */ ++ btrfs_init_workers(&fs_info->generic_worker, ++ "genwork", 1, NULL); ++ + btrfs_init_workers(&fs_info->workers, "worker", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, +- fs_info->thread_pool_size)); ++ fs_info->thread_pool_size), ++ &fs_info->generic_worker); ++ btrfs_init_workers(&fs_info->enospc_workers, "enospc", ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the +@@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + +- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); ++ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_workers, "endio", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_write_workers, +- "endio-meta-write", fs_info->thread_pool_size); ++ "endio-meta-write", fs_info->thread_pool_size, ++ &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, ++ &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very +@@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + +- fs_info->endio_write_workers.idle_thresh = 64; +- fs_info->endio_meta_write_workers.idle_thresh = 64; ++ fs_info->endio_write_workers.idle_thresh = 2; ++ fs_info->endio_meta_write_workers.idle_thresh = 2; + + btrfs_start_workers(&fs_info->workers, 1); ++ btrfs_start_workers(&fs_info->generic_worker, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); +- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_meta_write_workers, +- fs_info->thread_pool_size); +- btrfs_start_workers(&fs_info->endio_write_workers, +- fs_info->thread_pool_size); ++ btrfs_start_workers(&fs_info->endio_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_workers, 1); ++ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); ++ btrfs_start_workers(&fs_info->endio_write_workers, 1); ++ btrfs_start_workers(&fs_info->enospc_workers, 1); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, +@@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, + } + } + ++ ret = btrfs_find_orphan_roots(tree_root); ++ BUG_ON(ret); ++ + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_recover_relocation(tree_root); + BUG_ON(ret); +@@ -1959,6 +2020,7 @@ fail_chunk_root: + free_extent_buffer(chunk_root->node); + 
free_extent_buffer(chunk_root->commit_root); + fail_sb_buffer: ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -1967,6 +2029,7 @@ fail_sb_buffer: + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ btrfs_stop_workers(&fs_info->enospc_workers); + fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +@@ -1975,6 +2038,8 @@ fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + fail_bdi: + bdi_destroy(&fs_info->bdi); ++fail_srcu: ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + fail: + kfree(extent_root); + kfree(tree_root); +@@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans, + + int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) + { +- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); ++ spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); ++ spin_unlock(&fs_info->fs_roots_radix_lock); ++ ++ if (btrfs_root_refs(&root->root_item) == 0) ++ synchronize_srcu(&fs_info->subvol_srcu); ++ ++ free_fs_root(root); ++ return 0; ++} ++ ++static void free_fs_root(struct btrfs_root *root) ++{ ++ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } +- if (root->node) +- free_extent_buffer(root->node); +- if (root->commit_root) +- free_extent_buffer(root->commit_root); ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); +- return 0; + } + + static int del_fs_roots(struct btrfs_fs_info *fs_info) +@@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) + struct btrfs_root *gang[8]; + int i; + ++ while (!list_empty(&fs_info->dead_roots)) { ++ gang[0] = list_entry(fs_info->dead_roots.next, ++ struct btrfs_root, root_list); ++ list_del(&gang[0]->root_list); ++ ++ if (gang[0]->in_radix) { ++ btrfs_free_fs_root(fs_info, gang[0]); ++ } else { ++ free_extent_buffer(gang[0]->node); ++ free_extent_buffer(gang[0]->commit_root); ++ kfree(gang[0]); ++ } ++ } ++ + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, +@@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) + root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; +- ret = btrfs_find_dead_roots(fs_info->tree_root, +- root_objectid); +- BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + } + root_objectid++; +@@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root) + free_extent_buffer(root->fs_info->csum_root->commit_root); + + btrfs_free_block_groups(root->fs_info); +- btrfs_free_pinned_extents(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + ++ btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); +@@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root) + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); ++ 
btrfs_stop_workers(&fs_info->enospc_workers); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); ++ cleanup_srcu_struct(&fs_info->subvol_srcu); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); +diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c +index 9596b40..ba5c3fd 100644 +--- a/fs/btrfs/export.c ++++ b/fs/btrfs/export.c +@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + +- fid->objectid = BTRFS_I(inode)->location.objectid; ++ fid->objectid = inode->i_ino; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + +@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + } + + static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, +- u64 root_objectid, u32 generation) ++ u64 root_objectid, u32 generation, ++ int check_generation) + { ++ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_root *root; ++ struct dentry *dentry; + struct inode *inode; + struct btrfs_key key; ++ int index; ++ int err = 0; ++ ++ if (objectid < BTRFS_FIRST_FREE_OBJECTID) ++ return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + +- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ index = srcu_read_lock(&fs_info->subvol_srcu); ++ ++ root = btrfs_read_fs_root_no_name(fs_info, &key); ++ if (IS_ERR(root)) { ++ err = PTR_ERR(root); ++ goto fail; ++ } ++ ++ if (btrfs_root_refs(&root->root_item) == 0) { ++ err = -ENOENT; ++ goto fail; ++ } + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root); +- if (IS_ERR(inode)) +- return (void *)inode; ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto fail; ++ } ++ ++ srcu_read_unlock(&fs_info->subvol_srcu, index); + +- if (generation != inode->i_generation) { ++ if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + +- return d_obtain_alias(inode); ++ dentry = d_obtain_alias(inode); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ srcu_read_unlock(&fs_info->subvol_srcu, index); ++ return ERR_PTR(err); + } + + static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, +@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + objectid = fid->parent_objectid; + generation = fid->parent_gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, +@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + root_objectid = fid->root_objectid; + generation = fid->gen; + +- return btrfs_get_dentry(sb, objectid, root_objectid, generation); ++ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + } + + static struct dentry *btrfs_get_parent(struct dentry *child) + { + struct inode *dir = child->d_inode; ++ static struct dentry *dentry; + struct btrfs_root *root = BTRFS_I(dir)->root; +- struct btrfs_key key; + struct btrfs_path *path; + struct 
extent_buffer *leaf; +- int slot; +- u64 objectid; ++ struct btrfs_root_ref *ref; ++ struct btrfs_key key; ++ struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + +- key.objectid = dir->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); +- key.offset = (u64)-1; ++ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_BACKREF_KEY; ++ key.offset = (u64)-1; ++ root = root->fs_info->tree_root; ++ } else { ++ key.objectid = dir->i_ino; ++ key.type = BTRFS_INODE_REF_KEY; ++ key.offset = (u64)-1; ++ } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) { +- /* Error */ +- btrfs_free_path(path); +- return ERR_PTR(ret); ++ if (ret < 0) ++ goto fail; ++ ++ BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = -ENOENT; ++ goto fail; + } ++ ++ path->slots[0]--; + leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (ret) { +- /* btrfs_search_slot() returns the slot where we'd want to +- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. +- The _real_ backref, telling us what the parent inode +- _actually_ is, will be in the slot _before_ the one +- that btrfs_search_slot() returns. */ +- if (!slot) { +- /* Unless there is _no_ key in the tree before... */ +- btrfs_free_path(path); +- return ERR_PTR(-EIO); +- } +- slot--; ++ ++ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); ++ if (found_key.objectid != key.objectid || found_key.type != key.type) { ++ ret = -ENOENT; ++ goto fail; + } + +- btrfs_item_key_to_cpu(leaf, &key, slot); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ key.objectid = btrfs_root_ref_dirid(leaf, ref); ++ } else { ++ key.objectid = found_key.offset; ++ } + btrfs_free_path(path); + +- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) +- return ERR_PTR(-EINVAL); +- +- objectid = key.offset; +- +- /* If we are already at the root of a subvol, return the real root */ +- if (objectid == dir->i_ino) +- return dget(dir->i_sb->s_root); ++ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { ++ return btrfs_get_dentry(root->fs_info->sb, key.objectid, ++ found_key.offset, 0, 0); ++ } + +- /* Build a new key for the inode item */ +- key.objectid = objectid; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; +- +- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &btrfs_dentry_operations; ++ return dentry; ++fail: ++ btrfs_free_path(path); ++ return ERR_PTR(ret); + } + + const struct export_operations btrfs_export_ops = { +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 72a2b9c..c56f916 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -32,12 +32,12 @@ + #include "locking.h" + #include "free-space-cache.h" + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve); + static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct 
btrfs_trans_handle *trans, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); +- + static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); ++static int pin_down_bytes(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, ++ struct extent_buffer **must_clean); ++static int find_next_key(struct btrfs_path *path, int level, ++ struct btrfs_key *key); ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups); + + static noinline int + block_group_cache_done(struct btrfs_block_group_cache *cache) +@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + return ret; + } + +-/* +- * We always set EXTENT_LOCKED for the super mirror extents so we don't +- * overwrite them, so those bits need to be unset. Also, if we are unmounting +- * with pinned extents still sitting there because we had a block group caching, +- * we need to clear those now, since we are done. +- */ +-void btrfs_free_pinned_extents(struct btrfs_fs_info *info) ++static int add_excluded_extent(struct btrfs_root *root, ++ u64 start, u64 num_bytes) + { +- u64 start, end, last = 0; +- int ret; ++ u64 end = start + num_bytes - 1; ++ set_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ set_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ return 0; ++} + +- while (1) { +- ret = find_first_extent_bit(&info->pinned_extents, last, +- &start, &end, +- EXTENT_LOCKED|EXTENT_DIRTY); +- if (ret) +- break; ++static void free_excluded_extents(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) ++{ ++ u64 start, end; + +- clear_extent_bits(&info->pinned_extents, start, end, +- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); +- last = end+1; +- } ++ start = cache->key.objectid; ++ end = start + cache->key.offset - 1; ++ ++ clear_extent_bits(&root->fs_info->freed_extents[0], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); ++ clear_extent_bits(&root->fs_info->freed_extents[1], ++ start, end, EXTENT_UPTODATE, GFP_NOFS); + } + +-static int remove_sb_from_cache(struct btrfs_root *root, +- struct btrfs_block_group_cache *cache) ++static int exclude_super_stripes(struct btrfs_root *root, ++ struct btrfs_block_group_cache *cache) + { +- struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; +@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); ++ + while (nr--) { +- try_lock_extent(&fs_info->pinned_extents, +- logical[nr], +- logical[nr] + stripe_len - 1, GFP_NOFS); ++ cache->bytes_super += stripe_len; ++ ret = add_excluded_extent(root, logical[nr], ++ stripe_len); ++ BUG_ON(ret); + } ++ + kfree(logical); + } +- + return 0; + } + ++static struct btrfs_caching_control * ++get_caching_control(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *ctl; ++ ++ spin_lock(&cache->lock); ++ if (cache->cached != BTRFS_CACHE_STARTED) { ++ spin_unlock(&cache->lock); ++ return NULL; ++ } ++ ++ ctl = cache->caching_ctl; ++ atomic_inc(&ctl->count); ++ spin_unlock(&cache->lock); ++ return ctl; ++} ++ ++static void put_caching_control(struct btrfs_caching_control *ctl) ++{ ++ if (atomic_dec_and_test(&ctl->count)) ++ 
kfree(ctl); ++} ++ + /* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet +@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + int ret; + + while (start < end) { +- ret = find_first_extent_bit(&info->pinned_extents, start, ++ ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, +- EXTENT_DIRTY|EXTENT_LOCKED); ++ EXTENT_DIRTY | EXTENT_UPTODATE); + if (ret) + break; + +@@ -249,22 +283,27 @@ static int caching_kthread(void *data) + { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; +- u64 last = 0; ++ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; ++ struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; +- int ret = 0; +- struct btrfs_key key; + struct extent_buffer *leaf; +- int slot; ++ struct btrfs_key key; + u64 total_found = 0; +- +- BUG_ON(!fs_info); ++ u64 last = 0; ++ u32 nritems; ++ int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +- atomic_inc(&block_group->space_info->caching_threads); ++ exclude_super_stripes(extent_root, block_group); ++ spin_lock(&block_group->space_info->lock); ++ block_group->space_info->bytes_super += block_group->bytes_super; ++ spin_unlock(&block_group->space_info->lock); ++ + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); ++ + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent +@@ -277,74 +316,64 @@ static int caching_kthread(void *data) + + key.objectid = last; + key.offset = 0; +- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); ++ key.type = BTRFS_EXTENT_ITEM_KEY; + again: ++ mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + +- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); ++ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + ++ leaf = path->nodes[0]; ++ nritems = btrfs_header_nritems(leaf); ++ + while (1) { + smp_mb(); +- if (block_group->fs_info->closing > 1) { ++ if (fs_info->closing > 1) { + last = (u64)-1; + break; + } + +- leaf = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(leaf)) { +- ret = btrfs_next_leaf(fs_info->extent_root, path); +- if (ret < 0) +- goto err; +- else if (ret) ++ if (path->slots[0] < nritems) { ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ } else { ++ ret = find_next_key(path, 0, &key); ++ if (ret) + break; + +- if (need_resched() || +- btrfs_transaction_in_commit(fs_info)) { +- leaf = path->nodes[0]; +- +- /* this shouldn't happen, but if the +- * leaf is empty just move on. +- */ +- if (btrfs_header_nritems(leaf) == 0) +- break; +- /* +- * we need to copy the key out so that +- * we are sure the next search advances +- * us forward in the btree. 
+- */ +- btrfs_item_key_to_cpu(leaf, &key, 0); +- btrfs_release_path(fs_info->extent_root, path); +- up_read(&fs_info->extent_commit_sem); ++ caching_ctl->progress = last; ++ btrfs_release_path(extent_root, path); ++ up_read(&fs_info->extent_commit_sem); ++ mutex_unlock(&caching_ctl->mutex); ++ if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); +- goto again; +- } ++ else ++ cond_resched(); ++ goto again; ++ } + ++ if (key.objectid < block_group->key.objectid) { ++ path->slots[0]++; + continue; + } +- btrfs_item_key_to_cpu(leaf, &key, slot); +- if (key.objectid < block_group->key.objectid) +- goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + +- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { ++ if (key.type == BTRFS_EXTENT_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + last = key.objectid + key.offset; +- } + +- if (total_found > (1024 * 1024 * 2)) { +- total_found = 0; +- wake_up(&block_group->caching_q); ++ if (total_found > (1024 * 1024 * 2)) { ++ total_found = 0; ++ wake_up(&caching_ctl->wait); ++ } + } +-next: + path->slots[0]++; + } + ret = 0; +@@ -352,33 +381,65 @@ next: + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); ++ caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); ++ block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + + err: + btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); +- atomic_dec(&block_group->space_info->caching_threads); +- wake_up(&block_group->caching_q); + ++ free_excluded_extents(extent_root, block_group); ++ ++ mutex_unlock(&caching_ctl->mutex); ++ wake_up(&caching_ctl->wait); ++ ++ put_caching_control(caching_ctl); ++ atomic_dec(&block_group->space_info->caching_threads); + return 0; + } + + static int cache_block_group(struct btrfs_block_group_cache *cache) + { ++ struct btrfs_fs_info *fs_info = cache->fs_info; ++ struct btrfs_caching_control *caching_ctl; + struct task_struct *tsk; + int ret = 0; + ++ smp_mb(); ++ if (cache->cached != BTRFS_CACHE_NO) ++ return 0; ++ ++ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); ++ BUG_ON(!caching_ctl); ++ ++ INIT_LIST_HEAD(&caching_ctl->list); ++ mutex_init(&caching_ctl->mutex); ++ init_waitqueue_head(&caching_ctl->wait); ++ caching_ctl->block_group = cache; ++ caching_ctl->progress = cache->key.objectid; ++ /* one for caching kthread, one for caching block group list */ ++ atomic_set(&caching_ctl->count, 2); ++ + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); +- return ret; ++ kfree(caching_ctl); ++ return 0; + } ++ cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + ++ down_write(&fs_info->extent_commit_sem); ++ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); ++ up_write(&fs_info->extent_commit_sem); ++ ++ atomic_inc(&cache->space_info->caching_threads); ++ + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { +@@ -1507,22 +1568,22 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, + return ret; + } + +-#ifdef BIO_RW_DISCARD + static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) + { + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + } +-#endif + + static int btrfs_discard_extent(struct 
btrfs_root *root, u64 bytenr, + u64 num_bytes) + { +-#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + ++ if (!btrfs_test_opt(root, DISCARD)) ++ return 0; ++ + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); +@@ -1542,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + } + + return ret; +-#else +- return 0; +-#endif + } + + int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, +@@ -1656,7 +1714,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, +@@ -1782,7 +1839,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); +- update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, +@@ -1817,16 +1873,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { ++ int mark_free = 0; ++ struct extent_buffer *must_clean = NULL; ++ ++ ret = pin_down_bytes(trans, root, NULL, ++ node->bytenr, node->num_bytes, ++ head->is_data, 1, &must_clean); ++ if (ret > 0) ++ mark_free = 1; ++ ++ if (must_clean) { ++ clean_tree_block(NULL, root, must_clean); ++ btrfs_tree_unlock(must_clean); ++ free_extent_buffer(must_clean); ++ } + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } +- btrfs_update_pinned_extents(root, node->bytenr, +- node->num_bytes, 1); +- update_reserved_extents(root, node->bytenr, +- node->num_bytes, 0); ++ if (mark_free) { ++ ret = btrfs_free_reserved_extent(root, ++ node->bytenr, ++ node->num_bytes); ++ BUG_ON(ret); ++ } + } + mutex_unlock(&head->mutex); + return 0; +@@ -2691,60 +2763,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) + alloc_target); + } + ++static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) ++{ ++ u64 num_bytes; ++ int level; ++ ++ level = BTRFS_MAX_LEVEL - 2; ++ /* ++ * NOTE: these calculations are absolutely the worst possible case. ++ * This assumes that _every_ item we insert will require a new leaf, and ++ * that the tree has grown to its maximum level size. ++ */ ++ ++ /* ++ * for every item we insert we could insert both an extent item and a ++ * extent ref item. Then for ever item we insert, we will need to cow ++ * both the original leaf, plus the leaf to the left and right of it. ++ * ++ * Unless we are talking about the extent root, then we just want the ++ * number of items * 2, since we just need the extent item plus its ref. ++ */ ++ if (root == root->fs_info->extent_root) ++ num_bytes = num_items * 2; ++ else ++ num_bytes = (num_items + (2 * num_items)) * 3; ++ ++ /* ++ * num_bytes is total number of leaves we could need times the leaf ++ * size, and then for every leaf we could end up cow'ing 2 nodes per ++ * level, down to the leaf level. 
++ */ ++ num_bytes = (num_bytes * root->leafsize) + ++ (num_bytes * (level * 2)) * root->nodesize; ++ ++ return num_bytes; ++} ++ + /* +- * for now this just makes sure we have at least 5% of our metadata space free +- * for use. ++ * Unreserve metadata space for delalloc. If we have less reserved credits than ++ * we have extents, this function does nothing. + */ +-int btrfs_check_metadata_free_space(struct btrfs_root *root) ++int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) + { + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; +- u64 alloc_target, thresh; +- int committed = 0, ret; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + +-again: ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++ + spin_lock(&meta_sinfo->lock); +- if (!meta_sinfo->full) +- thresh = meta_sinfo->total_bytes * 80; +- else +- thresh = meta_sinfo->total_bytes * 95; ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ if (BTRFS_I(inode)->reserved_extents <= ++ BTRFS_I(inode)->outstanding_extents) { ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ spin_unlock(&meta_sinfo->lock); ++ return 0; ++ } ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ BTRFS_I(inode)->reserved_extents--; ++ BUG_ON(BTRFS_I(inode)->reserved_extents < 0); ++ ++ if (meta_sinfo->bytes_delalloc < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_delalloc = 0; ++ } else { ++ meta_sinfo->bytes_delalloc -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) ++{ ++ u64 thresh; ++ ++ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use; + ++ thresh = meta_sinfo->total_bytes - thresh; ++ thresh *= 80; + do_div(thresh, 100); ++ if (thresh <= meta_sinfo->bytes_delalloc) ++ meta_sinfo->force_delalloc = 1; ++ else ++ meta_sinfo->force_delalloc = 0; ++} + +- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + +- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { +- struct btrfs_trans_handle *trans; +- if (!meta_sinfo->full) { +- meta_sinfo->force_alloc = 1; +- spin_unlock(&meta_sinfo->lock); ++struct async_flush { ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; ++ struct btrfs_work work; ++}; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) +- return -ENOMEM; ++static noinline void flush_delalloc_async(struct btrfs_work *work) ++{ ++ struct async_flush *async; ++ struct btrfs_root *root; ++ struct btrfs_space_info *info; + +- ret = do_chunk_alloc(trans, root->fs_info->extent_root, +- 2 * 1024 * 1024, alloc_target, 0); +- btrfs_end_transaction(trans, root); ++ async = container_of(work, struct async_flush, work); ++ root = async->root; ++ info = async->info; ++ ++ btrfs_start_delalloc_inodes(root); ++ wake_up(&info->flush_wait); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++ ++ kfree(async); ++} ++ ++static void wait_on_flush(struct btrfs_space_info *info) ++{ ++ DEFINE_WAIT(wait); ++ u64 used; ++ ++ while (1) { ++ 
prepare_to_wait(&info->flush_wait, &wait, ++ TASK_UNINTERRUPTIBLE); ++ spin_lock(&info->lock); ++ if (!info->flushing) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ ++ used = info->bytes_used + info->bytes_reserved + ++ info->bytes_pinned + info->bytes_readonly + ++ info->bytes_super + info->bytes_root + ++ info->bytes_may_use + info->bytes_delalloc; ++ if (used < info->total_bytes) { ++ spin_unlock(&info->lock); ++ break; ++ } ++ spin_unlock(&info->lock); ++ schedule(); ++ } ++ finish_wait(&info->flush_wait, &wait); ++} ++ ++static void flush_delalloc(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct async_flush *async; ++ bool wait = false; ++ ++ spin_lock(&info->lock); ++ ++ if (!info->flushing) { ++ info->flushing = 1; ++ init_waitqueue_head(&info->flush_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_on_flush(info); ++ return; ++ } ++ ++ async = kzalloc(sizeof(*async), GFP_NOFS); ++ if (!async) ++ goto flush; ++ ++ async->root = root; ++ async->info = info; ++ async->work.func = flush_delalloc_async; ++ ++ btrfs_queue_worker(&root->fs_info->enospc_workers, ++ &async->work); ++ wait_on_flush(info); ++ return; ++ ++flush: ++ btrfs_start_delalloc_inodes(root); ++ btrfs_wait_ordered_extents(root, 0); ++ ++ spin_lock(&info->lock); ++ info->flushing = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->flush_wait); ++} ++ ++static int maybe_allocate_chunk(struct btrfs_root *root, ++ struct btrfs_space_info *info) ++{ ++ struct btrfs_super_block *disk_super = &root->fs_info->super_copy; ++ struct btrfs_trans_handle *trans; ++ bool wait = false; ++ int ret = 0; ++ u64 min_metadata; ++ u64 free_space; ++ ++ free_space = btrfs_super_total_bytes(disk_super); ++ /* ++ * we allow the metadata to grow to a max of either 5gb or 5% of the ++ * space in the volume. ++ */ ++ min_metadata = min((u64)5 * 1024 * 1024 * 1024, ++ div64_u64(free_space * 5, 100)); ++ if (info->total_bytes >= min_metadata) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (info->full) { ++ spin_unlock(&info->lock); ++ return 0; ++ } ++ ++ if (!info->allocating_chunk) { ++ info->force_alloc = 1; ++ info->allocating_chunk = 1; ++ init_waitqueue_head(&info->allocate_wait); ++ } else { ++ wait = true; ++ } ++ ++ spin_unlock(&info->lock); ++ ++ if (wait) { ++ wait_event(info->allocate_wait, ++ !info->allocating_chunk); ++ return 1; ++ } ++ ++ trans = btrfs_start_transaction(root, 1); ++ if (!trans) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = do_chunk_alloc(trans, root->fs_info->extent_root, ++ 4096 + 2 * 1024 * 1024, ++ info->flags, 0); ++ btrfs_end_transaction(trans, root); ++ if (ret) ++ goto out; ++out: ++ spin_lock(&info->lock); ++ info->allocating_chunk = 0; ++ spin_unlock(&info->lock); ++ wake_up(&info->allocate_wait); ++ ++ if (ret) ++ return 0; ++ return 1; ++} ++ ++/* ++ * Reserve metadata space for delalloc. 
++ */ ++int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, ++ struct inode *inode, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int flushed = 0; ++ int force_delalloc; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root->fs_info->extent_root, ++ num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ force_delalloc = meta_sinfo->force_delalloc; ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!flushed) ++ meta_sinfo->bytes_delalloc += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ flushed++; ++ ++ if (flushed == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ flushed++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (flushed == 2) { ++ filemap_flush(inode->i_mapping); ++ goto again; ++ } else if (flushed == 3) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_delalloc -= num_bytes; + spin_unlock(&meta_sinfo->lock); ++ printk(KERN_ERR "enospc, has %d, reserved %d\n", ++ BTRFS_I(inode)->outstanding_extents, ++ BTRFS_I(inode)->reserved_extents); ++ dump_space_info(meta_sinfo, 0, 0); ++ return -ENOSPC; ++ } + +- if (!committed) { +- committed = 1; +- trans = btrfs_join_transaction(root, 1); +- if (!trans) +- return -ENOMEM; +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- return ret; ++ BTRFS_I(inode)->reserved_extents++; ++ check_force_delalloc(meta_sinfo); ++ spin_unlock(&meta_sinfo->lock); ++ ++ if (!flushed && force_delalloc) ++ filemap_flush(inode->i_mapping); ++ ++ return 0; ++} ++ ++/* ++ * unreserve num_items number of items worth of metadata space. This needs to ++ * be paired with btrfs_reserve_metadata_space. ++ * ++ * NOTE: if you have the option, run this _AFTER_ you do a ++ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref ++ * oprations which will result in more used metadata, so we want to make sure we ++ * can do that without issue. ++ */ ++int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 alloc_target; ++ bool bug = false; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++ ++ spin_lock(&meta_sinfo->lock); ++ if (meta_sinfo->bytes_may_use < num_bytes) { ++ bug = true; ++ meta_sinfo->bytes_may_use = 0; ++ } else { ++ meta_sinfo->bytes_may_use -= num_bytes; ++ } ++ spin_unlock(&meta_sinfo->lock); ++ ++ BUG_ON(bug); ++ ++ return 0; ++} ++ ++/* ++ * Reserve some metadata space for use. We'll calculate the worste case number ++ * of bytes that would be needed to modify num_items number of items. If we ++ * have space, fantastic, if not, you get -ENOSPC. 
Please call ++ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of ++ * items you reserved, since whatever metadata you needed should have already ++ * been allocated. ++ * ++ * This will commit the transaction to make more space if we don't have enough ++ * metadata space. THe only time we don't do this is if we're reserving space ++ * inside of a transaction, then we will just return -ENOSPC and it is the ++ * callers responsibility to handle it properly. ++ */ ++int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) ++{ ++ struct btrfs_fs_info *info = root->fs_info; ++ struct btrfs_space_info *meta_sinfo; ++ u64 num_bytes; ++ u64 used; ++ u64 alloc_target; ++ int retries = 0; ++ ++ /* get the space info for where the metadata will live */ ++ alloc_target = btrfs_get_alloc_profile(root, 0); ++ meta_sinfo = __find_space_info(info, alloc_target); ++ ++ num_bytes = calculate_bytes_needed(root, num_items); ++again: ++ spin_lock(&meta_sinfo->lock); ++ ++ if (unlikely(!meta_sinfo->bytes_root)) ++ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); ++ ++ if (!retries) ++ meta_sinfo->bytes_may_use += num_bytes; ++ ++ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + ++ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + ++ meta_sinfo->bytes_super + meta_sinfo->bytes_root + ++ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; ++ ++ if (used > meta_sinfo->total_bytes) { ++ retries++; ++ if (retries == 1) { ++ if (maybe_allocate_chunk(root, meta_sinfo)) ++ goto again; ++ retries++; ++ } else { ++ spin_unlock(&meta_sinfo->lock); ++ } ++ ++ if (retries == 2) { ++ flush_delalloc(root, meta_sinfo); + goto again; + } ++ spin_lock(&meta_sinfo->lock); ++ meta_sinfo->bytes_may_use -= num_bytes; ++ spin_unlock(&meta_sinfo->lock); ++ ++ dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } ++ ++ check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + return 0; +@@ -2764,13 +3224,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; ++ if (!data_sinfo) ++ goto alloc; ++ + again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - +- data_sinfo->bytes_may_use < bytes) { ++ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { + struct btrfs_trans_handle *trans; + + /* +@@ -2782,7 +3245,7 @@ again: + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); +- ++alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) +@@ -2794,12 +3257,17 @@ again: + btrfs_end_transaction(trans, root); + if (ret) + return ret; ++ ++ if (!data_sinfo) { ++ btrfs_set_inode_space_info(root, inode); ++ data_sinfo = BTRFS_I(inode)->space_info; ++ } + goto again; + } + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +- if (!committed) { ++ if (!committed && !root->fs_info->open_ioctl_trans) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) +@@ -2827,7 +3295,7 @@ again: + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + +- return btrfs_check_metadata_free_space(root); ++ return 0; + } + + /* +@@ -2926,17 +3394,15 @@ static 
int do_chunk_alloc(struct btrfs_trans_handle *trans, + BUG_ON(!space_info); + + spin_lock(&space_info->lock); +- if (space_info->force_alloc) { ++ if (space_info->force_alloc) + force = 1; +- space_info->force_alloc = 0; +- } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = space_info->total_bytes - space_info->bytes_readonly; +- thresh = div_factor(thresh, 6); ++ thresh = div_factor(thresh, 8); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { +@@ -2950,7 +3416,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ +- if (flags & BTRFS_BLOCK_GROUP_DATA) { ++ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) +@@ -2958,8 +3424,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, + } + + ret = btrfs_alloc_chunk(trans, extent_root, flags); ++ spin_lock(&space_info->lock); + if (ret) + space_info->full = 1; ++ space_info->force_alloc = 0; ++ spin_unlock(&space_info->lock); + out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +@@ -3008,10 +3477,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; ++ btrfs_set_block_group_used(&cache->item, old_val); ++ cache->reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; +- btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { +@@ -3056,127 +3527,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) + return bytenr; + } + +-int btrfs_update_pinned_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int pin) ++/* ++ * this function must be called within transaction ++ */ ++int btrfs_pin_extent(struct btrfs_root *root, ++ u64 bytenr, u64 num_bytes, int reserved) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache; + +- if (pin) +- set_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + num - 1, GFP_NOFS); +- +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); +- if (pin) { +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- cache->pinned += len; +- cache->space_info->bytes_pinned += len; +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- fs_info->total_pinned += len; +- } else { +- int unpin = 0; ++ cache = btrfs_lookup_block_group(fs_info, bytenr); ++ BUG_ON(!cache); + +- /* +- * in order to not race with the block group caching, we +- * only want to unpin the extent if we are cached. If +- * we aren't cached, we want to start async caching this +- * block group so we can free the extent the next time +- * around. 
+- */ +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- unpin = (cache->cached == BTRFS_CACHE_FINISHED); +- if (likely(unpin)) { +- cache->pinned -= len; +- cache->space_info->bytes_pinned -= len; +- fs_info->total_pinned -= len; +- } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned += num_bytes; ++ cache->space_info->bytes_pinned += num_bytes; ++ if (reserved) { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; ++ } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + +- if (likely(unpin)) +- clear_extent_dirty(&fs_info->pinned_extents, +- bytenr, bytenr + len -1, +- GFP_NOFS); +- else +- cache_block_group(cache); ++ btrfs_put_block_group(cache); + +- if (unpin) +- btrfs_add_free_space(cache, bytenr, len); +- } +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; ++ set_extent_dirty(fs_info->pinned_extents, ++ bytenr, bytenr + num_bytes - 1, GFP_NOFS); ++ return 0; ++} ++ ++static int update_reserved_extents(struct btrfs_block_group_cache *cache, ++ u64 num_bytes, int reserve) ++{ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ if (reserve) { ++ cache->reserved += num_bytes; ++ cache->space_info->bytes_reserved += num_bytes; ++ } else { ++ cache->reserved -= num_bytes; ++ cache->space_info->bytes_reserved -= num_bytes; + } ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); + return 0; + } + +-static int update_reserved_extents(struct btrfs_root *root, +- u64 bytenr, u64 num, int reserve) ++int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root) + { +- u64 len; +- struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_caching_control *next; ++ struct btrfs_caching_control *caching_ctl; ++ struct btrfs_block_group_cache *cache; + +- while (num > 0) { +- cache = btrfs_lookup_block_group(fs_info, bytenr); +- BUG_ON(!cache); +- len = min(num, cache->key.offset - +- (bytenr - cache->key.objectid)); ++ down_write(&fs_info->extent_commit_sem); + +- spin_lock(&cache->space_info->lock); +- spin_lock(&cache->lock); +- if (reserve) { +- cache->reserved += len; +- cache->space_info->bytes_reserved += len; ++ list_for_each_entry_safe(caching_ctl, next, ++ &fs_info->caching_block_groups, list) { ++ cache = caching_ctl->block_group; ++ if (block_group_cache_done(cache)) { ++ cache->last_byte_to_unpin = (u64)-1; ++ list_del_init(&caching_ctl->list); ++ put_caching_control(caching_ctl); + } else { +- cache->reserved -= len; +- cache->space_info->bytes_reserved -= len; ++ cache->last_byte_to_unpin = caching_ctl->progress; + } +- spin_unlock(&cache->lock); +- spin_unlock(&cache->space_info->lock); +- btrfs_put_block_group(cache); +- bytenr += len; +- num -= len; + } ++ ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ fs_info->pinned_extents = &fs_info->freed_extents[1]; ++ else ++ fs_info->pinned_extents = &fs_info->freed_extents[0]; ++ ++ up_write(&fs_info->extent_commit_sem); + return 0; + } + +-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) ++static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) + { +- u64 last = 0; +- u64 start; +- u64 end; +- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; +- int ret; ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct btrfs_block_group_cache *cache = 
NULL; ++ u64 len; + +- while (1) { +- ret = find_first_extent_bit(pinned_extents, last, +- &start, &end, EXTENT_DIRTY); +- if (ret) +- break; ++ while (start <= end) { ++ if (!cache || ++ start >= cache->key.objectid + cache->key.offset) { ++ if (cache) ++ btrfs_put_block_group(cache); ++ cache = btrfs_lookup_block_group(fs_info, start); ++ BUG_ON(!cache); ++ } + +- set_extent_dirty(copy, start, end, GFP_NOFS); +- last = end + 1; ++ len = cache->key.objectid + cache->key.offset - start; ++ len = min(len, end + 1 - start); ++ ++ if (start < cache->last_byte_to_unpin) { ++ len = min(len, cache->last_byte_to_unpin - start); ++ btrfs_add_free_space(cache, start, len); ++ } ++ ++ spin_lock(&cache->space_info->lock); ++ spin_lock(&cache->lock); ++ cache->pinned -= len; ++ cache->space_info->bytes_pinned -= len; ++ spin_unlock(&cache->lock); ++ spin_unlock(&cache->space_info->lock); ++ ++ start += len; + } ++ ++ if (cache) ++ btrfs_put_block_group(cache); + return 0; + } + + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct extent_io_tree *unpin) ++ struct btrfs_root *root) + { ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + ++ if (fs_info->pinned_extents == &fs_info->freed_extents[0]) ++ unpin = &fs_info->freed_extents[1]; ++ else ++ unpin = &fs_info->freed_extents[0]; ++ + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); +@@ -3185,10 +3665,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + + ret = btrfs_discard_extent(root, start, end + 1 - start); + +- /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); +- ++ unpin_extent_range(root, start, end); + cond_resched(); + } + +@@ -3198,7 +3676,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- u64 bytenr, u64 num_bytes, int is_data, ++ u64 bytenr, u64 num_bytes, ++ int is_data, int reserved, + struct extent_buffer **must_clean) + { + int err = 0; +@@ -3207,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + if (is_data) + goto pinit; + ++ /* ++ * discard is sloooow, and so triggering discards on ++ * individual btree blocks isn't a good plan. Just ++ * pin everything in discard mode. 
++ */ ++ if (btrfs_test_opt(root, DISCARD)) ++ goto pinit; ++ + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; +@@ -3230,15 +3717,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, + } + free_extent_buffer(buf); + pinit: +- btrfs_set_path_blocking(path); ++ if (path) ++ btrfs_set_path_blocking(path); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); ++ btrfs_pin_extent(root, bytenr, num_bytes, reserved); + + BUG_ON(err < 0); + return 0; + } + +- + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, +@@ -3412,7 +3899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + } + + ret = pin_down_bytes(trans, root, path, bytenr, +- num_bytes, is_data, &must_clean); ++ num_bytes, is_data, 0, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); +@@ -3543,8 +4030,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ +- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); +- update_reserved_extents(root, bytenr, num_bytes, 0); ++ btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, +@@ -3584,19 +4070,33 @@ static noinline int + wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) + { ++ struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + +- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); +- +- if (block_group_cache_done(cache)) { +- finish_wait(&cache->caching_q, &wait); ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) + return 0; +- } +- schedule(); +- finish_wait(&cache->caching_q, &wait); + +- wait_event(cache->caching_q, block_group_cache_done(cache) || ++ wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); ++ ++ put_caching_control(caching_ctl); ++ return 0; ++} ++ ++static noinline int ++wait_block_group_cache_done(struct btrfs_block_group_cache *cache) ++{ ++ struct btrfs_caching_control *caching_ctl; ++ DEFINE_WAIT(wait); ++ ++ caching_ctl = get_caching_control(cache); ++ if (!caching_ctl) ++ return 0; ++ ++ wait_event(caching_ctl->wait, block_group_cache_done(cache)); ++ ++ put_caching_control(caching_ctl); + return 0; + } + +@@ -3634,6 +4134,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, + int last_ptr_loop = 0; + int loop = 0; + bool found_uncached_bg = false; ++ bool failed_cluster_refill = false; ++ bool failed_alloc = false; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); +@@ -3731,7 +4233,16 @@ have_block_group: + if (unlikely(block_group->ro)) + goto loop; + +- if (last_ptr) { ++ /* ++ * Ok we want to try and use the cluster allocator, so lets look ++ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will ++ * have tried the cluster allocator plenty of times at this ++ * point and not have found anything, so we are likely way too ++ * fragmented for the clustering stuff to find anything, so lets ++ * just skip it and let the allocator find whatever block it can ++ * find ++ */ ++ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + /* + * the refill lock keeps out other + * people trying to start a new cluster +@@ -3806,9 +4317,11 @@ 
refill_cluster: + spin_unlock(&last_ptr->refill_lock); + goto checks; + } +- } else if (!cached && loop > LOOP_CACHING_NOWAIT) { ++ } else if (!cached && loop > LOOP_CACHING_NOWAIT ++ && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + ++ failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; +@@ -3820,25 +4333,30 @@ refill_cluster: + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ +- if (loop < LOOP_NO_EMPTY_SIZE) { +- btrfs_return_cluster_to_free_space(NULL, +- last_ptr); +- spin_unlock(&last_ptr->refill_lock); +- goto loop; +- } ++ btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); ++ goto loop; + } + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); +- if (!offset && (cached || (!cached && +- loop == LOOP_CACHING_NOWAIT))) { +- goto loop; +- } else if (!offset && (!cached && +- loop > LOOP_CACHING_NOWAIT)) { ++ /* ++ * If we didn't find a chunk, and we haven't failed on this ++ * block group before, and this block group is in the middle of ++ * caching and we are ok with waiting, then go ahead and wait ++ * for progress to be made, and set failed_alloc to true. ++ * ++ * If failed_alloc is true then we've already waited on this ++ * block group once and should move on to the next block group. ++ */ ++ if (!offset && !failed_alloc && !cached && ++ loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, +- num_bytes + empty_size); ++ num_bytes + empty_size); ++ failed_alloc = true; + goto have_block_group; ++ } else if (!offset) { ++ goto loop; + } + checks: + search_start = stripe_align(root, offset); +@@ -3880,9 +4398,13 @@ checks: + search_start - offset); + BUG_ON(offset > search_start); + ++ update_reserved_extents(block_group, num_bytes, 1); ++ + /* we are all good, lets return */ + break; + loop: ++ failed_cluster_refill = false; ++ failed_alloc = false; + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); +@@ -3940,21 +4462,32 @@ loop: + return ret; + } + +-static void dump_space_info(struct btrfs_space_info *info, u64 bytes) ++static void dump_space_info(struct btrfs_space_info *info, u64 bytes, ++ int dump_block_groups) + { + struct btrfs_block_group_cache *cache; + ++ spin_lock(&info->lock); + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - +- info->bytes_pinned - info->bytes_reserved), ++ info->bytes_pinned - info->bytes_reserved - ++ info->bytes_super), + (info->full) ? 
"" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," +- " may_use=%llu, used=%llu\n", ++ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" ++ "\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, +- (unsigned long long)info->bytes_used); ++ (unsigned long long)info->bytes_used, ++ (unsigned long long)info->bytes_root, ++ (unsigned long long)info->bytes_super, ++ (unsigned long long)info->bytes_reserved); ++ spin_unlock(&info->lock); ++ ++ if (!dump_block_groups) ++ return; + + down_read(&info->groups_sem); + list_for_each_entry(cache, &info->block_groups, list) { +@@ -3972,12 +4505,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) + up_read(&info->groups_sem); + } + +-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) ++int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ u64 num_bytes, u64 min_alloc_size, ++ u64 empty_size, u64 hint_byte, ++ u64 search_end, struct btrfs_key *ins, ++ u64 data) + { + int ret; + u64 search_start = 0; +@@ -4022,7 +4555,7 @@ again: + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); +- dump_space_info(sinfo, num_bytes); ++ dump_space_info(sinfo, num_bytes, 1); + } + + return ret; +@@ -4043,25 +4576,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); ++ update_reserved_extents(cache, len, 0); + btrfs_put_block_group(cache); +- update_reserved_extents(root, start, len, 0); +- +- return ret; +-} +- +-int btrfs_reserve_extent(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 num_bytes, u64 min_alloc_size, +- u64 empty_size, u64 hint_byte, +- u64 search_end, struct btrfs_key *ins, +- u64 data) +-{ +- int ret; +- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, +- empty_size, hint_byte, search_end, ins, +- data); +- if (!ret) +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + + return ret; + } +@@ -4222,15 +4738,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + { + int ret; + struct btrfs_block_group_cache *block_group; ++ struct btrfs_caching_control *caching_ctl; ++ u64 start = ins->objectid; ++ u64 num_bytes = ins->offset; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(block_group); +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ caching_ctl = get_caching_control(block_group); + +- ret = btrfs_remove_free_space(block_group, ins->objectid, +- ins->offset); +- BUG_ON(ret); ++ if (!caching_ctl) { ++ BUG_ON(!block_group_cache_done(block_group)); ++ ret = btrfs_remove_free_space(block_group, start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ mutex_lock(&caching_ctl->mutex); ++ ++ if (start >= caching_ctl->progress) { ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } else if (start + num_bytes <= caching_ctl->progress) { ++ ret = btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ } else { ++ num_bytes = caching_ctl->progress - start; ++ ret = 
btrfs_remove_free_space(block_group, ++ start, num_bytes); ++ BUG_ON(ret); ++ ++ start = caching_ctl->progress; ++ num_bytes = ins->objectid + ins->offset - ++ caching_ctl->progress; ++ ret = add_excluded_extent(root, start, num_bytes); ++ BUG_ON(ret); ++ } ++ ++ mutex_unlock(&caching_ctl->mutex); ++ put_caching_control(caching_ctl); ++ } ++ ++ update_reserved_extents(block_group, ins->offset, 1); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); +@@ -4254,9 +4801,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + int ret; + u64 flags = 0; + +- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, +- empty_size, hint_byte, search_end, +- ins, 0); ++ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, ++ empty_size, hint_byte, search_end, ++ ins, 0); + if (ret) + return ret; + +@@ -4267,7 +4814,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, + } else + BUG_ON(parent > 0); + +- update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); +@@ -4346,452 +4892,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + return buf; + } + +-#if 0 +-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, struct extent_buffer *leaf) +-{ +- u64 disk_bytenr; +- u64 num_bytes; +- struct btrfs_key key; +- struct btrfs_file_extent_item *fi; +- u32 nritems; +- int i; +- int ret; +- +- BUG_ON(!btrfs_is_leaf(leaf)); +- nritems = btrfs_header_nritems(leaf); +- +- for (i = 0; i < nritems; i++) { +- cond_resched(); +- btrfs_item_key_to_cpu(leaf, &key, i); +- +- /* only extents have references, skip everything else */ +- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) +- continue; +- +- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); +- +- /* inline extents live in the btree, they don't have refs */ +- if (btrfs_file_extent_type(leaf, fi) == +- BTRFS_FILE_EXTENT_INLINE) +- continue; +- +- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); +- +- /* holes don't have refs */ +- if (disk_bytenr == 0) +- continue; +- +- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); +- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, +- leaf->start, 0, key.objectid, 0); +- BUG_ON(ret); +- } +- return 0; +-} +- +-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_leaf_ref *ref) +-{ +- int i; +- int ret; +- struct btrfs_extent_info *info; +- struct refsort *sorted; +- +- if (ref->nritems == 0) +- return 0; +- +- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); +- for (i = 0; i < ref->nritems; i++) { +- sorted[i].bytenr = ref->extents[i].bytenr; +- sorted[i].slot = i; +- } +- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); +- +- /* +- * the items in the ref were sorted when the ref was inserted +- * into the ref cache, so this is already in order +- */ +- for (i = 0; i < ref->nritems; i++) { +- info = ref->extents + sorted[i].slot; +- ret = btrfs_free_extent(trans, root, info->bytenr, +- info->num_bytes, ref->bytenr, +- ref->owner, ref->generation, +- info->objectid, 0); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- BUG_ON(ret); +- info++; +- } +- +- kfree(sorted); +- return 0; 
+-} +- +- +-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, u64 start, +- u64 len, u32 *refs) +-{ +- int ret; +- +- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); +- BUG_ON(ret); +- +-#if 0 /* some debugging code in case we see problems here */ +- /* if the refs count is one, it won't get increased again. But +- * if the ref count is > 1, someone may be decreasing it at +- * the same time we are. +- */ +- if (*refs != 1) { +- struct extent_buffer *eb = NULL; +- eb = btrfs_find_create_tree_block(root, start, len); +- if (eb) +- btrfs_tree_lock(eb); +- +- mutex_lock(&root->fs_info->alloc_mutex); +- ret = lookup_extent_ref(NULL, root, start, len, refs); +- BUG_ON(ret); +- mutex_unlock(&root->fs_info->alloc_mutex); +- +- if (eb) { +- btrfs_tree_unlock(eb); +- free_extent_buffer(eb); +- } +- if (*refs == 1) { +- printk(KERN_ERR "btrfs block %llu went down to one " +- "during drop_snap\n", (unsigned long long)start); +- } +- +- } +-#endif +- +- cond_resched(); +- return ret; +-} ++struct walk_control { ++ u64 refs[BTRFS_MAX_LEVEL]; ++ u64 flags[BTRFS_MAX_LEVEL]; ++ struct btrfs_key update_progress; ++ int stage; ++ int level; ++ int shared_level; ++ int update_ref; ++ int keep_locks; ++ int reada_slot; ++ int reada_count; ++}; + ++#define DROP_REFERENCE 1 ++#define UPDATE_BACKREF 2 + +-/* +- * this is used while deleting old snapshots, and it drops the refs +- * on a whole subtree starting from a level 1 node. +- * +- * The idea is to sort all the leaf pointers, and then drop the +- * ref on all the leaves in order. Most of the time the leaves +- * will have ref cache entries, so no leaf IOs will be required to +- * find the extents they have references on. +- * +- * For each leaf, any references it has are also dropped in order +- * +- * This ends up dropping the references in something close to optimal +- * order for reading and modifying the extent allocation tree. 
+- */ +-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path) ++static noinline void reada_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct walk_control *wc, ++ struct btrfs_path *path) + { + u64 bytenr; +- u64 root_owner; +- u64 root_gen; +- struct extent_buffer *eb = path->nodes[1]; +- struct extent_buffer *leaf; +- struct btrfs_leaf_ref *ref; +- struct refsort *sorted = NULL; +- int nritems = btrfs_header_nritems(eb); ++ u64 generation; ++ u64 refs; ++ u64 flags; ++ u64 last = 0; ++ u32 nritems; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *eb; + int ret; +- int i; +- int refi = 0; +- int slot = path->slots[1]; +- u32 blocksize = btrfs_level_size(root, 0); +- u32 refs; +- +- if (nritems == 0) +- goto out; +- +- root_owner = btrfs_header_owner(eb); +- root_gen = btrfs_header_generation(eb); +- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); ++ int slot; ++ int nread = 0; + +- /* +- * step one, sort all the leaf pointers so we don't scribble +- * randomly into the extent allocation tree +- */ +- for (i = slot; i < nritems; i++) { +- sorted[refi].bytenr = btrfs_node_blockptr(eb, i); +- sorted[refi].slot = i; +- refi++; ++ if (path->slots[wc->level] < wc->reada_slot) { ++ wc->reada_count = wc->reada_count * 2 / 3; ++ wc->reada_count = max(wc->reada_count, 2); ++ } else { ++ wc->reada_count = wc->reada_count * 3 / 2; ++ wc->reada_count = min_t(int, wc->reada_count, ++ BTRFS_NODEPTRS_PER_BLOCK(root)); + } + +- /* +- * nritems won't be zero, but if we're picking up drop_snapshot +- * after a crash, slot might be > 0, so double check things +- * just in case. +- */ +- if (refi == 0) +- goto out; ++ eb = path->nodes[wc->level]; ++ nritems = btrfs_header_nritems(eb); ++ blocksize = btrfs_level_size(root, wc->level - 1); + +- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); ++ for (slot = path->slots[wc->level]; slot < nritems; slot++) { ++ if (nread >= wc->reada_count) ++ break; + +- /* +- * the first loop frees everything the leaves point to +- */ +- for (i = 0; i < refi; i++) { +- u64 ptr_gen; ++ cond_resched(); ++ bytenr = btrfs_node_blockptr(eb, slot); ++ generation = btrfs_node_ptr_generation(eb, slot); + +- bytenr = sorted[i].bytenr; ++ if (slot == path->slots[wc->level]) ++ goto reada; + +- /* +- * check the reference count on this leaf. If it is > 1 +- * we just decrement it below and don't update any +- * of the refs the leaf points to. +- */ +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- if (refs != 1) ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) + continue; + +- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); +- +- /* +- * the leaf only had one reference, which means the +- * only thing pointing to this leaf is the snapshot +- * we're deleting. It isn't possible for the reference +- * count to increase again later +- * +- * The reference cache is checked for the leaf, +- * and if found we'll be able to drop any refs held by +- * the leaf without needing to read it in. 
+- */ +- ref = btrfs_lookup_leaf_ref(root, bytenr); +- if (ref && ref->generation != ptr_gen) { +- btrfs_free_leaf_ref(root, ref); +- ref = NULL; +- } +- if (ref) { +- ret = cache_drop_leaf_ref(trans, root, ref); +- BUG_ON(ret); +- btrfs_remove_leaf_ref(root, ref); +- btrfs_free_leaf_ref(root, ref); +- } else { +- /* +- * the leaf wasn't in the reference cache, so +- * we have to read it. +- */ +- leaf = read_tree_block(root, bytenr, blocksize, +- ptr_gen); +- ret = btrfs_drop_leaf_ref(trans, root, leaf); +- BUG_ON(ret); +- free_extent_buffer(leaf); +- } +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +- +- /* +- * run through the loop again to free the refs on the leaves. +- * This is faster than doing it in the loop above because +- * the leaves are likely to be clustered together. We end up +- * working in nice chunks on the extent allocation tree. +- */ +- for (i = 0; i < refi; i++) { +- bytenr = sorted[i].bytenr; +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, eb->start, +- root_owner, root_gen, 0, 1); ++ /* We don't lock the tree block, it's OK to be racy here */ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &refs, &flags); + BUG_ON(ret); ++ BUG_ON(refs == 0); + +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- } +-out: +- kfree(sorted); +- +- /* +- * update the path to show we've processed the entire level 1 +- * node. This will get saved into the root's drop_snapshot_progress +- * field so these drops are not repeated again if this transaction +- * commits. +- */ +- path->slots[1] = nritems; +- return 0; +-} +- +-/* +- * helper function for drop_snapshot, this walks down the tree dropping ref +- * counts as it goes. +- */ +-static noinline int walk_down_tree(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path, int *level) +-{ +- u64 root_owner; +- u64 root_gen; +- u64 bytenr; +- u64 ptr_gen; +- struct extent_buffer *next; +- struct extent_buffer *cur; +- struct extent_buffer *parent; +- u32 blocksize; +- int ret; +- u32 refs; +- +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, +- path->nodes[*level]->len, &refs); +- BUG_ON(ret); +- if (refs > 1) +- goto out; +- +- /* +- * walk down to the last node level and free all the leaves +- */ +- while (*level >= 0) { +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- cur = path->nodes[*level]; +- +- if (btrfs_header_level(cur) != *level) +- WARN_ON(1); +- +- if (path->slots[*level] >= +- btrfs_header_nritems(cur)) +- break; ++ if (wc->stage == DROP_REFERENCE) { ++ if (refs == 1) ++ goto reada; + +- /* the new code goes down to level 1 and does all the +- * leaves pointed to that node in bulk. So, this check +- * for level 0 will always be false. +- * +- * But, the disk format allows the drop_snapshot_progress +- * field in the root to leave things in a state where +- * a leaf will need cleaning up here. If someone crashes +- * with the old code and then boots with the new code, +- * we might find a leaf here. 
+- */ +- if (*level == 0) { +- ret = btrfs_drop_leaf_ref(trans, root, cur); +- BUG_ON(ret); +- break; ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ continue; ++ btrfs_node_key_to_cpu(eb, &key, slot); ++ ret = btrfs_comp_cpu_keys(&key, ++ &wc->update_progress); ++ if (ret < 0) ++ continue; ++ } else { ++ if (wc->level == 1 && ++ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ continue; + } +- +- /* +- * once we get to level one, process the whole node +- * at once, including everything below it. +- */ +- if (*level == 1) { +- ret = drop_level_one_refs(trans, root, path); +- BUG_ON(ret); ++reada: ++ ret = readahead_tree_block(root, bytenr, blocksize, ++ generation); ++ if (ret) + break; +- } +- +- bytenr = btrfs_node_blockptr(cur, path->slots[*level]); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); +- blocksize = btrfs_level_size(root, *level - 1); +- +- ret = drop_snap_lookup_refcount(trans, root, bytenr, +- blocksize, &refs); +- BUG_ON(ret); +- +- /* +- * if there is more than one reference, we don't need +- * to read that node to drop any references it has. We +- * just drop the ref we hold on that node and move on to the +- * next slot in this level. +- */ +- if (refs != 1) { +- parent = path->nodes[*level]; +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- path->slots[*level]++; +- +- ret = btrfs_free_extent(trans, root, bytenr, +- blocksize, parent->start, +- root_owner, root_gen, +- *level - 1, 1); +- BUG_ON(ret); +- +- atomic_inc(&root->fs_info->throttle_gen); +- wake_up(&root->fs_info->transaction_throttle); +- cond_resched(); +- +- continue; +- } +- +- /* +- * we need to keep freeing things in the next level down. +- * read the block and loop around to process it +- */ +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- WARN_ON(*level <= 0); +- if (path->nodes[*level-1]) +- free_extent_buffer(path->nodes[*level-1]); +- path->nodes[*level-1] = next; +- *level = btrfs_header_level(next); +- path->slots[*level] = 0; +- cond_resched(); ++ last = bytenr + blocksize; ++ nread++; + } +-out: +- WARN_ON(*level < 0); +- WARN_ON(*level >= BTRFS_MAX_LEVEL); +- +- if (path->nodes[*level] == root->node) { +- parent = path->nodes[*level]; +- bytenr = path->nodes[*level]->start; +- } else { +- parent = path->nodes[*level + 1]; +- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); +- } +- +- blocksize = btrfs_level_size(root, *level); +- root_owner = btrfs_header_owner(parent); +- root_gen = btrfs_header_generation(parent); +- +- /* +- * cleanup and free the reference on the last node +- * we processed +- */ +- ret = btrfs_free_extent(trans, root, bytenr, blocksize, +- parent->start, root_owner, root_gen, +- *level, 1); +- free_extent_buffer(path->nodes[*level]); +- path->nodes[*level] = NULL; +- +- *level += 1; +- BUG_ON(ret); +- +- cond_resched(); +- return 0; ++ wc->reada_slot = slot; + } +-#endif +- +-struct walk_control { +- u64 refs[BTRFS_MAX_LEVEL]; +- u64 flags[BTRFS_MAX_LEVEL]; +- struct btrfs_key update_progress; +- int stage; +- int level; +- int shared_level; +- int update_ref; +- int keep_locks; +-}; +- +-#define DROP_REFERENCE 1 +-#define UPDATE_BACKREF 2 + + /* + * hepler to process tree block while walking down the tree. + * +- * when wc->stage == DROP_REFERENCE, this function checks +- * reference count of the block. 
if the block is shared and +- * we need update back refs for the subtree rooted at the +- * block, this function changes wc->stage to UPDATE_BACKREF +- * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * +@@ -4800,11 +5002,10 @@ struct walk_control { + static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, +- struct walk_control *wc) ++ struct walk_control *wc, int lookup_info) + { + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; +- struct btrfs_key key; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + +@@ -4816,8 +5017,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ +- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || +- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { ++ if (lookup_info && ++ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || ++ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, +@@ -4827,21 +5029,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + BUG_ON(wc->refs[level] == 0); + } + +- if (wc->stage == DROP_REFERENCE && +- wc->update_ref && wc->refs[level] > 1) { +- BUG_ON(eb == root->node); +- BUG_ON(path->slots[level] > 0); +- if (level == 0) +- btrfs_item_key_to_cpu(eb, &key, path->slots[level]); +- else +- btrfs_node_key_to_cpu(eb, &key, path->slots[level]); +- if (btrfs_header_owner(eb) == root->root_key.objectid && +- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { +- wc->stage = UPDATE_BACKREF; +- wc->shared_level = level; +- } +- } +- + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; +@@ -4878,6 +5065,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + } + + /* ++ * hepler to process tree block pointer. ++ * ++ * when wc->stage == DROP_REFERENCE, this function checks ++ * reference count of the block pointed to. if the block ++ * is shared and we need update back refs for the subtree ++ * rooted at the block, this function changes wc->stage to ++ * UPDATE_BACKREF. if the block is shared and there is no ++ * need to update back, this function drops the reference ++ * to the block. ++ * ++ * NOTE: return value 1 means we should stop walking down. 
++ */ ++static noinline int do_walk_down(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct walk_control *wc, int *lookup_info) ++{ ++ u64 bytenr; ++ u64 generation; ++ u64 parent; ++ u32 blocksize; ++ struct btrfs_key key; ++ struct extent_buffer *next; ++ int level = wc->level; ++ int reada = 0; ++ int ret = 0; ++ ++ generation = btrfs_node_ptr_generation(path->nodes[level], ++ path->slots[level]); ++ /* ++ * if the lower level block was created before the snapshot ++ * was created, we know there is no need to update back refs ++ * for the subtree ++ */ ++ if (wc->stage == UPDATE_BACKREF && ++ generation <= root->root_key.offset) { ++ *lookup_info = 1; ++ return 1; ++ } ++ ++ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); ++ blocksize = btrfs_level_size(root, level - 1); ++ ++ next = btrfs_find_tree_block(root, bytenr, blocksize); ++ if (!next) { ++ next = btrfs_find_create_tree_block(root, bytenr, blocksize); ++ reada = 1; ++ } ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ ++ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, ++ &wc->refs[level - 1], ++ &wc->flags[level - 1]); ++ BUG_ON(ret); ++ BUG_ON(wc->refs[level - 1] == 0); ++ *lookup_info = 0; ++ ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->refs[level - 1] > 1) { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ ++ if (!wc->update_ref || ++ generation <= root->root_key.offset) ++ goto skip; ++ ++ btrfs_node_key_to_cpu(path->nodes[level], &key, ++ path->slots[level]); ++ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); ++ if (ret < 0) ++ goto skip; ++ ++ wc->stage = UPDATE_BACKREF; ++ wc->shared_level = level - 1; ++ } ++ } else { ++ if (level == 1 && ++ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) ++ goto skip; ++ } ++ ++ if (!btrfs_buffer_uptodate(next, generation)) { ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ next = NULL; ++ *lookup_info = 1; ++ } ++ ++ if (!next) { ++ if (reada && level == 1) ++ reada_walk_down(trans, root, wc, path); ++ next = read_tree_block(root, bytenr, blocksize, generation); ++ btrfs_tree_lock(next); ++ btrfs_set_lock_blocking(next); ++ } ++ ++ level--; ++ BUG_ON(level != btrfs_header_level(next)); ++ path->nodes[level] = next; ++ path->slots[level] = 0; ++ path->locks[level] = 1; ++ wc->level = level; ++ if (wc->level == 1) ++ wc->reada_slot = 0; ++ return 0; ++skip: ++ wc->refs[level - 1] = 0; ++ wc->flags[level - 1] = 0; ++ if (wc->stage == DROP_REFERENCE) { ++ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ++ parent = path->nodes[level]->start; ++ } else { ++ BUG_ON(root->root_key.objectid != ++ btrfs_header_owner(path->nodes[level])); ++ parent = 0; ++ } ++ ++ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, ++ root->root_key.objectid, level - 1, 0); ++ BUG_ON(ret); ++ } ++ btrfs_tree_unlock(next); ++ free_extent_buffer(next); ++ *lookup_info = 1; ++ return 1; ++} ++ ++/* + * hepler to process tree block while walking up the tree. 
+ * + * when wc->stage == DROP_REFERENCE, this function drops +@@ -4904,7 +5221,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + if (level < wc->shared_level) + goto out; + +- BUG_ON(wc->refs[level] <= 1); + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; +@@ -4935,8 +5251,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + path->locks[level] = 0; + return 1; + } +- } else { +- BUG_ON(level != 0); + } + } + +@@ -4989,39 +5303,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct walk_control *wc) + { +- struct extent_buffer *next; +- struct extent_buffer *cur; +- u64 bytenr; +- u64 ptr_gen; +- u32 blocksize; + int level = wc->level; ++ int lookup_info = 1; + int ret; + + while (level >= 0) { +- cur = path->nodes[level]; +- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); ++ if (path->slots[level] >= ++ btrfs_header_nritems(path->nodes[level])) ++ break; + +- ret = walk_down_proc(trans, root, path, wc); ++ ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + +- bytenr = btrfs_node_blockptr(cur, path->slots[level]); +- blocksize = btrfs_level_size(root, level - 1); +- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); +- +- next = read_tree_block(root, bytenr, blocksize, ptr_gen); +- btrfs_tree_lock(next); +- btrfs_set_lock_blocking(next); +- +- level--; +- BUG_ON(level != btrfs_header_level(next)); +- path->nodes[level] = next; +- path->slots[level] = 0; +- path->locks[level] = 1; +- wc->level = level; ++ ret = do_walk_down(trans, root, path, wc, &lookup_info); ++ if (ret > 0) { ++ path->slots[level]++; ++ continue; ++ } ++ level = wc->level; + } + return 0; + } +@@ -5111,9 +5414,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + err = ret; + goto out; + } +- btrfs_node_key_to_cpu(path->nodes[level], &key, +- path->slots[level]); +- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); ++ WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this +@@ -5148,6 +5449,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + ret = walk_down_tree(trans, root, path, wc); +@@ -5200,9 +5502,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + +- free_extent_buffer(root->node); +- free_extent_buffer(root->commit_root); +- kfree(root); ++ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { ++ ret = btrfs_find_last_root(tree_root, root->root_key.objectid, ++ NULL, NULL); ++ BUG_ON(ret < 0); ++ if (ret > 0) { ++ ret = btrfs_del_orphan_item(trans, tree_root, ++ root->root_key.objectid); ++ BUG_ON(ret); ++ } ++ } ++ ++ if (root->in_radix) { ++ btrfs_free_fs_root(tree_root->fs_info, root); ++ } else { ++ free_extent_buffer(root->node); ++ free_extent_buffer(root->commit_root); ++ kfree(root); ++ } + out: + btrfs_end_transaction(trans, tree_root); + kfree(wc); +@@ -5254,6 +5571,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; ++ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); +@@ -5396,9 +5714,9 @@ static noinline int 
relocate_data_extent(struct inode *reloc_inode, + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -6841,287 +7159,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + return 0; + } + +-#if 0 +-static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) +-{ +- struct btrfs_path *path; +- struct btrfs_inode_item *item; +- struct extent_buffer *leaf; +- int ret; +- +- path = btrfs_alloc_path(); +- if (!path) +- return -ENOMEM; +- +- path->leave_spinning = 1; +- ret = btrfs_insert_empty_inode(trans, root, path, objectid); +- if (ret) +- goto out; +- +- leaf = path->nodes[0]; +- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); +- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); +- btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); +- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); +- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); +- btrfs_mark_buffer_dirty(leaf); +- btrfs_release_path(root, path); +-out: +- btrfs_free_path(path); +- return ret; +-} +- +-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, +- struct btrfs_block_group_cache *group) ++/* ++ * checks to see if its even possible to relocate this block group. ++ * ++ * @return - -1 if it's not a good idea to relocate this block group, 0 if its ++ * ok to go ahead and try. ++ */ ++int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) + { +- struct inode *inode = NULL; +- struct btrfs_trans_handle *trans; +- struct btrfs_root *root; +- struct btrfs_key root_key; +- u64 objectid = BTRFS_FIRST_FREE_OBJECTID; +- int err = 0; ++ struct btrfs_block_group_cache *block_group; ++ struct btrfs_space_info *space_info; ++ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; ++ struct btrfs_device *device; ++ int full = 0; ++ int ret = 0; + +- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; +- root_key.type = BTRFS_ROOT_ITEM_KEY; +- root_key.offset = (u64)-1; +- root = btrfs_read_fs_root_no_name(fs_info, &root_key); +- if (IS_ERR(root)) +- return ERR_CAST(root); ++ block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + +- trans = btrfs_start_transaction(root, 1); +- BUG_ON(!trans); ++ /* odd, couldn't find the block group, leave it alone */ ++ if (!block_group) ++ return -1; + +- err = btrfs_find_free_objectid(trans, root, objectid, &objectid); +- if (err) ++ /* no bytes used, we're good */ ++ if (!btrfs_block_group_used(&block_group->item)) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); +- BUG_ON(err); +- +- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); +- if (inode->i_state & I_NEW) { +- BTRFS_I(inode)->root = root; +- BTRFS_I(inode)->location.objectid = objectid; +- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; +- BTRFS_I(inode)->location.offset = 0; +- btrfs_read_locked_inode(inode); +- unlock_new_inode(inode); +- BUG_ON(is_bad_inode(inode)); +- } else { +- BUG_ON(1); +- } +- BTRFS_I(inode)->index_cnt = group->key.objectid; +- +- err = 
btrfs_orphan_add(trans, inode); +-out: +- btrfs_end_transaction(trans, root); +- if (err) { +- if (inode) +- iput(inode); +- inode = ERR_PTR(err); +- } +- return inode; +-} +- +-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +-{ +- +- struct btrfs_ordered_sum *sums; +- struct btrfs_sector_sum *sector_sum; +- struct btrfs_ordered_extent *ordered; +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct list_head list; +- size_t offset; +- int ret; +- u64 disk_bytenr; +- +- INIT_LIST_HEAD(&list); +- +- ordered = btrfs_lookup_ordered_extent(inode, file_pos); +- BUG_ON(ordered->file_offset != file_pos || ordered->len != len); +- +- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; +- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, +- disk_bytenr + len - 1, &list); +- +- while (!list_empty(&list)) { +- sums = list_entry(list.next, struct btrfs_ordered_sum, list); +- list_del_init(&sums->list); +- +- sector_sum = sums->sums; +- sums->bytenr = ordered->start; ++ space_info = block_group->space_info; ++ spin_lock(&space_info->lock); + +- offset = 0; +- while (offset < sums->len) { +- sector_sum->bytenr += ordered->start - disk_bytenr; +- sector_sum++; +- offset += root->sectorsize; +- } ++ full = space_info->full; + +- btrfs_add_ordered_sum(inode, ordered, sums); ++ /* ++ * if this is the last block group we have in this space, we can't ++ * relocate it unless we're able to allocate a new chunk below. ++ * ++ * Otherwise, we need to make sure we have room in the space to handle ++ * all of the extents from this block group. If we can, we're good ++ */ ++ if ((space_info->total_bytes != block_group->key.offset) && ++ (space_info->bytes_used + space_info->bytes_reserved + ++ space_info->bytes_pinned + space_info->bytes_readonly + ++ btrfs_block_group_used(&block_group->item) < ++ space_info->total_bytes)) { ++ spin_unlock(&space_info->lock); ++ goto out; + } +- btrfs_put_ordered_extent(ordered); +- return 0; +-} +- +-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +-{ +- struct btrfs_trans_handle *trans; +- struct btrfs_path *path; +- struct btrfs_fs_info *info = root->fs_info; +- struct extent_buffer *leaf; +- struct inode *reloc_inode; +- struct btrfs_block_group_cache *block_group; +- struct btrfs_key key; +- u64 skipped; +- u64 cur_byte; +- u64 total_found; +- u32 nritems; +- int ret; +- int progress; +- int pass = 0; +- +- root = root->fs_info->extent_root; +- +- block_group = btrfs_lookup_block_group(info, group_start); +- BUG_ON(!block_group); +- +- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", +- (unsigned long long)block_group->key.objectid, +- (unsigned long long)block_group->flags); +- +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- +- reloc_inode = create_reloc_inode(info, block_group); +- BUG_ON(IS_ERR(reloc_inode)); +- +- __alloc_chunk_for_shrink(root, block_group, 1); +- set_block_group_readonly(block_group); +- +- btrfs_start_delalloc_inodes(info->tree_root); +- btrfs_wait_ordered_extents(info->tree_root, 0); +-again: +- skipped = 0; +- total_found = 0; +- progress = 0; +- key.objectid = block_group->key.objectid; +- key.offset = 0; +- key.type = 0; +- cur_byte = key.objectid; +- +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ spin_unlock(&space_info->lock); + +- mutex_lock(&root->fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(info->tree_root); +- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); +- 
mutex_unlock(&root->fs_info->cleaner_mutex); ++ /* ++ * ok we don't have enough space, but maybe we have free space on our ++ * devices to allocate new chunks for relocation, so loop through our ++ * alloc devices and guess if we have enough space. However, if we ++ * were marked as full, then we know there aren't enough chunks, and we ++ * can just return. ++ */ ++ ret = -1; ++ if (full) ++ goto out; + +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); ++ mutex_lock(&root->fs_info->chunk_mutex); ++ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { ++ u64 min_free = btrfs_block_group_used(&block_group->item); ++ u64 dev_offset, max_avail; + +- while (1) { +- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- if (ret < 0) +- goto out; +-next: +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- if (path->slots[0] >= nritems) { +- ret = btrfs_next_leaf(root, path); +- if (ret < 0) +- goto out; +- if (ret == 1) { +- ret = 0; ++ /* ++ * check to make sure we can actually find a chunk with enough ++ * space to fit our block group in. ++ */ ++ if (device->total_bytes > device->bytes_used + min_free) { ++ ret = find_free_dev_extent(NULL, device, min_free, ++ &dev_offset, &max_avail); ++ if (!ret) + break; +- } +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- } +- +- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +- +- if (key.objectid >= block_group->key.objectid + +- block_group->key.offset) +- break; +- +- if (progress && need_resched()) { +- btrfs_release_path(root, path); +- cond_resched(); +- progress = 0; +- continue; +- } +- progress = 1; +- +- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || +- key.objectid + key.offset <= cur_byte) { +- path->slots[0]++; +- goto next; ++ ret = -1; + } +- +- total_found++; +- cur_byte = key.objectid + key.offset; +- btrfs_release_path(root, path); +- +- __alloc_chunk_for_shrink(root, block_group, 0); +- ret = relocate_one_extent(root, path, &key, block_group, +- reloc_inode, pass); +- BUG_ON(ret < 0); +- if (ret > 0) +- skipped++; +- +- key.objectid = cur_byte; +- key.type = 0; +- key.offset = 0; + } +- +- btrfs_release_path(root, path); +- +- if (pass == 0) { +- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); +- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); +- } +- +- if (total_found > 0) { +- printk(KERN_INFO "btrfs found %llu extents in pass %d\n", +- (unsigned long long)total_found, pass); +- pass++; +- if (total_found == skipped && pass > 2) { +- iput(reloc_inode); +- reloc_inode = create_reloc_inode(info, block_group); +- pass = 0; +- } +- goto again; +- } +- +- /* delete reloc_inode */ +- iput(reloc_inode); +- +- /* unpin extents in this range */ +- trans = btrfs_start_transaction(info->tree_root, 1); +- btrfs_commit_transaction(trans, info->tree_root); +- +- spin_lock(&block_group->lock); +- WARN_ON(block_group->pinned > 0); +- WARN_ON(block_group->reserved > 0); +- WARN_ON(btrfs_block_group_used(&block_group->item) > 0); +- spin_unlock(&block_group->lock); +- btrfs_put_block_group(block_group); +- ret = 0; ++ mutex_unlock(&root->fs_info->chunk_mutex); + out: +- btrfs_free_path(path); ++ btrfs_put_block_group(block_group); + return ret; + } +-#endif + + static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +@@ -7164,8 +7281,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + { + struct btrfs_block_group_cache *block_group; + struct 
btrfs_space_info *space_info; ++ struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + ++ down_write(&info->extent_commit_sem); ++ while (!list_empty(&info->caching_block_groups)) { ++ caching_ctl = list_entry(info->caching_block_groups.next, ++ struct btrfs_caching_control, list); ++ list_del(&caching_ctl->list); ++ put_caching_control(caching_ctl); ++ } ++ up_write(&info->extent_commit_sem); ++ + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, +@@ -7179,8 +7306,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +@@ -7250,7 +7376,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); + cache->fs_info = info; +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7272,8 +7397,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) + cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + +- remove_sb_from_cache(root, cache); +- + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't +@@ -7282,13 +7405,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; ++ free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { ++ exclude_super_stripes(root, cache); ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); ++ free_excluded_extents(root, cache); + } + + ret = update_space_info(info, cache->flags, found_key.offset, +@@ -7296,6 +7425,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) + &space_info); + BUG_ON(ret); + cache->space_info = space_info; ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups); + up_write(&space_info->groups_sem); +@@ -7345,7 +7478,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); +- init_waitqueue_head(&cache->caching_q); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + +@@ -7354,15 +7486,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + ++ cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; +- remove_sb_from_cache(root, cache); ++ exclude_super_stripes(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ++ free_excluded_extents(root, cache); ++ + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + 
&cache->space_info); + BUG_ON(ret); ++ ++ spin_lock(&cache->space_info->lock); ++ cache->space_info->bytes_super += cache->bytes_super; ++ spin_unlock(&cache->space_info->lock); ++ + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); +@@ -7428,8 +7568,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) +- wait_event(block_group->caching_q, +- block_group_cache_done(block_group)); ++ wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 6826018..96577e8 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + return NULL; + } + ++static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, ++ struct extent_state *other) ++{ ++ if (tree->ops && tree->ops->merge_extent_hook) ++ tree->ops->merge_extent_hook(tree->mapping->host, new, ++ other); ++} ++ + /* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single +@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); +@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { ++ merge_cb(tree, state, other); + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); ++ state = NULL; + } + } ++ + return 0; + } + +-static void set_state_cb(struct extent_io_tree *tree, ++static int set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { + if (tree->ops && tree->ops->set_bit_hook) { +- tree->ops->set_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); ++ return tree->ops->set_bit_hook(tree->mapping->host, ++ state->start, state->end, ++ state->state, bits); + } ++ ++ return 0; + } + + static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) + { +- if (tree->ops && tree->ops->clear_bit_hook) { +- tree->ops->clear_bit_hook(tree->mapping->host, state->start, +- state->end, state->state, bits); +- } ++ if (tree->ops && tree->ops->clear_bit_hook) ++ tree->ops->clear_bit_hook(tree->mapping->host, state, bits); + } + + /* +@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, + int bits) + { + struct rb_node *node; ++ int ret; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", +@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree, + (unsigned long long)start); + WARN_ON(1); + } ++ state->start = start; ++ state->end = end; ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; +- set_state_cb(tree, state, bits); + state->state |= bits; +- state->start = start; +- state->end = end; + node = 
tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; +@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, + return 0; + } + ++static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, ++ u64 split) ++{ ++ if (tree->ops && tree->ops->split_extent_hook) ++ return tree->ops->split_extent_hook(tree->mapping->host, ++ orig, split); ++ return 0; ++} ++ + /* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an +@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) + { + struct rb_node *node; ++ ++ split_cb(tree, orig, split); ++ + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; +@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) + { +- int ret = state->state & bits; ++ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; ++ int ret = state->state & bits_to_clear; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; +@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); +- state->state &= ~bits; ++ state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { +@@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree, + * bits were already set, or zero if none of the bits were already set. + */ + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask) ++ int bits, int wake, int delete, ++ struct extent_state **cached_state, ++ gfp_t mask) + { + struct extent_state *state; ++ struct extent_state *cached; + struct extent_state *prealloc = NULL; ++ struct rb_node *next_node; + struct rb_node *node; + u64 last_end; + int err; +@@ -488,6 +522,17 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state) { ++ cached = *cached_state; ++ *cached_state = NULL; ++ cached_state = NULL; ++ if (cached && cached->tree && cached->start == start) { ++ atomic_dec(&cached->refs); ++ state = cached; ++ goto hit_next; ++ } ++ free_extent_state(cached); ++ } + /* + * this search will find the extents that end after + * our range starts +@@ -496,6 +541,7 @@ again: + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); +@@ -526,13 +572,11 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set |= clear_state_bit(tree, state, bits, +- wake, delete); ++ set |= clear_state_bit(tree, state, bits, wake, ++ delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -547,19 +591,30 @@ again: + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); +- + if (wake) + wake_up(&state->wq); +- set |= clear_state_bit(tree, prealloc, bits, +- wake, delete); ++ ++ set |= clear_state_bit(tree, prealloc, bits, wake, delete); ++ + prealloc = NULL; + goto out; + } + ++ if (state->end < end && prealloc && !need_resched()) ++ next_node = rb_next(&state->rb_node); ++ else ++ next_node = NULL; ++ + set |= 
clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; ++ if (start <= end && next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } + goto search_again; + + out: +@@ -641,40 +696,59 @@ out: + return 0; + } + +-static void set_state_bits(struct extent_io_tree *tree, ++static int set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) + { ++ int ret; ++ ++ ret = set_state_cb(tree, state, bits); ++ if (ret) ++ return ret; ++ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } +- set_state_cb(tree, state, bits); + state->state |= bits; ++ ++ return 0; ++} ++ ++static void cache_state(struct extent_state *state, ++ struct extent_state **cached_ptr) ++{ ++ if (cached_ptr && !(*cached_ptr)) { ++ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { ++ *cached_ptr = state; ++ atomic_inc(&state->refs); ++ } ++ } + } + + /* +- * set some bits on a range in the tree. This may require allocations +- * or sleeping, so the gfp mask is used to indicate what is allowed. ++ * set some bits on a range in the tree. This may require allocations or ++ * sleeping, so the gfp mask is used to indicate what is allowed. + * +- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the +- * range already has the desired bits set. The start of the existing +- * range is returned in failed_start in this case. ++ * If any of the exclusive bits are set, this will fail with -EEXIST if some ++ * part of the range already has the desired bits set. The start of the ++ * existing range is returned in failed_start in this case. + * +- * [start, end] is inclusive +- * This takes the tree lock. ++ * [start, end] is inclusive This takes the tree lock. + */ ++ + static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int exclusive, u64 *failed_start, ++ int bits, int exclusive_bits, u64 *failed_start, ++ struct extent_state **cached_state, + gfp_t mask) + { + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; +- int set; + u64 last_start; + u64 last_end; ++ + again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); +@@ -683,6 +757,13 @@ again: + } + + spin_lock(&tree->lock); ++ if (cached_state && *cached_state) { ++ state = *cached_state; ++ if (state->start == start && state->tree) { ++ node = &state->rb_node; ++ goto hit_next; ++ } ++ } + /* + * this search will find all the extents that end after + * our range starts. 
+@@ -694,8 +775,8 @@ again: + BUG_ON(err == -EEXIST); + goto out; + } +- + state = rb_entry(node, struct extent_state, rb_node); ++hit_next: + last_start = state->start; + last_end = state->end; + +@@ -706,17 +787,32 @@ again: + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { +- set = state->state & bits; +- if (set && exclusive) { ++ struct rb_node *next_node; ++ if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } +- set_state_bits(tree, state, bits); ++ ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; ++ + start = last_end + 1; ++ if (start < end && prealloc && !need_resched()) { ++ next_node = rb_next(node); ++ if (next_node) { ++ state = rb_entry(next_node, struct extent_state, ++ rb_node); ++ if (state->start == start) ++ goto hit_next; ++ } ++ } + goto search_again; + } + +@@ -737,8 +833,7 @@ again: + * desired bit on it. + */ + if (state->start < start) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -749,13 +844,14 @@ again: + if (err) + goto out; + if (state->end <= end) { +- set_state_bits(tree, state, bits); ++ err = set_state_bits(tree, state, bits); ++ if (err) ++ goto out; ++ cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; +- } else { +- start = state->start; + } + goto search_again; + } +@@ -774,10 +870,13 @@ again: + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); +- prealloc = NULL; + BUG_ON(err == -EEXIST); +- if (err) ++ if (err) { ++ prealloc = NULL; + goto out; ++ } ++ cache_state(prealloc, cached_state); ++ prealloc = NULL; + start = this_end + 1; + goto search_again; + } +@@ -788,8 +887,7 @@ again: + * on the first half + */ + if (state->start <= end && state->end > end) { +- set = state->state & bits; +- if (exclusive && set) { ++ if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; +@@ -797,7 +895,12 @@ again: + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + +- set_state_bits(tree, prealloc, bits); ++ err = set_state_bits(tree, prealloc, bits); ++ if (err) { ++ prealloc = NULL; ++ goto out; ++ } ++ cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; +@@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, +- mask); +-} +- +-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); ++ NULL, mask); + } + + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { + return set_extent_bit(tree, start, end, bits, 0, NULL, +- mask); ++ NULL, mask); + } + + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, bits, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + } + + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, +- EXTENT_DELALLOC | 
EXTENT_DIRTY, +- 0, NULL, mask); ++ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, ++ 0, NULL, NULL, mask); + } + + int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return clear_extent_bit(tree, start, end, +- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +-} +- +-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); ++ EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, 0, 0, ++ NULL, mask); + } + + int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, ++ NULL, mask); + } + + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, +- mask); ++ NULL, mask); + } + + static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +-} +- +-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, +- gfp_t mask) +-{ +- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, +- 0, NULL, mask); +-} +- +-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, +- u64 end, gfp_t mask) +-{ +- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, ++ NULL, mask); + } + + int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +@@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. 
+ */ +-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached_state, gfp_t mask) + { + int err; + u64 failed_start; + while (1) { +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, ++ EXTENT_LOCKED, &failed_start, ++ cached_state, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; +@@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) + return err; + } + ++int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) ++{ ++ return lock_extent_bits(tree, start, end, 0, NULL, mask); ++} ++ + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { + int err; + u64 failed_start; + +- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, +- &failed_start, mask); ++ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, ++ &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, +- EXTENT_LOCKED, 1, 0, mask); ++ EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; + } + ++int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, ++ struct extent_state **cached, gfp_t mask) ++{ ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, ++ mask); ++} ++ + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) + { +- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); ++ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, ++ mask); + } + + /* +@@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) + page_cache_release(page); + index++; + } +- set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; + } + +@@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, + u64 delalloc_start; + u64 delalloc_end; + u64 found; ++ struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +@@ -1269,6 +1365,7 @@ again: + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ ++ free_extent_state(cached_state); + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; +@@ -1282,18 +1379,21 @@ again: + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ +- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ lock_extent_bits(tree, delalloc_start, delalloc_end, ++ 0, &cached_state, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, +- EXTENT_DELALLOC, 1); ++ EXTENT_DELALLOC, 1, cached_state); + if (!ret) { +- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); ++ unlock_extent_cached(tree, delalloc_start, delalloc_end, ++ &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } ++ free_extent_state(cached_state); + 
*start = delalloc_start; + *end = delalloc_end; + out_failed: +@@ -1303,11 +1403,7 @@ out_failed: + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_pages, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback) ++ unsigned long op) + { + int ret; + struct page *pages[16]; +@@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + int i; + int clear_bits = 0; + +- if (clear_unlock) ++ if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_bits |= EXTENT_DIRTY; + +- if (clear_delalloc) ++ if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + +- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); +- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) ++ if (op & EXTENT_CLEAR_ACCOUNTING) ++ clear_bits |= EXTENT_DO_ACCOUNTING; ++ ++ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); ++ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | ++ EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { +@@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { ++ ++ if (op & EXTENT_SET_PRIVATE2) ++ SetPagePrivate2(pages[i]); ++ + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } +- if (clear_dirty) ++ if (op & EXTENT_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); +- if (set_writeback) ++ if (op & EXTENT_SET_WRITEBACK) + set_page_writeback(pages[i]); +- if (end_writeback) ++ if (op & EXTENT_END_WRITEBACK) + end_page_writeback(pages[i]); +- if (unlock_pages) ++ if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } +@@ -1476,14 +1581,17 @@ out: + * range is found set. 
+ */ + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled) ++ int bits, int filled, struct extent_state *cached) + { + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); +- node = tree_search(tree, start); ++ if (cached && cached->tree && cached->start == start) ++ node = &cached->rb_node; ++ else ++ node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + +@@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + bitset = 0; + break; + } ++ ++ if (state->end == (u64)-1) ++ break; ++ + start = state->end + 1; + if (start > end) + break; +@@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) ++ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); + return 0; + } +@@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree, + { + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) ++ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) + unlock_page(page); + return 0; + } +@@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree, + static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) + { +- u64 start = (u64)page->index << PAGE_CACHE_SHIFT; +- u64 end = start + PAGE_CACHE_SIZE - 1; +- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) +- end_page_writeback(page); ++ end_page_writeback(page); + return 0; + } + +@@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) + } + + if (!uptodate) { +- clear_extent_uptodate(tree, start, end, GFP_ATOMIC); ++ clear_extent_uptodate(tree, start, end, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + +- clear_extent_writeback(tree, start, end, GFP_ATOMIC); +- + if (whole_page) + end_page_writeback(page); + else +@@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, + continue; + } + /* the get_extent function already copied into the page */ +- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { ++ if (test_range_bit(tree, cur, cur_end, ++ EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; +@@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + u64 iosize; + u64 unlock_start; + sector_t sector; ++ struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + int ret; +@@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { ++ u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. 
+@@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); ++ /* ++ * delalloc_end is already one less than the total ++ * length, so we don't subtract one from ++ * PAGE_CACHE_SIZE ++ */ ++ delalloc_to_write += (delalloc_end - delalloc_start + ++ PAGE_CACHE_SIZE) >> ++ PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } ++ if (wbc->nr_to_write < delalloc_to_write) { ++ int thresh = 8192; ++ ++ if (delalloc_to_write < thresh * 2) ++ thresh = delalloc_to_write; ++ wbc->nr_to_write = min_t(u64, delalloc_to_write, ++ thresh); ++ } + + /* did the fill delalloc function already unlock and start + * the IO? +@@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done_unlocked; + } + } +- lock_extent(tree, start, page_end, GFP_NOFS); +- +- unlock_start = start; +- + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { +- unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); + unlock_page(page); +@@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; +- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) +- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); +- + if (last_byte <= start) { +- clear_extent_dirty(tree, start, page_end, GFP_NOFS); +- unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); +@@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + goto done; + } + +- set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { +- clear_extent_dirty(tree, cur, page_end, GFP_NOFS); +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); +@@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { +- clear_extent_dirty(tree, cur, +- cur + iosize - 1, GFP_NOFS); +- +- unlock_extent(tree, unlock_start, cur + iosize - 1, +- GFP_NOFS); +- + /* + * end_io notification does not happen here for + * compressed extents +@@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, +- EXTENT_DIRTY, 0)) { ++ EXTENT_DIRTY, 0, NULL)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + +- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); +@@ -2309,12 +2415,12 @@ done: + set_page_writeback(page); + end_page_writeback(page); + } +- if (unlock_start <= page_end) +- unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + + done_unlocked: + ++ /* drop our reference on any cached states 
*/ ++ free_extent_state(cached_state); + return 0; + } + +@@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) + { +- struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; ++ int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; +@@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, + scanned = 1; + } + retry: +- while (!done && (index <= end) && ++ while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { +@@ -2412,12 +2518,15 @@ retry: + unlock_page(page); + ret = 0; + } +- if (ret || wbc->nr_to_write <= 0) +- done = 1; +- if (wbc->nonblocking && bdi_write_congested(bdi)) { +- wbc->encountered_congestion = 1; ++ if (ret) + done = 1; +- } ++ ++ /* ++ * the filesystem may choose to bump up nr_to_write. ++ * We have to make sure to honor the new nr_to_write ++ * at any time ++ */ ++ nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); +@@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, + return 0; + + lock_extent(tree, start, end, GFP_NOFS); +- wait_on_extent_writeback(tree, start, end); ++ wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, +- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, +- 1, 1, GFP_NOFS); ++ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, ++ 1, 1, NULL, GFP_NOFS); + return 0; + } + +@@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; +@@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree, + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, +- EXTENT_LOCKED, 0, NULL, GFP_NOFS); ++ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, +@@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map, + int ret = 1; + + if (test_range_bit(tree, start, end, +- EXTENT_IOBITS | EXTENT_ORDERED, 0)) ++ EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; +- clear_extent_bit(tree, start, end, EXTENT_UPTODATE, +- 1, 1, mask); ++ /* ++ * at this point we can safely clear everything except the ++ * locked bit and the nodatasum bit ++ */ ++ clear_extent_bit(tree, start, end, ++ ~(EXTENT_LOCKED | EXTENT_NODATASUM), ++ 0, 0, NULL, mask); + } + return ret; + } +@@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map, + u64 len; + while (start <= end) { + len = end - start + 1; +- spin_lock(&map->lock); ++ write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, +- EXTENT_LOCKED | EXTENT_WRITEBACK | +- EXTENT_ORDERED, +- 0)) { ++ 
EXTENT_LOCKED | EXTENT_WRITEBACK, ++ 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); +- spin_unlock(&map->lock); ++ write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); +@@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int uptodate; + unsigned long index; + +- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); ++ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + while (start <= end) { +@@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1); ++ EXTENT_UPTODATE, 1, NULL); + if (ret) + return ret; + +@@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, +- EXTENT_UPTODATE, 1)) { ++ EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + +diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h +index 5bc20ab..36de250 100644 +--- a/fs/btrfs/extent_io.h ++++ b/fs/btrfs/extent_io.h +@@ -13,10 +13,9 @@ + #define EXTENT_DEFRAG (1 << 6) + #define EXTENT_DEFRAG_DONE (1 << 7) + #define EXTENT_BUFFER_FILLED (1 << 8) +-#define EXTENT_ORDERED (1 << 9) +-#define EXTENT_ORDERED_METADATA (1 << 10) +-#define EXTENT_BOUNDARY (1 << 11) +-#define EXTENT_NODATASUM (1 << 12) ++#define EXTENT_BOUNDARY (1 << 9) ++#define EXTENT_NODATASUM (1 << 10) ++#define EXTENT_DO_ACCOUNTING (1 << 11) + #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + + /* flags for bio submission */ +@@ -27,6 +26,16 @@ + #define EXTENT_BUFFER_BLOCKING 1 + #define EXTENT_BUFFER_DIRTY 2 + ++/* these are flags for extent_clear_unlock_delalloc */ ++#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 ++#define EXTENT_CLEAR_UNLOCK 0x2 ++#define EXTENT_CLEAR_DELALLOC 0x4 ++#define EXTENT_CLEAR_DIRTY 0x8 ++#define EXTENT_SET_WRITEBACK 0x10 ++#define EXTENT_END_WRITEBACK 0x20 ++#define EXTENT_SET_PRIVATE2 0x40 ++#define EXTENT_CLEAR_ACCOUNTING 0x80 ++ + /* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. 
+@@ -62,8 +71,13 @@ struct extent_io_ops { + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); +- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits); ++ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, ++ unsigned long bits); ++ int (*merge_extent_hook)(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other); ++ int (*split_extent_hook)(struct inode *inode, ++ struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page); + }; + +@@ -81,10 +95,14 @@ struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; ++ ++ /* ADD NEW ELEMENTS AFTER THIS */ + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; ++ u64 split_start; ++ u64 split_end; + + /* for use by the FS */ + u64 private; +@@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); + int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); ++int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, ++ int bits, struct extent_state **cached, gfp_t mask); + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +@@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree, + u64 max_bytes, unsigned long bits); + + int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int filled); ++ int bits, int filled, struct extent_state *cached_state); + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +- int bits, int wake, int delete, gfp_t mask); ++ int bits, int wake, int delete, struct extent_state **cached, ++ gfp_t mask); + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); + int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, +@@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, + int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, +- int unlock_page, +- int clear_unlock, +- int clear_delalloc, int clear_dirty, +- int set_writeback, +- int end_writeback); ++ unsigned long op); + #endif +diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c +index 30c9365..2c726b7 100644 +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -36,7 +36,7 @@ void extent_map_exit(void) + void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) + { + tree->map.rb_node = NULL; +- spin_lock_init(&tree->lock); ++ rwlock_init(&tree->lock); + } + + /** +@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) + return 0; + } + ++int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) ++{ ++ int ret = 0; ++ struct extent_map *merge = NULL; ++ struct rb_node *rb; ++ struct extent_map *em; ++ ++ write_lock(&tree->lock); ++ em = lookup_extent_mapping(tree, start, len); ++ ++ WARN_ON(em->start != start || !em); ++ ++ if (!em) ++ goto out; ++ ++ clear_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ if (em->start != 0) { ++ rb = rb_prev(&em->rb_node); ++ if (rb) ++ 
merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(merge, em)) { ++ em->start = merge->start; ++ em->len += merge->len; ++ em->block_len += merge->block_len; ++ em->block_start = merge->block_start; ++ merge->in_tree = 0; ++ rb_erase(&merge->rb_node, &tree->map); ++ free_extent_map(merge); ++ } ++ } ++ ++ rb = rb_next(&em->rb_node); ++ if (rb) ++ merge = rb_entry(rb, struct extent_map, rb_node); ++ if (rb && mergable_maps(em, merge)) { ++ em->len += merge->len; ++ em->block_len += merge->len; ++ rb_erase(&merge->rb_node, &tree->map); ++ merge->in_tree = 0; ++ free_extent_map(merge); ++ } ++ ++ free_extent_map(em); ++out: ++ write_unlock(&tree->lock); ++ return ret; ++ ++} ++ + /** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in +@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree, + ret = -EEXIST; + goto out; + } +- assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; +@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + struct rb_node *next = NULL; + u64 end = range_end(start, len); + +- assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); +@@ -319,6 +367,54 @@ out: + } + + /** ++ * search_extent_mapping - find a nearby extent map ++ * @tree: tree to lookup in ++ * @start: byte offset to start the search ++ * @len: length of the lookup range ++ * ++ * Find and return the first extent_map struct in @tree that intersects the ++ * [start, len] range. ++ * ++ * If one can't be found, any nearby extent may be returned ++ */ ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len) ++{ ++ struct extent_map *em; ++ struct rb_node *rb_node; ++ struct rb_node *prev = NULL; ++ struct rb_node *next = NULL; ++ ++ rb_node = __tree_search(&tree->map, start, &prev, &next); ++ if (!rb_node && prev) { ++ em = rb_entry(prev, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node && next) { ++ em = rb_entry(next, struct extent_map, rb_node); ++ goto found; ++ } ++ if (!rb_node) { ++ em = NULL; ++ goto out; ++ } ++ if (IS_ERR(rb_node)) { ++ em = ERR_PTR(PTR_ERR(rb_node)); ++ goto out; ++ } ++ em = rb_entry(rb_node, struct extent_map, rb_node); ++ goto found; ++ ++ em = NULL; ++ goto out; ++ ++found: ++ atomic_inc(&em->refs); ++out: ++ return em; ++} ++ ++/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed +@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); +- assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h +index fb6eeef..ab6d74b 100644 +--- a/fs/btrfs/extent_map.h ++++ b/fs/btrfs/extent_map.h +@@ -31,7 +31,7 @@ struct extent_map { + + struct extent_map_tree { + struct rb_root map; +- spinlock_t lock; ++ rwlock_t lock; + }; + + static inline u64 extent_map_end(struct extent_map *em) +@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask); + void free_extent_map(struct extent_map *em); + int __init extent_map_init(void); + void extent_map_exit(void); ++int unpin_extent_cache(struct extent_map_tree *tree, u64 
start, u64 len); ++struct extent_map *search_extent_mapping(struct extent_map_tree *tree, ++ u64 start, u64 len); + #endif +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 4b83397..4599113 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +- u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; +@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; ++ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); ++ if (err) ++ return err; + +- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- trans = btrfs_join_transaction(root, 1); +- if (!trans) { +- err = -ENOMEM; +- goto out_unlock; +- } +- btrfs_set_trans_block_group(trans, inode); +- hint_byte = 0; +- +- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); +- +- /* check for reserved extents on each page, we don't want +- * to reset the delalloc bit on things that already have +- * extents reserved. +- */ +- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); +@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + * at this time. + */ + } +- err = btrfs_end_transaction(trans, root); +-out_unlock: +- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; + } + +@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { +- spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); ++ write_unlock(&em_tree->lock); + break; + } + if (start < em->start) { +@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + start = em->start + em->len; + } + free_extent_map(em); ++ write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + free_extent_map(split); + split = NULL; + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); +@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_byte) ++ u64 inline_limit, u64 *hint_byte, int drop_cache) + { + u64 extent_end = 0; + u64 search_start = start; +@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + int ret; + + inline_limit = 0; +- btrfs_drop_extent_cache(inode, start, end - 1, 0); ++ if (drop_cache) ++ btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = 
btrfs_alloc_path(); + if (!path) +@@ -894,7 +878,8 @@ again: + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, +- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, ++ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); +@@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); ++ ++ /* do the reserve before the mutex lock in case we have to do some ++ * flushing. We wouldn't deadlock, but this is more polite. ++ */ ++ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (err) ++ goto out_nolock; ++ ++ mutex_lock(&inode->i_mutex); ++ + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) +- goto out_nolock; ++ goto out; ++ + if (count == 0) +- goto out_nolock; ++ goto out; + + err = file_remove_suid(file); + if (err) +- goto out_nolock; ++ goto out; ++ + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + +- mutex_lock(&inode->i_mutex); ++ /* generic_write_checks can change our pos */ ++ start_pos = pos; ++ + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; +@@ -1047,6 +1046,7 @@ out: + mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + + out_nolock: + kfree(pages); +@@ -1087,8 +1087,10 @@ out_nolock: + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); +- } else { ++ } else if (ret != BTRFS_NO_LOG_SYNC) { + btrfs_commit_transaction(trans, root); ++ } else { ++ btrfs_end_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { +@@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + int ret = 0; + struct btrfs_trans_handle *trans; + ++ ++ /* we wait first, since the writeback may change the inode */ ++ root->log_batch++; ++ /* the VFS called filemap_fdatawrite for us */ ++ btrfs_wait_ordered_range(inode, 0, (u64)-1); ++ root->log_batch++; ++ + /* + * check the transaction that last modified this inode + * and see if its already been committed +@@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + if (!BTRFS_I(inode)->last_trans) + goto out; + ++ /* ++ * if the last transaction that changed this file was before ++ * the current transaction, we can bail out now without any ++ * syncing ++ */ + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { +@@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + } + mutex_unlock(&root->fs_info->trans_mutex); + +- root->log_batch++; +- filemap_fdatawrite(inode->i_mapping); +- btrfs_wait_ordered_range(inode, 0, (u64)-1); +- root->log_batch++; +- +- if (datasync && !(inode->i_state & I_DIRTY_PAGES)) +- goto out; + /* + * ok we haven't committed the transaction yet, lets do a commit + */ +@@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) + */ + mutex_unlock(&dentry->d_inode->i_mutex); + +- if (ret > 0) { +- ret = btrfs_commit_transaction(trans, root); +- } else { +- ret = btrfs_sync_log(trans, root); +- if (ret == 0) +- ret = 
btrfs_end_transaction(trans, root); +- else ++ if (ret != BTRFS_NO_LOG_SYNC) { ++ if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); ++ } else { ++ ret = btrfs_sync_log(trans, root); ++ if (ret == 0) ++ ret = btrfs_end_transaction(trans, root); ++ else ++ ret = btrfs_commit_transaction(trans, root); ++ } ++ } else { ++ ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&dentry->d_inode->i_mutex); + out: +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 5edcee3..5c2caad 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, + + static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + { +- u64 max_bytes, possible_bytes; ++ u64 max_bytes; ++ u64 bitmap_bytes; ++ u64 extent_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space +@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + +- possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + +- (sizeof(struct btrfs_free_space) * +- block_group->extents_thresh); ++ /* ++ * we want to account for 1 more bitmap than what we have so we can make ++ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as ++ * we add more bitmaps. ++ */ ++ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; + +- if (possible_bytes > max_bytes) { +- int extent_bytes = max_bytes - +- (block_group->total_bitmaps * PAGE_CACHE_SIZE); ++ if (bitmap_bytes >= max_bytes) { ++ block_group->extents_thresh = 0; ++ return; ++ } + +- if (extent_bytes <= 0) { +- block_group->extents_thresh = 0; +- return; +- } ++ /* ++ * we want the extent entry threshold to always be at most 1/2 the maxw ++ * bytes we can have, or whatever is less than that. 
++ */ ++ extent_bytes = max_bytes - bitmap_bytes; ++ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + +- block_group->extents_thresh = extent_bytes / +- (sizeof(struct btrfs_free_space)); +- } ++ block_group->extents_thresh = ++ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + } + + static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, +@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); ++ info->bytes = 0; + link_free_space(block_group, info); + block_group->total_bitmaps++; + +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index 6b627c6..72ce3c1 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { ++ if (ret == -EOVERFLOW) ++ ret = -EMLINK; + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], +@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); +- if (ret == 0 && objectid > root->highest_inode) +- root->highest_inode = objectid; + return ret; + } + +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 9abbced..c56eb59 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); +- *objectid = found_key.objectid; ++ *objectid = max_t(u64, found_key.objectid, ++ BTRFS_FIRST_FREE_OBJECTID - 1); + } else { +- *objectid = BTRFS_FIRST_FREE_OBJECTID; ++ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; + error: +@@ -53,91 +54,27 @@ error: + return ret; + } + +-/* +- * walks the btree of allocated inodes and find a hole. +- */ + int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) + { +- struct btrfs_path *path; +- struct btrfs_key key; + int ret; +- int slot = 0; +- u64 last_ino = 0; +- int start_found; +- struct extent_buffer *l; +- struct btrfs_key search_key; +- u64 search_start = dirid; +- + mutex_lock(&root->objectid_mutex); +- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && +- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { +- *objectid = ++root->last_inode_alloc; +- mutex_unlock(&root->objectid_mutex); +- return 0; +- } +- path = btrfs_alloc_path(); +- BUG_ON(!path); +- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); +- search_key.objectid = search_start; +- search_key.type = 0; +- search_key.offset = 0; +- +- start_found = 0; +- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); +- if (ret < 0) +- goto error; + +- while (1) { +- l = path->nodes[0]; +- slot = path->slots[0]; +- if (slot >= btrfs_header_nritems(l)) { +- ret = btrfs_next_leaf(root, path); +- if (ret == 0) +- continue; +- if (ret < 0) +- goto error; +- if (!start_found) { +- *objectid = search_start; +- start_found = 1; +- goto found; +- } +- *objectid = last_ino > search_start ? 
+- last_ino : search_start; +- goto found; +- } +- btrfs_item_key_to_cpu(l, &key, slot); +- if (key.objectid >= search_start) { +- if (start_found) { +- if (last_ino < search_start) +- last_ino = search_start; +- if (key.objectid > last_ino) { +- *objectid = last_ino; +- goto found; +- } +- } else if (key.objectid > search_start) { +- *objectid = search_start; +- goto found; +- } +- } +- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) +- break; ++ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_find_highest_inode(root, &root->highest_objectid); ++ if (ret) ++ goto out; ++ } + +- start_found = 1; +- last_ino = key.objectid + 1; +- path->slots[0]++; ++ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ++ ret = -ENOSPC; ++ goto out; + } +- BUG_ON(1); +-found: +- btrfs_release_path(root, path); +- btrfs_free_path(path); +- BUG_ON(*objectid < search_start); +- mutex_unlock(&root->objectid_mutex); +- return 0; +-error: +- btrfs_release_path(root, path); +- btrfs_free_path(path); ++ ++ *objectid = ++root->highest_objectid; ++ ret = 0; ++out: + mutex_unlock(&root->objectid_mutex); + return ret; + } +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 59cba18..f69e5e0 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + } + + ret = btrfs_drop_extents(trans, root, inode, start, +- aligned_end, aligned_end, start, &hint_byte); ++ aligned_end, aligned_end, start, ++ &hint_byte, 1); + BUG_ON(ret); + + if (isize > actual_end) +@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); +- btrfs_drop_extent_cache(inode, start, aligned_end, 0); ++ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + return 0; + } + +@@ -423,9 +424,12 @@ again: + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 0, +- 0, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + ret = 0; + goto free_pages_out; + } +@@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode, + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode, + * clear dirty, set writeback and unlock the pages. 
+ */ + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- async_extent->start, +- async_extent->start + +- async_extent->ram_size - 1, +- NULL, 1, 1, 0, 1, 1, 0); ++ &BTRFS_I(inode)->io_tree, ++ async_extent->start, ++ async_extent->start + ++ async_extent->ram_size - 1, ++ NULL, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, +@@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, +- &BTRFS_I(inode)->io_tree, +- start, end, NULL, 1, 1, +- 1, 1, 1, 1); ++ &BTRFS_I(inode)->io_tree, ++ start, end, NULL, ++ EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | ++ EXTENT_CLEAR_DELALLOC | ++ EXTENT_CLEAR_ACCOUNTING | ++ EXTENT_CLEAR_DIRTY | ++ EXTENT_SET_WRITEBACK | ++ EXTENT_END_WRITEBACK); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; +@@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode, + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + ++ ++ read_lock(&BTRFS_I(inode)->extent_tree.lock); ++ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, ++ start, num_bytes); ++ if (em) { ++ alloc_hint = em->block_start; ++ free_extent_map(em); ++ } ++ read_unlock(&BTRFS_I(inode)->extent_tree.lock); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { ++ unsigned long op; ++ + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, +@@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode, + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; +- + ram_size = ins.offset; + em->len = ins.offset; + +@@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode, + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode, + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits ++ * ++ * Do set the Private2 bit so we know this page was properly ++ * setup for writepage + */ ++ op = unlock ? 
EXTENT_CLEAR_UNLOCK_PAGE : 0; ++ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2; ++ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, +- locked_page, unlock, 1, +- 1, 0, 0, 0); ++ locked_page, op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; +@@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 cur_end; + int limit = 10 * 1024 * 1042; + +- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | +- EXTENT_DELALLOC, 1, 0, GFP_NOFS); ++ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, ++ 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; +@@ -994,6 +1023,7 @@ next_slot: + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; ++ extent_type = 0; + goto out_check; + } + +@@ -1080,9 +1110,9 @@ out_check: + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; +@@ -1100,8 +1130,10 @@ out_check: + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, +- cur_offset, cur_offset + num_bytes - 1, +- locked_page, 1, 1, 1, 0, 0, 0); ++ cur_offset, cur_offset + num_bytes - 1, ++ locked_page, EXTENT_CLEAR_UNLOCK_PAGE | ++ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | ++ EXTENT_SET_PRIVATE2); + cur_offset = extent_end; + if (cur_offset > end) + break; +@@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + return ret; + } + ++static int btrfs_split_extent_hook(struct inode *inode, ++ struct extent_state *orig, u64 split) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 size; ++ ++ if (!(orig->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ size = orig->end - orig->start + 1; ++ if (size > root->fs_info->max_extent) { ++ u64 num_extents; ++ u64 new_size; ++ ++ new_size = orig->end - split + 1; ++ num_extents = div64_u64(size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ ++ /* ++ * if we break a large extent up then leave oustanding_extents ++ * be, since we've already accounted for the large extent. ++ */ ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) < num_extents) ++ return 0; ++ } ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ ++/* ++ * extent_io.c merge_extent_hook, used to track merged delayed allocation ++ * extents so we can keep track of new extents that are just merged onto old ++ * extents, such as when we are doing sequential writes, so we can properly ++ * account for the metadata space we'll need. 
++ */ ++static int btrfs_merge_extent_hook(struct inode *inode, ++ struct extent_state *new, ++ struct extent_state *other) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ u64 new_size, old_size; ++ u64 num_extents; ++ ++ /* not delalloc, ignore it */ ++ if (!(other->state & EXTENT_DELALLOC)) ++ return 0; ++ ++ old_size = other->end - other->start + 1; ++ if (new->start < other->start) ++ new_size = other->end - new->start + 1; ++ else ++ new_size = new->end - other->start + 1; ++ ++ /* we're not bigger than the max, unreserve the space and go */ ++ if (new_size <= root->fs_info->max_extent) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ return 0; ++ } ++ ++ /* ++ * If we grew by another max_extent, just return, we want to keep that ++ * reserved amount. ++ */ ++ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent); ++ if (div64_u64(new_size + root->fs_info->max_extent - 1, ++ root->fs_info->max_extent) > num_extents) ++ return 0; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ ++ return 0; ++} ++ + /* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that +@@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, + static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) + { ++ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC +@@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents++; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_delalloc_reserve_space(root, inode, end - start + 1); + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; +@@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + /* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, +- unsigned long old, unsigned long bits) ++static int btrfs_clear_bit_hook(struct inode *inode, ++ struct extent_state *state, unsigned long bits) + { + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ +- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { ++ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + ++ if (bits & EXTENT_DO_ACCOUNTING) { ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ } ++ + spin_lock(&root->fs_info->delalloc_lock); +- if (end - start + 1 > root->fs_info->delalloc_bytes) { ++ if (state->end - state->start + 1 > ++ root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", +- (unsigned long long)end - start + 1, 
++ (unsigned long long) ++ state->end - state->start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); +@@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + btrfs_delalloc_free_space(root, inode, +- end - start + 1); +- root->fs_info->delalloc_bytes -= end - start + 1; +- BTRFS_I(inode)->delalloc_bytes -= end - start + 1; ++ state->end - ++ state->start + 1); ++ root->fs_info->delalloc_bytes -= state->end - ++ state->start + 1; ++ BTRFS_I(inode)->delalloc_bytes -= state->end - ++ state->start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +@@ -1374,10 +1506,8 @@ again: + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? We're done */ +- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, +- EXTENT_ORDERED, 0)) { ++ if (PagePrivate2(page)) + goto out; +- } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { +@@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; +- int ret; + +- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, +- EXTENT_ORDERED, 0); +- if (ret) ++ /* this page is properly in the ordered list */ ++ if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) +@@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + BUG_ON(!path); + + path->leave_spinning = 1; ++ ++ /* ++ * we may be replacing one extent in the tree with another. ++ * The new extent is pinned in the extent map, and we don't want ++ * to drop it from the cache until it is completely in the btree. ++ * ++ * So, tell btrfs_drop_extents to leave this extent in the cache. ++ * the caller is expected to unpin it and allow it to be merged ++ * with the others. 
++ */ + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, locked_end, +- file_pos, &hint); ++ file_pos, &hint, 0); + BUG_ON(ret); + + ins.objectid = inode->i_ino; +@@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); +- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; +@@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); ++ unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ++ ordered_extent->file_offset, ++ ordered_extent->len); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, +@@ -1623,6 +1763,7 @@ nocow: + static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) + { ++ ClearPagePrivate2(page); + return btrfs_finish_ordered_io(page->mapping->host, start, end); + } + +@@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, + failrec->last_mirror = 0; + failrec->bio_flags = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); +@@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && +- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { ++ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; +@@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) + return ret; + } + ++int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *dir, u64 objectid, ++ const char *name, int name_len) ++{ ++ struct btrfs_path *path; ++ struct extent_buffer *leaf; ++ struct btrfs_dir_item *di; ++ struct btrfs_key key; ++ u64 index; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, ++ name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, ++ objectid, root->root_key.objectid, ++ dir->i_ino, &index, name, name_len); ++ if (ret < 0) { ++ BUG_ON(ret != -ENOENT); ++ di = btrfs_search_dir_index_item(root, path, dir->i_ino, ++ name, name_len); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(root, path); ++ index = key.offset; ++ } ++ ++ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, ++ index, name, name_len, -1); ++ BUG_ON(!di || IS_ERR(di)); ++ ++ leaf = path->nodes[0]; ++ btrfs_dir_item_key_to_cpu(leaf, di, &key); ++ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ++ ret = 
btrfs_delete_one_dir_name(trans, root, path, di); ++ BUG_ON(ret); ++ btrfs_release_path(root, path); ++ ++ btrfs_i_size_write(dir, dir->i_size - name_len * 2); ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ret = btrfs_update_inode(trans, root, dir); ++ BUG_ON(ret); ++ dir->i_sb->s_dirt = 1; ++ ++ btrfs_free_path(path); ++ return 0; ++} ++ + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + { + struct inode *inode = dentry->d_inode; +@@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + +- /* +- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir +- * the root of a subvolume or snapshot +- */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || +- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { ++ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -ENOTEMPTY; +- } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + ++ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ err = btrfs_unlink_subvol(trans, root, dir, ++ BTRFS_I(inode)->location.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ goto out; ++ } ++ + err = btrfs_orphan_add(trans, inode); + if (err) +- goto fail_trans; ++ goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +- +-fail_trans: ++out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); +@@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) + + if ((offset & (blocksize - 1)) == 0) + goto out; ++ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); ++ if (ret) ++ goto out; ++ ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) ++ goto out; + + ret = -ENOMEM; + again: + page = grab_cache_page(mapping, index); +- if (!page) ++ if (!page) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + goto out; ++ } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; +@@ -2864,7 +3080,16 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ goto out_unlock; ++ } ++ + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); +@@ -2877,6 +3102,9 @@ again: + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ if (ret) ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + unlock_page(page); + page_cache_release(page); + out: +@@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + u64 last_byte; + u64 cur_offset; + u64 hole_size; +- int err; ++ int err = 0; + + if (size <= hole_start) + return 0; + +- err = btrfs_check_metadata_free_space(root); ++ err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); +- + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, 
hole_start, +@@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) + cur_offset, + cur_offset + hole_size, + block_end, +- cur_offset, &hint_byte); ++ cur_offset, &hint_byte, 1); + if (err) + break; ++ ++ err = btrfs_reserve_metadata_space(root, 1); ++ if (err) ++ break; ++ + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); ++ btrfs_unreserve_metadata_space(root, 1); + } + free_extent_map(em); + cur_offset = last_byte; +@@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode) + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + ++ if (inode->i_nlink > 0) { ++ BUG_ON(btrfs_root_refs(&root->root_item) != 0); ++ goto no_delete; ++ } ++ + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + +@@ -3070,29 +3307,67 @@ out_err: + * is kind of like crossing a mount point. + */ + static int fixup_tree_root_location(struct btrfs_root *root, +- struct btrfs_key *location, +- struct btrfs_root **sub_root, +- struct dentry *dentry) ++ struct inode *dir, ++ struct dentry *dentry, ++ struct btrfs_key *location, ++ struct btrfs_root **sub_root) + { +- struct btrfs_root_item *ri; ++ struct btrfs_path *path; ++ struct btrfs_root *new_root; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; ++ int ret; ++ int err = 0; + +- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) +- return 0; +- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) +- return 0; ++ path = btrfs_alloc_path(); ++ if (!path) { ++ err = -ENOMEM; ++ goto out; ++ } + +- *sub_root = btrfs_read_fs_root(root->fs_info, location, +- dentry->d_name.name, +- dentry->d_name.len); +- if (IS_ERR(*sub_root)) +- return PTR_ERR(*sub_root); ++ err = -ENOENT; ++ ret = btrfs_find_root_ref(root->fs_info->tree_root, path, ++ BTRFS_I(dir)->root->root_key.objectid, ++ location->objectid); ++ if (ret) { ++ if (ret < 0) ++ err = ret; ++ goto out; ++ } + +- ri = &(*sub_root)->root_item; +- location->objectid = btrfs_root_dirid(ri); +- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); +- location->offset = 0; ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); ++ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || ++ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) ++ goto out; + +- return 0; ++ ret = memcmp_extent_buffer(leaf, dentry->d_name.name, ++ (unsigned long)(ref + 1), ++ dentry->d_name.len); ++ if (ret) ++ goto out; ++ ++ btrfs_release_path(root->fs_info->tree_root, path); ++ ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, location); ++ if (IS_ERR(new_root)) { ++ err = PTR_ERR(new_root); ++ goto out; ++ } ++ ++ if (btrfs_root_refs(&new_root->root_item) == 0) { ++ err = -ENOENT; ++ goto out; ++ } ++ ++ *sub_root = new_root; ++ location->objectid = btrfs_root_dirid(&new_root->root_item); ++ location->type = BTRFS_INODE_ITEM_KEY; ++ location->offset = 0; ++ err = 0; ++out: ++ btrfs_free_path(path); ++ return err; + } + + static void inode_tree_add(struct inode *inode) +@@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode) + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; +- + again: + p = &root->inode_tree.rb_node; + parent = NULL; + ++ if (hlist_unhashed(&inode->i_hash)) ++ return; ++ + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; +@@ -3132,13 +3409,87 @@ again: + static void inode_tree_del(struct inode *inode) + { + struct btrfs_root 
*root = BTRFS_I(inode)->root; ++ int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ } ++ spin_unlock(&root->inode_lock); ++ ++ if (empty && btrfs_root_refs(&root->root_item) == 0) { ++ synchronize_srcu(&root->fs_info->subvol_srcu); ++ spin_lock(&root->inode_lock); ++ empty = RB_EMPTY_ROOT(&root->inode_tree); ++ spin_unlock(&root->inode_lock); ++ if (empty) ++ btrfs_add_dead_root(root); ++ } ++} ++ ++int btrfs_invalidate_inodes(struct btrfs_root *root) ++{ ++ struct rb_node *node; ++ struct rb_node *prev; ++ struct btrfs_inode *entry; ++ struct inode *inode; ++ u64 objectid = 0; ++ ++ WARN_ON(btrfs_root_refs(&root->root_item) != 0); ++ ++ spin_lock(&root->inode_lock); ++again: ++ node = root->inode_tree.rb_node; ++ prev = NULL; ++ while (node) { ++ prev = node; ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ ++ if (objectid < entry->vfs_inode.i_ino) ++ node = node->rb_left; ++ else if (objectid > entry->vfs_inode.i_ino) ++ node = node->rb_right; ++ else ++ break; ++ } ++ if (!node) { ++ while (prev) { ++ entry = rb_entry(prev, struct btrfs_inode, rb_node); ++ if (objectid <= entry->vfs_inode.i_ino) { ++ node = prev; ++ break; ++ } ++ prev = rb_next(prev); ++ } ++ } ++ while (node) { ++ entry = rb_entry(node, struct btrfs_inode, rb_node); ++ objectid = entry->vfs_inode.i_ino + 1; ++ inode = igrab(&entry->vfs_inode); ++ if (inode) { ++ spin_unlock(&root->inode_lock); ++ if (atomic_read(&inode->i_count) > 1) ++ d_prune_aliases(inode); ++ /* ++ * btrfs_drop_inode will remove it from ++ * the inode cache when its usage count ++ * hits zero. ++ */ ++ iput(inode); ++ cond_resched(); ++ spin_lock(&root->inode_lock); ++ goto again; ++ } ++ ++ if (cond_resched_lock(&root->inode_lock)) ++ goto again; ++ ++ node = rb_next(node); + } + spin_unlock(&root->inode_lock); ++ return 0; + } + + static noinline void init_btrfs_i(struct inode *inode) +@@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode) + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; ++ bi->last_sub_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; +@@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + return inode; + } + ++static struct inode *new_simple_dir(struct super_block *s, ++ struct btrfs_key *key, ++ struct btrfs_root *root) ++{ ++ struct inode *inode = new_inode(s); ++ ++ if (!inode) ++ return ERR_PTR(-ENOMEM); ++ ++ init_btrfs_i(inode); ++ ++ BTRFS_I(inode)->root = root; ++ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); ++ BTRFS_I(inode)->dummy_inode = 1; ++ ++ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; ++ inode->i_op = &simple_dir_inode_operations; ++ inode->i_fop = &simple_dir_operations; ++ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ ++ return inode; ++} ++ + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + { + struct inode *inode; +- struct btrfs_inode *bi = BTRFS_I(dir); +- struct btrfs_root *root = bi->root; ++ struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; ++ int index; + int ret; + ++ dentry->d_op = &btrfs_dentry_operations; ++ + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +@@ 
-3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) + if (ret < 0) + return ERR_PTR(ret); + +- inode = NULL; +- if (location.objectid) { +- ret = fixup_tree_root_location(root, &location, &sub_root, +- dentry); +- if (ret < 0) +- return ERR_PTR(ret); +- if (ret > 0) +- return ERR_PTR(-ENOENT); ++ if (location.objectid == 0) ++ return NULL; ++ ++ if (location.type == BTRFS_INODE_ITEM_KEY) { ++ inode = btrfs_iget(dir->i_sb, &location, root); ++ return inode; ++ } ++ ++ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); ++ ++ index = srcu_read_lock(&root->fs_info->subvol_srcu); ++ ret = fixup_tree_root_location(root, dir, dentry, ++ &location, &sub_root); ++ if (ret < 0) { ++ if (ret != -ENOENT) ++ inode = ERR_PTR(ret); ++ else ++ inode = new_simple_dir(dir->i_sb, &location, sub_root); ++ } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root); +- if (IS_ERR(inode)) +- return ERR_CAST(inode); + } ++ srcu_read_unlock(&root->fs_info->subvol_srcu, index); ++ + return inode; + } + ++static int btrfs_dentry_delete(struct dentry *dentry) ++{ ++ struct btrfs_root *root; ++ ++ if (!dentry->d_inode && !IS_ROOT(dentry)) ++ dentry = dentry->d_parent; ++ ++ if (dentry->d_inode) { ++ root = BTRFS_I(dentry->d_inode)->root; ++ if (btrfs_root_refs(&root->root_item) == 0) ++ return 1; ++ } ++ return 0; ++} ++ + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) + { + struct inode *inode; + +- if (dentry->d_name.len > BTRFS_NAME_LEN) +- return ERR_PTR(-ENAMETOOLONG); +- + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); +@@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + if (ret != 0) + goto fail; + +- if (objectid > root->highest_inode) +- root->highest_inode = objectid; +- + inode->i_uid = current_fsuid(); + + if (dir && (dir->i_mode & S_ISGID)) { +@@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) + { +- int ret; ++ int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + +- key.objectid = inode->i_ino; +- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); +- key.offset = 0; ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); ++ } else { ++ key.objectid = inode->i_ino; ++ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ++ key.offset = 0; ++ } ++ ++ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, ++ key.objectid, root->root_key.objectid, ++ parent_inode->i_ino, ++ index, name, name_len); ++ } else if (add_backref) { ++ ret = btrfs_insert_inode_ref(trans, root, ++ name, name_len, inode->i_ino, ++ parent_inode->i_ino, index); ++ } + +- ret = btrfs_insert_dir_item(trans, root, name, name_len, +- parent_inode->i_ino, +- &key, btrfs_inode_type(inode), +- index); + if (ret == 0) { +- if (add_backref) { +- ret = btrfs_insert_inode_ref(trans, root, +- name, name_len, +- inode->i_ino, +- parent_inode->i_ino, +- index); +- } ++ ret = btrfs_insert_dir_item(trans, root, name, name_len, ++ parent_inode->i_ino, &key, ++ btrfs_inode_type(inode), index); ++ BUG_ON(ret); ++ + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; +@@ -3732,11 +4139,18 @@ 
static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + if (!new_valid_dev(rdev)) + return -EINVAL; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3774,6 +4188,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, + u64 objectid; + u64 index = 0; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 for inode item and ref ++ * 2 for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto fail; ++ return err; ++ + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -3838,6 +4261,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + if (inode->i_nlink == 0) + return -ENOENT; + +- btrfs_inc_nlink(inode); +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 item for inode ref ++ * 2 items for dir items ++ */ ++ err = btrfs_reserve_metadata_space(root, 3); + if (err) +- goto fail; ++ return err; ++ ++ btrfs_inc_nlink(inode); ++ + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; +@@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + +- if (err) +- drop_inode = 1; +- +- btrfs_update_inode_block_group(trans, dir); +- err = btrfs_update_inode(trans, root, inode); +- +- if (err) ++ if (err) { + drop_inode = 1; ++ } else { ++ btrfs_update_inode_block_group(trans, dir); ++ err = btrfs_update_inode(trans, root, inode); ++ BUG_ON(err); ++ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); ++ } + + nr = trans->blocks_used; +- +- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_end_transaction_throttle(trans, root); + fail: ++ btrfs_unreserve_metadata_space(root, 3); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) + u64 index = 0; + unsigned long nr = 1; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode and ref ++ * 2 items for dir items ++ * 1 for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_unlock; ++ return err; + + trans = btrfs_start_transaction(root, 1); +- btrfs_set_trans_block_group(trans, dir); +- +- if (IS_ERR(trans)) { +- err = PTR_ERR(trans); ++ if (!trans) { ++ err = -ENOMEM; + goto out_unlock; + } ++ btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { +@@ -3967,6 +4400,7 @@ out_fail: + 
btrfs_end_transaction_throttle(trans, root); + + out_unlock: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); +@@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + int compressed; + + again: +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) +@@ -4215,6 +4649,11 @@ again: + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); ++ if (pg_offset + copy_size < PAGE_CACHE_SIZE) { ++ memset(map + pg_offset + copy_size, 0, ++ PAGE_CACHE_SIZE - pg_offset - ++ copy_size); ++ } + kunmap(page); + } + flush_dcache_page(page); +@@ -4259,7 +4698,7 @@ insert: + } + + err = 0; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that +@@ -4299,7 +4738,7 @@ insert: + err = 0; + } + } +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + out: + if (path) + btrfs_free_path(path); +@@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + ++ ++ /* ++ * we have the page locked, so new writeback can't start, ++ * and the dirty bit won't be cleared while we are here. ++ * ++ * Wait for IO on this page so that we can safely clear ++ * the PagePrivate2 bit and do ordered accounting ++ */ + wait_on_page_writeback(page); ++ + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } +- + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); +@@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_LOCKED, 1, 0, GFP_NOFS); +- btrfs_finish_ordered_io(page->mapping->host, +- page_start, page_end); ++ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, ++ NULL, GFP_NOFS); ++ /* ++ * whoever cleared the private bit is responsible ++ * for the finish_ordered_io ++ */ ++ if (TestClearPagePrivate2(page)) { ++ btrfs_finish_ordered_io(page->mapping->host, ++ page_start, page_end); ++ } + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_ORDERED, +- 1, 1, GFP_NOFS); ++ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); +@@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + goto out; + } + ++ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); ++ if (ret) { ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ + again: + lock_page(page); +@@ -4504,7 +4964,24 @@ again: + goto again; + } + +- btrfs_set_extent_delalloc(inode, page_start, page_end); ++ /* ++ * XXX - page_mkwrite gets called 
every time the page is dirtied, even ++ * if it was already dirty, so for space accounting reasons we need to ++ * clear any delalloc bits for the range we are fixing to save. There ++ * is probably a better way to do this, but for now keep consistent with ++ * prepare_pages in the normal write path. ++ */ ++ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, ++ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, ++ GFP_NOFS); ++ ++ ret = btrfs_set_extent_delalloc(inode, page_start, page_end); ++ if (ret) { ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ ret = VM_FAULT_SIGBUS; ++ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ++ goto out_unlock; ++ } + ret = 0; + + /* page is wholly or partially inside EOF */ +@@ -4521,11 +4998,17 @@ again: + } + ClearPageChecked(page); + set_page_dirty(page); ++ SetPageUptodate(page); ++ ++ BTRFS_I(inode)->last_trans = root->fs_info->generation; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + +- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + + out_unlock: ++ btrfs_unreserve_metadata_for_delalloc(root, inode, 1); ++ if (!ret) ++ return VM_FAULT_LOCKED; + unlock_page(page); + out: + return ret; +@@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +- btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); ++ if (ret) ++ return; + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); +@@ -4594,11 +5079,11 @@ out: + * create a new subvolume directory/inode (helper for the ioctl). + */ + int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, +- struct btrfs_root *new_root, struct dentry *dentry, ++ struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint) + { + struct inode *inode; +- int error; ++ int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, +@@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + +- error = btrfs_update_inode(trans, new_root, inode); +- if (error) +- return error; ++ err = btrfs_update_inode(trans, new_root, inode); ++ BUG_ON(err); + +- d_instantiate(dentry, inode); ++ iput(inode); + return 0; + } + +@@ -4640,7 +5124,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + if (!ei) + return NULL; + ei->last_trans = 0; ++ ei->last_sub_trans = 0; + ei->logged_trans = 0; ++ ei->outstanding_extents = 0; ++ ei->reserved_extents = 0; ++ ei->root = NULL; ++ spin_lock_init(&ei->accounting_lock); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); +@@ -4656,6 +5145,14 @@ void btrfs_destroy_inode(struct inode *inode) + WARN_ON(inode->i_data.nrpages); + + /* ++ * This can happen where we create an inode, but somebody else also ++ * created the same inode and we need to destroy the one we already ++ * created. ++ */ ++ if (!root) ++ goto free; ++ ++ /* + * Make sure we're properly removed from the ordered operation + * lists. 
+ */ +@@ -4690,9 +5187,20 @@ void btrfs_destroy_inode(struct inode *inode) + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); ++free: + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + } + ++void btrfs_drop_inode(struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) ++ generic_delete_inode(inode); ++ else ++ generic_drop_inode(inode); ++} ++ + static void init_once(void *foo) + { + struct btrfs_inode *ei = (struct btrfs_inode *) foo; +@@ -4761,31 +5269,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; ++ struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; ++ u64 root_objectid; + int ret; + +- /* we're not allowed to rename between subvolumes */ +- if (BTRFS_I(old_inode)->root->root_key.objectid != +- BTRFS_I(new_dir)->root->root_key.objectid) ++ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) ++ return -EPERM; ++ ++ /* we only allow rename subvolume link between subvolumes */ ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + ++ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || ++ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) ++ return -ENOTEMPTY; ++ + if (S_ISDIR(old_inode->i_mode) && new_inode && +- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { ++ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; +- } + +- /* to rename a snapshot or subvolume, we need to juggle the +- * backrefs. This isn't coded yet ++ /* ++ * 2 items for dir items ++ * 1 item for orphan entry ++ * 1 item for ref + */ +- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) +- return -EXDEV; +- +- ret = btrfs_check_metadata_free_space(root); ++ ret = btrfs_reserve_metadata_space(root, 4); + if (ret) +- goto out_unlock; ++ return ret; + + /* + * we're using rename to replace one file with another. +@@ -4796,8 +5310,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + ++ /* close the racy window with snapshot create/destroy ioctl */ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ down_read(&root->fs_info->subvol_sem); ++ + trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, new_dir); ++ ++ if (dest != root) ++ btrfs_record_root_in_trans(trans, dest); ++ ++ ret = btrfs_set_inode_index(new_dir, &index); ++ if (ret) ++ goto out_fail; + ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ /* force full log commit if subvolume involved. */ ++ root->fs_info->last_trans_log_full_commit = trans->transid; ++ } else { ++ ret = btrfs_insert_inode_ref(trans, dest, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, ++ old_inode->i_ino, ++ new_dir->i_ino, index); ++ if (ret) ++ goto out_fail; ++ /* ++ * this is an ugly little race, but the rename is required ++ * to make sure that if we crash, the inode is either at the ++ * old name or the new one. pinning the log transaction lets ++ * us make sure we don't allow a log commit to come in after ++ * we unlink the name but before we add the new name back in. 
++ */ ++ btrfs_pin_log_trans(root); ++ } + /* + * make sure the inode gets flushed if it is replacing + * something. +@@ -4807,18 +5353,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + btrfs_add_ordered_operation(trans, root, old_inode); + } + +- /* +- * this is an ugly little race, but the rename is required to make +- * sure that if we crash, the inode is either at the old name +- * or the new one. pinning the log transaction lets us make sure +- * we don't allow a log commit to come in after we unlink the +- * name but before we add the new name back in. +- */ +- btrfs_pin_log_trans(root); +- +- btrfs_set_trans_block_group(trans, new_dir); +- +- btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; +@@ -4826,47 +5360,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + +- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, +- old_dentry->d_name.name, +- old_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { ++ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; ++ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } else { ++ btrfs_inc_nlink(old_dentry->d_inode); ++ ret = btrfs_unlink_inode(trans, root, old_dir, ++ old_dentry->d_inode, ++ old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ } ++ BUG_ON(ret); + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; +- ret = btrfs_unlink_inode(trans, root, new_dir, +- new_dentry->d_inode, +- new_dentry->d_name.name, +- new_dentry->d_name.len); +- if (ret) +- goto out_fail; ++ if (unlikely(new_inode->i_ino == ++ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ++ root_objectid = BTRFS_I(new_inode)->location.objectid; ++ ret = btrfs_unlink_subvol(trans, dest, new_dir, ++ root_objectid, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ BUG_ON(new_inode->i_nlink == 0); ++ } else { ++ ret = btrfs_unlink_inode(trans, dest, new_dir, ++ new_dentry->d_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ } ++ BUG_ON(ret); + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); +- if (ret) +- goto out_fail; ++ BUG_ON(ret); + } +- + } +- ret = btrfs_set_inode_index(new_dir, &index); +- if (ret) +- goto out_fail; + +- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, +- old_inode, new_dentry->d_name.name, +- new_dentry->d_name.len, 1, index); +- if (ret) +- goto out_fail; ++ ret = btrfs_add_link(trans, new_dir, old_inode, ++ new_dentry->d_name.name, ++ new_dentry->d_name.len, 0, index); ++ BUG_ON(ret); + +- btrfs_log_new_name(trans, old_inode, old_dir, +- new_dentry->d_parent); ++ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ btrfs_log_new_name(trans, old_inode, old_dir, ++ new_dentry->d_parent); ++ btrfs_end_log_trans(root); ++ } + out_fail: +- +- /* this btrfs_end_log_trans just allows the current +- * log-sub transaction to complete +- */ +- btrfs_end_log_trans(root); + btrfs_end_transaction_throttle(trans, root); +-out_unlock: ++ ++ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) ++ up_read(&root->fs_info->subvol_sem); ++ ++ btrfs_unreserve_metadata_space(root, 4); + return ret; + } + +@@ -4938,11 +5485,18 @@ static int btrfs_symlink(struct inode 
*dir, struct dentry *dentry, + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + +- err = btrfs_check_metadata_free_space(root); ++ /* ++ * 2 items for inode item and ref ++ * 2 items for dir items ++ * 1 item for xattr if selinux is on ++ */ ++ err = btrfs_reserve_metadata_space(root, 5); + if (err) +- goto out_fail; ++ return err; + + trans = btrfs_start_transaction(root, 1); ++ if (!trans) ++ goto out_fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); +@@ -5023,6 +5577,7 @@ out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + out_fail: ++ btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); +@@ -5044,6 +5599,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); ++ ++ ret = btrfs_reserve_metadata_space(root, 1); ++ if (ret) ++ goto out; ++ + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); +@@ -5058,9 +5618,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, + 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); ++ btrfs_drop_extent_cache(inode, cur_offset, ++ cur_offset + ins.offset -1, 0); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; ++ btrfs_unreserve_metadata_space(root, 1); + } + out: + if (cur_offset > start) { +@@ -5223,6 +5786,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, + }; ++ + static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, +@@ -5245,6 +5809,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, ++ .merge_extent_hook = btrfs_merge_extent_hook, ++ .split_extent_hook = btrfs_split_extent_hook, + }; + + /* +@@ -5309,3 +5875,7 @@ static struct inode_operations btrfs_symlink_inode_operations = { + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + }; ++ ++const struct dentry_operations btrfs_dentry_operations = { ++ .d_delete = btrfs_dentry_delete, ++}; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index bd88f25..cdbb054 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root, + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; +- struct btrfs_root *new_root = root; +- struct inode *dir; ++ struct btrfs_root *new_root; ++ struct inode *dir = dentry->d_parent->d_inode; + int ret; + int err; + u64 objectid; +@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root, + u64 index = 0; + unsigned long nr = 1; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) +- goto fail_commit; ++ return ret; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); +@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root, + if (ret) + goto fail; + ++ key.offset = (u64)-1; ++ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); ++ BUG_ON(IS_ERR(new_root)); 
++ ++ btrfs_record_root_in_trans(trans, new_root); ++ ++ ret = btrfs_create_subvol_root(trans, new_root, new_dirid, ++ BTRFS_I(dir)->block_group); + /* + * insert the directory item + */ +- key.offset = (u64)-1; +- dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + +@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root, + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- objectid, BTRFS_ROOT_BACKREF_KEY, +- root->root_key.objectid, ++ objectid, root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, +- root->root_key.objectid, BTRFS_ROOT_REF_KEY, +- objectid, +- dir->i_ino, index, name, namelen); +- +- BUG_ON(ret); +- +- ret = btrfs_commit_transaction(trans, root); +- if (ret) +- goto fail_commit; +- +- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); +- BUG_ON(!new_root); +- +- trans = btrfs_start_transaction(new_root, 1); +- BUG_ON(!trans); +- +- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, +- BTRFS_I(dir)->block_group); +- if (ret) +- goto fail; +- ++ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + fail: + nr = trans->blocks_used; +- err = btrfs_commit_transaction(trans, new_root); ++ err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; +-fail_commit: ++ ++ btrfs_unreserve_metadata_space(root, 6); + btrfs_btree_balance_dirty(root, nr); + return ret; + } +@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + if (!root->ref_cows) + return -EINVAL; + +- ret = btrfs_check_metadata_free_space(root); ++ /* ++ * 1 - inode item ++ * 2 - refs ++ * 1 - root item ++ * 2 - dir items ++ */ ++ ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); ++ btrfs_unreserve_metadata_space(root, 6); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); +@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. 
+ */ +-static noinline int btrfs_mksubvol(struct path *parent, char *name, +- int mode, int namelen, ++static noinline int btrfs_mksubvol(struct path *parent, ++ char *name, int namelen, + struct btrfs_root *snap_src) + { ++ struct inode *dir = parent->dentry->d_inode; + struct dentry *dentry; + int error; + +- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); +@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, + if (dentry->d_inode) + goto out_dput; + +- if (!IS_POSIXACL(parent->dentry->d_inode)) +- mode &= ~current_umask(); +- + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + +- error = btrfs_may_create(parent->dentry->d_inode, dentry); ++ error = btrfs_may_create(dir, dentry); + if (error) + goto out_drop_write; + +- /* +- * Actually perform the low-level subvolume creation after all +- * this VFS fuzz. +- * +- * Eventually we want to pass in an inode under which we create this +- * subvolume, but for now all are under the filesystem root. +- * +- * Also we should pass on the mode eventually to allow creating new +- * subvolume with specific mode bits. +- */ ++ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); ++ ++ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) ++ goto out_up_read; ++ + if (snap_src) { +- struct dentry *dir = dentry->d_parent; +- struct dentry *test = dir->d_parent; +- struct btrfs_path *path = btrfs_alloc_path(); +- int ret; +- u64 test_oid; +- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; +- +- test_oid = snap_src->root_key.objectid; +- +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, parent_oid, test_oid); +- if (ret == 0) +- goto create; +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- +- /* we need to make sure we aren't creating a directory loop +- * by taking a snapshot of something that has our current +- * subvol in its directory tree. So, this loops through +- * the dentries and checks the forward refs for each subvolume +- * to see if is references the subvolume where we are +- * placing this new snapshot. 
+- */ +- while (1) { +- if (!test || +- dir == snap_src->fs_info->sb->s_root || +- test == snap_src->fs_info->sb->s_root || +- test->d_inode->i_sb != snap_src->fs_info->sb) { +- break; +- } +- if (S_ISLNK(test->d_inode->i_mode)) { +- printk(KERN_INFO "Btrfs symlink in snapshot " +- "path, failed\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- test_oid = +- BTRFS_I(test->d_inode)->root->root_key.objectid; +- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, +- path, test_oid, parent_oid); +- if (ret == 0) { +- printk(KERN_INFO "Btrfs snapshot creation " +- "failed, looping\n"); +- error = -EMLINK; +- btrfs_free_path(path); +- goto out_drop_write; +- } +- btrfs_release_path(snap_src->fs_info->tree_root, path); +- test = test->d_parent; +- } +-create: +- btrfs_free_path(path); +- error = create_snapshot(snap_src, dentry, name, namelen); ++ error = create_snapshot(snap_src, dentry, ++ name, namelen); + } else { +- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, +- dentry, name, namelen); ++ error = create_subvol(BTRFS_I(dir)->root, dentry, ++ name, namelen); + } +- if (error) +- goto out_drop_write; +- +- fsnotify_mkdir(parent->dentry->d_inode, dentry); ++ if (!error) ++ fsnotify_mkdir(dir, dentry); ++out_up_read: ++ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + out_drop_write: + mnt_drop_write(parent->mnt); + out_dput: + dput(dentry); + out_unlock: +- mutex_unlock(&parent->dentry->d_inode->i_mutex); ++ mutex_unlock(&dir->i_mutex); + return error; + } + +- + static int btrfs_defrag_file(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -596,9 +534,8 @@ again: + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); +- +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); ++ unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); +@@ -609,7 +546,8 @@ out_unlock: + return 0; + } + +-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) ++static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ++ void __user *arg) + { + u64 new_size; + u64 old_size; +@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; +- struct btrfs_dir_item *di; +- struct btrfs_path *path; + struct file *src_file; +- u64 root_dirid; + int namelen; + int ret = 0; + +@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + goto out; + } + +- path = btrfs_alloc_path(); +- if (!path) { +- ret = -ENOMEM; +- goto out; +- } +- +- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, +- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, +- path, root_dirid, +- vol_args->name, namelen, 0); +- btrfs_free_path(path); +- +- if (di && !IS_ERR(di)) { +- ret = -EEXIST; +- goto out; +- } +- +- if (IS_ERR(di)) { +- ret = PTR_ERR(di); +- goto out; +- } +- + if (subvol) { +- ret = btrfs_mksubvol(&file->f_path, vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, NULL); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); +@@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, + fput(src_file); + goto out; + } +- ret = btrfs_mksubvol(&file->f_path, 
vol_args->name, +- file->f_path.dentry->d_inode->i_mode, +- namelen, BTRFS_I(src_inode)->root); ++ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, ++ BTRFS_I(src_inode)->root); + fput(src_file); + } +- + out: + kfree(vol_args); + return ret; + } + ++/* ++ * helper to check if the subvolume references other subvolumes ++ */ ++static noinline int may_destroy_subvol(struct btrfs_root *root) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = root->root_key.objectid; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = (u64)-1; ++ ++ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, ++ &key, path, 0, 0); ++ if (ret < 0) ++ goto out; ++ BUG_ON(ret == 0); ++ ++ ret = 0; ++ if (path->slots[0] > 0) { ++ path->slots[0]--; ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid == root->root_key.objectid && ++ key.type == BTRFS_ROOT_REF_KEY) ++ ret = -ENOTEMPTY; ++ } ++out: ++ btrfs_free_path(path); ++ return ret; ++} ++ ++static noinline int btrfs_ioctl_snap_destroy(struct file *file, ++ void __user *arg) ++{ ++ struct dentry *parent = fdentry(file); ++ struct dentry *dentry; ++ struct inode *dir = parent->d_inode; ++ struct inode *inode; ++ struct btrfs_root *root = BTRFS_I(dir)->root; ++ struct btrfs_root *dest = NULL; ++ struct btrfs_ioctl_vol_args *vol_args; ++ struct btrfs_trans_handle *trans; ++ int namelen; ++ int ret; ++ int err = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ vol_args = memdup_user(arg, sizeof(*vol_args)); ++ if (IS_ERR(vol_args)) ++ return PTR_ERR(vol_args); ++ ++ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ++ namelen = strlen(vol_args->name); ++ if (strchr(vol_args->name, '/') || ++ strncmp(vol_args->name, "..", namelen) == 0) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = mnt_want_write(file->f_path.mnt); ++ if (err) ++ goto out; ++ ++ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); ++ dentry = lookup_one_len(vol_args->name, parent, namelen); ++ if (IS_ERR(dentry)) { ++ err = PTR_ERR(dentry); ++ goto out_unlock_dir; ++ } ++ ++ if (!dentry->d_inode) { ++ err = -ENOENT; ++ goto out_dput; ++ } ++ ++ inode = dentry->d_inode; ++ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { ++ err = -EINVAL; ++ goto out_dput; ++ } ++ ++ dest = BTRFS_I(inode)->root; ++ ++ mutex_lock(&inode->i_mutex); ++ err = d_invalidate(dentry); ++ if (err) ++ goto out_unlock; ++ ++ down_write(&root->fs_info->subvol_sem); ++ ++ err = may_destroy_subvol(dest); ++ if (err) ++ goto out_up_write; ++ ++ trans = btrfs_start_transaction(root, 1); ++ ret = btrfs_unlink_subvol(trans, root, dir, ++ dest->root_key.objectid, ++ dentry->d_name.name, ++ dentry->d_name.len); ++ BUG_ON(ret); ++ ++ btrfs_record_root_in_trans(trans, dest); ++ ++ memset(&dest->root_item.drop_progress, 0, ++ sizeof(dest->root_item.drop_progress)); ++ dest->root_item.drop_level = 0; ++ btrfs_set_root_refs(&dest->root_item, 0); ++ ++ ret = btrfs_insert_orphan_item(trans, ++ root->fs_info->tree_root, ++ dest->root_key.objectid); ++ BUG_ON(ret); ++ ++ ret = btrfs_commit_transaction(trans, root); ++ BUG_ON(ret); ++ inode->i_flags |= S_DEAD; ++out_up_write: ++ up_write(&root->fs_info->subvol_sem); ++out_unlock: ++ mutex_unlock(&inode->i_mutex); ++ if (!err) { ++ shrink_dcache_sb(root->fs_info->sb); ++ btrfs_invalidate_inodes(dest); ++ d_delete(dentry); ++ } ++out_dput: ++ dput(dentry); ++out_unlock_dir: ++ mutex_unlock(&dir->i_mutex); ++ mnt_drop_write(file->f_path.mnt); ++out: ++ 
kfree(vol_args); ++ return err; ++} ++ + static int btrfs_ioctl_defrag(struct file *file) + { + struct inode *inode = fdentry(file)->d_inode; +@@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) + return ret; + } + +-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, +- u64 off, u64 olen, u64 destoff) ++static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ++ u64 off, u64 olen, u64 destoff) + { + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; +@@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off + len, +- off + len, 0, &hint_byte); ++ off + len, 0, &hint_byte, 1); + + /* clone data */ + key.objectid = src->i_ino; +@@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + datao += off - key.offset; + datal -= off - key.offset; + } +- if (key.offset + datao + datal + key.offset > +- off + len) +- datal = off + len - key.offset - datao; ++ ++ if (key.offset + datal > off + len) ++ datal = off + len - key.offset; ++ + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; +@@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; ++ int ret; + ++ ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; ++ goto out; + +- if (file->private_data) { +- ret = -EINPROGRESS; ++ ret = -EINPROGRESS; ++ if (file->private_data) + goto out; +- } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) +@@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file) + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + ++ ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root, 0); +- if (trans) +- file->private_data = trans; +- else +- ret = -ENOMEM; +- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ ++ if (!trans) ++ goto out_drop; ++ ++ file->private_data = trans; ++ return 0; ++ ++out_drop: ++ mutex_lock(&root->fs_info->trans_mutex); ++ root->fs_info->open_ioctl_trans--; ++ mutex_unlock(&root->fs_info->trans_mutex); ++ mnt_drop_write(file->f_path.mnt); + out: + return ret; + } +@@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file) + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; +- int ret = 0; + + trans = file->private_data; +- if (!trans) { +- ret = -EINVAL; +- goto out; +- } +- btrfs_end_transaction(trans, root); ++ if (!trans) ++ return -EINVAL; + file->private_data = NULL; + ++ btrfs_end_transaction(trans, root); ++ + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); +- +-out: +- return ret; ++ return 0; + } + + long btrfs_ioctl(struct file *file, unsigned int +@@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); ++ case BTRFS_IOC_SNAP_DESTROY: ++ return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: +diff --git a/fs/btrfs/ioctl.h 
b/fs/btrfs/ioctl.h +index b320b10..bc49914 100644 +--- a/fs/btrfs/ioctl.h ++++ b/fs/btrfs/ioctl.h +@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args { + + #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +- ++#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ ++ struct btrfs_ioctl_vol_args) + #endif +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index d6f0806..ab21c29 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * + * len is the length of the extent + * +- * This also sets the EXTENT_ORDERED bit on the range in the inode. +- * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; ++ entry->bytes_left = len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); +@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + &entry->rb_node); + BUG_ON(node); + +- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, +- entry_end(entry) - 1, GFP_NOFS); +- + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); +@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); +- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, +- GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; +@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, + goto out; + } + +- ret = test_range_bit(io_tree, entry->file_offset, +- entry->file_offset + entry->len - 1, +- EXTENT_ORDERED, 0); +- if (ret == 0) ++ if (io_size > entry->bytes_left) { ++ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", ++ (unsigned long long)entry->bytes_left, ++ (unsigned long long)io_size); ++ } ++ entry->bytes_left -= io_size; ++ if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); ++ else ++ ret = 1; + out: + mutex_unlock(&tree->mutex); + return ret == 0; +@@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + ++ spin_lock(&BTRFS_I(inode)->accounting_lock); ++ BTRFS_I(inode)->outstanding_extents--; ++ spin_unlock(&BTRFS_I(inode)->accounting_lock); ++ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, ++ inode, 1); ++ + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + +@@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; ++ int found; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); +@@ -502,6 +507,7 @@ again: + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; ++ found = 0; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) +@@ -514,6 +520,7 @@ 
again: + btrfs_put_ordered_extent(ordered); + break; + } ++ found++; + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); +@@ -521,8 +528,8 @@ again: + break; + end--; + } +- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, +- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { ++ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, ++ EXTENT_DELALLOC, 0, NULL)) { + schedule_timeout(1); + goto again; + } +@@ -613,7 +620,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + goto out; + } + /* +@@ -664,7 +671,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, +- EXTENT_DELALLOC, 0)) { ++ EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index 3d31c88..993a7ea 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -85,6 +85,9 @@ struct btrfs_ordered_extent { + /* extent length on disk */ + u64 disk_len; + ++ /* number of bytes that still need writing */ ++ u64 bytes_left; ++ + /* flags (described above) */ + unsigned long flags; + +diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c +index 3c0d52a..79cba5f 100644 +--- a/fs/btrfs/orphan.c ++++ b/fs/btrfs/orphan.c +@@ -65,3 +65,23 @@ out: + btrfs_free_path(path); + return ret; + } ++ ++int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) ++{ ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int ret; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = offset; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ ++ btrfs_free_path(path); ++ return ret; ++} +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index c04f7f2..cfcc93c 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -121,6 +121,15 @@ struct inodevec { + int nr; + }; + ++#define MAX_EXTENTS 128 ++ ++struct file_extent_cluster { ++ u64 start; ++ u64 end; ++ u64 boundary[MAX_EXTENTS]; ++ unsigned int nr; ++}; ++ + struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; +@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) + { + if (test_range_bit(&rc->processed_blocks, bytenr, +- bytenr + blocksize - 1, EXTENT_DIRTY, 1)) ++ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; + } +@@ -2529,56 +2538,94 @@ out: + } + + static noinline_for_stack +-int relocate_inode_pages(struct inode *inode, u64 start, u64 len) ++int setup_extent_mapping(struct inode *inode, u64 start, u64 end, ++ u64 block_start) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; ++ struct extent_map *em; ++ int ret = 0; ++ ++ em = alloc_extent_map(GFP_NOFS); ++ if (!em) ++ return -ENOMEM; ++ ++ em->start = start; ++ em->len = end + 1 - start; ++ em->block_len = em->len; ++ em->block_start = block_start; ++ em->bdev = root->fs_info->fs_devices->latest_bdev; ++ set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, start, end, 
GFP_NOFS); ++ while (1) { ++ write_lock(&em_tree->lock); ++ ret = add_extent_mapping(em_tree, em); ++ write_unlock(&em_tree->lock); ++ if (ret != -EEXIST) { ++ free_extent_map(em); ++ break; ++ } ++ btrfs_drop_extent_cache(inode, start, end, 0); ++ } ++ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); ++ return ret; ++} ++ ++static int relocate_file_extent_cluster(struct inode *inode, ++ struct file_extent_cluster *cluster) + { + u64 page_start; + u64 page_end; +- unsigned long i; +- unsigned long first_index; ++ u64 offset = BTRFS_I(inode)->index_cnt; ++ unsigned long index; + unsigned long last_index; +- unsigned int total_read = 0; +- unsigned int total_dirty = 0; ++ unsigned int dirty_page = 0; + struct page *page; + struct file_ra_state *ra; +- struct btrfs_ordered_extent *ordered; +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; ++ int nr = 0; + int ret = 0; + ++ if (!cluster->nr) ++ return 0; ++ + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + ++ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; ++ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; ++ + mutex_lock(&inode->i_mutex); +- first_index = start >> PAGE_CACHE_SHIFT; +- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + +- /* make sure the dirty trick played by the caller work */ +- while (1) { +- ret = invalidate_inode_pages2_range(inode->i_mapping, +- first_index, last_index); +- if (ret != -EBUSY) +- break; +- schedule_timeout(HZ/10); +- } ++ i_size_write(inode, cluster->end + 1 - offset); ++ ret = setup_extent_mapping(inode, cluster->start - offset, ++ cluster->end - offset, cluster->start); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + +- for (i = first_index ; i <= last_index; i++) { +- if (total_read % ra->ra_pages == 0) { +- btrfs_force_ra(inode->i_mapping, ra, NULL, i, +- min(last_index, ra->ra_pages + i - 1)); +- } +- total_read++; +-again: +- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) +- BUG_ON(1); +- page = grab_cache_page(inode->i_mapping, i); ++ WARN_ON(cluster->start != cluster->boundary[0]); ++ while (index <= last_index) { ++ page = find_lock_page(inode->i_mapping, index); + if (!page) { +- ret = -ENOMEM; +- goto out_unlock; ++ page_cache_sync_readahead(inode->i_mapping, ++ ra, NULL, index, ++ last_index + 1 - index); ++ page = grab_cache_page(inode->i_mapping, index); ++ if (!page) { ++ ret = -ENOMEM; ++ goto out_unlock; ++ } ++ } ++ ++ if (PageReadahead(page)) { ++ page_cache_async_readahead(inode->i_mapping, ++ ra, NULL, page, index, ++ last_index + 1 - index); + } ++ + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); +@@ -2589,75 +2636,79 @@ again: + goto out_unlock; + } + } +- wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; +- lock_extent(io_tree, page_start, page_end, GFP_NOFS); +- +- ordered = btrfs_lookup_ordered_extent(inode, page_start); +- if (ordered) { +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); +- unlock_page(page); +- page_cache_release(page); +- btrfs_start_ordered_extent(inode, ordered, 1); +- btrfs_put_ordered_extent(ordered); +- goto again; +- } ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); ++ + set_page_extent_mapped(page); + +- if (i == first_index) +- set_extent_bits(io_tree, page_start, page_end, ++ if (nr < cluster->nr && ++ page_start + offset == cluster->boundary[nr]) { ++ set_extent_bits(&BTRFS_I(inode)->io_tree, ++ 
page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); ++ nr++; ++ } + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); +- total_dirty++; ++ dirty_page++; + +- unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ++ unlock_extent(&BTRFS_I(inode)->io_tree, ++ page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); ++ ++ index++; ++ if (nr < cluster->nr && ++ page_end + 1 + offset == cluster->boundary[nr]) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); ++ dirty_page = 0; ++ } ++ } ++ if (dirty_page) { ++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ++ dirty_page); + } ++ WARN_ON(nr != cluster->nr); + out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); +- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; + } + + static noinline_for_stack +-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) ++int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, ++ struct file_extent_cluster *cluster) + { +- struct btrfs_root *root = BTRFS_I(inode)->root; +- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +- struct extent_map *em; +- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; +- u64 end = start + extent_key->offset - 1; +- +- em = alloc_extent_map(GFP_NOFS); +- em->start = start; +- em->len = extent_key->offset; +- em->block_len = extent_key->offset; +- em->block_start = extent_key->objectid; +- em->bdev = root->fs_info->fs_devices->latest_bdev; +- set_bit(EXTENT_FLAG_PINNED, &em->flags); ++ int ret; + +- /* setup extent map to cheat btrfs_readpage */ +- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); +- while (1) { +- int ret; +- spin_lock(&em_tree->lock); +- ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); +- if (ret != -EEXIST) { +- free_extent_map(em); +- break; +- } +- btrfs_drop_extent_cache(inode, start, end, 0); ++ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; + } +- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + +- return relocate_inode_pages(inode, start, extent_key->offset); ++ if (!cluster->nr) ++ cluster->start = extent_key->objectid; ++ else ++ BUG_ON(cluster->nr >= MAX_EXTENTS); ++ cluster->end = extent_key->objectid + extent_key->offset - 1; ++ cluster->boundary[cluster->nr] = extent_key->objectid; ++ cluster->nr++; ++ ++ if (cluster->nr >= MAX_EXTENTS) { ++ ret = relocate_file_extent_cluster(inode, cluster); ++ if (ret) ++ return ret; ++ cluster->nr = 0; ++ } ++ return 0; + } + + #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags) + return 0; + } + ++ + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + { + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; ++ struct file_extent_cluster *cluster; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; +@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + int ret; + int err = 0; + ++ cluster = kzalloc(sizeof(*cluster), GFP_NOFS); ++ if (!cluster) ++ return -ENOMEM; ++ + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++ rc->extents_found = 0; ++ rc->extents_skipped = 0; ++ + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, 
(u64)-1, EXTENT_DIRTY, + GFP_NOFS); +@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + nr = trans->blocks_used; +- btrfs_end_transaction_throttle(trans, rc->extent_root); ++ btrfs_end_transaction(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; +- ret = relocate_data_extent(rc->data_inode, &key); ++ ret = relocate_data_extent(rc->data_inode, ++ &key, cluster); + if (ret < 0) { + err = ret; + break; +@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + ++ if (!err) { ++ ret = relocate_file_extent_cluster(rc->data_inode, cluster); ++ if (ret < 0) ++ err = ret; ++ } ++ ++ kfree(cluster); ++ + rc->create_reloc_root = 0; + smp_mb(); + +@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + } + + static int __insert_orphan_inode(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 objectid, u64 size) ++ struct btrfs_root *root, u64 objectid) + { + struct btrfs_path *path; + struct btrfs_inode_item *item; +@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); +- btrfs_set_inode_size(leaf, item, size); ++ btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); +@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + if (err) + goto out; + +- err = __insert_orphan_inode(trans, root, objectid, group->key.offset); +- BUG_ON(err); +- +- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, +- group->key.offset, 0, group->key.offset, +- 0, 0, 0); ++ err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; +@@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", +- fs_info->thread_pool_size); ++ fs_info->thread_pool_size, NULL); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); +@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + btrfs_wait_ordered_extents(fs_info->tree_root, 0); + + while (1) { +- mutex_lock(&fs_info->cleaner_mutex); +- btrfs_clean_old_snapshots(fs_info->tree_root); +- mutex_unlock(&fs_info->cleaner_mutex); +- + rc->extents_found = 0; + rc->extents_skipped = 0; + ++ mutex_lock(&fs_info->cleaner_mutex); ++ ++ btrfs_clean_old_snapshots(fs_info->tree_root); + ret = relocate_block_group(rc); ++ ++ mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + break; +@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) + } + } + +- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, +- rc->block_group->key.objectid, +- rc->block_group->key.objectid + +- rc->block_group->key.offset - 1); ++ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, ++ rc->block_group->key.objectid, ++ 
rc->block_group->key.objectid + ++ rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); +@@ -3530,6 +3594,26 @@ out: + return err; + } + ++static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) ++{ ++ struct btrfs_trans_handle *trans; ++ int ret; ++ ++ trans = btrfs_start_transaction(root->fs_info->tree_root, 1); ++ ++ memset(&root->root_item.drop_progress, 0, ++ sizeof(root->root_item.drop_progress)); ++ root->root_item.drop_level = 0; ++ btrfs_set_root_refs(&root->root_item, 0); ++ ret = btrfs_update_root(trans, root->fs_info->tree_root, ++ &root->root_key, &root->root_item); ++ BUG_ON(ret); ++ ++ ret = btrfs_end_transaction(trans, root->fs_info->tree_root); ++ BUG_ON(ret); ++ return 0; ++} ++ + /* + * recover relocation interrupted by system crash. + * +@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { +- err = PTR_ERR(fs_root); +- goto out; ++ ret = PTR_ERR(fs_root); ++ if (ret != -ENOENT) { ++ err = ret; ++ goto out; ++ } ++ mark_garbage_root(reloc_root); + } + } + +@@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", +- root->fs_info->thread_pool_size); ++ root->fs_info->thread_pool_size, NULL); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); +diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c +index 0ddc6d6..9351428 100644 +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + goto out; + + BUG_ON(ret == 0); ++ if (path->slots[0] == 0) { ++ ret = 1; ++ goto out; ++ } + l = path->nodes[0]; +- BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); +- if (found_key.objectid != objectid) { ++ if (found_key.objectid != objectid || ++ found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } +- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), +- sizeof(*item)); +- memcpy(key, &found_key, sizeof(found_key)); ++ if (item) ++ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), ++ sizeof(*item)); ++ if (key) ++ memcpy(key, &found_key, sizeof(found_key)); + ret = 0; + out: + btrfs_free_path(path); +@@ -249,6 +255,59 @@ err: + return ret; + } + ++int btrfs_find_orphan_roots(struct btrfs_root *tree_root) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_path *path; ++ struct btrfs_key key; ++ int err = 0; ++ int ret; ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ key.objectid = BTRFS_ORPHAN_OBJECTID; ++ key.type = BTRFS_ORPHAN_ITEM_KEY; ++ key.offset = 0; ++ ++ while (1) { ++ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); ++ if (ret < 0) { ++ err = ret; ++ break; ++ } ++ ++ leaf = path->nodes[0]; ++ if (path->slots[0] >= btrfs_header_nritems(leaf)) { ++ ret = btrfs_next_leaf(tree_root, path); ++ if (ret < 0) ++ err = ret; ++ if (ret != 0) ++ break; ++ leaf = path->nodes[0]; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ btrfs_release_path(tree_root, path); ++ ++ if (key.objectid != BTRFS_ORPHAN_OBJECTID || ++ key.type != BTRFS_ORPHAN_ITEM_KEY) ++ break; ++ ++ ret = btrfs_find_dead_roots(tree_root, key.offset); ++ if (ret) { ++ err = ret; ++ break; ++ } ++ ++ key.offset++; ++ } ++ ++ 
btrfs_free_path(path); ++ return err; ++} ++ + /* drop the root item for 'key' from 'root' */ + int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +@@ -278,31 +337,57 @@ out: + return ret; + } + +-#if 0 /* this will get used when snapshot deletion is implemented */ + int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id) ++ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, ++ const char *name, int name_len) ++ + { ++ struct btrfs_path *path; ++ struct btrfs_root_ref *ref; ++ struct extent_buffer *leaf; + struct btrfs_key key; ++ unsigned long ptr; ++ int err = 0; + int ret; +- struct btrfs_path *path; + + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); +- BUG_ON(ret); +- +- ret = btrfs_del_item(trans, tree_root, path); +- BUG_ON(ret); ++ BUG_ON(ret < 0); ++ if (ret == 0) { ++ leaf = path->nodes[0]; ++ ref = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_root_ref); ++ ++ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); ++ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ++ ptr = (unsigned long)(ref + 1); ++ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); ++ *sequence = btrfs_root_ref_sequence(leaf, ref); ++ ++ ret = btrfs_del_item(trans, tree_root, path); ++ BUG_ON(ret); ++ } else ++ err = -ENOENT; ++ ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } + + btrfs_free_path(path); +- return ret; ++ return err; + } +-#endif + + int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, +@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + return ret; + } + +- + /* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. 
+@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, + */ + int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, +- u64 root_id, u8 type, u64 ref_id, +- u64 dirid, u64 sequence, ++ u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) + { + struct btrfs_key key; +@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf; + unsigned long ptr; + +- + path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; + + key.objectid = root_id; +- key.type = type; ++ key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +- ++again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); +@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + ++ if (key.type == BTRFS_ROOT_BACKREF_KEY) { ++ btrfs_release_path(tree_root, path); ++ key.objectid = ref_id; ++ key.type = BTRFS_ROOT_REF_KEY; ++ key.offset = root_id; ++ goto again; ++ } ++ + btrfs_free_path(path); +- return ret; ++ return 0; + } +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 6d6d06c..939b68f 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -66,7 +66,8 @@ enum { + Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, +- Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, ++ Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, ++ Opt_discard, Opt_err, + }; + + static match_table_t tokens = { +@@ -88,6 +89,7 @@ static match_table_t tokens = { + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, ++ {Opt_discard, "discard"}, + {Opt_err, NULL}, + }; + +@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) + info->metadata_ratio); + } + break; ++ case Opt_discard: ++ btrfs_set_opt(info->mount_opt, DISCARD); ++ break; + default: + break; + } +@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb, + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; ++#endif + + tree_root = open_ctree(sb, fs_devices, (char *)data); + +@@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb) + } + + static struct super_operations btrfs_super_ops = { ++ .drop_inode = btrfs_drop_inode, + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index cdbb502..bca82a4 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + { + if (root->ref_cows && root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); +- WARN_ON(root->root_item.refs == 0); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, +@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; + ++ if (!current->journal_info) ++ current->journal_info = h; ++ + root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, 
root); + mutex_unlock(&root->fs_info->trans_mutex); +@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); ++ ++ if (current->journal_info == trans) ++ current->journal_info = NULL; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + +@@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + /* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of +- * those extents are on disk for transaction or log commit ++ * those extents are sent to disk but does not wait on them + */ +-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages) ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) + { + int ret; + int err = 0; +@@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + page_cache_release(page); + } + } ++ if (err) ++ werr = err; ++ return werr; ++} ++ ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. This is used to make sure all of ++ * those extents are on disk for transaction or log commit. We wait ++ * on all the pages and clear them from the dirty pages state tree ++ */ ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int err = 0; ++ int werr = 0; ++ struct page *page; ++ struct inode *btree_inode = root->fs_info->btree_inode; ++ u64 start = 0; ++ u64 end; ++ unsigned long index; ++ + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); +@@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + return werr; + } + ++/* ++ * when btree blocks are allocated, they have some corresponding bits set for ++ * them in one of two extent_io trees. 
This is used to make sure all of ++ * those extents are on disk for transaction or log commit ++ */ ++int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages) ++{ ++ int ret; ++ int ret2; ++ ++ ret = btrfs_write_marked_extents(root, dirty_pages); ++ ret2 = btrfs_wait_marked_extents(root, dirty_pages); ++ return ret || ret2; ++} ++ + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) + { +@@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; +- key.offset = 0; ++ /* record when the snapshot was created in key.offset */ ++ key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); +@@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + memcpy(&pending->root_key, &key, sizeof(key)); + fail: + kfree(new_root_item); ++ btrfs_unreserve_metadata_space(root, 6); + return ret; + } + +@@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + +- /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, +- BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + +- /* now add the forward ref */ +- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, +- parent_root->root_key.objectid, +- BTRFS_ROOT_REF_KEY, +- pending->root_key.objectid, +- parent_inode->i_ino, index, pending->name, +- namelen); +- + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); + fail: +@@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; +- struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + int should_grow = 0; +@@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + return 0; + } + +- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); +- if (!pinned_copy) +- return -ENOMEM; +- +- extent_io_tree_init(pinned_copy, +- root->fs_info->btree_inode->i_mapping, GFP_NOFS); +- + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + if (cur_trans->list.prev != &root->fs_info->trans_list) { +@@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + ret = commit_cowonly_roots(trans, root); + BUG_ON(ret); + ++ btrfs_prepare_extent_commit(trans, root); ++ + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; +@@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + +- btrfs_copy_pinned(root, pinned_copy); +- + trans->transaction->blocked = 0; + + wake_up(&root->fs_info->transaction_wait); +@@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + +- btrfs_finish_extent_commit(trans, root, pinned_copy); +- kfree(pinned_copy); ++ 
btrfs_finish_extent_commit(trans, root); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); +@@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + + mutex_unlock(&root->fs_info->trans_mutex); + ++ if (current->journal_info == trans) ++ current->journal_info = NULL; ++ + kmem_cache_free(btrfs_trans_handle_cachep, trans); + return ret; + } +@@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) + + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); +- list_del_init(&root->root_list); +- btrfs_drop_snapshot(root, 0); ++ list_del(&root->root_list); ++ ++ if (btrfs_header_backref_rev(root->node) < ++ BTRFS_MIXED_BACKREF_REV) ++ btrfs_drop_snapshot(root, 0); ++ else ++ btrfs_drop_snapshot(root, 1); + } + return 0; + } +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index 663c674..d4e3e7a 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) + { + BTRFS_I(inode)->last_trans = trans->transaction->transid; ++ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + } + + int btrfs_end_transaction(struct btrfs_trans_handle *trans, +@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); ++int btrfs_write_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); ++int btrfs_wait_marked_extents(struct btrfs_root *root, ++ struct extent_io_tree *dirty_pages); + int btrfs_transaction_in_commit(struct btrfs_fs_info *info); + #endif +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index d91b0de..f51bf13 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, + + mutex_lock(&root->log_mutex); + if (root->log_root) { ++ if (!root->log_start_pid) { ++ root->log_start_pid = current->pid; ++ root->log_multiple_pids = false; ++ } else if (root->log_start_pid != current->pid) { ++ root->log_multiple_pids = true; ++ } ++ + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } ++ root->log_multiple_pids = false; ++ root->log_start_pid = current->pid; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); +@@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log, + struct walk_control *wc, u64 gen) + { + if (wc->pin) +- btrfs_update_pinned_extents(log->fs_info->extent_root, +- eb->start, eb->len, 1); ++ btrfs_pin_extent(log->fs_info->extent_root, ++ eb->start, eb->len, 0); + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) +@@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, +- start, extent_end, extent_end, start, &alloc_hint); ++ start, extent_end, extent_end, start, &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || +@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root 
*log_root_tree = root->fs_info->log_root_tree; ++ u64 log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; +@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + while (1) { + unsigned long batch = root->log_batch; +- mutex_unlock(&root->log_mutex); +- schedule_timeout_uninterruptible(1); +- mutex_lock(&root->log_mutex); +- ++ if (root->log_multiple_pids) { ++ mutex_unlock(&root->log_mutex); ++ schedule_timeout_uninterruptible(1); ++ mutex_lock(&root->log_mutex); ++ } + wait_for_writer(trans, root); + if (batch == root->log_batch) + break; +@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + goto out; + } + +- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); ++ /* we start IO on all the marked extents here, but we don't actually ++ * wait for them until later. ++ */ ++ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; ++ log_transid = root->log_transid; + root->log_transid++; + log->log_transid = root->log_transid; ++ root->log_start_pid = 0; + smp_mb(); + /* + * log tree has been flushed to disk, new modifications of +@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); +@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; +@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages); + BUG_ON(ret); ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); +@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + * the running transaction open, so a full commit can't hop + * in and cause problems either. 
+ */ +- write_ctree_super(trans, root->fs_info->tree_root, 2); ++ write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = 0; + ++ mutex_lock(&root->log_mutex); ++ if (root->last_log_commit < log_transid) ++ root->last_log_commit = log_transid; ++ mutex_unlock(&root->log_mutex); ++ + out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); +@@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2852,6 +2876,21 @@ out: + return ret; + } + ++static int inode_in_log(struct btrfs_trans_handle *trans, ++ struct inode *inode) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ int ret = 0; ++ ++ mutex_lock(&root->log_mutex); ++ if (BTRFS_I(inode)->logged_trans == trans->transid && ++ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) ++ ret = 1; ++ mutex_unlock(&root->log_mutex); ++ return ret; ++} ++ ++ + /* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref +@@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + goto end_no_trans; + } + ++ if (root != BTRFS_I(inode)->root || ++ btrfs_root_refs(&root->root_item) == 0) { ++ ret = 1; ++ goto end_no_trans; ++ } ++ + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + ++ if (inode_in_log(trans, inode)) { ++ ret = BTRFS_NO_LOG_SYNC; ++ goto end_no_trans; ++ } ++ + start_log_trans(trans, root); + + ret = btrfs_log_inode(trans, root, inode, inode_only); +@@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + break; + + inode = parent->d_inode; ++ if (root != BTRFS_I(inode)->root) ++ break; ++ + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } +- if (parent == sb->s_root) ++ if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; +@@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; +- u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, +@@ -3010,11 +3062,6 @@ again: + path); + BUG_ON(ret); + } +- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); +- if (ret == 0) { +- wc.replay_dest->highest_inode = highest_inode; +- wc.replay_dest->last_inode_alloc = highest_inode; +- } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index d09c760..0776eac 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -19,6 +19,9 @@ + #ifndef __TREE_LOG_ + #define __TREE_LOG_ + ++/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ ++#define BTRFS_NO_LOG_SYNC 256 ++ + int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 5dbefd1..20cbd2e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -276,7 +276,7 @@ loop_lock: + * is now congested. 
Back off and let other work structs + * run instead + */ +- if (pending && bdi_write_congested(bdi) && batch_run > 32 && ++ if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + +@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); +- if (!device->name) ++ if (!device->name) { ++ kfree(device); + goto error; ++ } + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; +@@ -719,10 +721,9 @@ error: + * called very infrequently and that a given device has a small number + * of extents + */ +-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, +- struct btrfs_device *device, +- u64 num_bytes, u64 *start, +- u64 *max_avail) ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail) + { + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; +@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + ++ ret = btrfs_can_relocate(extent_root, chunk_offset); ++ if (ret) ++ return -ENOSPC; ++ + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); +@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + * step two, delete the device extents and the + * chunk tree entries + */ +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); +@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; +@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; ++ bool retried = false; ++ int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ++again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; +@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ if (ret == -ENOSPC) ++ failed++; ++ else if (ret) ++ BUG(); + } + + if (found_key.offset == 0) +@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) + key.offset = found_key.offset - 1; + } + ret = 0; ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ WARN_ON(1); ++ ret = -ENOSPC; ++ } + error: + btrfs_free_path(path); + return ret; +@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); ++ if (ret == -ENOSPC) ++ break; + BUG_ON(ret); + + trans = 
btrfs_start_transaction(dev_root, 1); +@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); +- key.offset = found_key.offset; + /* chunk zero is special */ +- if (key.offset == 0) ++ if (found_key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); +@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root) + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); +- BUG_ON(ret); ++ BUG_ON(ret && ret != -ENOSPC); ++ key.offset = found_key.offset - 1; + } + ret = 0; + error: +@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + u64 chunk_offset; + int ret; + int slot; ++ int failed = 0; ++ bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); ++ u64 old_size = device->total_bytes; + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) +@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (!path) + return -ENOMEM; + +- trans = btrfs_start_transaction(root, 1); +- if (!trans) { +- ret = -ENOMEM; +- goto done; +- } +- + path->reada = 2; + + lock_chunks(root); +@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + unlock_chunks(root); +- btrfs_end_transaction(trans, root); + ++again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; +@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + goto done; + if (ret) { + ret = 0; ++ btrfs_release_path(root, path); + break; + } + +@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + +- if (key.objectid != device->devid) ++ if (key.objectid != device->devid) { ++ btrfs_release_path(root, path); + break; ++ } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + +- if (key.offset + length <= new_size) ++ if (key.offset + length <= new_size) { ++ btrfs_release_path(root, path); + break; ++ } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); +@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); +- if (ret) ++ if (ret && ret != -ENOSPC) + goto done; ++ if (ret == -ENOSPC) ++ failed++; ++ key.offset -= 1; ++ } ++ ++ if (failed && !retried) { ++ failed = 0; ++ retried = true; ++ goto again; ++ } else if (failed && retried) { ++ ret = -ENOSPC; ++ lock_chunks(root); ++ ++ device->total_bytes = old_size; ++ if (device->writeable) ++ device->fs_devices->total_rw_bytes += diff; ++ unlock_chunks(root); ++ goto done; + } + + /* Shrinking succeeded, else we would be at "done". 
*/ +@@ -2294,9 +2335,9 @@ again: + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; +- spin_lock(&em_tree->lock); ++ write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); +- spin_unlock(&em_tree->lock); ++ write_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + +@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) + int readonly = 0; + int i; + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + +@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) + struct extent_map *em; + + while (1) { +- spin_lock(&tree->map_tree.lock); ++ write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); +- spin_unlock(&tree->map_tree.lock); ++ write_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); +@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); +@@ -2604,9 +2645,9 @@ again: + atomic_set(&multi->error, 0); + } + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; +@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 stripe_nr; + int i, j, nr = 0; + +- spin_lock(&em_tree->lock); ++ read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); +- spin_unlock(&em_tree->lock); ++ read_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; +@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + +- spin_lock(&map_tree->map_tree.lock); ++ read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); +- spin_unlock(&map_tree->map_tree.lock); ++ read_unlock(&map_tree->map_tree.lock); + + /* already mapped? 
*/ + if (em && em->start <= logical && em->start + em->len > logical) { +@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + map->stripes[i].dev->in_fs_metadata = 1; + } + +- spin_lock(&map_tree->map_tree.lock); ++ write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); +- spin_unlock(&map_tree->map_tree.lock); ++ write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 5139a83..31b0fab 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root); + void btrfs_unlock_volumes(void); + void btrfs_lock_volumes(void); + int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); ++int find_free_dev_extent(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device, u64 num_bytes, ++ u64 *start, u64 *max_avail); + #endif +diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c +index a9d3bf4..b6dd596 100644 +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -260,7 +260,7 @@ err: + * attributes are handled directly. + */ + struct xattr_handler *btrfs_xattr_handlers[] = { +-#ifdef CONFIG_FS_POSIX_ACL ++#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, + #endif diff --git a/original/linux-2.6-debug-vm-would-have-oomkilled.patch b/original/linux-2.6-debug-vm-would-have-oomkilled.patch new file mode 100644 index 000000000..5c6302644 --- /dev/null +++ b/original/linux-2.6-debug-vm-would-have-oomkilled.patch @@ -0,0 +1,65 @@ +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b2a2d68..3b132ee 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -67,6 +67,7 @@ extern int sysctl_overcommit_ratio; + extern int sysctl_panic_on_oom; + extern int sysctl_oom_kill_allocating_task; + extern int sysctl_oom_dump_tasks; ++extern int sysctl_would_have_oomkilled; + extern int max_threads; + extern int core_uses_pid; + extern int suid_dumpable; +@@ -861,6 +862,14 @@ static struct ctl_table vm_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "would_have_oomkilled", ++ .data = &sysctl_would_have_oomkilled, ++ .maxlen = sizeof(sysctl_would_have_oomkilled), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = VM_OVERCOMMIT_RATIO, + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index f255eda..3335a94 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -31,6 +31,7 @@ + int sysctl_panic_on_oom; + int sysctl_oom_kill_allocating_task; + int sysctl_oom_dump_tasks; ++int sysctl_would_have_oomkilled; + static DEFINE_SPINLOCK(zone_scan_lock); + /* #define DEBUG */ + +@@ -321,6 +322,12 @@ static void __oom_kill_task(struct task_struct *p, int verbose) + return; + } + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return; ++ } ++ + if (verbose) + printk(KERN_ERR "Killed process %d (%s)\n", + task_pid_nr(p), p->comm); +@@ -363,6 +370,12 @@ static int oom_kill_task(struct task_struct *p) + return 1; + } while_each_thread(g, q); + ++ if (sysctl_would_have_oomkilled == 1) { ++ printk(KERN_ERR "Would have killed process %d (%s). 
But continuing instead.\n", ++ task_pid_nr(p), p->comm); ++ return 1; ++ } ++ + __oom_kill_task(p, 1); + + /* diff --git a/original/linux-2.6-execshield.patch b/original/linux-2.6-execshield.patch new file mode 100644 index 000000000..a98b90f5b --- /dev/null +++ b/original/linux-2.6-execshield.patch @@ -0,0 +1,1013 @@ +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index c45f415..3a6dbad 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +@@ -94,6 +95,9 @@ static inline int desc_empty(const void *ptr) + + #define load_TLS(t, cpu) native_load_tls(t, cpu) + #define set_ldt native_set_ldt ++#ifdef CONFIG_X86_32 ++#define load_user_cs_desc native_load_user_cs_desc ++#endif /*CONFIG_X86_32*/ + + #define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +@@ -380,4 +384,25 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); + } + ++#ifdef CONFIG_X86_32 ++static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) ++{ ++ limit = (limit - 1) / PAGE_SIZE; ++ desc->a = limit & 0xffff; ++ desc->b = (limit & 0xf0000) | 0x00c0fb00; ++} ++ ++static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; ++} ++ ++#define arch_add_exec_range arch_add_exec_range ++#define arch_remove_exec_range arch_remove_exec_range ++#define arch_flush_exec_range arch_flush_exec_range ++extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); ++extern void arch_flush_exec_range(struct mm_struct *mm); ++#endif /* CONFIG_X86_32 */ ++ + #endif /* _ASM_X86_DESC_H */ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 80a1dee..8314c66 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -7,12 +7,19 @@ + /* + * The x86 doesn't have a mmu context, but + * we put the segment information here. ++ * ++ * exec_limit is used to track the range PROT_EXEC ++ * mappings span. 
+ */ + typedef struct { + void *ldt; + int size; + struct mutex lock; + void *vdso; ++#ifdef CONFIG_X86_32 ++ struct desc_struct user_cs; ++ unsigned long exec_limit; ++#endif + } mm_context_t; + + #ifdef CONFIG_SMP +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 4fb37c8..d5cc31c 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -139,6 +139,9 @@ struct pv_cpu_ops { + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); ++#ifdef CONFIG_X86_32 ++ void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); ++#endif /*CONFIG_X86_32*/ + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); + #ifdef CONFIG_X86_64 +@@ -955,6 +958,12 @@ static inline void set_ldt(const void *addr, unsigned entries) + { + PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); + } ++#ifdef CONFIG_X86_32 ++static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) ++{ ++ PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); ++} ++#endif /*CONFIG_X86_32*/ + static inline void store_gdt(struct desc_ptr *dtr) + { + PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index c776826..fb6b579 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -160,6 +160,9 @@ static inline int hlt_works(int cpu) + + #define cache_line_size() (boot_cpu_data.x86_cache_alignment) + ++#define __HAVE_ARCH_ALIGN_STACK ++extern unsigned long arch_align_stack(unsigned long sp); ++ + extern void cpu_detect(struct cpuinfo_x86 *c); + + extern struct pt_regs *idle_regs(struct pt_regs *); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 3ffdcfa..62cba96 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -804,6 +804,20 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + ++ /* ++ * emulation of NX with segment limits unfortunately means ++ * we have to disable the fast system calls, due to the way that ++ * sysexit clears the segment limits on return. ++ * If we have either disabled exec-shield on the boot command line, ++ * or we have NX, then we don't need to do this. ++ */ ++ if (exec_shield != 0) { ++#ifdef CONFIG_X86_PAE ++ if (!test_cpu_cap(c, X86_FEATURE_NX)) ++#endif ++ clear_cpu_cap(c, X86_FEATURE_SEP); ++ } ++ + /* If the model name is still unset, do table lookup. 
*/ + if (!c->x86_model_id[0]) { + const char *p; +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index 70ec9b9..d956b8c 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -369,6 +369,9 @@ struct pv_cpu_ops pv_cpu_ops = { + .read_tscp = native_read_tscp, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = native_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 59f4524..068e286 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -299,7 +299,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, + void + start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + { ++ int cpu; ++ + set_user_gs(regs, 0); ++ + regs->fs = 0; + set_fs(USER_DS); + regs->ds = __USER_DS; +@@ -308,6 +311,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, current->mm); ++ put_cpu(); ++ + /* + * Free the old FP and other extended state + */ +@@ -354,7 +362,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + __unlazy_fpu(prev_p); +- ++ if (next_p->mm) ++ load_user_cs_desc(cpu, next_p->mm); + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) +@@ -495,3 +504,40 @@ unsigned long get_wchan(struct task_struct *p) + return 0; + } + ++static void modify_cs(struct mm_struct *mm, unsigned long limit) ++{ ++ mm->context.exec_limit = limit; ++ set_user_cs(&mm->context.user_cs, limit); ++ if (mm == current->mm) { ++ int cpu; ++ ++ cpu = get_cpu(); ++ load_user_cs_desc(cpu, mm); ++ put_cpu(); ++ } ++} ++ ++void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) ++{ ++ if (limit > mm->context.exec_limit) ++ modify_cs(mm, limit); ++} ++ ++void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) ++{ ++ struct vm_area_struct *vma; ++ unsigned long limit = PAGE_SIZE; ++ ++ if (old_end == mm->context.exec_limit) { ++ for (vma = mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ modify_cs(mm, limit); ++ } ++} ++ ++void arch_flush_exec_range(struct mm_struct *mm) ++{ ++ mm->context.exec_limit = 0; ++ set_user_cs(&mm->context.user_cs, 0); ++} +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 07d60c8..41e9129 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -118,6 +118,76 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) + if (!user_mode_vm(regs)) + die(str, regs, err); + } ++ ++static inline int ++__compare_user_cs_desc(const struct desc_struct *desc1, ++ const struct desc_struct *desc2) ++{ ++ return ((desc1->limit0 != desc2->limit0) || ++ (desc1->limit != desc2->limit) || ++ (desc1->base0 != desc2->base0) || ++ (desc1->base1 != desc2->base1) || ++ (desc1->base2 != desc2->base2)); ++} ++ ++/* ++ * lazy-check for CS validity on exec-shield binaries: ++ * ++ * the original non-exec stack patch was written by ++ * Solar Designer . Thanks! 
++ */ ++static int ++check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) ++{ ++ struct desc_struct *desc1, *desc2; ++ struct vm_area_struct *vma; ++ unsigned long limit; ++ ++ if (current->mm == NULL) ++ return 0; ++ ++ limit = -1UL; ++ if (current->mm->context.exec_limit != -1UL) { ++ limit = PAGE_SIZE; ++ spin_lock(¤t->mm->page_table_lock); ++ for (vma = current->mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ vma = get_gate_vma(current); ++ if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ spin_unlock(¤t->mm->page_table_lock); ++ if (limit >= TASK_SIZE) ++ limit = -1UL; ++ current->mm->context.exec_limit = limit; ++ } ++ set_user_cs(¤t->mm->context.user_cs, limit); ++ ++ desc1 = ¤t->mm->context.user_cs; ++ desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; ++ ++ if (__compare_user_cs_desc(desc1, desc2)) { ++ /* ++ * The CS was not in sync - reload it and retry the ++ * instruction. If the instruction still faults then ++ * we won't hit this branch next time around. ++ */ ++ if (print_fatal_signals >= 2) { ++ printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, ++ smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ desc1->a, desc1->b, desc2->a, desc2->b); ++ } ++ ++ load_user_cs_desc(cpu, current->mm); ++ ++ return 1; ++ } ++ ++ return 0; ++} + #endif + + static void __kprobes +@@ -276,6 +346,29 @@ do_general_protection(struct pt_regs *regs, long error_code) + if (!user_mode(regs)) + goto gp_in_kernel; + ++#ifdef CONFIG_X86_32 ++{ ++ int cpu; ++ int ok; ++ ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (ok) ++ return; ++ ++ if (print_fatal_signals) { ++ printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", ++ error_code, error_code/8, regs->ip, smp_processor_id()); ++ printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n", ++ current->mm->context.exec_limit, ++ current->mm->context.user_cs.a, ++ current->mm->context.user_cs.b); ++ } ++} ++#endif /*CONFIG_X86_32*/ ++ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + +@@ -885,19 +978,37 @@ do_device_not_available(struct pt_regs *regs, long error_code) + } + + #ifdef CONFIG_X86_32 ++/* ++ * The fixup code for errors in iret jumps to here (iret_exc). It loses ++ * the original trap number and erorr code. The bogus trap 32 and error ++ * code 0 are what the vanilla kernel delivers via: ++ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) ++ * ++ * NOTE: Because of the final "1" in the macro we need to enable interrupts. ++ * ++ * In case of a general protection fault in the iret instruction, we ++ * need to check for a lazy CS update for exec-shield. 
++ */ + dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) + { +- siginfo_t info; ++ int ok; ++ int cpu; ++ + local_irq_enable(); + +- info.si_signo = SIGILL; +- info.si_errno = 0; +- info.si_code = ILL_BADSTK; +- info.si_addr = NULL; +- if (notify_die(DIE_TRAP, "iret exception", +- regs, error_code, 32, SIGILL) == NOTIFY_STOP) +- return; +- do_trap(32, SIGILL, "iret exception", regs, error_code, &info); ++ cpu = get_cpu(); ++ ok = check_lazy_exec_limit(cpu, regs, error_code); ++ put_cpu(); ++ ++ if (!ok && notify_die(DIE_TRAP, "iret exception", regs, ++ error_code, 32, SIGSEGV) != NOTIFY_STOP) { ++ siginfo_t info; ++ info.si_signo = SIGSEGV; ++ info.si_errno = 0; ++ info.si_code = ILL_BADSTK; ++ info.si_addr = 0; ++ do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info); ++ } + } + #endif + +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 34c1bfb..32c3d8d 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -228,6 +228,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); ++#ifdef CONFIG_X86_32 ++ else ++ if (exec_shield) ++ printk(KERN_INFO "Using x86 segment limits to approximate " ++ "NX protection\n"); ++#endif + + /* Enable PSE if available */ + if (cpu_has_pse) +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 949708d..c1373b6 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -587,6 +587,54 @@ void zap_low_mappings(void) + pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); + EXPORT_SYMBOL_GPL(__supported_pte_mask); + ++#ifdef CONFIG_X86_PAE ++ ++static int disable_nx __initdata; ++ ++/* ++ * noexec = on|off ++ * ++ * Control non executable mappings. 
++ * ++ * on Enable ++ * off Disable (disables exec-shield too) ++ */ ++static int __init noexec_setup(char *str) ++{ ++ if (!str || !strcmp(str, "on")) { ++ if (cpu_has_nx) { ++ __supported_pte_mask |= _PAGE_NX; ++ disable_nx = 0; ++ } ++ } else if (!strcmp(str, "off")) { ++ disable_nx = 1; ++ __supported_pte_mask &= ~_PAGE_NX; ++ exec_shield = 0; ++ } else ++ return -EINVAL; ++ ++ return 0; ++} ++early_param("noexec", noexec_setup); ++ ++void __init set_nx(void) ++{ ++ unsigned int v[4], l, h; ++ ++ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { ++ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); ++ ++ if ((v[3] & (1 << 20)) && !disable_nx) { ++ rdmsr(MSR_EFER, l, h); ++ l |= EFER_NX; ++ wrmsr(MSR_EFER, l, h); ++ nx_enabled = 1; ++ __supported_pte_mask |= _PAGE_NX; ++ } ++ } ++} ++#endif ++ + /* user-defined highmem size */ + static unsigned int highmem_pages = -1; + +diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c +index 1658296..72056cf 100644 +--- a/arch/x86/mm/mmap.c ++++ b/arch/x86/mm/mmap.c +@@ -111,13 +111,16 @@ static unsigned long mmap_legacy_base(void) + */ + void arch_pick_mmap_layout(struct mm_struct *mm) + { +- if (mmap_is_legacy()) { ++ if (!(2 & exec_shield) && mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; ++ if (!(current->personality & READ_IMPLIES_EXEC) ++ && mmap_is_ia32()) ++ mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; + mm->unmap_area = arch_unmap_area_topdown; + } + } +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 821e970..ea5a4c3 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,6 +6,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -129,6 +130,12 @@ void smp_invalidate_interrupt(struct pt_regs *regs) + union smp_flush_state *f; + + cpu = smp_processor_id(); ++ ++#ifdef CONFIG_X86_32 ++ if (current->active_mm) ++ load_user_cs_desc(cpu, current->active_mm); ++#endif ++ + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. 
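
A quick way to see what the "Using x86 segment limits to approximate NX protection" message above really means is to run the descriptor arithmetic from set_user_cs() (added to asm/desc.h earlier in this patch) by hand. The following is a small standalone sketch in plain userspace C, not kernel code: the struct is a local stand-in for desc_struct, the example address is made up, and a 4 KiB page size is assumed.

#include <stdio.h>

struct demo_desc { unsigned int a, b; };

/* same arithmetic as set_user_cs() in this patch */
static void demo_set_user_cs(struct demo_desc *desc, unsigned long limit)
{
	limit = (limit - 1) / 4096;               /* last page covered by CS   */
	desc->a = limit & 0xffff;                 /* limit bits 15..0          */
	desc->b = (limit & 0xf0000) | 0x00c0fb00; /* limit 19..16, G=1, 32-bit,
	                                             DPL3 readable code segment */
}

int main(void)
{
	struct demo_desc d;

	/* hypothetical: highest PROT_EXEC mapping ends at 0x0804a000 */
	demo_set_user_cs(&d, 0x0804a000);

	/*
	 * The limit field comes out as 0x08049, so CS covers 0..0x08049fff
	 * and an instruction fetch at or above 0x0804a000 raises #GP --
	 * which is how the segment limit stands in for NX on CPUs without it.
	 */
	printf("CS descriptor words: %#010x %#010x\n", d.b, d.a);
	return 0;
}
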
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c +index 58bc00f..1fdafb5 100644 +--- a/arch/x86/vdso/vdso32-setup.c ++++ b/arch/x86/vdso/vdso32-setup.c +@@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) + if (compat) + addr = VDSO_HIGH_BASE; + else { +- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); ++ addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index 0a1700a..37b8744 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -321,6 +321,24 @@ static void xen_set_ldt(const void *addr, unsigned entries) + xen_mc_issue(PARAVIRT_LAZY_CPU); + } + ++#ifdef CONFIG_X86_32 ++static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm) ++{ ++ void *gdt; ++ xmaddr_t mgdt; ++ u64 descriptor; ++ struct desc_struct user_cs; ++ ++ gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]; ++ mgdt = virt_to_machine(gdt); ++ ++ user_cs = mm->context.user_cs; ++ descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32; ++ ++ HYPERVISOR_update_descriptor(mgdt.maddr, descriptor); ++} ++#endif /*CONFIG_X86_32*/ ++ + static void xen_load_gdt(const struct desc_ptr *dtr) + { + unsigned long va = dtr->address; +@@ -886,6 +904,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, ++#ifdef CONFIG_X86_32 ++ .load_user_cs_desc = xen_load_user_cs_desc, ++#endif /*CONFIG_X86_32*/ + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, +diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c +index 40381df..f856fab 100644 +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -73,7 +73,7 @@ static struct linux_binfmt elf_format = { + .hasvdso = 1 + }; + +-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) ++#define BAD_ADDR(x) IS_ERR_VALUE(x) + + static int set_brk(unsigned long start, unsigned long end) + { +@@ -721,6 +721,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + break; + } + ++ if (current->personality == PER_LINUX && (exec_shield & 2)) { ++ executable_stack = EXSTACK_DISABLE_X; ++ current->flags |= PF_RANDOMIZE; ++ } ++ + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + retval = -ELIBBAD; +@@ -740,6 +745,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + if (retval) + goto out_free_dentry; + ++#ifdef CONFIG_X86_32 ++ /* ++ * Turn off the CS limit completely if exec-shield disabled or ++ * NX active: ++ */ ++ if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled) ++ arch_add_exec_range(current->mm, -1); ++#endif ++ + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->mm->def_flags = def_flags; +@@ -747,7 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. 
*/ + SET_PERSONALITY(loc->elf_ex); +- if (elf_read_implies_exec(loc->elf_ex, executable_stack)) ++ if (!(exec_shield & 2) && ++ elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) +@@ -912,7 +927,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + interpreter, + &interp_map_addr, + load_bias); +- if (!IS_ERR((void *)elf_entry)) { ++ if (!BAD_ADDR(elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment +diff --git a/include/linux/mm.h b/include/linux/mm.h +index ad613ed..08f08d0 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1135,7 +1135,13 @@ extern int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long flags, struct page **pages); + +-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); ++extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int); ++ ++static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, ++ unsigned long len, unsigned long pgoff, unsigned long flags) ++{ ++ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); ++} + + extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 0e80e26..af904ea 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -198,6 +198,9 @@ struct mm_struct { + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); ++ unsigned long (*get_unmapped_exec_area) (struct file *filp, ++ unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags); + void (*unmap_area) (struct mm_struct *mm, unsigned long addr); + unsigned long mmap_base; /* base of mmap area */ + unsigned long task_size; /* size of task vm space */ +diff --git a/include/linux/resource.h b/include/linux/resource.h +index 40fc7e6..68c2549 100644 +--- a/include/linux/resource.h ++++ b/include/linux/resource.h +@@ -55,8 +55,11 @@ struct rlimit { + /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. ++ * ++ * (2MB more to cover randomization effects.) + */ +-#define _STK_LIM (8*1024*1024) ++#define _STK_LIM (10*1024*1024) ++#define EXEC_STACK_BIAS (2*1024*1024) + + /* + * GPG2 wants 64kB of mlocked memory, to make sure pass phrases +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4896fdf..3513e03 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -101,6 +101,9 @@ struct fs_struct; + struct bts_context; + struct perf_counter_context; + ++extern int exec_shield; ++extern int print_fatal_signals; ++ + /* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. 
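
The _STK_LIM/EXEC_STACK_BIAS change above pairs with the over_stack_limit() helper added to mm/mmap.c further down in this patch, which subtracts the bias before comparing against RLIMIT_STACK. A minimal standalone sketch of that arithmetic, with the rlimit passed in explicitly and an 8 MiB value assumed purely for illustration:

#include <stdio.h>

#define EXEC_STACK_BIAS (2*1024*1024)

/* same check as over_stack_limit() in this patch, parameterized on the rlimit */
static int over_stack_limit(unsigned long size, unsigned long rlim_cur)
{
	if (size < EXEC_STACK_BIAS)
		return 0;
	return (size - EXEC_STACK_BIAS) > rlim_cur;
}

int main(void)
{
	unsigned long rlim = 8*1024*1024;	/* assumed RLIMIT_STACK of 8 MiB */

	/* 9 MiB of stack: 9 - 2 = 7 MiB <= 8 MiB, growth is still allowed   */
	printf("9 MiB over limit?  %d\n", over_stack_limit(9*1024*1024, rlim));
	/* 11 MiB of stack: 11 - 2 = 9 MiB > 8 MiB, acct_stack_growth() fails */
	printf("11 MiB over limit? %d\n", over_stack_limit(11*1024*1024, rlim));
	return 0;
}
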
+@@ -359,6 +362,10 @@ extern int sysctl_max_map_count; + extern unsigned long + arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); ++ ++extern unsigned long ++arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, ++ unsigned long, unsigned long); + extern unsigned long + arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index ce664f9..1905e22 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -87,6 +87,26 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; + #ifndef CONFIG_MMU + extern int sysctl_nr_trim_pages; + #endif ++ ++int exec_shield = (1<<0); ++/* exec_shield is a bitmask: ++ * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE ++ * (1<<0) 1: on [also on if !=0] ++ * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK ++ * The old settings ++ * (1<<2) 4: vdso just below .text of main (unless too low) ++ * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low) ++ * are ignored because the vdso is placed completely randomly ++ */ ++ ++static int __init setup_exec_shield(char *str) ++{ ++ get_option(&str, &exec_shield); ++ ++ return 1; ++} ++__setup("exec-shield=", setup_exec_shield); ++ + #ifdef CONFIG_RCU_TORTURE_TEST + extern int rcutorture_runnable; + #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +@@ -382,6 +402,14 @@ static struct ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "exec-shield", ++ .data = &exec_shield, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, +diff --git a/mm/mmap.c b/mm/mmap.c +index 34579b2..260bb3c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -45,6 +46,18 @@ + #define arch_rebalance_pgtables(addr, len) (addr) + #endif + ++/* No sane architecture will #define these to anything else */ ++#ifndef arch_add_exec_range ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#endif ++#ifndef arch_flush_exec_range ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#endif ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ ++ + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); +@@ -392,6 +405,8 @@ static inline void + __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) + { ++ if (vma->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, vma->vm_end); + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; +@@ -494,6 +509,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, vma->vm_end); + } + + /* +@@ -803,6 +820,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ if (prev->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, prev->vm_end); + return prev; + } + +@@ -957,7 +976,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + /* Obtain the address to map to. 
we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ +- addr = get_unmapped_area(file, addr, len, pgoff, flags); ++ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, ++ prot & PROT_EXEC); + if (addr & ~PAGE_MASK) + return addr; + +@@ -1442,13 +1462,17 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) + } + + unsigned long +-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, +- unsigned long pgoff, unsigned long flags) ++get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags, int exec) + { + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + +- get_area = current->mm->get_unmapped_area; ++ if (exec && current->mm->get_unmapped_exec_area) ++ get_area = current->mm->get_unmapped_exec_area; ++ else ++ get_area = current->mm->get_unmapped_area; ++ + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); +@@ -1462,8 +1486,76 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + + return arch_rebalance_pgtables(addr, len); + } ++EXPORT_SYMBOL(get_unmapped_area_prot); ++ ++#define SHLIB_BASE 0x00110000 ++ ++unsigned long ++arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0, ++ unsigned long len0, unsigned long pgoff, unsigned long flags) ++{ ++ unsigned long addr = addr0, len = len0; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long tmp; ++ ++ if (len > TASK_SIZE) ++ return -ENOMEM; ++ ++ if (flags & MAP_FIXED) ++ return addr; ++ ++ if (!addr) ++ addr = randomize_range(SHLIB_BASE, 0x01000000, len); ++ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (TASK_SIZE - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) ++ return addr; ++ } ++ ++ addr = SHLIB_BASE; ++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { ++ /* At this point: (!vma || addr < vma->vm_end). */ ++ if (TASK_SIZE - len < addr) ++ return -ENOMEM; ++ ++ if (!vma || addr + len <= vma->vm_start) { ++ /* ++ * Must not let a PROT_EXEC mapping get into the ++ * brk area: ++ */ ++ if (addr + len > mm->brk) ++ goto failed; ++ ++ /* ++ * Up until the brk area we randomize addresses ++ * as much as possible: ++ */ ++ if (addr >= 0x01000000) { ++ tmp = randomize_range(0x01000000, ++ PAGE_ALIGN(max(mm->start_brk, ++ (unsigned long)0x08000000)), len); ++ vma = find_vma(mm, tmp); ++ if (TASK_SIZE - len >= tmp && ++ (!vma || tmp + len <= vma->vm_start)) ++ return tmp; ++ } ++ /* ++ * Ok, randomization didnt work out - return ++ * the result of the linear search: ++ */ ++ return addr; ++ } ++ addr = vma->vm_end; ++ } ++ ++failed: ++ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags); ++} + +-EXPORT_SYMBOL(get_unmapped_area); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +@@ -1538,6 +1630,14 @@ out: + return prev ? prev->vm_next : vma; + } + ++static int over_stack_limit(unsigned long sz) ++{ ++ if (sz < EXEC_STACK_BIAS) ++ return 0; ++ return (sz - EXEC_STACK_BIAS) > ++ current->signal->rlim[RLIMIT_STACK].rlim_cur; ++} ++ + /* + * Verify that the stack growth is acceptable and + * update accounting. 
This is shared with both the +@@ -1554,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns + return -ENOMEM; + + /* Stack limit test */ +- if (size > rlim[RLIMIT_STACK].rlim_cur) ++ if (over_stack_limit(size)) + return -ENOMEM; + + /* mlock limit tests */ +@@ -1864,10 +1964,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- if (new_below) ++ if (new_below) { ++ unsigned long old_end = vma->vm_end; ++ + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); +- else ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, old_end); ++ } else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +@@ -2116,6 +2220,7 @@ void exit_mmap(struct mm_struct *mm) + vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(tlb, 0, end); ++ arch_flush_exec_range(mm); + + /* + * Walk the list again, actually closing and freeing it, +diff --git a/mm/mprotect.c b/mm/mprotect.c +index d80311b..032423d 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -26,9 +26,14 @@ + #include + #include + #include ++#include + #include + #include + ++#ifndef arch_remove_exec_range ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++#endif ++ + #ifndef pgprot_modify + static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) + { +@@ -139,7 +144,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; +- unsigned long charged = 0; ++ unsigned long charged = 0, old_end = vma->vm_end; + pgoff_t pgoff; + int error; + int dirty_accountable = 0; +@@ -204,6 +209,9 @@ success: + dirty_accountable = 1; + } + ++ if (oldflags & VM_EXEC) ++ arch_remove_exec_range(current->mm, old_end); ++ + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); +diff --git a/mm/mremap.c b/mm/mremap.c +index a39b7b9..6bebfde 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -400,8 +400,8 @@ unsigned long do_mremap(unsigned long addr, + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + +- new_addr = get_unmapped_area(vma->vm_file, 0, new_len, +- vma->vm_pgoff, map_flags); ++ new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, ++ vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; diff --git a/original/linux-2.6-utrace.patch b/original/linux-2.6-utrace.patch new file mode 100644 index 000000000..861080917 --- /dev/null +++ b/original/linux-2.6-utrace.patch @@ -0,0 +1,4102 @@ +diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile +index 9632444..bf4b9e8 100644 +--- a/Documentation/DocBook/Makefile ++++ b/Documentation/DocBook/Makefile +@@ -9,7 +9,7 @@ + DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ + kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ + procfs-guide.xml writing_usb_driver.xml networking.xml \ +- kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ ++ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml utrace.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ + genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ + mac80211.xml debugobjects.xml sh.xml regulator.xml \ +diff --git 
a/Documentation/DocBook/utrace.tmpl b/Documentation/DocBook/utrace.tmpl +new file mode 100644 +index 0000000..6cc58a1 +--- /dev/null ++++ b/Documentation/DocBook/utrace.tmpl +@@ -0,0 +1,590 @@ ++ ++ ++ ++ ++ ++ The utrace User Debugging Infrastructure ++ ++ ++ ++ ++ utrace concepts ++ ++ Introduction ++ ++ ++ utrace is infrastructure code for tracing ++ and controlling user threads. This is the foundation for writing ++ tracing engines, which can be loadable kernel modules. ++ ++ ++ ++ The basic actors in utrace are the thread ++ and the tracing engine. A tracing engine is some body of code that ++ calls into the <linux/utrace.h> ++ interfaces, represented by a struct ++ utrace_engine_ops. (Usually it's a kernel module, ++ though the legacy ptrace support is a tracing ++ engine that is not in a kernel module.) The interface operates on ++ individual threads (struct task_struct). ++ If an engine wants to treat several threads as a group, that is up ++ to its higher-level code. ++ ++ ++ ++ Tracing begins by attaching an engine to a thread, using ++ utrace_attach_task or ++ utrace_attach_pid. If successful, it returns a ++ pointer that is the handle used in all other calls. ++ ++ ++ ++ ++ Events and Callbacks ++ ++ ++ An attached engine does nothing by default. An engine makes something ++ happen by requesting callbacks via utrace_set_events ++ and poking the thread with utrace_control. ++ The synchronization issues related to these two calls ++ are discussed further below in . ++ ++ ++ ++ Events are specified using the macro ++ UTRACE_EVENT(type). ++ Each event type is associated with a callback in struct ++ utrace_engine_ops. A tracing engine can leave unused ++ callbacks NULL. The only callbacks required ++ are those used by the event flags it sets. ++ ++ ++ ++ Many engines can be attached to each thread. When a thread has an ++ event, each engine gets a callback if it has set the event flag for ++ that event type. For most events, engines are called in the order they ++ attached. Engines that attach after the event has occurred do not get ++ callbacks for that event. This includes any new engines just attached ++ by an existing engine's callback function. Once the sequence of ++ callbacks for that one event has completed, such new engines are then ++ eligible in the next sequence that starts when there is another event. ++ ++ ++ ++ Event reporting callbacks have details particular to the event type, ++ but are all called in similar environments and have the same ++ constraints. Callbacks are made from safe points, where no locks ++ are held, no special resources are pinned (usually), and the ++ user-mode state of the thread is accessible. So, callback code has ++ a pretty free hand. But to be a good citizen, callback code should ++ never block for long periods. It is fine to block in ++ kmalloc and the like, but never wait for i/o or ++ for user mode to do something. If you need the thread to wait, use ++ UTRACE_STOP and return from the callback ++ quickly. When your i/o finishes or whatever, you can use ++ utrace_control to resume the thread. ++ ++ ++ ++ The UTRACE_EVENT(SYSCALL_ENTRY) event is a special ++ case. While other events happen in the kernel when it will return to ++ user mode soon, this event happens when entering the kernel before it ++ will proceed with the work requested from user mode. Because of this ++ difference, the report_syscall_entry callback is ++ special in two ways. 
For this event, engines are called in reverse of ++ the normal order (this includes the report_quiesce ++ call that precedes a report_syscall_entry call). ++ This preserves the semantics that the last engine to attach is called ++ "closest to user mode"--the engine that is first to see a thread's user ++ state when it enters the kernel is also the last to see that state when ++ the thread returns to user mode. For the same reason, if these ++ callbacks use UTRACE_STOP (see the next section), ++ the thread stops immediately after callbacks rather than only when it's ++ ready to return to user mode; when allowed to resume, it will actually ++ attempt the system call indicated by the register values at that time. ++ ++ ++ ++ ++ Stopping Safely ++ ++ Writing well-behaved callbacks ++ ++ ++ Well-behaved callbacks are important to maintain two essential ++ properties of the interface. The first of these is that unrelated ++ tracing engines should not interfere with each other. If your engine's ++ event callback does not return quickly, then another engine won't get ++ the event notification in a timely manner. The second important ++ property is that tracing should be as noninvasive as possible to the ++ normal operation of the system overall and of the traced thread in ++ particular. That is, attached tracing engines should not perturb a ++ thread's behavior, except to the extent that changing its user-visible ++ state is explicitly what you want to do. (Obviously some perturbation ++ is unavoidable, primarily timing changes, ranging from small delays due ++ to the overhead of tracing, to arbitrary pauses in user code execution ++ when a user stops a thread with a debugger for examination.) Even when ++ you explicitly want the perturbation of making the traced thread block, ++ just blocking directly in your callback has more unwanted effects. For ++ example, the CLONE event callbacks are called when ++ the new child thread has been created but not yet started running; the ++ child can never be scheduled until the CLONE ++ tracing callbacks return. (This allows engines tracing the parent to ++ attach to the child.) If a CLONE event callback ++ blocks the parent thread, it also prevents the child thread from ++ running (even to process a SIGKILL). If what you ++ want is to make both the parent and child block, then use ++ utrace_attach_task on the child and then use ++ UTRACE_STOP on both threads. A more crucial ++ problem with blocking in callbacks is that it can prevent ++ SIGKILL from working. A thread that is blocking ++ due to UTRACE_STOP will still wake up and die ++ immediately when sent a SIGKILL, as all threads ++ should. Relying on the utrace ++ infrastructure rather than on private synchronization calls in event ++ callbacks is an important way to help keep tracing robustly ++ noninvasive. ++ ++ ++ ++ ++ Using <constant>UTRACE_STOP</constant> ++ ++ ++ To control another thread and access its state, it must be stopped ++ with UTRACE_STOP. This means that it is ++ stopped and won't start running again while we access it. When a ++ thread is not already stopped, utrace_control ++ returns -EINPROGRESS and an engine must wait ++ for an event callback when the thread is ready to stop. The thread ++ may be running on another CPU or may be blocked. When it is ready ++ to be examined, it will make callbacks to engines that set the ++ UTRACE_EVENT(QUIESCE) event bit. To wake up an ++ interruptible wait, use UTRACE_INTERRUPT. 
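
As a concrete illustration of the sequence just described -- attach, request QUIESCE callbacks, ask for UTRACE_STOP (possibly getting -EINPROGRESS), keep the thread stopped from the callback, and resume later -- here is a rough sketch of a minimal engine. It is not compile-tested against this utrace revision; the prototypes, the report_quiesce() signature in particular, and the ERR_PTR error convention assumed for utrace_attach_task() should be checked against the <linux/utrace.h> added by this patch.

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/utrace.h>

/* prototype assumed; verify against <linux/utrace.h> from this patch */
static u32 demo_report_quiesce(enum utrace_resume_action action,
			       struct utrace_engine *engine,
			       struct task_struct *task,
			       unsigned long event)
{
	/*
	 * The thread is now at a safe point.  Returning UTRACE_STOP keeps
	 * it stopped, as described above, until demo_release_task() runs.
	 */
	return UTRACE_STOP;
}

static const struct utrace_engine_ops demo_ops = {
	.report_quiesce = demo_report_quiesce,
};

/* attach to @task, ask it to stop, and hand back the engine reference */
static struct utrace_engine *demo_stop_task(struct task_struct *task)
{
	struct utrace_engine *engine;
	int ret;

	engine = utrace_attach_task(task, UTRACE_ATTACH_CREATE,
				    &demo_ops, NULL);
	if (IS_ERR(engine))
		return engine;

	ret = utrace_set_events(task, engine, UTRACE_EVENT(QUIESCE));
	if (!ret)
		ret = utrace_control(task, engine, UTRACE_STOP);

	/*
	 * -EINPROGRESS only means the thread was not already at a safe
	 * point; it will stop at its next QUIESCE report.
	 */
	if (ret && ret != -EINPROGRESS) {
		utrace_control(task, engine, UTRACE_DETACH);
		utrace_engine_put(engine);
		return ERR_PTR(ret);
	}
	return engine;			/* caller keeps this reference */
}

/* later: let the thread run again and drop the attachment */
static void demo_release_task(struct task_struct *task,
			      struct utrace_engine *engine)
{
	utrace_control(task, engine, UTRACE_RESUME);	/* clear our stop      */
	utrace_control(task, engine, UTRACE_DETACH);	/* detach the engine   */
	utrace_engine_put(engine);			/* drop our reference  */
}
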
++ ++ ++ ++ As long as some engine has used UTRACE_STOP and ++ not called utrace_control to resume the thread, ++ then the thread will remain stopped. SIGKILL ++ will wake it up, but it will not run user code. When the stop is ++ cleared with utrace_control or a callback ++ return value, the thread starts running again. ++ (See also .) ++ ++ ++ ++ ++ ++ ++ Tear-down Races ++ ++ Primacy of <constant>SIGKILL</constant> ++ ++ Ordinarily synchronization issues for tracing engines are kept fairly ++ straightforward by using UTRACE_STOP. You ask a ++ thread to stop, and then once it makes the ++ report_quiesce callback it cannot do anything else ++ that would result in another callback, until you let it with a ++ utrace_control call. This simple arrangement ++ avoids complex and error-prone code in each one of a tracing engine's ++ event callbacks to keep them serialized with the engine's other ++ operations done on that thread from another thread of control. ++ However, giving tracing engines complete power to keep a traced thread ++ stuck in place runs afoul of a more important kind of simplicity that ++ the kernel overall guarantees: nothing can prevent or delay ++ SIGKILL from making a thread die and release its ++ resources. To preserve this important property of ++ SIGKILL, it as a special case can break ++ UTRACE_STOP like nothing else normally can. This ++ includes both explicit SIGKILL signals and the ++ implicit SIGKILL sent to each other thread in the ++ same thread group by a thread doing an exec, or processing a fatal ++ signal, or making an exit_group system call. A ++ tracing engine can prevent a thread from beginning the exit or exec or ++ dying by signal (other than SIGKILL) if it is ++ attached to that thread, but once the operation begins, no tracing ++ engine can prevent or delay all other threads in the same thread group ++ dying. ++ ++ ++ ++ Final callbacks ++ ++ The report_reap callback is always the final event ++ in the life cycle of a traced thread. Tracing engines can use this as ++ the trigger to clean up their own data structures. The ++ report_death callback is always the penultimate ++ event a tracing engine might see; it's seen unless the thread was ++ already in the midst of dying when the engine attached. Many tracing ++ engines will have no interest in when a parent reaps a dead process, ++ and nothing they want to do with a zombie thread once it dies; for ++ them, the report_death callback is the natural ++ place to clean up data structures and detach. To facilitate writing ++ such engines robustly, given the asynchrony of ++ SIGKILL, and without error-prone manual ++ implementation of synchronization schemes, the ++ utrace infrastructure provides some special ++ guarantees about the report_death and ++ report_reap callbacks. It still takes some care ++ to be sure your tracing engine is robust to tear-down races, but these ++ rules make it reasonably straightforward and concise to handle a lot of ++ corner cases correctly. ++ ++ ++ ++ Engine and task pointers ++ ++ The first sort of guarantee concerns the core data structures ++ themselves. struct utrace_engine is ++ a reference-counted data structure. While you hold a reference, an ++ engine pointer will always stay valid so that you can safely pass it to ++ any utrace call. Each call to ++ utrace_attach_task or ++ utrace_attach_pid returns an engine pointer with a ++ reference belonging to the caller. You own that reference until you ++ drop it using utrace_engine_put. 
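
For instance, a hypothetical module-side attach routine might take and then drop that initial reference as follows; my_example_ops, the helper name, and the flag combination are placeholders chosen for illustration, not requirements of the interface.

        #include <linux/err.h>
        #include <linux/sched.h>
        #include <linux/utrace.h>

        static const struct utrace_engine_ops my_example_ops; /* callbacks not shown */

        static int example_attach(struct task_struct *target, void *private_data)
        {
                struct utrace_engine *engine;

                engine = utrace_attach_task(target,
                                            UTRACE_ATTACH_CREATE |
                                            UTRACE_ATTACH_EXCLUSIVE |
                                            UTRACE_ATTACH_MATCH_OPS,
                                            &my_example_ops, private_data);
                if (IS_ERR(engine))
                        return PTR_ERR(engine);

                /*
                 * The returned pointer came with one reference that belongs
                 * to us.  The attachment itself holds its own implicit
                 * reference, so we may drop ours once we keep no private
                 * copy of the pointer that needs protecting.
                 */
                utrace_engine_put(engine);
                return 0;
        }
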
There is an ++ implicit reference on the engine while it is attached. So if you drop ++ your only reference, and then use ++ utrace_attach_task without ++ UTRACE_ATTACH_CREATE to look up that same engine, ++ you will get the same pointer with a new reference to replace the one ++ you dropped, just like calling utrace_engine_get. ++ When an engine has been detached, either explicitly with ++ UTRACE_DETACH or implicitly after ++ report_reap, then any references you hold are all ++ that keep the old engine pointer alive. ++ ++ ++ ++ There is nothing a kernel module can do to keep a struct ++ task_struct alive outside of ++ rcu_read_lock. When the task dies and is reaped ++ by its parent (or itself), that structure can be freed so that any ++ dangling pointers you have stored become invalid. ++ utrace will not prevent this, but it can ++ help you detect it safely. By definition, a task that has been reaped ++ has had all its engines detached. All ++ utrace calls can be safely called on a ++ detached engine if the caller holds a reference on that engine pointer, ++ even if the task pointer passed in the call is invalid. All calls ++ return -ESRCH for a detached engine, which tells ++ you that the task pointer you passed could be invalid now. Since ++ utrace_control and ++ utrace_set_events do not block, you can call those ++ inside a rcu_read_lock section and be sure after ++ they don't return -ESRCH that the task pointer is ++ still valid until rcu_read_unlock. The ++ infrastructure never holds task references of its own. Though neither ++ rcu_read_lock nor any other lock is held while ++ making a callback, it's always guaranteed that the struct ++ task_struct and the struct ++ utrace_engine passed as arguments remain valid ++ until the callback function returns. ++ ++ ++ ++ The common means for safely holding task pointers that is available to ++ kernel modules is to use struct pid, which ++ permits put_pid from kernel modules. When using ++ that, the calls utrace_attach_pid, ++ utrace_control_pid, ++ utrace_set_events_pid, and ++ utrace_barrier_pid are available. ++ ++ ++ ++ ++ ++ Serialization of <constant>DEATH</constant> and <constant>REAP</constant> ++ ++ ++ The second guarantee is the serialization of ++ DEATH and REAP event ++ callbacks for a given thread. The actual reaping by the parent ++ (release_task call) can occur simultaneously ++ while the thread is still doing the final steps of dying, including ++ the report_death callback. If a tracing engine ++ has requested both DEATH and ++ REAP event reports, it's guaranteed that the ++ report_reap callback will not be made until ++ after the report_death callback has returned. ++ If the report_death callback itself detaches ++ from the thread, then the report_reap callback ++ will never be made. Thus it is safe for a ++ report_death callback to clean up data ++ structures and detach. ++ ++ ++ ++ Interlock with final callbacks ++ ++ The final sort of guarantee is that a tracing engine will know for sure ++ whether or not the report_death and/or ++ report_reap callbacks will be made for a certain ++ thread. These tear-down races are disambiguated by the error return ++ values of utrace_set_events and ++ utrace_control. Normally ++ utrace_control called with ++ UTRACE_DETACH returns zero, and this means that no ++ more callbacks will be made. 
If the thread is in the midst of dying, ++ it returns -EALREADY to indicate that the ++ report_death callback may already be in progress; ++ when you get this error, you know that any cleanup your ++ report_death callback does is about to happen or ++ has just happened--note that if the report_death ++ callback does not detach, the engine remains attached until the thread ++ gets reaped. If the thread is in the midst of being reaped, ++ utrace_control returns -ESRCH ++ to indicate that the report_reap callback may ++ already be in progress; this means the engine is implicitly detached ++ when the callback completes. This makes it possible for a tracing ++ engine that has decided asynchronously to detach from a thread to ++ safely clean up its data structures, knowing that no ++ report_death or report_reap ++ callback will try to do the same. utrace_detach ++ returns -ESRCH when the struct ++ utrace_engine has already been detached, but is ++ still a valid pointer because of its reference count. A tracing engine ++ can use this to safely synchronize its own independent multiple threads ++ of control with each other and with its event callbacks that detach. ++ ++ ++ ++ In the same vein, utrace_set_events normally ++ returns zero; if the target thread was stopped before the call, then ++ after a successful call, no event callbacks not requested in the new ++ flags will be made. It fails with -EALREADY if ++ you try to clear UTRACE_EVENT(DEATH) when the ++ report_death callback may already have begun, if ++ you try to clear UTRACE_EVENT(REAP) when the ++ report_reap callback may already have begun, or if ++ you try to newly set UTRACE_EVENT(DEATH) or ++ UTRACE_EVENT(QUIESCE) when the target is already ++ dead or dying. Like utrace_control, it returns ++ -ESRCH when the thread has already been detached ++ (including forcible detach on reaping). This lets the tracing engine ++ know for sure which event callbacks it will or won't see after ++ utrace_set_events has returned. By checking for ++ errors, it can know whether to clean up its data structures immediately ++ or to let its callbacks do the work. ++ ++ ++ ++ Using <function>utrace_barrier</function> ++ ++ When a thread is safely stopped, calling ++ utrace_control with UTRACE_DETACH ++ or calling utrace_set_events to disable some events ++ ensures synchronously that your engine won't get any more of the callbacks ++ that have been disabled (none at all when detaching). But these can also ++ be used while the thread is not stopped, when it might be simultaneously ++ making a callback to your engine. For this situation, these calls return ++ -EINPROGRESS when it's possible a callback is in ++ progress. If you are not prepared to have your old callbacks still run, ++ then you can synchronize to be sure all the old callbacks are finished, ++ using utrace_barrier. This is necessary if the ++ kernel module containing your callback code is going to be unloaded. ++ ++ ++ After using UTRACE_DETACH once, further calls to ++ utrace_control with the same engine pointer will ++ return -ESRCH. In contrast, after getting ++ -EINPROGRESS from ++ utrace_set_events, you can call ++ utrace_set_events again later and if it returns zero ++ then know the old callbacks have finished. ++ ++ ++ Unlike all other calls, utrace_barrier (and ++ utrace_barrier_pid) will accept any engine pointer you ++ hold a reference on, even if UTRACE_DETACH has already ++ been used. 
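
Putting these rules together, a module about to unload might detach and synchronize roughly as sketched below; the function name and the warning message are illustrative, and a real engine would likely treat -ERESTARTSYS less bluntly than this loop does.

        #include <linux/errno.h>
        #include <linux/kernel.h>
        #include <linux/sched.h>
        #include <linux/utrace.h>

        static void example_detach_and_sync(struct task_struct *task,
                                            struct utrace_engine *engine)
        {
                int ret;

                ret = utrace_control(task, engine, UTRACE_DETACH);
                if (ret && ret != -EINPROGRESS &&
                    ret != -EALREADY && ret != -ESRCH)
                        printk(KERN_WARNING
                               "example: utrace_control(DETACH): %d\n", ret);

                /*
                 * Whatever the detach returned, wait until no callback to
                 * this engine can still be running before the module text
                 * goes away.  utrace_barrier() accepts the engine pointer
                 * even after UTRACE_DETACH, as long as we hold a reference.
                 */
                do {
                        ret = utrace_barrier(task, engine);
                } while (ret == -ERESTARTSYS);

                utrace_engine_put(engine);      /* drop our own reference */
        }
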
After any utrace_control or ++ utrace_set_events call (these do not block), you can ++ call utrace_barrier to block until callbacks have ++ finished. This returns -ESRCH only if the engine is ++ completely detached (finished all callbacks). Otherwise it waits ++ until the thread is definitely not in the midst of a callback to this ++ engine and then returns zero, but can return ++ -ERESTARTSYS if its wait is interrupted. ++ ++ ++ ++ ++ ++ ++ ++utrace core API ++ ++ ++ The utrace API is declared in <linux/utrace.h>. ++ ++ ++!Iinclude/linux/utrace.h ++!Ekernel/utrace.c ++ ++ ++ ++Machine State ++ ++ ++ The task_current_syscall function can be used on any ++ valid struct task_struct at any time, and does ++ not even require that utrace_attach_task was used at all. ++ ++ ++ ++ The other ways to access the registers and other machine-dependent state of ++ a task can only be used on a task that is at a known safe point. The safe ++ points are all the places where utrace_set_events can ++ request callbacks (except for the DEATH and ++ REAP events). So at any event callback, it is safe to ++ examine current. ++ ++ ++ ++ One task can examine another only after a callback in the target task that ++ returns UTRACE_STOP so that task will not return to user ++ mode after the safe point. This guarantees that the task will not resume ++ until the same engine uses utrace_control, unless the ++ task dies suddenly. To examine safely, one must use a pair of calls to ++ utrace_prepare_examine and ++ utrace_finish_examine surrounding the calls to ++ struct user_regset functions or direct examination ++ of task data structures. utrace_prepare_examine returns ++ an error if the task is not properly stopped and not dead. After a ++ successful examination, the paired utrace_finish_examine ++ call returns an error if the task ever woke up during the examination. If ++ so, any data gathered may be scrambled and should be discarded. This means ++ there was a spurious wake-up (which should not happen), or a sudden death. ++ ++ ++<structname>struct user_regset</structname> ++ ++ ++ The struct user_regset API ++ is declared in <linux/regset.h>. ++ ++ ++!Finclude/linux/regset.h ++ ++ ++ ++ ++ <filename>System Call Information</filename> ++ ++ ++ This function is declared in <linux/ptrace.h>. ++ ++ ++!Elib/syscall.c ++ ++ ++ ++<filename>System Call Tracing</filename> ++ ++ ++ The arch API for system call information is declared in ++ <asm/syscall.h>. ++ Each of these calls can be used only at system call entry tracing, ++ or can be used only at system call exit and the subsequent safe points ++ before returning to user mode. ++ At system call entry tracing means either during a ++ report_syscall_entry callback, ++ or any time after that callback has returned UTRACE_STOP. ++ ++ ++!Finclude/asm-generic/syscall.h ++ ++ ++ ++ ++ ++Kernel Internals ++ ++ ++ This chapter covers the interface to the tracing infrastructure ++ from the core of the kernel and the architecture-specific code. ++ This is for maintainers of the kernel and arch code, and not relevant ++ to using the tracing facilities described in preceding chapters. ++ ++ ++Core Calls In ++ ++ ++ These calls are declared in <linux/tracehook.h>. ++ The core kernel calls these functions at various important places. ++ ++ ++!Finclude/linux/tracehook.h ++ ++ ++ ++Architecture Calls Out ++ ++ ++ An arch that has done all these things sets ++ CONFIG_HAVE_ARCH_TRACEHOOK. ++ This is required to enable the utrace code. 
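
Before turning to the architecture details, here is a sketch of the examination protocol from the Machine State chapter above; it assumes the engine has already stopped the task with UTRACE_STOP, and the actual regset accesses are elided.

        #include <linux/errno.h>
        #include <linux/sched.h>
        #include <linux/utrace.h>

        /*
         * Illustrative only: read some of @task's state while it is stopped.
         * @engine must be the engine whose UTRACE_STOP keeps @task stopped.
         */
        static int example_examine(struct task_struct *task,
                                   struct utrace_engine *engine)
        {
                struct utrace_examiner exam;
                int ret;

                ret = utrace_prepare_examine(task, engine, &exam);
                if (ret)
                        return ret;     /* not properly stopped, and not dead */

                /*
                 * ... examine task state here, e.g. through struct
                 * user_regset calls or direct reads of task data ...
                 */

                ret = utrace_finish_examine(task, engine, &exam);
                if (ret)
                        return ret;     /* task woke up; discard what we read */

                return 0;
        }
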
++ ++ ++<filename><asm/ptrace.h></filename> ++ ++ ++ An arch defines these in <asm/ptrace.h> ++ if it supports hardware single-step or block-step features. ++ ++ ++!Finclude/linux/ptrace.h arch_has_single_step arch_has_block_step ++!Finclude/linux/ptrace.h user_enable_single_step user_enable_block_step ++!Finclude/linux/ptrace.h user_disable_single_step ++ ++ ++ ++ ++ <filename><asm/syscall.h></filename> ++ ++ ++ An arch provides <asm/syscall.h> that ++ defines these as inlines, or declares them as exported functions. ++ These interfaces are described in . ++ ++ ++ ++ ++ ++ <filename><linux/tracehook.h></filename> ++ ++ ++ An arch must define TIF_NOTIFY_RESUME ++ and TIF_SYSCALL_TRACE ++ in its <asm/thread_info.h>. ++ The arch code must call the following functions, all declared ++ in <linux/tracehook.h> and ++ described in : ++ ++ ++ ++ tracehook_notify_resume ++ ++ ++ tracehook_report_syscall_entry ++ ++ ++ tracehook_report_syscall_exit ++ ++ ++ tracehook_signal_handler ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 725a650..e299a63 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -82,6 +82,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -188,6 +189,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + ++ task_utrace_proc_status(m, p); ++ + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 5368fbd..aecd24e 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -167,6 +167,7 @@ extern struct cred init_cred; + [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ + }, \ + .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ ++ INIT_UTRACE(tsk) \ + INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ + INIT_TRACE_IRQFLAGS \ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4d07542..2060aa1 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -59,6 +59,7 @@ struct sched_param { + #include + #include + #include ++#include + + #include + #include +@@ -1313,6 +1314,11 @@ struct task_struct { + #endif + seccomp_t seccomp; + ++#ifdef CONFIG_UTRACE ++ struct utrace utrace; ++ unsigned long utrace_flags; ++#endif ++ + /* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h +index 7c2bfd9..a91d9a4 100644 +--- a/include/linux/tracehook.h ++++ b/include/linux/tracehook.h +@@ -49,6 +49,7 @@ + #include + #include + #include ++#include + struct linux_binprm; + + /** +@@ -63,6 +64,8 @@ struct linux_binprm; + */ + static inline int tracehook_expect_breakpoints(struct task_struct *task) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_CORE))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -111,6 +114,9 @@ static inline void ptrace_report_syscall(struct pt_regs *regs) + static inline __must_check int tracehook_report_syscall_entry( + struct pt_regs *regs) + { ++ if ((task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_ENTRY)) && ++ utrace_report_syscall_entry(regs)) ++ return 1; + ptrace_report_syscall(regs); + return 0; + } +@@ -134,6 +140,8 @@ static inline __must_check int tracehook_report_syscall_entry( + */ + static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(SYSCALL_EXIT)) ++ 
utrace_report_syscall_exit(regs); + ptrace_report_syscall(regs); + } + +@@ -194,6 +202,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + struct linux_binprm *bprm, + struct pt_regs *regs) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXEC))) ++ utrace_report_exec(fmt, bprm, regs); + if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) && + unlikely(task_ptrace(current) & PT_PTRACED)) + send_sig(SIGTRAP, current, 0); +@@ -211,6 +221,8 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, + */ + static inline void tracehook_report_exit(long *exit_code) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(EXIT))) ++ utrace_report_exit(exit_code); + ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code); + } + +@@ -254,6 +266,7 @@ static inline int tracehook_prepare_clone(unsigned clone_flags) + static inline void tracehook_finish_clone(struct task_struct *child, + unsigned long clone_flags, int trace) + { ++ utrace_init_task(child); + ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace); + } + +@@ -278,6 +291,8 @@ static inline void tracehook_report_clone(struct pt_regs *regs, + unsigned long clone_flags, + pid_t pid, struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE))) ++ utrace_report_clone(clone_flags, child); + if (unlikely(task_ptrace(child))) { + /* + * It doesn't matter who attached/attaching to this +@@ -310,6 +325,9 @@ static inline void tracehook_report_clone_complete(int trace, + pid_t pid, + struct task_struct *child) + { ++ if (unlikely(task_utrace_flags(current) & UTRACE_EVENT(CLONE)) && ++ (clone_flags & CLONE_VFORK)) ++ utrace_finish_vfork(current); + if (unlikely(trace)) + ptrace_event(0, trace, pid); + } +@@ -344,6 +362,7 @@ static inline void tracehook_report_vfork_done(struct task_struct *child, + */ + static inline void tracehook_prepare_release_task(struct task_struct *task) + { ++ utrace_release_task(task); + } + + /** +@@ -358,6 +377,7 @@ static inline void tracehook_prepare_release_task(struct task_struct *task) + static inline void tracehook_finish_release_task(struct task_struct *task) + { + ptrace_release_task(task); ++ BUG_ON(task->exit_state != EXIT_DEAD); + } + + /** +@@ -379,6 +399,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + const struct k_sigaction *ka, + struct pt_regs *regs, int stepping) + { ++ if (task_utrace_flags(current)) ++ utrace_signal_handler(current, stepping); + if (stepping) + ptrace_notify(SIGTRAP); + } +@@ -396,6 +418,8 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, + static inline int tracehook_consider_ignored_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & UTRACE_EVENT(SIGNAL_IGN))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -415,6 +439,9 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, + static inline int tracehook_consider_fatal_signal(struct task_struct *task, + int sig) + { ++ if (unlikely(task_utrace_flags(task) & (UTRACE_EVENT(SIGNAL_TERM) | ++ UTRACE_EVENT(SIGNAL_CORE)))) ++ return 1; + return (task_ptrace(task) & PT_PTRACED) != 0; + } + +@@ -429,6 +456,8 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task, + */ + static inline int tracehook_force_sigpending(void) + { ++ if (unlikely(task_utrace_flags(current))) ++ return utrace_interrupt_pending(); + return 0; + } + +@@ -458,6 +487,8 @@ static inline int 
tracehook_get_signal(struct task_struct *task, + siginfo_t *info, + struct k_sigaction *return_ka) + { ++ if (unlikely(task_utrace_flags(task))) ++ return utrace_get_signal(task, regs, info, return_ka); + return 0; + } + +@@ -485,6 +516,8 @@ static inline int tracehook_get_signal(struct task_struct *task, + */ + static inline int tracehook_notify_jctl(int notify, int why) + { ++ if (task_utrace_flags(current) & UTRACE_EVENT(JCTL)) ++ utrace_report_jctl(notify, why); + return notify ?: (current->ptrace & PT_PTRACED) ? why : 0; + } + +@@ -508,6 +541,8 @@ static inline int tracehook_notify_jctl(int notify, int why) + static inline int tracehook_notify_death(struct task_struct *task, + void **death_cookie, int group_dead) + { ++ *death_cookie = task_utrace_struct(task); ++ + if (task_detached(task)) + return task->ptrace ? SIGCHLD : DEATH_REAP; + +@@ -544,6 +579,20 @@ static inline void tracehook_report_death(struct task_struct *task, + int signal, void *death_cookie, + int group_dead) + { ++ /* ++ * This barrier ensures that our caller's setting of ++ * @task->exit_state precedes checking @task->utrace_flags here. ++ * If utrace_set_events() was just called to enable ++ * UTRACE_EVENT(DEATH), then we are obliged to call ++ * utrace_report_death() and not miss it. utrace_set_events() ++ * uses tasklist_lock to synchronize enabling the bit with the ++ * actual change to @task->exit_state, but we need this barrier ++ * to be sure we see a flags change made just before our caller ++ * took the tasklist_lock. ++ */ ++ smp_mb(); ++ if (task_utrace_flags(task) & _UTRACE_DEATH_EVENTS) ++ utrace_report_death(task, death_cookie, group_dead, signal); + } + + #ifdef TIF_NOTIFY_RESUME +@@ -573,10 +622,20 @@ static inline void set_notify_resume(struct task_struct *task) + * asynchronously, this will be called again before we return to + * user mode. + * +- * Called without locks. ++ * Called without locks. However, on some machines this may be ++ * called with interrupts disabled. + */ + static inline void tracehook_notify_resume(struct pt_regs *regs) + { ++ struct task_struct *task = current; ++ /* ++ * This pairs with the barrier implicit in set_notify_resume(). ++ * It ensures that we read the nonzero utrace_flags set before ++ * set_notify_resume() was called by utrace setup. ++ */ ++ smp_rmb(); ++ if (task_utrace_flags(task)) ++ utrace_resume(task, regs); + } + #endif /* TIF_NOTIFY_RESUME */ + +diff --git a/include/linux/utrace.h b/include/linux/utrace.h +new file mode 100644 +index 0000000..f877ec6 +--- /dev/null ++++ b/include/linux/utrace.h +@@ -0,0 +1,692 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ * ++ * This interface allows for notification of interesting events in a ++ * thread. It also mediates access to thread state such as registers. ++ * Multiple unrelated users can be associated with a single thread. ++ * We call each of these a tracing engine. ++ * ++ * A tracing engine starts by calling utrace_attach_task() or ++ * utrace_attach_pid() on the chosen thread, passing in a set of hooks ++ * (&struct utrace_engine_ops), and some associated data. This produces a ++ * &struct utrace_engine, which is the handle used for all other ++ * operations. 
An attached engine has its ops vector, its data, and an ++ * event mask controlled by utrace_set_events(). ++ * ++ * For each event bit that is set, that engine will get the ++ * appropriate ops->report_*() callback when the event occurs. The ++ * &struct utrace_engine_ops need not provide callbacks for an event ++ * unless the engine sets one of the associated event bits. ++ */ ++ ++#ifndef _LINUX_UTRACE_H ++#define _LINUX_UTRACE_H 1 ++ ++#include ++#include ++#include ++#include ++ ++struct linux_binprm; ++struct pt_regs; ++struct utrace; ++struct user_regset; ++struct user_regset_view; ++ ++/* ++ * Event bits passed to utrace_set_events(). ++ * These appear in &struct task_struct.@utrace_flags ++ * and &struct utrace_engine.@flags. ++ */ ++enum utrace_events { ++ _UTRACE_EVENT_QUIESCE, /* Thread is available for examination. */ ++ _UTRACE_EVENT_REAP, /* Zombie reaped, no more tracing possible. */ ++ _UTRACE_EVENT_CLONE, /* Successful clone/fork/vfork just done. */ ++ _UTRACE_EVENT_EXEC, /* Successful execve just completed. */ ++ _UTRACE_EVENT_EXIT, /* Thread exit in progress. */ ++ _UTRACE_EVENT_DEATH, /* Thread has died. */ ++ _UTRACE_EVENT_SYSCALL_ENTRY, /* User entered kernel for system call. */ ++ _UTRACE_EVENT_SYSCALL_EXIT, /* Returning to user after system call. */ ++ _UTRACE_EVENT_SIGNAL, /* Signal delivery will run a user handler. */ ++ _UTRACE_EVENT_SIGNAL_IGN, /* No-op signal to be delivered. */ ++ _UTRACE_EVENT_SIGNAL_STOP, /* Signal delivery will suspend. */ ++ _UTRACE_EVENT_SIGNAL_TERM, /* Signal delivery will terminate. */ ++ _UTRACE_EVENT_SIGNAL_CORE, /* Signal delivery will dump core. */ ++ _UTRACE_EVENT_JCTL, /* Job control stop or continue completed. */ ++ _UTRACE_NEVENTS ++}; ++#define UTRACE_EVENT(type) (1UL << _UTRACE_EVENT_##type) ++ ++/* ++ * All the kinds of signal events. ++ * These all use the @report_signal() callback. ++ */ ++#define UTRACE_EVENT_SIGNAL_ALL (UTRACE_EVENT(SIGNAL) \ ++ | UTRACE_EVENT(SIGNAL_IGN) \ ++ | UTRACE_EVENT(SIGNAL_STOP) \ ++ | UTRACE_EVENT(SIGNAL_TERM) \ ++ | UTRACE_EVENT(SIGNAL_CORE)) ++/* ++ * Both kinds of syscall events; these call the @report_syscall_entry() ++ * and @report_syscall_exit() callbacks, respectively. ++ */ ++#define UTRACE_EVENT_SYSCALL \ ++ (UTRACE_EVENT(SYSCALL_ENTRY) | UTRACE_EVENT(SYSCALL_EXIT)) ++ ++/* ++ * The event reports triggered synchronously by task death. ++ */ ++#define _UTRACE_DEATH_EVENTS (UTRACE_EVENT(DEATH) | UTRACE_EVENT(QUIESCE)) ++ ++/* ++ * Hooks in call these entry points to the ++ * utrace dispatch. They are weak references here only so ++ * tracehook.h doesn't need to #ifndef CONFIG_UTRACE them to ++ * avoid external references in case of unoptimized compilation. 
++ */ ++bool utrace_interrupt_pending(void) ++ __attribute__((weak)); ++void utrace_resume(struct task_struct *, struct pt_regs *) ++ __attribute__((weak)); ++int utrace_get_signal(struct task_struct *, struct pt_regs *, ++ siginfo_t *, struct k_sigaction *) ++ __attribute__((weak)); ++void utrace_report_clone(unsigned long, struct task_struct *) ++ __attribute__((weak)); ++void utrace_finish_vfork(struct task_struct *) ++ __attribute__((weak)); ++void utrace_report_exit(long *exit_code) ++ __attribute__((weak)); ++void utrace_report_death(struct task_struct *, struct utrace *, bool, int) ++ __attribute__((weak)); ++void utrace_report_jctl(int notify, int type) ++ __attribute__((weak)); ++void utrace_report_exec(struct linux_binfmt *, struct linux_binprm *, ++ struct pt_regs *regs) ++ __attribute__((weak)); ++bool utrace_report_syscall_entry(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_report_syscall_exit(struct pt_regs *) ++ __attribute__((weak)); ++void utrace_signal_handler(struct task_struct *, int) ++ __attribute__((weak)); ++ ++#ifndef CONFIG_UTRACE ++ ++/* ++ * uses these accessors to avoid #ifdef CONFIG_UTRACE. ++ */ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return 0; ++} ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return NULL; ++} ++static inline void utrace_init_task(struct task_struct *child) ++{ ++} ++static inline void utrace_release_task(struct task_struct *task) ++{ ++} ++ ++static inline void task_utrace_proc_status(struct seq_file *m, ++ struct task_struct *p) ++{ ++} ++ ++#else /* CONFIG_UTRACE */ ++ ++static inline unsigned long task_utrace_flags(struct task_struct *task) ++{ ++ return task->utrace_flags; ++} ++ ++static inline struct utrace *task_utrace_struct(struct task_struct *task) ++{ ++ return &task->utrace; ++} ++ ++static inline void utrace_init_task(struct task_struct *task) ++{ ++ task->utrace_flags = 0; ++ memset(&task->utrace, 0, sizeof(task->utrace)); ++ INIT_LIST_HEAD(&task->utrace.attached); ++ INIT_LIST_HEAD(&task->utrace.attaching); ++ spin_lock_init(&task->utrace.lock); ++} ++ ++void utrace_release_task(struct task_struct *); ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p); ++ ++ ++/* ++ * Version number of the API defined in this file. This will change ++ * whenever a tracing engine's code would need some updates to keep ++ * working. We maintain this here for the benefit of tracing engine code ++ * that is developed concurrently with utrace API improvements before they ++ * are merged into the kernel, making LINUX_VERSION_CODE checks unwieldy. ++ */ ++#define UTRACE_API_VERSION 20090416 ++ ++/** ++ * enum utrace_resume_action - engine's choice of action for a traced task ++ * @UTRACE_STOP: Stay quiescent after callbacks. ++ * @UTRACE_REPORT: Make some callback soon. ++ * @UTRACE_INTERRUPT: Make @report_signal() callback soon. ++ * @UTRACE_SINGLESTEP: Resume in user mode for one instruction. ++ * @UTRACE_BLOCKSTEP: Resume in user mode until next branch. ++ * @UTRACE_RESUME: Resume normally in user mode. ++ * @UTRACE_DETACH: Detach my engine (implies %UTRACE_RESUME). ++ * ++ * See utrace_control() for detailed descriptions of each action. This is ++ * encoded in the @action argument and the return value for every callback ++ * with a &u32 return value. ++ * ++ * The order of these is important. When there is more than one engine, ++ * each supplies its choice and the smallest value prevails. 
++ */ ++enum utrace_resume_action { ++ UTRACE_STOP, ++ UTRACE_REPORT, ++ UTRACE_INTERRUPT, ++ UTRACE_SINGLESTEP, ++ UTRACE_BLOCKSTEP, ++ UTRACE_RESUME, ++ UTRACE_DETACH ++}; ++#define UTRACE_RESUME_MASK 0x0f ++ ++/** ++ * utrace_resume_action - &enum utrace_resume_action from callback action ++ * @action: &u32 callback @action argument or return value ++ * ++ * This extracts the &enum utrace_resume_action from @action, ++ * which is the @action argument to a &struct utrace_engine_ops ++ * callback or the return value from one. ++ */ ++static inline enum utrace_resume_action utrace_resume_action(u32 action) ++{ ++ return action & UTRACE_RESUME_MASK; ++} ++ ++/** ++ * enum utrace_signal_action - disposition of signal ++ * @UTRACE_SIGNAL_DELIVER: Deliver according to sigaction. ++ * @UTRACE_SIGNAL_IGN: Ignore the signal. ++ * @UTRACE_SIGNAL_TERM: Terminate the process. ++ * @UTRACE_SIGNAL_CORE: Terminate with core dump. ++ * @UTRACE_SIGNAL_STOP: Deliver as absolute stop. ++ * @UTRACE_SIGNAL_TSTP: Deliver as job control stop. ++ * @UTRACE_SIGNAL_REPORT: Reporting before pending signals. ++ * @UTRACE_SIGNAL_HANDLER: Reporting after signal handler setup. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_signal() callback. It says what will happen to the ++ * signal described by the &siginfo_t parameter to the callback. ++ * ++ * The %UTRACE_SIGNAL_REPORT value is used in an @action argument when ++ * a tracing report is being made before dequeuing any pending signal. ++ * If this is immediately after a signal handler has been set up, then ++ * %UTRACE_SIGNAL_HANDLER is used instead. A @report_signal callback ++ * that uses %UTRACE_SIGNAL_DELIVER|%UTRACE_SINGLESTEP will ensure ++ * it sees a %UTRACE_SIGNAL_HANDLER report. ++ */ ++enum utrace_signal_action { ++ UTRACE_SIGNAL_DELIVER = 0x00, ++ UTRACE_SIGNAL_IGN = 0x10, ++ UTRACE_SIGNAL_TERM = 0x20, ++ UTRACE_SIGNAL_CORE = 0x30, ++ UTRACE_SIGNAL_STOP = 0x40, ++ UTRACE_SIGNAL_TSTP = 0x50, ++ UTRACE_SIGNAL_REPORT = 0x60, ++ UTRACE_SIGNAL_HANDLER = 0x70 ++}; ++#define UTRACE_SIGNAL_MASK 0xf0 ++#define UTRACE_SIGNAL_HOLD 0x100 /* Flag, push signal back on queue. */ ++ ++/** ++ * utrace_signal_action - &enum utrace_signal_action from callback action ++ * @action: @report_signal callback @action argument or return value ++ * ++ * This extracts the &enum utrace_signal_action from @action, which ++ * is the @action argument to a @report_signal callback or the ++ * return value from one. ++ */ ++static inline enum utrace_signal_action utrace_signal_action(u32 action) ++{ ++ return action & UTRACE_SIGNAL_MASK; ++} ++ ++/** ++ * enum utrace_syscall_action - disposition of system call attempt ++ * @UTRACE_SYSCALL_RUN: Run the system call. ++ * @UTRACE_SYSCALL_ABORT: Don't run the system call. ++ * ++ * This is encoded in the @action argument and the return value for ++ * a @report_syscall_entry callback. ++ */ ++enum utrace_syscall_action { ++ UTRACE_SYSCALL_RUN = 0x00, ++ UTRACE_SYSCALL_ABORT = 0x10 ++}; ++#define UTRACE_SYSCALL_MASK 0xf0 ++ ++/** ++ * utrace_syscall_action - &enum utrace_syscall_action from callback action ++ * @action: @report_syscall_entry callback @action or return value ++ * ++ * This extracts the &enum utrace_syscall_action from @action, which ++ * is the @action argument to a @report_syscall_entry callback or the ++ * return value from one. 
++ */ ++static inline enum utrace_syscall_action utrace_syscall_action(u32 action) ++{ ++ return action & UTRACE_SYSCALL_MASK; ++} ++ ++/* ++ * Flags for utrace_attach_task() and utrace_attach_pid(). ++ */ ++#define UTRACE_ATTACH_CREATE 0x0010 /* Attach a new engine. */ ++#define UTRACE_ATTACH_EXCLUSIVE 0x0020 /* Refuse if existing match. */ ++#define UTRACE_ATTACH_MATCH_OPS 0x0001 /* Match engines on ops. */ ++#define UTRACE_ATTACH_MATCH_DATA 0x0002 /* Match engines on data. */ ++#define UTRACE_ATTACH_MATCH_MASK 0x000f ++ ++/** ++ * struct utrace_engine - per-engine structure ++ * @ops: &struct utrace_engine_ops pointer passed to utrace_attach_task() ++ * @data: engine-private &void * passed to utrace_attach_task() ++ * @flags: event mask set by utrace_set_events() plus internal flag bits ++ * ++ * The task itself never has to worry about engines detaching while ++ * it's doing event callbacks. These structures are removed from the ++ * task's active list only when it's stopped, or by the task itself. ++ * ++ * utrace_engine_get() and utrace_engine_put() maintain a reference count. ++ * When it drops to zero, the structure is freed. One reference is held ++ * implicitly while the engine is attached to its task. ++ */ ++struct utrace_engine { ++/* private: */ ++ struct kref kref; ++ struct list_head entry; ++ ++/* public: */ ++ const struct utrace_engine_ops *ops; ++ void *data; ++ ++ unsigned long flags; ++}; ++ ++/** ++ * utrace_engine_get - acquire a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you get another. ++ */ ++static inline void utrace_engine_get(struct utrace_engine *engine) ++{ ++ kref_get(&engine->kref); ++} ++ ++void __utrace_engine_release(struct kref *); ++ ++/** ++ * utrace_engine_put - release a reference on a &struct utrace_engine ++ * @engine: &struct utrace_engine pointer ++ * ++ * You must hold a reference on @engine, and you lose that reference. ++ * If it was the last one, @engine becomes an invalid pointer. ++ */ ++static inline void utrace_engine_put(struct utrace_engine *engine) ++{ ++ kref_put(&engine->kref, __utrace_engine_release); ++} ++ ++/** ++ * struct utrace_engine_ops - tracing engine callbacks ++ * ++ * Each @report_*() callback corresponds to an %UTRACE_EVENT(*) bit. ++ * utrace_set_events() calls on @engine choose which callbacks will be made ++ * to @engine from @task. ++ * ++ * Most callbacks take an @action argument, giving the resume action ++ * chosen by other tracing engines. All callbacks take an @engine ++ * argument, and a @task argument, which is always equal to @current. ++ * For some calls, @action also includes bits specific to that event ++ * and utrace_resume_action() is used to extract the resume action. ++ * This shows what would happen if @engine wasn't there, or will if ++ * the callback's return value uses %UTRACE_RESUME. This always ++ * starts as %UTRACE_RESUME when no other tracing is being done on ++ * this task. ++ * ++ * All return values contain &enum utrace_resume_action bits. For ++ * some calls, other bits specific to that kind of event are added to ++ * the resume action bits with OR. These are the same bits used in ++ * the @action argument. The resume action returned by a callback ++ * does not override previous engines' choices, it only says what ++ * @engine wants done. What @task actually does is the action that's ++ * most constrained among the choices made by all attached engines. 
++ * See utrace_control() for more information on the actions. ++ * ++ * When %UTRACE_STOP is used in @report_syscall_entry, then @task ++ * stops before attempting the system call. In other cases, the ++ * resume action does not take effect until @task is ready to check ++ * for signals and return to user mode. If there are more callbacks ++ * to be made, the last round of calls determines the final action. ++ * A @report_quiesce callback with @event zero, or a @report_signal ++ * callback, will always be the last one made before @task resumes. ++ * Only %UTRACE_STOP is "sticky"--if @engine returned %UTRACE_STOP ++ * then @task stays stopped unless @engine returns different from a ++ * following callback. ++ * ++ * The report_death() and report_reap() callbacks do not take @action ++ * arguments, and only %UTRACE_DETACH is meaningful in the return value ++ * from a report_death() callback. None of the resume actions applies ++ * to a dead thread. ++ * ++ * All @report_*() hooks are called with no locks held, in a generally ++ * safe environment when we will be returning to user mode soon (or just ++ * entered the kernel). It is fine to block for memory allocation and ++ * the like, but all hooks are asynchronous and must not block on ++ * external events! If you want the thread to block, use %UTRACE_STOP ++ * in your hook's return value; then later wake it up with utrace_control(). ++ * ++ * @report_quiesce: ++ * Requested by %UTRACE_EVENT(%QUIESCE). ++ * This does not indicate any event, but just that @task (the current ++ * thread) is in a safe place for examination. This call is made ++ * before each specific event callback, except for @report_reap. ++ * The @event argument gives the %UTRACE_EVENT(@which) value for ++ * the event occurring. This callback might be made for events @engine ++ * has not requested, if some other engine is tracing the event; ++ * calling utrace_set_events() call here can request the immediate ++ * callback for this occurrence of @event. @event is zero when there ++ * is no other event, @task is now ready to check for signals and ++ * return to user mode, and some engine has used %UTRACE_REPORT or ++ * %UTRACE_INTERRUPT to request this callback. For this case, ++ * if @report_signal is not %NULL, the @report_quiesce callback ++ * may be replaced with a @report_signal callback passing ++ * %UTRACE_SIGNAL_REPORT in its @action argument, whenever @task is ++ * entering the signal-check path anyway. ++ * ++ * @report_signal: ++ * Requested by %UTRACE_EVENT(%SIGNAL_*) or %UTRACE_EVENT(%QUIESCE). ++ * Use utrace_signal_action() and utrace_resume_action() on @action. ++ * The signal action is %UTRACE_SIGNAL_REPORT when some engine has ++ * used %UTRACE_REPORT or %UTRACE_INTERRUPT; the callback can choose ++ * to stop or to deliver an artificial signal, before pending signals. ++ * It's %UTRACE_SIGNAL_HANDLER instead when signal handler setup just ++ * finished (after a previous %UTRACE_SIGNAL_DELIVER return); this ++ * serves in lieu of any %UTRACE_SIGNAL_REPORT callback requested by ++ * %UTRACE_REPORT or %UTRACE_INTERRUPT, and is also implicitly ++ * requested by %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP into the ++ * signal delivery. The other signal actions indicate a signal about ++ * to be delivered; the previous engine's return value sets the signal ++ * action seen by the the following engine's callback. The @info data ++ * can be changed at will, including @info->si_signo. The settings in ++ * @return_ka determines what %UTRACE_SIGNAL_DELIVER does. 
@orig_ka ++ * is what was in force before other tracing engines intervened, and ++ * it's %NULL when this report began as %UTRACE_SIGNAL_REPORT or ++ * %UTRACE_SIGNAL_HANDLER. For a report without a new signal, @info ++ * is left uninitialized and must be set completely by an engine that ++ * chooses to deliver a signal; if there was a previous @report_signal ++ * callback ending in %UTRACE_STOP and it was just resumed using ++ * %UTRACE_REPORT or %UTRACE_INTERRUPT, then @info is left unchanged ++ * from the previous callback. In this way, the original signal can ++ * be left in @info while returning %UTRACE_STOP|%UTRACE_SIGNAL_IGN ++ * and then found again when resuming @task with %UTRACE_INTERRUPT. ++ * The %UTRACE_SIGNAL_HOLD flag bit can be OR'd into the return value, ++ * and might be in @action if the previous engine returned it. This ++ * flag asks that the signal in @info be pushed back on @task's queue ++ * so that it will be seen again after whatever action is taken now. ++ * ++ * @report_clone: ++ * Requested by %UTRACE_EVENT(%CLONE). ++ * Event reported for parent, before the new task @child might run. ++ * @clone_flags gives the flags used in the clone system call, ++ * or equivalent flags for a fork() or vfork() system call. ++ * This function can use utrace_attach_task() on @child. It's guaranteed ++ * that asynchronous utrace_attach_task() calls will be ordered after ++ * any calls in @report_clone callbacks for the parent. Thus ++ * when using %UTRACE_ATTACH_EXCLUSIVE in the asynchronous calls, ++ * you can be sure that the parent's @report_clone callback has ++ * already attached to @child or chosen not to. Passing %UTRACE_STOP ++ * to utrace_control() on @child here keeps the child stopped before ++ * it ever runs in user mode, %UTRACE_REPORT or %UTRACE_INTERRUPT ++ * ensures a callback from @child before it starts in user mode. ++ * ++ * @report_jctl: ++ * Requested by %UTRACE_EVENT(%JCTL). ++ * Job control event; @type is %CLD_STOPPED or %CLD_CONTINUED, ++ * indicating whether we are stopping or resuming now. If @notify ++ * is nonzero, @task is the last thread to stop and so will send ++ * %SIGCHLD to its parent after this callback; @notify reflects ++ * what the parent's %SIGCHLD has in @si_code, which can sometimes ++ * be %CLD_STOPPED even when @type is %CLD_CONTINUED. ++ * ++ * @report_exec: ++ * Requested by %UTRACE_EVENT(%EXEC). ++ * An execve system call has succeeded and the new program is about to ++ * start running. The initial user register state is handy to be tweaked ++ * directly in @regs. @fmt and @bprm gives the details of this exec. ++ * ++ * @report_syscall_entry: ++ * Requested by %UTRACE_EVENT(%SYSCALL_ENTRY). ++ * Thread has entered the kernel to request a system call. ++ * The user register state is handy to be tweaked directly in @regs. ++ * The @action argument contains an &enum utrace_syscall_action, ++ * use utrace_syscall_action() to extract it. The return value ++ * overrides the last engine's action for the system call. ++ * If the final action is %UTRACE_SYSCALL_ABORT, no system call ++ * is made. The details of the system call being attempted can ++ * be fetched here with syscall_get_nr() and syscall_get_arguments(). ++ * The parameter registers can be changed with syscall_set_arguments(). ++ * ++ * @report_syscall_exit: ++ * Requested by %UTRACE_EVENT(%SYSCALL_EXIT). ++ * Thread is about to leave the kernel after a system call request. ++ * The user register state is handy to be tweaked directly in @regs. 
++ * The results of the system call attempt can be examined here using ++ * syscall_get_error() and syscall_get_return_value(). It is safe ++ * here to call syscall_set_return_value() or syscall_rollback(). ++ * ++ * @report_exit: ++ * Requested by %UTRACE_EVENT(%EXIT). ++ * Thread is exiting and cannot be prevented from doing so, ++ * but all its state is still live. The @code value will be ++ * the wait result seen by the parent, and can be changed by ++ * this engine or others. The @orig_code value is the real ++ * status, not changed by any tracing engine. Returning %UTRACE_STOP ++ * here keeps @task stopped before it cleans up its state and dies, ++ * so it can be examined by other processes. When @task is allowed ++ * to run, it will die and get to the @report_death callback. ++ * ++ * @report_death: ++ * Requested by %UTRACE_EVENT(%DEATH). ++ * Thread is really dead now. It might be reaped by its parent at ++ * any time, or self-reap immediately. Though the actual reaping ++ * may happen in parallel, a report_reap() callback will always be ++ * ordered after a report_death() callback. ++ * ++ * @report_reap: ++ * Requested by %UTRACE_EVENT(%REAP). ++ * Called when someone reaps the dead task (parent, init, or self). ++ * This means the parent called wait, or else this was a detached ++ * thread or a process whose parent ignores SIGCHLD. ++ * No more callbacks are made after this one. ++ * The engine is always detached. ++ * There is nothing more a tracing engine can do about this thread. ++ * After this callback, the @engine pointer will become invalid. ++ * The @task pointer may become invalid if get_task_struct() hasn't ++ * been used to keep it alive. ++ * An engine should always request this callback if it stores the ++ * @engine pointer or stores any pointer in @engine->data, so it ++ * can clean up its data structures. ++ * Unlike other callbacks, this can be called from the parent's context ++ * rather than from the traced thread itself--it must not delay the ++ * parent by blocking. 
++ */ ++struct utrace_engine_ops { ++ u32 (*report_quiesce)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event); ++ u32 (*report_signal)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs, ++ siginfo_t *info, ++ const struct k_sigaction *orig_ka, ++ struct k_sigaction *return_ka); ++ u32 (*report_clone)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *parent, ++ unsigned long clone_flags, ++ struct task_struct *child); ++ u32 (*report_jctl)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ int type, int notify); ++ u32 (*report_exec)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ const struct linux_binfmt *fmt, ++ const struct linux_binprm *bprm, ++ struct pt_regs *regs); ++ u32 (*report_syscall_entry)(u32 action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_syscall_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ struct pt_regs *regs); ++ u32 (*report_exit)(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ long orig_code, long *code); ++ u32 (*report_death)(struct utrace_engine *engine, ++ struct task_struct *task, ++ bool group_dead, int signal); ++ void (*report_reap)(struct utrace_engine *engine, ++ struct task_struct *task); ++}; ++ ++/** ++ * struct utrace_examiner - private state for using utrace_prepare_examine() ++ * ++ * The members of &struct utrace_examiner are private to the implementation. ++ * This data type holds the state from a call to utrace_prepare_examine() ++ * to be used by a call to utrace_finish_examine(). ++ */ ++struct utrace_examiner { ++/* private: */ ++ long state; ++ unsigned long ncsw; ++}; ++ ++/* ++ * These are the exported entry points for tracing engines to use. ++ * See kernel/utrace.c for their kerneldoc comments with interface details. ++ */ ++struct utrace_engine *utrace_attach_task(struct task_struct *, int, ++ const struct utrace_engine_ops *, ++ void *); ++struct utrace_engine *utrace_attach_pid(struct pid *, int, ++ const struct utrace_engine_ops *, ++ void *); ++int __must_check utrace_control(struct task_struct *, ++ struct utrace_engine *, ++ enum utrace_resume_action); ++int __must_check utrace_set_events(struct task_struct *, ++ struct utrace_engine *, ++ unsigned long eventmask); ++int __must_check utrace_barrier(struct task_struct *, ++ struct utrace_engine *); ++int __must_check utrace_prepare_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++int __must_check utrace_finish_examine(struct task_struct *, ++ struct utrace_engine *, ++ struct utrace_examiner *); ++ ++/** ++ * utrace_control_pid - control a thread being traced by a tracing engine ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is the same as utrace_control(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. 
++ */ ++static inline __must_check int utrace_control_pid( ++ struct pid *pid, struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ /* ++ * We don't bother with rcu_read_lock() here to protect the ++ * task_struct pointer, because utrace_control will return ++ * -ESRCH without looking at that pointer if the engine is ++ * already detached. A task_struct pointer can't die before ++ * all the engines are detached in release_task() first. ++ */ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_control(task, engine, action); ++} ++ ++/** ++ * utrace_set_events_pid - choose which event reports a tracing engine gets ++ * @pid: thread to affect ++ * @engine: attached engine to affect ++ * @eventmask: new event mask ++ * ++ * This is the same as utrace_set_events(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_set_events_pid( ++ struct pid *pid, struct utrace_engine *engine, unsigned long eventmask) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : ++ utrace_set_events(task, engine, eventmask); ++} ++ ++/** ++ * utrace_barrier_pid - synchronize with simultaneous tracing callbacks ++ * @pid: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This is the same as utrace_barrier(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++static inline __must_check int utrace_barrier_pid(struct pid *pid, ++ struct utrace_engine *engine) ++{ ++ struct task_struct *task = pid_task(pid, PIDTYPE_PID); ++ return unlikely(!task) ? -ESRCH : utrace_barrier(task, engine); ++} ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace.h */ +diff --git a/include/linux/utrace_struct.h b/include/linux/utrace_struct.h +new file mode 100644 +index 0000000..aba7e09 +--- /dev/null ++++ b/include/linux/utrace_struct.h +@@ -0,0 +1,58 @@ ++/* ++ * 'struct utrace' data structure for kernel/utrace.c private use. ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ */ ++ ++#ifndef _LINUX_UTRACE_STRUCT_H ++#define _LINUX_UTRACE_STRUCT_H 1 ++ ++#ifdef CONFIG_UTRACE ++ ++#include ++#include ++ ++/* ++ * Per-thread structure private to utrace implementation. This properly ++ * belongs in kernel/utrace.c and its use is entirely private to the code ++ * there. It is only defined in a header file so that it can be embedded ++ * in the struct task_struct layout. It is here rather than in utrace.h ++ * to avoid header nesting order issues getting too complex. 
++ * ++ */ ++struct utrace { ++ struct task_struct *cloning; ++ ++ struct list_head attached, attaching; ++ spinlock_t lock; ++ ++ struct utrace_engine *reporting; ++ ++ unsigned int stopped:1; ++ unsigned int report:1; ++ unsigned int interrupt:1; ++ unsigned int signal_handler:1; ++ unsigned int vfork_stop:1; /* need utrace_stop() before vfork wait */ ++ unsigned int death:1; /* in utrace_report_death() now */ ++ unsigned int reap:1; /* release_task() has run */ ++}; ++ ++# define INIT_UTRACE(tsk) \ ++ .utrace_flags = 0, \ ++ .utrace = { \ ++ .lock = __SPIN_LOCK_UNLOCKED(tsk.utrace.lock), \ ++ .attached = LIST_HEAD_INIT(tsk.utrace.attached), \ ++ .attaching = LIST_HEAD_INIT(tsk.utrace.attaching), \ ++ }, ++ ++#else ++ ++# define INIT_UTRACE(tsk) /* Nothing. */ ++ ++#endif /* CONFIG_UTRACE */ ++ ++#endif /* linux/utrace_struct.h */ +diff --git a/init/Kconfig b/init/Kconfig +index 1ce05a4..f720929 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1191,6 +1191,15 @@ config STOP_MACHINE + help + Need stop_machine() primitive. + ++menuconfig UTRACE ++ bool "Infrastructure for tracing and debugging user processes" ++ depends on EXPERIMENTAL ++ depends on HAVE_ARCH_TRACEHOOK ++ help ++ Enable the utrace process tracing interface. This is an internal ++ kernel interface exported to kernel modules, to track events in ++ user threads, extract and change user thread state. ++ + source "block/Kconfig" + + config PREEMPT_NOTIFIERS +diff --git a/kernel/Makefile b/kernel/Makefile +index 780c8dc..cd16d49 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -69,6 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o + obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o + obj-$(CONFIG_STOP_MACHINE) += stop_machine.o + obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o ++obj-$(CONFIG_UTRACE) += utrace.o + obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o + obj-$(CONFIG_AUDITSYSCALL) += auditsc.o + obj-$(CONFIG_GCOV_KERNEL) += gcov/ +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 61c78b2..935eeee 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -164,6 +165,14 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) + return !err; + } + ++/* ++ * For experimental use of utrace, exclude ptrace on the same task. ++ */ ++static inline bool exclude_ptrace(struct task_struct *task) ++{ ++ return unlikely(!!task_utrace_flags(task)); ++} ++ + int ptrace_attach(struct task_struct *task) + { + int retval; +@@ -186,6 +195,13 @@ int ptrace_attach(struct task_struct *task) + goto out; + + task_lock(task); ++ ++ if (exclude_ptrace(task)) { ++ retval = -EBUSY; ++ task_unlock(task); ++ goto unlock_creds; ++ } ++ + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + task_unlock(task); + if (retval) +@@ -226,7 +242,9 @@ int ptrace_traceme(void) + + write_lock_irq(&tasklist_lock); + /* Are we already being traced? 
*/ +- if (!current->ptrace) { ++ if (exclude_ptrace(current)) { ++ ret = -EBUSY; ++ } else if (!current->ptrace) { + ret = security_ptrace_traceme(current->parent); + /* + * Check PF_EXITING to ensure ->real_parent has not passed +@@ -577,7 +595,17 @@ int ptrace_request(struct task_struct *child, long request, + return ret; + } + +-static struct task_struct *ptrace_get_task_struct(pid_t pid) ++/** ++ * ptrace_get_task_struct -- grab a task struct reference for ptrace ++ * @pid: process id to grab a task_struct reference of ++ * ++ * This function is a helper for ptrace implementations. It checks ++ * permissions and then grabs a task struct for use of the actual ++ * ptrace implementation. ++ * ++ * Returns the task_struct for @pid or an ERR_PTR() on failure. ++ */ ++struct task_struct *ptrace_get_task_struct(pid_t pid) + { + struct task_struct *child; + +diff --git a/kernel/utrace.c b/kernel/utrace.c +new file mode 100644 +index 0000000..74b5fc5 +--- /dev/null ++++ b/kernel/utrace.c +@@ -0,0 +1,2357 @@ ++/* ++ * utrace infrastructure interface for debugging user processes ++ * ++ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * ++ * This copyrighted material is made available to anyone wishing to use, ++ * modify, copy, or redistribute it subject to the terms and conditions ++ * of the GNU General Public License v.2. ++ * ++ * Red Hat Author: Roland McGrath. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Rules for 'struct utrace', defined in ++ * but used entirely privately in this file. ++ * ++ * The common event reporting loops are done by the task making the ++ * report without ever taking any locks. To facilitate this, the two ++ * lists @attached and @attaching work together for smooth asynchronous ++ * attaching with low overhead. Modifying either list requires @lock. ++ * The @attaching list can be modified any time while holding @lock. ++ * New engines being attached always go on this list. ++ * ++ * The @attached list is what the task itself uses for its reporting ++ * loops. When the task itself is not quiescent, it can use the ++ * @attached list without taking any lock. Nobody may modify the list ++ * when the task is not quiescent. When it is quiescent, that means ++ * that it won't run again without taking @lock itself before using ++ * the list. ++ * ++ * At each place where we know the task is quiescent (or it's current), ++ * while holding @lock, we call splice_attaching(), below. This moves ++ * the @attaching list members on to the end of the @attached list. ++ * Since this happens at the start of any reporting pass, any new ++ * engines attached asynchronously go on the stable @attached list ++ * in time to have their callbacks seen. ++ */ ++ ++static struct kmem_cache *utrace_engine_cachep; ++static const struct utrace_engine_ops utrace_detached_ops; /* forward decl */ ++ ++static int __init utrace_init(void) ++{ ++ utrace_engine_cachep = KMEM_CACHE(utrace_engine, SLAB_PANIC); ++ return 0; ++} ++module_init(utrace_init); ++ ++/* ++ * This is called with @utrace->lock held when the task is safely ++ * quiescent, i.e. it won't consult utrace->attached without the lock. ++ * Move any engines attached asynchronously from @utrace->attaching ++ * onto the @utrace->attached list. 
++ */ ++static void splice_attaching(struct utrace *utrace) ++{ ++ list_splice_tail_init(&utrace->attaching, &utrace->attached); ++} ++ ++/* ++ * This is the exported function used by the utrace_engine_put() inline. ++ */ ++void __utrace_engine_release(struct kref *kref) ++{ ++ struct utrace_engine *engine = container_of(kref, struct utrace_engine, ++ kref); ++ BUG_ON(!list_empty(&engine->entry)); ++ kmem_cache_free(utrace_engine_cachep, engine); ++} ++EXPORT_SYMBOL_GPL(__utrace_engine_release); ++ ++static bool engine_matches(struct utrace_engine *engine, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ if ((flags & UTRACE_ATTACH_MATCH_OPS) && engine->ops != ops) ++ return false; ++ if ((flags & UTRACE_ATTACH_MATCH_DATA) && engine->data != data) ++ return false; ++ return engine->ops && engine->ops != &utrace_detached_ops; ++} ++ ++static struct utrace_engine *matching_engine( ++ struct utrace *utrace, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine; ++ list_for_each_entry(engine, &utrace->attached, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ list_for_each_entry(engine, &utrace->attaching, entry) ++ if (engine_matches(engine, flags, ops, data)) ++ return engine; ++ return NULL; ++} ++ ++/* ++ * For experimental use, utrace attach is mutually exclusive with ptrace. ++ */ ++static inline bool exclude_utrace(struct task_struct *task) ++{ ++ return unlikely(!!task->ptrace); ++} ++ ++/* ++ * Called without locks, when we might be the first utrace engine to attach. ++ * If this is a newborn thread and we are not the creator, we have to wait ++ * for it. The creator gets the first chance to attach. The PF_STARTING ++ * flag is cleared after its report_clone hook has had a chance to run. ++ */ ++static inline int utrace_attach_delay(struct task_struct *target) ++{ ++ if ((target->flags & PF_STARTING) && ++ current->utrace.cloning != target) ++ do { ++ schedule_timeout_interruptible(1); ++ if (signal_pending(current)) ++ return -ERESTARTNOINTR; ++ } while (target->flags & PF_STARTING); ++ ++ return 0; ++} ++ ++/* ++ * Enqueue @engine, or maybe don't if UTRACE_ATTACH_EXCLUSIVE. ++ */ ++static int utrace_add_engine(struct task_struct *target, ++ struct utrace *utrace, ++ struct utrace_engine *engine, ++ int flags, ++ const struct utrace_engine_ops *ops, ++ void *data) ++{ ++ int ret; ++ ++ spin_lock(&utrace->lock); ++ ++ if (utrace->reap) { ++ /* ++ * Already entered utrace_release_task(), cannot attach now. ++ */ ++ ret = -ESRCH; ++ } else if ((flags & UTRACE_ATTACH_EXCLUSIVE) && ++ unlikely(matching_engine(utrace, flags, ops, data))) { ++ ret = -EEXIST; ++ } else { ++ /* ++ * Put the new engine on the pending ->attaching list. ++ * Make sure it gets onto the ->attached list by the next ++ * time it's examined. ++ * ++ * When target == current, it would be safe just to call ++ * splice_attaching() right here. But if we're inside a ++ * callback, that would mean the new engine also gets ++ * notified about the event that precipitated its own ++ * creation. This is not what the user wants. ++ * ++ * Setting ->report ensures that start_report() takes the ++ * lock and does it next time. Whenever setting ->report, ++ * we must maintain the invariant that TIF_NOTIFY_RESUME is ++ * also set. Otherwise utrace_control() or utrace_do_stop() ++ * might skip setting TIF_NOTIFY_RESUME upon seeing ->report ++ * already set, and we'd miss a necessary callback. 
++ * ++ * In case we had no engines before, make sure that ++ * utrace_flags is not zero when tracehook_notify_resume() ++ * checks. That would bypass utrace reporting clearing ++ * TIF_NOTIFY_RESUME, and thus violate the same invariant. ++ */ ++ target->utrace_flags |= UTRACE_EVENT(REAP); ++ list_add_tail(&engine->entry, &utrace->attaching); ++ utrace->report = 1; ++ set_notify_resume(target); ++ ++ ret = 0; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++ ++/** ++ * utrace_attach_task - attach new engine, or look up an attached engine ++ * @target: thread to attach to ++ * @flags: flag bits combined with OR, see below ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * The caller must ensure that the @target thread does not get freed, ++ * i.e. hold a ref or be its parent. It is always safe to call this ++ * on @current, or on the @child pointer in a @report_clone callback. ++ * For most other cases, it's easier to use utrace_attach_pid() instead. ++ * ++ * UTRACE_ATTACH_CREATE: ++ * Create a new engine. If %UTRACE_ATTACH_CREATE is not specified, you ++ * only look up an existing engine already attached to the thread. ++ * ++ * UTRACE_ATTACH_EXCLUSIVE: ++ * Attempting to attach a second (matching) engine fails with -%EEXIST. ++ * ++ * UTRACE_ATTACH_MATCH_OPS: Only consider engines matching @ops. ++ * UTRACE_ATTACH_MATCH_DATA: Only consider engines matching @data. ++ * ++ * Calls with neither %UTRACE_ATTACH_MATCH_OPS nor %UTRACE_ATTACH_MATCH_DATA ++ * match the first among any engines attached to @target. That means that ++ * %UTRACE_ATTACH_EXCLUSIVE in such a call fails with -%EEXIST if there ++ * are any engines on @target at all. ++ */ ++struct utrace_engine *utrace_attach_task( ++ struct task_struct *target, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace *utrace; ++ struct utrace_engine *engine; ++ int ret; ++ ++ utrace = &target->utrace; ++ ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * The target has already been reaped. ++ * Check this early, though it's not synchronized. ++ * utrace_add_engine() will do the final check. ++ */ ++ if (!(flags & UTRACE_ATTACH_CREATE)) ++ return ERR_PTR(-ENOENT); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (!(flags & UTRACE_ATTACH_CREATE)) { ++ spin_lock(&utrace->lock); ++ engine = matching_engine(utrace, flags, ops, data); ++ if (engine) ++ utrace_engine_get(engine); ++ spin_unlock(&utrace->lock); ++ return engine ?: ERR_PTR(-ENOENT); ++ } ++ ++ if (unlikely(!ops) || unlikely(ops == &utrace_detached_ops)) ++ return ERR_PTR(-EINVAL); ++ ++ if (unlikely(target->flags & PF_KTHREAD)) ++ /* ++ * Silly kernel, utrace is for users! ++ */ ++ return ERR_PTR(-EPERM); ++ ++ engine = kmem_cache_alloc(utrace_engine_cachep, GFP_KERNEL); ++ if (unlikely(!engine)) ++ return ERR_PTR(-ENOMEM); ++ ++ /* ++ * Initialize the new engine structure. It starts out with two ++ * refs: one ref to return, and one ref for being attached. 
++ */ ++ kref_set(&engine->kref, 2); ++ engine->flags = 0; ++ engine->ops = ops; ++ engine->data = data; ++ ++ ret = utrace_attach_delay(target); ++ if (likely(!ret)) ++ ret = utrace_add_engine(target, utrace, engine, ++ flags, ops, data); ++ ++ if (unlikely(ret)) { ++ kmem_cache_free(utrace_engine_cachep, engine); ++ engine = ERR_PTR(ret); ++ } ++ ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_task); ++ ++/** ++ * utrace_attach_pid - attach new engine, or look up an attached engine ++ * @pid: &struct pid pointer representing thread to attach to ++ * @flags: flag bits combined with OR, see utrace_attach_task() ++ * @ops: callback table for new engine ++ * @data: engine private data pointer ++ * ++ * This is the same as utrace_attach_task(), but takes a &struct pid ++ * pointer rather than a &struct task_struct pointer. The caller must ++ * hold a ref on @pid, but does not need to worry about the task ++ * staying valid. If it's been reaped so that @pid points nowhere, ++ * then this call returns -%ESRCH. ++ */ ++struct utrace_engine *utrace_attach_pid( ++ struct pid *pid, int flags, ++ const struct utrace_engine_ops *ops, void *data) ++{ ++ struct utrace_engine *engine = ERR_PTR(-ESRCH); ++ struct task_struct *task = get_pid_task(pid, PIDTYPE_PID); ++ if (task) { ++ engine = utrace_attach_task(task, flags, ops, data); ++ put_task_struct(task); ++ } ++ return engine; ++} ++EXPORT_SYMBOL_GPL(utrace_attach_pid); ++ ++/* ++ * When an engine is detached, the target thread may still see it and ++ * make callbacks until it quiesces. We install a special ops vector ++ * with these two callbacks. When the target thread quiesces, it can ++ * safely free the engine itself. For any event we will always get ++ * the report_quiesce() callback first, so we only need this one ++ * pointer to be set. The only exception is report_reap(), so we ++ * supply that callback too. ++ */ ++static u32 utrace_detached_quiesce(enum utrace_resume_action action, ++ struct utrace_engine *engine, ++ struct task_struct *task, ++ unsigned long event) ++{ ++ return UTRACE_DETACH; ++} ++ ++static void utrace_detached_reap(struct utrace_engine *engine, ++ struct task_struct *task) ++{ ++} ++ ++static const struct utrace_engine_ops utrace_detached_ops = { ++ .report_quiesce = &utrace_detached_quiesce, ++ .report_reap = &utrace_detached_reap ++}; ++ ++/* ++ * After waking up from TASK_TRACED, clear bookkeeping in @utrace. ++ * Returns true if we were woken up prematurely by SIGKILL. ++ */ ++static inline bool finish_utrace_stop(struct task_struct *task, ++ struct utrace *utrace) ++{ ++ bool killed = false; ++ ++ /* ++ * utrace_wakeup() clears @utrace->stopped before waking us up. ++ * We're officially awake if it's clear. ++ */ ++ spin_lock(&utrace->lock); ++ if (unlikely(utrace->stopped)) { ++ /* ++ * If we're here with it still set, it must have been ++ * signal_wake_up() instead, waking us up for a SIGKILL. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ WARN_ON(!sigismember(&task->pending.signal, SIGKILL)); ++ spin_unlock_irq(&task->sighand->siglock); ++ utrace->stopped = 0; ++ killed = true; ++ } ++ spin_unlock(&utrace->lock); ++ ++ return killed; ++} ++ ++/* ++ * Perform %UTRACE_STOP, i.e. block in TASK_TRACED until woken up. ++ * @task == current, @utrace == current->utrace, which is not locked. ++ * Return true if we were woken up by SIGKILL even though some utrace ++ * engine may still want us to stay stopped. 
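As a usage sketch for utrace_attach_task() and utrace_attach_pid() documented above (not part of the patch: the <linux/utrace.h> header name and the demo_* identifiers are assumptions), a module would typically create one exclusive engine per task and later drop the returned reference with utrace_engine_put():

#include <linux/err.h>
#include <linux/sched.h>
#include <linux/utrace.h>       /* assumed name of the public utrace header */

static const struct utrace_engine_ops demo_ops; /* callbacks as in the later sketches */

static struct utrace_engine *demo_attach(struct task_struct *task)
{
        /*
         * Create a new engine, refusing a second copy of demo_ops on the
         * same task (-EEXIST).  On success the engine comes back with the
         * extra reference described above; the caller eventually drops it
         * with utrace_engine_put().
         */
        return utrace_attach_task(task,
                                  UTRACE_ATTACH_CREATE |
                                  UTRACE_ATTACH_EXCLUSIVE |
                                  UTRACE_ATTACH_MATCH_OPS,
                                  &demo_ops, NULL);
}

The later fragments reuse these includes and the demo_ops table.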
++ */ ++static bool utrace_stop(struct task_struct *task, struct utrace *utrace, ++ bool report) ++{ ++ bool killed; ++ ++ /* ++ * @utrace->stopped is the flag that says we are safely ++ * inside this function. It should never be set on entry. ++ */ ++ BUG_ON(utrace->stopped); ++ ++ /* ++ * The siglock protects us against signals. As well as SIGKILL ++ * waking us up, we must synchronize with the signal bookkeeping ++ * for stop signals and SIGCONT. ++ */ ++ spin_lock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (unlikely(sigismember(&task->pending.signal, SIGKILL))) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ return true; ++ } ++ ++ if (report) { ++ /* ++ * Ensure a reporting pass when we're resumed. ++ */ ++ utrace->report = 1; ++ set_thread_flag(TIF_NOTIFY_RESUME); ++ } ++ ++ utrace->stopped = 1; ++ __set_current_state(TASK_TRACED); ++ ++ /* ++ * If there is a group stop in progress, ++ * we must participate in the bookkeeping. ++ */ ++ if (task->signal->group_stop_count > 0) ++ --task->signal->group_stop_count; ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_unlock(&utrace->lock); ++ ++ schedule(); ++ ++ /* ++ * While in TASK_TRACED, we were considered "frozen enough". ++ * Now that we woke up, it's crucial if we're supposed to be ++ * frozen that we freeze now before running anything substantial. ++ */ ++ try_to_freeze(); ++ ++ killed = finish_utrace_stop(task, utrace); ++ ++ /* ++ * While we were in TASK_TRACED, complete_signal() considered ++ * us "uninterested" in signal wakeups. Now make sure our ++ * TIF_SIGPENDING state is correct for normal running. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ return killed; ++} ++ ++/* ++ * The caller has to hold a ref on the engine. If the attached flag is ++ * true (all but utrace_barrier() calls), the engine is supposed to be ++ * attached. If the attached flag is false (utrace_barrier() only), ++ * then return -ERESTARTSYS for an engine marked for detach but not yet ++ * fully detached. The task pointer can be invalid if the engine is ++ * detached. ++ * ++ * Get the utrace lock for the target task. ++ * Returns the struct if locked, or ERR_PTR(-errno). ++ * ++ * This has to be robust against races with: ++ * utrace_control(target, UTRACE_DETACH) calls ++ * UTRACE_DETACH after reports ++ * utrace_report_death ++ * utrace_release_task ++ */ ++static struct utrace *get_utrace_lock(struct task_struct *target, ++ struct utrace_engine *engine, ++ bool attached) ++ __acquires(utrace->lock) ++{ ++ struct utrace *utrace; ++ ++ rcu_read_lock(); ++ ++ /* ++ * If this engine was already detached, bail out before we look at ++ * the task_struct pointer at all. If it's detached after this ++ * check, then RCU is still keeping this task_struct pointer valid. ++ * ++ * The ops pointer is NULL when the engine is fully detached. ++ * It's &utrace_detached_ops when it's marked detached but still ++ * on the list. In the latter case, utrace_barrier() still works, ++ * since the target might be in the middle of an old callback. ++ */ ++ if (unlikely(!engine->ops)) { ++ rcu_read_unlock(); ++ return ERR_PTR(-ESRCH); ++ } ++ ++ if (unlikely(engine->ops == &utrace_detached_ops)) { ++ rcu_read_unlock(); ++ return attached ? ERR_PTR(-ESRCH) : ERR_PTR(-ERESTARTSYS); ++ } ++ ++ utrace = &target->utrace; ++ if (unlikely(target->exit_state == EXIT_DEAD)) { ++ /* ++ * If all engines detached already, utrace is clear. 
++ * Otherwise, we're called after utrace_release_task might ++ * have started. A call to this engine's report_reap ++ * callback might already be in progress. ++ */ ++ utrace = ERR_PTR(-ESRCH); ++ } else { ++ spin_lock(&utrace->lock); ++ if (unlikely(!engine->ops) || ++ unlikely(engine->ops == &utrace_detached_ops)) { ++ /* ++ * By the time we got the utrace lock, ++ * it had been reaped or detached already. ++ */ ++ spin_unlock(&utrace->lock); ++ utrace = ERR_PTR(-ESRCH); ++ if (!attached && engine->ops == &utrace_detached_ops) ++ utrace = ERR_PTR(-ERESTARTSYS); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return utrace; ++} ++ ++/* ++ * Now that we don't hold any locks, run through any ++ * detached engines and free their references. Each ++ * engine had one implicit ref while it was attached. ++ */ ++static void put_detached_list(struct list_head *list) ++{ ++ struct utrace_engine *engine, *next; ++ list_for_each_entry_safe(engine, next, list, entry) { ++ list_del_init(&engine->entry); ++ utrace_engine_put(engine); ++ } ++} ++ ++/* ++ * Called with utrace->lock held. ++ * Notify and clean up all engines, then free utrace. ++ */ ++static void utrace_reap(struct task_struct *target, struct utrace *utrace) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ const struct utrace_engine_ops *ops; ++ LIST_HEAD(detached); ++ ++restart: ++ splice_attaching(utrace); ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ ops = engine->ops; ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ ++ /* ++ * If it didn't need a callback, we don't need to drop ++ * the lock. Now nothing else refers to this engine. ++ */ ++ if (!(engine->flags & UTRACE_EVENT(REAP))) ++ continue; ++ ++ /* ++ * This synchronizes with utrace_barrier(). Since we ++ * need the utrace->lock here anyway (unlike the other ++ * reporting loops), we don't need any memory barrier ++ * as utrace_barrier() holds the lock. ++ */ ++ utrace->reporting = engine; ++ spin_unlock(&utrace->lock); ++ ++ (*ops->report_reap)(engine, target); ++ ++ utrace->reporting = NULL; ++ ++ put_detached_list(&detached); ++ ++ spin_lock(&utrace->lock); ++ goto restart; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * Called by release_task. After this, target->utrace must be cleared. ++ */ ++void utrace_release_task(struct task_struct *target) ++{ ++ struct utrace *utrace; ++ ++ utrace = &target->utrace; ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->reap = 1; ++ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) { ++ utrace_reap(target, utrace); /* Unlocks and frees. */ ++ return; ++ } ++ ++ /* ++ * The target will do some final callbacks but hasn't ++ * finished them yet. We know because it clears these ++ * event bits after it's done. Instead of cleaning up here ++ * and requiring utrace_report_death to cope with it, we ++ * delay the REAP report and the teardown until after the ++ * target finishes its death reports. ++ */ ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/* ++ * We use an extra bit in utrace_engine.flags past the event bits, ++ * to record whether the engine is keeping the target thread stopped. 
++ */ ++#define ENGINE_STOP (1UL << _UTRACE_NEVENTS) ++ ++static void mark_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags |= ENGINE_STOP; ++} ++ ++static void clear_engine_wants_stop(struct utrace_engine *engine) ++{ ++ engine->flags &= ~ENGINE_STOP; ++} ++ ++static bool engine_wants_stop(struct utrace_engine *engine) ++{ ++ return (engine->flags & ENGINE_STOP) != 0; ++} ++ ++/** ++ * utrace_set_events - choose which event reports a tracing engine gets ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @events: new event mask ++ * ++ * This changes the set of events for which @engine wants callbacks made. ++ * ++ * This fails with -%EALREADY and does nothing if you try to clear ++ * %UTRACE_EVENT(%DEATH) when the @report_death callback may already have ++ * begun, if you try to clear %UTRACE_EVENT(%REAP) when the @report_reap ++ * callback may already have begun, or if you try to newly set ++ * %UTRACE_EVENT(%DEATH) or %UTRACE_EVENT(%QUIESCE) when @target is ++ * already dead or dying. ++ * ++ * This can fail with -%ESRCH when @target has already been detached, ++ * including forcible detach on reaping. ++ * ++ * If @target was stopped before the call, then after a successful call, ++ * no event callbacks not requested in @events will be made; if ++ * %UTRACE_EVENT(%QUIESCE) is included in @events, then a @report_quiesce ++ * callback will be made when @target resumes. If @target was not stopped, ++ * and was about to make a callback to @engine, this returns -%EINPROGRESS. ++ * In this case, the callback in progress might be one excluded from the ++ * new @events setting. When this returns zero, you can be sure that no ++ * event callbacks you've disabled in @events can be made. ++ * ++ * To synchronize after an -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * When @target is @current, -%EINPROGRESS is not returned. But ++ * note that a newly-created engine will not receive any callbacks ++ * related to an event notification already in progress. This call ++ * enables @events callbacks to be made as soon as @engine becomes ++ * eligible for any callbacks, see utrace_attach_task(). ++ * ++ * These rules provide for coherent synchronization based on %UTRACE_STOP, ++ * even when %SIGKILL is breaking its normal simple rules. ++ */ ++int utrace_set_events(struct task_struct *target, ++ struct utrace_engine *engine, ++ unsigned long events) ++{ ++ struct utrace *utrace; ++ unsigned long old_flags, old_utrace_flags, set_utrace_flags; ++ int ret; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ old_utrace_flags = target->utrace_flags; ++ set_utrace_flags = events; ++ old_flags = engine->flags; ++ ++ if (target->exit_state && ++ (((events & ~old_flags) & _UTRACE_DEATH_EVENTS) || ++ (utrace->death && ++ ((old_flags & ~events) & _UTRACE_DEATH_EVENTS)) || ++ (utrace->reap && ((old_flags & ~events) & UTRACE_EVENT(REAP))))) { ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ ++ /* ++ * When setting these flags, it's essential that we really ++ * synchronize with exit_notify(). They cannot be set after ++ * exit_notify() takes the tasklist_lock. By holding the read ++ * lock here while setting the flags, we ensure that the calls ++ * to tracehook_notify_death() and tracehook_report_death() will ++ * see the new flags. This ensures that utrace_release_task() ++ * knows positively that utrace_report_death() will be called or ++ * that it won't. 
++ */ ++ if ((set_utrace_flags & ~old_utrace_flags) & _UTRACE_DEATH_EVENTS) { ++ read_lock(&tasklist_lock); ++ if (unlikely(target->exit_state)) { ++ read_unlock(&tasklist_lock); ++ spin_unlock(&utrace->lock); ++ return -EALREADY; ++ } ++ target->utrace_flags |= set_utrace_flags; ++ read_unlock(&tasklist_lock); ++ } ++ ++ engine->flags = events | (engine->flags & ENGINE_STOP); ++ target->utrace_flags |= set_utrace_flags; ++ ++ if ((set_utrace_flags & UTRACE_EVENT_SYSCALL) && ++ !(old_utrace_flags & UTRACE_EVENT_SYSCALL)) ++ set_tsk_thread_flag(target, TIF_SYSCALL_TRACE); ++ ++ ret = 0; ++ if (!utrace->stopped && target != current) { ++ /* ++ * This barrier ensures that our engine->flags changes ++ * have hit before we examine utrace->reporting, ++ * pairing with the barrier in start_callback(). If ++ * @target has not yet hit finish_callback() to clear ++ * utrace->reporting, we might be in the middle of a ++ * callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_set_events); ++ ++/* ++ * Asynchronously mark an engine as being detached. ++ * ++ * This must work while the target thread races with us doing ++ * start_callback(), defined below. It uses smp_rmb() between checking ++ * @engine->flags and using @engine->ops. Here we change @engine->ops ++ * first, then use smp_wmb() before changing @engine->flags. This ensures ++ * it can check the old flags before using the old ops, or check the old ++ * flags before using the new ops, or check the new flags before using the ++ * new ops, but can never check the new flags before using the old ops. ++ * Hence, utrace_detached_ops might be used with any old flags in place. ++ * It has report_quiesce() and report_reap() callbacks to handle all cases. ++ */ ++static void mark_engine_detached(struct utrace_engine *engine) ++{ ++ engine->ops = &utrace_detached_ops; ++ smp_wmb(); ++ engine->flags = UTRACE_EVENT(QUIESCE); ++} ++ ++/* ++ * Get @target to stop and return true if it is already stopped now. ++ * If we return false, it will make some event callback soonish. ++ * Called with @utrace locked. ++ */ ++static bool utrace_do_stop(struct task_struct *target, struct utrace *utrace) ++{ ++ bool stopped = false; ++ ++ spin_lock_irq(&target->sighand->siglock); ++ if (unlikely(target->exit_state)) { ++ /* ++ * On the exit path, it's only truly quiescent ++ * if it has already been through ++ * utrace_report_death(), or never will. ++ */ ++ if (!(target->utrace_flags & _UTRACE_DEATH_EVENTS)) ++ utrace->stopped = stopped = true; ++ } else if (task_is_stopped(target)) { ++ /* ++ * Stopped is considered quiescent; when it wakes up, it will ++ * go through utrace_get_signal() before doing anything else. ++ */ ++ utrace->stopped = stopped = true; ++ } else if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ spin_unlock_irq(&target->sighand->siglock); ++ ++ return stopped; ++} ++ ++/* ++ * If the target is not dead it should not be in tracing ++ * stop any more. Wake it unless it's in job control stop. ++ * ++ * Called with @utrace->lock held and @utrace->stopped set. 
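A short sketch of utrace_set_events() under the rules just documented (same assumed includes and names as the attach sketch above); the -EINPROGRESS case is closed with utrace_barrier(), described further below:

/* Ask for syscall-entry and quiesce reports from an already-attached engine. */
static int demo_enable_events(struct task_struct *task,
                              struct utrace_engine *engine)
{
        int ret = utrace_set_events(task, engine,
                                    UTRACE_EVENT(SYSCALL_ENTRY) |
                                    UTRACE_EVENT(QUIESCE));
        if (ret == -EINPROGRESS)
                /*
                 * A callback to this engine may still be in progress; wait
                 * until no just-disabled callback can still be running.
                 */
                ret = utrace_barrier(task, engine);
        return ret;
}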
++ */ ++static void utrace_wakeup(struct task_struct *target, struct utrace *utrace) ++{ ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ ++ utrace->stopped = 0; ++ ++ sighand = lock_task_sighand(target, &irqflags); ++ if (unlikely(!sighand)) ++ return; ++ ++ if (likely(task_is_stopped_or_traced(target))) { ++ if (target->signal->flags & SIGNAL_STOP_STOPPED) ++ target->state = TASK_STOPPED; ++ else ++ wake_up_state(target, __TASK_STOPPED | __TASK_TRACED); ++ } ++ ++ unlock_task_sighand(target, &irqflags); ++} ++ ++/* ++ * This is called when there might be some detached engines on the list or ++ * some stale bits in @task->utrace_flags. Clean them up and recompute the ++ * flags. ++ * ++ * @action is NULL when @task is stopped and @utrace->stopped is set; wake ++ * it up if it should not be. @action is set when @task is current; if ++ * we're fully detached, reset *@action to UTRACE_RESUME. ++ * ++ * Called with @utrace->lock held, returns with it released. ++ * After this returns, @utrace might be freed if everything detached. ++ */ ++static void utrace_reset(struct task_struct *task, struct utrace *utrace, ++ enum utrace_resume_action *action) ++ __releases(utrace->lock) ++{ ++ struct utrace_engine *engine, *next; ++ unsigned long flags = 0; ++ LIST_HEAD(detached); ++ bool wake = !action; ++ BUG_ON(wake != (task != current)); ++ ++ splice_attaching(utrace); ++ ++ /* ++ * Update the set of events of interest from the union ++ * of the interests of the remaining tracing engines. ++ * For any engine marked detached, remove it from the list. ++ * We'll collect them on the detached list. ++ */ ++ list_for_each_entry_safe(engine, next, &utrace->attached, entry) { ++ if (engine->ops == &utrace_detached_ops) { ++ engine->ops = NULL; ++ list_move(&engine->entry, &detached); ++ } else { ++ flags |= engine->flags | UTRACE_EVENT(REAP); ++ wake = wake && !engine_wants_stop(engine); ++ } ++ } ++ ++ if (task->exit_state) { ++ /* ++ * Once it's already dead, we never install any flags ++ * except REAP. When ->exit_state is set and events ++ * like DEATH are not set, then they never can be set. ++ * This ensures that utrace_release_task() knows ++ * positively that utrace_report_death() can never run. ++ */ ++ BUG_ON(utrace->death); ++ flags &= UTRACE_EVENT(REAP); ++ wake = false; ++ } else if (!(flags & UTRACE_EVENT_SYSCALL) && ++ test_tsk_thread_flag(task, TIF_SYSCALL_TRACE)) { ++ clear_tsk_thread_flag(task, TIF_SYSCALL_TRACE); ++ } ++ ++ task->utrace_flags = flags; ++ ++ if (wake) ++ utrace_wakeup(task, utrace); ++ ++ /* ++ * If any engines are left, we're done. ++ */ ++ spin_unlock(&utrace->lock); ++ if (!flags) { ++ /* ++ * No more engines, cleared out the utrace. ++ */ ++ ++ if (action) ++ *action = UTRACE_RESUME; ++ } ++ ++ put_detached_list(&detached); ++} ++ ++/* ++ * You can't do anything to a dead task but detach it. ++ * If release_task() has been called, you can't do that. ++ * ++ * On the exit path, DEATH and QUIESCE event bits are set only ++ * before utrace_report_death() has taken the lock. At that point, ++ * the death report will come soon, so disallow detach until it's ++ * done. This prevents us from racing with it detaching itself. ++ * ++ * Called with utrace->lock held, when @target->exit_state is nonzero. 
++ */ ++static inline int utrace_control_dead(struct task_struct *target, ++ struct utrace *utrace, ++ enum utrace_resume_action action) ++{ ++ if (action != UTRACE_DETACH || unlikely(utrace->reap)) ++ return -ESRCH; ++ ++ if (unlikely(utrace->death)) ++ /* ++ * We have already started the death report. We can't ++ * prevent the report_death and report_reap callbacks, ++ * so tell the caller they will happen. ++ */ ++ return -EALREADY; ++ ++ return 0; ++} ++ ++/** ++ * utrace_control - control a thread being traced by a tracing engine ++ * @target: thread to affect ++ * @engine: attached engine to affect ++ * @action: &enum utrace_resume_action for thread to do ++ * ++ * This is how a tracing engine asks a traced thread to do something. ++ * This call is controlled by the @action argument, which has the ++ * same meaning as the &enum utrace_resume_action value returned by ++ * event reporting callbacks. ++ * ++ * If @target is already dead (@target->exit_state nonzero), ++ * all actions except %UTRACE_DETACH fail with -%ESRCH. ++ * ++ * The following sections describe each option for the @action argument. ++ * ++ * UTRACE_DETACH: ++ * ++ * After this, the @engine data structure is no longer accessible, ++ * and the thread might be reaped. The thread will start running ++ * again if it was stopped and no longer has any attached engines ++ * that want it stopped. ++ * ++ * If the @report_reap callback may already have begun, this fails ++ * with -%ESRCH. If the @report_death callback may already have ++ * begun, this fails with -%EALREADY. ++ * ++ * If @target is not already stopped, then a callback to this engine ++ * might be in progress or about to start on another CPU. If so, ++ * then this returns -%EINPROGRESS; the detach happens as soon as ++ * the pending callback is finished. To synchronize after an ++ * -%EINPROGRESS return, see utrace_barrier(). ++ * ++ * If @target is properly stopped before utrace_control() is called, ++ * then after successful return it's guaranteed that no more callbacks ++ * to the @engine->ops vector will be made. ++ * ++ * The only exception is %SIGKILL (and exec or group-exit by another ++ * thread in the group), which can cause asynchronous @report_death ++ * and/or @report_reap callbacks even when %UTRACE_STOP was used. ++ * (In that event, this fails with -%ESRCH or -%EALREADY, see above.) ++ * ++ * UTRACE_STOP: ++ * This asks that @target stop running. This returns 0 only if ++ * @target is already stopped, either for tracing or for job ++ * control. Then @target will remain stopped until another ++ * utrace_control() call is made on @engine; @target can be woken ++ * only by %SIGKILL (or equivalent, such as exec or termination by ++ * another thread in the same thread group). ++ * ++ * This returns -%EINPROGRESS if @target is not already stopped. ++ * Then the effect is like %UTRACE_REPORT. A @report_quiesce or ++ * @report_signal callback will be made soon. Your callback can ++ * then return %UTRACE_STOP to keep @target stopped. ++ * ++ * This does not interrupt system calls in progress, including ones ++ * that sleep for a long time. For that, use %UTRACE_INTERRUPT. ++ * To interrupt system calls and then keep @target stopped, your ++ * @report_signal callback can return %UTRACE_STOP. ++ * ++ * UTRACE_RESUME: ++ * ++ * Just let @target continue running normally, reversing the effect ++ * of a previous %UTRACE_STOP. If another engine is keeping @target ++ * stopped, then it remains stopped until all engines let it resume. 
++ * If @target was not stopped, this has no effect. ++ * ++ * UTRACE_REPORT: ++ * ++ * This is like %UTRACE_RESUME, but also ensures that there will be ++ * a @report_quiesce or @report_signal callback made soon. If ++ * @target had been stopped, then there will be a callback before it ++ * resumes running normally. If another engine is keeping @target ++ * stopped, then there might be no callbacks until all engines let ++ * it resume. ++ * ++ * UTRACE_INTERRUPT: ++ * ++ * This is like %UTRACE_REPORT, but ensures that @target will make a ++ * @report_signal callback before it resumes or delivers signals. ++ * If @target was in a system call or about to enter one, work in ++ * progress will be interrupted as if by %SIGSTOP. If another ++ * engine is keeping @target stopped, then there might be no ++ * callbacks until all engines let it resume. ++ * ++ * This gives @engine an opportunity to introduce a forced signal ++ * disposition via its @report_signal callback. ++ * ++ * UTRACE_SINGLESTEP: ++ * ++ * It's invalid to use this unless arch_has_single_step() returned true. ++ * This is like %UTRACE_RESUME, but resumes for one user instruction ++ * only. It's invalid to use this in utrace_control() unless @target ++ * had been stopped by @engine previously. ++ * ++ * Note that passing %UTRACE_SINGLESTEP or %UTRACE_BLOCKSTEP to ++ * utrace_control() or returning it from an event callback alone does ++ * not necessarily ensure that stepping will be enabled. If there are ++ * more callbacks made to any engine before returning to user mode, ++ * then the resume action is chosen only by the last set of callbacks. ++ * To be sure, enable %UTRACE_EVENT(%QUIESCE) and look for the ++ * @report_quiesce callback with a zero event mask, or the ++ * @report_signal callback with %UTRACE_SIGNAL_REPORT. ++ * ++ * UTRACE_BLOCKSTEP: ++ * ++ * It's invalid to use this unless arch_has_block_step() returned true. ++ * This is like %UTRACE_SINGLESTEP, but resumes for one whole basic ++ * block of user instructions. ++ * ++ * %UTRACE_BLOCKSTEP devolves to %UTRACE_SINGLESTEP when another ++ * tracing engine is using %UTRACE_SINGLESTEP at the same time. ++ */ ++int utrace_control(struct task_struct *target, ++ struct utrace_engine *engine, ++ enum utrace_resume_action action) ++{ ++ struct utrace *utrace; ++ bool resume; ++ int ret; ++ ++ if (unlikely(action > UTRACE_DETACH)) ++ return -EINVAL; ++ ++ utrace = get_utrace_lock(target, engine, true); ++ if (unlikely(IS_ERR(utrace))) ++ return PTR_ERR(utrace); ++ ++ if (target->exit_state) { ++ ret = utrace_control_dead(target, utrace, action); ++ if (ret) { ++ spin_unlock(&utrace->lock); ++ return ret; ++ } ++ } ++ ++ resume = utrace->stopped; ++ ret = 0; ++ ++ clear_engine_wants_stop(engine); ++ switch (action) { ++ case UTRACE_STOP: ++ mark_engine_wants_stop(engine); ++ if (!resume && !utrace_do_stop(target, utrace)) ++ ret = -EINPROGRESS; ++ resume = false; ++ break; ++ ++ case UTRACE_DETACH: ++ mark_engine_detached(engine); ++ resume = resume || utrace_do_stop(target, utrace); ++ if (!resume) { ++ /* ++ * As in utrace_set_events(), this barrier ensures ++ * that our engine->flags changes have hit before we ++ * examine utrace->reporting, pairing with the barrier ++ * in start_callback(). If @target has not yet hit ++ * finish_callback() to clear utrace->reporting, we ++ * might be in the middle of a callback to @engine. ++ */ ++ smp_mb(); ++ if (utrace->reporting == engine) ++ ret = -EINPROGRESS; ++ break; ++ } ++ /* Fall through. 
*/ ++ ++ case UTRACE_RESUME: ++ /* ++ * This and all other cases imply resuming if stopped. ++ * There might not be another report before it just ++ * resumes, so make sure single-step is not left set. ++ */ ++ if (likely(resume)) ++ user_disable_single_step(target); ++ break; ++ ++ case UTRACE_REPORT: ++ /* ++ * Make the thread call tracehook_notify_resume() soon. ++ * But don't bother if it's already been interrupted. ++ * In that case, utrace_get_signal() will be reporting soon. ++ */ ++ if (!utrace->report && !utrace->interrupt) { ++ utrace->report = 1; ++ set_notify_resume(target); ++ } ++ break; ++ ++ case UTRACE_INTERRUPT: ++ /* ++ * Make the thread call tracehook_get_signal() soon. ++ */ ++ if (utrace->interrupt) ++ break; ++ utrace->interrupt = 1; ++ ++ /* ++ * If it's not already stopped, interrupt it now. ++ * We need the siglock here in case it calls ++ * recalc_sigpending() and clears its own ++ * TIF_SIGPENDING. By taking the lock, we've ++ * serialized any later recalc_sigpending() after ++ * our setting of utrace->interrupt to force it on. ++ */ ++ if (resume) { ++ /* ++ * This is really just to keep the invariant ++ * that TIF_SIGPENDING is set with utrace->interrupt. ++ * When it's stopped, we know it's always going ++ * through utrace_get_signal and will recalculate. ++ */ ++ set_tsk_thread_flag(target, TIF_SIGPENDING); ++ } else { ++ struct sighand_struct *sighand; ++ unsigned long irqflags; ++ sighand = lock_task_sighand(target, &irqflags); ++ if (likely(sighand)) { ++ signal_wake_up(target, 0); ++ unlock_task_sighand(target, &irqflags); ++ } ++ } ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ /* ++ * Resume from stopped, step one block. ++ */ ++ if (unlikely(!arch_has_block_step())) { ++ WARN_ON(1); ++ /* Fall through to treat it as SINGLESTEP. */ ++ } else if (likely(resume)) { ++ user_enable_block_step(target); ++ break; ++ } ++ ++ case UTRACE_SINGLESTEP: ++ /* ++ * Resume from stopped, step one instruction. ++ */ ++ if (unlikely(!arch_has_single_step())) { ++ WARN_ON(1); ++ resume = false; ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (likely(resume)) ++ user_enable_single_step(target); ++ else ++ /* ++ * You were supposed to stop it before asking ++ * it to step. ++ */ ++ ret = -EAGAIN; ++ break; ++ } ++ ++ /* ++ * Let the thread resume running. If it's not stopped now, ++ * there is nothing more we need to do. ++ */ ++ if (resume) ++ utrace_reset(target, utrace, NULL); ++ else ++ spin_unlock(&utrace->lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_control); ++ ++/** ++ * utrace_barrier - synchronize with simultaneous tracing callbacks ++ * @target: thread to affect ++ * @engine: engine to affect (can be detached) ++ * ++ * This blocks while @target might be in the midst of making a callback to ++ * @engine. It can be interrupted by signals and will return -%ERESTARTSYS. ++ * A return value of zero means no callback from @target to @engine was ++ * in progress. Any effect of its return value (such as %UTRACE_STOP) has ++ * already been applied to @engine. ++ * ++ * It's not necessary to keep the @target pointer alive for this call. ++ * It's only necessary to hold a ref on @engine. This will return ++ * safely even if @target has been reaped and has no task refs. ++ * ++ * A successful return from utrace_barrier() guarantees its ordering ++ * with respect to utrace_set_events() and utrace_control() calls. 
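As a sketch of driving utrace_control() with the actions documented above (names illustrative): a zero return from UTRACE_STOP means the target is already stopped, while -EINPROGRESS means a report_quiesce or report_signal callback will arrive soon and can itself return UTRACE_STOP to keep the task stopped.

static int demo_stop(struct task_struct *task, struct utrace_engine *engine)
{
        int ret = utrace_control(task, engine, UTRACE_STOP);

        /* 0: already stopped; -EINPROGRESS: a callback will be made soon. */
        return ret == -EINPROGRESS ? 0 : ret;
}

static int demo_resume(struct task_struct *task, struct utrace_engine *engine)
{
        /* Reverse a previous UTRACE_STOP from this engine. */
        return utrace_control(task, engine, UTRACE_RESUME);
}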
If ++ * @target was not properly stopped, event callbacks just disabled might ++ * still be in progress; utrace_barrier() waits until there is no chance ++ * an unwanted callback can be in progress. ++ */ ++int utrace_barrier(struct task_struct *target, struct utrace_engine *engine) ++{ ++ struct utrace *utrace; ++ int ret = -ERESTARTSYS; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ do { ++ utrace = get_utrace_lock(target, engine, false); ++ if (unlikely(IS_ERR(utrace))) { ++ ret = PTR_ERR(utrace); ++ if (ret != -ERESTARTSYS) ++ break; ++ } else { ++ /* ++ * All engine state changes are done while ++ * holding the lock, i.e. before we get here. ++ * Since we have the lock, we only need to ++ * worry about @target making a callback. ++ * When it has entered start_callback() but ++ * not yet gotten to finish_callback(), we ++ * will see utrace->reporting == @engine. ++ * When @target doesn't take the lock, it uses ++ * barriers to order setting utrace->reporting ++ * before it examines the engine state. ++ */ ++ if (utrace->reporting != engine) ++ ret = 0; ++ spin_unlock(&utrace->lock); ++ if (!ret) ++ break; ++ } ++ schedule_timeout_interruptible(1); ++ } while (!signal_pending(current)); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_barrier); ++ ++/* ++ * This is local state used for reporting loops, perhaps optimized away. ++ */ ++struct utrace_report { ++ enum utrace_resume_action action; ++ u32 result; ++ bool detaches; ++ bool reports; ++ bool takers; ++ bool killed; ++}; ++ ++#define INIT_REPORT(var) \ ++ struct utrace_report var = { UTRACE_RESUME, 0, \ ++ false, false, false, false } ++ ++/* ++ * We are now making the report, so clear the flag saying we need one. ++ */ ++static void start_report(struct utrace *utrace) ++{ ++ BUG_ON(utrace->stopped); ++ if (utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * Complete a normal reporting pass, pairing with a start_report() call. ++ * This handles any UTRACE_DETACH or UTRACE_REPORT or UTRACE_INTERRUPT ++ * returns from engine callbacks. If any engine's last callback used ++ * UTRACE_STOP, we do UTRACE_REPORT here to ensure we stop before user ++ * mode. If there were no callbacks made, it will recompute ++ * @task->utrace_flags to avoid another false-positive. ++ */ ++static void finish_report(struct utrace_report *report, ++ struct task_struct *task, struct utrace *utrace) ++{ ++ bool clean = (report->takers && !report->detaches); ++ ++ if (report->action <= UTRACE_REPORT && !utrace->report) { ++ spin_lock(&utrace->lock); ++ utrace->report = 1; ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } else if (report->action == UTRACE_INTERRUPT && !utrace->interrupt) { ++ spin_lock(&utrace->lock); ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else if (clean) { ++ return; ++ } else { ++ spin_lock(&utrace->lock); ++ } ++ ++ if (clean) ++ spin_unlock(&utrace->lock); ++ else ++ utrace_reset(task, utrace, &report->action); ++} ++ ++/* ++ * Apply the return value of one engine callback to @report. ++ * Returns true if @engine detached and should not get any more callbacks. ++ */ ++static bool finish_callback(struct utrace *utrace, ++ struct utrace_report *report, ++ struct utrace_engine *engine, ++ u32 ret) ++{ ++ enum utrace_resume_action action = utrace_resume_action(ret); ++ ++ report->result = ret & ~UTRACE_RESUME_MASK; ++ ++ /* ++ * If utrace_control() was used, treat that like UTRACE_DETACH here. 
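Putting utrace_control() and utrace_barrier() together, the detach pattern implied by the kernel-doc above looks roughly like this (sketch only, same assumed names):

static void demo_detach(struct task_struct *task, struct utrace_engine *engine)
{
        int ret = utrace_control(task, engine, UTRACE_DETACH);

        if (ret == -EINPROGRESS)
                /*
                 * A callback is in flight; the detach completes when it
                 * finishes.  utrace_barrier() waits for that point (and can
                 * return -ERESTARTSYS if we are interrupted by a signal).
                 */
                utrace_barrier(task, engine);
        /* -ESRCH / -EALREADY: reap or death reporting already took over. */

        utrace_engine_put(engine);      /* drop the reference taken at attach time */
}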
++ */ ++ if (action == UTRACE_DETACH || engine->ops == &utrace_detached_ops) { ++ engine->ops = &utrace_detached_ops; ++ report->detaches = true; ++ } else { ++ if (action < report->action) ++ report->action = action; ++ ++ if (action == UTRACE_STOP) { ++ if (!engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ mark_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } else { ++ if (action == UTRACE_REPORT) ++ report->reports = true; ++ ++ if (engine_wants_stop(engine)) { ++ spin_lock(&utrace->lock); ++ clear_engine_wants_stop(engine); ++ spin_unlock(&utrace->lock); ++ } ++ } ++ } ++ ++ /* ++ * Now that we have applied the effect of the return value, ++ * clear this so that utrace_barrier() can stop waiting. ++ * A subsequent utrace_control() can stop or resume @engine ++ * and know this was ordered after its callback's action. ++ * ++ * We don't need any barriers here because utrace_barrier() ++ * takes utrace->lock. If we touched engine->flags above, ++ * the lock guaranteed this change was before utrace_barrier() ++ * examined utrace->reporting. ++ */ ++ utrace->reporting = NULL; ++ ++ /* ++ * This is a good place to make sure tracing engines don't ++ * introduce too much latency under voluntary preemption. ++ */ ++ if (need_resched()) ++ cond_resched(); ++ ++ return engine->ops == &utrace_detached_ops; ++} ++ ++/* ++ * Start the callbacks for @engine to consider @event (a bit mask). ++ * This makes the report_quiesce() callback first. If @engine wants ++ * a specific callback for @event, we return the ops vector to use. ++ * If not, we return NULL. The return value from the ops->callback ++ * function called should be passed to finish_callback(). ++ */ ++static const struct utrace_engine_ops *start_callback( ++ struct utrace *utrace, struct utrace_report *report, ++ struct utrace_engine *engine, struct task_struct *task, ++ unsigned long event) ++{ ++ const struct utrace_engine_ops *ops; ++ unsigned long want; ++ ++ /* ++ * This barrier ensures that we've set utrace->reporting before ++ * we examine engine->flags or engine->ops. utrace_barrier() ++ * relies on this ordering to indicate that the effect of any ++ * utrace_control() and utrace_set_events() calls is in place ++ * by the time utrace->reporting can be seen to be NULL. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(). ++ * It makes sure that we never see the old ops vector with ++ * the new flags, in case the original vector had no report_quiesce. ++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if (want & UTRACE_EVENT(QUIESCE)) { ++ if (finish_callback(utrace, report, engine, ++ (*ops->report_quiesce)(report->action, ++ engine, task, ++ event))) ++ return NULL; ++ ++ /* ++ * finish_callback() reset utrace->reporting after the ++ * quiesce callback. Now we set it again (as above) ++ * before re-examining engine->flags, which could have ++ * been changed synchronously by ->report_quiesce or ++ * asynchronously by utrace_control() or utrace_set_events(). ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ want = engine->flags; ++ } ++ ++ if (want & ENGINE_STOP) ++ report->action = UTRACE_STOP; ++ ++ if (want & event) { ++ report->takers = true; ++ return ops; ++ } ++ ++ utrace->reporting = NULL; ++ return NULL; ++} ++ ++/* ++ * Do a normal reporting pass for engines interested in @event. 
++ * @callback is the name of the member in the ops vector, and remaining ++ * args are the extras it takes after the standard three args. ++ */ ++#define REPORT(task, utrace, report, event, callback, ...) \ ++ do { \ ++ start_report(utrace); \ ++ REPORT_CALLBACKS(, task, utrace, report, event, callback, \ ++ (report)->action, engine, current, \ ++ ## __VA_ARGS__); \ ++ finish_report(report, task, utrace); \ ++ } while (0) ++#define REPORT_CALLBACKS(rev, task, utrace, report, event, callback, ...) \ ++ do { \ ++ struct utrace_engine *engine; \ ++ const struct utrace_engine_ops *ops; \ ++ list_for_each_entry##rev(engine, &utrace->attached, entry) { \ ++ ops = start_callback(utrace, report, engine, task, \ ++ event); \ ++ if (!ops) \ ++ continue; \ ++ finish_callback(utrace, report, engine, \ ++ (*ops->callback)(__VA_ARGS__)); \ ++ } \ ++ } while (0) ++ ++/* ++ * Called iff UTRACE_EVENT(EXEC) flag is set. ++ */ ++void utrace_report_exec(struct linux_binfmt *fmt, struct linux_binprm *bprm, ++ struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXEC), ++ report_exec, fmt, bprm, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_ENTRY) flag is set. ++ * Return true to prevent the system call. ++ */ ++bool utrace_report_syscall_entry(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ start_report(utrace); ++ REPORT_CALLBACKS(_reverse, task, utrace, &report, ++ UTRACE_EVENT(SYSCALL_ENTRY), report_syscall_entry, ++ report.result | report.action, engine, current, regs); ++ finish_report(&report, task, utrace); ++ ++ if (report.action == UTRACE_STOP && ++ unlikely(utrace_stop(task, utrace, false))) ++ /* ++ * We are continuing despite UTRACE_STOP because of a ++ * SIGKILL. Don't let the system call actually proceed. ++ */ ++ return true; ++ ++ return report.result == UTRACE_SYSCALL_ABORT; ++} ++ ++/* ++ * Called iff UTRACE_EVENT(SYSCALL_EXIT) flag is set. ++ */ ++void utrace_report_syscall_exit(struct pt_regs *regs) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(SYSCALL_EXIT), ++ report_syscall_exit, regs); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(CLONE) flag is set. ++ * This notification call blocks the wake_up_new_task call on the child. ++ * So we must not quiesce here. tracehook_report_clone_complete will do ++ * a quiescence check momentarily. ++ */ ++void utrace_report_clone(unsigned long clone_flags, struct task_struct *child) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ ++ /* ++ * We don't use the REPORT() macro here, because we need ++ * to clear utrace->cloning before finish_report(). ++ * After finish_report(), utrace can be a stale pointer ++ * in cases when report.action is still UTRACE_RESUME. ++ */ ++ start_report(utrace); ++ utrace->cloning = child; ++ ++ REPORT_CALLBACKS(, task, utrace, &report, ++ UTRACE_EVENT(CLONE), report_clone, ++ report.action, engine, task, clone_flags, child); ++ ++ utrace->cloning = NULL; ++ finish_report(&report, task, utrace); ++ ++ /* ++ * For a vfork, we will go into an uninterruptible block waiting ++ * for the child. We need UTRACE_STOP to happen before this, not ++ * after. 
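For the syscall-entry pass above, a callback sketch (the signature follows the REPORT_CALLBACKS invocation; the policy helper is purely illustrative): combining UTRACE_SYSCALL_ABORT with a resume action prevents the system call from running.

/* Illustrative policy hook: decide whether this system call should be blocked. */
static bool demo_should_block(struct task_struct *task, struct pt_regs *regs)
{
        return false;
}

static u32 demo_report_syscall_entry(u32 action, struct utrace_engine *engine,
                                     struct task_struct *task,
                                     struct pt_regs *regs)
{
        if (demo_should_block(task, regs))
                return UTRACE_SYSCALL_ABORT | UTRACE_RESUME;

        return UTRACE_RESUME;   /* let the system call proceed normally */
}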
For CLONE_VFORK, utrace_finish_vfork() will be called. ++ */ ++ if (report.action == UTRACE_STOP && (clone_flags & CLONE_VFORK)) { ++ spin_lock(&utrace->lock); ++ utrace->vfork_stop = 1; ++ spin_unlock(&utrace->lock); ++ } ++} ++ ++/* ++ * We're called after utrace_report_clone() for a CLONE_VFORK. ++ * If UTRACE_STOP was left from the clone report, we stop here. ++ * After this, we'll enter the uninterruptible wait_for_completion() ++ * waiting for the child. ++ */ ++void utrace_finish_vfork(struct task_struct *task) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ if (!utrace->vfork_stop) ++ spin_unlock(&utrace->lock); ++ else { ++ utrace->vfork_stop = 0; ++ spin_unlock(&utrace->lock); ++ utrace_stop(task, utrace, false); ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(JCTL) flag is set. ++ * ++ * Called with siglock held. ++ */ ++void utrace_report_jctl(int notify, int what) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ bool stop = task_is_stopped(task); ++ ++ /* ++ * We have to come out of TASK_STOPPED in case the event report ++ * hooks might block. Since we held the siglock throughout, it's ++ * as if we were never in TASK_STOPPED yet at all. ++ */ ++ if (stop) { ++ __set_current_state(TASK_RUNNING); ++ task->signal->flags &= ~SIGNAL_STOP_STOPPED; ++ ++task->signal->group_stop_count; ++ } ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ /* ++ * We get here with CLD_STOPPED when we've just entered ++ * TASK_STOPPED, or with CLD_CONTINUED when we've just come ++ * out but not yet been through utrace_get_signal() again. ++ * ++ * While in TASK_STOPPED, we can be considered safely ++ * stopped by utrace_do_stop() and detached asynchronously. ++ * If we woke up and checked task->utrace_flags before that ++ * was finished, we might be here with utrace already ++ * removed or in the middle of being removed. ++ * ++ * If we are indeed attached, then make sure we are no ++ * longer considered stopped while we run callbacks. ++ */ ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ /* ++ * Do start_report()'s work too since we already have the lock anyway. ++ */ ++ utrace->report = 0; ++ splice_attaching(utrace); ++ spin_unlock(&utrace->lock); ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(JCTL), ++ report_jctl, what, notify); ++ ++ /* ++ * Retake the lock, and go back into TASK_STOPPED ++ * unless the stop was just cleared. ++ */ ++ spin_lock_irq(&task->sighand->siglock); ++ if (stop && task->signal->group_stop_count > 0) { ++ __set_current_state(TASK_STOPPED); ++ if (--task->signal->group_stop_count == 0) ++ task->signal->flags |= SIGNAL_STOP_STOPPED; ++ } ++} ++ ++/* ++ * Called iff UTRACE_EVENT(EXIT) flag is set. ++ */ ++void utrace_report_exit(long *exit_code) ++{ ++ struct task_struct *task = current; ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ long orig_code = *exit_code; ++ ++ REPORT(task, utrace, &report, UTRACE_EVENT(EXIT), ++ report_exit, orig_code, exit_code); ++ ++ if (report.action == UTRACE_STOP) ++ utrace_stop(task, utrace, false); ++} ++ ++/* ++ * Called iff UTRACE_EVENT(DEATH) or UTRACE_EVENT(QUIESCE) flag is set. ++ * ++ * It is always possible that we are racing with utrace_release_task here. ++ * For this reason, utrace_release_task checks for the event bits that get ++ * us here, and delays its cleanup for us to do. 
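The utrace_attach_task() kernel-doc above notes that the @child pointer is safe to use inside a report_clone callback; a sketch of an engine that follows children by attaching the same ops to each new thread (signature taken from the REPORT_CALLBACKS invocation, names illustrative):

static u32 demo_report_clone(enum utrace_resume_action action,
                             struct utrace_engine *engine,
                             struct task_struct *parent,
                             unsigned long clone_flags,
                             struct task_struct *child)
{
        struct utrace_engine *child_engine;

        /* The creator gets first chance to attach, so PF_STARTING is no obstacle here. */
        child_engine = utrace_attach_task(child, UTRACE_ATTACH_CREATE |
                                          UTRACE_ATTACH_EXCLUSIVE |
                                          UTRACE_ATTACH_MATCH_OPS,
                                          &demo_ops, NULL);
        if (!IS_ERR(child_engine)) {
                utrace_set_events(child, child_engine,
                                  UTRACE_EVENT(CLONE) | UTRACE_EVENT(QUIESCE));
                utrace_engine_put(child_engine);
        }
        return UTRACE_RESUME;
}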
++ */ ++void utrace_report_death(struct task_struct *task, struct utrace *utrace, ++ bool group_dead, int signal) ++{ ++ INIT_REPORT(report); ++ ++ BUG_ON(!task->exit_state); ++ ++ /* ++ * We are presently considered "quiescent"--which is accurate ++ * inasmuch as we won't run any more user instructions ever again. ++ * But for utrace_control and utrace_set_events to be robust, they ++ * must be sure whether or not we will run any more callbacks. If ++ * a call comes in before we do, taking the lock here synchronizes ++ * us so we don't run any callbacks just disabled. Calls that come ++ * in while we're running the callbacks will see the exit.death ++ * flag and know that we are not yet fully quiescent for purposes ++ * of detach bookkeeping. ++ */ ++ spin_lock(&utrace->lock); ++ BUG_ON(utrace->death); ++ utrace->death = 1; ++ utrace->report = 0; ++ utrace->interrupt = 0; ++ spin_unlock(&utrace->lock); ++ ++ REPORT_CALLBACKS(, task, utrace, &report, UTRACE_EVENT(DEATH), ++ report_death, engine, task, group_dead, signal); ++ ++ spin_lock(&utrace->lock); ++ ++ /* ++ * After we unlock (possibly inside utrace_reap for callbacks) with ++ * this flag clear, competing utrace_control/utrace_set_events calls ++ * know that we've finished our callbacks and any detach bookkeeping. ++ */ ++ utrace->death = 0; ++ ++ if (utrace->reap) ++ /* ++ * utrace_release_task() was already called in parallel. ++ * We must complete its work now. ++ */ ++ utrace_reap(task, utrace); ++ else ++ utrace_reset(task, utrace, &report.action); ++} ++ ++/* ++ * Finish the last reporting pass before returning to user mode. ++ */ ++static void finish_resume_report(struct utrace_report *report, ++ struct task_struct *task, ++ struct utrace *utrace) ++{ ++ if (report->detaches || !report->takers) { ++ spin_lock(&utrace->lock); ++ utrace_reset(task, utrace, &report->action); ++ } ++ ++ switch (report->action) { ++ case UTRACE_STOP: ++ report->killed = utrace_stop(task, utrace, report->reports); ++ break; ++ ++ case UTRACE_INTERRUPT: ++ if (!signal_pending(task)) ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ break; ++ ++ case UTRACE_BLOCKSTEP: ++ if (likely(arch_has_block_step())) { ++ user_enable_block_step(task); ++ break; ++ } ++ ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_block_step() itself. Warn and ++ * then fall through to treat it as SINGLESTEP. ++ */ ++ WARN_ON(1); ++ ++ case UTRACE_SINGLESTEP: ++ if (likely(arch_has_single_step())) ++ user_enable_single_step(task); ++ else ++ /* ++ * This means some callback is to blame for failing ++ * to check arch_has_single_step() itself. Spew ++ * about it so the loser will fix his module. ++ */ ++ WARN_ON(1); ++ break; ++ ++ case UTRACE_REPORT: ++ case UTRACE_RESUME: ++ default: ++ user_disable_single_step(task); ++ break; ++ } ++} ++ ++/* ++ * This is called when TIF_NOTIFY_RESUME had been set (and is now clear). ++ * We are close to user mode, and this is the place to report or stop. ++ * When we return, we're going to user mode or into the signals code. ++ */ ++void utrace_resume(struct task_struct *task, struct pt_regs *regs) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ ++ /* ++ * Some machines get here with interrupts disabled. The same arch ++ * code path leads to calling into get_signal_to_deliver(), which ++ * implicitly reenables them by virtue of spin_unlock_irq. 
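A sketch of the death-side callbacks, matching how utrace_report_death() and utrace_reap() above invoke them (names illustrative): returning UTRACE_DETACH from report_death ends this engine's involvement, while an engine that instead stays attached for UTRACE_EVENT(REAP) would supply report_reap as well.

static u32 demo_report_death(struct utrace_engine *engine,
                             struct task_struct *task,
                             bool group_dead, int signal)
{
        pr_debug("demo: %d exited (group_dead=%d, signal=%d)\n",
                 task_pid_nr(task), group_dead, signal);
        return UTRACE_DETACH;           /* nothing more to do for this task */
}

static void demo_report_reap(struct utrace_engine *engine,
                             struct task_struct *task)
{
        /* Last look at @task before release_task() finishes tearing it down. */
}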
++ */ ++ local_irq_enable(); ++ ++ /* ++ * If this flag is still set it's because there was a signal ++ * handler setup done but no report_signal following it. Clear ++ * the flag before we get to user so it doesn't confuse us later. ++ */ ++ if (unlikely(utrace->signal_handler)) { ++ int skip; ++ spin_lock(&utrace->lock); ++ utrace->signal_handler = 0; ++ skip = !utrace->report; ++ spin_unlock(&utrace->lock); ++ if (skip) ++ return; ++ } ++ ++ /* ++ * If UTRACE_INTERRUPT was just used, we don't bother with a report ++ * here. We will report and stop in utrace_get_signal(). In case ++ * of a race with utrace_control(), make sure we don't momentarily ++ * return to user mode because TIF_SIGPENDING was not set yet. ++ */ ++ if (unlikely(utrace->interrupt)) { ++ set_thread_flag(TIF_SIGPENDING); ++ return; ++ } ++ ++ /* ++ * Do a simple reporting pass, with no callback after report_quiesce. ++ */ ++ start_report(utrace); ++ ++ list_for_each_entry(engine, &utrace->attached, entry) ++ start_callback(utrace, &report, engine, task, 0); ++ ++ /* ++ * Finish the report and either stop or get ready to resume. ++ */ ++ finish_resume_report(&report, task, utrace); ++} ++ ++/* ++ * Return true if current has forced signal_pending(). ++ * ++ * This is called only when current->utrace_flags is nonzero, so we know ++ * that current->utrace must be set. It's not inlined in tracehook.h ++ * just so that struct utrace can stay opaque outside this file. ++ */ ++bool utrace_interrupt_pending(void) ++{ ++ return task_utrace_struct(current)->interrupt; ++} ++ ++/* ++ * Take the siglock and push @info back on our queue. ++ * Returns with @task->sighand->siglock held. ++ */ ++static void push_back_signal(struct task_struct *task, siginfo_t *info) ++ __acquires(task->sighand->siglock) ++{ ++ struct sigqueue *q; ++ ++ if (unlikely(!info->si_signo)) { /* Oh, a wise guy! */ ++ spin_lock_irq(&task->sighand->siglock); ++ return; ++ } ++ ++ q = sigqueue_alloc(); ++ if (likely(q)) { ++ q->flags = 0; ++ copy_siginfo(&q->info, info); ++ } ++ ++ spin_lock_irq(&task->sighand->siglock); ++ ++ sigaddset(&task->pending.signal, info->si_signo); ++ if (likely(q)) ++ list_add(&q->list, &task->pending.list); ++ ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++} ++ ++/* ++ * This is the hook from the signals code, called with the siglock held. ++ * Here is the ideal place to stop. We also dequeue and intercept signals. ++ */ ++int utrace_get_signal(struct task_struct *task, struct pt_regs *regs, ++ siginfo_t *info, struct k_sigaction *return_ka) ++ __releases(task->sighand->siglock) ++ __acquires(task->sighand->siglock) ++{ ++ struct utrace *utrace; ++ struct k_sigaction *ka; ++ INIT_REPORT(report); ++ struct utrace_engine *engine; ++ const struct utrace_engine_ops *ops; ++ unsigned long event, want; ++ u32 ret; ++ int signr; ++ ++ utrace = &task->utrace; ++ if (utrace->interrupt || utrace->report || utrace->signal_handler) { ++ /* ++ * We've been asked for an explicit report before we ++ * even check for pending signals. ++ */ ++ ++ spin_unlock_irq(&task->sighand->siglock); ++ ++ spin_lock(&utrace->lock); ++ ++ splice_attaching(utrace); ++ ++ if (unlikely(!utrace->interrupt) && unlikely(!utrace->report)) ++ report.result = UTRACE_SIGNAL_IGN; ++ else if (utrace->signal_handler) ++ report.result = UTRACE_SIGNAL_HANDLER; ++ else ++ report.result = UTRACE_SIGNAL_REPORT; ++ ++ /* ++ * We are now making the report and it's on the ++ * interrupt path, so clear the flags asking for those. 
++ */ ++ utrace->interrupt = utrace->report = utrace->signal_handler = 0; ++ utrace->stopped = 0; ++ ++ /* ++ * Make sure signal_pending() only returns true ++ * if there are real signals pending. ++ */ ++ if (signal_pending(task)) { ++ spin_lock_irq(&task->sighand->siglock); ++ recalc_sigpending(); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ spin_unlock(&utrace->lock); ++ ++ if (unlikely(report.result == UTRACE_SIGNAL_IGN)) ++ /* ++ * We only got here to clear utrace->signal_handler. ++ */ ++ return -1; ++ ++ /* ++ * Do a reporting pass for no signal, just for EVENT(QUIESCE). ++ * The engine callbacks can fill in *info and *return_ka. ++ * We'll pass NULL for the @orig_ka argument to indicate ++ * that there was no original signal. ++ */ ++ event = 0; ++ ka = NULL; ++ memset(return_ka, 0, sizeof *return_ka); ++ } else if ((task->utrace_flags & UTRACE_EVENT_SIGNAL_ALL) == 0 && ++ !utrace->stopped) { ++ /* ++ * If no engine is interested in intercepting signals, ++ * let the caller just dequeue them normally. ++ */ ++ return 0; ++ } else { ++ if (unlikely(utrace->stopped)) { ++ spin_unlock_irq(&task->sighand->siglock); ++ spin_lock(&utrace->lock); ++ utrace->stopped = 0; ++ spin_unlock(&utrace->lock); ++ spin_lock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * Steal the next signal so we can let tracing engines ++ * examine it. From the signal number and sigaction, ++ * determine what normal delivery would do. If no ++ * engine perturbs it, we'll do that by returning the ++ * signal number after setting *return_ka. ++ */ ++ signr = dequeue_signal(task, &task->blocked, info); ++ if (signr == 0) ++ return signr; ++ BUG_ON(signr != info->si_signo); ++ ++ ka = &task->sighand->action[signr - 1]; ++ *return_ka = *ka; ++ ++ /* ++ * We are never allowed to interfere with SIGKILL. ++ * Just punt after filling in *return_ka for our caller. ++ */ ++ if (signr == SIGKILL) ++ return signr; ++ ++ if (ka->sa.sa_handler == SIG_IGN) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (ka->sa.sa_handler != SIG_DFL) { ++ event = UTRACE_EVENT(SIGNAL); ++ report.result = UTRACE_SIGNAL_DELIVER; ++ } else if (sig_kernel_coredump(signr)) { ++ event = UTRACE_EVENT(SIGNAL_CORE); ++ report.result = UTRACE_SIGNAL_CORE; ++ } else if (sig_kernel_ignore(signr)) { ++ event = UTRACE_EVENT(SIGNAL_IGN); ++ report.result = UTRACE_SIGNAL_IGN; ++ } else if (signr == SIGSTOP) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_STOP; ++ } else if (sig_kernel_stop(signr)) { ++ event = UTRACE_EVENT(SIGNAL_STOP); ++ report.result = UTRACE_SIGNAL_TSTP; ++ } else { ++ event = UTRACE_EVENT(SIGNAL_TERM); ++ report.result = UTRACE_SIGNAL_TERM; ++ } ++ ++ /* ++ * Now that we know what event type this signal is, we ++ * can short-circuit if no engines care about those. ++ */ ++ if ((task->utrace_flags & (event | UTRACE_EVENT(QUIESCE))) == 0) ++ return signr; ++ ++ /* ++ * We have some interested engines, so tell them about ++ * the signal and let them change its disposition. ++ */ ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ /* ++ * This reporting pass chooses what signal disposition we'll act on. ++ */ ++ list_for_each_entry(engine, &utrace->attached, entry) { ++ /* ++ * See start_callback() comment about this barrier. ++ */ ++ utrace->reporting = engine; ++ smp_mb(); ++ ++ /* ++ * This pairs with the barrier in mark_engine_detached(), ++ * see start_callback() comments. 
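[Illustrative aside, not part of the patch: the task->utrace_flags bits tested above are populated with utrace_set_events(), whose prototype is assumed from utrace.h; the helper name is hypothetical.]

#include <linux/utrace.h>

/*
 * Subscribe @engine to every signal event plus QUIESCE so the interception
 * path in utrace_get_signal() above is taken, rather than the early
 * "let the caller just dequeue them normally" return.
 */
static int demo_trace_signals(struct task_struct *task,
			      struct utrace_engine *engine)
{
	return utrace_set_events(task, engine,
				 UTRACE_EVENT_SIGNAL_ALL |
				 UTRACE_EVENT(QUIESCE));
}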
++ */ ++ want = engine->flags; ++ smp_rmb(); ++ ops = engine->ops; ++ ++ if ((want & (event | UTRACE_EVENT(QUIESCE))) == 0) { ++ utrace->reporting = NULL; ++ continue; ++ } ++ ++ if (ops->report_signal) ++ ret = (*ops->report_signal)( ++ report.result | report.action, engine, task, ++ regs, info, ka, return_ka); ++ else ++ ret = (report.result | (*ops->report_quiesce)( ++ report.action, engine, task, event)); ++ ++ /* ++ * Avoid a tight loop reporting again and again if some ++ * engine is too stupid. ++ */ ++ switch (utrace_resume_action(ret)) { ++ default: ++ break; ++ case UTRACE_INTERRUPT: ++ case UTRACE_REPORT: ++ ret = (ret & ~UTRACE_RESUME_MASK) | UTRACE_RESUME; ++ break; ++ } ++ ++ finish_callback(utrace, &report, engine, ret); ++ } ++ ++ /* ++ * We express the chosen action to the signals code in terms ++ * of a representative signal whose default action does it. ++ * Our caller uses our return value (signr) to decide what to ++ * do, but uses info->si_signo as the signal number to report. ++ */ ++ switch (utrace_signal_action(report.result)) { ++ case UTRACE_SIGNAL_TERM: ++ signr = SIGTERM; ++ break; ++ ++ case UTRACE_SIGNAL_CORE: ++ signr = SIGQUIT; ++ break; ++ ++ case UTRACE_SIGNAL_STOP: ++ signr = SIGSTOP; ++ break; ++ ++ case UTRACE_SIGNAL_TSTP: ++ signr = SIGTSTP; ++ break; ++ ++ case UTRACE_SIGNAL_DELIVER: ++ signr = info->si_signo; ++ ++ if (return_ka->sa.sa_handler == SIG_DFL) { ++ /* ++ * We'll do signr's normal default action. ++ * For ignore, we'll fall through below. ++ * For stop/death, break locks and returns it. ++ */ ++ if (likely(signr) && !sig_kernel_ignore(signr)) ++ break; ++ } else if (return_ka->sa.sa_handler != SIG_IGN && ++ likely(signr)) { ++ /* ++ * Complete the bookkeeping after the report. ++ * The handler will run. If an engine wanted to ++ * stop or step, then make sure we do another ++ * report after signal handler setup. ++ */ ++ if (report.action != UTRACE_RESUME) ++ report.action = UTRACE_INTERRUPT; ++ finish_report(&report, task, utrace); ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ /* ++ * We do the SA_ONESHOT work here since the ++ * normal path will only touch *return_ka now. ++ */ ++ if (unlikely(return_ka->sa.sa_flags & SA_ONESHOT)) { ++ return_ka->sa.sa_flags &= ~SA_ONESHOT; ++ if (likely(valid_signal(signr))) { ++ ka = &task->sighand->action[signr - 1]; ++ ka->sa.sa_handler = SIG_DFL; ++ } ++ } ++ ++ return signr; ++ } ++ ++ /* Fall through for an ignored signal. */ ++ ++ case UTRACE_SIGNAL_IGN: ++ case UTRACE_SIGNAL_REPORT: ++ default: ++ /* ++ * If the signal is being ignored, then we are on the way ++ * directly back to user mode. We can stop here, or step, ++ * as in utrace_resume(), above. After we've dealt with that, ++ * our caller will relock and come back through here. ++ */ ++ finish_resume_report(&report, task, utrace); ++ ++ if (unlikely(report.killed)) { ++ /* ++ * The only reason we woke up now was because of a ++ * SIGKILL. Don't do normal dequeuing in case it ++ * might get a signal other than SIGKILL. That would ++ * perturb the death state so it might differ from ++ * what the debugger would have allowed to happen. ++ * Instead, pluck out just the SIGKILL to be sure ++ * we'll die immediately with nothing else different ++ * from the quiescent state the debugger wanted us in. 
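[Illustrative aside, not part of the patch: a sketch of a report_signal callback matching the (*ops->report_signal)() call in the loop above. The callback name is hypothetical and the const on @orig_ka is assumed from utrace.h; @orig_ka is NULL on the QUIESCE-only pass described earlier, so it doubles as the "is this a real dequeued signal?" test.]

#include <linux/utrace.h>

/*
 * Turn delivery of SIGUSR1 into a job-control stop (the switch above maps
 * UTRACE_SIGNAL_STOP to SIGSTOP) and leave every other signal's
 * disposition untouched.
 */
static u32 demo_report_signal(u32 action, struct utrace_engine *engine,
			      struct task_struct *task, struct pt_regs *regs,
			      siginfo_t *info,
			      const struct k_sigaction *orig_ka,
			      struct k_sigaction *return_ka)
{
	if (orig_ka && info->si_signo == SIGUSR1)
		return UTRACE_SIGNAL_STOP | UTRACE_RESUME;

	/* No change: hand back the signal and resume actions we were given. */
	return utrace_signal_action(action) | utrace_resume_action(action);
}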
++ */ ++ sigset_t sigkill_only; ++ siginitsetinv(&sigkill_only, sigmask(SIGKILL)); ++ spin_lock_irq(&task->sighand->siglock); ++ signr = dequeue_signal(task, &sigkill_only, info); ++ BUG_ON(signr != SIGKILL); ++ *return_ka = task->sighand->action[SIGKILL - 1]; ++ return signr; ++ } ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) { ++ push_back_signal(task, info); ++ spin_unlock_irq(&task->sighand->siglock); ++ } ++ ++ return -1; ++ } ++ ++ /* ++ * Complete the bookkeeping after the report. ++ * This sets utrace->report if UTRACE_STOP was used. ++ */ ++ finish_report(&report, task, utrace); ++ ++ return_ka->sa.sa_handler = SIG_DFL; ++ ++ if (unlikely(report.result & UTRACE_SIGNAL_HOLD)) ++ push_back_signal(task, info); ++ else ++ spin_lock_irq(&task->sighand->siglock); ++ ++ if (sig_kernel_stop(signr)) ++ task->signal->flags |= SIGNAL_STOP_DEQUEUED; ++ ++ return signr; ++} ++ ++/* ++ * This gets called after a signal handler has been set up. ++ * We set a flag so the next report knows it happened. ++ * If we're already stepping, make sure we do a report_signal. ++ * If not, make sure we get into utrace_resume() where we can ++ * clear the signal_handler flag before resuming. ++ */ ++void utrace_signal_handler(struct task_struct *task, int stepping) ++{ ++ struct utrace *utrace = task_utrace_struct(task); ++ ++ spin_lock(&utrace->lock); ++ ++ utrace->signal_handler = 1; ++ if (stepping) { ++ utrace->interrupt = 1; ++ set_tsk_thread_flag(task, TIF_SIGPENDING); ++ } else { ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++ } ++ ++ spin_unlock(&utrace->lock); ++} ++ ++/** ++ * utrace_prepare_examine - prepare to examine thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: temporary state, a &struct utrace_examiner pointer ++ * ++ * This call prepares to safely examine the thread @target using ++ * &struct user_regset calls, or direct access to thread-synchronous fields. ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, it must held stopped via %UTRACE_STOP by @engine. ++ * ++ * This call may block the caller until @target stays stopped, so it must ++ * be called only after the caller is sure @target is about to unschedule. ++ * This means a zero return from a utrace_control() call on @engine giving ++ * %UTRACE_STOP, or a report_quiesce() or report_signal() callback to ++ * @engine that used %UTRACE_STOP in its return value. ++ * ++ * Returns -%ESRCH if @target is dead or -%EINVAL if %UTRACE_STOP was ++ * not used. If @target has started running again despite %UTRACE_STOP ++ * (for %SIGKILL or a spurious wakeup), this call returns -%EAGAIN. ++ * ++ * When this call returns zero, it's safe to use &struct user_regset ++ * calls and task_user_regset_view() on @target and to examine some of ++ * its fields directly. When the examination is complete, a ++ * utrace_finish_examine() call must follow to check whether it was ++ * completed safely. 
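[Illustrative aside, not part of the patch: a minimal caller of the examine API documented above, pairing utrace_prepare_examine() with utrace_finish_examine() around regset access. The function name, the choice of regset 0, and the 2.6.31-era user_regset ->get() signature are assumptions.]

#include <linux/regset.h>
#include <linux/utrace.h>

/*
 * Copy @target's first regset (general registers on most arches) into @buf
 * while @engine holds @target in UTRACE_STOP.
 */
static int demo_read_regs(struct task_struct *target,
			  struct utrace_engine *engine,
			  void *buf, unsigned int size)
{
	const struct user_regset_view *view = task_user_regset_view(target);
	struct utrace_examiner exam;
	int ret;

	ret = utrace_prepare_examine(target, engine, &exam);
	if (ret)
		return ret;

	ret = view->regsets[0].get(target, &view->regsets[0],
				   0, size, buf, NULL);

	if (!ret)
		/* A failure here means @target ran meanwhile: discard @buf. */
		ret = utrace_finish_examine(target, engine, &exam);
	return ret;
}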
++ */ ++int utrace_prepare_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->exit_state)) ++ ret = -ESRCH; ++ else { ++ exam->state = target->state; ++ if (unlikely(exam->state == TASK_RUNNING)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ } ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ exam->ncsw = wait_task_inactive(target, exam->state); ++ put_task_struct(target); ++ if (unlikely(!exam->ncsw)) ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_prepare_examine); ++ ++/** ++ * utrace_finish_examine - complete an examination of thread state ++ * @target: thread of interest, a &struct task_struct pointer ++ * @engine: engine pointer returned by utrace_attach_task() ++ * @exam: pointer passed to utrace_prepare_examine() call ++ * ++ * This call completes an examination on the thread @target begun by a ++ * paired utrace_prepare_examine() call with the same arguments that ++ * returned success (zero). ++ * ++ * When @target is current, this call is superfluous. When @target is ++ * another thread, this returns zero if @target has remained unscheduled ++ * since the paired utrace_prepare_examine() call returned zero. ++ * ++ * When this returns an error, any examination done since the paired ++ * utrace_prepare_examine() call is unreliable and the data extracted ++ * should be discarded. The error is -%EINVAL if @engine is not ++ * keeping @target stopped, or -%EAGAIN if @target woke up unexpectedly. ++ */ ++int utrace_finish_examine(struct task_struct *target, ++ struct utrace_engine *engine, ++ struct utrace_examiner *exam) ++{ ++ int ret = 0; ++ ++ if (unlikely(target == current)) ++ return 0; ++ ++ rcu_read_lock(); ++ if (unlikely(!engine_wants_stop(engine))) ++ ret = -EINVAL; ++ else if (unlikely(target->state != exam->state)) ++ ret = -EAGAIN; ++ else ++ get_task_struct(target); ++ rcu_read_unlock(); ++ ++ if (likely(!ret)) { ++ unsigned long ncsw = wait_task_inactive(target, exam->state); ++ if (unlikely(ncsw != exam->ncsw)) ++ ret = -EAGAIN; ++ put_task_struct(target); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(utrace_finish_examine); ++ ++/* ++ * This is declared in linux/regset.h and defined in machine-dependent ++ * code. We put the export here to ensure no machine forgets it. ++ */ ++EXPORT_SYMBOL_GPL(task_user_regset_view); ++ ++/* ++ * Called with rcu_read_lock() held. ++ */ ++void task_utrace_proc_status(struct seq_file *m, struct task_struct *p) ++{ ++ struct utrace *utrace = &p->utrace; ++ seq_printf(m, "Utrace:\t%lx%s%s%s\n", ++ p->utrace_flags, ++ utrace->stopped ? " (stopped)" : "", ++ utrace->report ? " (report)" : "", ++ utrace->interrupt ? 
" (interrupt)" : ""); ++} diff --git a/rpmmacros.in b/rpmmacros.in new file mode 100644 index 000000000..745559038 --- /dev/null +++ b/rpmmacros.in @@ -0,0 +1,7 @@ +%_topdir PWD +%_tmppath PWD/tmp +%__spec_install_pre %{___build_pre} +%_install_langs C:en_US:en +%_netsharedpath /proc:/dev/pts:/usr/share/info +%_excludedocs 1 +%__file_context_path /dev/null diff --git a/rpmmacros.sh b/rpmmacros.sh new file mode 100755 index 000000000..a87f78520 --- /dev/null +++ b/rpmmacros.sh @@ -0,0 +1 @@ +sed -e "s,PWD,$(pwd),g" rpmmacros.in > .rpmmacros diff --git a/sources b/sources new file mode 100644 index 000000000..05c590b6f --- /dev/null +++ b/sources @@ -0,0 +1,2 @@ +e2a867bcb1ad901981707edefc8f936b24b27090 http://mirror.onelab.eu/third-party/kernel-2.6.31.6-162.fc12.src.rpm +c14f136d15db7130c3121bbc634ab98c41f06394 http://vserver.13thfloor.at/Experimental/patch-2.6.31.6-vs2.3.0.36.27.diff -- 2.43.0