fedora core 6 1.2949 + vserver 2.2.0

[linux-2.6.git] / Documentation / filesystems / Locking
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index 83e6a8f..28bfea7 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -90,7 +90,7 @@ prototypes:
         void (*destroy_inode)(struct inode *);
         void (*read_inode) (struct inode *);
         void (*dirty_inode) (struct inode *);
-       void (*write_inode) (struct inode *, int);
+       int (*write_inode) (struct inode *, int);
         void (*put_inode) (struct inode *);
         void (*drop_inode) (struct inode *);
         void (*delete_inode) (struct inode *);
@@ -99,11 +99,13 @@ prototypes:
         int (*sync_fs)(struct super_block *sb, int wait);
         void (*write_super_lockfs) (struct super_block *);
         void (*unlockfs) (struct super_block *);
-       int (*statfs) (struct super_block *, struct kstatfs *);
+       int (*statfs) (struct dentry *, struct kstatfs *);
         int (*remount_fs) (struct super_block *, int *, char *);
         void (*clear_inode) (struct inode *);
         void (*umount_begin) (struct super_block *);
         int (*show_options)(struct seq_file *, struct vfsmount *);
+       ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+       ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
  
  locking rules:
         All may block.
@@ -122,26 +124,34 @@ sync_fs:          no      no      read
  write_super_lockfs:    ?
  unlockfs:              ?
  statfs:                        no      no      no
-remount_fs:            no      yes     maybe           (see below)
+remount_fs:            yes     yes     maybe           (see below)
  clear_inode:           no
  umount_begin:          yes     no      no
  show_options:          no                              (vfsmount->sem)
+quota_read:            no      no      no              (see below)
+quota_write:           no      no      no              (see below)
  
  ->read_inode() is not a method - it's a callback used in iget().
  ->remount_fs() will have the s_umount lock if it's already mounted.
  When called from get_sb_single, it does NOT have the s_umount lock.
+->quota_read() and ->quota_write() functions are both guaranteed to
+be the only ones operating on the quota file by the quota code (via
+dqio_sem) (unless an admin really wants to screw up something and
+writes to quota files with quotas on). For other details about locking
+see also dquot_operations section.
  
  --------------------------- file_system_type ---------------------------
  prototypes:
-       struct super_block *(*get_sb) (struct file_system_type *, int,
-                       const char *, void *);
+       int (*get_sb) (struct file_system_type *, int,
+                      const char *, void *, struct vfsmount *);
         void (*kill_sb) (struct super_block *);
  locking rules:
                 may block       BKL
  get_sb         yes             yes
  kill_sb                yes             yes
  
-->get_sb() returns error or a locked superblock (exclusive on ->s_umount).
+->get_sb() returns error or 0 with locked superblock attached to the vfsmount
+(exclusive on ->s_umount).
  ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
  unlocks and drops the reference.
  
@@ -161,6 +171,7 @@ prototypes:
         int (*releasepage) (struct page *, int);
         int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
                         loff_t offset, unsigned long nr_segs);
+       int (*launder_page) (struct page *);
  
  locking rules:
         All except set_page_dirty may block
@@ -178,6 +189,7 @@ bmap:                       yes
  invalidatepage:                no      yes
  releasepage:           no      yes
  direct_IO:             no
+launder_page:          no      yes
  
         ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
  may be called from the request handler (/dev/loop).
@@ -203,20 +215,38 @@ currently-in-progress I/O.
  
  If the filesystem is not called for "sync" and it determines that it
  would need to block against in-progress I/O to be able to start new I/O
-against the page the filesystem shoud redirty the page (usually with
-__set_page_dirty_nobuffers()), then unlock the page and return zero. 
+against the page the filesystem should redirty the page with
+redirty_page_for_writepage(), then unlock the page and return zero.
  This may also be done to avoid internal deadlocks, but rarely.
  
  If the filesystem is called for sync then it must wait on any
  in-progress I/O and then start new I/O.
  
-The filesystem should unlock the page synchronously, before returning
-to the caller.  If the page has write I/O underway against it,
-writepage() should run SetPageWriteback() against the page prior to
-unlocking it.  The write I/O completion handler should run
-end_page_writeback() against the page.
-
-That is: after 2.5.12, pages which are under writeout are *not* locked.
+The filesystem should unlock the page synchronously, before returning to the
+caller, unless ->writepage() returns the special WRITEPAGE_ACTIVATE
+value. WRITEPAGE_ACTIVATE means that the page cannot really be written out
+currently, and the VM should stop calling ->writepage() on this page for some
+time. The VM does this by moving the page to the head of the active list, hence
+the name.
+
+Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
+and return zero, writepage *must* run set_page_writeback() against the page,
+followed by unlocking it.  Once set_page_writeback() has been run against the
+page, write I/O can be submitted and the write I/O completion handler must run
+end_page_writeback() once the I/O is complete.  If no I/O is submitted, the
+filesystem must run end_page_writeback() against the page before returning from
+writepage.
+
+That is: after 2.5.12, pages which are under writeout are *not* locked.  Note,
+if the filesystem needs the page to be locked during writeout, that is ok, too,
+the page is allowed to be unlocked at any point in time between the calls to
+set_page_writeback() and end_page_writeback().
+
+Note, failure to run either redirty_page_for_writepage() or the combination of
+set_page_writeback()/end_page_writeback() on a page submitted to writepage
+will leave the page itself marked clean but it will be tagged as dirty in the
+radix tree.  This incoherency can lead to all sorts of hard-to-debug problems
+in the filesystem like having dirty inodes at umount and losing written data.
  
         ->sync_page() locking rules are not well-defined - usually it is called
  with lock on page, but that is not guaranteed. Considering the currently
@@ -253,6 +283,12 @@ buffers from the page in preparation for freeing it.  It returns zero to
  indicate that the buffers are (or may be) freeable.  If ->releasepage is zero,
  the kernel assumes that the fs has no private interest in the buffers.
  
+       ->launder_page() may be called prior to releasing a page if
+it is still found to be dirty. It returns zero if the page was successfully
+cleaned, or an error value if not. Note that in order to prevent the page
+getting mapped back in and redirtied, it needs to be kept locked
+across the entire operation.
+
         Note: currently almost all instances of address_space methods are
  using BKL for internal serialization and that's one of the worst sources
  of contention. Normally they are calling library functions (in fs/buffer.c)
@@ -262,21 +298,40 @@ foo_get_block(). It's an overkill, since block bitmaps can be protected by
  internal fs locking and real critical areas are much smaller than the areas
  filesystems protect now.
  
---------------------------- file_lock ------------------------------------
+----------------------- file_lock_operations ------------------------------
  prototypes:
-       void (*fl_notify)(struct file_lock *);  /* unblock callback */
         void (*fl_insert)(struct file_lock *);  /* lock insertion callback */
         void (*fl_remove)(struct file_lock *);  /* lock removal callback */
+       void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
+       void (*fl_release_private)(struct file_lock *);
+
  
  locking rules:
-               BKL     may block
-fl_notify:     yes     no
-fl_insert:     yes     no
-fl_remove:     yes     no
-       Currently only NLM provides instances of this class. None of the
+                       BKL     may block
+fl_insert:             yes     no
+fl_remove:             yes     no
+fl_copy_lock:          yes     no
+fl_release_private:    yes     yes
+
+----------------------- lock_manager_operations ---------------------------
+prototypes:
+       int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
+       void (*fl_notify)(struct file_lock *);  /* unblock callback */
+       void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
+       void (*fl_release_private)(struct file_lock *);
+       void (*fl_break)(struct file_lock *); /* break_lease callback */
+
+locking rules:
+                       BKL     may block
+fl_compare_owner:      yes     no
+fl_notify:             yes     no
+fl_copy_lock:          yes     no
+fl_release_private:    yes     yes
+fl_break:              yes     no
+
+       Currently only NFSD and NLM provide instances of this class. None of
  them block. If you have out-of-tree instances - please, show up. Locking
  in that area will change.
-
  --------------------------- buffer_head -----------------------------------
  prototypes:
         void (*b_end_io)(struct buffer_head *bh, int uptodate);
@@ -284,8 +339,8 @@ prototypes:
  locking rules:
         called from interrupts. In other words, extreme care is needed here.
  bh is locked, but that's all warranties we have here. Currently only RAID1,
-highmem and fs/buffer.c are providing these. Block devices call this method
-upon the IO completion.
+highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices
+call this method upon the IO completion.
  
  --------------------------- block_device_operations -----------------------
  prototypes:
@@ -309,14 +364,15 @@ The last two are called only from check_disk_change().
  prototypes:
         loff_t (*llseek) (struct file *, loff_t, int);
         ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
-       ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
         ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-       ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t,
-                       loff_t);
+       ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+       ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
         int (*readdir) (struct file *, void *, filldir_t);
         unsigned int (*poll) (struct file *, struct poll_table_struct *);
         int (*ioctl) (struct inode *, struct file *, unsigned int,
                         unsigned long);
+       long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+       long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
         int (*mmap) (struct file *, struct vm_area_struct *);
         int (*open) (struct inode *, struct file *);
         int (*flush) (struct file *);
@@ -335,6 +391,8 @@ prototypes:
                         loff_t *, int);
         unsigned long (*get_unmapped_area)(struct file *, unsigned long,
                         unsigned long, unsigned long, unsigned long);
+       int (*check_flags)(int);
+       int (*dir_notify)(struct file *, unsigned long);
  };
  
  locking rules:
@@ -348,6 +406,8 @@ aio_write:          no
  readdir:               no
  poll:                  no
  ioctl:                 yes     (see below)
+unlocked_ioctl:                no      (see below)
+compat_ioctl:          no
  mmap:                  no
  open:                  maybe   (see below)
  flush:                 no
@@ -361,6 +421,8 @@ writev:                     no
  sendfile:              no
  sendpage:              no
  get_unmapped_area:     no
+check_flags:           no
+dir_notify:            no
  
  ->llseek() locking has moved from llseek to the individual llseek
  implementations.  If your fs is not using generic_file_llseek, you
@@ -391,6 +453,9 @@ move ->readdir() to inode_operations and use a separate method for directory
  anything that resembles union-mount we won't have a struct file for all
  components. And there are other reasons why the current interface is a mess...
  
+->ioctl() on regular files is superseded by the ->unlocked_ioctl() that
+doesn't take the BKL.
+
  ->read on directories probably must go away - we should just enforce -EISDIR
  in sys_read() and friends.
  
@@ -398,23 +463,46 @@ in sys_read() and friends.
  
  --------------------------- dquot_operations -------------------------------
  prototypes:
-       void (*initialize) (struct inode *, short);
-       void (*drop) (struct inode *);
-       int (*alloc_block) (const struct inode *, unsigned long, char);
+       int (*initialize) (struct inode *, int);
+       int (*drop) (struct inode *);
+       int (*alloc_space) (struct inode *, qsize_t, int);
         int (*alloc_inode) (const struct inode *, unsigned long);
-       void (*free_block) (const struct inode *, unsigned long);
-       void (*free_inode) (const struct inode *, unsigned long);
-       int (*transfer) (struct dentry *, struct iattr *);
-
-locking rules:
-               BKL
-initialize:    no
-drop:          no
-alloc_block:   yes
-alloc_inode:   yes
-free_block:    yes
-free_inode:    yes
-transfer:      no
+       int (*free_space) (struct inode *, qsize_t);
+       int (*free_inode) (const struct inode *, unsigned long);
+       int (*transfer) (struct inode *, struct iattr *);
+       int (*write_dquot) (struct dquot *);
+       int (*acquire_dquot) (struct dquot *);
+       int (*release_dquot) (struct dquot *);
+       int (*mark_dirty) (struct dquot *);
+       int (*write_info) (struct super_block *, int);
+
+These operations are intended to be more or less wrapping functions that ensure
+a proper locking wrt the filesystem and call the generic quota operations.
+
+What filesystem should expect from the generic quota functions:
+
+               FS recursion    Held locks when called
+initialize:    yes             maybe dqonoff_sem
+drop:          yes             -
+alloc_space:   ->mark_dirty()  -
+alloc_inode:   ->mark_dirty()  -
+free_space:    ->mark_dirty()  -
+free_inode:    ->mark_dirty()  -
+transfer:      yes             -
+write_dquot:   yes             dqonoff_sem or dqptr_sem
+acquire_dquot: yes             dqonoff_sem or dqptr_sem
+release_dquot: yes             dqonoff_sem or dqptr_sem
+mark_dirty:    no              -
+write_info:    yes             dqonoff_sem
+
+FS recursion means calling ->quota_read() and ->quota_write() from superblock
+operations.
+
+->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
+only directly by the filesystem and do not call any fs functions except
+the ->mark_dirty() operation.
+
+More details about quota locking can be found in fs/dquot.c.
  
  --------------------------- vm_operations_struct -----------------------------
  prototypes: