vserver 1.9.5.x5

[linux-2.6.git] / Documentation / filesystems / Locking
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index 83e6a8f..2c9ce27 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -90,7 +90,7 @@ prototypes:
         void (*destroy_inode)(struct inode *);
         void (*read_inode) (struct inode *);
         void (*dirty_inode) (struct inode *);
-       void (*write_inode) (struct inode *, int);
+       int (*write_inode) (struct inode *, int);
         void (*put_inode) (struct inode *);
         void (*drop_inode) (struct inode *);
         void (*delete_inode) (struct inode *);
@@ -203,20 +203,34 @@ currently-in-progress I/O.
  
  If the filesystem is not called for "sync" and it determines that it
  would need to block against in-progress I/O to be able to start new I/O
-against the page the filesystem shoud redirty the page (usually with
-__set_page_dirty_nobuffers()), then unlock the page and return zero. 
+against the page the filesystem should redirty the page with
+redirty_page_for_writepage(), then unlock the page and return zero.
  This may also be done to avoid internal deadlocks, but rarely.
  
  If the filesytem is called for sync then it must wait on any
  in-progress I/O and then start new I/O.
  
  The filesystem should unlock the page synchronously, before returning
-to the caller.  If the page has write I/O underway against it,
-writepage() should run SetPageWriteback() against the page prior to
-unlocking it.  The write I/O completion handler should run
-end_page_writeback() against the page.
-
-That is: after 2.5.12, pages which are under writeout are *not* locked.
+to the caller.
+
+Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
+and return zero, writepage *must* run set_page_writeback() against the page,
+followed by unlocking it.  Once set_page_writeback() has been run against the
+page, write I/O can be submitted and the write I/O completion handler must run
+end_page_writeback() once the I/O is complete.  If no I/O is submitted, the
+filesystem must run end_page_writeback() against the page before returning from
+writepage.
+
+That is: after 2.5.12, pages which are under writeout are *not* locked.  Note,
+if the filesystem needs the page to be locked during writeout, that is ok, too:
+the page is allowed to be unlocked at any point in time between the calls to
+set_page_writeback() and end_page_writeback().
+
+Note, failure to run either redirty_page_for_writepage() or the combination of
+set_page_writeback()/end_page_writeback() on a page submitted to writepage
+will leave the page itself marked clean but it will be tagged as dirty in the
+radix tree.  This incoherency can lead to all sorts of hard-to-debug problems
+in the filesystem like having dirty inodes at umount and losing written data.
  
         ->sync_page() locking rules are not well-defined - usually it is called
  with lock on page, but that is not guaranteed. Considering the currently
@@ -262,21 +276,40 @@ foo_get_block(). It's an overkill, since block bitmaps can be protected by
  internal fs locking and real critical areas are much smaller than the areas
  filesystems protect now.
  
---------------------------- file_lock ------------------------------------
+----------------------- file_lock_operations ------------------------------
  prototypes:
-       void (*fl_notify)(struct file_lock *);  /* unblock callback */
         void (*fl_insert)(struct file_lock *);  /* lock insertion callback */
         void (*fl_remove)(struct file_lock *);  /* lock removal callback */
+       void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
+       void (*fl_release_private)(struct file_lock *);
+
+
+locking rules:
+                       BKL     may block
+fl_insert:             yes     no
+fl_remove:             yes     no
+fl_copy_lock:          yes     no
+fl_release_private:    yes     yes
+
+----------------------- lock_manager_operations ---------------------------
+prototypes:
+       int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
+       void (*fl_notify)(struct file_lock *);  /* unblock callback */
+       void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
+       void (*fl_release_private)(struct file_lock *);
+       void (*fl_break)(struct file_lock *); /* break_lease callback */
  
  locking rules:
-               BKL     may block
-fl_notify:     yes     no
-fl_insert:     yes     no
-fl_remove:     yes     no
-       Currently only NLM provides instances of this class. None of the
+                       BKL     may block
+fl_compare_owner:      yes     no
+fl_notify:             yes     no
+fl_copy_lock:          yes     no
+fl_release_private:    yes     yes
+fl_break:              yes     no
+
+       Currently only NFSD and NLM provide instances of this class. None of
  them block. If you have out-of-tree instances - please, show up. Locking
  in that area will change.
-
  --------------------------- buffer_head -----------------------------------
  prototypes:
         void (*b_end_io)(struct buffer_head *bh, int uptodate);
@@ -284,8 +317,8 @@ prototypes:
  locking rules:
         called from interrupts. In other words, extreme care is needed here.
  bh is locked, but that's all warranties we have here. Currently only RAID1,
-highmem and fs/buffer.c are providing these. Block devices call this method
-upon the IO completion.
+highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices
+call this method upon the IO completion.
  
  --------------------------- block_device_operations -----------------------
  prototypes:
@@ -317,6 +350,8 @@ prototypes:
         unsigned int (*poll) (struct file *, struct poll_table_struct *);
         int (*ioctl) (struct inode *, struct file *, unsigned int,
                         unsigned long);
+       long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+       long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
         int (*mmap) (struct file *, struct vm_area_struct *);
         int (*open) (struct inode *, struct file *);
         int (*flush) (struct file *);
@@ -335,6 +370,8 @@ prototypes:
                         loff_t *, int);
         unsigned long (*get_unmapped_area)(struct file *, unsigned long,
                         unsigned long, unsigned long, unsigned long);
+       int (*check_flags)(int);
+       int (*dir_notify)(struct file *, unsigned long);
  };
  
  locking rules:
@@ -348,6 +385,8 @@ aio_write:          no
  readdir:               no
  poll:                  no
  ioctl:                 yes     (see below)
+unlocked_ioctl:                no      (see below)
+compat_ioctl:          no
  mmap:                  no
  open:                  maybe   (see below)
  flush:                 no
@@ -361,6 +400,8 @@ writev:                     no
  sendfile:              no
  sendpage:              no
  get_unmapped_area:     no
+check_flags:           no
+dir_notify:            no
  
  ->llseek() locking has moved from llseek to the individual llseek
  implementations.  If your fs is not using generic_file_llseek, you
@@ -391,6 +432,9 @@ move ->readdir() to inode_operations and use a separate method for directory
  anything that resembles union-mount we won't have a struct file for all
  components. And there are other reasons why the current interface is a mess...
  
+->ioctl() on regular files is superseded by the ->unlocked_ioctl() that
+doesn't take the BKL.
+
  ->read on directories probably must go away - we should just enforce -EISDIR
  in sys_read() and friends.