PL3026: This is the upgraded version of vdu that maintains an internal
authorMarc Fiuczynski <mef@cs.princeton.edu>
Tue, 16 Nov 2004 04:51:30 +0000 (04:51 +0000)
committerMarc Fiuczynski <mef@cs.princeton.edu>
Tue, 16 Nov 2004 04:51:30 +0000 (04:51 +0000)
hash table of files with a nlink count > 1.  Only if vdu sees all hard
links to a particular inode does it add its size and block count to the
total.

src/vdu.c
src/vdu.h [new file with mode: 0644]

index 04c060c..19b5dec 100644 (file)
--- a/src/vdu.c
+++ b/src/vdu.c
@@ -1,4 +1,4 @@
-// $Id: vdu.c,v 1.1 2003/09/29 22:01:57 ensc Exp $
+// $Id: vdu-new.c,v 1.2 2004/08/17 14:44:14 mef-pl_kernel Exp $
 
 // Copyright (C) 2003 Enrico Scholz <enrico.scholz@informatik.tu-chemnitz.de>
 // based on vdu.cc by Jacques Gelinas
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 
-#ifdef HAVE_CONFIG_H
-#  include <config.h>
-#endif
-#include "compat.h"
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <dirent.h>
 #include <errno.h>
 #include <string.h>
-
 #include <sys/ioctl.h>
-#include "ext2fs.h"
 
-// Patch to help compile this utility on unpatched kernel source
-#ifndef EXT2_IMMUTABLE_FILE_FL
-       #define EXT2_IMMUTABLE_FILE_FL  0x00000010
-       #define EXT2_IMMUTABLE_LINK_FL  0x00008000
-#endif
+#include <assert.h>
+
+#include "vdu.h"
 
+HashTable tbl;
+
+static int // boolean
+INOPut(PHashTable tbl, ino_t* key, struct stat **val){
+    return Put(tbl, key, val);
+}
 
-//__extension__ typedef long long              longlong;
-__extension__ typedef long             longlong;
+__extension__ typedef long long                longlong;
+//__extension__ typedef long           longlong;
 
 static longlong inodes;
 static longlong blocks;
@@ -62,6 +59,7 @@ void panic(char *s) {
 
 static void vdu_onedir (char const *path)
 {
+    char const *foo = path;
     struct stat dirst, st;
     struct dirent *ent;
     char *name;
@@ -94,8 +92,9 @@ static void vdu_onedir (char const *path)
 
 
     /* Walk the directory entries and compute the sum of inodes,
-       blocks, and disk space used. This code will recursively descend
-       down the directory structure. */
+     * blocks, and disk space used. This code will recursively descend
+     * down the directory structure. 
+     */
 
     while ((ent=readdir(dir))!=NULL){
        if (lstat(ent->d_name,&st)==-1){
@@ -108,27 +107,58 @@ static void vdu_onedir (char const *path)
 
        if (S_ISREG(st.st_mode)){
            if (st.st_nlink > 1){
-               long flags;
-               int fd, res;
-
-               if ((fd = open(ent->d_name,O_RDONLY))==-1) {
-                   fprintf (stderr,"Can't open file %s/%s\n",path,ent->d_name);                    
-                   warning ("open failed");
-                   continue;
-               }
-
-               flags = 0;
-               res = ioctl(fd, EXT2_IOC_GETFLAGS, &flags);
-               close(fd);
-
-               if ((res == 0) && (flags & EXT2_IMMUTABLE_LINK_FL)){
-                   if (verbose)
-                       printf ("Skipping %s\n",ent->d_name);               
-                   continue;
+               struct stat *val;
+               int nlink;
+
+               /* Check hash table if we've seen this inode
+                * before. Note that the hash maintains a
+                * (inode,struct stat) key value pair.
+                */
+
+               val = &st;
+
+               (void) INOPut(&tbl,&st.st_ino,&val);
+
+               /* Note that after the INOPut call "val" refers to the
+                * value entry in the hash table --- not &st.  This
+                * means that if the inode has been put into the hash
+                * table before, val will refer to the first st that
+                * was put into the hashtable.  Otherwise, if it is
+                * the first time it is put into the hash table, then
+                * val will be equal to this &st.
+                */
+               nlink = val->st_nlink;
+               nlink --;
+
+               /* val refers to value in hash tbale */
+               if (nlink == 0) {
+
+                   /* We saw all hard links to this particular inode
+                    * as part of this sweep of vdu. So account for
+                    * the size and blocks required by the file.
+                    */
+
+                   dirsize += val->st_size;
+                   dirblocks += val->st_blocks;
+
+                   /* Do not delete the (ino,val) tuple from the tbl,
+                    * as we need to handle the case when we are
+                    * double counting a file due to a bind mount.
+                    */
+                   val->st_nlink = 0;
+
+               } else if (nlink > 0) {
+                   val->st_nlink = nlink;
+               } else /* if(nlink < 0) */ {
+                   /* We get here when we are double counting nlinks
+                      due a bind mount. */
+
+                   /* DO NOTHING */
                }
+           } else {
+               dirsize += st.st_size;
+               dirblocks += st.st_blocks;
            }
-           dirsize += st.st_size;
-           dirblocks += st.st_blocks;
 
        } else if (S_ISDIR(st.st_mode)) {
            if ((st.st_dev == dirst.st_dev) &&
@@ -147,43 +177,71 @@ static void vdu_onedir (char const *path)
                fchdir(dirfd);
            }
        } else {
-           dirsize += st.st_size;
-           dirblocks += st.st_blocks;
+           // dirsize += st.st_size;
+           // dirblocks += st.st_blocks;
        }
     }
     closedir (dir);
     close (dirfd);
     if (verbose)
-       printf("%8ld %8ld %8ld %s\n",dirinodes, dirblocks, dirsize>>10,path);
+       printf("%16lld %16lld %16lld %s\n",dirinodes, dirblocks, dirsize,foo);
     inodes += dirinodes;
     blocks += dirblocks;
     size   += dirsize;
 }
 
+static void
+Count(ino_t* key, struct stat* val) {
+    if(val->st_nlink) {
+       blocks += val->st_blocks;
+       size += val->st_size;
+       printf("ino=%ld nlink=%d\n",val->st_ino, val->st_nlink);
+    }
+}
 
-
-int main (int argc, char *argv[])
+int
+main (int argc, char **argv)
 {
     int startdir, i;
 
     if (argc < 2){
        fprintf (stderr,"vdu version %s\n",VERSION);
        fprintf (stderr,"vdu directory ...\n\n");
-       fprintf (stderr,"Compute the size of a directory tree.");
+       fprintf (stderr,"Compute the size of a directory tree.\n");
     }else{
        if ((startdir = open (".",O_RDONLY)) == -1) {
            fprintf (stderr,"Can't open current working directory\n");
            panic("open failed");
        }
 
+       /* hash table support for hard link count */
+       (void) Init(&tbl,0,0);
+
        for (i=1; i<argc; i++){
            inodes = blocks = size = 0;
            vdu_onedir (argv[i]);
-           printf("%8ld %8ld %8ld %s TOTAL\n",inodes, blocks, size>>10,argv[i]);
+
+           printf("%16lld %16lld %16lld %s\n",
+                  inodes, 
+                  blocks>>1, 
+                  size,
+                  argv[i]);
            if (fchdir (startdir) == -1) {
                panic("fchdir failed");
            }
        }
+
+       if(0) {
+           /* show size & blocks for files with nlinks from outside of dir */
+           inodes = blocks = size = 0;
+           Iterate(&tbl,Count);
+           printf("%16lld %16lld %16lld NOT COUNTED\n",
+                  inodes, 
+                  blocks, 
+                  size);
+       }
+
+       // Dispose(&tbl); this fails to delete all entries 
        close(startdir);
     }
     return 0;
diff --git a/src/vdu.h b/src/vdu.h
new file mode 100644 (file)
index 0000000..13bc979
--- /dev/null
+++ b/src/vdu.h
@@ -0,0 +1,374 @@
+#ifndef __VDU_H
+#define __VDU_H
+
+typedef ino_t KEY;
+typedef KEY *PKEY;
+typedef struct stat VAL;
+typedef VAL *PVAL;
+
+static inline
+unsigned int
+HASH(PKEY key){
+    return (int) *key;
+}
+
+static inline
+unsigned int // boolean
+EQUAL(PKEY key1, PKEY key2){
+    return *key1 == *key2;
+}
+
+#ifndef MIN
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#endif // MIN
+
+#ifndef MAX
+#define MAX(x,y) (((x)>(y))?(x):(y))
+#endif // MAX
+
+
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+// hash table support for efficient lookup of duplicate inodes */
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+
+#define Multiplier  (0x9e3779b9)
+#define MaxLogBuckets  (((sizeof (unsigned long))*8) - 2)
+#define MaxBuckets     (1<<MaxLogBuckets)
+#define MinLogBuckets  (4)
+#define MinBuckets     (1<<MinLogBuckets)
+
+// Thresholds for rehashing the table: *)
+// to avoid crazy oscillations, we must have MaxDensity > 2*MinDensity; *)
+// to avoid excessive probes, we must try to keep MaxDensity low. *)
+// Divide by 100 before using
+#define MaxDensity 75 /* max numEntries/NUMBER(buckets) */
+#define MinDensity 20 /* min numEntries/NUMBER(buckets) */
+#define IdealDensity 50
+#define BITSIZE(x) (sizeof(x)*8)
+
+#define NEW(type,num) ((type*)malloc(sizeof(type)*num))
+#define DISPOSE(ptr) (free((void*)ptr))
+
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+// Generic Hash Entry Type
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+
+typedef struct VoidList {
+  struct VoidList *tail;
+} VoidList, *PVoidList;
+
+typedef struct HashTable {
+  PVoidList *buckets;
+  unsigned int numBuckets;    // number of buckets
+  unsigned int minLogBuckets; // minimum value for Log_2(initial size) 
+  unsigned int logBuckets;    // CEILING(Log2(NUMBER(buckets^))) 
+  unsigned int maxEntries;    // maximum number of entries 
+  unsigned int minEntries;    // minimum number of entries 
+  unsigned int numEntries;    // current num of entries in table 
+  PVoidList cache;            // cache of removed elements 
+  int cacheSize;              // current size of the cache 
+  int maxCacheSize;           // maximum size, -1 means unbounded, 0 no cache 
+} HashTable, *PHashTable;
+
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+// Hash Prototypes
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+
+PHashTable
+Init(PHashTable tbl, unsigned int n, int maxCacheSize);
+
+void
+Dispose(PHashTable tbl);
+
+unsigned int
+Log_2(unsigned int x);
+
+void
+NewBuckets(PHashTable tbl, unsigned int logBuckets);
+
+//
+// Generic Hash Table support
+//
+
+PHashTable
+Init(PHashTable tbl, unsigned int n, int maxCacheSize){
+  int idealBuckets;
+  int minBuckets;
+  
+  idealBuckets = MIN(((n*100)/IdealDensity),MaxBuckets);
+  minBuckets = MAX(MinBuckets, idealBuckets);
+  tbl->minLogBuckets = Log_2(minBuckets);
+
+  NewBuckets(tbl, tbl->minLogBuckets);
+  tbl->numEntries = 0;
+  tbl->maxCacheSize = maxCacheSize;
+  tbl->cacheSize = 0;
+  tbl->cache = 0;
+  return tbl;
+} // Init()
+
+
+//
+// Internal procedures
+//
+
+unsigned int
+Log_2(unsigned int x){
+  // return CEILING(LOG_2(x))
+  unsigned int log = 0;
+  unsigned int n= 1;
+
+  assert(x != 0);
+  while ((log < MaxLogBuckets) && (x > n)){
+    log++; 
+    n += n;
+  }
+  return log;
+} // Log_2()
+
+void
+NewBuckets(PHashTable tbl, unsigned int logBuckets){
+  // Allocate "2^logBuckets" buckets.
+  unsigned int numBuckets = 1 << logBuckets;
+  PVoidList *b;
+  unsigned int i;
+
+  tbl->buckets = NEW(PVoidList, numBuckets);
+  tbl->numBuckets = numBuckets;
+  b = tbl->buckets;
+
+  for (i=0; i<tbl->numBuckets; i++){
+    b[i] = NULL;
+  }
+  tbl->logBuckets = logBuckets;
+  tbl->maxEntries = MaxDensity * numBuckets / 100;
+  tbl->minEntries = MinDensity * numBuckets / 100;
+} // NewBuckets()
+
+#ifndef NULL
+#define NULL (void*)0
+#endif // !NULL
+
+#ifndef TRUE
+#define TRUE 1
+#endif // !TRUE
+
+#ifndef FALSE
+#define FALSE 0
+#endif // !FALSE
+
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+// Type specific hash entry
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+typedef struct EntryList {
+  struct EntryList *tail;
+  KEY key;
+  VAL val;
+}EntryList, *PEntryList;
+
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+// Type specific Hash implementation functions
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+
+static
+void
+Rehash(PHashTable tbl, unsigned int logBuckets) {
+  // Reallocate "2^logBuckets" buckets, and rehash the entries into
+  // the new table.
+
+  PVoidList *oldBucketPointer;
+  PEntryList *ob, obi;
+  PEntryList *nb, *nbh;
+  PEntryList that, tail;
+  unsigned int index; 
+  unsigned int i;
+  unsigned int oldNumBuckets;
+
+  return;
+  assert(logBuckets <= MaxLogBuckets);
+  assert(logBuckets >= tbl->minLogBuckets);
+  oldBucketPointer = tbl->buckets;
+  ob = (PEntryList*)tbl->buckets;
+  oldNumBuckets = tbl->numBuckets;
+
+  NewBuckets(tbl, logBuckets);
+  nb = (PEntryList*)tbl->buckets;
+
+  for(i=0;i<oldNumBuckets;i++){
+    obi = ob[i];
+    that = obi;
+    while (that != NULL) {
+      index = (HASH(&(that->key))*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
+      nbh = &(nb[index]);
+      tail = that->tail;
+      that->tail = *nbh;
+      *nbh = that;
+      that = tail;
+    }
+  }
+  DISPOSE (oldBucketPointer);
+}
+
+static inline
+unsigned int // boolean
+Get(PHashTable tbl, PKEY key, PVAL *val){
+  PEntryList that;
+  unsigned int index;
+
+  index = (HASH(key)*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
+  that = (PEntryList)tbl->buckets[index];
+  while ((that != NULL) && !EQUAL(key,&(that->key))) {
+    that = that->tail;
+  }
+  if (that != NULL){
+    *val = &that->val;
+    return TRUE;
+  }
+  else {
+    return FALSE;
+  }
+} // Get()
+
+static inline 
+unsigned int // boolean
+Put(PHashTable tbl, PKEY key, PVAL *val){
+  PEntryList that;
+  PEntryList *first;
+  unsigned int index;
+  unsigned int res;
+
+  index = (HASH(key)*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
+  first = (PEntryList*)&(tbl->buckets[index]);
+  that = *first;
+  while ((that != NULL) && !EQUAL(key, &(that->key))){
+    that = that->tail;
+  }
+  
+  // found an entry in the hash table given above key
+  if (that != NULL){
+    res = TRUE;
+  }
+  else {
+    // check if we can reuse something from the cache
+    if (tbl->cache != NULL) {
+          that = (PEntryList)tbl->cache;
+          tbl->cache = (PVoidList)tbl->cache->tail;
+          that->key = *key;
+          that->tail = *first;
+          *first = that;
+    }
+    else {
+          that = NEW(EntryList,1);
+         that->key = *key;
+         that->tail = *first;
+         *first = that;
+    }
+    that->val = **val;
+
+    tbl->numEntries++;
+    if ((tbl->logBuckets < MaxLogBuckets)
+       && (tbl->numEntries > tbl->maxEntries)){
+      Rehash(tbl, tbl->logBuckets + 1); // too crowded
+    }
+    res = FALSE;
+  }
+  *val = &that->val;
+  return res;
+
+} // Put()
+
+static inline
+int
+Delete(PHashTable tbl,PKEY key){
+  PEntryList that, prev;
+  PEntryList *first;
+  unsigned int index;
+
+  index = (HASH(key)*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
+  first = (PEntryList*)&(tbl->buckets[index]);
+  that = *first;
+  prev = NULL;
+
+  while ((that != NULL) && !EQUAL(key, &(that->key))){
+    prev = that;
+    that = that->tail;
+  }
+  if (that != NULL) {
+    if (prev == NULL) {
+      *first = that->tail;
+    }
+    else {
+      prev->tail = that->tail;
+    }
+    if ((tbl->maxCacheSize == -1)||(tbl->cacheSize < tbl->maxCacheSize)) {
+      that->tail = (PEntryList)tbl->cache;
+      tbl->cache = (PVoidList)that;
+      tbl->cacheSize++;
+    }
+    else {
+      DISPOSE (that);
+    }
+    tbl->numEntries--;
+    if (tbl->maxCacheSize == 0) {
+      if ((tbl->logBuckets > tbl->minLogBuckets)
+         && (tbl->numEntries < tbl->minEntries)) {
+       Rehash(tbl, tbl->logBuckets - 1); // too sparse
+      }
+    }
+    return TRUE;
+  }
+  else {
+    return FALSE;
+  }
+} // Delete()
+
+typedef void (*callback)(PKEY key, PVAL val);
+
+void
+Iterate(PHashTable tbl, callback fn)
+{
+  PVoidList that;
+  unsigned int i;
+  
+  for(i=0;i<tbl->numBuckets;i++) {
+    that = tbl->buckets[i];
+    while ( that != (PVoidList)0 ) {
+      PEntryList entry = (PEntryList)that;
+      fn(&entry->key,&entry->val);
+      that = that->tail;
+    }
+  }
+}
+
+void
+Dispose(PHashTable tbl)
+{
+  PVoidList that, next;
+  unsigned int i;
+
+  for(i=0;i<tbl->numBuckets;i++) {
+    that = tbl->buckets[i];
+    while( that != NULL) {
+        next = that->tail;
+        DISPOSE (that);
+        tbl->numEntries--;
+        that = next;
+    }
+  }
+  DISPOSE(tbl->buckets);
+  assert(tbl->numEntries = 0);
+} // Dispose;
+
+
+
+#endif // __VDU_H