1 /* Copyright 2005 Princeton University */
5 #define _LARGEFILE64_SOURCE 1
10 #include <sys/types.h>
16 #include <sys/ioctl.h>
21 * hash table implementation
26 typedef struct stat64 VAL;
36 unsigned int // boolean
37 EQUAL(PKEY key1, PKEY key2){
38 return *key1 == *key2;
42 #define MIN(x,y) (((x)<(y))?(x):(y))
46 #define MAX(x,y) (((x)>(y))?(x):(y))
51 * hash table support for efficient lookup of duplicate inodes
54 #define Multiplier (0x9e3779b9)
55 #define MaxLogBuckets (((sizeof (unsigned long))*8) - 2)
56 #define MaxBuckets (1<<MaxLogBuckets)
57 #define MinLogBuckets (4)
58 #define MinBuckets (1<<MinLogBuckets)
60 /* Thresholds for rehashing the table: *)
61 * to avoid crazy oscillations, we must have MaxDensity > 2*MinDensity; *)
62 * to avoid excessive probes, we must try to keep MaxDensity low. *)
63 * Divide by 100 before using
65 #define MaxDensity 75 /* max numEntries/NUMBER(buckets) */
66 #define MinDensity 20 /* min numEntries/NUMBER(buckets) */
67 #define IdealDensity 50
68 #define BITSIZE(x) (sizeof(x)*8)
70 #define NEW(type,num) ((type*)malloc(sizeof(type)*num))
71 #define DISPOSE(ptr) (free((void*)ptr))
74 * Generic Hash Entry Type
77 typedef struct VoidList {
78 struct VoidList *tail;
79 } VoidList, *PVoidList;
81 typedef struct HashTable {
83 unsigned int numBuckets; // number of buckets
84 unsigned int minLogBuckets; // minimum value for Log_2(initial size)
85 unsigned int logBuckets; // CEILING(Log2(NUMBER(buckets^)))
86 unsigned int maxEntries; // maximum number of entries
87 unsigned int minEntries; // minimum number of entries
88 unsigned int numEntries; // current num of entries in table
89 PVoidList cache; // cache of removed elements
90 int cacheSize; // current size of the cache
91 int maxCacheSize; // maximum size, -1 means unbounded, 0 no cache
92 } HashTable, *PHashTable;
99 Init(PHashTable tbl, unsigned int n, int maxCacheSize);
102 Dispose(PHashTable tbl);
105 Log_2(unsigned int x);
108 NewBuckets(PHashTable tbl, unsigned int logBuckets);
111 * Generic Hash Table support
115 Init(PHashTable tbl, unsigned int n, int maxCacheSize){
119 idealBuckets = MIN(((n*100)/IdealDensity),MaxBuckets);
120 minBuckets = MAX(MinBuckets, idealBuckets);
121 tbl->minLogBuckets = Log_2(minBuckets);
123 NewBuckets(tbl, tbl->minLogBuckets);
125 tbl->maxCacheSize = maxCacheSize;
133 * Internal procedures
137 Log_2(unsigned int x){
138 /* return CEILING(LOG_2(x)) */
139 unsigned int log = 0;
143 while ((log < MaxLogBuckets) && (x > n)){
151 NewBuckets(PHashTable tbl, unsigned int logBuckets){
152 /* Allocate "2^logBuckets" buckets. */
153 unsigned int numBuckets = 1 << logBuckets;
157 tbl->buckets = NEW(PVoidList, numBuckets);
158 tbl->numBuckets = numBuckets;
161 for (i=0; i<tbl->numBuckets; i++){
164 tbl->logBuckets = logBuckets;
165 tbl->maxEntries = MaxDensity * numBuckets / 100;
166 tbl->minEntries = MinDensity * numBuckets / 100;
170 #define NULL (void*)0
182 * Type specific hash entry
184 typedef struct EntryList {
185 struct EntryList *tail;
188 }EntryList, *PEntryList;
191 * Type specific Hash implementation functions
196 Rehash(PHashTable tbl, unsigned int logBuckets) {
197 /* Reallocate "2^logBuckets" buckets, and rehash the entries into
201 PVoidList *oldBucketPointer;
203 PEntryList *nb, *nbh;
204 PEntryList that, tail;
207 unsigned int oldNumBuckets;
210 assert(logBuckets <= MaxLogBuckets);
211 assert(logBuckets >= tbl->minLogBuckets);
212 oldBucketPointer = tbl->buckets;
213 ob = (PEntryList*)tbl->buckets;
214 oldNumBuckets = tbl->numBuckets;
216 NewBuckets(tbl, logBuckets);
217 nb = (PEntryList*)tbl->buckets;
219 for(i=0;i<oldNumBuckets;i++){
222 while (that != NULL) {
223 index = (HASH(&(that->key))*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
231 DISPOSE (oldBucketPointer);
235 unsigned int /* boolean */
236 Get(PHashTable tbl, PKEY key, PVAL *val){
240 index = (HASH(key)*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
241 that = (PEntryList)tbl->buckets[index];
242 while ((that != NULL) && !EQUAL(key,&(that->key))) {
255 unsigned int /* boolean */
256 Put(PHashTable tbl, PKEY key, PVAL *val){
262 index = (HASH(key)*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
263 first = (PEntryList*)&(tbl->buckets[index]);
265 while ((that != NULL) && !EQUAL(key, &(that->key))){
269 /* found an entry in the hash table given above key */
274 /* check if we can reuse something from the cache */
275 if (tbl->cache != NULL) {
276 that = (PEntryList)tbl->cache;
277 tbl->cache = (PVoidList)tbl->cache->tail;
283 that = NEW(EntryList,1);
291 if ((tbl->logBuckets < MaxLogBuckets)
292 && (tbl->numEntries > tbl->maxEntries)){
293 Rehash(tbl, tbl->logBuckets + 1); /* too crowded */
304 Delete(PHashTable tbl,PKEY key){
305 PEntryList that, prev;
309 index = (HASH(key)*Multiplier)>>(BITSIZE(unsigned long)-tbl->logBuckets);
310 first = (PEntryList*)&(tbl->buckets[index]);
314 while ((that != NULL) && !EQUAL(key, &(that->key))){
323 prev->tail = that->tail;
325 if ((tbl->maxCacheSize == -1)||(tbl->cacheSize < tbl->maxCacheSize)) {
326 that->tail = (PEntryList)tbl->cache;
327 tbl->cache = (PVoidList)that;
334 if (tbl->maxCacheSize == 0) {
335 if ((tbl->logBuckets > tbl->minLogBuckets)
336 && (tbl->numEntries < tbl->minEntries)) {
337 Rehash(tbl, tbl->logBuckets - 1); /* too sparse */
347 typedef void (*callback)(PKEY key, PVAL val);
350 Iterate(PHashTable tbl, callback fn)
355 for(i=0;i<tbl->numBuckets;i++) {
356 that = tbl->buckets[i];
357 while ( that != (PVoidList)0 ) {
358 PEntryList entry = (PEntryList)that;
359 fn(&entry->key,&entry->val);
366 Dispose(PHashTable tbl)
368 PVoidList that, next;
371 for(i=0;i<tbl->numBuckets;i++) {
372 that = tbl->buckets[i];
373 while( that != NULL) {
380 DISPOSE(tbl->buckets);
381 assert(tbl->numEntries = 0);
384 static int /* boolean */
385 INOPut(PHashTable tbl, ino64_t* key, struct stat64 **val){
386 return Put(tbl, key, val);
389 __extension__ typedef long long longlong;
397 static short verbose = 0;
399 static int vdu_onedir (PHashTable tbl, struct stats *__s, char const *path)
401 char const *foo = path;
402 struct stat64 dirst, st;
408 longlong dirsize, dirinodes, dirblocks;
410 dirsize = dirinodes = dirblocks = 0;
412 // A handle to speed up chdir
413 if ((dirfd = open (path,O_RDONLY)) == -1) {
417 if (fchdir (dirfd) == -1) {
421 if (fstat64 (dirfd,&dirst) != 0) {
425 if ((dir = opendir (".")) == NULL) {
429 /* Walk the directory entries and compute the sum of inodes,
430 * blocks, and disk space used. This code will recursively descend
431 * down the directory structure.
434 while ((ent=readdir(dir))!=NULL){
435 if (lstat64(ent->d_name,&st)==-1){
441 if (S_ISREG(st.st_mode)){
442 if (st.st_nlink > 1){
446 /* Check hash table if we've seen this inode
447 * before. Note that the hash maintains a
448 * (inode,struct stat) key value pair.
453 (void) INOPut(tbl,&st.st_ino,&val);
455 /* Note that after the INOPut call "val" refers to the
456 * value entry in the hash table --- not &st. This
457 * means that if the inode has been put into the hash
458 * table before, val will refer to the first st that
459 * was put into the hashtable. Otherwise, if it is
460 * the first time it is put into the hash table, then
461 * val will be equal to this &st.
463 nlink = val->st_nlink;
466 /* val refers to value in hash tbale */
469 /* We saw all hard links to this particular inode
470 * as part of this sweep of vdu. So account for
471 * the size and blocks required by the file.
474 dirsize += val->st_size;
475 dirblocks += val->st_blocks;
477 /* Do not delete the (ino,val) tuple from the tbl,
478 * as we need to handle the case when we are
479 * double counting a file due to a bind mount.
483 } else if (nlink > 0) {
484 val->st_nlink = nlink;
485 } else /* if(nlink < 0) */ {
486 /* We get here when we are double counting nlinks
492 dirsize += st.st_size;
493 dirblocks += st.st_blocks;
496 } else if (S_ISDIR(st.st_mode)) {
497 if ((st.st_dev == dirst.st_dev) &&
498 (strcmp(ent->d_name,".")!=0) &&
499 (strcmp(ent->d_name,"..")!=0)) {
501 dirsize += st.st_size;
502 dirblocks += st.st_blocks;
504 name = strdup(ent->d_name);
508 res |= vdu_onedir(tbl,__s,name);
513 /* dirsize += st.st_size; */
514 /* dirblocks += st.st_blocks; */
519 __s->inodes += dirinodes;
520 __s->blocks += dirblocks;
521 __s->size += dirsize;
523 printf("%16lld %16lld %16lld %s\n",dirinodes, dirblocks, dirsize,foo);
524 printf("%16lld %16lld %16lld %s\n",__s->inodes, __s->blocks, __s->size,foo);
532 do_vdu(PyObject *self, PyObject *args)
541 if (!PyArg_ParseTuple(args, "s", &path))
544 /* init of tbl and stats */
545 s.inodes = s.blocks = s.size = 0;
546 (void) Init(&tbl,0,0);
548 res = vdu_onedir(&tbl, &s, path);
550 /* deallocate whatever has been added to tbl */
553 /* create a python (inode, block, size) tuple */
554 tuple = Py_BuildValue("(L,L,L)",
556 s.blocks>>1, /* NOTE: div by 2 to adjust
557 * 512b block count to 1K
561 return (res == -1) ? PyErr_SetFromErrno(PyExc_OSError) : tuple;
564 static PyMethodDef methods[] = {
565 { "vdu", do_vdu, METH_VARARGS,
566 "perform vdu operation on directory tree" },
567 { NULL, NULL, 0, NULL }
573 Py_InitModule("vduimpl", methods);