first skeleton of clean-backupdb.py
[infrastructure.git] / scripts / clean-backupdb.py
diff --git a/scripts/clean-backupdb.py b/scripts/clean-backupdb.py
new file mode 100755 (executable)
index 0000000..45e5026
--- /dev/null
@@ -0,0 +1,282 @@
+#!/usr/bin/python
+
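+### a utility to filter backups in the /db-backup area
+#
+# lists all files that match <prefix>-<date>-<time>.<suffix>
+# (<date>-<time> is formatted as YYYYMMDD-HHMM, and the separator
+#  between <prefix> and <date> may be either '-' or '_')
+# and is meant to preserve only the following
+# (*) older than KEEP_FILES_IN_MONTH_AFTER months : one file per month
+# (*) older than LEAVE_FILES_YOUNGER_THAN days : one file per week
+# (*) recent stuff: unchanged
+#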
+
+import os, os.path
+from datetime import datetime
+from glob import glob
+import re
+import traceback
+
+from optparse import OptionParser
+
+class FileIgnored (Exception): pass
+
+# in days
+# files whose age does not exceed this are never grouped
+# (File raises FileIgnored for them)
+LEAVE_FILES_YOUNGER_THAN = 20
+# number of months to step back from the current month to find the
+# 'month barrier' (first day of that month):
+# files dated at or before the barrier are grouped per calendar month,
+# files dated after it are grouped per week of age
+# should not exceed 11
+KEEP_FILES_IN_MONTH_AFTER = 2
+
+# monday=0 sunday=6
+# we prefer saturday as this is the end of the week
+PREFERRED_WEEKDAY=5
+
+#
+## utility to parse a filename
+datetime_pattern="(?P<datetime>[0-9]{8}\-[0-9]{4})"
+# returns a tuple
+# (bool, prefix, suffix, datetime)
+
+prefix_pattern="(?P<prefix>[\w\-]+)"
+datetime_pattern="(?P<datetime>[0-9]{8}\-[0-9]{4})"
+suffix_pattern="(?P<suffix>[\w\.]+)"
+filename_matcher=re.compile("\A%s[-_]%s\.%s\Z"%(prefix_pattern,datetime_pattern,suffix_pattern))
+
+parsing_failed = (False, None, None, None)
+
+def parse_filename (filename):
+    match=filename_matcher.match(filename)
+    if not match:
+        return parsing_failed
+    else:
+        d = match.groupdict()
+        try:
+            dt = datetime.strptime(d['datetime'],"%Y%m%d-%H%M")
+            return (True, d['prefix'], d['suffix'], dt)
+        except:
+            print "failed to parse timestamp %s from %s"%(d['datetime'],filename)
+            traceback.print_exc()
+            return parsing_failed
+
+# reference instant against which the age of every file is computed
+# sampled only once, when the module gets loaded
+# (one File instance - see below - is created per file, managed in the Kind class)
+now=datetime.now()
+
+class File:
+
+    def __init__ (self, dir, filename, datetime, options):
+        self.dir=dir
+        self.filename=filename
+        self.datetime=datetime
+        self.options=options
+        self.age=now-datetime
+        self.weekday=self.datetime.weekday()
+        if self.age.days<0:
+            if self.options.verbose: print 'Filename %s is from the future - skipped'%sfilename
+            raise FileIgnored,"Filename from the future %s"%filename
+        self.group = self._group_string()
+
+    def __repr__ (self):
+        return "%s (%s) -- weekday %s"%(self.filename,self.datetime,self.datetime.weekday())
+
+    def age_days (self):
+        return self.age.days
+
+    def age_weeks (self):
+        return self.age.days/7
+
+    # oldest first
+    @staticmethod
+    def sort_age (file1, file2):
+        # for 2.7, seems safer
+        try:
+            return int((file1.datetime-file2.datetime).total_seconds())
+        # otherwise resort to days, we should be clear as our backups are daily
+        except:
+            return int((file1.datetime-file2.datetime).days)
+        
+    @staticmethod
+    def sort_relevance (file1, file2):
+        w1=file1.weekday
+        w2=file2.weekday
+        if w1==PREFERRED_WEEKDAY and w2==PREFERRED_WEEKDAY:
+            return File.sort_age (file1,file2)
+        elif w1!=PREFERRED_WEEKDAY and w2!=PREFERRED_WEEKDAY:
+            return File.sort_age (file1,file2)
+        elif w1==PREFERRED_WEEKDAY and w2!=PREFERRED_WEEKDAY:
+            return -1
+        elif w1!=PREFERRED_WEEKDAY and w2==PREFERRED_WEEKDAY:
+            return 1
+    
+    month_marrier=None
+    @staticmethod
+    def compute_month_barrier ():
+        if not File.month_marrier: 
+            # find the exact datetime for a month change 
+            # KEEP_FILES_IN_MONTH_AFTER months ago +
+            year=now.year
+            month=now.month
+            day=1
+            if now.month>=KEEP_FILES_IN_MONTH_AFTER+1:
+                month -= KEEP_FILES_IN_MONTH_AFTER
+            else:
+                year -= 1
+                month += (12-KEEP_FILES_IN_MONTH_AFTER)
+            File.month_marrier = datetime (year=year, month=month, day=day)
+                
+        return File.month_marrier
+
+    # returns a key for grouping files, the cleanup then
+    # preserving one entry in the set of files with same group
+    def _group_string (self):
+        if self.age.days<=LEAVE_FILES_YOUNGER_THAN:
+            if self.options.verbose: print 'Filename %s is recent (%d d) - skipped'%\
+                    (self.filename,LEAVE_FILES_YOUNGER_THAN)
+            raise FileIgnored,"Filename %s is recent"%self.filename
+        # in the month range
+        if self.datetime <= File.compute_month_barrier():
+            return self.datetime.strftime("%Y%m")
+        else:
+            return "week%d"%self.age_weeks()
+
+# all files in a given timeslot (either month or week)
+class Group:
+    def __init__ (self, groupname):
+        self.groupname=groupname
+        self.files=[]
+    def insert (self, file):
+        self.files.append(file)
+    def epilogue (self):
+        self.files.sort (File.sort_relevance)
+#        print 20*'*','after sort'
+#        for file in self.files:
+#            print "%s"%file
+
+# all files with the same (prefix, suffix)
+class Kind:
+
+    def __init__ (self, prefix, suffix, options):
+        self.prefix=prefix
+        self.suffix=suffix
+        self.options=options
+        # will contain tuples (filename, datetime)
+        self.list = []
+
+    # allow for basic checking to be done in File
+    def add_file (self, dir, filename, datetime):
+        try:
+            self.list.append ( File (dir, filename, datetime, self.options) )
+        except FileIgnored: pass
+        except:
+            print 'could not append %s'%filename
+            traceback.print_exc()
+            pass
+
+    def epilogue (self):
+        # sort self.list according to file age, oldest first
+        self.list.sort(File.sort_age)
+        # prepare groups according to age
+        self.groups = {}
+        for file in self.list:
+            groupname=file.group
+            if groupname not in self.groups:
+                self.groups[groupname]=Group(groupname)
+            self.groups[groupname].insert(file)
+        for group in self.groups.values():
+            group.epilogue()
+
+    def show (self):
+        print 30*'-',"%s-<date>.%s"%(self.prefix,self.suffix)
+        entries=len(self.list)
+        print " %d entries" % entries,
+        if entries >=1:
+            f=self.list[0]
+            print " << %s - %s d old"%(f.filename, f.age_days()),
+        if entries >=2:
+            f=self.list[-1]
+            print ">> %s - %s d old"%(f.filename, f.age_days())
+        groupnames=self.groups.keys()
+        groupnames.sort()
+        groupnames.reverse()
+        if self.options.extra_verbose:
+            print " Found %d groups"%len(groupnames)
+            for g in groupnames: 
+                print "  %s"%g
+                files=self.groups[g].files
+                for file in files:
+                    print "    %s"%file
+        elif self.options.verbose:
+            print " Found %d groups"%len(groupnames),
+            for g in groupnames: print "%s->%d"%(k,len(self.groups[g].files)),
+            print ''
+
+    # sort on number of entries
+    @staticmethod
+    def sort_size (k1, k2):
+        return len(k1.list)-len(k2.list)
+
+# keeps an index of all files found, index by (prefix, suffix), then sorted by time
+class Index:
+    def __init__ (self,options):
+        self.options=options
+        self.index = {}
+
+    def insert (self, dir, filename, prefix, suffix, datetime):
+        key= (prefix, suffix)
+        if key not in self.index:
+            self.index[key] = Kind(prefix,suffix, self.options)
+        self.index[key].add_file (dir, filename, datetime)
+
+    # we're done inserting, do housecleaning
+    def epilogue (self):
+        for (key, kind) in self.index.items():
+            kind.epilogue()
+
+    def show (self):
+        # sort on number of entries
+        kinds = self.index.values()
+        kinds.sort (Kind.sort_size)
+        for kind in kinds:
+            kind.show()
+
+    def insert_many (self, dir, filenames):
+        for filename in filenames:
+            (b,p,s,d) = parse_filename (filename)
+            if not b:
+                print "Filename %s does not match - skipped"%filename
+                continue
+            self.insert (dir, filename, p, s, d)
+
+def handle_arg (index, dir, pattern):
+    try:
+        os.chdir(dir)
+    except:
+        print "Cannot chdir into %s - skipped"%dir
+        return
+    filenames=glob(pattern)
+    index.insert_many (dir, filenames)
+
+def main ():
+    parser=OptionParser()
+    parser.add_option ("-v","--verbose",dest='verbose',action='store_true',default=False,
+                       help="run in verbose mode")
+    parser.add_option ("-x","--extra-verbose",dest='extra_verbose',action='store_true',default=False,
+                       help="run in extra verbose mode")
+    parser.add_option ("-n","--dry-run",dest='dry_run',action='store_true',default=False,
+                       help="dry run")
+    (options, args) = parser.parse_args()
+    if options.extra_verbose: options.verbose=True
+    
+    # args can be directories, or patterns, like 
+    # main /db-backup /db-backup-f8/*bz2
+    # in any case we handle each arg completely separately
+    index = Index (options)
+    for arg in args:
+        if os.path.isdir (arg):
+            handle_arg (index, arg, "*")
+        else:
+            (dir,pattern)=os.path.split(arg)
+            if not dir: dir='.'
+            handle_arg (index, dir, pattern)
+    index.epilogue()
+    index.show()
+            
+if __name__ == '__main__':
+    main()