--- /dev/null
+#!/usr/bin/python
+
+### a utility to filter backups in the /db-backup area
+#
+# lists all files that match <prefix>-<date>-<time>.<suffix>
+# and preserves only the following
+# (*) dated at or before the first day of the month 2 months back : one file per month
+# (*) more than 20 days old : one file per week
+# (*) recent stuff (20 days old or less): unchanged
+#
+
+import os, os.path
+from datetime import datetime
+from glob import glob
+import re
+import traceback
+
+from optparse import OptionParser
+
+class FileIgnored (Exception): pass
+
+# age threshold, in days: files whose age is at most this many days
+# are considered recent and are left alone
+LEAVE_FILES_YOUNGER_THAN = 20
+# number of calendar months, counted back from the current month, that sets
+# the week/month boundary: files dated after the first day of that month are
+# grouped per week, files dated at or before it are grouped per month
+# should not exceed 11
+KEEP_FILES_IN_MONTH_AFTER = 2
+
+# same numbering as datetime.weekday(): monday=0 sunday=6
+# we prefer saturday as this is the end of the week
+PREFERRED_WEEKDAY=5
+
+#
+## utility to parse a filename
+datetime_pattern="(?P<datetime>[0-9]{8}\-[0-9]{4})"
+# returns a tuple
+# (bool, prefix, suffix, datetime)
+
+prefix_pattern="(?P<prefix>[\w\-]+)"
+datetime_pattern="(?P<datetime>[0-9]{8}\-[0-9]{4})"
+suffix_pattern="(?P<suffix>[\w\.]+)"
+filename_matcher=re.compile("\A%s[-_]%s\.%s\Z"%(prefix_pattern,datetime_pattern,suffix_pattern))
+
+parsing_failed = (False, None, None, None)
+
+def parse_filename (filename):
+ match=filename_matcher.match(filename)
+ if not match:
+ return parsing_failed
+ else:
+ d = match.groupdict()
+ try:
+ dt = datetime.strptime(d['datetime'],"%Y%m%d-%H%M")
+ return (True, d['prefix'], d['suffix'], dt)
+ except:
+ print "failed to parse timestamp %s from %s"%(d['datetime'],filename)
+ traceback.print_exc()
+ return parsing_failed
+
+# reference instant, sampled once when the module is loaded;
+# file ages are computed against it
+now=datetime.now()
+
+# one entry like this per file, managed in the Kind class
+class File:
+
+ def __init__ (self, dir, filename, datetime, options):
+ self.dir=dir
+ self.filename=filename
+ self.datetime=datetime
+ self.options=options
+ self.age=now-datetime
+ self.weekday=self.datetime.weekday()
+ if self.age.days<0:
+ if self.options.verbose: print 'Filename %s is from the future - skipped'%sfilename
+ raise FileIgnored,"Filename from the future %s"%filename
+ self.group = self._group_string()
+
+ def __repr__ (self):
+ return "%s (%s) -- weekday %s"%(self.filename,self.datetime,self.datetime.weekday())
+
+ def age_days (self):
+ return self.age.days
+
+ def age_weeks (self):
+ return self.age.days/7
+
+ # oldest first
+ @staticmethod
+ def sort_age (file1, file2):
+ # for 2.7, seems safer
+ try:
+ return int((file1.datetime-file2.datetime).total_seconds())
+ # otherwise resort to days, we should be clear as our backups are daily
+ except:
+ return int((file1.datetime-file2.datetime).days)
+
+ @staticmethod
+ def sort_relevance (file1, file2):
+ w1=file1.weekday
+ w2=file2.weekday
+ if w1==PREFERRED_WEEKDAY and w2==PREFERRED_WEEKDAY:
+ return File.sort_age (file1,file2)
+ elif w1!=PREFERRED_WEEKDAY and w2!=PREFERRED_WEEKDAY:
+ return File.sort_age (file1,file2)
+ elif w1==PREFERRED_WEEKDAY and w2!=PREFERRED_WEEKDAY:
+ return -1
+ elif w1!=PREFERRED_WEEKDAY and w2==PREFERRED_WEEKDAY:
+ return 1
+
+ month_marrier=None
+ @staticmethod
+ def compute_month_barrier ():
+ if not File.month_marrier:
+ # find the exact datetime for a month change
+ # KEEP_FILES_IN_MONTH_AFTER months ago +
+ year=now.year
+ month=now.month
+ day=1
+ if now.month>=KEEP_FILES_IN_MONTH_AFTER+1:
+ month -= KEEP_FILES_IN_MONTH_AFTER
+ else:
+ year -= 1
+ month += (12-KEEP_FILES_IN_MONTH_AFTER)
+ File.month_marrier = datetime (year=year, month=month, day=day)
+
+ return File.month_marrier
+
+ # returns a key for grouping files, the cleanup then
+ # preserving one entry in the set of files with same group
+ def _group_string (self):
+ if self.age.days<=LEAVE_FILES_YOUNGER_THAN:
+ if self.options.verbose: print 'Filename %s is recent (%d d) - skipped'%\
+ (self.filename,LEAVE_FILES_YOUNGER_THAN)
+ raise FileIgnored,"Filename %s is recent"%self.filename
+ # in the month range
+ if self.datetime <= File.compute_month_barrier():
+ return self.datetime.strftime("%Y%m")
+ else:
+ return "week%d"%self.age_weeks()
+
+# all files in a given timeslot (either month or week)
+class Group:
+ def __init__ (self, groupname):
+ self.groupname=groupname
+ self.files=[]
+ def insert (self, file):
+ self.files.append(file)
+ def epilogue (self):
+ self.files.sort (File.sort_relevance)
+# print 20*'*','after sort'
+# for file in self.files:
+# print "%s"%file
+
+# all files with the same (prefix, suffix)
+class Kind:
+
+ def __init__ (self, prefix, suffix, options):
+ self.prefix=prefix
+ self.suffix=suffix
+ self.options=options
+ # will contain tuples (filename, datetime)
+ self.list = []
+
+ # allow for basic checking to be done in File
+ def add_file (self, dir, filename, datetime):
+ try:
+ self.list.append ( File (dir, filename, datetime, self.options) )
+ except FileIgnored: pass
+ except:
+ print 'could not append %s'%filename
+ traceback.print_exc()
+ pass
+
+ def epilogue (self):
+ # sort self.list according to file age, oldest first
+ self.list.sort(File.sort_age)
+ # prepare groups according to age
+ self.groups = {}
+ for file in self.list:
+ groupname=file.group
+ if groupname not in self.groups:
+ self.groups[groupname]=Group(groupname)
+ self.groups[groupname].insert(file)
+ for group in self.groups.values():
+ group.epilogue()
+
+ def show (self):
+ print 30*'-',"%s-<date>.%s"%(self.prefix,self.suffix)
+ entries=len(self.list)
+ print " %d entries" % entries,
+ if entries >=1:
+ f=self.list[0]
+ print " << %s - %s d old"%(f.filename, f.age_days()),
+ if entries >=2:
+ f=self.list[-1]
+ print ">> %s - %s d old"%(f.filename, f.age_days())
+ groupnames=self.groups.keys()
+ groupnames.sort()
+ groupnames.reverse()
+ if self.options.extra_verbose:
+ print " Found %d groups"%len(groupnames)
+ for g in groupnames:
+ print " %s"%g
+ files=self.groups[g].files
+ for file in files:
+ print " %s"%file
+ elif self.options.verbose:
+ print " Found %d groups"%len(groupnames),
+ for g in groupnames: print "%s->%d"%(k,len(self.groups[g].files)),
+ print ''
+
+ # sort on number of entries
+ @staticmethod
+ def sort_size (k1, k2):
+ return len(k1.list)-len(k2.list)
+
+# keeps an index of all files found, index by (prefix, suffix), then sorted by time
+class Index:
+ def __init__ (self,options):
+ self.options=options
+ self.index = {}
+
+ def insert (self, dir, filename, prefix, suffix, datetime):
+ key= (prefix, suffix)
+ if key not in self.index:
+ self.index[key] = Kind(prefix,suffix, self.options)
+ self.index[key].add_file (dir, filename, datetime)
+
+ # we're done inserting, do housecleaning
+ def epilogue (self):
+ for (key, kind) in self.index.items():
+ kind.epilogue()
+
+ def show (self):
+ # sort on number of entries
+ kinds = self.index.values()
+ kinds.sort (Kind.sort_size)
+ for kind in kinds:
+ kind.show()
+
+ def insert_many (self, dir, filenames):
+ for filename in filenames:
+ (b,p,s,d) = parse_filename (filename)
+ if not b:
+ print "Filename %s does not match - skipped"%filename
+ continue
+ self.insert (dir, filename, p, s, d)
+
+def handle_arg (index, dir, pattern):
+ try:
+ os.chdir(dir)
+ except:
+ print "Cannot chdir into %s - skipped"%dir
+ return
+ filenames=glob(pattern)
+ index.insert_many (dir, filenames)
+
+def main ():
+ parser=OptionParser()
+ parser.add_option ("-v","--verbose",dest='verbose',action='store_true',default=False,
+ help="run in verbose mode")
+ parser.add_option ("-x","--extra-verbose",dest='extra_verbose',action='store_true',default=False,
+ help="run in extra verbose mode")
+ parser.add_option ("-n","--dry-run",dest='dry_run',action='store_true',default=False,
+ help="dry run")
+ (options, args) = parser.parse_args()
+ if options.extra_verbose: options.verbose=True
+
+ # args can be directories, or patterns, like
+ # main /db-backup /db-backup-f8/*bz2
+ # in any case we handle each arg completely separately
+ index = Index (options)
+ for arg in args:
+ if os.path.isdir (arg):
+ handle_arg (index, arg, "*")
+ else:
+ (dir,pattern)=os.path.split(arg)
+ if not dir: dir='.'
+ handle_arg (index, dir, pattern)
+ index.epilogue()
+ index.show()
+
+# run main() only when this file is executed as a script
+if __name__ == '__main__':
+ main()