clearer names for actions, and infer actions better
[monitor.git] / statistics / rt_s1_raw_dump.py
1 #!/usr/bin/python
2
3 import os, sys, shutil
4 import MySQLdb
5 import string
6
7 import re
8
9 import time 
10 from datetime import datetime
11
12 from monitor import config
13 from monitor import database
14
15 def convert_time(time_str):
16         if '-' in str:
17                 try:
18                         tup = time.strptime(str, "%Y-%m-%d %H:%M:%S")
19                 except:
20                         tup = time.strptime(str, "%Y-%m-%d-%H:%M")
21         elif '/' in str:
22                 tup = time.strptime(str, "%m/%d/%Y")
23         else:
24                 tup = time.strptime(str, "%m/%d/%Y")
25         d_ret = datetime.fromtimestamp(time.mktime(tup))
26         return d_ret
27
28 def open_rt_db():
29
30         try:
31                 rt_db = MySQLdb.connect(host=config.RT_DB_HOST,
32                                                                 user=config.RT_DB_USER,
33                                                                 passwd=config.RT_DB_PASSWORD,
34                                                                 db=config.RT_DB_NAME)
35         except Exception, err:
36                 print "Failed to connect to RT database: %s" %err
37                 return -1
38
39         return rt_db
40
41 def fetch_from_db(db, sql):
42         try:
43                 # create a 'cursor' (required by MySQLdb)
44                 c = db.cursor()
45                 c.execute(sql)
46         except Exception, err:
47                 print "Could not execute RT query %s" %err
48                 return -1
49
50         # fetch all rows (list of lists)
51         raw = c.fetchall()
52         return raw
53         
54
55 def get_rt_tickets(date):
56         print "open db connection"
57         db = open_rt_db()
58         if db == -1:
59                 return ""
60
61         if date is None:
62                 date_select = ""
63         else:
64                 #date_select = "AND tr.Created >= DATE_SUB(CURDATE(),INTERVAL %s DAY)" % date
65                 date_select = "AND tr.Created >= STR_TO_DATE('%s', '%%Y-%%m-%%d')" % date
66
67         sql = """SELECT tk.id, tk.Queue, tr.Type, tr.Field, tr.OldValue, tr.NewValue, 
68                                         tr.Created, at.id, at.Subject, at.Content, us.Name
69                          FROM Tickets as tk, Transactions as tr, Users as us
70                          LEFT OUTER JOIN Attachments as at ON tr.id=at.TransactionId 
71                          WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>0 AND
72                          us.id=tr.Creator %s""" % date_select
73                          #WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>12506 AND
74                          # WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND 
75                          #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>39896 AND tk.id<42241 AND ## (oct15th2008) 
76                          #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND ## (1st3months)
77                          #WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>12506  # 12506 jan-1-2006
78
79         print sql
80         print "run query"
81         raw = fetch_from_db(db, sql)
82         if raw == -1:
83                 return raw
84         
85         tickets = {}
86         subject_map = {}
87         def parse_ticket(x):
88                 ticket_id = int(x[0])
89                 queue = int(x[1])
90                 trtype = str(x[2])
91                 field = x[3]
92                 oldvalue = x[4]
93                 newvalue = x[5]
94                 datecreated = x[6]              # already a datetime object
95                 attachmentid = x[7]
96                 subject = x[8]
97                 content = x[9]
98                 creator = x[10]
99
100                 if ticket_id not in tickets:
101                         print "found new ticket_id", ticket_id
102                         tickets[ticket_id] = {'queue' : queue,
103                                                                   'transactions' : [] }
104
105                 if subject != "":
106                         subject_map[ticket_id] = subject
107                 elif ticket_id in subject_map:
108                         subject = subject_map[ticket_id]
109                 else:
110                         # subject == "" and no record in subject_map yet
111                         # should probably put on a queue to be processed later.
112                         print "no subject for %s" % ticket_id
113
114                 transaction = {
115                                         'type' : trtype,
116                                         'field' : field,
117                                         'oldvalue' : oldvalue,
118                                         'newvalue' : newvalue,
119                                         'datecreated' : datecreated,
120                                         'attachmentid' : attachmentid,
121                                         'creator' : creator,
122                                         'subject' : subject,
123                                         'content' : content,
124                                                 }
125                 tickets[ticket_id]['transactions'].append(transaction)
126                 
127
128         print "sort data"
129         list = map(parse_ticket, raw)
130
131         db.close()
132
133
134         return tickets
135
136
137 # flow chart:
138 #               classify:
139 #                       for each ticket
140 #                               classify into category
141 #                               remove from ticket set, add to classified-set
142 #               
143 #               add new search patterns, 
144 #               re-run classify algorithm
145
146 re_map = [
147         #('mom', {'pattern' : '.*pl_mom.*'}),
148         #('technical-support', {'pattern' : '.*PlanetLab node.* down'}),
149         #('technical-support', {'pattern' : 'Node .* was stopped by'}),  # and opened
150         #('technical-support', {'pattern' : 'bootcd|BootCD|bootCD|boot cd|boot CD|booting'}),
151         #('technical-support', {'pattern' : '.* failed to authenticate'}),
152         #('technical-support', {'pattern' : '.* fails to boot'}),
153         #('technical-support', {'pattern' : '.* fail.* to boot'}),
154         #('technical-support', {'pattern' : '.* failed to authenticate'}),
155         #('technical-support', {'pattern' : 'curl (60)|.* CA certificates.*|peer certificate.*authenticated'}),
156         #('technical-support', {'pattern' : '(usb|USB).*(key|Disk|stick|boot|help|problem|trouble)'}), 
157         #('complaint', {'pattern' : '.*omplaint|.*attack'}),
158         #('complaint', {'pattern' : '.* stop .*'}), # and subject
159         #('spam', {}),j
160         #('user-support', {'pattern' : '(R|r)egistration|(R|r)egister'}),
161         #('user-support', {'pattern' : 'password reset|reset password'}),
162         ('user-support', {'pattern' : 'New PI account registration from'}),
163         #('other', {}),
164 ]
165
166 def sort_tickets(tickets, re_map):
167
168         ticket_count = len(tickets.keys())
169         marked_subject = 0
170         marked_content = 0
171         for ticket_id in sorted(tickets.keys()):
172                 for i,(name, pattern) in enumerate(re_map):
173                         if 'compile' not in pattern:
174                                 pattern['compile'] = re.compile(pattern['pattern'])
175                         pat = pattern['compile']
176                         for transaction in tickets[ticket_id]['transactions']:
177
178                                 try:
179                                         if transaction['subject'] and re.match(pat, transaction['subject']):
180                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
181                                                                 pattern['pattern'], transaction['subject'])
182                                                 marked_subject += 1
183                                                 break
184                                         if transaction['content'] and re.match(pat, transaction['content']):
185                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
186                                                                 pattern['pattern'], transaction['subject'])
187                                                 #if transaction['subject'] == "":
188                                                 #       print transaction
189                                                 marked_content += 1
190                                                 break
191                                 except:
192                                         import traceback
193                                         print traceback.print_exc()
194                                         print transaction
195                                         print ticket_id
196                                         print pattern
197                                         sys.exit(1)
198
199         print ticket_count
200         print marked_subject
201         print marked_content
202         print ticket_count - marked_content - marked_content
203
204 def main():
205         from optparse import OptionParser
206         parser = OptionParser()
207
208         parser.set_defaults(runsql=False,date=None)
209
210         parser.add_option("", "--runsql", dest="runsql", action="store_true",
211                                                 help="Whether to collect data from the MySQL server before "+
212                                                         "caching it, or to just use the previously collected data.")
213         parser.add_option("", "--date", dest="date", 
214                                                 help="Query all transactions after the given date.")
215
216         (config, args) = parser.parse_args()
217         if len(sys.argv) == 1:
218                 parser.print_help()
219                 sys.exit(1)
220
221         if config.runsql:
222                 tickets = get_rt_tickets(config.date)
223                 database.dbDump("survey_tickets", tickets)
224         else:
225                 print "loading"
226                 tickets = database.dbLoad("survey_tickets")
227
228 if __name__ == '__main__':
229         main()