clearer names for actions, and infer actions better
[monitor.git] / statistics / rtsurvey.py
1 #!/usr/bin/python
2
3 import os, sys, shutil
4 import MySQLdb
5 import string
6
7 import re
8
9 import time 
10 from datetime import datetime
11
12 from monitor import config
13 from monitor import database
14
15 def convert_time(time_str):
16         if '-' in str:
17                 try:
18                         tup = time.strptime(str, "%Y-%m-%d %H:%M:%S")
19                 except:
20                         tup = time.strptime(str, "%Y-%m-%d-%H:%M")
21         elif '/' in str:
22                 tup = time.strptime(str, "%m/%d/%Y")
23         else:
24                 tup = time.strptime(str, "%m/%d/%Y")
25         d_ret = datetime.fromtimestamp(time.mktime(tup))
26         return d_ret
27
28 def open_rt_db():
29
30         try:
31                 rt_db = MySQLdb.connect(host=config.RT_DB_HOST,
32                                                                 user=config.RT_DB_USER,
33                                                                 passwd=config.RT_DB_PASSWORD,
34                                                                 db=config.RT_DB_NAME)
35         except Exception, err:
36                 print "Failed to connect to RT database: %s" %err
37                 return -1
38
39         return rt_db
40
41 def fetch_from_db(db, sql):
42         try:
43                 # create a 'cursor' (required by MySQLdb)
44                 c = db.cursor()
45                 c.execute(sql)
46         except Exception, err:
47                 print "Could not execute RT query %s" %err
48                 return -1
49
50         # fetch all rows (list of lists)
51         raw = c.fetchall()
52         return raw
53         
54
55 def get_rt_tickets():
56         print "open db connection"
57         db = open_rt_db()
58         if db == -1:
59                 return ""
60
61         sql = """SELECT tk.id, tk.Queue, tr.Type, tr.Field, tr.OldValue, tr.NewValue, 
62                     tr.Created, at.id, at.Subject, at.Content, us.Name
63              FROM Tickets as tk, Transactions as tr, Users as us
64              LEFT OUTER JOIN Attachments as at ON tr.id=at.TransactionId 
65              WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>0 AND
66                          us.id=tr.Creator"""
67              #WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>12506 AND
68              # WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND 
69              #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>39896 AND tk.id<42241 AND ## (oct15th2008) 
70              #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND ## (1st3months)
71              #WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>12506  # 12506 jan-1-2006
72
73         print "run query"
74         raw = fetch_from_db(db, sql)
75         if raw == -1:
76                 return raw
77         
78         tickets = {}
79         subject_map = {}
80         def parse_ticket(x):
81                 ticket_id = int(x[0])
82                 queue = int(x[1])
83                 trtype = str(x[2])
84                 field = x[3]
85                 oldvalue = x[4]
86                 newvalue = x[5]
87                 datecreated = x[6]              # already a datetime object
88                 attachmentid = x[7]
89                 subject = x[8]
90                 content = x[9]
91                 creator = x[10]
92
93                 if ticket_id not in tickets:
94                         print "found new ticket_id", ticket_id
95                         tickets[ticket_id] = {'queue' : queue,
96                                                                   'transactions' : [] }
97
98                 if subject != "":
99                         subject_map[ticket_id] = subject
100                 elif ticket_id in subject_map:
101                         subject = subject_map[ticket_id]
102                 else:
103                         # subject == "" and no record in subject_map yet
104                         # should probably put on a queue to be processed later.
105                         print "no subject for %s" % ticket_id
106
107                 transaction = {
108                                         'type' : trtype,
109                                         'field' : field,
110                                         'oldvalue' : oldvalue,
111                                         'newvalue' : newvalue,
112                                         'datecreated' : datecreated,
113                                         'attachmentid' : attachmentid,
114                                         'creator' : creator,
115                                         'subject' : subject,
116                                         'content' : content,
117                                                 }
118                 tickets[ticket_id]['transactions'].append(transaction)
119                 
120
121         print "sort data"
122         list = map(parse_ticket, raw)
123
124         db.close()
125
126
127         return tickets
128
129
130 # flow chart:
131 #               classify:
132 #                       for each ticket
133 #                               classify into category
134 #                               remove from ticket set, add to classified-set
135 #               
136 #               add new search patterns, 
137 #               re-run classify algorithm
138
139 re_map = [
140         #('mom', {'pattern' : '.*pl_mom.*'}),
141         #('technical-support', {'pattern' : '.*PlanetLab node.* down'}),
142         #('technical-support', {'pattern' : 'Node .* was stopped by'}),  # and opened
143         #('technical-support', {'pattern' : 'bootcd|BootCD|bootCD|boot cd|boot CD|booting'}),
144         #('technical-support', {'pattern' : '.* failed to authenticate'}),
145         #('technical-support', {'pattern' : '.* fails to boot'}),
146         #('technical-support', {'pattern' : '.* fail.* to boot'}),
147         #('technical-support', {'pattern' : '.* failed to authenticate'}),
148         #('technical-support', {'pattern' : 'curl (60)|.* CA certificates.*|peer certificate.*authenticated'}),
149         #('technical-support', {'pattern' : '(usb|USB).*(key|Disk|stick|boot|help|problem|trouble)'}), 
150         #('complaint', {'pattern' : '.*omplaint|.*attack'}),
151         #('complaint', {'pattern' : '.* stop .*'}), # and subject
152         #('spam', {}),j
153         #('user-support', {'pattern' : '(R|r)egistration|(R|r)egister'}),
154         #('user-support', {'pattern' : 'password reset|reset password'}),
155         ('user-support', {'pattern' : 'New PI account registration from'}),
156         #('other', {}),
157 ]
158
159 def sort_tickets(tickets, re_map):
160
161         ticket_count = len(tickets.keys())
162         marked_subject = 0
163         marked_content = 0
164         for ticket_id in sorted(tickets.keys()):
165                 for i,(name, pattern) in enumerate(re_map):
166                         if 'compile' not in pattern:
167                                 pattern['compile'] = re.compile(pattern['pattern'])
168                         pat = pattern['compile']
169                         for transaction in tickets[ticket_id]['transactions']:
170
171                                 try:
172                                         if transaction['subject'] and re.match(pat, transaction['subject']):
173                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
174                                                                 pattern['pattern'], transaction['subject'])
175                                                 marked_subject += 1
176                                                 break
177                                         if transaction['content'] and re.match(pat, transaction['content']):
178                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
179                                                                 pattern['pattern'], transaction['subject'])
180                                                 #if transaction['subject'] == "":
181                                                 #       print transaction
182                                                 marked_content += 1
183                                                 break
184                                 except:
185                                         import traceback
186                                         print traceback.print_exc()
187                                         print transaction
188                                         print ticket_id
189                                         print pattern
190                                         sys.exit(1)
191
192         print ticket_count
193         print marked_subject
194         print marked_content
195         print ticket_count - marked_content - marked_content
196
197 def main():
198         from optparse import OptionParser
199         parser = OptionParser()
200
201         parser.set_defaults(runsql=False,)
202
203         parser.add_option("", "--runsql", dest="runsql", action="store_true",
204                                                 help="Whether to collect data from the MySQL server before "+
205                                                         "caching it, or to just use the previously collected data.")
206
207         (config, args) = parser.parse_args()
208         if len(sys.argv) == 1:
209                 parser.print_help()
210                 sys.exit(1)
211
212         #for i,(name, pattern) in enumerate(re_map):
213         #       print i, name
214
215         if config.runsql:
216                 tickets = get_rt_tickets()
217                 database.dbDump("survey_tickets", tickets)
218         else:
219                 print "loading"
220                 tickets = database.dbLoad("survey_tickets")
221         #print tickets[42171]['transactions'][0]
222
223         #sort_tickets(tickets, re_map)
224
225         # for each ticket id
226         #       scan for known keywords and sort into classes
227         #       record assigned class
228
229         # review all tickets that remain
230
231 if __name__ == '__main__':
232         main()