commented out unused value
[monitor.git] / statistics / rtsurvey.py
1 #!/usr/bin/python
2
3 import os, sys, shutil
4 import MySQLdb
5 import string
6
7 import re
8
9 import time 
10 from datetime import datetime
11
12 from monitor import config
13 from monitor import database
14
15 def convert_time(time_str):
16         if '-' in str:
17                 try:
18                         tup = time.strptime(str, "%Y-%m-%d %H:%M:%S")
19                 except:
20                         tup = time.strptime(str, "%Y-%m-%d-%H:%M")
21         elif '/' in str:
22                 tup = time.strptime(str, "%m/%d/%Y")
23         else:
24                 tup = time.strptime(str, "%m/%d/%Y")
25         d_ret = datetime.fromtimestamp(time.mktime(tup))
26         return d_ret
27
28 def open_rt_db():
29
30         try:
31                 rt_db = MySQLdb.connect(host=config.RT_DB_HOST,
32                                                                 user=config.RT_DB_USER,
33                                                                 passwd=config.RT_DB_PASSWORD,
34                                                                 db=config.RT_DB_NAME)
35         except Exception, err:
36                 print "Failed to connect to RT database: %s" %err
37                 return -1
38
39         return rt_db
40
41 def fetch_from_db(db, sql):
42         try:
43                 # create a 'cursor' (required by MySQLdb)
44                 c = db.cursor()
45                 c.execute(sql)
46         except Exception, err:
47                 print "Could not execute RT query %s" %err
48                 return -1
49
50         # fetch all rows (list of lists)
51         raw = c.fetchall()
52         return raw
53         
54
55 def get_rt_tickets():
56         print "open db connection"
57         db = open_rt_db()
58         if db == -1:
59                 return ""
60
61         sql = """SELECT tk.id, tk.Queue, tr.Type, tr.Field, tr.OldValue, tr.NewValue, 
62                     tr.Created, at.id, at.Subject, at.Content, us.Name
63              FROM Tickets as tk, Transactions as tr, Users as us
64              LEFT OUTER JOIN Attachments as at ON tr.id=at.TransactionId 
65              WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND 
66                          us.id=tr.Creator"""
67              #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>39896 AND tk.id<42241 AND ## (oct15th2008) 
68              #WHERE (tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>40800 AND ## (1st3months)
69              #WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>10000  """
70
71         print "run query"
72         raw = fetch_from_db(db, sql)
73         if raw == -1:
74                 return raw
75         
76         tickets = {}
77         subject_map = {}
78         def parse_ticket(x):
79                 ticket_id = int(x[0])
80                 queue = int(x[1])
81                 trtype = str(x[2])
82                 field = x[3]
83                 oldvalue = x[4]
84                 newvalue = x[5]
85                 datecreated = x[6]              # already a datetime object
86                 attachmentid = x[7]
87                 subject = x[8]
88                 content = x[9]
89                 creator = x[10]
90
91                 if ticket_id not in tickets:
92                         print "found new ticket_id", ticket_id
93                         tickets[ticket_id] = {'queue' : queue,
94                                                                   'transactions' : [] }
95
96                 if subject != "":
97                         subject_map[ticket_id] = subject
98                 elif ticket_id in subject_map:
99                         subject = subject_map[ticket_id]
100                 else:
101                         # subject == "" and no record in subject_map yet
102                         # should probably put on a queue to be processed later.
103                         print "no subject for %s" % ticket_id
104
105                 transaction = {
106                                         'type' : trtype,
107                                         'field' : field,
108                                         'oldvalue' : oldvalue,
109                                         'newvalue' : newvalue,
110                                         'datecreated' : datecreated,
111                                         'attachmentid' : attachmentid,
112                                         'creator' : creator,
113                                         'subject' : subject,
114                                         'content' : content,
115                                                 }
116                 tickets[ticket_id]['transactions'].append(transaction)
117                 
118
119         print "sort data"
120         list = map(parse_ticket, raw)
121
122         db.close()
123
124
125         return tickets
126
127
128 # flow chart:
129 #               classify:
130 #                       for each ticket
131 #                               classify into category
132 #                               remove from ticket set, add to classified-set
133 #               
134 #               add new search patterns, 
135 #               re-run classify algorithm
136
137 re_map = [
138         #('mom', {'pattern' : '.*pl_mom.*'}),
139         #('technical-support', {'pattern' : '.*PlanetLab node.* down'}),
140         #('technical-support', {'pattern' : 'Node .* was stopped by'}),  # and opened
141         #('technical-support', {'pattern' : 'bootcd|BootCD|bootCD|boot cd|boot CD|booting'}),
142         #('technical-support', {'pattern' : '.* failed to authenticate'}),
143         #('technical-support', {'pattern' : '.* fails to boot'}),
144         #('technical-support', {'pattern' : '.* fail.* to boot'}),
145         #('technical-support', {'pattern' : '.* failed to authenticate'}),
146         #('technical-support', {'pattern' : 'curl (60)|.* CA certificates.*|peer certificate.*authenticated'}),
147         #('technical-support', {'pattern' : '(usb|USB).*(key|Disk|stick|boot|help|problem|trouble)'}), 
148         #('complaint', {'pattern' : '.*omplaint|.*attack'}),
149         #('complaint', {'pattern' : '.* stop .*'}), # and subject
150         #('spam', {}),j
151         #('user-support', {'pattern' : '(R|r)egistration|(R|r)egister'}),
152         #('user-support', {'pattern' : 'password reset|reset password'}),
153         ('user-support', {'pattern' : 'New PI account registration from'}),
154         #('other', {}),
155 ]
156
157 def sort_tickets(tickets, re_map):
158
159         ticket_count = len(tickets.keys())
160         marked_subject = 0
161         marked_content = 0
162         for ticket_id in sorted(tickets.keys()):
163                 for i,(name, pattern) in enumerate(re_map):
164                         if 'compile' not in pattern:
165                                 pattern['compile'] = re.compile(pattern['pattern'])
166                         pat = pattern['compile']
167                         for transaction in tickets[ticket_id]['transactions']:
168
169                                 try:
170                                         if transaction['subject'] and re.match(pat, transaction['subject']):
171                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
172                                                                 pattern['pattern'], transaction['subject'])
173                                                 marked_subject += 1
174                                                 break
175                                         if transaction['content'] and re.match(pat, transaction['content']):
176                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
177                                                                 pattern['pattern'], transaction['subject'])
178                                                 #if transaction['subject'] == "":
179                                                 #       print transaction
180                                                 marked_content += 1
181                                                 break
182                                 except:
183                                         import traceback
184                                         print traceback.print_exc()
185                                         print transaction
186                                         print ticket_id
187                                         print pattern
188                                         sys.exit(1)
189
190         print ticket_count
191         print marked_subject
192         print marked_content
193         print ticket_count - marked_content - marked_content
194
195 def main():
196         from optparse import OptionParser
197         parser = OptionParser()
198
199         parser.set_defaults(runsql=False,)
200
201         parser.add_option("", "--runsql", dest="runsql", action="store_true",
202                                                 help="Whether to collect data from the MySQL server before "+
203                                                         "caching it, or to just use the previously collected data.")
204
205         (config, args) = parser.parse_args()
206         if len(sys.argv) == 1:
207                 parser.print_help()
208                 sys.exit(1)
209
210         for i,(name, pattern) in enumerate(re_map):
211                 print i, name
212
213         if config.runsql:
214                 tickets = get_rt_tickets()
215                 database.dbDump("survey_tickets", tickets)
216         else:
217                 print "loading"
218                 tickets = database.dbLoad("survey_tickets")
219         #print tickets[42171]['transactions'][0]
220
221         #sort_tickets(tickets, re_map)
222
223         # for each ticket id
224         #       scan for known keywords and sort into classes
225         #       record assigned class
226
227         # review all tickets that remain
228
229 if __name__ == '__main__':
230         main()