add scrappy statistics gathering scripts
[monitor.git] / statistics / rtsurvey.py
1 #!/usr/bin/python
2
3 import os, sys, shutil
4 import MySQLdb
5 import string
6
7 import re
8
9 import time 
10 from datetime import datetime
11
12 from monitor import config
13 from monitor import database
14
15 def convert_time(time_str):
16         if '-' in str:
17                 try:
18                         tup = time.strptime(str, "%Y-%m-%d %H:%M:%S")
19                 except:
20                         tup = time.strptime(str, "%Y-%m-%d-%H:%M")
21         elif '/' in str:
22                 tup = time.strptime(str, "%m/%d/%Y")
23         else:
24                 tup = time.strptime(str, "%m/%d/%Y")
25         d_ret = datetime.fromtimestamp(time.mktime(tup))
26         return d_ret
27
28 def open_rt_db():
29
30         try:
31                 rt_db = MySQLdb.connect(host=config.RT_DB_HOST,
32                                                                 user=config.RT_DB_USER,
33                                                                 passwd=config.RT_DB_PASSWORD,
34                                                                 db=config.RT_DB_NAME)
35         except Exception, err:
36                 print "Failed to connect to RT database: %s" %err
37                 return -1
38
39         return rt_db
40
41 def fetch_from_db(db, sql):
42         try:
43                 # create a 'cursor' (required by MySQLdb)
44                 c = db.cursor()
45                 c.execute(sql)
46         except Exception, err:
47                 print "Could not execute RT query %s" %err
48                 return -1
49
50         # fetch all rows (list of lists)
51         raw = c.fetchall()
52         return raw
53         
54
55 def get_rt_tickets():
56         print "open db connection"
57         db = open_rt_db()
58         if db == -1:
59                 return ""
60
61         sql = """SELECT tk.id, tk.Queue, tr.Type, tr.Field, tr.OldValue, tr.NewValue, 
62                     tr.Created, at.id, at.Subject, at.Content
63              FROM Tickets as tk, Transactions as tr 
64              LEFT OUTER JOIN Attachments as at ON tr.id=at.TransactionId 
65              WHERE (tk.Queue=3 OR tk.Queue=22) AND tk.id=tr.ObjectId AND tk.id>10000  """
66
67         print "run query"
68         raw = fetch_from_db(db, sql)
69         if raw == -1:
70                 return raw
71         
72         tickets = {}
73         subject_map = {}
74         def parse_ticket(x):
75                 ticket_id = int(x[0])
76                 queue = int(x[1])
77                 trtype = str(x[2])
78                 field = x[3]
79                 oldvalue = x[4]
80                 newvalue = x[5]
81                 datecreated = x[6]              # already a datetime object
82                 attachmentid = x[7]
83                 subject = x[8]
84                 content = x[9]
85
86                 if ticket_id not in tickets:
87                         print "found new ticket_id", ticket_id
88                         tickets[ticket_id] = {'queue' : queue,
89                                                                   'transactions' : [] }
90
91                 if subject != "":
92                         subject_map[ticket_id] = subject
93                 elif ticket_id in subject_map:
94                         subject = subject_map[ticket_id]
95                 else:
96                         # subject == "" and no record in subject_map yet
97                         # should probably put on a queue to be processed later.
98                         print "no subject for %s" % ticket_id
99
100                 transaction = {
101                                         'type' : trtype,
102                                         'field' : field,
103                                         'oldvalue' : oldvalue,
104                                         'newvalue' : newvalue,
105                                         'datecreated' : datecreated,
106                                         'attachmentid' : attachmentid,
107                                         'subject' : subject,
108                                         'content' : content,
109                                                 }
110                 tickets[ticket_id]['transactions'].append(transaction)
111                 
112
113         print "sort data"
114         list = map(parse_ticket, raw)
115
116         # map(lambda x: { "email":str(x[4]), "lastupdated":str(x[5]), "owner":str(x[7]), }, raw)
117
118         db.close()
119
120
121         return tickets
122
123
124 # flow chart:
125 #               classify:
126 #                       for each ticket
127 #                               classify into category
128 #                               remove from ticket set, add to classified-set
129 #               
130 #               add new search patterns, 
131 #               re-run classify algorithm
132
133 re_map = [
134         #('mom', {'pattern' : '.*pl_mom.*'}),
135         #('technical-support', {'pattern' : '.*PlanetLab node.* down'}),
136         #('technical-support', {'pattern' : 'Node .* was stopped by'}),  # and opened
137         #('technical-support', {'pattern' : 'bootcd|BootCD|bootCD|boot cd|boot CD|booting'}),
138         #('technical-support', {'pattern' : '.* failed to authenticate'}),
139         #('technical-support', {'pattern' : '.* fails to boot'}),
140         #('technical-support', {'pattern' : '.* fail.* to boot'}),
141         #('technical-support', {'pattern' : '.* failed to authenticate'}),
142         #('technical-support', {'pattern' : 'curl (60)|.* CA certificates.*|peer certificate.*authenticated'}),
143         #('technical-support', {'pattern' : '(usb|USB).*(key|Disk|stick|boot|help|problem|trouble)'}), 
144         #('complaint', {'pattern' : '.*omplaint|.*attack'}),
145         #('complaint', {'pattern' : '.* stop .*'}), # and subject
146         #('spam', {}),j
147         #('user-support', {'pattern' : '(R|r)egistration|(R|r)egister'}),
148         #('user-support', {'pattern' : 'password reset|reset password'}),
149         ('user-support', {'pattern' : 'New PI account registration from'}),
150         #('other', {}),
151 ]
152
153 def sort_tickets(tickets, re_map):
154
155         ticket_count = len(tickets.keys())
156         marked_subject = 0
157         marked_content = 0
158         for ticket_id in sorted(tickets.keys()):
159                 for i,(name, pattern) in enumerate(re_map):
160                         if 'compile' not in pattern:
161                                 pattern['compile'] = re.compile(pattern['pattern'])
162                         pat = pattern['compile']
163                         for transaction in tickets[ticket_id]['transactions']:
164
165                                 try:
166                                         if transaction['subject'] and re.match(pat, transaction['subject']):
167                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
168                                                                 pattern['pattern'], transaction['subject'])
169                                                 marked_subject += 1
170                                                 break
171                                         if transaction['content'] and re.match(pat, transaction['content']):
172                                                 print "ticket %s matches pattern %s: %s" % (ticket_id, 
173                                                                 pattern['pattern'], transaction['subject'])
174                                                 #if transaction['subject'] == "":
175                                                 #       print transaction
176                                                 marked_content += 1
177                                                 break
178                                 except:
179                                         import traceback
180                                         print traceback.print_exc()
181                                         print transaction
182                                         print ticket_id
183                                         print pattern
184                                         sys.exit(1)
185
186         print ticket_count
187         print marked_subject
188         print marked_content
189         print ticket_count - marked_content - marked_content
190
191 def main():
192         from optparse import OptionParser
193         parser = OptionParser()
194
195         parser.set_defaults(runsql=False,)
196
197         parser.add_option("", "--runsql", dest="runsql", action="store_true",
198                                                 help="Whether to collect data from the MySQL server before "+
199                                                         "caching it, or to just use the previously collected data.")
200
201         (config, args) = parser.parse_args()
202         if len(sys.argv) == 1:
203                 parser.print_help()
204                 sys.exit(1)
205
206         for i,(name, pattern) in enumerate(re_map):
207                 print i, name
208
209         if config.runsql:
210                 tickets = get_rt_tickets()
211                 database.dbDump("survey_tickets", tickets)
212         else:
213                 print "loading"
214                 tickets = database.dbLoad("survey_tickets")
215         print tickets[42171]['transactions'][0]
216
217         sort_tickets(tickets, re_map)
218
219         # for each ticket id
220         #       scan for known keywords and sort into classes
221         #       record assigned class
222
223         # review all tickets that remain
224
225 if __name__ == '__main__':
226         main()