4befbd94b3452fe6bed2c39bc76e29c4ab61abeb
[monitor.git] / policy.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
7 # Take the ng name as an argument....
8 # optionally, 
9 #  * get a list of nodes in the given nodegroup.
10 #  * set some or all in the set to rins.
11 #  * restart them all.
12 #  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 from monitor import config
22 from monitor import parser as parsermodule
23 from monitor.common import *
24 from monitor.model import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.database.info.model import *
28 from monitor.database.info.interface import *
29
30 from nodequery import verify,query_to_dict,node_select
31
# Module-level PLC API handle, created as a side effect of importing/running
# this file.  NOTE(review): presumably an authenticated connection (going by
# the name getAuthAPI); `api` is not referenced anywhere else in the visible
# part of this file -- confirm whether other modules rely on it.
api = plc.getAuthAPI()
33
def logic():
        """Call plc.nodeBootState(host, 'rins') and then node_end_record(host).

        Takes no arguments and returns None.
        """
        # NOTE(review): `host` is neither a parameter nor assigned in this
        # function, so it is resolved as a module-level global.  No such global
        # is defined in the visible part of this file (main() only binds `host`
        # as a local), so calling this as-is would presumably raise NameError
        # -- confirm before use.
        # NOTE(review): node_end_record is not defined in this file; presumably
        # it arrives through one of the wildcard imports -- verify.

        plc.nodeBootState(host, 'rins')
        node_end_record(host)
38
def _apply_node_policy(index, host):
    """Evaluate the per-node policy rules for a single hostname.

    index -- position of the host in the input list (only used for output).
    host  -- hostname to evaluate.

    Hosts that are missing from plccache.plcdb_hn2lb, or that have an
    unexpired BlacklistRecord, are reported and skipped without a flush.
    Otherwise each rule below is checked in order against the node's
    history record and the site's recent actions, and the session is
    flushed afterwards.

    NOTE(review): the numeric thresholds passed to changed_* / found_* are
    presumably expressed in days (the comments in this file talk about
    "0.5 days") -- confirm against monitor.common.
    """
    try:
        lb = plccache.plcdb_hn2lb[host]
    except Exception:
        # Catch Exception rather than everything: a bare except here would
        # also swallow KeyboardInterrupt/SystemExit and defeat the caller's
        # interrupt handling.
        print("unknown host in plcdb_hn2lb %s" % host)
        return

    nodeblack = BlacklistRecord.get_by(hostname=host)
    if nodeblack and not nodeblack.expired():
        print("skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire()))
        return

    sitehist = SiteInterface.get_or_make(loginbase=lb)
    # The action list is fetched once; actions triggered by the rules below
    # are not reflected in it during this pass.
    recent_actions = sitehist.getRecentActions(hostname=host)
    nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

    print("%s %s %s" % (index, nodehist.hostname, nodehist.status))

    # Rule: node recently became good -> send the online notice.
    # NOTE: there is a narrow window in which this rule must be evaluated,
    # otherwise the notice will not go out.  This is not ideal.
    if nodehist.status == 'good' and \
            changed_lessthan(nodehist.last_changed, 1.0) and \
            not found_within(recent_actions, 'online_notice', 0.5):
        # NOTE(review): `viart` may be a typo for another keyword; it is
        # passed through unchanged -- verify against sendMessage().
        sitehist.sendMessage('online_notice', hostname=host, viart=False)
        print("send message for host %s online" % host)

    # Rule: node offline/down -> first reboot attempt.
    if nodehist.status in ('offline', 'down') and \
            changed_greaterthan(nodehist.last_changed, 1.0) and \
            not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
        sitehist.attemptReboot(host)
        print("send message for host %s first_try_reboot" % host)

    # Rule: reboot was already attempted and node is still offline/down ->
    # send the PCU failure notice.
    # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
    #       will be false for a day after the rule above is satisfied.
    if nodehist.status in ('offline', 'down') and \
            changed_greaterthan(nodehist.last_changed, 1.5) and \
            found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
            not found_within(recent_actions, 'pcufailed_notice', 3.5):
        sitehist.sendMessage('pcufailed_notice', hostname=host)
        print("send message for host %s PCU Failure" % host)

    # Rule: node stuck in monitordebug -> re-run the boot manager, unless a
    # bootmanager_restore action was recorded in the last 0.5 (delay before
    # retrying).
    if nodehist.status == 'monitordebug' and \
            changed_greaterthan(nodehist.last_changed, 1) and \
            not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
        print("send message for host %s bootmanager_restore" % host)
        sitehist.runBootManager(host)

    # Rule: node down -> send the down-node notice.
    if nodehist.status == 'down' and \
            changed_greaterthan(nodehist.last_changed, 2) and \
            not found_within(recent_actions, 'down_notice', 3.5):
        sitehist.sendMessage('down_notice', hostname=host)
        print("send message for host %s down" % host)

    session.flush()


def _apply_site_policy(index, site):
    """Evaluate the per-site penalty rules for a single loginbase.

    index -- position of the site in the input list (only used for output).
    site  -- site loginbase to evaluate.

    Sites with an unexpired BlacklistRecord are reported and skipped
    without a flush; otherwise the session is flushed after the rules run.
    """
    # The site interface is obtained before the blacklist check, so
    # get_or_make() runs even for blacklisted sites.
    sitehist = SiteInterface.get_or_make(loginbase=site)
    siteblack = BlacklistRecord.get_by(loginbase=site)

    if siteblack and not siteblack.expired():
        print("skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire()))
        return

    # TODO: make query only return records within a certain time range,
    #       i.e. greater than 0.5 days ago. or 5 days, etc.
    recent_actions = sitehist.getRecentActions(loginbase=site)

    print("%s %s %s" % (index, sitehist.db.loginbase, sitehist.db.status))

    # Rule: site down -> escalate the penalty, unless escalation was paused
    # or already increased recently.
    if sitehist.db.status == 'down':
        if not found_within(recent_actions, 'pause_penalty', 30) and \
                not found_within(recent_actions, 'increase_penalty', 7) and \
                changed_greaterthan(sitehist.db.last_changed, 7):
            # TODO: catch errors
            sitehist.increasePenalty()
            sitehist.sendMessage('increase_penalty')
            print("send message for site %s penalty increase" % site)

    # Rule: site good -> clear any applied penalty.
    # NOTE: because 'all clear' should have an indefinite status, this is
    #       driven by a boolean on the record rather than a recent action.
    if sitehist.db.status == 'good':
        if sitehist.db.penalty_applied:
            sitehist.clearPenalty()
            sitehist.sendMessage('clear_penalty')
            sitehist.closeTicket()
            print("send message for site %s penalty cleared" % site)

    # Rule: an open message exists and a penalty level is set -> pause
    # further escalation.
    if sitehist.db.message_id != 0 and \
            sitehist.db.message_status == 'open' and \
            sitehist.db.penalty_level > 0:
        print("Pausing penalties for %s" % site)
        sitehist.pausePenalty()

    session.flush()


def main(hostnames, sitenames):
    """Apply the node policy to every hostname, then the site policy to
    every site loginbase.

    hostnames -- iterable of node hostnames.
    sitenames -- iterable of site loginbases.

    Progress is printed to stdout.  The session is flushed after each
    processed node/site and once more at the end.  Returns None.
    """
    for index, host in enumerate(hostnames):
        _apply_node_policy(index, host)

    for index, site in enumerate(sitenames):
        _apply_site_policy(index, site)

    session.flush()
172
173
if __name__ == "__main__":
    # Script entry point: build the option parser, choose the hostnames and
    # site loginbases to evaluate, then run main() over them.
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(timewait=0,
                        skip=0,
                        rins=False,
                        reboot=False,
                        findbad=False,
                        force=False,
                        nosetup=False,
                        verbose=False,
                        quiet=False,)

    parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                      help="The select string that must evaluate to true for the node to be considered 'done'")
    parser.add_option("", "--findbad", dest="findbad", action="store_true",
                      help="Re-run findbad on the nodes we're going to check before acting.")
    parser.add_option("", "--force", dest="force", action="store_true",
                      help="Force action regardless of previous actions/logs.")
    parser.add_option("", "--rins", dest="rins", action="store_true",
                      help="Set the boot_state to 'rins' for all nodes.")
    parser.add_option("", "--reboot", dest="reboot", action="store_true",
                      help="Actively try to reboot the nodes, keeping a log of actions.")

    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the orginary setup phase.")
    parser.add_option("", "--skip", dest="skip",
                      help="Number of machines to skip on the input queue.")
    parser.add_option("", "--timewait", dest="timewait",
                      help="Minutes to wait between iterations of 10 nodes.")

    parser = parsermodule.getParser(['defaults'], parser)
    # NOTE(review): this rebinds the module-level name `config`, which was
    # previously bound by `from monitor import config` -- confirm that is
    # intended.
    config = parsermodule.parse_args(parser)

    # Default selection: every node and every site that has a history record.
    hostnames = [n.hostname for n in HistoryNodeRecord.query.all()]
    sitenames = [s.loginbase for s in HistorySiteRecord.query.all()]

    if config.site:
        # TODO: replace with calls to local db.  the api fails so often that
        #       these calls should be regarded as unreliable.
        # A set gives O(1) membership tests while narrowing the host list.
        site_hostnames = set(n['hostname'] for n in plccache.GetNodesBySite(config.site))
        hostnames = [h for h in hostnames if h in site_hostnames]
        sitenames = [config.site]

    if config.node:
        # A single node overrides any previous selection; its site is looked
        # up in the hostname -> loginbase cache.
        hostnames = [config.node]
        sitenames = [plccache.plcdb_hn2lb[config.node]]

    try:
        main(hostnames, sitenames)
    except KeyboardInterrupt:
        print("Killed by interrupt")
        session.flush()
        sys.exit(0)
    except Exception:
        #email_exception()
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()
        print("fail all...")