merge from 2.0 branch
[monitor.git] / policy.py
1 #!/usr/bin/python
2
3 # This script is used to manipulate the operational state of nodes in
4 # different node groups.  These are basically set operations on nodes via the
5 # PLC api.
6
7 # Take the ng name as an argument....
8 # optionally, 
9 #  * get a list of nodes in the given nodegroup.
10 #  * set some or all in the set to rins.
11 #  * restart them all.
12 #  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 from monitor import config
22 from monitor import parser as parsermodule
23 from monitor.common import *
24 from monitor.model import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.database.info.model import *
28 from monitor.database.info.interface import *
29
30 from nodequery import verify,query_to_dict,node_select
31
# Module-level PLC API handle, created as a side effect of loading this file.
# NOTE(review): `api` is not referenced again in the visible source -- confirm
# whether anything still depends on it before removing.
api = plc.getAuthAPI()
33
def logic():
    """Set a node's boot state to 'rins' and close out its record.

    NOTE(review): `host` is neither a parameter nor assigned in this
    function, so it is looked up as a module-level global.  No visible
    code assigns such a global, so calling this as-is presumably raises
    NameError -- confirm whether this function is still used anywhere.
    """
    set_boot_state = plc.nodeBootState
    set_boot_state(host, 'rins')
    node_end_record(host)
38
def _checkpoint():
    """Print a timestamp, then flush stdout and the database session."""
    # The two spaces after the colon reproduce the output of the original
    # Python 2 statement `print "time: ", stamp` (literal + separator space).
    print("time:  %s" % time.strftime('%Y-%m-%d %H:%M:%S'))
    sys.stdout.flush()
    session.flush()


def _handle_node(index, host):
    """Apply the node-level policy checks to a single host.

    index -- position of the host in the input list (only used for output).
    host  -- hostname to evaluate.

    The host is skipped (with a message) when it has no entry in
    plccache.plcdb_hn2lb or when it has an unexpired blacklist record.
    Otherwise each check below is evaluated independently and in order, so
    more than one action may fire for the same host in one pass.

    NOTE(review): the numeric thresholds passed to changed_*/found_* appear
    to be expressed in days (see the inline comments) -- confirm against
    the helpers' definitions.
    """
    try:
        lb = plccache.plcdb_hn2lb[host]
    except Exception:
        # Still deliberately broad: any lookup failure is reported and the
        # host skipped.  Unlike a bare `except:`, this lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        print("unknown host in plcdb_hn2lb %s" % host)
        email_exception(host)
        return

    nodeblack = BlacklistRecord.get_by(hostname=host)
    if nodeblack and not nodeblack.expired():
        print("skipping %s due to blacklist.  will expire %s"
              % (host, nodeblack.willExpire()))
        return

    sitehist = SiteInterface.get_or_make(loginbase=lb)
    recent_actions = sitehist.getRecentActions(hostname=host)
    nodehist = HistoryNodeRecord.findby_or_create(hostname=host)

    print("%s %s %s" % (index, nodehist.hostname, nodehist.status))

    if nodehist.status == 'good' and \
            changed_lessthan(nodehist.last_changed, 1.0) and \
            found_within(recent_actions, 'down_notice', 7.0) and \
            not found_within(recent_actions, 'online_notice', 0.5):
        # NOTE: searching for down_notice proves that the node has gone
        #       through a 'down' state first, rather than just flapping
        #       through: good, offline, online, ...
        #
        # NOTE: there is a narrow window in which this command must be
        #       evaluated, otherwise the notice will not go out.
        #       this is not ideal.
        sitehist.sendMessage('online_notice', hostname=host, viart=False)
        print("send message for host %s online" % host)

    # if a node is offline and doesn't have a PCU, remind the user that
    # they should have one.
    if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
            changed_greaterthan(nodehist.last_changed, 1.0) and \
            not found_within(recent_actions, 'pcumissing_notice', 7.0):
        sitehist.sendMessage('pcumissing_notice', hostname=host)
        print("send message for host %s pcumissing_notice" % host)

    # if it is offline and HAS a PCU, then try to use it.
    if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
            changed_greaterthan(nodehist.last_changed, 1.0) and \
            not found_between(recent_actions, 'try_reboot', 3.5, 1):
        sitehist.attemptReboot(host)
        print("send message for host %s try_reboot" % host)

    # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
    #       will be false for a day after the above condition is satisfied
    if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
            changed_greaterthan(nodehist.last_changed, 1.5) and \
            found_between(recent_actions, 'try_reboot', 3.5, 1) and \
            not found_within(recent_actions, 'pcufailed_notice', 3.5):
        # send pcu failure message
        sitehist.sendMessage('pcufailed_notice', hostname=host)
        print("send message for host %s PCU Failure" % host)

    if nodehist.status == 'monitordebug' and \
            changed_greaterthan(nodehist.last_changed, 1) and \
            not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
        # delay 0.5 days before retrying...
        print("send message for host %s bootmanager_restore" % host)
        sitehist.runBootManager(host)

    if nodehist.status == 'down' and \
            changed_greaterthan(nodehist.last_changed, 2) and \
            not found_within(recent_actions, 'down_notice', 3.5):
        # send down node notice
        sitehist.sendMessage('down_notice', hostname=host)
        print("send message for host %s down" % host)

    _checkpoint()


def _handle_site(index, site):
    """Apply the site-level penalty policy to a single site.

    index -- position of the site in the input list (only used for output).
    site  -- site loginbase to evaluate.

    The site is skipped (with a message) when it has an unexpired blacklist
    record.  Otherwise the penalty is increased for a 'down' site, cleared
    for a 'good' site that has a penalty applied, and paused when the site
    has an open message and a non-zero penalty level.
    """
    sitehist = SiteInterface.get_or_make(loginbase=site)
    siteblack = BlacklistRecord.get_by(loginbase=site)

    if siteblack and not siteblack.expired():
        print("skipping %s due to blacklist.  will expire %s"
              % (site, siteblack.willExpire()))
        return

    # TODO: make query only return records within a certain time range,
    #       i.e. greater than 0.5 days ago. or 5 days, etc.
    recent_actions = sitehist.getRecentActions(loginbase=site)

    print("%s %s %s" % (index, sitehist.db.loginbase, sitehist.db.status))

    if sitehist.db.status == 'down' and \
            not found_within(recent_actions, 'pause_penalty', 30) and \
            not found_within(recent_actions, 'increase_penalty', 7) and \
            changed_greaterthan(sitehist.db.last_changed, 7):
        # TODO: catch errors
        sitehist.increasePenalty()
        sitehist.sendMessage('increase_penalty')
        print("send message for site %s penalty increase" % site)

    # NOTE: because 'all clear' should have an indefinite status, we
    #       have a boolean value rather than a 'recent action'
    if sitehist.db.status == 'good' and sitehist.db.penalty_applied:
        # send message that penalties are cleared.
        sitehist.clearPenalty()
        sitehist.sendMessage('clear_penalty')
        sitehist.closeTicket()
        print("send message for site %s penalty cleared" % site)

    # find all ticket ids for site ( could be on the site record? )
    # determine if there are penalties within the last 30 days?
    # if so, add a 'pause_penalty' action.
    if sitehist.db.message_id != 0 and \
            sitehist.db.message_status == 'open' and \
            sitehist.db.penalty_level > 0 and \
            not found_within(recent_actions, 'pause_penalty', 30):
        # pause escalation
        print("Pausing penalties for %s" % site)
        sitehist.pausePenalty()

    _checkpoint()


def main(hostnames, sitenames):
    """Run the monitoring policy over the given nodes, then the given sites.

    hostnames -- iterable of node hostnames to evaluate.
    sitenames -- iterable of site loginbases to evaluate.

    Every host is processed before any site.  Progress is printed to
    stdout and the database session is flushed after each processed item
    and once more at the end.  Returns None.  Exceptions raised while
    handling an item are not caught here, except for a failed
    hostname-to-loginbase lookup, which is reported and skipped.
    """
    for index, host in enumerate(hostnames):
        _handle_node(index, host)

    for index, site in enumerate(sitenames):
        _handle_site(index, site)

    session.flush()
188
189
if __name__ == "__main__":
    # Command-line entry point: build the option parser, choose the set of
    # nodes and sites to evaluate, then run main() over them.
    parser = parsermodule.getParser(['nodesets'])
    parser.set_defaults(timewait=0,
                        skip=0,
                        rins=False,
                        reboot=False,
                        findbad=False,
                        force=False,
                        nosetup=False,
                        verbose=False,
                        quiet=False)

    parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                      help="The select string that must evaluate to true for the node to be considered 'done'")
    parser.add_option("", "--findbad", dest="findbad", action="store_true",
                      help="Re-run findbad on the nodes we're going to check before acting.")
    parser.add_option("", "--force", dest="force", action="store_true",
                      help="Force action regardless of previous actions/logs.")
    parser.add_option("", "--rins", dest="rins", action="store_true",
                      help="Set the boot_state to 'rins' for all nodes.")
    parser.add_option("", "--reboot", dest="reboot", action="store_true",
                      help="Actively try to reboot the nodes, keeping a log of actions.")

    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the orginary setup phase.")
    parser.add_option("", "--skip", dest="skip",
                      help="Number of machines to skip on the input queue.")
    parser.add_option("", "--timewait", dest="timewait",
                      help="Minutes to wait between iterations of 10 nodes.")

    parser = parsermodule.getParser(['defaults'], parser)
    # NOTE: this rebinds the module-level name `config` (imported from
    # `monitor` at the top of the file) to the parsed command-line options.
    config = parsermodule.parse_args(parser)

    # Default scope: every node and every site that has a history record.
    hostnames = [n.hostname for n in HistoryNodeRecord.query.all()]
    sitenames = [s.loginbase for s in HistorySiteRecord.query.all()]

    if config.site:
        # TODO: replace with calls to local db.  the api fails so often that
        #       these calls should be regarded as unreliable.
        l_nodes = plccache.GetNodesBySite(config.site)
        # A set keeps the membership test O(1) per hostname; the list
        # comprehension preserves the original ordering of `hostnames`.
        site_hostnames = set(n['hostname'] for n in l_nodes)
        hostnames = [h for h in hostnames if h in site_hostnames]
        sitenames = [config.site]

    if config.node:
        # A single node overrides any site selection made above.
        hostnames = [config.node]
        sitenames = [plccache.plcdb_hn2lb[config.node]]

    try:
        main(hostnames, sitenames)
        session.flush()
    except KeyboardInterrupt:
        print("Killed by interrupt")
        session.flush()
        sys.exit(0)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None" line).
        #email_exception()
        traceback.print_exc()
        print("fail all...")