give noop myops xmlrpc call an extra parameter to allow it to work with the
[monitor.git] / policy.py
1 #!/usr/bin/python
2
# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC API.

# Take the nodegroup (ng) name as an argument and, optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all in the set to rins.
#  * restart them all.
#  * do something else to them all.
13
14
15 import os
16 import time
17 import traceback
18 import sys
19 from optparse import OptionParser
20
21 from monitor import config
22 from monitor import parser as parsermodule
23 from monitor.common import *
24 from monitor.model import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.database.info.model import *
28 from monitor.database.info.interface import *
29
30 from nodequery import verify,query_to_dict,node_select
31
32 api = plc.getAuthAPI()
33
def logic():
        """Set a node's boot state to 'reinstall' and call node_end_record on it.

        NOTE(review): `host` is neither a parameter nor assigned at module
        level anywhere in this file as shown.  Unless one of the star imports
        supplies a global named `host`, calling this function raises
        NameError.  Nothing in the visible file calls it -- confirm whether
        it is dead code before relying on it.
        """

        # Ask PLC (through the monitor.wrapper.plc module) for the 'reinstall' boot state.
        plc.nodeBootState(host, 'reinstall')
        # node_end_record is not defined in this file; presumably it comes from
        # one of the star imports -- TODO confirm what it records.
        node_end_record(host)
38
39 def main(hostnames, sitenames):
40         # commands:
41         i = 1
42         node_count = 1
43         site_count = 1
44         #print "hosts: %s" % hostnames
45         for i,host in enumerate(hostnames):
46                 try:
47                         lb = plccache.plcdb_hn2lb[host]
48                 except:
49                         print "unknown host in plcdb_hn2lb %s" % host
50                         email_exception(host)
51                         continue
52
53                 nodeblack = BlacklistRecord.get_by(hostname=host)
54
55                 if nodeblack and not nodeblack.expired():
56                         print "skipping %s due to blacklist.  will expire %s" % (host, nodeblack.willExpire() )
57                         continue
58
59                 sitehist = SiteInterface.get_or_make(loginbase=lb)
60
61                 recent_actions = sitehist.getRecentActions(hostname=host)
62
63                 nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
64
65                 print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
66                 if nodehist.status == 'good' and \
67                         changed_lessthan(nodehist.last_changed, 1.0) and \
68                         found_within(recent_actions, 'down_notice', 7.0) and \
69                         not found_within(recent_actions, 'online_notice', 0.5):
70                                 # NOTE: chronicly flapping nodes will not get 'online' notices
71                                 #               since, they are never up long enough to be 'good'.
72                             # NOTE: searching for down_notice proves that the node has
73                                 #               gone through a 'down' state first, rather than just
74                                 #               flapping through: good, offline, online, ...
75                                 #       
76                                 # NOTE: there is a narrow window in which this command must be
77                                 #               evaluated, otherwise the notice will not go out.  
78                                 #               this is not ideal.
79                                 sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True)
80                                 print "send message for host %s online" % host
81
82
83                 # if a node is offline and doesn't have a PCU, remind the user that they should have one.
84                 #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
85                 #       changed_greaterthan(nodehist.last_changed,1.0) and \
86                 #       not found_within(recent_actions, 'pcumissing_notice', 7.0):
87                 #
88                 #               sitehist.sendMessage('pcumissing_notice', hostname=host)
89                 #               print "send message for host %s pcumissing_notice" % host
90
91                 # if it is offline and HAS a PCU, then try to use it.
92                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
93                         changed_greaterthan(nodehist.last_changed,1.0) and \
94                         not found_between(recent_actions, 'try_reboot', 3.5, 1):
95
96                                 sitehist.attemptReboot(host)
97                                 print "send message for host %s try_reboot" % host
98
99                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
100                 #               will be false for a day after the above condition is satisfied
101                 if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
102                         changed_greaterthan(nodehist.last_changed,1.5) and \
103                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
104                         not found_within(recent_actions, 'pcufailed_notice', 3.5):
105                                 
106                                 # send pcu failure message
107                                 #act = ActionRecord(**kwargs)
108                                 sitehist.sendMessage('pcufailed_notice', hostname=host)
109                                 print "send message for host %s PCU Failure" % host
110
111                 if nodehist.status == 'monitordebug' and \
112                         changed_greaterthan(nodehist.last_changed, 1) and \
113                         not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
114                                 # send down node notice
115                                 # delay 0.5 days before retrying...
116
117                                 print "send message for host %s bootmanager_restore" % host
118                                 sitehist.runBootManager(host)
119                         #       sitehist.sendMessage('retry_bootman', hostname=host)
120
121                 if nodehist.status == 'down' and \
122                         changed_greaterthan(nodehist.last_changed, 2) and \
123                         not found_within(recent_actions, 'down_notice', 3.5):
124                                 # send down node notice
125
126                                 sitehist.sendMessage('down_notice', hostname=host)
127                                 print "send message for host %s down" % host
128
129                 node_count = node_count + 1
130                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
131                 sys.stdout.flush()
132                 session.flush()
133
134         for i,site in enumerate(sitenames):
135                 sitehist = SiteInterface.get_or_make(loginbase=site)
136                 siteblack = BlacklistRecord.get_by(loginbase=site)
137                 skip_due_to_blacklist=False
138
139                 if siteblack and not siteblack.expired():
140                         print "skipping %s due to blacklist.  will expire %s" % (site, siteblack.willExpire() )
141                         skip_due_to_blacklist=True
142                         sitehist.clearPenalty()
143                         sitehist.applyPenalty()
144                         continue
145
146                 # TODO: make query only return records within a certin time range,
147                 #               i.e. greater than 0.5 days ago. or 5 days, etc.
148                 recent_actions = sitehist.getRecentActions(loginbase=site)
149
150                 print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
151
152                 # determine if there are penalties within the last 30 days?
153                 # if so, add a 'pause_penalty' action.
154                 if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
155                         sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
156                         #       pause escalation
157                         print "Pausing penalties for %s" % site
158                         sitehist.pausePenalty()
159                 else:
160
161                         if sitehist.db.status == 'down':
162                                 if  not found_within(recent_actions, 'pause_penalty', 30) and \
163                                         not found_within(recent_actions, 'increase_penalty', 7) and \
164                                         changed_greaterthan(sitehist.db.last_changed, 7):
165
166                                         # TODO: catch errors
167                                         sitehist.increasePenalty()
168                                         sitehist.applyPenalty()
169                                         sitehist.sendMessage('increase_penalty')
170
171                                         print "send message for site %s penalty increase" % site
172
173                         if sitehist.db.status == 'good':
174                                 # clear penalty
175                                 # NOTE: because 'all clear' should have an indefinite status, we
176                                 #               have a boolean value rather than a 'recent action'
177                                 if sitehist.db.penalty_applied:
178                                         # send message that penalties are cleared.
179
180                                         sitehist.clearPenalty()
181                                         sitehist.applyPenalty()
182                                         sitehist.sendMessage('clear_penalty')
183                                         sitehist.closeTicket()
184
185                                         print "send message for site %s penalty cleared" % site
186
187
188                 site_count = site_count + 1
189
190                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
191                 sys.stdout.flush()
192                 session.flush()
193
194         session.flush()
195         return
196
197
198 if __name__ == "__main__":
199         parser = parsermodule.getParser(['nodesets'])
200         parser.set_defaults( timewait=0,
201                                                 skip=0,
202                                                 rins=False,
203                                                 reboot=False,
204                                                 findbad=False,
205                                                 force=False, 
206                                                 nosetup=False, 
207                                                 verbose=False, 
208                                                 quiet=False,)
209
210         parser.add_option("", "--stopselect", dest="stopselect", metavar="", 
211                                                 help="The select string that must evaluate to true for the node to be considered 'done'")
212         parser.add_option("", "--findbad", dest="findbad", action="store_true", 
213                                                 help="Re-run findbad on the nodes we're going to check before acting.")
214         parser.add_option("", "--force", dest="force", action="store_true", 
215                                                 help="Force action regardless of previous actions/logs.")
216         parser.add_option("", "--rins", dest="rins", action="store_true", 
217                                                 help="Set the boot_state to 'rins' for all nodes.")
218         parser.add_option("", "--reboot", dest="reboot", action="store_true", 
219                                                 help="Actively try to reboot the nodes, keeping a log of actions.")
220
221         parser.add_option("", "--verbose", dest="verbose", action="store_true", 
222                                                 help="Extra debug output messages.")
223         parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
224                                                 help="Do not perform the orginary setup phase.")
225         parser.add_option("", "--skip", dest="skip", 
226                                                 help="Number of machines to skip on the input queue.")
227         parser.add_option("", "--timewait", dest="timewait", 
228                                                 help="Minutes to wait between iterations of 10 nodes.")
229
230         parser = parsermodule.getParser(['defaults'], parser)
231         config = parsermodule.parse_args(parser)
232
233         fbquery = HistoryNodeRecord.query.all()
234         hostnames = [ n.hostname for n in fbquery ]
235         
236         fbquery = HistorySiteRecord.query.all()
237         sitenames = [ s.loginbase for s in fbquery ]
238
239         if config.site:
240                 # TODO: replace with calls to local db.  the api fails so often that
241                 #               these calls should be regarded as unreliable.
242                 l_nodes = plccache.GetNodesBySite(config.site)
243                 filter_hostnames = [ n['hostname'] for n in l_nodes ]
244
245                 hostnames = filter(lambda x: x in filter_hostnames, hostnames)
246                 sitenames = [config.site]
247
248         if config.node:
249                 hostnames = [ config.node ] 
250                 sitenames = [ plccache.plcdb_hn2lb[config.node] ]
251
252         try:
253                 main(hostnames, sitenames)
254                 session.flush()
255         except KeyboardInterrupt:
256                 print "Killed by interrupt"
257                 session.flush()
258                 sys.exit(0)
259         except:
260                 email_exception()
261                 print traceback.print_exc();
262                 print "fail all..."