[monitor.git] / grouprins.py
#!/usr/bin/python

# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC api.

# Take the nodegroup name as an argument.
# Optionally,
#  * get a list of nodes in the given nodegroup.
#  * set some or all nodes in the set to rins.
#  * restart them all.
#  * do something else to them all.
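#
# Example invocation (illustrative only; --rins and --reboot are defined in
# the option parser below, and the node-selection flags are assumed to come
# from the 'nodesets' parser group, mirroring the config attributes used in
# this script):
#   ./grouprins.py --nodegroup <groupname> --rins --reboot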


from monitor import config
from monitor import util
from monitor import const
from monitor import database
from monitor import parser as parsermodule
from monitor.pcu import reboot
from monitor.wrapper import plc
api = plc.getAuthAPI()

import traceback
from optparse import OptionParser

from nodecommon import *
from nodequery import verify, query_to_dict, node_select
from monitor.model import *
import os

import time

import bootman          # debug nodes
import mailmonitor      # down nodes without pcu
from emailTxt import mailtxt
import sys

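# Reboot encapsulates the escalation used in the main loop below: try a
# direct restart through bootman first, then a power cycle through the
# node's PCU, and finally fall back to mailing the site via mailmonitor.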
class Reboot(object):
    def __init__(self, fbnode):
        self.fbnode = fbnode

    def _send_pcunotice(self, host):
        args = {}
        args['hostname'] = host
        try:
            args['pcu_id'] = plc.getpcu(host)['pcu_id']
        except:
            args['pcu_id'] = host

        m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                           mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')

        loginbase = plc.siteId(host)
        m.send([const.TECHEMAIL % loginbase])

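    # pcu() is rate-limited through PersistFlags (a two-day window): try a
    # plain PCU reboot first, then a reboot into 'rins', and once both have
    # been tried recently, send at most one notice to the site's tech contacts.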
    def pcu(self, host):
        # TODO: It should be possible to diagnose the various conditions of
        #       the PCU here, and send different messages as appropriate.
        print "'%s'" % self.fbnode['pcu']
        if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
            self.action = "reboot.reboot('%s')" % host

            pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
            #pflags.resetRecentFlag('pcutried')
            if not pflags.getRecentFlag('pcutried'):
                try:
                    print "CALLING REBOOT!!!"
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcutried')
                    pflags.save()
                    return ret

                except Exception, e:
                    print traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    #       our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False

            elif not pflags.getRecentFlag('pcu_rins_tried'):
                try:
                    # set node to 'rins' boot state.
                    print "CALLING REBOOT +++ RINS"
                    plc.nodeBootState(host, 'rins')
                    ret = reboot.reboot(host)

                    pflags.setRecentFlag('pcu_rins_tried')
                    pflags.save()
                    return ret

                except Exception, e:
                    print traceback.print_exc(); print e

                    # NOTE: this failure could be an implementation issue on
                    #       our end.  So, extra notices are confusing...
                    # self._send_pcunotice(host)

                    pflags.setRecentFlag('pcufailed')
                    pflags.save()
                    return False
            else:
                # we've tried the pcu recently, but it didn't work,
                # so did we send a message about it recently?
                if not pflags.getRecentFlag('pcumessagesent'):

                    self._send_pcunotice(host)

                    pflags.setRecentFlag('pcumessagesent')
                    pflags.save()

                # This will result in mail() being called next, to try to
                # engage the technical contact to take care of it also.
                print "RETURNING FALSE"
                return False

        else:
            print "NO PCUOK"
            self.action = "None"
            return False

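    # mail() ends the node's current record (node_end_record) at most once per
    # 27-day window, then hands the node to mailmonitor.reboot(), which engages
    # the site's technical contacts by email.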
    def mail(self, host):

        # Reset every 4 weeks or so
        pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
        if not pflags.getRecentFlag('endrecord'):
            node_end_record(host)
            pflags.setRecentFlag('endrecord')
            pflags.save()

        # Then in either case, run mailmonitor.reboot()
        self.action = "mailmonitor.reboot('%s')" % host
        try:
            return mailmonitor.reboot(host)
        except Exception, e:
            print traceback.print_exc(); print e
            return False

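# Each subclass supplies the direct() step appropriate to the observed boot
# state: debug and booted nodes go through bootman (without and with an
# explicit 'reboot' request), while down nodes have no direct option at all.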
class RebootDebug(Reboot):

    def direct(self, host):
        self.action = "bootman.reboot('%s', config, None)" % host
        return bootman.reboot(host, config, None)

class RebootBoot(Reboot):

    def direct(self, host):
        self.action = "bootman.reboot('%s', config, 'reboot')" % host
        return bootman.reboot(host, config, 'reboot')

class RebootDown(Reboot):

    def direct(self, host):
        self.action = "None"
        return False    # this always fails, since the node will be down.

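# Log a USER_REQUEST entry and flip the node's boot_state to 'rins'
# (reinstall) through the PLC API.  Returns the Log entry on success, or None.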
def set_node_to_rins(host, fb):

    node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
    record = {'observation' : node[0],
              'model' : 'USER_REQUEST',
              'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
              'time' : time.time()}
    l = Log(host, record)

    ret = api.UpdateNode(host, {'boot_state' : 'rins'})
    if ret:
        # it's nice to see the current status rather than the previous status on the console
        node = api.GetNodes(host)[0]
        print l
        print "%-2d" % (i-1), nodegroup_display(node, fb)
        return l
    else:
        print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
        return None


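# The action log persists across runs: reload it from the cache when present,
# otherwise start a fresh LogRoll.  It is dumped back out via dbDump() below.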
try:
    rebootlog = database.dbLoad("rebootlog")
except:
    rebootlog = LogRoll()

parser = parsermodule.getParser(['nodesets'])
parser.set_defaults(timewait=0,
                    skip=0,
                    rins=False,
                    reboot=False,
                    findbad=False,
                    force=False,
                    nosetup=False,
                    verbose=False,
                    quiet=False,
                    )

parser.add_option("", "--stopselect", dest="stopselect", metavar="",
                  help="The select string that must evaluate to true for the node to be considered 'done'")
parser.add_option("", "--findbad", dest="findbad", action="store_true",
                  help="Re-run findbad on the nodes we're going to check before acting.")
parser.add_option("", "--force", dest="force", action="store_true",
                  help="Force action regardless of previous actions/logs.")
parser.add_option("", "--rins", dest="rins", action="store_true",
                  help="Set the boot_state to 'rins' for all nodes.")
parser.add_option("", "--reboot", dest="reboot", action="store_true",
                  help="Actively try to reboot the nodes, keeping a log of actions.")

parser.add_option("", "--verbose", dest="verbose", action="store_true",
                  help="Extra debug output messages.")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                  help="Do not perform the ordinary setup phase.")
parser.add_option("", "--skip", dest="skip",
                  help="Number of machines to skip on the input queue.")
parser.add_option("", "--timewait", dest="timewait",
                  help="Minutes to wait between iterations of 10 nodes.")

parser = parsermodule.getParser(['defaults'], parser)
config = parsermodule.parse_args(parser)

# COLLECT nodegroups, nodes and node lists
if config.nodegroup:
    ng = api.GetNodeGroups({'name' : config.nodegroup})
    nodelist = api.GetNodes(ng[0]['node_ids'])
    hostnames = [ n['hostname'] for n in nodelist ]

if config.site:
    site = api.GetSites(config.site)
    l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
    hostnames = [ n['hostname'] for n in l_nodes ]

if config.node or config.nodelist:
    if config.node: hostnames = [ config.node ]
    else: hostnames = util.file.getListFromFile(config.nodelist)

fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]

if config.nodeselect:
    hostnames = node_select(config.nodeselect, fb_nodelist)

if config.findbad:
    # re-run findbad on the selected nodes before acting on them.
    file = "findbad.txt"
    util.file.setFileFromList(file, hostnames)
    os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
    # TODO: shouldn't we reload the node list now?

l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
# commands:
i = 1
count = 1
#print "hosts: %s" % hostnames
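# Main loop: for each host, skip it if it is blacklisted, below --skip, already
# matching --stopselect, or (unless --force) rebooted within the last two hours.
# Otherwise pick a Reboot subclass from the observed state (dbg/boot/down),
# optionally set the node to 'rins' first, and try direct(), pcu(), then mail()
# in order, logging whichever action succeeded or the overall failure.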
for host in hostnames:

    #if 'echo' in host or 'hptest-1' in host: continue

    try:
        try:
            node = api.GetNodes(host)[0]
        except:
            print traceback.print_exc();
            print "FAILED GETNODES for host: %s" % host
            continue

        print "%-2d" % i, nodegroup_display(node, fb)
        i += 1
        if i-1 <= int(config.skip): continue
        if host in l_blacklist:
            print "%s is blacklisted.  Skipping." % host
            continue

        if config.stopselect:
            dict_query = query_to_dict(config.stopselect)
            fbnode = fb['nodes'][host]['values']
            observed_state = get_current_state(fbnode)

            if verify(dict_query, fbnode) and observed_state != "dbg ":
                # evaluates to true, therefore skip.
                print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
                try:
                    # todo: clean up act_all record here.
                    # todo: send thank you, etc.
                    mailmonitor.reboot(host)
                except Exception, e:
                    print traceback.print_exc(); print e

                continue
            #else:
                #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
                #sys.exit(1)

        if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
            print "recently rebooted %s.  skipping... " % host
            continue

        if config.reboot:

            fbnode = fb['nodes'][host]['values']
            observed_state = get_current_state(fbnode)

            if observed_state == "dbg ":
                o = RebootDebug(fbnode)

            elif observed_state == "boot":
                if config.rins:
                    l = set_node_to_rins(host, fb)
                    if l: rebootlog.add(l)

                o = RebootBoot(fbnode)

            elif observed_state == "down":
                if config.rins:
                    l = set_node_to_rins(host, fb)
                    if l: rebootlog.add(l)

                o = RebootDown(fbnode)


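            # NOTE: if observed_state matched none of the cases above, 'o' is
            # left over from a previous iteration (or is undefined on the very
            # first pass), so the escalation below may act on stale state.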
            if o.direct(host):
                record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.pcu(host):
                record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            elif o.mail(host):
                record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
                          'action' : o.action,
                          'model' : "none",
                          'time' : time.time()}
            else:
                record = {'observation' : "REBOOT_FAILED: %s" % observed_state,
                          'action' : "log failure",
                          'model' : "none",
                          'time' : time.time()}

                print "ALL METHODS OF RESTARTING %s FAILED" % host
                args = {}
                args['hostname'] = host
                #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
                #                   "CANNOT CONTACT", False, db='suspect_persistmessages')
                #m.reset()
                #m.send(['monitor-list@lists.planet-lab.org'])

            l = Log(host, record)
            print l
            rebootlog.add(l)
    except KeyboardInterrupt:
        print "Killed by interrupt"
        sys.exit(0)
    except:
        print traceback.print_exc();
        print "Continuing..."

    time.sleep(1)
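    # Every ten hosts, checkpoint the reboot log and optionally sleep for
    # --timewait minutes before moving on to the next batch.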
    if count % 10 == 0:
        print "Saving rebootlog"
        database.dbDump("rebootlog", rebootlog)
        wait_time = int(config.timewait)
        print "Sleeping %d minutes" % wait_time
        ti = 0
        print "Minutes slept: ",
        sys.stdout.flush()
        while ti < wait_time:
            print "%s" % ti,
            sys.stdout.flush()
            time.sleep(60)
            ti = ti+1

    count = count + 1

print "Saving rebootlog"
database.dbDump("rebootlog", rebootlog)