grouprins.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 from monitor import config
  16 from monitor import util
  17 from monitor import const
  18 from monitor import database
  19 from monitor import parser as parsermodule
  20 from monitor import reboot
  21 from monitor.wrapper import plc
  22 api = plc.getAuthAPI()
  23
  24 import traceback
  25 from optparse import OptionParser
  26
  27 from monitor.common import *
  28 from nodequery import verify,query_to_dict,node_select
  29 from monitor.model import *
  30 import os
  31
  32 import time
  33
  34 import bootman          # debug nodes
  35 import mailmonitor      # down nodes without pcu
  36 from monitor.wrapper.emailTxt import mailtxt
  37 import sys
  38
  39 class Reboot(object):
  40         def __init__(self, fbnode):
  41                 self.fbnode = fbnode
  42
  43         def _send_pcunotice(self, host):
  44                 args = {}
  45                 args['hostname'] = host
  46                 try:
  47                         args['pcu_id'] = plc.getpcu(host)['pcu_id']
  48                 except:
  49                         args['pcu_id'] = host
  50
  51                 m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
  52                                                                  mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
  53
  54                 loginbase = plc.siteId(host)
  55                 m.send([const.TECHEMAIL % loginbase])
  56
  57         def pcu(self, host):
  58                 # TODO: It should be possible to diagnose the various conditions of
  59                 #               the PCU here, and send different messages as appropriate.
  60                 print "'%s'" % self.fbnode['pcu']
  61                 if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
  62                         self.action = "reboot.reboot('%s')" % host
  63
  64                         pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
  65                         #pflags.resetRecentFlag('pcutried')
  66                         if not pflags.getRecentFlag('pcutried'):
  67                                 try:
  68                                         print "CALLING REBOOT!!!"
  69                                         ret = reboot.reboot(host)
  70
  71                                         pflags.setRecentFlag('pcutried')
  72                                         pflags.save()
  73                                         return ret
  74
  75                                 except Exception,e:
  76                                         email_exception()
  77                                         print traceback.print_exc(); print e
  78
  79                                         # NOTE: this failure could be an implementation issue on
  80                                         #               our end.  So, extra notices are confusing...
  81                                         # self._send_pcunotice(host)
  82
  83                                         pflags.setRecentFlag('pcufailed')
  84                                         pflags.save()
  85                                         return False
  86
  87                         elif not pflags.getRecentFlag('pcu_rins_tried'):
  88                                 try:
  89                                         # set node to 'rins' boot state.
  90                                         print "CALLING REBOOT +++ RINS"
  91                                         plc.nodeBootState(host, 'rins')
  92                                         ret = reboot.reboot(host)
  93
  94                                         pflags.setRecentFlag('pcu_rins_tried')
  95                                         pflags.save()
  96                                         return ret
  97
  98                                 except Exception,e:
  99                                         email_exception()
 100                                         print traceback.print_exc(); print e
 101
 102                                         # NOTE: this failure could be an implementation issue on
 103                                         #               our end.  So, extra notices are confusing...
 104                                         # self._send_pcunotice(host)
 105
 106                                         pflags.setRecentFlag('pcufailed')
 107                                         pflags.save()
 108                                         return False
 109                         else:
 110                                 # we've tried the pcu recently, but it didn't work,
 111                                 # so did we send a message about it recently?
 112                                 if not pflags.getRecentFlag('pcumessagesent'):
 113
 114                                         self._send_pcunotice(host)
 115
 116                                         pflags.setRecentFlag('pcumessagesent')
 117                                         pflags.save()
 118
 119                                 # This will result in mail() being called next, to try to
 120                                 # engage the technical contact to take care of it also.
 121                                 print "RETURNING FALSE"
 122                                 return False
 123
 124                 else:
 125                         print "NO PCUOK"
 126                         self.action = "None"
 127                         return False
 128
 129         def mail(self, host):
 130
 131                 # Reset every 4 weeks or so
 132                 pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
 133                 if not pflags.getRecentFlag('endrecord'):
 134                         node_end_record(host)
 135                         pflags.setRecentFlag('endrecord')
 136                         pflags.save()
 137
 138                 # Then in either case, run mailmonitor.reboot()
 139                 self.action = "mailmonitor.reboot('%s')" % host
 140                 try:
 141                         return mailmonitor.reboot(host)
 142                 except Exception, e:
 143                         email_exception(host)
 144                         print traceback.print_exc(); print e
 145                         return False
 146
 147 class RebootDebug(Reboot):
 148
 149         def direct(self, host):
 150                 self.action = "bootman.reboot('%s', config, None)" % host
 151                 return bootman.reboot(host, config, None)
 152
 153 class RebootBoot(Reboot):
 154
 155         def direct(self, host):
 156                 self.action = "bootman.reboot('%s', config, 'reboot')" % host
 157                 return bootman.reboot(host, config, 'reboot')
 158
 159 class RebootDown(Reboot):
 160
 161         def direct(self, host):
 162                 self.action = "None"
 163                 return False    # this always fails, since the node will be down.
 164
 165 def set_node_to_rins(host, fb):
 166
 167         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
 168         record = {'observation' : node[0],
 169                           'model' : 'USER_REQUEST',
 170                           'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
 171                           'time' : time.time()}
 172         l = Log(host, record)
 173
 174         ret = api.UpdateNode(host, {'boot_state' : 'rins'})
 175         if ret:
 176                 # it's nice to see the current status rather than the previous status on the console
 177                 node = api.GetNodes(host)[0]
 178                 print l
 179                 print "%-2d" % (i-1), nodegroup_display(node, fb)
 180                 return l
 181         else:
 182                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
 183                 return None
 184
 185
 186 try:
 187         rebootlog = database.dbLoad("rebootlog")
 188 except:
 189         rebootlog = LogRoll()
 190
 191 parser = parsermodule.getParser(['nodesets'])
 192 parser.set_defaults( timewait=0,
 193                                         skip=0,
 194                                         rins=False,
 195                                         reboot=False,
 196                                         findbad=False,
 197                                         force=False,
 198                                         nosetup=False,
 199                                         verbose=False,
 200                                         quiet=False,
 201                                         )
 202
 203 parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 204                                         help="The select string that must evaluate to true for the node to be considered 'done'")
 205 parser.add_option("", "--findbad", dest="findbad", action="store_true",
 206                                         help="Re-run findbad on the nodes we're going to check before acting.")
 207 parser.add_option("", "--force", dest="force", action="store_true",
 208                                         help="Force action regardless of previous actions/logs.")
 209 parser.add_option("", "--rins", dest="rins", action="store_true",
 210                                         help="Set the boot_state to 'rins' for all nodes.")
 211 parser.add_option("", "--reboot", dest="reboot", action="store_true",
 212                                         help="Actively try to reboot the nodes, keeping a log of actions.")
 213
 214 parser.add_option("", "--verbose", dest="verbose", action="store_true",
 215                                         help="Extra debug output messages.")
 216 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 217                                         help="Do not perform the orginary setup phase.")
 218 parser.add_option("", "--skip", dest="skip",
 219                                         help="Number of machines to skip on the input queue.")
 220 parser.add_option("", "--timewait", dest="timewait",
 221                                         help="Minutes to wait between iterations of 10 nodes.")
 222
 223 parser = parsermodule.getParser(['defaults'], parser)
 224 config = parsermodule.parse_args(parser)
 225
 226 # COLLECT nodegroups, nodes and node lists
 227 if config.nodegroup:
 228         ng = api.GetNodeGroups({'name' : config.nodegroup})
 229         nodelist = api.GetNodes(ng[0]['node_ids'])
 230         hostnames = [ n['hostname'] for n in nodelist ]
 231
 232 if config.site:
 233         site = api.GetSites(config.site)
 234         l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 235         hostnames = [ n['hostname'] for n in l_nodes ]
 236
 237 if config.node or config.nodelist:
 238         if config.node: hostnames = [ config.node ]
 239         else: hostnames = util.file.getListFromFile(config.nodelist)
 240
 241 fbquery = FindbadNodeRecord.get_all_latest()
 242 fb_nodelist = [ n.hostname for n in fbquery ]
 243
 244 if config.nodeselect:
 245         hostnames = node_select(config.nodeselect, fb_nodelist)
 246
 247 if config.findbad:
 248         # rerun findbad with the nodes in the given nodes.
 249         file = "findbad.txt"
 250         util.file.setFileFromList(file, hostnames)
 251         os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
 252         # TODO: shouldn't we reload the node list now?
 253
 254 l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
 255 # commands:
 256 i = 1
 257 count = 1
 258 #print "hosts: %s" % hostnames
 259 for host in hostnames:
 260
 261         #if 'echo' in host or 'hptest-1' in host: continue
 262
 263         try:
 264                 try:
 265                         node = api.GetNodes(host)[0]
 266                 except:
 267                         email_exception()
 268                         print traceback.print_exc();
 269                         print "FAILED GETNODES for host: %s" % host
 270                         continue
 271
 272                 print "%-2d" % i, nodegroup_display(node, fb)
 273                 i += 1
 274                 if i-1 <= int(config.skip): continue
 275                 if host in l_blacklist:
 276                         print "%s is blacklisted.  Skipping." % host
 277                         continue
 278
 279                 if config.stopselect:
 280                         dict_query = query_to_dict(config.stopselect)
 281                         fbnode = fb['nodes'][host]['values']
 282                         observed_state = get_current_state(fbnode)
 283
 284                         if verify(dict_query, fbnode) and observed_state != "dbg ":
 285                                 # evaluates to true, therefore skip.
 286                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
 287                                 try:
 288                                         # todo: clean up act_all record here.
 289                                         # todo: send thank you, etc.
 290                                         mailmonitor.reboot(host)
 291                                 except Exception, e:
 292                                         email_exception()
 293                                         print traceback.print_exc(); print e
 294
 295                                 continue
 296                         #else:
 297                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
 298                                 #sys.exit(1)
 299
 300                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
 301                         print "recently rebooted %s.  skipping... " % host
 302                         continue
 303
 304                 if config.reboot:
 305
 306                         fbnode = fb['nodes'][host]['values']
 307                         observed_state = get_current_state(fbnode)
 308
 309                         if       observed_state == "dbg ":
 310                                 o = RebootDebug(fbnode)
 311
 312                         elif observed_state == "boot" :
 313                                 if config.rins:
 314                                         l = set_node_to_rins(host, fb)
 315                                         if l: rebootlog.add(l)
 316
 317                                 o = RebootBoot(fbnode)
 318
 319                         elif observed_state == "down":
 320                                 if config.rins:
 321                                         l = set_node_to_rins(host, fb)
 322                                         if l: rebootlog.add(l)
 323
 324                                 o = RebootDown(fbnode)
 325
 326
 327                         if o.direct(host):
 328                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
 329                                                   'action' : o.action,
 330                                                   'model' : "none",
 331                                                   'time' : time.time()}
 332                         elif o.pcu(host):
 333                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
 334                                                   'action' : o.action,
 335                                                   'model' : "none",
 336                                                   'time' : time.time()}
 337                         elif o.mail(host):
 338                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
 339                                                   'action' : o.action,
 340                                                   'model' : "none",
 341                                                   'time' : time.time()}
 342                         else:
 343                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
 344                                                   'action' : "log failure",
 345                                                   'model' : "none",
 346                                                   'time' : time.time()}
 347
 348                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
 349                                 args = {}
 350                                 args['hostname'] = host
 351                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
 352                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
 353                                 #m.reset()
 354                                 #m.send(['monitor-list@lists.planet-lab.org'])
 355
 356                         l = Log(host, record)
 357                         print l
 358                         rebootlog.add(l)
 359         except KeyboardInterrupt:
 360                 print "Killed by interrupt"
 361                 sys.exit(0)
 362         except:
 363                 email_exception()
 364                 print traceback.print_exc();
 365                 print "Continuing..."
 366
 367         time.sleep(1)
 368         if count % 10 == 0:
 369                 print "Saving rebootlog"
 370                 database.dbDump("rebootlog", rebootlog)
 371                 wait_time = int(config.timewait)
 372                 print "Sleeping %d minutes" % wait_time
 373                 ti = 0
 374                 print "Minutes slept: ",
 375                 sys.stdout.flush()
 376                 while ti < wait_time:
 377                         print "%s" % ti,
 378                         sys.stdout.flush()
 379                         time.sleep(60)
 380                         ti = ti+1
 381
 382         count = count + 1
 383
 384 print "Saving rebootlog"
 385 database.dbDump("rebootlog", rebootlog)