grouprins.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
# Select nodes by nodegroup name, site, single hostname, node-list file or
# node-select expression.  Then, optionally,
#  * list the nodes in the selection.
#  * set some or all of the selected nodes to 'rins'.
#  * restart them all.
#  * do something else to them all.
  13 #
  14
  15 from monitor import config
  16 from monitor import util
  17 from monitor import const
  18 from monitor import database
  19 from monitor import parser as parsermodule
  20 from monitor.pcu import reboot
  21 from monitor.wrapper import plc
  22 api = plc.getAuthAPI()
  23
  24 import traceback
  25 from optparse import OptionParser
  26
  27 from nodecommon import *
  28 from nodequery import verify,query_to_dict,node_select
  29 from unified_model import *
  30 import os
  31
  32 import time
  33 from model import *
  34
  35 import bootman          # debug nodes
  36 import mailmonitor      # down nodes without pcu
  37 from emailTxt import mailtxt
  38 import sys
  39
  40 class Reboot(object):
  41         def __init__(self, fbnode):
  42                 self.fbnode = fbnode
  43
  44         def _send_pcunotice(self, host):
  45                 args = {}
  46                 args['hostname'] = host
  47                 try:
  48                         args['pcu_id'] = plc.getpcu(host)['pcu_id']
  49                 except:
  50                         args['pcu_id'] = host
  51
  52                 m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
  53                                                                  mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
  54
  55                 loginbase = plc.siteId(host)
  56                 m.send([const.TECHEMAIL % loginbase])
  57
  58         def pcu(self, host):
  59                 # TODO: It should be possible to diagnose the various conditions of
  60                 #               the PCU here, and send different messages as appropriate.
  61                 print "'%s'" % self.fbnode['pcu']
  62                 if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
  63                         self.action = "reboot.reboot('%s')" % host
  64
  65                         pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
  66                         #pflags.resetRecentFlag('pcutried')
  67                         if not pflags.getRecentFlag('pcutried'):
  68                                 try:
  69                                         print "CALLING REBOOT!!!"
  70                                         ret = reboot.reboot(host)
  71
  72                                         pflags.setRecentFlag('pcutried')
  73                                         pflags.save()
  74                                         return ret
  75
  76                                 except Exception,e:
  77                                         print traceback.print_exc(); print e
  78
  79                                         # NOTE: this failure could be an implementation issue on
  80                                         #               our end.  So, extra notices are confusing...
  81                                         # self._send_pcunotice(host)
  82
  83                                         pflags.setRecentFlag('pcufailed')
  84                                         pflags.save()
  85                                         return False
  86
  87                         elif not pflags.getRecentFlag('pcu_rins_tried'):
  88                                 try:
  89                                         # set node to 'rins' boot state.
  90                                         print "CALLING REBOOT +++ RINS"
  91                                         plc.nodeBootState(host, 'rins')
  92                                         ret = reboot.reboot(host)
  93
  94                                         pflags.setRecentFlag('pcu_rins_tried')
  95                                         pflags.save()
  96                                         return ret
  97
  98                                 except Exception,e:
  99                                         print traceback.print_exc(); print e
 100
 101                                         # NOTE: this failure could be an implementation issue on
 102                                         #               our end.  So, extra notices are confusing...
 103                                         # self._send_pcunotice(host)
 104
 105                                         pflags.setRecentFlag('pcufailed')
 106                                         pflags.save()
 107                                         return False
 108                         else:
 109                                 # we've tried the pcu recently, but it didn't work,
 110                                 # so did we send a message about it recently?
 111                                 if not pflags.getRecentFlag('pcumessagesent'):
 112
 113                                         self._send_pcunotice(host)
 114
 115                                         pflags.setRecentFlag('pcumessagesent')
 116                                         pflags.save()
 117
 118                                 # This will result in mail() being called next, to try to
 119                                 # engage the technical contact to take care of it also.
 120                                 print "RETURNING FALSE"
 121                                 return False
 122
 123                 else:
 124                         print "NO PCUOK"
 125                         self.action = "None"
 126                         return False
 127
 128         def mail(self, host):
 129
 130                 # Reset every 4 weeks or so
 131                 pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
 132                 if not pflags.getRecentFlag('endrecord'):
 133                         node_end_record(host)
 134                         pflags.setRecentFlag('endrecord')
 135                         pflags.save()
 136
 137                 # Then in either case, run mailmonitor.reboot()
 138                 self.action = "mailmonitor.reboot('%s')" % host
 139                 try:
 140                         return mailmonitor.reboot(host)
 141                 except Exception, e:
 142                         print traceback.print_exc(); print e
 143                         return False
 144
 145 class RebootDebug(Reboot):
 146
 147         def direct(self, host):
 148                 self.action = "bootman.reboot('%s', config, None)" % host
 149                 return bootman.reboot(host, config, None)
 150
 151 class RebootBoot(Reboot):
 152
 153         def direct(self, host):
 154                 self.action = "bootman.reboot('%s', config, 'reboot')" % host
 155                 return bootman.reboot(host, config, 'reboot')
 156
 157 class RebootDown(Reboot):
 158
 159         def direct(self, host):
 160                 self.action = "None"
 161                 return False    # this always fails, since the node will be down.
 162
 163 def set_node_to_rins(host, fb):
 164
 165         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
 166         record = {'observation' : node[0],
 167                           'model' : 'USER_REQUEST',
 168                           'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
 169                           'time' : time.time()}
 170         l = Log(host, record)
 171
 172         ret = api.UpdateNode(host, {'boot_state' : 'rins'})
 173         if ret:
 174                 # it's nice to see the current status rather than the previous status on the console
 175                 node = api.GetNodes(host)[0]
 176                 print l
 177                 print "%-2d" % (i-1), nodegroup_display(node, fb)
 178                 return l
 179         else:
 180                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
 181                 return None
 182
 183
 184 try:
 185         rebootlog = database.dbLoad("rebootlog")
 186 except:
 187         rebootlog = LogRoll()
 188
 189 parser = parsermodule.getParser(['nodesets'])
 190 parser.set_defaults( timewait=0,
 191                                         skip=0,
 192                                         rins=False,
 193                                         reboot=False,
 194                                         findbad=False,
 195                                         force=False,
 196                                         nosetup=False,
 197                                         verbose=False,
 198                                         quiet=False,
 199                                         )
 200
 201 parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 202                                         help="The select string that must evaluate to true for the node to be considered 'done'")
 203 parser.add_option("", "--findbad", dest="findbad", action="store_true",
 204                                         help="Re-run findbad on the nodes we're going to check before acting.")
 205 parser.add_option("", "--force", dest="force", action="store_true",
 206                                         help="Force action regardless of previous actions/logs.")
 207 parser.add_option("", "--rins", dest="rins", action="store_true",
 208                                         help="Set the boot_state to 'rins' for all nodes.")
 209 parser.add_option("", "--reboot", dest="reboot", action="store_true",
 210                                         help="Actively try to reboot the nodes, keeping a log of actions.")
 211
 212 parser.add_option("", "--verbose", dest="verbose", action="store_true",
 213                                         help="Extra debug output messages.")
 214 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 215                                         help="Do not perform the orginary setup phase.")
 216 parser.add_option("", "--skip", dest="skip",
 217                                         help="Number of machines to skip on the input queue.")
 218 parser.add_option("", "--timewait", dest="timewait",
 219                                         help="Minutes to wait between iterations of 10 nodes.")
 220
 221 parser = parsermodule.getParser(['defaults'], parser)
 222 config = parsermodule.parse_args(parser)
 223
 224 # COLLECT nodegroups, nodes and node lists
 225 if config.nodegroup:
 226         ng = api.GetNodeGroups({'name' : config.nodegroup})
 227         nodelist = api.GetNodes(ng[0]['node_ids'])
 228         hostnames = [ n['hostname'] for n in nodelist ]
 229
 230 if config.site:
 231         site = api.GetSites(config.site)
 232         l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 233         hostnames = [ n['hostname'] for n in l_nodes ]
 234
 235 if config.node or config.nodelist:
 236         if config.node: hostnames = [ config.node ]
 237         else: hostnames = util.file.getListFromFile(config.nodelist)
 238
 239 fbquery = FindbadNodeRecord.get_all_latest()
 240 fb_nodelist = [ n.hostname for n in fbquery ]
 241
 242 if config.nodeselect:
 243         hostnames = node_select(config.nodeselect, fb_nodelist)
 244
 245 if config.findbad:
 246         # rerun findbad with the nodes in the given nodes.
 247         file = "findbad.txt"
 248         util.file.setFileFromList(file, hostnames)
 249         os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
 250         # TODO: shouldn't we reload the node list now?
 251
 252 l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
 253 # commands:
 254 i = 1
 255 count = 1
 256 #print "hosts: %s" % hostnames
 257 for host in hostnames:
 258
 259         #if 'echo' in host or 'hptest-1' in host: continue
 260
 261         try:
 262                 try:
 263                         node = api.GetNodes(host)[0]
 264                 except:
 265                         print traceback.print_exc();
 266                         print "FAILED GETNODES for host: %s" % host
 267                         continue
 268
 269                 print "%-2d" % i, nodegroup_display(node, fb)
 270                 i += 1
 271                 if i-1 <= int(config.skip): continue
 272                 if host in l_blacklist:
 273                         print "%s is blacklisted.  Skipping." % host
 274                         continue
 275
 276                 if config.stopselect:
 277                         dict_query = query_to_dict(config.stopselect)
 278                         fbnode = fb['nodes'][host]['values']
 279                         observed_state = get_current_state(fbnode)
 280
 281                         if verify(dict_query, fbnode) and observed_state != "dbg ":
 282                                 # evaluates to true, therefore skip.
 283                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
 284                                 try:
 285                                         # todo: clean up act_all record here.
 286                                         # todo: send thank you, etc.
 287                                         mailmonitor.reboot(host)
 288                                 except Exception, e:
 289                                         print traceback.print_exc(); print e
 290
 291                                 continue
 292                         #else:
 293                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
 294                                 #sys.exit(1)
 295
 296                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
 297                         print "recently rebooted %s.  skipping... " % host
 298                         continue
 299
 300                 if config.reboot:
 301
 302                         fbnode = fb['nodes'][host]['values']
 303                         observed_state = get_current_state(fbnode)
 304
 305                         if       observed_state == "dbg ":
 306                                 o = RebootDebug(fbnode)
 307
 308                         elif observed_state == "boot" :
 309                                 if config.rins:
 310                                         l = set_node_to_rins(host, fb)
 311                                         if l: rebootlog.add(l)
 312
 313                                 o = RebootBoot(fbnode)
 314
 315                         elif observed_state == "down":
 316                                 if config.rins:
 317                                         l = set_node_to_rins(host, fb)
 318                                         if l: rebootlog.add(l)
 319
 320                                 o = RebootDown(fbnode)
 321
 322
 323                         if o.direct(host):
 324                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
 325                                                   'action' : o.action,
 326                                                   'model' : "none",
 327                                                   'time' : time.time()}
 328                         elif o.pcu(host):
 329                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
 330                                                   'action' : o.action,
 331                                                   'model' : "none",
 332                                                   'time' : time.time()}
 333                         elif o.mail(host):
 334                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
 335                                                   'action' : o.action,
 336                                                   'model' : "none",
 337                                                   'time' : time.time()}
 338                         else:
 339                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
 340                                                   'action' : "log failure",
 341                                                   'model' : "none",
 342                                                   'time' : time.time()}
 343
 344                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
 345                                 args = {}
 346                                 args['hostname'] = host
 347                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
 348                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
 349                                 #m.reset()
 350                                 #m.send(['monitor-list@lists.planet-lab.org'])
 351
 352                         l = Log(host, record)
 353                         print l
 354                         rebootlog.add(l)
 355         except KeyboardInterrupt:
 356                 print "Killed by interrupt"
 357                 sys.exit(0)
 358         except:
 359                 print traceback.print_exc();
 360                 print "Continuing..."
 361
 362         time.sleep(1)
 363         if count % 10 == 0:
 364                 print "Saving rebootlog"
 365                 database.dbDump("rebootlog", rebootlog)
 366                 wait_time = int(config.timewait)
 367                 print "Sleeping %d minutes" % wait_time
 368                 ti = 0
 369                 print "Minutes slept: ",
 370                 sys.stdout.flush()
 371                 while ti < wait_time:
 372                         print "%s" % ti,
 373                         sys.stdout.flush()
 374                         time.sleep(60)
 375                         ti = ti+1
 376
 377         count = count + 1
 378
 379 print "Saving rebootlog"
 380 database.dbDump("rebootlog", rebootlog)