grouprins.py

   1 #!/usr/bin/python
   2
# This script is used to manipulate the operational state of nodes in
# different node groups.  These are basically set operations on nodes via the
# PLC api.
#
# Take the nodegroup (ng) name as an argument, then optionally:
#  * get a list of nodes in the given nodegroup.
#  * set some or all nodes in the set to the 'rins' boot state.
#  * restart them all.
#  * do something else to them all.
#
  14
  15 from monitor import config
  16 from monitor import util
  17 from monitor import const
  18 from monitor import database
  19 from monitor import parser as parsermodule
  20 from monitor import reboot
  21 from monitor.database.info.model import *
  22 from monitor.wrapper import plc
  23 api = plc.getAuthAPI()
  24
  25 import traceback
  26 from optparse import OptionParser
  27
  28 from monitor.common import *
  29 from nodequery import verify,query_to_dict,node_select
  30 from monitor.model import *
  31 import os
  32
  33 import time
  34
  35 import bootman          # debug nodes
  36 import mailmonitor      # down nodes without pcu
  37 from monitor.wrapper.emailTxt import mailtxt
  38 import sys
  39
  40 class Reboot(object):
  41         def __init__(self, fbnode):
  42                 self.fbnode = fbnode
  43
  44         def _send_pcunotice(self, host):
  45                 args = {}
  46                 args['hostname'] = host
  47                 try:
  48                         args['pcu_id'] = plc.getpcu(host)['pcu_id']
  49                 except:
  50                         args['pcu_id'] = host
  51
  52                 m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
  53                                                                  mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
  54
  55                 loginbase = plc.siteId(host)
  56                 m.send([const.TECHEMAIL % loginbase])
  57
  58         def pcu(self, host):
  59                 # TODO: It should be possible to diagnose the various conditions of
  60                 #               the PCU here, and send different messages as appropriate.
  61                 print "'%s'" % self.fbnode['pcu']
  62                 if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
  63                         self.action = "reboot.reboot('%s')" % host
  64
  65                         pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
  66                         #pflags.resetRecentFlag('pcutried')
  67                         if not pflags.getRecentFlag('pcutried'):
  68                                 try:
  69                                         print "CALLING REBOOT!!!"
  70                                         ret = reboot.reboot(host)
  71
  72                                         pflags.setRecentFlag('pcutried')
  73                                         pflags.save()
  74                                         return ret
  75
  76                                 except Exception,e:
  77                                         email_exception()
  78                                         print traceback.print_exc(); print e
  79
  80                                         # NOTE: this failure could be an implementation issue on
  81                                         #               our end.  So, extra notices are confusing...
  82                                         # self._send_pcunotice(host)
  83
  84                                         pflags.setRecentFlag('pcufailed')
  85                                         pflags.save()
  86                                         return False
  87
  88                         elif not pflags.getRecentFlag('pcu_rins_tried'):
  89                                 try:
  90                                         # set node to 'rins' boot state.
  91                                         print "CALLING REBOOT +++ RINS"
  92                                         plc.nodeBootState(host, 'rins')
  93                                         ret = reboot.reboot(host)
  94
  95                                         pflags.setRecentFlag('pcu_rins_tried')
  96                                         pflags.save()
  97                                         return ret
  98
  99                                 except Exception,e:
 100                                         email_exception()
 101                                         print traceback.print_exc(); print e
 102
 103                                         # NOTE: this failure could be an implementation issue on
 104                                         #               our end.  So, extra notices are confusing...
 105                                         # self._send_pcunotice(host)
 106
 107                                         pflags.setRecentFlag('pcufailed')
 108                                         pflags.save()
 109                                         return False
 110                         else:
 111                                 # we've tried the pcu recently, but it didn't work,
 112                                 # so did we send a message about it recently?
 113                                 if not pflags.getRecentFlag('pcumessagesent'):
 114
 115                                         self._send_pcunotice(host)
 116
 117                                         pflags.setRecentFlag('pcumessagesent')
 118                                         pflags.save()
 119
 120                                 # This will result in mail() being called next, to try to
 121                                 # engage the technical contact to take care of it also.
 122                                 print "RETURNING FALSE"
 123                                 return False
 124
 125                 else:
 126                         print "NO PCUOK"
 127                         self.action = "None"
 128                         return False
 129
 130         def mail(self, host):
 131
 132                 # Reset every 4 weeks or so
 133                 pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
 134                 if not pflags.getRecentFlag('endrecord'):
 135                         node_end_record(host)
 136                         pflags.setRecentFlag('endrecord')
 137                         pflags.save()
 138
 139                 # Then in either case, run mailmonitor.reboot()
 140                 self.action = "mailmonitor.reboot('%s')" % host
 141                 try:
 142                         return mailmonitor.reboot(host)
 143                 except Exception, e:
 144                         email_exception(host)
 145                         print traceback.print_exc(); print e
 146                         return False
 147
 148 class RebootDebug(Reboot):
 149
 150         def direct(self, host):
 151                 self.action = "bootman.reboot('%s', config, None)" % host
 152                 return bootman.reboot(host, config, None)
 153
 154 class RebootBoot(Reboot):
 155
 156         def direct(self, host):
 157                 self.action = "bootman.reboot('%s', config, 'reboot')" % host
 158                 return bootman.reboot(host, config, 'reboot')
 159
 160 class RebootDown(Reboot):
 161
 162         def direct(self, host):
 163                 self.action = "None"
 164                 return False    # this always fails, since the node will be down.
 165
 166 def set_node_to_rins(host, fb):
 167
 168         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
 169         record = {'observation' : node[0],
 170                           'model' : 'USER_REQUEST',
 171                           'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
 172                           'time' : time.time()}
 173         l = Log(host, record)
 174
 175         ret = api.UpdateNode(host, {'boot_state' : 'rins'})
 176         if ret:
 177                 # it's nice to see the current status rather than the previous status on the console
 178                 node = api.GetNodes(host)[0]
 179                 print l
 180                 print "%-2d" % (i-1), nodegroup_display(node, fb)
 181                 return l
 182         else:
 183                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
 184                 return None
 185
 186
 187 try:
 188         rebootlog = database.dbLoad("rebootlog")
 189 except:
 190         rebootlog = LogRoll()
 191
 192 parser = parsermodule.getParser(['nodesets'])
 193 parser.set_defaults( timewait=0,
 194                                         skip=0,
 195                                         rins=False,
 196                                         reboot=False,
 197                                         findbad=False,
 198                                         force=False,
 199                                         nosetup=False,
 200                                         verbose=False,
 201                                         quiet=False,
 202                                         )
 203
 204 parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 205                                         help="The select string that must evaluate to true for the node to be considered 'done'")
 206 parser.add_option("", "--findbad", dest="findbad", action="store_true",
 207                                         help="Re-run findbad on the nodes we're going to check before acting.")
 208 parser.add_option("", "--force", dest="force", action="store_true",
 209                                         help="Force action regardless of previous actions/logs.")
 210 parser.add_option("", "--rins", dest="rins", action="store_true",
 211                                         help="Set the boot_state to 'rins' for all nodes.")
 212 parser.add_option("", "--reboot", dest="reboot", action="store_true",
 213                                         help="Actively try to reboot the nodes, keeping a log of actions.")
 214
 215 parser.add_option("", "--verbose", dest="verbose", action="store_true",
 216                                         help="Extra debug output messages.")
 217 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 218                                         help="Do not perform the orginary setup phase.")
 219 parser.add_option("", "--skip", dest="skip",
 220                                         help="Number of machines to skip on the input queue.")
 221 parser.add_option("", "--timewait", dest="timewait",
 222                                         help="Minutes to wait between iterations of 10 nodes.")
 223
 224 parser = parsermodule.getParser(['defaults'], parser)
 225 config = parsermodule.parse_args(parser)
 226
 227 # COLLECT nodegroups, nodes and node lists
 228 if config.nodegroup:
 229         ng = api.GetNodeGroups({'name' : config.nodegroup})
 230         nodelist = api.GetNodes(ng[0]['node_ids'])
 231         hostnames = [ n['hostname'] for n in nodelist ]
 232
 233 if config.site:
 234         site = api.GetSites(config.site)
 235         l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 236         hostnames = [ n['hostname'] for n in l_nodes ]
 237
 238 if config.node or config.nodelist:
 239         if config.node: hostnames = [ config.node ]
 240         else: hostnames = util.file.getListFromFile(config.nodelist)
 241
 242 fbquery = FindbadNodeRecord.get_all_latest()
 243 fb_nodelist = [ n.hostname for n in fbquery ]
 244
 245 if config.nodeselect:
 246         hostnames = node_select(config.nodeselect, fb_nodelist)
 247
 248 if config.findbad:
 249         # rerun findbad with the nodes in the given nodes.
 250         file = "findbad.txt"
 251         util.file.setFileFromList(file, hostnames)
 252         os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
 253         # TODO: shouldn't we reload the node list now?
 254
 255 q_blacklist = BlacklistRecord.query.all()
 256 l_blacklist = [ n.hostname for n in q_blacklist ]
 257 # commands:
 258 i = 1
 259 count = 1
 260 #print "hosts: %s" % hostnames
 261 for host in hostnames:
 262
 263         #if 'echo' in host or 'hptest-1' in host: continue
 264
 265         try:
 266                 try:
 267                         node = api.GetNodes(host)[0]
 268                 except:
 269                         email_exception()
 270                         print traceback.print_exc();
 271                         print "FAILED GETNODES for host: %s" % host
 272                         continue
 273
 274                 print "%-2d" % i, nodegroup_display(node, fb)
 275                 i += 1
 276                 if i-1 <= int(config.skip): continue
 277                 if host in l_blacklist:
 278                         print "%s is blacklisted.  Skipping." % host
 279                         continue
 280
 281                 if config.stopselect:
 282                         dict_query = query_to_dict(config.stopselect)
 283                         fbnode = fb['nodes'][host]['values']
 284                         observed_state = get_current_state(fbnode)
 285
 286                         if verify(dict_query, fbnode) and observed_state != "dbg ":
 287                                 # evaluates to true, therefore skip.
 288                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
 289                                 try:
 290                                         # todo: clean up act_all record here.
 291                                         # todo: send thank you, etc.
 292                                         mailmonitor.reboot(host)
 293                                 except Exception, e:
 294                                         email_exception()
 295                                         print traceback.print_exc(); print e
 296
 297                                 continue
 298                         #else:
 299                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
 300                                 #sys.exit(1)
 301
 302                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
 303                         print "recently rebooted %s.  skipping... " % host
 304                         continue
 305
 306                 if config.reboot:
 307
 308                         fbnode = fb['nodes'][host]['values']
 309                         observed_state = get_current_state(fbnode)
 310
 311                         if       observed_state == "dbg ":
 312                                 o = RebootDebug(fbnode)
 313
 314                         elif observed_state == "boot" :
 315                                 if config.rins:
 316                                         l = set_node_to_rins(host, fb)
 317                                         if l: rebootlog.add(l)
 318
 319                                 o = RebootBoot(fbnode)
 320
 321                         elif observed_state == "down":
 322                                 if config.rins:
 323                                         l = set_node_to_rins(host, fb)
 324                                         if l: rebootlog.add(l)
 325
 326                                 o = RebootDown(fbnode)
 327
 328
 329                         if o.direct(host):
 330                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
 331                                                   'action' : o.action,
 332                                                   'model' : "none",
 333                                                   'time' : time.time()}
 334                         elif o.pcu(host):
 335                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
 336                                                   'action' : o.action,
 337                                                   'model' : "none",
 338                                                   'time' : time.time()}
 339                         elif o.mail(host):
 340                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
 341                                                   'action' : o.action,
 342                                                   'model' : "none",
 343                                                   'time' : time.time()}
 344                         else:
 345                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
 346                                                   'action' : "log failure",
 347                                                   'model' : "none",
 348                                                   'time' : time.time()}
 349
 350                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
 351                                 args = {}
 352                                 args['hostname'] = host
 353                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
 354                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
 355                                 #m.reset()
 356                                 #m.send(['monitor-list@lists.planet-lab.org'])
 357
 358                         l = Log(host, record)
 359                         print l
 360                         rebootlog.add(l)
 361         except KeyboardInterrupt:
 362                 print "Killed by interrupt"
 363                 sys.exit(0)
 364         except:
 365                 email_exception()
 366                 print traceback.print_exc();
 367                 print "Continuing..."
 368
 369         time.sleep(1)
 370         if count % 10 == 0:
 371                 print "Saving rebootlog"
 372                 database.dbDump("rebootlog", rebootlog)
 373                 wait_time = int(config.timewait)
 374                 print "Sleeping %d minutes" % wait_time
 375                 ti = 0
 376                 print "Minutes slept: ",
 377                 sys.stdout.flush()
 378                 while ti < wait_time:
 379                         print "%s" % ti,
 380                         sys.stdout.flush()
 381                         time.sleep(60)
 382                         ti = ti+1
 383
 384         count = count + 1
 385
 386 print "Saving rebootlog"
 387 database.dbDump("rebootlog", rebootlog)