grouprins.py

   1 #!/usr/bin/python
   2
   3 # This script is used to manipulate the operational state of nodes in
   4 # different node groups.  These are basically set operations on nodes via the
   5 # PLC api.
   6 #
   7 # Take the ng name as an argument....
   8 # optionally,
   9 #  * get a list of nodes in the given nodegroup.
  10 #  * set some or all in the set to rins.
  11 #  * restart them all.
  12 #  * do something else to them all.
  13 #
  14
  15 import plc
  16 api = plc.getAuthAPI()
  17
  18 import traceback
  19 import config
  20 import util.file
  21 from optparse import OptionParser
  22
  23 import const
  24 from nodecommon import *
  25 from nodequery import verify,query_to_dict,node_select
  26 import database
  27 from unified_model import *
  28 import os
  29
  30 import time
  31 import parser as parsermodule
  32
  33 from model import *
  34 import bootman          # debug nodes
  35 import reboot           # down nodes without pcu
  36 import mailmonitor      # down nodes with pcu
  37 from emailTxt import mailtxt
  38 #reboot.verbose = 0
  39 import sys
  40
  41 class Reboot(object):
  42         def __init__(self, fbnode):
  43                 self.fbnode = fbnode
  44
  45         def _send_pcunotice(self, host):
  46                 args = {}
  47                 args['hostname'] = host
  48                 try:
  49                         args['pcu_id'] = plc.getpcu(host)['pcu_id']
  50                 except:
  51                         args['pcu_id'] = host
  52
  53                 m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
  54                                                                  mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
  55
  56                 loginbase = plc.siteId(host)
  57                 m.send([const.TECHEMAIL % loginbase])
  58
  59         def pcu(self, host):
  60                 # TODO: It should be possible to diagnose the various conditions of
  61                 #               the PCU here, and send different messages as appropriate.
  62                 print "'%s'" % self.fbnode['pcu']
  63                 if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
  64                         self.action = "reboot.reboot('%s')" % host
  65
  66                         pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
  67                         #pflags.resetRecentFlag('pcutried')
  68                         if not pflags.getRecentFlag('pcutried'):
  69                                 try:
  70                                         node_pf = PersistFlags(host, 1, db='node_persistflags')
  71                                         if  node_pf.checkattr('last_change') and \
  72                                                 node_pf.last_change < time.time() - 60*60*24 and \
  73                                                 node_pf.checkattr('status') and \
  74                                                 node_pf.status != "good":
  75
  76                                                 print "CALLING REBOOT!!!"
  77                                                 ret = reboot.reboot(host)
  78
  79                                                 pflags.setRecentFlag('pcutried')
  80                                                 pflags.save()
  81                                                 return ret
  82                                         else:
  83                                                 return True
  84
  85                                 except Exception,e:
  86                                         email_exception()
  87                                         print traceback.print_exc(); print e
  88
  89                                         # NOTE: this failure could be an implementation issue on
  90                                         #               our end.  So, extra notices are confusing...
  91                                         # self._send_pcunotice(host)
  92
  93                                         pflags.setRecentFlag('pcufailed')
  94                                         pflags.save()
  95                                         return False
  96
  97                         elif not pflags.getRecentFlag('pcu_rins_tried'):
  98                                 try:
  99                                         # NOTE: check that the node has been down for at least a
 100                                         # day before rebooting it.  this avoids false-reboots/rins
 101                                         # from failed node detections. circa 03-12-09
 102                                         node_pf = PersistFlags(host, 1, db='node_persistflags')
 103                                         if  node_pf.checkattr('last_change') and \
 104                                                 node_pf.last_change < time.time() - 60*60*24 and \
 105                                                 node_pf.checkattr('status') and \
 106                                                 node_pf.status != "good":
 107
 108                                                 # set node to 'rins' boot state.
 109                                                 print "CALLING REBOOT +++ RINS"
 110                                                 plc.nodeBootState(host, 'reinstall')
 111                                                 ret = reboot.reboot(host)
 112
 113                                                 pflags.setRecentFlag('pcu_rins_tried')
 114                                                 pflags.save()
 115                                                 return ret
 116
 117                                         else:
 118                                                 return True
 119
 120                                 except Exception,e:
 121                                         email_exception()
 122                                         print traceback.print_exc(); print e
 123
 124                                         # NOTE: this failure could be an implementation issue on
 125                                         #               our end.  So, extra notices are confusing...
 126                                         # self._send_pcunotice(host)
 127
 128                                         pflags.setRecentFlag('pcufailed')
 129                                         pflags.save()
 130                                         return False
 131                         else:
 132                                 # we've tried the pcu recently, but it didn't work,
 133                                 # so did we send a message about it recently?
 134                                 if not pflags.getRecentFlag('pcumessagesent'):
 135
 136                                         self._send_pcunotice(host)
 137
 138                                         pflags.setRecentFlag('pcumessagesent')
 139                                         pflags.save()
 140
 141                                 # This will result in mail() being called next, to try to
 142                                 # engage the technical contact to take care of it also.
 143                                 print "RETURNING FALSE"
 144                                 return False
 145
 146                 else:
 147                         print "NO PCUOK"
 148                         self.action = "None"
 149                         return False
 150
 151         def mail(self, host):
 152
 153                 # Reset every 4 weeks or so
 154                 pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
 155                 if not pflags.getRecentFlag('endrecord'):
 156                         node_end_record(host)
 157                         pflags.setRecentFlag('endrecord')
 158                         pflags.save()
 159
 160                 # Then in either case, run mailmonitor.reboot()
 161                 self.action = "mailmonitor.reboot('%s')" % host
 162                 try:
 163                         return mailmonitor.reboot(host)
 164                 except Exception, e:
 165                         email_exception(host)
 166                         print traceback.print_exc(); print e
 167                         return False
 168
 169 class RebootDebug(Reboot):
 170
 171         def direct(self, host):
 172                 self.action = "bootman.reboot('%s', config, None)" % host
 173                 return bootman.reboot(host, config, None)
 174
 175 class RebootBoot(Reboot):
 176
 177         def direct(self, host):
 178                 self.action = "bootman.reboot('%s', config, 'reboot')" % host
 179                 return bootman.reboot(host, config, 'reboot')
 180
 181 class RebootDown(Reboot):
 182
 183         def direct(self, host):
 184                 self.action = "None"
 185                 return False    # this always fails, since the node will be down.
 186
 187 def set_node_to_rins(host, fb):
 188
 189         node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
 190         record = {'observation' : node[0],
 191                           'model' : 'USER_REQUEST',
 192                           'action' : 'api.UpdateNode(%s, {"boot_state" : "reinstall"})' % host,
 193                           'time' : time.time()}
 194         l = Log(host, record)
 195
 196         ret = api.UpdateNode(host, {'boot_state' : 'reinstall'})
 197         if ret:
 198                 # it's nice to see the current status rather than the previous status on the console
 199                 node = api.GetNodes(host)[0]
 200                 print l
 201                 print "%-2d" % (i-1), nodegroup_display(node, fb)
 202                 return l
 203         else:
 204                 print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
 205                 return None
 206
 207
 208 try:
 209         rebootlog = database.dbLoad("rebootlog")
 210 except:
 211         rebootlog = LogRoll()
 212
 213 parser = parsermodule.getParser(['nodesets'])
 214 parser.set_defaults( timewait=0,
 215                                         skip=0,
 216                                         rins=False,
 217                                         reboot=False,
 218                                         findbad=False,
 219                                         force=False,
 220                                         nosetup=False,
 221                                         verbose=False,
 222                                         quiet=False,
 223                                         )
 224
 225 parser.add_option("", "--stopselect", dest="stopselect", metavar="",
 226                                         help="The select string that must evaluate to true for the node to be considered 'done'")
 227 parser.add_option("", "--findbad", dest="findbad", action="store_true",
 228                                         help="Re-run findbad on the nodes we're going to check before acting.")
 229 parser.add_option("", "--force", dest="force", action="store_true",
 230                                         help="Force action regardless of previous actions/logs.")
 231 parser.add_option("", "--rins", dest="rins", action="store_true",
 232                                         help="Set the boot_state to 'rins' for all nodes.")
 233 parser.add_option("", "--reboot", dest="reboot", action="store_true",
 234                                         help="Actively try to reboot the nodes, keeping a log of actions.")
 235
 236 parser.add_option("", "--verbose", dest="verbose", action="store_true",
 237                                         help="Extra debug output messages.")
 238 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 239                                         help="Do not perform the orginary setup phase.")
 240 parser.add_option("", "--skip", dest="skip",
 241                                         help="Number of machines to skip on the input queue.")
 242 parser.add_option("", "--timewait", dest="timewait",
 243                                         help="Minutes to wait between iterations of 10 nodes.")
 244
 245 parser = parsermodule.getParser(['defaults'], parser)
 246 config = parsermodule.parse_args(parser)
 247
 248 # COLLECT nodegroups, nodes and node lists
 249 if config.nodegroup:
 250         ng = api.GetNodeGroups({'groupname' : config.nodegroup})
 251         nodelist = api.GetNodes(ng[0]['node_ids'])
 252         hostnames = [ n['hostname'] for n in nodelist ]
 253
 254 if config.site:
 255         site = api.GetSites(config.site)
 256         l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
 257         hostnames = [ n['hostname'] for n in l_nodes ]
 258
 259 if config.node or config.nodelist:
 260         if config.node: hostnames = [ config.node ]
 261         else: hostnames = util.file.getListFromFile(config.nodelist)
 262
 263 fb = database.dbLoad("findbad")
 264
 265 if config.nodeselect:
 266         hostnames = node_select(config.nodeselect, fb['nodes'].keys(), fb)
 267
 268 if config.findbad:
 269         # rerun findbad with the nodes in the given nodes.
 270         file = "findbad.txt"
 271         util.file.setFileFromList(file, hostnames)
 272         os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
 273         # TODO: shouldn't we reload the node list now?
 274
 275 l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
 276 # commands:
 277 i = 1
 278 count = 1
 279 #print "hosts: %s" % hostnames
 280 for host in hostnames:
 281
 282         #if 'echo' in host or 'hptest-1' in host: continue
 283
 284         try:
 285                 try:
 286                         node = api.GetNodes(host)[0]
 287                 except:
 288                         email_exception()
 289                         print traceback.print_exc();
 290                         print "FAILED GETNODES for host: %s" % host
 291                         continue
 292
 293                 print "%-2d" % i, nodegroup_display(node, fb)
 294                 i += 1
 295                 if i-1 <= int(config.skip): continue
 296                 if host in l_blacklist:
 297                         print "%s is blacklisted.  Skipping." % host
 298                         continue
 299
 300                 if config.stopselect:
 301                         dict_query = query_to_dict(config.stopselect)
 302                         fbnode = fb['nodes'][host]['values']
 303                         observed_state = get_current_state(fbnode)
 304
 305                         if verify(dict_query, fbnode) and observed_state != "dbg ":
 306                                 # evaluates to true, therefore skip.
 307                                 print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
 308                                 try:
 309                                         # todo: clean up act_all record here.
 310                                         # todo: send thank you, etc.
 311                                         mailmonitor.reboot(host)
 312                                 except Exception, e:
 313                                         email_exception()
 314                                         print traceback.print_exc(); print e
 315
 316                                 continue
 317                         #else:
 318                                 #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
 319                                 #sys.exit(1)
 320
 321                 if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
 322                         print "recently rebooted %s.  skipping... " % host
 323                         continue
 324
 325                 if config.reboot:
 326
 327                         fbnode = fb['nodes'][host]['values']
 328                         observed_state = get_current_state(fbnode)
 329
 330                         if       observed_state == "dbg ":
 331                                 o = RebootDebug(fbnode)
 332
 333                         elif observed_state == "boot" :
 334                                 if config.rins:
 335                                         l = set_node_to_rins(host, fb)
 336                                         if l: rebootlog.add(l)
 337
 338                                 o = RebootBoot(fbnode)
 339
 340                         elif observed_state == "down":
 341                                 if config.rins:
 342                                         l = set_node_to_rins(host, fb)
 343                                         if l: rebootlog.add(l)
 344
 345                                 o = RebootDown(fbnode)
 346
 347
 348                         if o.direct(host):
 349                                 record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
 350                                                   'action' : o.action,
 351                                                   'model' : "none",
 352                                                   'time' : time.time()}
 353                         elif o.pcu(host):
 354                                 record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
 355                                                   'action' : o.action,
 356                                                   'model' : "none",
 357                                                   'time' : time.time()}
 358                         elif o.mail(host):
 359                                 record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
 360                                                   'action' : o.action,
 361                                                   'model' : "none",
 362                                                   'time' : time.time()}
 363                         else:
 364                                 record = {'observation' : "REBOOT_FAILED: %s" %  observed_state,
 365                                                   'action' : "log failure",
 366                                                   'model' : "none",
 367                                                   'time' : time.time()}
 368
 369                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
 370                                 args = {}
 371                                 args['hostname'] = host
 372                                 #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
 373                                 #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
 374                                 #m.reset()
 375                                 #m.send(['monitor-list@lists.planet-lab.org'])
 376
 377                         l = Log(host, record)
 378                         print l
 379                         rebootlog.add(l)
 380         except KeyboardInterrupt:
 381                 print "Killed by interrupt"
 382                 sys.exit(0)
 383         except:
 384                 email_exception()
 385                 print traceback.print_exc();
 386                 print "Continuing..."
 387
 388         time.sleep(1)
 389         if count % 10 == 0:
 390                 print "Saving rebootlog"
 391                 database.dbDump("rebootlog", rebootlog)
 392                 wait_time = int(config.timewait)
 393                 print "Sleeping %d minutes" % wait_time
 394                 ti = 0
 395                 print "Minutes slept: ",
 396                 sys.stdout.flush()
 397                 while ti < wait_time:
 398                         print "%s" % ti,
 399                         sys.stdout.flush()
 400                         time.sleep(60)
 401                         ti = ti+1
 402
 403         count = count + 1
 404
 405 print "Saving rebootlog"
 406 database.dbDump("rebootlog", rebootlog)