%package pcucontrol
summary: pcu controls for monitor and plcapi
group: applications/system
-requires: python
+Requires: python
+Requires: OpenIPMI-tools
+Requires: openssh-clients
+Requires: perl-libwww-perl
+Requires: perl-IO-Socket-SSL
+Requires: curl
%description pcucontrol
both monitor and the plcapi use a set of common commands to reboot machines
def main():
+ loginbase = False
+
try:
- longopts = ["delete=", "help"]
- (opts, argv) = getopt.getopt(sys.argv[1:], "d:h", longopts)
+ longopts = ["delete=", "loginbase", "help"]
+ (opts, argv) = getopt.getopt(sys.argv[1:], "d:lh", longopts)
except getopt.GetoptError, err:
print "Error: " + err.msg
sys.exit(1)
- blacklist = BlacklistRecord.query.all()
- hostnames = [ h.hostname for h in blacklist ]
+ hostnames_q = BlacklistRecord.getHostnameBlacklist()
+ loginbases_q = BlacklistRecord.getLoginbaseBlacklist()
+ hostnames = [ h.hostname for h in hostnames_q ]
+ loginbases = [ h.loginbase for h in loginbases_q ]
for (opt, optval) in opts:
if opt in ["-d", "--delete"]:
i = optval
bl = BlacklistRecord.get_by(hostname=i)
bl.delete()
+ elif opt in ["-l", "--loginbase"]:
+ loginbase = True
else:
usage()
sys.exit(0)
i_cnt = 0
- for i in blacklist:
- print i.hostname
- i_cnt += 1
+ if not loginbase:
+ for i in hostnames:
+ print i
+ i_cnt += 1
+ else:
+ for i in loginbases:
+ print i
+ i_cnt += 1
+
while 1:
if not line:
break
line = line.strip()
- if line not in hostnames:
- bl = BlacklistRecord(hostname=line)
+ if line not in hostnames and line not in loginbases:
+ if loginbase:
+ bl = BlacklistRecord(loginbase=line)
+ else:
+ bl = BlacklistRecord(hostname=line)
bl.flush()
i_cnt += 1
session.flush()
- print "Total %d nodes in blacklist" % (i_cnt)
+ if loginbase:
+ print "Total %d loginbases in blacklist" % (i_cnt)
+ else:
+ print "Total %d nodes in blacklist" % (i_cnt)
if __name__ == '__main__':
import os
class BlacklistRecord(Entity):
date_created = Field(DateTime,default=datetime.now)
- hostname = Field(String,default=None, primary_key=True)
+ hostname = Field(String,default=None)
+ loginbase = Field(String,default=None)
expires = Field(Integer,default=0) # seconds plus
acts_as_versioned(['hostname'])
+ @classmethod
+ def getLoginbaseBlacklist(cls):
+ # TODO: need to sort on 'round' since actions will not be globally sync'd.
+ return cls.query.filter(cls.loginbase!=None).order_by(cls.loginbase.desc())
+
+ @classmethod
+ def getHostnameBlacklist(cls):
+ # TODO: need to sort on 'round' since actions will not be globally sync'd.
+ return cls.query.filter(cls.hostname!=None).order_by(cls.hostname.desc())
+
def neverExpires(self):
if self.expires == 0:
return True
with PlanetLab.
""")
- pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""",
+ pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
registered for %(hostname)s, but could not for some reason.
Thank you very much for your help,
-- PlanetLab Central (support@planet-lab.org)
""")
- online_notice=("""Host %(hostname)s is online""",
+ online_notice=("""MONTEST: Host %(hostname)s is online""",
"""
This notice is simply to let you know that:
%(hostname)s
is online and operational. Thank you very much for your help!
""")
- test_notice=("""Host %(hostname)s is testing""",
+ test_notice=("""MONTEST: Host %(hostname)s is testing""",
"""
This notice is simply to test whether notices work.
%(hostname)s
Thank you very much for your help!
""")
- retry_bootman=("""Running BootManager on %(hostname)s""",
+ retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""",
"""
This notice is simply to let you know that:
%(hostname)s
appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py.
If any action is needed from you, you will recieve additional notices. Thank you!
""")
- down_notice=("""Host %(hostname)s is down""",
+ down_notice=("""MONTEST: Host %(hostname)s is down""",
"""
This notice is simply to let you know that:
%(hostname)s
is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help!
""")
- clear_penalty=("""All penalties have been cleared from site %(loginbase)s""",
+ clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
"""
This notice is to let you know that any penalties previously applied to your site have
been removed: %(penalty_level)s.
2+ - all existing slices will be disabled.
""")
- increase_penalty=("""Penalty increased for site %(loginbase)s""",
+ increase_penalty=("""MONTEST: Penalty increased for site %(loginbase)s""",
"""
This notice is to let you know that the penalty applied to your site has
increased: %(penalty_level)s.
2+ - all existing slices will be disabled.
""")
- newbootcd_notice=(""" Host %(hostname)s needs a new BootImage""", """
+ newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """
As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD:
%(hostname)s
-- PlanetLab Central (support@planet-lab.org)
""")
- newalphacd_notice=(""" New Boot Images for %(hostname)s""",
+ newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
%(hostname)s
pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
pcudown=[pcudown_one, pcudown_one, pcudown_one]
- unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
+ unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
- minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
+ minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
%(bmlog)s
""" )
- baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""",
+ baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
%(bmlog)s
""")
- nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
+ nodeconfig_notice=("""MONTEST: Please Update Configuration file for PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
""")
- baddns_notice=("""Planetlab node down: broken DNS configuration for %(hostname)s""",
+ baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
%(hostname)s
def attemptReboot(self, hostname):
print "attempting PCU reboot of %s" % hostname
- ret = reboot.reboot_str(hostname)
+ err = ""
+ try:
+ ret = reboot.reboot_str(hostname)
+ except Exception, e:
+ err = traceback.format_exc()
+ ret = str(e)
+
if ret == 0 or ret == "0":
ret = ""
+
act = ActionRecord(loginbase=self.db.loginbase,
hostname=hostname,
action='reboot',
action_type='first_try_reboot',
- error_string=ret)
+ error_string=err)
def logic():
# send down node notice
sitehist.sendMessage('down_notice', hostname=host)
- print "send message for host %s offline" % host
+ print "send message for host %s down" % host
pass
node_count = node_count + 1
+ session.flush()
for site in sitenames:
sitehist = SiteInterface.get_or_make(loginbase=site)
site_count = site_count + 1
- session.flush()
+ session.flush()
+ session.flush()
return
main(hostnames, sitenames)
except KeyboardInterrupt:
print "Killed by interrupt"
+ session.flush()
sys.exit(0)
except:
#email_exception()
print traceback.print_exc();
- print "Continuing..."
+ print "fail all..."
def check_site_state(rec, sitehist):
- if sitehist.new and sitehist.status != 'new':
+ if sitehist.new and sitehist.status not in ['new', 'online', 'good']:
sitehist.status = 'new'
+ sitehist.penalty_applied = True # because new sites are disabled by default, i.e. have a penalty.
sitehist.last_changed = datetime.now()
- if not sitehist.new:
+ if sitehist.nodes_up >= MINUP:
- if sitehist.nodes_up >= MINUP:
+ if sitehist.status != 'online' and sitehist.status != 'good':
+ sitehist.last_changed = datetime.now()
- if sitehist.status != 'online' and sitehist.status != 'good':
- sitehist.last_changed = datetime.now()
+ if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online':
+ print "changed status from %s to online" % sitehist.status
+ sitehist.status = 'online'
- if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online':
- print "changed status from %s to online" % sitehist.status
- sitehist.status = 'online'
+ if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good':
+ print "changed status from %s to good" % sitehist.status
+ sitehist.status = 'good'
- if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good':
- print "changed status from %s to good" % sitehist.status
- sitehist.status = 'good'
+ if not sitehist.new:
- else: # sitehist.nodes_up < MINUP:
-
- if sitehist.status != 'offline' and sitehist.status != 'down':
- sitehist.last_changed = datetime.now()
+ if sitehist.status != 'offline' and sitehist.status != 'down':
+ sitehist.last_changed = datetime.now()
- if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline':
- print "changed status from %s to offline" % sitehist.status
- sitehist.status = 'offline'
+ if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline':
+ print "changed status from %s to offline" % sitehist.status
+ sitehist.status = 'offline'
- if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down':
- print "changed status from %s to down" % sitehist.status
- sitehist.status = 'down'
+ if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down':
+ print "changed status from %s to down" % sitehist.status
+ sitehist.status = 'down'
def checkAndRecordState(l_sites, l_plcsites):
count = 0
prep_node_for_display(node)
nodequery += [node]
- return self.pcuview(None, hostname) # dict(nodequery=nodequery)
+ return self.pcuview(None, None, hostname) # dict(nodequery=nodequery)
@expose(template="monitorweb.templates.nodelist")
def node(self, filter='boot'):
def nodeaction_handler(self, tg_exceptions=None):
"""Handle any kind of error."""
+ print "NODEACTION_HANDLER------------------"
if 'pcuid' in request.params:
pcuid = request.params['pcuid']
return self.pcuview(None, pcuid, **dict(exceptions=tg_exceptions))
def nodeaction(self, **data):
+ print "NODEACTION------------------"
for item in data.keys():
print "%s %s" % ( item, data[item] )
ret = reboot.reboot_str(str(hostname))
print ret
if ret: raise RuntimeError("Error using PCU: " + str(ret))
- flash("Reboot appeared to work. All at most 5 minutes. Run ExternalScan to check current status.")
+ flash("Reboot appeared to work. Allow at most 5 minutes. Then run ExternalScan to check current status.")
elif action == "ExternalScan":
scanapi.externalprobe(str(hostname))
@expose(template="monitorweb.templates.pcuview")
@exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
+ print "PCUVIEW------------------"
session.clear()
sitequery=[]
pcuquery=[]
\r
#header {\r
height: 40px;\r
- width: 780px;\r
+ /*width: 780px;*/\r
/*background: blue URL('../images/header_inner.png') no-repeat;*/\r
- border-left: 1px solid #aaa;\r
- border-right: 1px solid #aaa;\r
+ /*border-left: 1px solid #aaa;*/\r
+ /*border-right: 1px solid #aaa;*/\r
margin: 0 auto 0 auto;\r
text-align: center;\r
font-size: 180%;\r
}\r
\r
#footer {\r
- border: 1px solid #aaa;\r
+ /*border: 1px solid #aaa;*/\r
border-top: 0px none;\r
color: #999;\r
background-color: white;\r
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns:py="http://purl.org/kid/ns#">
<head>
- <title>App Name - ${page_title}</title>
+ <title>${page_title}</title>
<link href="static/css/style.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="tg_js/MochiKit.js"></script>
<script type="text/javascript" src="static/javascript/sortable_tables.js"></script>
</head>
<body>
- <div id="header">Monitor : ${page_title}</div>
<table valign="top" border="1" bgcolor="white" align="center" width="700px">
+ <tr> <td> <div id="header">${page_title}</div> </td> </tr>
<tr>
<td>
<table id="nps-table" width="100%">
<th><a href="${link('site')}">Sites</a></th>
<th><a href="${link('pcu')}">PCUs</a></th>
<th><a href="${link('node')}">Nodes</a></th>
- <th><a href="${link('action')}">Actions</a></th>
+ <th><a href="">Actions</a></th>
</tr>
</thead>
<tbody>
</table>
</td>
</tr>
+ <tr> <td> <div id="footer">Copyright © 2007-2008 The Trustees of Princeton University</div> </td> </tr>
</table>
- <div id="footer">Copywrite © 2007-2008 The Trustees of Princeton University</div>
</body>
</html>