e75e253f4e566df5118b8b4aa6b2505388a4a95e
[plstackapi.git] / planetstack / hpc_wizard / planetstack_analytics.py
1 from bigquery_analytics import BigQueryAnalytics
2 import datetime
3 import re
4 import os
5 import sys
6 import json
7 import traceback
8 import urllib2
9
10 if os.path.exists("/home/smbaker/projects/vicci/plstackapi/planetstack"):
11     sys.path.append("/home/smbaker/projects/vicci/plstackapi/planetstack")
12 else:
13     sys.path.append("/opt/planetstack")
14
15 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "planetstack.settings")
16 from django import db
17 from django.db import connection
18 from core.models import Slice, Sliver, ServiceClass, Reservation, Tag, Network, User, Node, Image, Deployment, Site, NetworkTemplate, NetworkSlice, Service
19
20 BLUE_LOAD=5000000
21 RED_LOAD=15000000
22
class PlanetStackAnalytics(BigQueryAnalytics):
    """BigQuery-backed analytics for PlanetStack.

    Composes SQL queries over the last hour of monitoring events and
    renders the results in the formats expected by the various consumers
    (json dicts/arrays, html tables, google-visualization charts, and the
    HPC dashboard).
    """

    def __init__(self, tableName="demoevents"):
        # Delegate table/connection setup to the generic BigQuery helper.
        BigQueryAnalytics.__init__(self, tableName)

    def service_to_sliceNames(self, serviceName):
        """Return the names of all slices attached to the named service."""
        service = Service.objects.get(name=serviceName)
        try:
            slices = service.slices.all()
        except AttributeError:
            # BUG in data model -- Slice.service has related name 'service'
            # and it should be 'slices'
            slices = service.service.all()

        return [slice.name for slice in slices]

    def compose_query(self, slice=None, site=None, node=None, service=None, timeBucket="60", avg=None, sum=None, count=None, computed=None, groupBy=["Time"], orderBy=["Time"], tableName="demoevents"):
        """Compose a BigQuery SQL string over the last hour of events.

        Arguments:
            slice, site, node, service -- optional filters, ANDed together;
                a service expands to an OR over its slices' names
            timeBucket -- bucket size in seconds for the Time column
            avg, sum, count -- lists of raw field names to aggregate
            computed -- list of "a/b" or "a*b" ratio fields, emitted as
                SUM(a)/SUM(b) (resp. "*") over a per-hostname subquery
            groupBy, orderBy -- column name lists for GROUP BY / ORDER BY
            tableName -- BigQuery table to scan

        Returns the query string.
        """
        # None sentinels avoid the shared-mutable-default pitfall;
        # behavior is identical to passing empty lists.
        avg = [] if avg is None else avg
        sum = [] if sum is None else sum
        count = [] if count is None else count
        computed = [] if computed is None else computed

        # Table decorator restricts the scan to the last hour
        # (-3600000 ms .. now).
        tablePart = "%s.%s@-3600000--1" % ("vicci", tableName)

        fields = []
        fieldNames = []

        # Round event timestamps down to timeBucket-second boundaries.
        fields.append("SEC_TO_TIMESTAMP(INTEGER(TIMESTAMP_TO_SEC(time)/%s)*%s) as Time" % (str(timeBucket), str(timeBucket)))

        for fieldName in avg:
            fields.append("AVG(%s) as avg_%s" % (fieldName, fieldName.replace("%", "")))
            fieldNames.append("avg_%s" % fieldName.replace("%", ""))

        for fieldName in sum:
            fields.append("SUM(%s) as sum_%s" % (fieldName, fieldName.replace("%", "")))
            fieldNames.append("sum_%s" % fieldName.replace("%", ""))

        for fieldName in count:
            fields.append("COUNT(distinct %s) as count_%s" % (fieldName, fieldName.replace("%", "")))
            fieldNames.append("count_%s" % fieldName.replace("%", ""))

        for fieldName in computed:
            # Fix: the original split on "/" unconditionally and indexed
            # parts[1] *before* checking the separator, so "a*b" inputs
            # raised IndexError -- and its fallback split the `computed`
            # list instead of the field name. Decide the operator first.
            if "/" in fieldName:
                operator = "/"
                opTag = "div"
            else:
                operator = "*"
                opTag = "mult"
            parts = fieldName.split(operator)
            computedFieldName = "computed_" + parts[0].replace("%", "") + "_" + opTag + "_" + parts[1].replace("%", "")
            fields.append("SUM(%s)%sSUM(%s) as %s" % (parts[0], operator, parts[1], computedFieldName))
            fieldNames.append(computedFieldName)

        # Non-Time groupBy columns must also appear in the select list.
        for fieldName in groupBy:
            if fieldName not in ["Time"]:
                fields.append(fieldName)
                fieldNames.append(fieldName)

        fields = ", ".join(fields)

        where = []

        if slice:
            where.append("%%slice='%s'" % slice)
        if site:
            where.append("%%site='%s'" % site)
        if node:
            where.append("%%hostname='%s'" % node)
        if service:
            sliceNames = self.service_to_sliceNames(service)
            if sliceNames:
                where.append("(" + " OR ".join(["%%slice='%s'" % sliceName for sliceName in sliceNames]) + ")")

        if where:
            where = " WHERE " + " AND ".join(where)
        else:
            where = ""

        if groupBy:
            # The subquery additionally groups per hostname so computed
            # ratios are taken per host before being re-aggregated.
            groupBySub = " GROUP BY " + ",".join(groupBy + ["%hostname"])
            groupBy = " GROUP BY " + ",".join(groupBy)
        else:
            groupBySub = " GROUP BY %hostname"
            groupBy = ""

        if orderBy:
            orderBy = " ORDER BY " + ",".join(orderBy)
        else:
            orderBy = ""

        if computed:
            # Computed fields need a two-level query: aggregate per
            # hostname in a subquery, then combine the per-host values.
            subQuery = "SELECT %%hostname, %s FROM [%s]" % (fields, tablePart)
            if where:
                subQuery = subQuery + where
            subQuery = subQuery + groupBySub

            sumFields = []
            for fieldName in fieldNames:
                if fieldName.startswith("avg"):
                    sumFields.append("AVG(%s) as avg_%s" % (fieldName, fieldName))
                    sumFields.append("MAX(%s) as max_%s" % (fieldName, fieldName))
                elif fieldName.startswith("count") or fieldName.startswith("sum") or fieldName.startswith("computed"):
                    sumFields.append("SUM(%s) as sum_%s" % (fieldName, fieldName))
                else:
                    sumFields.append(fieldName)

            sumFields = ",".join(sumFields)

            query = "SELECT %s, %s FROM (%s)" % ("Time", sumFields, subQuery)
            if groupBy:
                query = query + groupBy
            if orderBy:
                query = query + orderBy
        else:
            query = "SELECT %s FROM [%s]" % (fields, tablePart)
            if where:
                query = query + " " + where
            if groupBy:
                query = query + groupBy
            if orderBy:
                query = query + orderBy

        return query

    def get_list_from_req(self, req, name, default=[]):
        """Read GET parameter `name` as a comma-separated list.

        "@" is accepted as an escape for "%" (so field names survive URL
        query strings). Returns `default` when the parameter is absent.
        """
        value = req.GET.get(name, None)
        if not value:
            return default
        value = value.replace("@", "%")
        return value.split(",")

    def format_result(self, format, result, query, dataSourceUrl=None):
        """Render query rows in the requested output format.

        Returns a (content_type, body) tuple, or None for an unknown
        format. `dataSourceUrl` now defaults to None for callers that
        have no charts URL (backward compatible).
        """
        if format == "json_dicts":
            result = {"query": query, "rows": result, "dataSourceUrl": dataSourceUrl}
            return ("application/javascript", json.dumps(result))

        elif format == "json_arrays":
            new_rows = []
            for row in result:
                new_rows.append([row[key] for key in sorted(row.keys())])
            # Fix: the original rebuilt this wrapper dict *inside* the
            # loop, clobbering the accumulator so any result with more
            # than one row crashed on the second iteration.
            new_result = {"query": query, "rows": new_rows}
            return ("application/javascript", json.dumps(new_result))

        elif format == "html_table":
            new_rows = []
            for row in result:
                new_row = []
                for key in sorted(row.keys()):
                    new_row.append("<TD>%s</TD>" % str(row[key]))
                new_rows.append("<TR>%s</TR>" % "".join(new_row))

            new_result = "<TABLE>%s</TABLE>" % "\n".join(new_rows)

            return ("text/html", new_result)

        elif format == "json_hpcdash":
            # One entry per site, keyed by site name, with the fields the
            # HPC dashboard map expects.
            new_rows = {}
            for row in result:
                new_row = {"lat": float(row.get("lat", 0)),
                           "long": float(row.get("long", 0)),
                           "health": 0,
                           "numNodes": int(row.get("numNodes", 0)),
                           "numHPCSlivers": int(row.get("sum_count_hostname", 0)),
                           "siteUrl": row.get("url", ""),
                           "hot": float(row.get("hotness", 0.0)),
                           "load": int(float(row.get("max_avg_cpu", 0)))}
                new_rows[row["site"]] = new_row
            return ("application/javascript", json.dumps(new_rows))

    def only_largest(self, rows, fieldName):
        """ Given a fieldName, only return the set of rows that had the
            maximum value of that fieldName.
        """
        if not rows:
            # Fix: max() on an empty sequence raises ValueError.
            return []
        maxVal = max([int(row[fieldName]) for row in rows])
        new_rows = [row for row in rows if int(row[fieldName]) == maxVal]
        return new_rows

    def merge_datamodel_sites(self, rows):
        """ For a query that included "site" in its groupby, merge in the
            opencloud site information (location, url, node count, and a
            derived "hotness"). Mutates `rows` in place.
        """
        for row in rows:
            sitename = row["site"]
            try:
                model_site = Site.objects.get(name=sitename)
            except Exception:
                # we didn't find it in the data model
                continue

            row["lat"] = float(model_site.location.latitude)
            row["long"] = float(model_site.location.longitude)
            row["url"] = model_site.site_url
            row["numNodes"] = model_site.nodes.count()

            if "max_avg_cpu" in row:
                # Map cpu load onto a 0..1 "hotness" scale between
                # BLUE_LOAD and RED_LOAD.
                cpu = float(row["max_avg_cpu"]) / 100.0
                row["hotness"] = max(0.0, ((cpu * RED_LOAD) - BLUE_LOAD) / (RED_LOAD - BLUE_LOAD))

    def process_request(self, req):
        """Handle an analytics HTTP request end to end.

        Reads filter/aggregation parameters from req.GET, composes and
        runs the query, and returns a (content_type, body) tuple in the
        requested format.
        """
        print(req.GET)

        tqx = req.GET.get("tqx", None)

        slice = req.GET.get("slice", None)
        site = req.GET.get("site", None)
        node = req.GET.get("node", None)
        service = req.GET.get("service", None)

        format = req.GET.get("format", "json_dicts")

        timeField = req.GET.get("timeBucket", "60")
        avg = self.get_list_from_req(req, "avg")
        sum = self.get_list_from_req(req, "sum")
        count = self.get_list_from_req(req, "count")
        computed = self.get_list_from_req(req, "computed")
        groupBy = self.get_list_from_req(req, "groupBy", ["Time"])
        orderBy = self.get_list_from_req(req, "orderBy", ["Time"])

        maxRows = req.GET.get("maxRows", None)
        onlyLargest = req.GET.get("onlyLargest", None)
        mergeDataModelSites = req.GET.get("mergeDataModelSites", None)

        q = self.compose_query(slice, site, node, service, timeField, avg, sum, count, computed, groupBy, orderBy)

        print(q)
        # URL a google-charts client can use to re-issue this query;
        # "%" must be escaped and the output format forced to "charts".
        dataSourceUrl = "http://" + req.META["SERVER_NAME"] + ":" + req.META["SERVER_PORT"] + req.META["PATH_INFO"] + "?" + req.META["QUERY_STRING"].replace("format=", "origFormat=").replace("%", "%25") + "&format=charts"

        if format == "dataSourceUrl":
            result = {"dataSourceUrl": dataSourceUrl}
            return ("application/javascript", result)

        elif format == "raw":
            result = self.run_query_raw(q)
            result["dataSourceUrl"] = dataSourceUrl

            result = json.dumps(result)

            return ("application/javascript", result)

        elif format == "charts":
            bq_result = self.run_query_raw(q)

            # Convert the BigQuery response into a google-visualization
            # DataTable. (cloudscrutiny code is probably better!)
            table = {}
            table["cols"] = self.schema_to_cols(bq_result["schema"])
            rows = []
            for row in bq_result["rows"]:
                rowcols = []
                for (colnum, col) in enumerate(row["f"]):
                    if colnum == 0:
                        # First column is the bucketed unix timestamp.
                        dt = datetime.datetime.fromtimestamp(float(col["v"]))
                        rowcols.append({"v": 'new Date("%s")' % dt.isoformat()})
                    else:
                        try:
                            rowcols.append({"v": float(col["v"])})
                        except (ValueError, TypeError):
                            rowcols.append({"v": col["v"]})
                rows.append({"c": rowcols})
            table["rows"] = rows

            # Fix: str.strip("reqId:") removes any of the characters
            # r,e,q,I,d,: from both ends -- it does not strip a prefix.
            # Parse the request id out of the tqx parameter explicitly.
            reqId = "0"
            if tqx:
                match = re.search(r"reqId:([^;]*)", tqx)
                if match:
                    reqId = match.group(1)

            result = {"status": "okColumnChart", "reqId": reqId, "table": table, "version": "0.6"}

            result = "google.visualization.Query.setResponse(" + json.dumps(result) + ");"

            # json.dumps turned the 'new Date(...)' expressions into quoted
            # strings; unquote them so the client evaluates them as JS.
            def unquote_it(x): return x.group()[1:-1].replace('\\"', '"')

            p = re.compile(r'"new Date\(\\"[^"]*\\"\)"')
            result = p.sub(unquote_it, result)

            return ("application/javascript", result)

        else:
            result = self.run_query(q)

            if onlyLargest:
                result = self.only_largest(result, onlyLargest)

            if mergeDataModelSites:
                self.merge_datamodel_sites(result)

            if maxRows:
                # Keep only the most recent maxRows rows.
                result = result[-int(maxRows):]

            return self.format_result(format, result, q, dataSourceUrl)
310
311
def DoPlanetStackAnalytics(request):
    """Web entry point: run an analytics request and return the response."""
    analytics = PlanetStackAnalytics()
    return analytics.process_request(request)
317
def main():
    """Exercise the query composer and formatters from the command line."""
    bq = PlanetStackAnalytics()

    # Simple per-slice aggregation.
    q = bq.compose_query(avg=["%cpu", "%bandwidth"], count=["%hostname"], slice="HyperCache")
    print(q)
    bq.dump_table(bq.run_query(q))

    # Computed ratio field (two-level query).
    q = bq.compose_query(computed=["%bytes_sent/%elapsed"])
    print("")
    print(q)
    bq.dump_table(bq.run_query(q))

    # Hour-sized time buckets.
    q = bq.compose_query(timeBucket=60*60, avg=["%cpu"], count=["%hostname"], computed=["%bytes_sent/%elapsed"])
    print("")
    print(q)
    bq.dump_table(bq.run_query(q))

    # Service filter + per-site grouping, merged with data-model sites.
    q = bq.compose_query(avg=["%cpu"], count=["%hostname"], computed=["%bytes_sent/%elapsed"], service="HPC Service", groupBy=["Time", "%site"])
    print("")
    print(q)
    result = bq.run_query(q)
    result = bq.only_largest(result, "Time")
    bq.merge_datamodel_sites(result)
    #bq.dump_table(result)
    # Fix: format_result takes four arguments; the original call omitted
    # dataSourceUrl and raised a TypeError.
    print(bq.format_result("json_hpcdash", result, q, None))

if __name__ == "__main__":
    main()
346
347
348
349
350