only do maxDeltaTime if there are rows, make count only count distinct items, some...
author Scott Baker <smbaker@gmail.com>
Fri, 25 Apr 2014 00:04:55 +0000 (17:04 -0700)
committer Scott Baker <smbaker@gmail.com>
Fri, 25 Apr 2014 00:04:55 +0000 (17:04 -0700)
planetstack/hpc_wizard/bigquery_analytics.py

index 994e837..dafb55c 100644 (file)
@@ -71,6 +71,11 @@ class BigQueryAnalytics:
                raise Exception('Error accessing register allocations: %d'%resp.status_code)
 
     def run_query_raw(self, query):
+        try:
+            file("/tmp/query_log","a").write("query %s\n" % query)
+        except:
+            pass
+
         p = re.compile('%[a-zA-z_]*')
 
         try:
@@ -79,6 +84,11 @@ class BigQueryAnalytics:
             self.reload_mapping()
             query = p.sub(self.remap, query)
 
+        try:
+            file("/tmp/query_log","a").write("remapped query %s\n" % query)
+        except:
+            pass
+
        storage = Storage('/opt/planetstack/hpc_wizard/bigquery_credentials.dat')
        credentials = storage.get()
 
@@ -155,13 +165,21 @@ class BigQueryAnalytics:
                 new_row["max_" + k] = max(new_row.get("max_" + k, 0), to_number(row.get(k,0)))
 
             for k in count:
-                new_row["count_" + k] = new_row.get("count_" + k, 0) + 1
+                v = row.get(k,None)
+                dl = new_row["distinct_" + k] = new_row.get("distinct_" + k, [])
+                if (v not in dl):
+                    dl.append(v)
+
+                #new_row["count_" + k] = new_row.get("count_" + k, 0) + 1
 
         for row in new_rows.values():
             for k in avg:
                 row["avg_" + k] = float(row["avg_" + k]) / row["avg_base_" + k]
                 del row["avg_base_" + k]
 
+            for k in count:
+                new_row["count_" + k] = len(new_row.get("distinct_" + k, []))
+
         return new_rows.values()
 
     def do_computed_fields(self, rows, computed=[]):
@@ -190,9 +208,10 @@ class BigQueryAnalytics:
         for (k,v) in filter.items():
             rows = self.filter_results(rows, k, v)
 
-        if maxDeltaTime is not None:
-            maxTime = max([float(row["time"]) for row in rows])
-            rows = [row for row in rows if float(row["time"])>=maxTime-maxDeltaTime]
+        if rows:
+            if maxDeltaTime is not None:
+                maxTime = max([float(row["time"]) for row in rows])
+                rows = [row for row in rows if float(row["time"])>=maxTime-maxDeltaTime]
 
         (computedFieldNames, rows) = self.do_computed_fields(rows, computed)
         sum = sum + computedFieldNames