R routines for plotting and printing some statistics
author    Stephen Soltesz <soltesz@cs.princeton.edu>
          Thu, 11 Feb 2010 20:14:07 +0000 (20:14 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
          Thu, 11 Feb 2010 20:14:07 +0000 (20:14 +0000)
12 files changed:
statistics/bm_reboot.r [new file with mode: 0644]
statistics/bm_reboot_api.r [new file with mode: 0644]
statistics/bm_reboot_unique.r [new file with mode: 0644]
statistics/functions.r [new file with mode: 0644]
statistics/node_history_may0809.r [new file with mode: 0644]
statistics/prep.r [new file with mode: 0644]
statistics/rpm_dist.r [new file with mode: 0644]
statistics/rt_data.r [new file with mode: 0644]
statistics/rt_monitor_data.r [new file with mode: 0644]
statistics/site_scores.r [new file with mode: 0644]
statistics/site_scores_bw.r [new file with mode: 0644]
statistics/site_scores_pcu.r [new file with mode: 0644]

diff --git a/statistics/bm_reboot.r b/statistics/bm_reboot.r
new file mode 100644 (file)
index 0000000..e0ff6df
--- /dev/null
@@ -0,0 +1,52 @@
+
+source("functions.r");
+
+# system("parse_rt_data.py > rt_data.csv");
+# ./bmevents.py events.1-18-10 BootUpdateNode > bm_reboot_2010-01-18.csv
+# ./bmevents.py events.10-08-09 BootUpdateNode > bm_reboot_2009-10-08.csv 
+# ./bmevents.py events.29.12.08.dump BootUpdateNode > bm_reboot_2008-12-29.csv
+# ./bmevents.py events.8-25-09.dump BootUpdateNode > bm_reboot_2009-08-25.csv
+# 
+t <- read.csv('bm_reboot.csv', sep=',', header=TRUE)
+
+t2<-t
+
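+# (the 'origin' argument is ignored when as.POSIXct() is given a character date string,
+#  so the "1960-01-01" origins used below do not change the computed year boundaries)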
+tstamp_78 <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))[1]
+tstamp_89 <-unclass(as.POSIXct("2009-01-01", origin="1960-01-01"))[1]
+
+t_7 <- t2[which( t2$start < tstamp_78 ),]
+t_8 <- t2[which( t2$start >= tstamp_78 & t2$start < tstamp_89 ),]
+t_9 <- t2[which( t2$start >= tstamp_89 ),]
+
+tstamp <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))
+t_67 <- t2[which( t2$start <  tstamp[1] ),]
+t_89 <- t2[which( t2$start >= tstamp[1] ),]
+
+
+#start_image("bm_reboot.png")
+
+par(mfrow=c(2,1))
+par(mai=c(.5,.4,.5,.4))
+year_hist(t_9, "2009", "2009/06/21", "2010/2/10", 500, 'day', "Daily Reboot Rates")
+rows <- year_hist_unique(t_9, "2009", "2009/06/21", "2010/2/10", 100, 'day', "Unique Daily Reboots")
+
+#end_image()
+
+start_image("reboot_distributions.png")
+par(mfrow=c(2,1))
+par(mai=c(.5,.5,.5,.5))
+
+m<-mean(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+s<-sd(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+
+qqnorm(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+qqline(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+
+h<-hist(rows$reboots[which(rows$reboots>0&rows$reboots<50)], breaks=20)
+x<- 0:100/100 * 2 * m
+y<- dnorm(x, mean=m, sd=s)
+lines(x,y*max(h$counts)/max(y))
+end_image()
+
+par(mfrow=c(1,1))
+par(mai=c(.7,.7,.7,.7))
diff --git a/statistics/bm_reboot_api.r b/statistics/bm_reboot_api.r
new file mode 100644 (file)
index 0000000..3bea8f6
--- /dev/null
@@ -0,0 +1,74 @@
+
+source("functions.r");
+
+# system("parse_rt_data.py > rt_data.csv");
+# ./bmevents.py events.1-18-10 BootUpdateNode > bm_reboot_2010-01-18.csv
+# ./bmevents.py events.10-08-09 BootUpdateNode > bm_reboot_2009-10-08.csv 
+# ./bmevents.py events.29.12.08.dump BootUpdateNode > bm_reboot_2008-12-29.csv
+# ./bmevents.py events.8-25-09.dump BootUpdateNode > bm_reboot_2009-08-25.csv
+# 
+t <- read.csv('bm_reboot_2008-12-29.csv', sep=',', header=TRUE)
+
+t2<-t
+
+tstamp_78 <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))[1]
+tstamp_89 <-unclass(as.POSIXct("2009-01-01", origin="1960-01-01"))[1]
+
+t_7 <- t2[which( t2$start < tstamp_78 ),]
+t_8 <- t2[which( t2$start >= tstamp_78 & t2$start < tstamp_89 ),]
+t_9 <- t2[which( t2$start >= tstamp_89 ),]
+
+tstamp <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))
+t_67 <- t2[which( t2$start <  tstamp[1] ),]
+t_89 <- t2[which( t2$start >= tstamp[1] ),]
+
+start_image("bm_reboot_api.png")
+
+par(mfrow=c(2,1))
+par(mai=c(.5,.4,.5,.4))
+year_hist(t_9, "2009", "2009/06/21", "2010/2/10", 500, 'day', "Daily Reboot Rates")
+rows_api <- year_hist_unique(t_9, "2009", "2009/06/21", "2010/2/10", 100, 'day', "Unique Daily Reboots")
+
+#year_hist(t_89, "2008-2009", "2008/01/21", "2010/2/10", 0, 'day', "Daily Reboot Rates")
+#rows <- year_hist_unique(t_89, "2008-2009", "2008/01/21", "2010/2/10", 0, 'day', "Unique Daily Reboots")
+
+end_image()
+
+
+## NOTE: compare api and log data:
+start_image("bm_reboot_compare.png", width=960)
+par(mfrow=c(1,1))
+par(mai=c(1.0,.7,.7,.7))
+x<- cbind(rows$reboots, rows_api$reboots)
+#barplot(t(x), beside=TRUE, ylim=c(0,150), main="Compare Daily Frequency of Raw-logs & API Events")
+barplot(rows$reboots-rows_api$reboots, ylim=c(-40,150), main="Difference between Raw-logs & API Events", xlab="Day", ylab="Difference of Frequency")
+end_image()
+
+# it appears that the raw logs consistently come out ahead of the API events.
+start_image("bm_reboot_diff_freq.png")
+d<-rows$reboots-rows_api$reboots
+hist(d[which( d > -10 & d < 20)], breaks=20, main="Frequency of Differences", xlab="Difference")
+end_image()
+
+# * why is this so?
+
+###
+
+start_image("reboot_distributions.png")
+par(mfrow=c(2,1))
+par(mai=c(.5,.5,.5,.5))
+
+m<-mean(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+s<-sd(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+
+qqnorm(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+qqline(rows$reboots[which(rows$reboots>0&rows$reboots<50)])
+
+h<-hist(rows$reboots[which(rows$reboots>0&rows$reboots<50)], breaks=20)
+x<- 0:100/100 * 2 * m
+y<- dnorm(x, mean=m, sd=s)
+lines(x,y*max(h$counts)/max(y))
+end_image()
+
+par(mfrow=c(1,1))
+par(mai=c(.7,.7,.7,.7))
diff --git a/statistics/bm_reboot_unique.r b/statistics/bm_reboot_unique.r
new file mode 100644 (file)
index 0000000..29f2bf6
--- /dev/null
@@ -0,0 +1,250 @@
+
+source("functions.r");
+
+# system("parse_rt_data.py > rt_data.csv");
+# ./bmevents.py events.1-18-10 BootUpdateNode > bm_reboot_2010-01-18.csv
+# ./bmevents.py events.10-08-09 BootUpdateNode > bm_reboot_2009-10-08.csv 
+# ./bmevents.py events.29.12.08.dump BootUpdateNode > bm_reboot_2008-12-29.csv
+# ./bmevents.py events.8-25-09.dump BootUpdateNode > bm_reboot_2009-08-25.csv
+# 
+bm <- read.csv('bm_reboot.csv', sep=',', header=TRUE)
+bm_api <- read.csv('bm_reboot_2008-12-29.csv', sep=',', header=TRUE)
+
+bm2<-bm
+
+tstamp_78 <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))[1]
+tstamp_89 <-unclass(as.POSIXct("2009-01-01", origin="1960-01-01"))[1]
+
+bm_7 <- bm2[which( bm2$start < tstamp_78 ),]
+bm_8 <- bm2[which( bm2$start >= tstamp_78 & bm2$start < tstamp_89 ),]
+bm_9 <- bm2[which( bm2$start >= tstamp_89 ),]
+
+tstamp <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))
+bm_67 <- bm2[which( bm2$start <  tstamp[1] ),]
+bm_89 <- bm2[which( bm2$start >= tstamp[1] ),]
+
+
+#start_image("bm_reboot.png")
+
+par(mfrow=c(2,1))
+par(mai=c(.5,.4,.5,.4))
+#year_hist(bm_9, "2009", "2009/06/21", "2010/2/10", 500, 'day', "Daily Reboot Rates")
+#rows <- year_hist_unique(bm_9, "2009", "2009/06/21", "2010/2/10", 100, 'day', "Unique Daily Reboots")
+#end_image()
+
+if ( TRUE )
+{
+    rows_blocks <- year_hist_unique_recent(bm_9, "2009", "2009/06/21", "2010/2/10", 100, c(1,3,7,14,30), 'day', "Unique Daily Reboots")
+
+    x<-NULL
+    blocks <- c(0,1,3,7,14,30)
+    for ( b in blocks ) { x<- c(x, paste("X", b, sep="")) }
+
+    par(mfrow=c(1,1))
+    par(mai=c(1,.7,.5,.4))
+    start_image("bm_reboot_color.png", width=900)
+
+    barplot(t(rows_blocks[x]), border=NA, col=c('purple', 'blue', 'green', 'red', 'pink', 'orange', 'yellow'), ylim=c(0,100), main="How Recently Nodes Were Rebooted", xlab="Days from June-2009 to Jan-2010", space=0, legend=c("Only today", "Also within 1 day", "Also within 3 days", "Also within 7 days", "Also within 14 days", "Also within 30 days"), ylab="Frequency")
+    end_image()
+
+    #barplot(rows_blocks$X0, border=NA, col=c('purple', 'blue', 'green', 'red', 'pink', 'orange', 'yellow'), ylim=c(0,100))
+
+    #par(mfrow=c(6,1))
+    #par(mai=c(.1,.7,.1,.1))
+    #barplot(rows_blocks$X0, border=NA, col=c('purple'), ylim=c(0,100))
+    #barplot(rows_blocks$X1, border=NA, col=c('blue'), ylim=c(0,100))
+    #barplot(rows_blocks$X3, border=NA, col=c('green'), ylim=c(0,100))
+    #barplot(rows_blocks$X7, border=NA, col=c('red'), ylim=c(0,100))
+    #barplot(rows_blocks$X14, border=NA, col=c('pink'), ylim=c(0,100))
+    #barplot(rows_blocks$X30, border=NA, col=c('orange'), ylim=c(0,100))
+
+    shapiro.test(rows_blocks$X0[ rows_blocks$X0 < 50 ])
+    shapiro.test(rows_blocks$X1[ rows_blocks$X1 < 50 ])
+    shapiro.test(rows_blocks$X3[ rows_blocks$X3 < 50 ])
+    shapiro.test(rows_blocks$X7[ rows_blocks$X7 < 50 ])
+    shapiro.test(rows_blocks$X14[ rows_blocks$X14 < 50 ])
+    shapiro.test(rows_blocks$X30[ rows_blocks$X30 < 50 ])
+}
+
+
+#image <- reboot_image(t_9, "2009", "2009/06/21", "2010/2/10", 0, 'day')
+#myImagePlot(image)
+
+start_image("st_bm_reboots.png", width=400, height=600)
+image <- reboot_image(bm_9, "2009", "2009/06/21", "2010/2/10", 0, 'day', title="BootManager Reboots for all Nodes")
+end_image()
+
+start_image("st_api_event_reboots.png", width=800, height=600)
+image2 <- reboot_image(bm_api, "2009", "2008/06/21", "2010/2/10", 0, 'day', title= "API Reboot Events for all Nodes")
+end_image()
+
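+# reboot_frequency: total reboots per node, i.e. the row sums of the 0/1 node-by-day image matrix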
+reboot_frequency <- function ( img )
+{
+    d <- dim(img)
+    # for each row
+    f <- NULL
+    for ( i in seq(1:d[1]) )
+    {
+        r <- img[i,]
+        f <- c(f, sum(r))
+    }
+    return (f);
+}
+
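+# reboot_events: total reboots per day, i.e. the column sums of the node-by-day image matrix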
+reboot_events <- function ( img )
+{
+    d <- dim(img)
+    # for each row
+    f <- NULL
+    for ( i in seq(1:d[2]) )
+    {
+        c <- img[,i]
+        f <- c(f, sum(c))
+    }
+    return (f);
+}
+
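+# time_to_reboot: for each node (row), collect the gaps, in days, between consecutive
+# reboot runs within the column range [first,last]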
+time_to_reboot <- function (img, first=0, last=0)
+{
+    d <- dim(img)
+    # for each row
+    f <- NULL
+    for ( i in seq(1:d[1]) )
+    {
+        if (last == 0 ) { last <- length(img[i,]) }
+        r <- img[i,first:last]
+        # find  first reboot
+        start_i <- 1
+        while ( start_i < length(r) && r[start_i] != 1 ) 
+        { 
+            start_i <- start_i + 1 
+        }
+        end_i <- start_i
+
+        while ( start_i < length(r) )
+        {
+            if ( r[start_i] == 1 && start_i != end_i)
+            {
+                f <- c(f, start_i-end_i)
+                while ( start_i < length(r) && r[start_i] == 1 ) { start_i <- start_i + 1 }
+                end_i <- start_i
+            }
+            start_i <- start_i + 1
+        }
+    }
+    return (f);
+}
+
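+# find_95: binary-search an ecdf over [low,high] for the value closest to the 95th
+# percentile; used below to mark the 95th-percentile vertical lines on the CDF plots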
+find_95 <- function (cdf, low=0, high=1000) 
+{
+    # find the lowest point past the 95th percentile.
+    while ( high - low > 1)
+    {
+        c_low <- cdf(low)
+        c_mid <- cdf(low+floor((high-low)/2))
+        c_high <- cdf(high)
+
+        c_min <- min(min(abs(0.95-c_low), abs(0.95-c_mid)), abs(0.95-c_high))
+
+        if ( c_mid > 0.95 ) {
+            high <- high - floor((high-low)/2)
+            print (sprintf("adjust high: %s\n", high));
+        } else if ( c_mid <= 0.95 ) {
+            low <- low + floor((high-low)/2)
+            print (sprintf("adjust low: %s\n", low));
+        }
+
+        #swap<-0
+        #if ( c_min == abs(0.95-c_mid) ) {
+        #    # is it in top half or bottom half?
+        #    print (sprintf("middle\n"));
+        #    if ( abs(0.95-c_low) < abs(0.95-c_high) ) {
+        #        low <- low + floor((high-low)/2)
+        #        print (sprintf("adjust low: %s\n", low));
+        #    } else { #if ( c_min == abs(0.95-c_high) ) {
+        #        high <- high - floor((high-low)/2)
+        #        print (sprintf("adjust high: %s\n", high));
+        #    }
+        #} else {
+        #    if ( c_min == abs(0.95-c_low) ) {
+        #        high <- high - floor((high-low)/2)
+        #        print (sprintf("adjust high: %s\n", high));
+        #    } else { #if ( c_min == abs(0.95-c_high) ) {
+        #        low <- low + floor((high-low)/2)
+        #        print (sprintf("adjust low: %s\n", low));
+        #    }
+        #}
+    }
+    return (low)
+}
+
+# day-index ranges used below: image columns 9-122 and 131-223; image2 columns 0-193 and 402-end
+ttr1 <- time_to_reboot(image,9,122)
+ttr2 <- time_to_reboot(image,131,223)
+
+ttr8 <- time_to_reboot(image2,0,193)
+ttr9 <- time_to_reboot(image2,402)
+
+x1 <- ecdf(c(ttr1, ttr2))
+x2 <- ecdf(c(ttr8,ttr9))
+start_image("reboot_ttr_cdf.png")
+plot(x1, col.vert='red', col.hor="red", col.points="red", pch='*', xlab="Days to Reboot", ylab="Percentile", verticals=TRUE, xlim=c(0,170), main="CDF of Days to Reboot for BM & API Events")
+plot(x2, col.vert='blue', col.hor="blue", col.points="blue", pch=20, verticals=TRUE, add=TRUE)
+legend(130, 0.15, legend=c("BM Uploads", "API Events"), col=c('red', 'blue'), pch=c(42, 20))
+abline(0.95,0)
+v1<-find_95(x1)
+v2<-find_95(x2)
+abline(v=v1, col="pink")
+abline(v=v2, col="light blue")
+axis(1, labels=c(v1,v2), at=c(v1,v2))
+
+abline(v=7, col="grey")
+abline(v=14, col="grey")
+abline(v=21, col="grey")
+abline(v=28, col="grey")
+abline(v=42, col="grey")
+abline(v=56, col="grey")
+end_image()
+
+e <- reboot_events(image)
+e2 <- reboot_events(image2)
+x1 <- ecdf(e)
+x2 <- ecdf(e2)
+
+start_image("reboot_days_cdf.png")
+plot(x1, col.vert='red', col.hor="red", col.points="red", pch='*', xlab="Reboots in a Single Day", ylab="Percentile", verticals=TRUE, xlim=c(0,100), main="CDF of Reboots per Day for BM & API Events")
+plot(x2, col.vert='blue', col.hor="blue", col.points="blue", pch=20, verticals=TRUE, add=TRUE)
+legend(75, 0.15, legend=c("BM Uploads", "API Events"), col=c('red', 'blue'), pch=c(42, 20))
+abline(0.95,0)
+v1<-find_95(x1)
+v2<-find_95(x2)
+abline(v=v1, col="pink")
+abline(v=v2, col="light blue")
+axis(1, labels=c(v1,v2), at=c(v1,v2))
+end_image()
+
+
+
+f <- reboot_frequency(image)
+f2 <- reboot_frequency(image2)
+x1 <- ecdf(f)
+x2 <- ecdf(f2)
+
+start_image("reboot_node_cdf.png")
+par(mfrow=c(1,1))
+par(mai=c(.9,.8,.5,.4))
+plot(x1, col.vert='red', col.hor="red", col.points="red", pch='*', xlab="Reboots per Node", ylab="Percentile", verticals=TRUE, xlim=c(0,100), main="CDF of Reboots per Node for BM & API Events")
+plot(x2, col.vert='blue', col.hor="blue", col.points="blue", pch=20, verticals=TRUE, add=TRUE)
+legend(75, 0.15, legend=c("BM Uploads", "API Events"), col=c('red', 'blue'), pch=c(42, 20))
+abline(0.95,0)
+v1<-find_95(x1)
+v2<-find_95(x2)
+abline(v=v1, col="pink")
+abline(v=v2, col="light blue")
+axis(1, labels=c(v1,v2), at=c(v1,v2))
+end_image()
+
+
+
+par(mfrow=c(1,1))
+par(mai=c(.7,.7,.7,.7))
diff --git a/statistics/functions.r b/statistics/functions.r
new file mode 100644 (file)
index 0000000..3411586
--- /dev/null
@@ -0,0 +1,403 @@
+slices <- function (x, components=FALSE) 
+{
+    m<-x$memsize;
+    d<-x$disksize/250;
+    c<-x$cpuspeed;
+    r<-x$numcores;
+    if ( components ) {
+        a<-c(m,d,c*r);
+    } else {
+        a<-(m+d+c*r);
+    }
+    return(a/2);
+}
+
+slices_2 <- function (x, components=FALSE) 
+{
+    # Define an ideal node, then scale each measurement relative to that ideal.
+    # A component that matches the ideal scores 1; otherwise it is more or less than 1.
+    # Does this scale (up or down) linearly, and why not?
+
+    # 4, 2.4x2, 1000; 4, 3.2x1, 320; 1, 2.4x1, 160
+    ideal_m <- 3.4;            # GB
+    ideal_c <- 2.4;            # GHz 
+    ideal_d <- 450;            # GB
+    ideal_r <- 2;
+
+    m<-x$memsize/ideal_m;
+    d<-x$disksize/ideal_d;
+    c<-x$cpuspeed/ideal_c;
+    r<-x$numcores/ideal_r;
+    # ideal is 1
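+    # e.g. a node that exactly matches the ideal gives m = d = c*r = 1, so the score is (1+1+1)/3*5 = 5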
+
+    if ( components ) {
+        a<-c(m,d,c*r);
+    } else {
+        a<-(m+d+c*r);
+    }
+
+    return (a/3*5);
+}
+
+slices_3 <- function (x, components=FALSE) 
+{
+    # Define an ideal node, then scale each measurement relative to that ideal.
+    # A component that matches the ideal scores 1; otherwise it is more or less than 1.
+    # Does this scale (up or down) linearly, and why not?
+
+    # 4, 2.4x2, 1000; 4, 3.2x1, 320; 1, 2.4x1, 160
+    ideal_m  <- 3.4;      # GB
+    ideal_c  <- 2.4;      # GHz
+    ideal_d  <- 450;      # GB
+    ideal_r  <- 2;
+    ideal_bw <- 100000;   # Kbps
+
+    m<-x$memsize/ideal_m;
+    d<-x$disksize/ideal_d;
+    c<-x$cpuspeed/ideal_c;
+    r<-x$numcores/ideal_r;
+    b<-log(x$bwlimit)/log(ideal_bw);
+    # ideal is 1
+
+    if ( components ) {
+        a<-c(m,d,c*r,b);
+    } else {
+        a<-(m+d+c*r+b);
+    }
+
+    return (a/4*5);
+}
+
+slices_4 <- function (x, components=FALSE) 
+{
+    # Define an ideal node, then scale each measurement relative to that ideal.
+    # A component that matches the ideal scores 1; otherwise it is more or less than 1.
+    # Does this scale (up or down) linearly, and why not?
+
+    # 4, 2.4x2, 1000; 4, 3.2x1, 320; 1, 2.4x1, 160
+    ideal_m   <- 3.4;      # GB
+    ideal_c   <- 2.4;      # GHz
+    ideal_d   <- 450;      # GB
+    ideal_r   <- 2;
+    ideal_bw  <- 100000;   # Kbps
+    ideal_pcu <- 1;
+
+    m<-x$memsize/ideal_m;
+    d<-x$disksize/ideal_d;
+    c<-x$cpuspeed/ideal_c;
+    r<-x$numcores/ideal_r;
+    b<-log(x$bwlimit)/log(ideal_bw);
+    p<-x$pcustatus/ideal_pcu;
+    # ideal is 1
+
+    if ( components ) {
+        a<-c(m,d,c*r,b,p);
+    } else {
+        a<-(m+d+c*r+b+p);
+    }
+
+    return (a/5*5);
+}
+
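+# index_of_bin: return the index of the histogram bin (h$breaks interval) that 'value' falls into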
+index_of_bin <- function (h, value)
+{
+    index <- 0;
+
+    for (i in sequence(length(h$breaks))) 
+    {
+        # first bin
+
+        if ( value < h$breaks[1] )
+        {
+            index <- 1;
+            break;
+        } 
+
+        # last bin
+
+        if ( i == length(h$breaks) )
+        {
+            # end of line
+            index <- i;
+            break;
+        } 
+
+        # all other bins
+
+        if ( value > h$breaks[i] && value <= h$breaks[i+1] )
+        {
+            index <- i+1;
+            break;
+        } 
+    }
+    if ( index == 0 ) {
+        warning("index == 0, no bin assigned for value: ", value);
+    }
+
+    return (index);
+}
+
+start_image <- function (name, width=480, height=480)
+{
+    png(name, width=width, height=height);
+}
+
+end_image <- function ()
+{
+    dev.off()
+}
+
+
+plot_rt_hist <- function (t, imagename=0)
+{
+    d2 <- (t$lastreply - t$start)
+    std_dev <- sd(log(d2))
+    m <- mean(log(d2))
+    print(sprintf("mean: %s, stddev: %s\n", m, std_dev));
+
+    if ( imagename != 0 ) { start_image(imagename) }
+
+    h<-hist(log(d2), 
+        xlab="Hours between ticket creation and final reply", 
+        main="Time to Final Reply for RT Tickets", axes=FALSE)
+    
+    a<-exp(h$breaks)/(60*60)    # convert units from log(secs) to hours
+    axis(1,labels=signif(a,2), at=h$breaks)
+    axis(2)
+
+    x<-seq(min(h$breaks),max(h$breaks),length=500)
+    y<-dnorm(x,mean=m, sd=std_dev)
+
+    # scale y to the size of h's 'counts' vector rather than the density function
+    lines(x,y*max(h$counts)/max(y))
+    if ( imagename != 0 ) { end_image() }
+}
+
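+# year_hist: histogram of ticket/reboot start times between 'from' and 'to', binned by 'type' (day/week)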
+year_hist <- function (t, year, from, to, max, type="week", title="Histogram for Tickets in")
+{
+    dates <-seq(as.Date(from), as.Date(to), type)
+    months <- format(dates, "%b-%d")
+    hbreaks<-unclass(as.POSIXct(dates))
+    h<-hist(t$start, breaks=hbreaks, plot=FALSE)
+    main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(h$counts))
+    print(main);
+    if ( max == 0 ) {
+        max = max(h$counts)
+    }
+    plot(h, ylim=c(0,max), main=main, axes=FALSE)
+    axis(1, labels=months, at=hbreaks)
+    axis(2)
+    abline(mean(h$counts), 0, col='grey')
+    #qqnorm(h$counts)
+    #qqline(h$counts)
+}
+
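+# year_hist_unique: like year_hist, but counts unique hostnames seen in the 24 hours after each break
+# instead of raw event counts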
+year_hist_unique <- function (t, year, from, to, max, type="week", title="Histogram for Tickets in")
+{
+    dates <-seq(as.Date(from), as.Date(to), type)
+    months <- format(dates, "%b-%d")
+    hbreaks<-unclass(as.POSIXct(dates))
+
+    rows <- NULL
+    for ( d in hbreaks )
+    {
+        d_end <- d+60*60*24
+        t_sub <- t[which(t$start > d & t$start <= d_end),]
+        rows <- rbind(rows, c('start'=d, 'reboots'=length(unique(t_sub$hostname))) )
+    }
+    rows <- data.frame(rows)
+
+    if ( max == 0 ) {
+        max = max(rows$reboots)
+    }
+    main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots))
+    print(main);
+    barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0)
+    #plot(h, ylim=c(0,max), main=main, axes=FALSE)
+    axis(1, labels=months, at=seq(1,length(hbreaks)))
+    axis(2)
+    abline(mean(rows$reboots), 0, col='grey')
+    #qqnorm(h$counts)
+    #qqline(h$counts)
+    return (rows);
+}
+
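+# year_hist_unique_recent: for each day, bucket that day's unique hosts by how recently they last
+# rebooted (within 1, 3, 7, 14, or 30 days; bucket '0' means only today)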
+year_hist_unique_recent <- function (t, year, from, to, max, blocks=c(1,3,7,14,30), type="week", title="Histogram for Tickets in")
+{
+    dates <-seq(as.Date(from), as.Date(to), type)
+    months <- format(dates, "%b-%d")
+    hbreaks<-unclass(as.POSIXct(dates))
+
+    rows <- NULL
+
+
+    for ( d in hbreaks )
+    {
+        # initialize row for this iteration
+        row <- NULL
+        row[as.character(0)] <- 0
+        for ( block in blocks ) {
+            row[as.character(block)] <- 0
+        }
+
+        # find the range : d plus a day
+        d_end <- d+60*60*24
+        # find unique hosts in this day range
+        t_sub <- t[which(t$start > d & t$start <= d_end),]
+        unique_hosts <- unique(t_sub$hostname)
+        if (length(unique_hosts) == 0 ) { 
+            rows <- rbind(rows, c('start'=d, row))
+            next 
+        }
+
+        #print(sprintf("unique_hosts: %s\n", unique_hosts));
+        print(sprintf("unique_hosts: %s\n", length(unique_hosts)));
+
+        for ( host in as.character(unique_hosts) ) 
+        {
+            found <- 0
+            for ( block in blocks )
+            {
+                #print(sprintf("date: %s, block: -%s, %s\n", d, block, host));
+                #print(sprintf("row: %s\n", row));
+                # find the range : 'block' days ago to 'd'
+                d_back <- d - 60*60*24 * block
+                t_back_sub <- t[which(t$start > d_back & t$start <= d),]
+                u <- unique(t_back_sub$hostname)
+                if ( length(u[u==host]) >= 1) 
+                {
+    #               add to block_count and go to next host.
+                    found <- 1
+                    i <- as.character(block)
+                    row[i] <- row[i] + 1
+                    break
+                }
+            }
+            if ( found == 0 )
+            {
+                # no range found
+                row['0'] <- row['0'] + 1
+            }
+        }
+        rows <- rbind(rows, c('start'=d, row))
+    }
+
+    rows <- data.frame(rows)
+
+    if ( max == 0 ) {
+        max = max(rows['0'])
+    }
+    #main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots))
+    #print(main);
+    #barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0)
+    ##plot(h, ylim=c(0,max), main=main, axes=FALSE)
+    #axis(1, labels=months, at=seq(1,length(hbreaks)))
+    #axis(2)
+    #abline(mean(rows$reboots), 0, col='grey')
+    #qqnorm(h$counts)
+    #qqline(h$counts)
+    return (rows);
+}
+
+source("myImagePlot.R")
+reboot_image <- function (t, year, from, to, max=0, type="week", title="")
+{
+    dates <-seq(as.Date(from), as.Date(to), type)
+    months <- format(dates, "%b-%d")
+    hbreaks<-unclass(as.POSIXct(dates))
+
+    rows <- NULL
+    image <- matrix(data=0, nrow=max(as.numeric(t$hostname)), ncol=length(hbreaks))
+    #image <- matrix(data=0, nrow=length(unique(t$hostname)), ncol=length(hbreaks))
+
+    #for ( d in hbreaks )
+    for ( i in seq(1, length(hbreaks)) )
+    {
+        # find the range : d plus a day
+        d <- hbreaks[i]
+        d_end <- d+60*60*24
+        # find unique hosts in this day range
+        t_sub <- t[which(t$start > d & t$start <= d_end),]
+        unique_hosts <- unique(t_sub$hostname)
+        if (length(unique_hosts) == 0 ) { next }
+
+        for ( host in unique_hosts ) 
+        {
+            image[host,i] <- 1
+        }
+    }
+
+    myImagePlot(image, xLabels=months, yLabels=c(""), title=title)
+
+            #found <- 0
+            #for ( block in blocks )
+            #{
+                #print(sprintf("date: %s, block: -%s, %s\n", d, block, host));
+                #print(sprintf("row: %s\n", row));
+                # find the range : 'block' days ago to 'd'
+            #    d_back <- d - 60*60*24 * block
+            #    t_back_sub <- t[which(t$start > d_back & t$start <= d),]
+            #    u <- unique(t_back_sub$hostname)
+            #    if ( length(u[u==host]) >= 1) 
+            #    {
+    #       #        add to block_count and go to next host.
+            #        found <- 1
+            #        i <- as.character(block)
+            #        row[i] <- row[i] + 1
+            #        break
+            #    }
+            #}
+            #if ( found == 0 )
+            #{
+            #    # no range found
+            #    row['0'] <- row['0'] + 1
+            #}
+        #}
+        #rows <- rbind(rows, c('start'=d, row))
+
+    #rows <- data.frame(rows)
+
+    #if ( max == 0 ) {
+    #    max = max(rows['0'])
+    #}
+    #main<-sprintf(paste(title, "%s: MEAN %s\n"), year, mean(rows$reboots))
+    #print(main);
+    #barplot(rows$reboots, ylim=c(0,max), main=main, axes=FALSE, space=0)
+    ##plot(h, ylim=c(0,max), main=main, axes=FALSE)
+    #axis(1, labels=months, at=seq(1,length(hbreaks)))
+    #axis(2)
+    #abline(mean(rows$reboots), 0, col='grey')
+    #qqnorm(h$counts)
+    #qqline(h$counts)
+    return (image);
+}
+
+add_year <- function (t)
+{
+    t$year <- c(0)  # assign new column with zero value initially
+    for ( i in 1:length(t$start) )
+    {
+        d <- as.POSIXlt(t$start[i], origin="1970-01-01")
+        year <- d$year + 1900 # as.numeric(format(d, "%Y"))
+        t$year[i] <- year
+    }
+    return (t);
+}
+
+add_timestamp <- function (t)
+{
+    t$start <- c(0)  # assign new column with zero value initially
+    for ( i in 1:length(t$date) )
+    {
+        tstamp <-unclass(as.POSIXct(t$date[i], origin="1970-01-01"))[1]
+        t$start[i] <- tstamp
+    }
+    return (t);
+}
+
+abline_at_date <- function (date, col='black', lty=1, format="%Y-%m-%d")
+{
+    ts <-unclass(as.POSIXct(date, format=format, origin="1970-01-01"))[1]
+    abline(v=ts, col=col, lty=lty)
+    return (ts);
+}
diff --git a/statistics/node_history_may0809.r b/statistics/node_history_may0809.r
new file mode 100644 (file)
index 0000000..75f3c52
--- /dev/null
@@ -0,0 +1,58 @@
+
+source("functions.r");
+
+# data collected from M2 pickle files
+dnc <- read.csv('daily-available-node-count.csv', sep=',', header=TRUE)
+
+dnc2<-add_timestamp(dnc)
+
+tstamp_08 <-unclass(as.POSIXct("2008-05-07", origin="1970-01-01"))[1]
+dnc2 <- dnc2[which( dnc2$start >  tstamp_08 ),]
+
+
+dates <-seq(as.Date('2008-05-07'), as.Date('2009-05-07'), 'week')
+months <- format(dates, "%b")
+hbreaks<-unclass(as.POSIXct(dates))
+
+x_start<-unclass(as.POSIXct("2008-05-07", origin="1970-01-01"))[1]
+x_end  <-unclass(as.POSIXct("2009-06-1", origin="1970-01-01"))[1]
+
+start_image("daily-node-count.png")
+plot(dnc2$start[which(!is.na(dnc2$available))], dnc2$registered[which(!is.na(dnc2$available))], 
+    type='l', col='blue', ylim=c(0,900), xlim=c(x_start, x_end),
+    xlab="Date", ylab="Node Count", axes=F)
+lines(dnc2$start[which(!is.na(dnc2$available))], dnc2$available[which(!is.na(dnc2$available))], type='l', col='red', ylim=c(0,900))
+axis(2)
+axis(1, labels=months, at=hbreaks)
+
+
+tstamp_0610 <-abline_at_date("2008-06-10", col='grey20', lty=2)
+# dates taken from reboot_image() output for API events.
+tstamp_0815 <-abline_at_date("2008-08-15", col='grey20', lty=2)
+tstamp_0905 <-abline_at_date("2008-09-05", col='grey70')
+tstamp_0924 <-abline_at_date("2008-09-24", col='grey20', lty=2)
+tstamp_1015 <-abline_at_date("2008-10-15", col='grey20', lty=2)
+tstamp_1105 <-abline_at_date("2008-11-05", col='white', lty=2)
+tstamp_1214 <-abline_at_date("2008-12-14", col='grey70')
+tstamp_0223 <-abline_at_date("2009-02-23", col='grey70')
+tstamp_0313 <-abline_at_date("2009-03-13", col='grey70')
+
+
+text(x=c(tstamp_0610+(tstamp_0815-tstamp_0610)/2,
+         tstamp_0815+(tstamp_0905-tstamp_0815)/2,
+         tstamp_0924+(tstamp_1015-tstamp_0924)/2, 
+         tstamp_1015+(tstamp_1105-tstamp_1015)/2, 
+         tstamp_1214+(tstamp_0223-tstamp_1214)/2, 
+         tstamp_0223+(tstamp_0313-tstamp_0223)/2), 
+     y=c(0),
+     labels=c("Kernel bug", 'fix1', 'fix2', 'fix3', 'Notice bug', 'fix4')) #, 'fix 2', 'fix 3', 'fix 4'))
+
+legend(unclass(as.POSIXct("2009-03-13", origin="1970-01-01"))[1], 200,
+        cex=0.7,
+        legend=c("Registered", "Available", 'Kernel Update', 'MyOps Event'),
+        pch=c('-', '-', '-', '-'),
+        col=c('blue', 'red', 'grey20', 'grey70'),
+        lty=c(1, 1, 2, 1), merge=T)
+
+end_image()
+
diff --git a/statistics/prep.r b/statistics/prep.r
new file mode 100644 (file)
index 0000000..bf7333d
--- /dev/null
@@ -0,0 +1,161 @@
+
+source("functions.r");
+
+ikern <- read.csv("/Users/soltesz/Downloads/out.csv", TRUE, sep=",")
+f<-factor(ikern$kernel_version, sort(unique(ikern$kernel_version)), sequence(length(unique(ikern$kernel_version))))
+
+u<-ikern$uptime/(60*60*24)
+
+current_time <- as.numeric(format(Sys.time(), "%s"))
+i<-(current_time-ikern$install_date)/(60*60*24)
+
+plot(f,u)
+
+
+sites <- read.csv("/Users/soltesz/Downloads/sites.csv", TRUE, sep=",")
+f<-factor(sites$status, sort(unique(sites$status)), sequence(length(unique(sites$status))))
+
+s<-sites$sliver_count
+
+res <- read.csv("/Users/soltesz/Downloads/out_resources.csv", TRUE, sep=",")
+library(lattice)
+cloud(memsize ~ disksize * cpuspeed|numcores, data=res)
+
+x<-c(res[2],res[4],res[5])
+pairs(x)
+
+
+
+mdrc <- read.csv("/Users/soltesz/Downloads/out_resources.csv", TRUE, sep=",")
+
+stripchart(round(slices(mdrc)), method="jitter")
+hist(round(slices(mdrc)),breaks=30)
+
+hist(round(slices(mdrc)),breaks=30,xlim=c(0,32))
+stripchart(round(slices(mdrc)), method="jitter", add=TRUE, jitter=30, at=50)
+
+
+# bottom, left, top, right
+par(mai=c(0,1,0.5,0.2))
+hist(round(slices(mdrc)),breaks=30,xlim=c(0,32))
+par(mai=c(1.0,1,0.5,0.2))
+stripchart(round(slices(mdrc))-0.5, method="jitter", jitter=20, xlim=c(0,32), ylim=c(-25,25),  ylab="Raw Samples", xlab="Slice count as a function of Mem, CPU, Disk")
+
+
+png("/Users/soltesz/Downloads/slices.png")
+par(mfrow=c(2,1))
+par(mai=c(0,1,0.5,0.2))
+hist(round(slices(mdrc)),breaks=30,xlim=c(0,32), main="Distribution of Slice Count as Function of Mem, CPU, Disk")
+par(mai=c(1.0,1,0.5,0.2))
+stripchart(round(slices(mdrc))-0.5, method="jitter", jitter=20, xlim=c(0,32), ylim=c(-25,25),  ylab="Raw Samples", xlab="Slice count as a function of Mem, CPU, Disk for live Planetlab Machines")
+dev.off()
+
+
+#-----------------------
+
+f<-slices
+f<-slices_2
+
+s2<- f(mdrc, FALSE);
+mdrc$score <- s2;
+df <- data.frame(mdrc);
+b<-30;
+
+# ----------------------
+### LOGINBASE
+unique_loginbase_length <- length(unique(mdrc$loginbase));
+unique_lb <- list(loginbase=array(0,c(unique_loginbase_length)), 
+                                 score=array(0,c(unique_loginbase_length)),
+                                 memsize=array(0,c(unique_loginbase_length)),
+                                 disksize=array(0,c(unique_loginbase_length)),
+                                 cpuspeed=array(0,c(unique_loginbase_length))
+                                 )
+
+for ( i in 1:length(mdrc$loginbase) )
+{
+    r <- mdrc[i,];
+       v <- f(r, TRUE);
+       unique_lb$loginbase[r$loginbase] <- r$loginbase;
+       unique_lb$score[r$loginbase]    <- unique_lb$score[r$loginbase]  + r$score;
+}
+
+for ( i in 1:length(mdrc$loginbase) )
+{
+    r <- mdrc[i,];
+       v <- f(r, TRUE);
+       rscore <- unique_lb$score[r$loginbase]
+       unique_lb$memsize[r$loginbase]  <- unique_lb$memsize[r$loginbase]  + v[1];
+       unique_lb$disksize[r$loginbase] <- unique_lb$disksize[r$loginbase]  + v[2];
+       unique_lb$cpuspeed[r$loginbase] <- unique_lb$cpuspeed[r$loginbase]  + v[3];
+}
+
+df<- data.frame(unique_lb)
+
+h<- hist(df$score, breaks=b);
+bins<-max(length(h$breaks),max(h$breaks));
+c<- array(0,c(bins));
+d<- array(0,c(bins));
+m<- array(0,c(bins));
+# for each score value, find which range it falls into,
+# then in three columns for cpu, mem, disk, record the fraction of each.
+# then plot each sequence in a stacked graph, perhaps beside h$counts
+for ( i in 1:length(df$cpuspeed) )
+{
+    r <- df[i,];
+    s <- index_of_bin(h, r$score); # find bin position...
+    # take fraction that each component contributes to the total, and add to sum
+
+    m[s] <- m[s] + unique_lb$memsize[r$loginbase];
+    d[s] <- d[s] + unique_lb$disksize[r$loginbase];
+    c[s] <- c[s] + unique_lb$cpuspeed[r$loginbase];
+}
+
+# ----------------------
+### HOSTS
+# ---  get plot of contributing parts
+h<- hist(df$score, breaks=b);
+bins<-max(length(h$breaks),max(h$breaks));
+c<- array(0,c(bins));
+d<- array(0,c(bins));
+m<- array(0,c(bins));
+# for each score value, find which range it falls into,
+# then in three columns for cpu, mem, disk, record the fraction of each.
+# then plot each sequence in a stacked graph, perhaps beside h$counts
+for ( i in 1:length(df$cpuspeed) )
+{
+    r <- df[i,1:6];
+    s <- index_of_bin(h, r$score); # find bin position...
+    # take fraction that each component contributes to the total, and add to sum
+    v <- f(r, TRUE);
+    m[s] <- m[s] + v[1]/r$score;
+    d[s] <- d[s] + v[2]/r$score;
+    c[s] <- c[s] + v[3]/r$score;
+}
+
+
+#a <- array(c(c,d,m), dim=c(bins, 3));
+a <- array(c(c), dim=c(bins, 3));
+
+#png("/Users/soltesz/Downloads/slice_policy_1.png")
+par(mfrow=c(2,1))
+par(mai=c(0.5,1,0.5,0.2))
+barplot(c(0,h$counts), 
+    xlab="slice count", 
+    main="Distribution of Per-node 'Scores' Calculated from Mem/Disk/CPU", 
+    ylab="Total Frequency", 
+    ylim=c(0,160))
+par(mai=c(1.0,1,0,0.2));
+barplot(t(a), 
+    legend=c("CPUspeed (GHz)", "DISKsize (GB)", "MEMsize (GB)"), 
+    col=c("pink", "lightblue", "lightgreen"), 
+    ylim=c(0,160),
+    ylab="Total with Break-down",
+    xlab="Per-node Score",
+    names.arg=h$breaks,
+);
+#dev.off()
+
+
+
+#a <- list(cpuspeed=c, memsize=m, disksize=d);
+# barplot(t(a), legend=c("cpuspeed", "disksize", "memsize"), col = c("pink", "lightblue", "lightgreen"), ylab="Total Contribution by CPU, Disk, Mem ")
diff --git a/statistics/rpm_dist.r b/statistics/rpm_dist.r
new file mode 100644 (file)
index 0000000..497971a
--- /dev/null
@@ -0,0 +1,125 @@
+#####
+
+#system("URL='https://monitor.planet-lab.org:443/monitor/query?object=nodes&nodehistory_hostname=&hostname=on&observed_status=on&rpms=on&rpmvalue=planetlab&tg_format=plain'; curl -s --insecure $URL | grep -v DOWN | grep -v DEBUG | /usr/share/monitor/statistics/hn2rpms.py > out_rpm.csv");
+#system("grep MD5SUMS /usr/share/monitor/monitor.log | grep -v measurement-lab | awk 'BEGIN { printf \"hostname,yumsum\\n\" } {if ( $3 != \"\") { printf \"%s,%s\\n\", $2,$3 } }' > yumsum.csv")
+
+r <- read.csv("out_rpm.csv")
+ys<- read.csv('yumsum.csv')
+m<-merge(r,ys, by="hostname")
+
+s<-table(factor(r$NodeManager), factor(r$kernel), factor(r$iptables));
+plot(s);
+
+ideal<-c(NodeManager='NodeManager-1.8-12.planetlab.1',
+        NodeUpdate='NodeUpdate-0.5-4.planetlab',
+        codemux='codemux-0.1-13.planetlab',
+        fprobe.ulog='fprobe-ulog-1.1.3-0.planetlab',
+        ipod='ipod-2.2-1.planetlab',
+        iproute='iproute-2.6.16-2.planetlab',
+        iptables='iptables-1.3.8-9.planetlab',
+        kernel='kernel-2.6.22.19-vs2.3.0.34.39.planetlab',
+        madwifi='madwifi-0.9.4-2.6.22.19.3.planetlab',
+        monitor.client='monitor-client-3.0-17.planetlab',
+        monitor.runlevelagent='monitor-runlevelagent-3.0-17.planetlab',
+        pl_mom='pl_mom-2.3-1.planetlab',
+        pl_sshd='pl_sshd-1.0-11.planetlab',
+        pyplnet='pyplnet-4.3-3.planetlab',
+        util.vserver.pl='util-vserver-pl-0.3-17.planetlab',
+        vserver.planetlab.f8.i386='vserver-planetlab-f8-i386-4.2-12.2009.06.23',
+        vserver.systemslices.planetlab.f8.i386='vserver-systemslices-planetlab-f8-i386-4.2-12.2009.06.23',
+        vsys='vsys-0.9-3.planetlab',
+        vsys.scripts='vsys-scripts-0.95-11.planetlab');
+
+r_summary <- lapply(r[,4:23], summary)
+for (i in 1:length(r_summary))
+{
+    n<-sort(unlist(r_summary[i]), decreasing=TRUE)
+    print(names(n[1]))    # most common version of each package
+}
+
+as.numeric(factor(ideal[1], levels(r$NodeManager)))
+
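+# cv: encode each host's package versions as a single "a-b-c-..." string of factor-level indices,
+# so identical package combinations can be grouped and counted below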
+cv <- function ( row , rows=566, start_col=4, end_col=23, ref=NULL)
+{
+    ret<-NULL;
+    for ( i in 1:rows )
+    {
+        r_l <-NULL
+        for ( name in names(row) )
+        {
+            # NOTE: this doesn't work unless the levels in row are a subset of ref's levels.
+            x<-as.numeric(factor(row[i,name], levels(factor(unlist(row[name])))));
+            r_l <- c(r_l, x);
+        }
+        #r<-as.numeric(row[i,start_col:end_col]);
+        str<- paste(as.character(r_l), collapse="-", sep="-");
+        ret<- rbind(ret, str);
+    }
+    return (ret);
+}
+
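+# grow: return the rows of d whose 'column' equals 'val'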
+grow <- function (d, column, val)
+{
+    r <- which(d[column] == val);
+    return (d[r,]);
+}
+
+cv(m, length(m$hostname));
+i<-data.frame(t(ideal));
+cv(i, 1, 1, length(ideal));
+
+       # ---
+
+x<-cv(r, length(r$hostname))
+x2<-factor(x)
+# plot the frequency of each RPM package combination
+barplot(sort(table(x2), decreasing=TRUE), 
+               ylim=c(0, max(table(x2))),
+               xlab="Unique Package Combinations",
+               ylab="Frequency",
+               axisnames=FALSE,
+               main=paste("Distribution of Packages for", length(r$hostname),"nodes"));
+
+png("/Users/soltesz/Downloads/rpm_plpackages_distribution_1.png",
+       width=640,
+       height=300,
+       unit="px")
+# 1x1 grid, with 1" margins on the bottom/left, 0.5" on the top/right
+par(mfrow=c(1,1));
+par(mai=c(1,1,0.5,0.5));
+barplot(sort(table(x2), decreasing=TRUE), 
+               ylim=c(0, max(table(x2))),
+               xlab="Unique Package Combinations",
+               ylab="Frequency",
+               axisnames=FALSE,
+               main=paste("Distribution of Packages for", length(r$hostname),"nodes"));
+dev.off()
+
+
+
+#convert_rpm <- function ( row )
+#{
+#      c <- as.character(row$rpms)
+#      rpm_list <- unlist(strsplit(c, " "))
+#      rpm_sort <- paste(sort(rpm_list), collapse="::");
+#      return (rpm_sort);
+#}
+
+#s<-convert_rpm(r)
+
+#for ( row in r[,] )
+#{
+#      c <- as.character(row$rpms)
+#      rpm_list <- unlist(strsplit(c, " "))
+#      row$rpm_sort <- paste(sort(rpm_list), collapse="::");
+#
+#      #for ( rpm in rpm_list ) 
+#      #{
+#      #       fields <- unlist(strsplit(rpm, "-"));
+#      #       s <- sort(fields);
+#      #}
+#}
+#
+#s<-sort(rpm_list);
+
+
diff --git a/statistics/rt_data.r b/statistics/rt_data.r
new file mode 100644 (file)
index 0000000..9ba227b
--- /dev/null
@@ -0,0 +1,415 @@
+
+
+source("functions.r");
+
+# system("parse_rt_data.py 3 > rt_data.csv");
+#t <- read.csv('rt_data.csv', sep=',', header=TRUE)
+t <- read.csv('rt_data_2004-2010.csv', sep=',', header=TRUE)
+
+par(mfrow=c(2,1))
+
+h<-hist(log(log(t$replies)), breaks=50)
+lines(h$breaks[which(h$counts!=0)], h$counts[which(h$counts!=0)])
+h<-hist(log(log(log(t$replies))), breaks=50)
+lines(h$breaks[which(h$counts!=0)], h$counts[which(h$counts!=0)])
+
+
+par(mfrow=c(1,1))
+
+t2 <- t[which(t$complete == 1),]
+d <- (t2$lastreply - t2$start)/(60*60)
+
+#start_image("rt_hist_ttc_1000.png")
+#hist(d[which(d<1000)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#start_image("rt_hist_ttc_200.png")
+#hist(d[which(d<200)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#start_image("rt_hist_ttc_50.png")
+#hist(d[which(d<50)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#start_image("rt_hist_ttc_10.png")
+#hist(d[which(d<10)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#d2 <- (t2$lastreply - t2$start)
+#h<-hist(log(d2), plot=F, breaks=50)
+#lines(h$breaks[which(h$counts!=0)], h$counts[which(h$counts!=0)])
+
+
+# this doesn't work as I would like.  I think the bins aren't as I expect
+#h <- hist(d, plot=F, breaks=c(seq(0,max(d)+1, .1)))
+#plot(h$counts, log="x", pch=20, col="blue",
+#      main="Log-normal distribution",
+#      xlab="Value", ylab="Frequency")
+
+#plot(log(d2))
+#plot(ecdf(d2))
+
+tstamp_45 <-unclass(as.POSIXct("2005-01-01", origin="1960-01-01"))[1]
+tstamp_56 <-unclass(as.POSIXct("2006-01-01", origin="1960-01-01"))[1]
+tstamp_67 <-unclass(as.POSIXct("2007-01-01", origin="1960-01-01"))[1]
+tstamp_78 <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))[1]
+tstamp_89 <-unclass(as.POSIXct("2009-01-01", origin="1960-01-01"))[1]
+tstamp_90 <-unclass(as.POSIXct("2010-01-01", origin="1960-01-01"))[1]
+
+
+t_4 <- t2[which( t2$start <  tstamp_45 ),]
+t_5 <- t2[which( t2$start >= tstamp_45 & t2$start < tstamp_56 ),]
+t_6 <- t2[which( t2$start >= tstamp_56 & t2$start < tstamp_67 ),]
+t_7 <- t2[which( t2$start >= tstamp_67 & t2$start < tstamp_78 ),]
+t_8 <- t2[which( t2$start >= tstamp_78 & t2$start < tstamp_89 ),]
+t_9 <- t2[which( t2$start >= tstamp_89 & t2$start < tstamp_90 ),]
+t_10 <- t2[which( t2$start >= tstamp_90 ),]
+
+par(mfrow=c(4,1))
+plot_rt_hist(t_4)
+plot_rt_hist(t_5)
+plot_rt_hist(t_6)
+plot_rt_hist(t_7)
+plot_rt_hist(t_8)
+plot_rt_hist(t_9)
+par(mfrow=c(1,1))
+
+start_image("rt_support_seasonal.png")
+par(mfrow=c(6,1))
+par(mai=c(.3,.3,.3,.3))
+
+# start dates on Sunday to align all weeks with weekend boundaries.
+year_hist(t_4, "2004", "2003/12/28", "2005/1/7", 85)
+year_hist(t_5, "2005", "2005/1/2", "2006/1/7", 85)
+year_hist(t_6, "2006", "2006/1/1", "2007/1/7", 85)
+year_hist(t_7, "2007", "2006/12/31", "2008/1/7", 85)
+year_hist(t_8, "2008", "2007/12/30", "2009/1/7", 85)
+year_hist(t_9, "2009", "2008/12/28", "2010/1/30", 85)
+end_image()
+
+par(mai=c(0.7,0.7,0.7,0.7))
+par(mfrow=c(1,1))
+
+
+tstamp <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))
+t_67 <- t2[which( t2$start <  tstamp[1] ),]
+t_89 <- t2[which( t2$start >= tstamp[1] ),]
+
+
+# install.packages('sn')
+require(sn)
+par(mfrow=c(6,1))
+par(mai=c(0.3,0.3,0.3,0.3))
+
+#start_image("rt_hist_ttc_1000.png")
+time_hist <- function (t, lessthan, year, log=T, breaks=30, xlim=c(-4,10), ylim=c(0,150))
+{
+    d <- (t$lastreply - t$start)/(60*60)
+    main = sprintf("Histogram of d<%s for %s", lessthan, year);
+    if ( log )
+    {
+        d <- log(d[which(d<lessthan)])
+        avg <- round(mean(d), 2)
+        main = sprintf("Histogram of d<%s for %s : %s", lessthan, year, avg);
+        #h<-sn.mle(d, xlab="Hours to Complete Ticket", breaks=30, main=main, xlim=xlim, ylim=ylim)
+        h<-sn.mle(y=d)
+    } else {
+        h<-hist(d[which(d<lessthan)], xlab="Hours to Complete Ticket", breaks=30, main=main, xlim=xlim, ylim=ylim)
+    }
+    return (h);
+}
+tstamp <-unclass(as.POSIXct("2007-05-01", origin="1960-01-01"))
+t_7a <- t_7[which(t_7$start < tstamp),]
+t_7b <- t_7[which(t_7$start >= tstamp),]
+
+#end_image()
+h4<-time_hist(t_4, 10000, "2004")
+h5<-time_hist(t_5, 10000, "2005")
+h6<-time_hist(t_6, 10000, "2006")
+#h7<-time_hist(t_7, 10000, "2007")
+h7a<-time_hist(t_7a, 10000, "2007")
+h7b<-time_hist(t_7b, 10000, "2007")
+h8<-time_hist(t_8, 10000, "2008")
+h9<-time_hist(t_9, 10000, "2009")
+
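+# NOTE: m_7, m_8, m_9, m_89, and m2 used below are the monitor-queue tables built in
+# rt_monitor_data.r; run that script first so these objects exist in the session.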
+tstamp <-unclass(as.POSIXct("2009-09-01", origin="1960-01-01"))
+m_9a <- m_9[which(m_9$start < tstamp),]
+m_9b <- m_9[which(m_9$start >= tstamp),]
+
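+# split_by_time: partition the rows of t into a 'before'/'after' list around the given date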
+split_by_time <- function (t, datestr)
+{
+    tstamp <-unclass(as.POSIXct(datestr, origin="1960-01-01"))
+    a <- t[which(t$start < tstamp),]
+    b <- t[which(t$start >= tstamp),]
+    v<- list('before'=a, 'after'=b)
+    return (v);
+}
+
+mh7 <- time_hist(m_7, 10000, '2007')
+
+sm_8 <- split_by_time(m_8, "2008-07-01")
+#mh8a <- time_hist(rbind(m_7, m_8$before, m_8$after), 10000, '2008')
+#mh8a <- time_hist(rbind(m_7[which(log((m_7$lastreply-m_7$start)/(60*60))>2),]), 10000, '2008')
+# m_7 is junk data
+
+mh_8 <- time_hist(sm_8$before, 10000, '2008')
+
+sm_9 <- split_by_time(m_9, "2009-09-01")
+
+mh_89 <- time_hist(rbind(sm_8$after, sm_9$before), 10000, '2009')
+mh_9 <- time_hist(sm_9$after, 10000, '2009')
+
+
+x<-seq(-8,10,0.01)
+#x<- exp(x)/24
+
+my7<-dsn(x, dp=cp.to.dp(mh7$cp))
+my8<-dsn(x, dp=cp.to.dp(mh_8$cp))
+my89<-dsn(x, dp=cp.to.dp(mh_89$cp))
+my9<-dsn(x, dp=cp.to.dp(mh_9$cp))
+
+y4<-dsn(x, dp=cp.to.dp(h4$cp))
+y5<-dsn(x, dp=cp.to.dp(h5$cp))
+y6<-dsn(x, dp=cp.to.dp(h6$cp))
+y7a<-dsn(x, dp=cp.to.dp(h7a$cp))
+y7b<-dsn(x, dp=cp.to.dp(h7b$cp))
+y8<-dsn(x, dp=cp.to.dp(h8$cp))
+y9<-dsn(x, dp=cp.to.dp(h9$cp))
+
+start_image("rt_time_to_resolve.png")
+par(mfrow=c(1,1))
+par(mai=c(1.0,0.7,0.7,0.7))
+# monitor
+plot(x, my9, col='blue', type='l', axes=F, xlab="Days to Resolve", ylab="Density")
+axis(1, labels=c(0.0001, 0.01, 0.1, 1, 5, 20, 100), at=c(0.0001, 0.01, 0.1, 1, 5, 20, 100))
+axis(2)
+lines(x, my8, col='dodgerblue')
+lines(x, my7, col='turquoise')
+abline(v=x[which(my8==max(my8))])
+abline(v=x[which(my9==max(my9))])
+
+# heavy
+lines(x, y7a, col='green3')
+lines(x, y4, col='green4')
+lines(x, y5, col='greenyellow')
+
+abline(v=x[which(y4==max(y4))])
+abline(v=x[which(y5==max(y5))])
+abline(v=x[which(y7a==max(y7a))])
+
+# light
+lines(x, y7b, col='orange', type='l')
+lines(x, y6, col='orange3')
+lines(x, y8, col='firebrick2')
+lines(x, y9, col='firebrick4')
+
+abline(v=x[which(y7b==max(y7b))])
+abline(v=x[which(y6==max(y6))])
+abline(v=x[which(y8==max(y8))])
+abline(v=x[which(y9==max(y9))])
+
+end_image()
+
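+# whisker / whisker2: draw error-bar whiskers around (x0,y0) using arrows() with flat (90-degree) heads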
+whisker <- function (x0,y0,sd, length=0.05)
+{
+    arrows(x0, y0, x0, y0+sd, code=2, angle=90, length=length)
+    arrows(x0, y0, x0, y0-sd, code=2, angle=90, length=length)
+}
+
+whisker2 <- function (x0,y0, y0_high, y0_low, col="black", length=0.05)
+{
+    arrows(x0, y0, x0, y0_high, code=2, angle=90, length=length, col=col)
+    arrows(x0, y0, x0, y0_low, code=2, angle=90, length=length, col=col)
+}
+
+start_image("rt_aggregate_times.png")
+par(mfrow=c(1,1))
+par(mai=c(1,1,1,1))
+par(mar=c(5,4,4,4))
+
+s_list <- c(1519, 1596, 1112, 1591, 1019, 815)
+m_list <- c(0,0,0,    119,  229,  251)
+x_tick_list <- c(1,   2.5,  4, 5.5, 7, 8.5)
+x_tt_resolve_list <- c(1,   2.5,  4, 5.2,5.8, 7, 8.5)
+y_tt_resolve_list <- c( x[which(y4==max(y4))],
+                            x[which(y5==max(y5))],
+                            x[which(y6==max(y6))],
+                            x[which(y7a==max(y7a))],
+                            x[which(y7b==max(y7b))],
+                            x[which(y8==max(y8))],
+                            x[which(y9==max(y9))])
+
+
+y_mean_list <- c( h4$cp['mean'],
+                h5$cp['mean'],
+                h6$cp['mean'],
+                h7a$cp['mean'],
+                h7b$cp['mean'],
+                h8$cp['mean'],
+                h9$cp['mean'])
+
+y_sd_list <- c( h4$cp['s.d.'],
+                h5$cp['s.d.'],
+                h6$cp['s.d.'],
+                h7a$cp['s.d.'],
+                h7b$cp['s.d.'],
+                h8$cp['s.d.'],
+                h9$cp['s.d.'])
+
+days_tt_resolve <- exp(y_tt_resolve_list)/24
+days_tt_resolve_low <- exp(y_tt_resolve_list-y_sd_list)/24
+days_tt_resolve_high <- exp(y_tt_resolve_list+y_sd_list)/24
+
+
+my_mean_list <- c( mh_8$cp['mean'],
+                mh_89$cp['mean'],
+                mh_9$cp['mean'])
+
+my_sd_list <- c( mh_8$cp['s.d.'],
+                mh_89$cp['s.d.'],
+                mh_9$cp['s.d.'])
+
+mx_tt_resolve_list <- c(7, 8, 8.5)
+my_tt_resolve_list <- c(x[which(my8==max(my8))],
+                        x[which(my89==max(my89))],
+                        x[which(my9==max(my9))] )
+
+mdays_tt_resolve <- exp(my_tt_resolve_list)/24
+mdays_tt_resolve_low <- exp(my_tt_resolve_list-my_sd_list)/24
+mdays_tt_resolve_high <- exp(my_tt_resolve_list+my_sd_list)/24
+
+
+days_y_sd_list <- exp(y_sd_list)/24
+mdays_y_sd_list <- exp(my_sd_list)/24
+
+days_y_sd_list <- exp(y_sd_list)/24
+mdays_tt_resolve <- exp(my_tt_resolve_list)/24
+
+plot(x_tt_resolve_list, days_tt_resolve, type='p', pch=c(22), axes=FALSE, 
+        log='y', ylim=c(.01,350), xlab="Year", ylab='')
+#points(x_tt_resolve_list, days_tt_resolve, pch=c(22))
+
+lines(c(x_tt_resolve_list[1:2], x_tt_resolve_list[4]), c(days_tt_resolve[1:2], days_tt_resolve[4]), col='red')
+lines(c(x_tt_resolve_list[3], x_tt_resolve_list[5:7]), c(days_tt_resolve[3], days_tt_resolve[5:7]), col='green')
+#lines(mx_tt_resolve_list, mdays_tt_resolve)
+#points(mx_tt_resolve_list, mdays_tt_resolve, pch=c(24))
+
+lines(mx_tt_resolve_list, mdays_tt_resolve, col='blue')
+points(mx_tt_resolve_list, mdays_tt_resolve, pch=c(24))
+
+ticks<-c(0,0.01, 0.1, 0.5,1,2,4,7,21, 28, 7*8, 7*16)
+
+axis(1, labels=c('2004', '2005', '2006', '2007', '2008', '2009'), at=x_tick_list)
+axis(2, labels=ticks, at=ticks)
+mtext("Days to Resolve Message", 2, line=3)
+#axis(2, labels=ticks, at=ticks)
+#for (i in 1:length(days_y_sd_list) ) {
+#    whisker(x_tt_resolve_list[i], days_tt_resolve[i], days_y_sd_list[i])
+#}
+#for (i in 1:length(mdays_y_sd_list) ) {
+#    whisker(mx_tt_resolve_list[i], mdays_tt_resolve[i], mdays_y_sd_list[i])
+#}
+for (i in c(1,2,4) ) {
+    whisker2(x_tt_resolve_list[i], days_tt_resolve[i], 
+            days_tt_resolve_high[i], days_tt_resolve_low[i], col='red')
+}
+for (i in c(3,5,6,7) ) {
+    whisker2(x_tt_resolve_list[i], days_tt_resolve[i], 
+            days_tt_resolve_high[i], days_tt_resolve_low[i], col='green')
+}
+for (i in 1:length(mdays_y_sd_list) ) {
+    whisker2(mx_tt_resolve_list[i], mdays_tt_resolve[i], 
+            mdays_tt_resolve_high[i], mdays_tt_resolve_low[i], col='blue')
+}
+
+abline(h=21,col='grey90')
+abline(h=2,col='grey90')
+abline(h=0.5,col='grey80')
+
+legend(1, .05, 
+        cex=0.7,
+        legend=c("Unstable Periods", "Stable Periods", "MyOps Messages"), 
+        pch=c(22, 22, 24),
+        col=c('red', 'green', 'blue'),
+        lty=c(1, 1,1), merge=T)
+end_image()
+# install.packages('UsingR')
+require(UsingR)
+
+m<-min(t_4$start)
+d<-data.frame(
+    '2004'=t_4$start-m,
+    '2005'=t_5$start-m,
+    '2006'=t_6$start-m)
+simple.violinplot(d)
+
+par(mfrow=c(3,3))
+par(mai=c(.3,.3,.3,.3))
+sp <- function (t)
+{
+    d <- (t$lastreply-t$start)/(60*60*24)
+    simple.violinplot(log(d))
+}
+sp(t_4)
+sp(t_5)
+sp(t_6)
+sp(t_7)
+sp(t_8)
+sp(t_9)
+sp(m_8)
+sp(m_89)
+sp(m_9)
+
+
+t3 <- add_year (t2)
+m3 <- add_year (m2)
+
+par(mfrow=c(1,2))
+par(mai=c(.5,.5,.5,.5))
+t4<-t3[which((t3$lastreply-t3$start)/(60*60*24) < 20),]
+t4<-t3
+simple.violinplot(log((lastreply-start)/(60*60*24)) ~ year, data=t4)
+
+m3[which((m3$lastreply-m3$start)< 0),]
+m4<-m3[which((m3$lastreply-m3$start)/(60*60*24) < 100),]
+simple.violinplot(log((lastreply-start)/(60*60*24)) ~ year, data=m4, log='y')
+
+meanof <- function (t, year)
+{
+    tx <- t[which(t$year == year),]
+    r<-sn.em(y=log((tx$lastreply-tx$start)/(60*60*24)))
+    return (r)
+}
+
+t_sd <- c( meanof(t3,2004)$cp['s.d.'],
+        meanof(t3,2005)$cp['s.d.'],
+        meanof(t3,2006)$cp['s.d.'],
+        meanof(t3,2007)$cp['s.d.'],
+        meanof(t3,2008)$cp['s.d.'],
+        meanof(t3,2009)$cp['s.d.'],
+        meanof(t3,2010)$cp['s.d.'])
+t_p <- c( meanof(t3,2004)$cp['mean'],
+        meanof(t3,2005)$cp['mean'],
+        meanof(t3,2006)$cp['mean'],
+        meanof(t3,2007)$cp['mean'],
+        meanof(t3,2008)$cp['mean'],
+        meanof(t3,2009)$cp['mean'],
+        meanof(t3,2010)$cp['mean'])
+points(t_p)
+for (i in 1:length(t_sd) ) {
+    whisker(i, t_p[i], exp(t_sd[i]))
+}
+
+
+
+
+
+
+#for (i in 1:length(y_tt_resolve_list) ) { 
+#    whisker(x_tt_resolve_list[i], scale_by*y_tt_resolve_list[i], scale_by*2) 
+#}
+#for (i in 1:length(my_tt_resolve_list) ) { 
+#    whisker(mx_tt_resolve_list[i], scale_by*my_tt_resolve_list[i], scale_by*2) 
+#}
+
+#
+#end_image()
+#par(mfrow=c(2,1))
+#plot_rt_hist(t_67)
+#plot_rt_hist(t_89)
+par(mfrow=c(1,1))
+
diff --git a/statistics/rt_monitor_data.r b/statistics/rt_monitor_data.r
new file mode 100644 (file)
index 0000000..62b63ff
--- /dev/null
@@ -0,0 +1,132 @@
+
+
+source("functions.r");
+
+# system("parse_rt_data.py 22 > rt_monitor_data.csv");
+m <- read.csv('rt_monitor_data.csv', sep=',', header=TRUE)
+
+par(mfrow=c(2,1))
+
+h<-hist(log(log(m$replies)), breaks=50)
+lines(h$breaks[which(h$counts!=0)], h$counts[which(h$counts!=0)])
+h<-hist(log(log(log(m$replies))), breaks=50)
+lines(h$breaks[which(h$counts!=0)], h$counts[which(h$counts!=0)])
+
+
+par(mfrow=c(1,1))
+
+m2 <- m[which(m$complete == 1),]
+d <- (m2$lastreply - m2$start)/(60*60)
+
+#start_image("rt_hist_ttc_1000.png")
+#hist(d[which(d<1000)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#start_image("rt_hist_ttc_200.png")
+#hist(d[which(d<200)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#start_image("rt_hist_ttc_50.png")
+#hist(d[which(d<50)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#start_image("rt_hist_ttc_10.png")
+#hist(d[which(d<10)], xlab="hours from creation to last reply", breaks=30)
+#end_image()
+#
+#d2 <- (t2$lastreply - t2$start)
+#h<-hist(log(d2), plot=F, breaks=50)
+#lines(h$breaks[which(h$counts!=0)], h$counts[which(h$counts!=0)])
+
+
+# this doesn't work as I would like.  I think the bins aren't as I expect
+#h <- hist(d, plot=F, breaks=c(seq(0,max(d)+1, .1)))
+#plot(h$counts, log="x", pch=20, col="blue",
+#      main="Log-normal distribution",
+#      xlab="Value", ylab="Frequency")
+
+#plot(log(d2))
+#plot(ecdf(d2))
+
+d2<-(m2$lastreply-m2$start)
+start_image("rt_monitor_ttc.png")
+par(mfrow=c(2,1))
+qqnorm(log(d2))
+plot_rt_hist(m2)
+end_image()
+
+par(mfrow=c(1,1))
+start_image("rt_monitor_trends.png")
+hist(log(d2[which(d2>59026)]), breaks=60, xlab="LOG(time to last-reply)", main="Monitor Queue Traffic patterns")
+end_image()
+
+tstamp_78 <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))[1]
+tstamp_89 <-unclass(as.POSIXct("2009-01-01", origin="1960-01-01"))[1]
+
+m_7 <- m2[which( m2$start < tstamp_78 ),]
+m_8 <- m2[which( m2$start >= tstamp_78 & m2$start < tstamp_89 ),]
+m_9 <- m2[which( m2$start >= tstamp_89 ),]
+
+
+par(mfrow=c(3,1))
+plot_rt_hist(m_7)
+plot_rt_hist(m_8)
+plot_rt_hist(m_9)
+par(mfrow=c(1,1))
+
+
+tstamp <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))
+m_67 <- m2[which( m2$start <  tstamp[1] ),]
+m_89 <- m2[which( m2$start >= tstamp[1] ),]
+
+
+#par(mfrow=c(2,1))
+#plot_rt_hist(t_67)
+#plot_rt_hist(t_89)
+par(mfrow=c(1,1))
+par(mai=c(1,1,1,2))
+par(mar=c(5,4,4,8))
+
+s_list <- c('2006'=1112, '2007'=1591, '2008'=1019, '2009'=815)
+m_list <- c('2006'=0,    '2007'=119,  '2008'=229,  '2009'=251)
+
+start_image('rt_aggregate_traffic.png')
+par(mfrow=c(1,1))
+par(mai=c(1,1,1,1))
+par(mar=c(5,4,4,4))
+
+s_list <- c(1519, 1596, 1112, 1591, 1019, 815)
+m_list <- c(0,0,0,    119,  229,  251)
+x_online_node_list <- c(1,   2.5,  4, 5.5, 7, 8.5)
+y_online_node_list <- c(330, 480,  500,    550,  575,  642)
+
+y<- rbind(support=s_list, monitor=m_list)
+barplot(y, space=0.5, width=1, ylim=c(0,2000), xlim=c(0,9),  
+        col=c('grey35', 'grey85'),
+        legend=F, ylab="Messages with One or More Replies", xlab="Year")
+scale_by <- 1500 / 700
+lines(x_online_node_list, y_online_node_list*scale_by)
+points(x_online_node_list, y_online_node_list*scale_by, pch=c(22))
+ticks<-c(0, 100, 200, 300, 400, 500, 600, 700)
+
+axis(1, labels=c('2004', '2005', '2006', '2007', '2008', '2009'), at=x_online_node_list)
+axis(4, labels=ticks, at=ticks*scale_by)
+
+mtext("Online Node Count", 4, line=3)
+legend(6.5, 2000, 
+        cex=0.7,
+        legend=c("Online Node Count", "MyOps Messages", "Support Messages"), 
+         fill=c(0, 'grey85', 'grey35'),
+        lty=c(1,0,0), merge=T)
+end_image()
+
+
+start_image("rt_monitor_seasonal.png")
+par(mfrow=c(3,1))
+par(mai=c(.3,.3,.3,.3))
+year_hist(m_7, "2007", "2006/12/31", "2008/1/7", 60)
+year_hist(m_8, "2008", "2007/12/30", "2009/1/7", 60)
+year_hist(m_9, "2009", "2008/12/28", "2010/1/30", 60)
+end_image()
+
+par(mfrow=c(1,1))
diff --git a/statistics/site_scores.r b/statistics/site_scores.r
new file mode 100644 (file)
index 0000000..615d303
--- /dev/null
@@ -0,0 +1,83 @@
+
+source("functions.r");
+
+#system("../nodequery.py --nodelist > ../nodelist.txt")
+#system("../comonquery.py --cache --nodelist ../nodelist.txt --select 'resptime>0' --fields='name,cpuspeed,numcores,memsize,disksize,bwlimit' | grep -v null | ./hn2lb.py | ./hn2pcustatus.py > ./out_resources.csv ")
+
+mdrc <- read.csv("out_resources.csv", TRUE, sep=",")
+
+# replace all weird numbers with defaults of 100mbps 
+mdrc$bwlimit <- replace(mdrc$bwlimit, which(mdrc$bwlimit==0 | mdrc$bwlimit==1), 100000)
+
+f<-slices_2
+
+s2<- f(mdrc, FALSE);
+mdrc$score <- s2;
+b<-30;
+
+# ----------------------
+### LOGINBASE
+unique_loginbase_length <- length(unique(mdrc$loginbase));
+unique_lb <- list(loginbase=array(0,c(unique_loginbase_length)), 
+                                 score=array(0,c(unique_loginbase_length)),
+                                 memsize=array(0,c(unique_loginbase_length)),
+                                 disksize=array(0,c(unique_loginbase_length)),
+                                 cpuspeed=array(0,c(unique_loginbase_length))
+                         )
+
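+# accumulate per-site (loginbase) totals: the summed node scores plus the mem/disk/cpu components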
+for ( i in 1:length(mdrc$loginbase) )
+{
+    r <- mdrc[i,];
+       unique_lb$loginbase[r$loginbase] <- r$loginbase;
+       unique_lb$score[r$loginbase]     <- unique_lb$score[r$loginbase] + r$score;
+
+       v <- f(r, TRUE);
+       unique_lb$memsize[r$loginbase]  <- unique_lb$memsize[r$loginbase]  + v[1];
+       unique_lb$disksize[r$loginbase] <- unique_lb$disksize[r$loginbase]  + v[2];
+       unique_lb$cpuspeed[r$loginbase] <- unique_lb$cpuspeed[r$loginbase]  + v[3];
+}
+
+df<- data.frame(unique_lb)
+
+h<- hist(df$score, breaks=b);
+bins<-length(h$breaks);
+c<- array(0,c(bins));
+d<- array(0,c(bins));
+m<- array(0,c(bins));
+b<- array(0,c(bins));   # not used in this script
+# foreach score value, find which range it falls into, 
+# then in three columns for cpu, mem, disk, record the fraction of each.
+# then plot each sequence in a stacked graph, perhaps beside h$counts
+for ( i in 1:length(df$cpuspeed) )
+{
+    r <- df[i,];
+    s <- index_of_bin(h, r$score); # find bin position...
+    # take fraction that each component contributes to the total, and add to sum
+
+    m[s] <- m[s] + unique_lb$memsize[r$loginbase]/r$score;
+    d[s] <- d[s] + unique_lb$disksize[r$loginbase]/r$score;
+    c[s] <- c[s] + unique_lb$cpuspeed[r$loginbase]/r$score;
+}
+
+a <- array(c(c,d,m), dim=c(bins, 3));
+
+png("/Users/soltesz/Downloads/slice_policy_3.png")
+par(mfrow=c(2,1))
+par(mai=c(0.5,1,0.5,0.2))
+barplot(c(0,h$counts), 
+    xlab="slice count", 
+    main="Distribution of Site Scores", 
+    ylab="Total Frequency", 
+    ylim=c(0,70))
+par(mai=c(1.0,1,0,0.2));
+barplot(t(a), 
+    legend=c("CPUspeed (GHz)", "DISKsize (GB)", "MEMsize (GB)"), 
+    col=c("pink", "lightblue", "lightgreen"), 
+    ylim=c(0,70),
+    ylab="Break-down by Resource",
+    xlab="Site Score",
+    names.arg=c(0, h$breaks[1:(length(h$breaks)-1)]),
+);
+dev.off()
+
+
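site_scores.r (and the two variants below) call index_of_bin(), which is defined in functions.r and not shown in this excerpt. Assuming it simply returns the histogram bin a value falls into, a plausible sketch is the following; the real helper may differ, e.g. in how it treats values that land exactly on a break:

# Hypothetical index_of_bin(): map a value to the bin of an R hist() object.
# findInterval() returns the index i such that h$breaks[i] <= x < h$breaks[i+1].
index_of_bin <- function(h, x) {
    findInterval(x, h$breaks, all.inside=TRUE)
}

# usage: h <- hist(df$score, breaks=30, plot=FALSE); index_of_bin(h, 12.5)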
diff --git a/statistics/site_scores_bw.r b/statistics/site_scores_bw.r
new file mode 100644 (file)
index 0000000..1697671
--- /dev/null
@@ -0,0 +1,87 @@
+
+source("functions.r");
+
+#system("../nodequery.py --nodelist > ../nodelist.txt")
+#system("../comonquery.py --cache --nodelist ../nodelist.txt --select 'resptime>0' --fields='name,cpuspeed,numcores,memsize,disksize,bwlimit' | grep -v null | ./hn2lb.py | ./hn2pcustatus.py > ./out_resources.csv ")
+
+mdrc <- read.csv("out_resources.csv", TRUE, sep=",")
+
+# replace sentinel bwlimit values (0 or 1) with the 100 Mbps default (100000 kbps)
+mdrc$bwlimit <- replace(mdrc$bwlimit, which(mdrc$bwlimit==0 | mdrc$bwlimit==1), 100000)
+
+f<-slices_3
+
+s2<- f(mdrc, FALSE);
+mdrc$score <- s2;
+b<-30;
+
+# ----------------------
+### LOGINBASE
+unique_loginbase_length <- length(unique(mdrc$loginbase));
+unique_lb <- list(loginbase=array(0,c(unique_loginbase_length)), 
+                                 score=array(0,c(unique_loginbase_length)),
+                                 memsize=array(0,c(unique_loginbase_length)),
+                                 disksize=array(0,c(unique_loginbase_length)),
+                                 cpuspeed=array(0,c(unique_loginbase_length)),
+                                 bwlimit=array(0,c(unique_loginbase_length))
+                         )
+
+for ( i in 1:length(mdrc$loginbase) )
+{
+    r <- mdrc[i,];
+       unique_lb$loginbase[r$loginbase] <- r$loginbase;
+       unique_lb$score[r$loginbase]     <- unique_lb$score[r$loginbase] + r$score;
+
+       v <- f(r, TRUE);
+       unique_lb$memsize[r$loginbase]  <- unique_lb$memsize[r$loginbase]  + v[1];
+       unique_lb$disksize[r$loginbase] <- unique_lb$disksize[r$loginbase]  + v[2];
+       unique_lb$cpuspeed[r$loginbase] <- unique_lb$cpuspeed[r$loginbase]  + v[3];
+       unique_lb$bwlimit[r$loginbase] <- unique_lb$bwlimit[r$loginbase]  + v[4];
+}
+
+df<- data.frame(unique_lb)
+
+h<- hist(df$score, breaks=b);
+bins<-length(h$breaks);
+c<- array(0,c(bins));
+d<- array(0,c(bins));
+m<- array(0,c(bins));
+b<- array(0,c(bins));
+# foreach score value, find which range it falls into, 
+# then in four columns for bw, cpu, mem, disk, record the fraction of each.
+# then plot each sequence in a stacked graph, perhaps beside h$counts
+for ( i in 1:length(df$cpuspeed) )
+{
+    r <- df[i,];
+    s <- index_of_bin(h, r$score); # find bin position...
+    # take fraction that each component contributes to the total, and add to sum
+
+    m[s] <- m[s] + unique_lb$memsize[r$loginbase]/r$score;
+    d[s] <- d[s] + unique_lb$disksize[r$loginbase]/r$score;
+    c[s] <- c[s] + unique_lb$cpuspeed[r$loginbase]/r$score;
+    b[s] <- b[s] + unique_lb$bwlimit[r$loginbase]/r$score;
+}
+
+#vals <- list(bwlimit=b,cpuspeed=c,disksize=d,memsize=m)
+a <- array(c(b,c,d,m), dim=c(bins, 4));
+
+png("/Users/soltesz/Downloads/slice_policy_4.png")
+par(mfrow=c(2,1))
+par(mai=c(0.5,1,0.5,0.2))
+barplot(c(0,h$counts), 
+    xlab="slice count", 
+    main="Distribution of Site Scores", 
+    ylab="Total Frequency", 
+    ylim=c(0,70))
+par(mai=c(1.0,1,0,0.2));
+barplot(t(a), 
+    legend=c("BWlimit (Mbps)", "CPUspeed (GHz)", "DISKsize (GB)", "MEMsize (GB)"), 
+    col=c("lightyellow", "pink", "lightblue", "lightgreen"), 
+    ylim=c(0,70),
+    ylab="Break-down by Resource",
+    xlab="Site Score",
+    names.arg=c(0, h$breaks[1:(length(h$breaks)-1)]),
+);
+dev.off()
+
+
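The per-loginbase accumulation loops in these scripts build the unique_lb arrays row by row, indexing by the loginbase factor. For the plain per-site score totals the same grouping can be done in one call with base R's rowsum(); a sketch only (the per-component sums still come from the slices_* helper, so this does not replace the whole loop):

# Sketch: per-site score totals without the explicit loop.
site_totals <- rowsum(mdrc$score, group=mdrc$loginbase)   # one row per loginbase
head(site_totals)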
diff --git a/statistics/site_scores_pcu.r b/statistics/site_scores_pcu.r
new file mode 100644 (file)
index 0000000..c4a3f92
--- /dev/null
@@ -0,0 +1,94 @@
+
+source("functions.r");
+
+#system("../nodequery.py --nodelist > ../nodelist.txt")
+#system("../comonquery.py --cache --nodelist ../nodelist.txt --select 'resptime>0' --fields='name,cpuspeed,numcores,memsize,disksize,bwlimit' | grep -v null | ./hn2lb.py | ./hn2pcustatus.py | sed -e 's/none/0/g' -e 's/Not_Run/0.5/g' -e 's/error/0.5/g' -e 's/Ok/1/g' > ./out_resources.csv ")
+
+mdrc <- read.csv("out_resources.csv", TRUE, sep=",")
+
+# replace sentinel bwlimit values (0 or 1) with the 100 Mbps default (100000 kbps)
+mdrc$bwlimit <- replace(mdrc$bwlimit, which(mdrc$bwlimit==0 | mdrc$bwlimit==1), 100000)
+#mdrc$pcus <- replace(mdrc$pcustatus, which(mdrc$pcustatus=="none"), 0);
+#mdrc$pcus <- replace(mdrc$pcus, which(mdrc$pcus=="error" | mdrc$pcus=="Not_Run"), 0.5);
+#mdrc$pcus <- replace(mdrc$pcus, which(mdrc$pcus=="Ok"), 1);
+
+f<-slices_4
+
+s2<- f(mdrc, FALSE);
+mdrc$score <- s2;
+b<-30;
+
+# ----------------------
+### LOGINBASE
+unique_loginbase_length <- length(unique(mdrc$loginbase));
+unique_lb <- list(loginbase=array(0,c(unique_loginbase_length)), 
+                                 score=array(0,c(unique_loginbase_length)),
+                                 memsize=array(0,c(unique_loginbase_length)),
+                                 disksize=array(0,c(unique_loginbase_length)),
+                                 cpuspeed=array(0,c(unique_loginbase_length)),
+                                 bwlimit=array(0,c(unique_loginbase_length)),
+                                 pcustatus=array(0,c(unique_loginbase_length))
+                         )
+
+for ( i in 1:length(mdrc$loginbase) )
+{
+    r <- mdrc[i,];
+       unique_lb$loginbase[r$loginbase] <- r$loginbase;
+       unique_lb$score[r$loginbase]     <- unique_lb$score[r$loginbase] + r$score;
+
+       v <- f(r, TRUE);
+       unique_lb$memsize[r$loginbase]  <- unique_lb$memsize[r$loginbase]  + v[1];
+       unique_lb$disksize[r$loginbase] <- unique_lb$disksize[r$loginbase]  + v[2];
+       unique_lb$cpuspeed[r$loginbase] <- unique_lb$cpuspeed[r$loginbase]  + v[3];
+       unique_lb$bwlimit[r$loginbase] <- unique_lb$bwlimit[r$loginbase]  + v[4];
+       unique_lb$pcustatus[r$loginbase] <- unique_lb$pcustatus[r$loginbase]  + v[5];
+}
+
+df<- data.frame(unique_lb)
+
+h<- hist(df$score, breaks=b);
+bins<-length(h$breaks);
+c<- array(0,c(bins));
+d<- array(0,c(bins));
+m<- array(0,c(bins));
+b<- array(0,c(bins));
+p<- array(0,c(bins));
+# foreach score value, find which range it falls into, 
+# then in five columns for pcu, bw, cpu, mem, disk, record the fraction of each.
+# then plot each sequence in a stacked graph, perhaps beside h$counts
+for ( i in 1:length(df$cpuspeed) )
+{
+    r <- df[i,];
+    s <- index_of_bin(h, r$score); # find bin position...
+    # take fraction that each component contributes to the total, and add to sum
+
+    m[s] <- m[s] + unique_lb$memsize[r$loginbase]/r$score;
+    d[s] <- d[s] + unique_lb$disksize[r$loginbase]/r$score;
+    c[s] <- c[s] + unique_lb$cpuspeed[r$loginbase]/r$score;
+    b[s] <- b[s] + unique_lb$bwlimit[r$loginbase]/r$score;
+    p[s] <- p[s] + unique_lb$pcustatus[r$loginbase]/r$score;
+}
+
+#vals <- list(bwlimit=b,cpuspeed=c,disksize=d,memsize=m)
+a <- array(c(p,b,c,d,m), dim=c(bins, 5));
+
+#png("/Users/soltesz/Downloads/slice_policy_5.png")
+par(mfrow=c(2,1))
+par(mai=c(0.5,1,0.5,0.2))
+barplot(c(0,h$counts), 
+    xlab="slice count", 
+    main="Distribution of Site Scores", 
+    ylab="Total Frequency", 
+    ylim=c(0,70))
+par(mai=c(1.0,1,0,0.2));
+barplot(t(a), 
+    legend=c("PCU Status", "BWlimit (Mbps)", "CPUspeed (GHz)", "DISKsize (GB)", "MEMsize (GB)"), 
+    col=c("orange", "lightyellow", "pink", "lightblue", "lightgreen"), 
+    ylim=c(0,70),
+    ylab="Break-down by Resource",
+    xlab="Site Score",
+    names.arg=c(0, h$breaks[1:(length(h$breaks)-1)]),
+);
+#dev.off()
+
+
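The commented-out replace() calls and the sed stage above encode the same PCU-status weighting (none -> 0, Not_Run and error -> 0.5, Ok -> 1). If out_resources.csv kept the raw status strings, one way to apply that mapping in R instead of sed is a named lookup vector; a sketch, assuming a pcustatus column containing exactly those four strings:

# Sketch: numeric PCU weights from the status strings, rather than rewriting them with sed.
pcu_weights <- c(none=0, Not_Run=0.5, error=0.5, Ok=1)
mdrc$pcus   <- pcu_weights[as.character(mdrc$pcustatus)]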