X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=blobdiff_plain;f=statistics%2Frt_data.r;h=0c32b3b3ea8ff4ce499c3b8d1587adbec42352bd;hp=9ba227b778f96ec162fc1ce8be0ba58b6dae33d2;hb=a043462186ca0db241c1b52b6660ee39dcbceb3e;hpb=86a30aa99415b4a68f9b483487ba101a722c3f92 diff --git a/statistics/rt_data.r b/statistics/rt_data.r index 9ba227b..0c32b3b 100644 --- a/statistics/rt_data.r +++ b/statistics/rt_data.r @@ -39,7 +39,6 @@ d <- (t2$lastreply - t2$start)/(60*60) #h<-hist(log(d2), plot=F, breaks=50) #lines(h$breaks[which(h$counts!=0)], h$counts[which(h$counts!=0)]) - # this doesn't work as I would like. I think the bins aren't as I expect #h <- hist(d, plot=F, breaks=c(seq(0,max(d)+1, .1))) #plot(h$counts, log="x", pch=20, col="blue", @@ -49,12 +48,12 @@ d <- (t2$lastreply - t2$start)/(60*60) #plot(log(d2)) #plot(ecdf(d2)) -tstamp_45 <-unclass(as.POSIXct("2005-01-01", origin="1960-01-01"))[1] -tstamp_56 <-unclass(as.POSIXct("2006-01-01", origin="1960-01-01"))[1] -tstamp_67 <-unclass(as.POSIXct("2007-01-01", origin="1960-01-01"))[1] -tstamp_78 <-unclass(as.POSIXct("2008-01-01", origin="1960-01-01"))[1] -tstamp_89 <-unclass(as.POSIXct("2009-01-01", origin="1960-01-01"))[1] -tstamp_90 <-unclass(as.POSIXct("2010-01-01", origin="1960-01-01"))[1] +tstamp_45 <-unclass(as.POSIXct("2005-01-01", origin="1970-01-01"))[1] +tstamp_56 <-unclass(as.POSIXct("2006-01-01", origin="1970-01-01"))[1] +tstamp_67 <-unclass(as.POSIXct("2007-01-01", origin="1970-01-01"))[1] +tstamp_78 <-unclass(as.POSIXct("2008-01-01", origin="1970-01-01"))[1] +tstamp_89 <-unclass(as.POSIXct("2009-01-01", origin="1970-01-01"))[1] +tstamp_90 <-unclass(as.POSIXct("2010-01-01", origin="1970-01-01"))[1] t_4 <- t2[which( t2$start < tstamp_45 ),] @@ -87,6 +86,81 @@ year_hist(t_8, "2008", "2007/12/30", "2009/1/7", 85) year_hist(t_9, "2009", "2008/12/28", "2010/1/30", 85) end_image() +h4<-year_hist(t_4, "2004", "2003/12/28", "2005/2/7", 0, type='month', fmt="%b") +h5<-year_hist(t_5, "2005", "2005/1/2", "2006/2/7", 0, type='month', fmt="%b") +h6<-year_hist(t_6, "2006", "2006/1/1", "2007/2/7", 0, type='month', fmt="%b") +h7<-year_hist(t_7, "2007", "2006/12/31", "2008/2/7", 0, type='month', fmt="%b") +h8<-year_hist(t_8, "2008", "2007/12/30", "2009/2/7", 0, type='month', fmt="%b") +h9<-year_hist(t_9, "2009", "2008/12/28", "2010/1/30", 0, type='month', fmt="%b") + +hall<-year_hist(t2, "200x", "2004/1/1", "2010/3/28", 0, type='month', fmt="%b") + +threshold <- function (hall, d, from, to, type, fmt="%b") +{ + dates <-seq(as.Date(from), as.Date(to), type) + months <- format(dates, fmt) + hbreaks<-unclass(as.POSIXct(dates)) + + x<-seq(1,length(hall$breaks)) + a_x<-x[which(hall$counts>d)] + a_y<-hall$counts[which(hall$counts>d)] + b_x<-x[which(hall$counts length(b) ) { + yy<- rbind(yy,hall$counts[b[i]:length(hall$counts)]) + } else { + yy<- rbind(yy,hall$counts[b[i]:b[i+1]-1]) + } +} +yy[7,3:12]<-0 # no data for beyond feb. +y2<-NULL ; for ( i in seq(1,12) ) { y2<-c(y2,sum(yy[,i])) } + +start_image('rt_aggregate_months.png', width=600, height=300) +barplot(y2, space=.1, width=.9, col=c('blue','red', 'red', 'red', 'red', + 'blue', 'blue', 'red', 'red', 'red', 'blue', 'blue'), + xlab="Months", ylab="Sum of Tickets over 6 years") +axis(1, labels=c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', + 'Sep', 'Oct', 'Nov', 'Dec'), at=c(0,1,2,3,4,5,6,7,8,9,10,11)+.5) +end_image() + +cc<-NULL ; +for (i in 1:length(yy)) +{ + if ( t(yy)[i] < 80 ) + { + cc<- c(cc, 'blue') + } else { + cc<- c(cc, 'red') + } +} +barplot(yy, col=cc) + +# skip 2007 +start_image('rt_aggregate_months_no2007.png', width=600, height=300) +y3<-NULL ; for ( i in seq(1,12) ) { y3<-c(y3,sum(yy[1:3,i], yy[5:7,i])) } +barplot(y3, , space=.1, width=.9, col=c('blue','blue', 'red', 'red', 'red', + 'blue', 'blue', 'red', 'red', 'red', 'blue', 'blue'), + xlab="Months", ylab="Sum of Tickets over 6 years") +axis(1, labels=c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', + 'Sep', 'Oct', 'Nov', 'Dec'), at=c(0,1,2,3,4,5,6,7,8,9,10,11)+.5) +end_image() + + + + par(mai=c(0.7,0.7,0.7,0.7)) par(mfrow=c(1,1)) @@ -118,8 +192,245 @@ time_hist <- function (t, lessthan, year, log=T, breaks=30, xlim=c(-4,10), ylim= } return (h); } + +median_time_to_resolve_window <- function (t, from, to, window, fmt="%b") +{ + # find 'type' range of days + dates <-seq(as.Date(from), as.Date(to), 'week') + months <- format(dates, fmt) + hbreaks<-unclass(as.POSIXct(dates)) + + xx<-NULL; + yy<-NULL; + yy_sd_high<-NULL; + yy_sd_low<-NULL; + date_index <- NULL; + q_list <- NULL; + + x<-seq(-20,20,0.01) + for ( i in seq(1,length(hbreaks)-window-1) ) + { + print (sprintf("round %s of %s", i, length(hbreaks)-window-1)) + # get range from t + t_sub <- t[which(t$start > hbreaks[i] & t$start<= hbreaks[i+window]),] + if ( length(t_sub$start) <= 1 ) { next } + # take log, then sn.mle -> h + d <- (t_sub$lastreply - t_sub$start)/(60*60) # hours + d <- log(d) # log(hours) + # sn.mle + print (sprintf("length: %s", length(d))) + q<-quantile(d) + print(q) + + date_index <- c(date_index, round(i+window/2)) + + xx<- c(xx, hbreaks[round(i+window/2)]) + q_list <- rbind(q_list, q) + + } + m<- months[date_index] + return (cbind(xx,q_list, m)) +} +mean_time_to_resolve_window <- function (t, from, to, window, fmt="%b") +{ + # find 'type' range of days + dates <-seq(as.Date(from), as.Date(to), 'week') + months <- format(dates, fmt) + hbreaks<-unclass(as.POSIXct(dates)) + + xx<-NULL; + yy<-NULL; + yy_sd_high<-NULL; + yy_sd_low<-NULL; + date_list <- NULL; + + x<-seq(-20,20,0.01) + for ( i in seq(1,length(hbreaks)-window-1) ) + { + print (sprintf("round %s of %s", i, length(hbreaks)-window-1)) + # get range from t + t_sub <- t[which(t$start > hbreaks[i] & t$start<= hbreaks[i+window]),] + if ( length(t_sub$start) <= 1 ) { next } + # take log, then sn.mle -> h + d <- (t_sub$lastreply - t_sub$start)/(60*60) # hours + d <- log(d) # log(hours) + # sn.mle + print (sprintf("length: %s", length(d))) + avg<-mean(d) + s<-sd(d) + r<-shapiro.test(d) #, mean=avg, sd=s) + if ( r$statistic < 0.9 ){ + print (r); + } + + m<-dnorm(x, mean=avg, sd=s) + print(avg) + # find max of y + y_peak <- x[which(m==max(m))] + print(y_peak) + # plot point date, max(y) + xx<- c(xx, hbreaks[round(i+window/2)]) + yy<- c(yy, y_peak) + yy_sd_high<- y_peak + s + yy_sd_low <- y_peak - s + date_list <- c(date_list, dates[i]) + # plot whisker2(x0,y0,y0_hi,y0_lo) + } + l<-length(months)-window-1 + m<- months[1:l] + return (rbind(xx,yy,yy_sd_high, yy_sd_low, m)) +} +require(sn) +sknorm_time_to_resolve_window <- function (t, from, to, window, fmt="%b") +{ + # find 'type' range of days + dates <-seq(as.Date(from), as.Date(to), 'week') + months <- format(dates, fmt) + hbreaks<-unclass(as.POSIXct(dates)) + + xx<-NULL; + yy<-NULL; + yy_sd_high<-NULL; + yy_sd_low<-NULL; + date_list <- NULL; + + x<-seq(-20,20,0.01) + for ( i in seq(1,length(hbreaks)-window-1) ) + { + print (sprintf("round %s of %s", i, length(hbreaks)-window-1)) + # get range from t + t_sub <- t[which(t$start > hbreaks[i] & t$start<= hbreaks[i+window]),] + if ( length(t_sub$start) <= 1 ) { next } + # take log, then sn.mle -> h + d <- (t_sub$lastreply - t_sub$start)/(60*60) # hours + d <- log(d) # log(hours) + # sn.mle + print (sprintf("length: %s", length(d))) + h<-sn.em(y=d) + if ( abs(h$cp['skewness']) > 0.95 ) + { + print(h) + next # just skip it + } + + # find dsn() using h parameters -> y + m<-dsn(x, dp=cp.to.dp(h$cp)) + # find max of y + y_peak <- x[which(m==max(m))] + # plot point date, max(y) + xx<- c(xx, hbreaks[round(i+window/2)]) + yy<- c(yy, y_peak) + yy_sd_high<- y_peak + h$cp['s.d.'] + yy_sd_low <- y_peak - h$cp['s.d.'] + date_list <- c(date_list, dates[i]) + # plot whisker2(x0,y0,y0_hi,y0_lo) + } + l<-length(months)-window-1 + m<- months[1:l] + return (rbind(xx,yy,yy_sd_high, yy_sd_low, m)) +} + +# NOTE: Try something simpler, like median of the log of ttr. +# it's going to be a lot of work to explain lsn distributions. something +# more obvious would be a lot easier. + +par(mfrow=c(4,1)) +par(mai=c(.3,0.3,0.3,0.3)) +for ( s in c(7)) #,14,21) ) +{ + d<- median_time_to_resolve_window(t2, "2004/1/1", "2010/2/28", s, "%b%y") + plot(d[,1], exp(as.numeric(d[,5]))/24, type='l', lty=1, xlab="", + axes=F, ylim=c(0.01, 15), ylab="Days to Resolve", col='orange') + lines(d[,1], exp(as.numeric(d[,4]))/24, lty=1, col='red') + lines(d[,1], exp(as.numeric(d[,3]))/24, lty=1, col='black') + axis(1, labels=d[,7], at=d[,1]) + axis(2, las=1) + m<-round(max(exp(as.numeric(d[,4]))/24), 2) + axis(2, labels=m, at=m, las=1) + abline(h=m, lty=2, col='grey40') +} + +# monitor + d2<- median_time_to_resolve_window(m2, "2007/02/1", "2010/2/28", s, "%b%y") + plot(d[,1], exp(as.numeric(d[,2]))/24, type='l', lty=1, xlab="", + axes=F, ylim=c(0.01, 165), ylab="Days to Resolve", col='white') + lines(d2[,1], exp(as.numeric(d2[,5]))/24, lty=1, col='red') + lines(d2[,1], exp(as.numeric(d2[,4]))/24, lty=1, col='red') + lines(d2[,1], exp(as.numeric(d2[,3]))/24, lty=1, col='black') + axis(1, labels=d[,7], at=d[,1]) + axis(2, las=1) + m<-round(max(exp(as.numeric(d2[,4]))/24), 2) + axis(2, labels=m, at=m, las=1) + abline(h=m, lty=2, col='grey40') + + + +mean_time_to_resolve <- function (t, from, to, type, fmt="%b") +{ + # find 'type' range of days + dates <-seq(as.Date(from), as.Date(to), type) + months <- format(dates, fmt) + hbreaks<-unclass(as.POSIXct(dates)) + + xx<-NULL; + yy<-NULL; + yy_sd_high<-NULL; + yy_sd_low<-NULL; + date_list <- NULL; + + for ( i in seq(1,length(hbreaks)-1) ) + { + # get range from t + t_sub <- t[which(t$start > hbreaks[i] & t$start<= hbreaks[i+1]),] + if ( length(t_sub$start) == 0 ) { next } + # take log, then sn.mle -> h + d <- (t_sub$lastreply - t_sub$start)/(60*60) # hours + d <- log(d) # log(hours) + # sn.mle + h<-sn.em(y=d) + if ( abs(h$cp['skewness']) > 0.95 ) + { + print(h) + } + + # find dsn() using h parameters -> y + x<-seq(-8,10,0.01) + m<-dsn(x, dp=cp.to.dp(h$cp)) + # find max of y + y_peak <- x[which(m==max(m))] + # plot point date, max(y) + xx<- c(xx, hbreaks[i]) + yy<- c(yy, y_peak) + yy_sd_high<- y_peak + h$cp['s.d.'] + yy_sd_low <- y_peak - h$cp['s.d.'] + date_list <- c(date_list, dates[i]) + # plot whisker2(x0,y0,y0_hi,y0_lo) + } + m<- months[1:length(months)-1] + return (rbind(xx,yy,yy_sd_high, yy_sd_low, m)) +} + + +par(mfrow=c(5,1)) +par(mai=c(.3,0.3,0.3,0.3)) +for ( s in c("10 days", "2 weeks", "3 weeks", "month", "2 months")) +#for ( s in c("month") ) +{ + d<- mean_time_to_resolve(t2, "2004/1/1", "2010/2/28", s, "%b%y") + plot(d[1,], exp(as.numeric(d[2,]))/24, type='l', axes=F) + points(d[1,], exp(as.numeric(d[2,]))/24, pch=23) + axis(1, labels=d[5,], at=d[1,]) + axis(2) +} + + + tstamp <-unclass(as.POSIXct("2007-05-01", origin="1960-01-01")) -t_7a <- t_7[which(t_7$start < tstamp),] +t_7a <- t_7[t_rep <- read.csv('rt_replies.csv', sep=',', header=TRUE) +t2_rep <- t_rep[which(t_rep$complete == 1),] +t2_rep <- t_rep[which(t_rep$diff != 0),] + +which(t_7$start < tstamp),] t_7b <- t_7[which(t_7$start >= tstamp),] #end_image() @@ -222,6 +533,7 @@ whisker2 <- function (x0,y0, y0_high, y0_low, col="black", length=0.05) arrows(x0, y0, x0, y0_low, code=2, angle=90, length=length, col=col) } +# NOTE: ** monthly averages might make a more compelling case than annual averages. start_image("rt_aggregate_times.png") par(mfrow=c(1,1)) par(mai=c(1,1,1,1)) @@ -297,10 +609,10 @@ lines(c(x_tt_resolve_list[3], x_tt_resolve_list[5:7]), c(days_tt_resolve[3], day lines(mx_tt_resolve_list, mdays_tt_resolve, col='blue') points(mx_tt_resolve_list, mdays_tt_resolve, pch=c(24)) -ticks<-c(0,0.01, 0.1, 0.5,1,2,4,7,21, 28, 7*8, 7*16) +ticks<-c(0,0.01, 0.1, 0.5,1,2,4,7,14,21, 28, 60, 120) axis(1, labels=c('2004', '2005', '2006', '2007', '2008', '2009'), at=x_tick_list) -axis(2, labels=ticks, at=ticks) +axis(2, las=1, labels=ticks, at=ticks) mtext("Days to Resolve Message", 2, line=3) #axis(2, labels=ticks, at=ticks) #for (i in 1:length(days_y_sd_list) ) { @@ -322,9 +634,12 @@ for (i in 1:length(mdays_y_sd_list) ) { mdays_tt_resolve_high[i], mdays_tt_resolve_low[i], col='blue') } -abline(h=21,col='grey90') -abline(h=2,col='grey90') -abline(h=0.5,col='grey80') +abline(h=120,col='grey80', lty=2) +abline(h=21,col='grey80', lty=2) +abline(h=7,col='grey80', lty=2) +abline(h=2,col='grey80', lty=2) +abline(h=0.5,col='grey80', lty=2) +abline(h=0.1,col='grey80', lty=2) legend(1, .05, cex=0.7, @@ -413,3 +728,65 @@ for (i in 1:length(t_sd) ) { #plot_rt_hist(t_89) par(mfrow=c(1,1)) + +# system("./parse_rt_replies.py 3> rt_replies.csv") +t_rep <- read.csv('rt_replies.csv', sep=',', header=TRUE) +t2_rep <- t_rep[which(t_rep$complete == 1),] +t2_rep <- t_rep[which(t_rep$diff != 0),] + +mean_diff_time <- function (t, from, to, type, fmt="%b") +{ + # find 'type' range of days + dates <-seq(as.Date(from), as.Date(to), type) + months <- format(dates, fmt) + hbreaks<-unclass(as.POSIXct(dates)) + + xx<-NULL; + yy<-NULL; + yy_sd_high<-NULL; + yy_sd_low<-NULL; + date_list <- NULL; + + for ( i in seq(1,length(hbreaks)-1) ) + { + # get range from t + t_sub <- t[which(t$prev > hbreaks[i] & t$prev <= hbreaks[i+1]),] + if ( length(t_sub$start) == 0 ) { next } + # take log, then sn.mle -> h + d <- (abs(t_sub$diff)/(60*60)) + d <- log(d) # log(hours) + # sn.mle + h<-sn.em(y=d) + if ( abs(h$cp['skewness']) > 0.95 ) + { + print(h) + } + + # find dsn() using h parameters -> y + x<-seq(-8,10,0.01) + m<-dsn(x, dp=cp.to.dp(h$cp)) + # find max of y + y_peak <- x[which(m==max(m))] + # plot point date, max(y) + xx<- c(xx, hbreaks[i]) + yy<- c(yy, y_peak) + yy_sd_high<- y_peak + h$cp['s.d.'] + yy_sd_low <- y_peak - h$cp['s.d.'] + date_list <- c(date_list, dates[i]) + # plot whisker2(x0,y0,y0_hi,y0_lo) + } + m<- months[1:length(months)-1] + return (rbind(xx,yy,yy_sd_high, yy_sd_low, m)) +} + +par(mfrow=c(5,1)) +par(mai=c(.3,0.3,0.3,0.3)) +for ( s in c("2 weeks", "3 weeks", "month", "2 months")) +#for ( s in c("month") ) +{ + d<- mean_diff_time(t2_rep, "2004/1/1", "2010/2/28", s, "%b%y") + plot(d[1,], exp(as.numeric(d[2,]))/24, type='l', axes=F) + points(d[1,], exp(as.numeric(d[2,]))/24, pch=23) + axis(1, labels=d[5,], at=d[1,]) + axis(2) +}