#!/usr/bin/Rscript
#
# Write an overview of task status, per-metric task status, task histograms.
#
# Arguments are read positionally by main():
#   ACTION INPUT_CSV OUTPUT_DIR
# where ACTION is 'dist' or 'assoc'.

library(data.table)
library(ggplot2)

options(stringsAsFactors = FALSE)  # get rid of annoying behavior

# Print a sprintf()-style formatted message to stdout, followed by a newline.
Log <- function(fmt, ...) {
  cat(sprintf(fmt, ...))
  cat('\n')
}

# max of non-NA values; NA if there are none
MaybeMax <- function(values) {
  v <- values[!is.na(values)]
  if (length(v) == 0) {
    m <- NA
  } else {
    m <- max(v)
  }
  as.numeric(m)  # data.table requires this; otherwise we get type errors
}

# mean of non-NA values; NA if there are none
MaybeMean <- function(values) {
  v <- values[!is.na(values)]
  if (length(v) == 0) {
    m <- NA
  } else {
    m <- mean(v)
  }
  as.numeric(m)  # data.table requires this; otherwise we get type errors
}

# Aggregate the 'dist' summary by metric and write <output_dir>/overview.csv.
#
# Args:
#   summary: data frame with (at least) the columns metric, params_file,
#     map_file, date, num_reports, status, seconds and allocated_mass.
#   output_dir: directory that overview.csv is written into.
#
# Returns:
#   The aggregated data.table, sorted case-insensitively by metric name.
WriteDistOverview <- function(summary, output_dir) {
  s <- data.table(summary)  # data.table syntax is easier here

  by_metric <- s[ , list(
      params_file = unique(params_file),
      map_file = unique(map_file),
      days = length(date),
      max_num_reports = MaybeMax(num_reports),

      # summarize status
      ok = sum(status == 'OK'),
      fail = sum(status == 'FAIL'),
      timeout = sum(status == 'TIMEOUT'),
      skipped = sum(status == 'SKIPPED'),

      # TODO: Need to document the meaning of these metrics.
      # All could be NA
      # KiB -> MB
      #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
      #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),

      mean_secs = MaybeMean(seconds),
      mean_allocated_mass = MaybeMean(allocated_mass)

      # unique failure reasons
      # This can be used when there are different call stacks.
      #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
      ), by=metric]

  # Case insensitive sort by metric name
  by_metric <- by_metric[order(tolower(by_metric$metric)), ]

  overview_path <- file.path(output_dir, 'overview.csv')
  write.csv(by_metric, file = overview_path, row.names = FALSE)
  Log("Wrote %s", overview_path)

  by_metric
}

# Write status.csv, num_reports.csv, and mass.csv for each metric, under
# <output_dir>/<metric>/.
#
# NOTE(review): the per-metric subdirectories are not created here; presumably
# they already exist when this runs -- confirm against the caller.
#
# Args:
#   summary: data frame with (at least) the columns metric, job_id, date,
#     status, num_reports, seconds, allocated_mass and num_rappor.
#   output_dir: root directory holding one subdirectory per metric.
WriteDistMetricStatus <- function(summary, output_dir) {
  s <- data.table(summary)

  # loop over unique metrics, and write a CSV for each one
  for (m in unique(s$metric)) {
    # Select cols, and convert units.  Don't need params / map / metric.
    subframe <- s[s$metric == m,
                  list(job_id, date, status,
                       #vm5_peak_mb = vm5_peak_kib * 1024 / 1e6,
                       #vm5_mean_mb = vm5_mean_kib * 1024 / 1e6,
                       num_reports,
                       seconds,
                       allocated_mass, num_rappor)]

    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]

    out_path <- file.path(output_dir, m, 'status.csv')
    write.csv(subframe, file = out_path, row.names = FALSE)
    Log("Wrote %s", out_path)
  }

  # This one is just for plotting with dygraphs.  TODO: can dygraphs do
  # something smarter?  Maybe you need to select the column in JavaScript, and
  # pass it an array, rather than CSV text.
  for (m in unique(s$metric)) {
    f1 <- s[s$metric == m, list(date, num_reports)]
    path1 <- file.path(output_dir, m, 'num_reports.csv')
    # NOTE: dygraphs (only in Firefox?) doesn't like the quotes around
    # "2015-04-03".  In general, we can't turn off quotes, because strings with
    # double quotes will be invalid CSV files.  But in this case, we only have
    # date and number columns, so we can.  dygraphs is mistaken here.
    write.csv(f1, file = path1, row.names = FALSE, quote = FALSE)
    Log("Wrote %s", path1)

    # Write unallocated mass.  TODO: Write the other 2 vars too?
    f2 <- s[s$metric == m,
            list(date,
                 unallocated_mass = 1.0 - allocated_mass)]

    path2 <- file.path(output_dir, m, 'mass.csv')
    write.csv(f2, file = path2, row.names = FALSE, quote = FALSE)
    Log("Wrote %s", path2)
  }
}

# Render plot object 'p' into the PNG file <outdir>/<filename>.
#
# Args:
#   p: plot object, drawn with plot().
#   outdir: directory to write into.
#   filename: name of the PNG file within outdir.
#   width, height: image size passed to png().
WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
  filename <- file.path(outdir, filename)
  png(filename, width = width, height = height)
  # Close the device even when drawing fails, so it is never left open.
  tryCatch(plot(p), finally = dev.off())
  Log('Wrote %s', filename)
}

# Make sure the histogram has some valid input.  If we don't do this, ggplot
# blows up with an unintuitive error message.
#
# Logs a FATAL message and exits the R process with status 1 when every value
# in 'v' is NA (this includes a zero-length or NULL 'v').
CheckHistogramInput <- function(v) {
  if (all(is.na(v))) {
    arg_name <- deparse(substitute(v))  # R idiom to get name
    Log('FATAL: All values in %s are NA (no successful runs?)', arg_name)
    quit(status = 1)
  }
}

# Build a histogram of 'values' with the given title and x-axis label.  The
# y axis is always labelled "number of tasks".
PlotHistogram <- function(values, title, x_label) {
  ggplot(data.frame(value = values), aes(x = value)) +
    geom_histogram() +
    ggtitle(title) +
    xlab(x_label) +
    ylab("number of tasks")
}

# Write one histogram PNG per task-level quantity into output_dir:
# allocated_mass.png, num_rappor.png, num_reports.png, seconds.png and, when
# any vm5_peak_kib value is present, memory.png.
#
# Each input is validated right before its plot is written, so a fatal check
# on a later column still leaves the earlier PNGs on disk.
#
# Args:
#   s: data frame with the columns allocated_mass, num_rappor, num_reports,
#     seconds and (optionally) vm5_peak_kib.
#   output_dir: directory the PNG files are written into.
WriteDistHistograms <- function(s, output_dir) {
  CheckHistogramInput(s$allocated_mass)
  p <- PlotHistogram(s$allocated_mass, "Allocated Mass by Task",
                     "allocated mass")
  WritePlot(p, output_dir, 'allocated_mass.png')

  CheckHistogramInput(s$num_rappor)
  p <- PlotHistogram(s$num_rappor, "Detected Strings by Task",
                     "detected strings")
  WritePlot(p, output_dir, 'num_rappor.png')

  CheckHistogramInput(s$num_reports)
  p <- PlotHistogram(s$num_reports / 1e6, "Raw Reports by Task",
                     "millions of reports")
  WritePlot(p, output_dir, 'num_reports.png')

  CheckHistogramInput(s$seconds)
  p <- PlotHistogram(s$seconds, "Analysis Duration by Task", "seconds")
  WritePlot(p, output_dir, 'seconds.png')

  # NOTE: Skipping this for 'series' jobs.
  if (sum(!is.na(s$vm5_peak_kib)) > 0) {
    # KiB -> MB (1e6 bytes)
    p <- PlotHistogram(s$vm5_peak_kib * 1024 / 1e6,
                       "Peak Memory Usage by Task",
                       "Peak megabytes (1e6 bytes) of memory")
    WritePlot(p, output_dir, 'memory.png')
  }
}

# Write every 'dist' output: per-metric CSVs, histograms, then overview.csv.
# Returns the aggregated overview table from WriteDistOverview().
ProcessAllDist <- function(s, output_dir) {
  Log('dist: Writing per-metric status.csv')
  WriteDistMetricStatus(s, output_dir)

  Log('dist: Writing histograms')
  WriteDistHistograms(s, output_dir)

  Log('dist: Writing aggregated overview.csv')
  WriteDistOverview(s, output_dir)
}

# Write the single CSV file loaded by assoc-overview.html.
#
# Args:
#   summary: data frame with (at least) the columns metric, date, num_reports,
#     status, total_elapsed_seconds and em_elapsed_seconds.
#   output_dir: directory that assoc-overview.csv is written into.
#
# Returns:
#   The aggregated data.table, sorted case-insensitively by metric name.
WriteAssocOverview <- function(summary, output_dir) {
  s <- data.table(summary)  # data.table syntax is easier here

  by_metric <- s[ , list(
      #params_file = unique(params_file),
      #map_file = unique(map_file),

      days = length(date),
      max_num_reports = MaybeMax(num_reports),

      # summarize status
      ok = sum(status == 'OK'),
      fail = sum(status == 'FAIL'),
      timeout = sum(status == 'TIMEOUT'),
      skipped = sum(status == 'SKIPPED'),

      mean_total_secs = MaybeMean(total_elapsed_seconds),
      mean_em_secs = MaybeMean(em_elapsed_seconds)

      ), by=list(metric)]

  # Case insensitive sort by metric name
  by_metric <- by_metric[order(tolower(by_metric$metric)), ]

  overview_path <- file.path(output_dir, 'assoc-overview.csv')
  write.csv(by_metric, file = overview_path, row.names = FALSE)
  Log("Wrote %s", overview_path)

  by_metric
}

# Write the CSV files loaded by assoc-metric.html -- that is, one
# metric-status.csv for each metric name, aggregated by (var1, var2).
#
# Args:
#   summary: data frame with (at least) the columns metric, var1, var2, date,
#     num_reports, status, total_elapsed_seconds and em_elapsed_seconds.
#   output_dir: root directory holding one subdirectory per metric.
WriteAssocMetricStatus <- function(summary, output_dir) {
  s <- data.table(summary)
  csv_list <- unique(s[, list(metric)])

  # seq_len() rather than 1:n, so an empty summary writes nothing instead of
  # iterating over c(1, 0).
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]
    # Select cols, and convert units.  Don't need params / map / metric.
    by_pair <- s[s$metric == u$metric,
                 list(days = length(date),
                      max_num_reports = MaybeMax(num_reports),

                      # summarize status
                      ok = sum(status == 'OK'),
                      fail = sum(status == 'FAIL'),
                      timeout = sum(status == 'TIMEOUT'),
                      skipped = sum(status == 'SKIPPED'),

                      mean_total_secs = MaybeMean(total_elapsed_seconds),
                      mean_em_secs = MaybeMean(em_elapsed_seconds)
                      ),
                 by=list(var1, var2)]

    # Case insensitive sort by var1 name
    by_pair <- by_pair[order(tolower(by_pair$var1)), ]

    csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
    write.csv(by_pair, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)
  }
}

# Relative path "<metric>/<var1>_X_<var2>", with every literal '..' in var2
# replaced by '_'.
#
# This naming convention is in task_spec.py AssocTaskSpec.
FormatAssocRelPath <- function(metric, var1, var2) {
  v2 <- gsub('..', '_', var2, fixed = TRUE)
  var_dir <- sprintf('%s_X_%s', var1, v2)
  file.path(metric, var_dir)
}

# Write the CSV files loaded by assoc-pair.html -- that is, one pair-status.csv
# for each (metric, var1, var2) pair, plus a pair-metadata.txt next to it that
# holds the raw metric, var1 and var2 names, one per line.
#
# Args:
#   summary: data frame with (at least) the columns metric, var1, var2,
#     job_id, date, status, num_reports, d1, d2, total_elapsed_seconds and
#     em_elapsed_seconds.
#   output_dir: root directory; files go under FormatAssocRelPath() of each
#     pair.
WriteAssocPairStatus <- function(summary, output_dir) {

  s <- data.table(summary)

  csv_list <- unique(s[, list(metric, var1, var2)])
  Log('CSV list:')
  print(csv_list)

  # loop over unique (metric, var1, var2) pairs, and write a CSV for each one.
  # seq_len() keeps the loop empty when there are no pairs.
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]

    # Select cols, and convert units.  Don't need params / map / metric.
    subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
                  list(job_id, date, status,
                       num_reports, d1, d2,
                       total_elapsed_seconds,
                       em_elapsed_seconds)]

    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]

    pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)

    csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
    write.csv(subframe, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)

    # Write a file with the raw variable names.  Parsed by ui.sh, to pass to
    # csv_to_html.py.
    meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')

    # Convert each column explicitly: as.character() on the whole one-row
    # table would emit integer codes if a column were a factor.
    lines <- c(as.character(u$metric),
               as.character(u$var1),
               as.character(u$var2))
    writeLines(lines, con = meta_path)
    Log("Wrote %s", meta_path)
  }
}

# Write every 'assoc' output: pair-status, metric-status, then the overview.
# Returns the aggregated overview table from WriteAssocOverview().
ProcessAllAssoc <- function(s, output_dir) {
  Log('assoc: Writing pair-status.csv for each variable pair in each metric')
  WriteAssocPairStatus(s, output_dir)

  Log('assoc: Writing metric-status.csv for each metric')
  WriteAssocMetricStatus(s, output_dir)

  Log('assoc: Writing aggregated overview.csv')
  WriteAssocOverview(s, output_dir)
}

# Script entry point.
#
# Args:
#   argv: character vector of (action, input CSV path, output directory).
#     action must be 'dist' or 'assoc'; anything else is an error.
main <- function(argv) {
  if (length(argv) < 3) {
    stop("Expected 3 arguments: ACTION ('dist' or 'assoc') INPUT_CSV OUTPUT_DIR",
         call. = FALSE)
  }

  # increase ggplot font size globally
  theme_set(theme_grey(base_size = 16))

  action <- argv[[1]]
  input <- argv[[2]]
  output_dir <- argv[[3]]

  if (action == 'dist') {
    summary <- read.csv(input)
    ProcessAllDist(summary, output_dir)
  } else if (action == 'assoc') {
    summary <- read.csv(input)
    ProcessAllAssoc(summary, output_dir)
  } else {
    stop(sprintf('Invalid action %s', action))
  }

  Log('Done')
}

# Only run main() when executed as a script (no calling frames), not when
# this file is source()d.
if (length(sys.frames()) == 0) {
  main(commandArgs(trailingOnly = TRUE))
}