xref: /aosp_15_r20/external/rappor/pipeline/metric_status.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1#!/usr/bin/Rscript
2#
3# Write an overview of task status, per-metric task status, task histograms.
4
5library(data.table)
6library(ggplot2)
7
# Keep strings as character vectors rather than converting them to factors.
# WriteAssocPairStatus notes that its as.character() conversion depends on
# this setting.
options(stringsAsFactors = FALSE)
9
# Print a sprintf()-formatted message to stdout, followed by a newline.
#
# Args:
#   fmt: sprintf() format string.
#   ...: values substituted into fmt.
Log <- function(fmt, ...) {
  formatted <- sprintf(fmt, ...)
  # cat() separates multiple elements with one space; paste() reproduces that
  # so the whole line can be emitted with a single call.
  cat(paste(formatted, collapse = ' '), '\n', sep = '')
}
14
# Largest non-NA element of `values`, returned as a double.
# Returns NA when `values` is empty or holds nothing but NAs.
MaybeMax <- function(values) {
  present <- values[!is.na(values)]
  # Both branches go through as.numeric(): data.table requires this, otherwise
  # we get type errors.
  if (length(present) == 0) {
    return(as.numeric(NA))
  }
  as.numeric(max(present))
}
25
# Arithmetic mean of the non-NA elements of `values`, returned as a double.
# Returns NA when `values` is empty or holds nothing but NAs.
MaybeMean <- function(values) {
  present <- values[!is.na(values)]
  # Both branches go through as.numeric(): data.table requires this, otherwise
  # we get type errors.
  if (length(present) == 0) {
    return(as.numeric(NA))
  }
  as.numeric(mean(present))
}
36
# Collapse the per-task summary into one row per metric and write the result
# to <output_dir>/overview.csv.
#
# Args:
#   summary: data frame of task results.  Columns read here: metric,
#     params_file, map_file, date, num_reports, status, seconds,
#     allocated_mass.
#   output_dir: directory that receives overview.csv.
#
# Returns:
#   The aggregated data.table, ordered case-insensitively by metric name.
WriteDistOverview <- function(summary, output_dir) {
  tasks <- data.table(summary)  # grouping is easier in data.table syntax

  overview <- tasks[ , list(
      params_file = unique(params_file),
      map_file = unique(map_file),
      days = length(date),
      max_num_reports = MaybeMax(num_reports),

      # number of tasks in each status
      ok = sum(status == 'OK'),
      fail = sum(status == 'FAIL'),
      timeout = sum(status == 'TIMEOUT'),
      skipped = sum(status == 'SKIPPED'),

      # TODO: Need to document the meaning of these metrics.
      # All could be NA
      # KiB -> MB
      #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
      #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),

      mean_secs = MaybeMean(seconds),
      mean_allocated_mass = MaybeMean(allocated_mass)

      # unique failure reasons
      # This can be used when there are different call stacks.
      #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
      ), by = metric]

  # Sort rows by metric name, ignoring case.
  overview <- overview[order(tolower(overview$metric)), ]

  csv_path <- file.path(output_dir, 'overview.csv')
  write.csv(overview, file = csv_path, row.names = FALSE)
  Log("Wrote %s", csv_path)

  overview
}
75
# For every metric in `summary`, write status.csv, num_reports.csv and mass.csv
# under <output_dir>/<metric>/.
#
# Args:
#   summary: data frame of task results.  Columns read here: metric, job_id,
#     date, status, num_reports, seconds, allocated_mass, num_rappor.
#   output_dir: root directory; files go into one subdirectory per metric.
WriteDistMetricStatus <- function(summary, output_dir) {
  tasks <- data.table(summary)
  metric_names <- unique(tasks$metric)

  # First pass: one status.csv per metric.
  for (cur_metric in metric_names) {
    # Keep only the columns the status table needs; params / map / metric are
    # not needed.
    status_rows <- tasks[tasks$metric == cur_metric,
                         list(job_id, date, status,
                              #vm5_peak_mb = vm5_peak_kib * 1024 / 1e6,
                              #vm5_mean_mb = vm5_mean_kib * 1024 / 1e6,
                              num_reports,
                              seconds,
                              allocated_mass, num_rappor)]

    # Newest first.  Alphabetical sort works fine for YYYY-MM-DD dates.
    status_rows <- status_rows[order(status_rows$date, decreasing = TRUE), ]

    status_path <- file.path(output_dir, cur_metric, 'status.csv')
    write.csv(status_rows, file = status_path, row.names = FALSE)
    Log("Wrote %s", status_path)
  }

  # Second pass: files that exist just for plotting with dygraphs.  TODO: can
  # dygraphs do something smarter?  Maybe you need to select the column in
  # JavaScript, and pass it an array, rather than CSV text.
  for (cur_metric in metric_names) {
    reports_by_date <- tasks[tasks$metric == cur_metric,
                             list(date, num_reports)]
    reports_path <- file.path(output_dir, cur_metric, 'num_reports.csv')
    # NOTE: dygraphs (only in Firefox?) doesn't like the quotes around
    # "2015-04-03".  In general, we can't turn off quotes, because strings with
    # double quotes will be invalid CSV files.  But in this case, we only have
    # date and number columns, so we can.  dygraphs is mistaken here.
    write.csv(reports_by_date, file = reports_path, row.names = FALSE,
              quote = FALSE)
    Log("Wrote %s", reports_path)

    # Unallocated mass per date.  TODO: Write the other 2 vars too?
    mass_by_date <- tasks[tasks$metric == cur_metric,
                          list(date,
                               unallocated_mass = 1.0 - allocated_mass)]

    mass_path <- file.path(output_dir, cur_metric, 'mass.csv')
    write.csv(mass_by_date, file = mass_path, row.names = FALSE, quote = FALSE)
    Log("Wrote %s", mass_path)
  }
}
123
# Render plot `p` to a PNG file and log the path that was written.
#
# Args:
#   p: plot object; it is drawn with plot(p).
#   outdir: directory that receives the image.
#   filename: file name of the image inside outdir.
#   width, height: image size passed to png().
WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
  out_path <- file.path(outdir, filename)
  png(out_path, width = width, height = height)
  # Close the device even when rendering fails; otherwise an error in plot()
  # would leave the PNG device open.  The error itself still propagates.
  tryCatch(plot(p), finally = dev.off())
  Log('Wrote %s', out_path)
}
131
# Guard for histogram inputs: if every value in `v` is NA, log a FATAL message
# naming the argument expression and exit the R process with status 1.
# Without this check ggplot blows up with an unintuitive error message.
CheckHistogramInput <- function(v) {
  has_values <- !all(is.na(v))
  if (has_values) {
    return(invisible(NULL))
  }

  # R idiom: recover the expression the caller passed, for the log message.
  arg_name <- deparse(substitute(v))
  Log('FATAL: All values in %s are NA (no successful runs?)', arg_name)
  quit(status = 1)
}
141
# Write one histogram PNG per task-level quantity into output_dir:
# allocated_mass.png, num_rappor.png, num_reports.png, seconds.png and, when
# any vm5_peak_kib value is present, memory.png.
#
# Args:
#   s: data frame of task results.  Columns read here: allocated_mass,
#     num_rappor, num_reports, seconds, vm5_peak_kib.
#   output_dir: directory that receives the PNG files.
#
# Each required column is validated with CheckHistogramInput first, which
# quits the process when the column is entirely NA.
WriteDistHistograms <- function(s, output_dir) {
  # Every histogram shares the same y axis label.
  task_count_label <- ylab("number of tasks")

  CheckHistogramInput(s$allocated_mass)
  mass_plot <- qplot(s$allocated_mass, geom = "histogram") +
    ggtitle("Allocated Mass by Task") +
    xlab("allocated mass") +
    task_count_label
  WritePlot(mass_plot, output_dir, 'allocated_mass.png')

  CheckHistogramInput(s$num_rappor)
  strings_plot <- qplot(s$num_rappor, geom = "histogram") +
    ggtitle("Detected Strings by Task") +
    xlab("detected strings") +
    task_count_label
  WritePlot(strings_plot, output_dir, 'num_rappor.png')

  CheckHistogramInput(s$num_reports)
  reports_plot <- qplot(s$num_reports / 1e6, geom = "histogram") +
    ggtitle("Raw Reports by Task") +
    xlab("millions of reports") +
    task_count_label
  WritePlot(reports_plot, output_dir, 'num_reports.png')

  CheckHistogramInput(s$seconds)
  duration_plot <- qplot(s$seconds, geom = "histogram") +
    ggtitle("Analysis Duration by Task") +
    xlab("seconds") +
    task_count_label
  WritePlot(duration_plot, output_dir, 'seconds.png')

  # NOTE: Skipping this for 'series' jobs.
  if (any(!is.na(s$vm5_peak_kib))) {
    # KiB -> megabytes (1e6 bytes)
    memory_plot <- qplot(s$vm5_peak_kib * 1024 / 1e6, geom = "histogram") +
      ggtitle("Peak Memory Usage by Task") +
      xlab("Peak megabytes (1e6 bytes) of memory") +
      task_count_label
    WritePlot(memory_plot, output_dir, 'memory.png')
  }
}
184
# Produce every 'dist' output for the task summary `s` under output_dir, in
# this order: per-metric CSVs, histograms, then the aggregated overview.
#
# Returns:
#   The value of WriteDistOverview(), i.e. the per-metric overview table.
ProcessAllDist <- function(s, output_dir) {
  # Step 1: per-metric status files.
  Log("dist: Writing per-metric status.csv")
  WriteDistMetricStatus(s, output_dir)

  # Step 2: histogram images.
  Log("dist: Writing histograms")
  WriteDistHistograms(s, output_dir)

  # Step 3: overview across all metrics.
  Log("dist: Writing aggregated overview.csv")
  WriteDistOverview(s, output_dir)
}
195
# Write assoc-overview.csv, the single CSV file loaded by assoc-overview.html:
# one row per metric with task counts by status and mean timings.
#
# Args:
#   summary: data frame of association task results.  Columns read here:
#     metric, date, num_reports, status, total_elapsed_seconds,
#     em_elapsed_seconds.
#   output_dir: directory that receives assoc-overview.csv.
#
# Returns:
#   The aggregated data.table, ordered case-insensitively by metric name.
WriteAssocOverview <- function(summary, output_dir) {
  tasks <- data.table(summary)  # grouping is easier in data.table syntax

  overview <- tasks[ , list(
      #params_file = unique(params_file),
      #map_file = unique(map_file),

      days = length(date),
      max_num_reports = MaybeMax(num_reports),

      # number of tasks in each status
      ok = sum(status == 'OK'),
      fail = sum(status == 'FAIL'),
      timeout = sum(status == 'TIMEOUT'),
      skipped = sum(status == 'SKIPPED'),

      mean_total_secs = MaybeMean(total_elapsed_seconds),
      mean_em_secs = MaybeMean(em_elapsed_seconds)

      ), by = list(metric)]

  # Sort rows by metric name, ignoring case.
  overview <- overview[order(tolower(overview$metric)), ]

  csv_path <- file.path(output_dir, 'assoc-overview.csv')
  write.csv(overview, file = csv_path, row.names = FALSE)
  Log("Wrote %s", csv_path)

  overview
}
227
# Write the CSV files loaded by assoc-metric.html -- that is, one
# metric-status.csv for each metric name, at
# <output_dir>/<metric>/metric-status.csv.
#
# Args:
#   summary: data frame of association task results.  Columns read here:
#     metric, var1, var2, date, num_reports, status, total_elapsed_seconds,
#     em_elapsed_seconds.
#   output_dir: root directory; files go into one subdirectory per metric.
#
# Each file has one row per (var1, var2) pair of that metric.  Nothing is
# written when `summary` has no rows.
WriteAssocMetricStatus <- function(summary, output_dir) {
  s <- data.table(summary)
  csv_list <- unique(s[, list(metric)])
  # seq_len() gives an empty sequence for an empty table; 1:nrow() would
  # iterate over c(1, 0) and index rows that do not exist.
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]
    # Aggregate this metric's tasks per variable pair.  Don't need params /
    # map / metric.
    by_pair <- s[s$metric == u$metric,
                 list(days = length(date),
                      max_num_reports = MaybeMax(num_reports),

                      # summarize status
                      ok = sum(status == 'OK'),
                      fail = sum(status == 'FAIL'),
                      timeout = sum(status == 'TIMEOUT'),
                      skipped = sum(status == 'SKIPPED'),

                      mean_total_secs = MaybeMean(total_elapsed_seconds),
                      mean_em_secs = MaybeMean(em_elapsed_seconds)
                      ),
                 by=list(var1, var2)]

    # Case insensitive sort by var1 name
    by_pair <- by_pair[order(tolower(by_pair$var1)), ]

    csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
    write.csv(by_pair, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)
  }
}
259
# Relative directory for one association pair: <metric>/<var1>_X_<var2>, with
# every literal '..' in var2 replaced by '_'.
# This naming convention is in task_spec.py AssocTaskSpec.
FormatAssocRelPath <- function(metric, var1, var2) {
  # fixed = TRUE: '..' is matched literally, not as a regular expression.
  file.path(metric,
            sprintf('%s_X_%s', var1, gsub('..', '_', var2, fixed = TRUE)))
}
266
# Write the CSV files loaded by assoc-pair.html -- that is, one pair-status.csv
# for each (metric, var1, var2) pair, plus a pair-metadata.txt next to it.
#
# Args:
#   summary: data frame of association task results.  Columns read here:
#     metric, var1, var2, job_id, date, status, num_reports, d1, d2,
#     total_elapsed_seconds, em_elapsed_seconds.
#   output_dir: root directory; files go under the relative path built by
#     FormatAssocRelPath().
#
# Nothing is written when `summary` has no rows.
WriteAssocPairStatus <- function(summary, output_dir) {

  s <- data.table(summary)

  csv_list <- unique(s[, list(metric, var1, var2)])
  Log('CSV list:')
  print(csv_list)

  # Loop over unique (metric, var1, var2) pairs, and write a CSV for each one.
  # seq_len() gives an empty sequence for an empty table; 1:nrow() would
  # iterate over c(1, 0) and index rows that do not exist.
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]

    # Select cols.  Don't need params / map / metric.
    subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
                  list(job_id, date, status,
                       num_reports, d1, d2,
                       total_elapsed_seconds,
                       em_elapsed_seconds)]

    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]

    pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)

    csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
    write.csv(subframe, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)

    # Write a file with the raw variable names.  Parsed by ui.sh, to pass to
    # csv_to_html.py.
    meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')

    # NOTE: The conversion from data.table to character vector requires
    # stringsAsFactors to work correctly!
    lines <- as.character(u)
    writeLines(lines, con = meta_path)
    Log("Wrote %s", meta_path)
  }
}
308
# Produce every 'assoc' output for the task summary `s` under output_dir, in
# this order: per-pair files, per-metric files, then the aggregated overview.
#
# Returns:
#   The value of WriteAssocOverview(), i.e. the per-metric overview table.
ProcessAllAssoc <- function(s, output_dir) {
  # Step 1: one pair-status.csv per variable pair.
  Log("assoc: Writing pair-status.csv for each variable pair in each metric")
  WriteAssocPairStatus(s, output_dir)

  # Step 2: one metric-status.csv per metric.
  Log("assoc: Writing metric-status.csv for each metric")
  WriteAssocMetricStatus(s, output_dir)

  # Step 3: overview across all metrics.
  Log("assoc: Writing aggregated overview.csv")
  WriteAssocOverview(s, output_dir)
}
319
# Command-line entry point.
#
# Args:
#   argv: character vector of trailing command-line arguments, in order:
#     action ('dist' or 'assoc'), input (path of the summary CSV) and
#     output_dir (directory that receives the generated files).
#
# Stops with a usage error when fewer than three arguments are given, and with
# 'Invalid action ...' when action is neither 'dist' nor 'assoc'.
main <- function(argv) {
  # Validate first so a bad invocation yields a usage message instead of an
  # opaque "subscript out of bounds" error from argv[[n]].
  if (length(argv) < 3) {
    stop('Usage: metric_status.R <dist|assoc> <input.csv> <output_dir>',
         call. = FALSE)
  }

  # increase ggplot font size globally
  theme_set(theme_grey(base_size = 16))

  action <- argv[[1]]
  input <- argv[[2]]
  output_dir <- argv[[3]]

  # The CSV is only read once the action is known to be valid.
  if (action == 'dist') {
    ProcessAllDist(read.csv(input), output_dir)
  } else if (action == 'assoc') {
    ProcessAllAssoc(read.csv(input), output_dir)
  } else {
    stop(sprintf('Invalid action %s', action))
  }

  Log('Done')
}
340
# Run main() only when this file is executed as a script: at top level there
# are no active function frames, whereas source()-ing the file adds some.
if (length(sys.frames()) == 0) {
  main(commandArgs(trailingOnly = TRUE))
}
344