xref: /aosp_15_r20/external/rappor/pipeline/combine_results.py (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1#!/usr/bin/python
2"""Combines results from multiple days of a single metric.
3
4Feed it the STATUS.txt files on stdin.  It then finds the corresponding
5results.csv, and takes the top N items.
6
7Example:
8
9Date,      "google.com,", yahoo.com
102015-03-01,          0.0,       0.9
112015-03-02,          0.1,       0.8
12
13Dygraphs can load this CSV file directly.
14
15TODO: Use different dygraph API?
16
17Also we need error bars.
18
19  new Dygraph(document.getElementById("graphdiv2"),
20              [
21                [1,10,100],
22                [2,20,80],
23                [3,50,60],
24                [4,70,80]
25              ],
26              {
27                labels: [ "Date", "failure", "timeout", "google.com" ]
28              });
29"""
30
31import collections
32import csv
33import json
34import os
35import sys
36
37import util
38
39
def CombineDistResults(stdin, c_out, num_top):
  """Combine the top results of each day's task into one table for dygraphs.

  Args:
    stdin: Iterable of lines, each naming a STATUS.txt file.  The path is
      assumed to look like .../2015-03-01/STATUS.txt, i.e. the parent
      directory name is the date.  Blank lines are skipped.
    c_out: csv.writer-like object; only writerow() is called on it.
    num_top: Maximum number of rows to take from each day's results.csv.

  The first output row is 'date' followed by every distinct string seen,
  sorted alphabetically.  Then comes one row per date, in sorted order.  Each
  cell is a 'low;value;high' triple, or None when that string has no row for
  that date.  Dates whose status is not OK still get a row of None cells.
  """
  dates = []
  var_cols = collections.defaultdict(dict)  # {name: {date: triple}}

  seen_dates = set()

  for line in stdin:
    status_path = line.strip()
    if not status_path:
      continue  # tolerate blank lines, e.g. a trailing newline

    # Assume it looks like .../2015-03-01/STATUS.txt
    task_dir = os.path.dirname(status_path)
    date = os.path.basename(task_dir)

    # Get rid of duplicate dates.  These could be caused by retries.
    if date in seen_dates:
      continue

    seen_dates.add(date)

    with open(status_path) as f:
      status = f.readline().split()[0]  # OK, FAIL, TIMEOUT, SKIPPED

    # Recorded before the status check so that failed days still get a row.
    dates.append(date)

    if status != 'OK':
      continue  # won't have results.csv

    results_path = os.path.join(task_dir, 'results.csv')
    with open(results_path) as f:
      c = csv.reader(f)
      next(c)  # skip the header row

      # Rows are sorted by decreasing "estimate", which is what we want, so
      # the first num_top rows are the top ones.
      for unused_i in range(num_top):
        try:
          row = next(c)
        except StopIteration:
          # It's OK if it doesn't have enough
          util.log('Stopping early. Fewer than %d results to render.', num_top)
          break

        string, _, _, proportion, _, prop_low, prop_high = row

        # dygraphs takes error bars as semicolon-separated triples in each
        # cell.  The order written here is low;value;high.
        # http://dygraphs.com/data.html#csv

        # Arbitrarily use 4 digits after decimal point (for dygraphs, not
        # directly displayed)
        dygraph_triple = '%.4f;%.4f;%.4f' % (
            float(prop_low), float(proportion), float(prop_high))

        var_cols[string][date] = dygraph_triple

  # Now write the combined CSV.
  cols = sorted(var_cols)  # sort columns alphabetically
  c_out.writerow(['date'] + cols)

  dates.sort()

  for date in dates:
    # None means there is no row for this column on this date.
    c_out.writerow([date] + [var_cols[col].get(date) for col in cols])
107
108  #util.log("Number of dynamic cols: %d", len(var_cols))
109
110
def CombineAssocResults(stdin, c_out, num_top):
  """Placeholder for combining association results.

  Only writes a single-column 'dummy' header row to c_out.  The stdin and
  num_top arguments are accepted so the signature parallels
  CombineDistResults, but they are not used.
  """
  c_out.writerow(('dummy',))
114
115
def main(argv):
  """Dispatch to the combiner named on the command line.

  Args:
    argv: Full argument vector: [program, action, num_top].  action is
      'dist' or 'assoc'; num_top is the number of values to keep.

  Raises:
    RuntimeError: If the action is missing or unknown, or num_top is missing
      or not an integer.  The script entry point reports RuntimeError as a
      FATAL message, so bad arguments are raised as that type.
  """
  if len(argv) < 2:
    raise RuntimeError('Expected an action: dist or assoc')
  action = argv[1]

  if action not in ('dist', 'assoc'):
    raise RuntimeError('Invalid action %r' % action)

  try:
    num_top = int(argv[2])  # number of values to keep
  except (IndexError, ValueError):
    raise RuntimeError('Expected an integer num_top after %r' % action)

  c_out = csv.writer(sys.stdout)
  if action == 'dist':
    CombineDistResults(sys.stdin, c_out, num_top)
  else:
    CombineAssocResults(sys.stdin, c_out, num_top)
131
132
if __name__ == '__main__':
  # RuntimeError is reported as a one-line FATAL message on stderr with exit
  # status 1.  Any other exception propagates with a traceback.
  try:
    main(sys.argv)
  except RuntimeError as e:
    sys.stderr.write('FATAL: %s\n' % e)
    sys.exit(1)
138    sys.exit(1)
139