#!/usr/bin/python
"""Combines results from multiple days of a single metric.

Feed it the STATUS.txt files on stdin.  It then finds the corresponding
results.csv for each day and takes the top N items.

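A typical invocation (the search path is illustrative; the 'dist' action and
the top-N count are parsed in main() below):

  find /path/to/task-dirs -name STATUS.txt | ./combine_results.py dist 10 > dist.csv
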
Example output:

Date,      "google.com,", yahoo.com
2015-03-01,          0.0,       0.9
2015-03-02,          0.1,       0.8

Dygraphs can load this CSV file directly.

TODO: Use a different dygraphs API?

We also need error bars:

  new Dygraph(document.getElementById("graphdiv2"),
              [
                [1,10,100],
                [2,20,80],
                [3,50,60],
                [4,70,80]
              ],
              {
                labels: [ "Date", "failure", "timeout", "google.com" ]
              });
"""

import collections
import csv
import json
import os
import sys

import util


def CombineDistResults(stdin, c_out, num_top):
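  """Combines per-day distribution results into one dygraphs-ready CSV.

  Args:
    stdin: file or iterable of lines, each a path to a day's STATUS.txt
    c_out: csv.writer that receives the combined rows
    num_top: number of top rows to keep from each day's results.csv
  """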
  dates = []
  var_cols = collections.defaultdict(dict)  # {name: {date: value}}

  seen_dates = set()

  for line in stdin:
    status_path = line.strip()

    # Assume it looks like .../2015-03-01/STATUS.txt
    task_dir = os.path.dirname(status_path)
    date = os.path.basename(task_dir)

    # Get rid of duplicate dates.  These could be caused by retries.
    if date in seen_dates:
      continue

    seen_dates.add(date)

    with open(status_path) as f:
      status = f.readline().split()[0]  # OK, FAIL, TIMEOUT, SKIPPED

    dates.append(date)

    if status != 'OK':
      continue  # won't have results.csv

    results_path = os.path.join(task_dir, 'results.csv')
    with open(results_path) as f:
      c = csv.reader(f)
      unused_header = c.next()  # skip the header row

      # Rows are sorted by decreasing "estimate", which is what we want.
      for i in xrange(0, num_top):
        try:
          row = c.next()
        except StopIteration:
          # It's OK if a day has fewer than num_top rows.
          util.log('Stopping early. Fewer than %d results to render.', num_top)
          break

        string, _, _, proportion, _, prop_low, prop_high = row

        # dygraphs encodes error bars with semicolons:
        # low;value;high,low;value;high.

        # http://dygraphs.com/data.html#csv

        # Arbitrarily use 4 digits after the decimal point (for dygraphs, not
        # directly displayed).
        dygraph_triple = '%.4f;%.4f;%.4f' % (
            float(prop_low), float(proportion), float(prop_high))

        var_cols[string][date] = dygraph_triple

  # Now print CSV on stdout.
  cols = sorted(var_cols.keys())  # sort columns alphabetically
  c_out.writerow(['date'] + cols)

  dates.sort()

  for date in dates:
    row = [date]
    for col in cols:
      cell = var_cols[col].get(date)  # None means there is no value for that date
      row.append(cell)
    c_out.writerow(row)

  #util.log("Number of dynamic cols: %d", len(var_cols))


def CombineAssocResults(stdin, c_out, num_top):
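  """Combines per-day association results (currently a stub).

  Only a dummy header row is written; stdin and num_top are unused.
  """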
  header = ('dummy',)
  c_out.writerow(header)


def main(argv):
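  """Dispatches to the 'dist' or 'assoc' combiner based on argv[1]."""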
  action = argv[1]

  if action == 'dist':
    num_top = int(argv[2])  # number of values to keep
    c_out = csv.writer(sys.stdout)
    CombineDistResults(sys.stdin, c_out, num_top)

  elif action == 'assoc':
    num_top = int(argv[2])  # number of values to keep
    c_out = csv.writer(sys.stdout)
    CombineAssocResults(sys.stdin, c_out, num_top)

  else:
    raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
  try:
    main(sys.argv)
  except RuntimeError, e:
    print >>sys.stderr, 'FATAL: %s' % e
    sys.exit(1)