xref: /aosp_15_r20/external/rappor/pipeline/csv_to_html.py (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li#!/usr/bin/python
"""Reads a CSV file on stdin, and prints an HTML table on stdout.

The static HTML can then be made dynamic with JavaScript, e.g. jQuery
DataTable.

Use Cases:

  - overview.csv -- each row is a metric
    - links: to metric page

  - status.csv -- each row is a day
    - links: to log.txt, to results.html
"""
15*2abb3134SXin Li
16*2abb3134SXin Liimport cgi
17*2abb3134SXin Liimport csv
18*2abb3134SXin Liimport optparse
19*2abb3134SXin Liimport sys
20*2abb3134SXin Li
21*2abb3134SXin Liimport util
22*2abb3134SXin Li
23*2abb3134SXin Li
def CreateOptionsParser():
  """Build the command line parser for this script.

  Returns:
    An optparse.OptionParser.  --col-format, --def and --as-percent may each
    be given multiple times; their values accumulate in the lists
    opts.col_formats, opts.defs and opts.percent_cols.  --table is a boolean
    flag stored in opts.table.
  """
  p = optparse.OptionParser()

  # NOTE: There is no option for the input; the CSV is read from stdin.
  p.add_option(
      '--col-format', dest='col_formats', metavar="'COLNAME FMT'", type='str',
      default=[], action='append',
      help='Add HTML links to the named column, using the given Python '
           '.format() string')

  p.add_option(
      '--def', dest='defs', metavar="'NAME VALUE'", type='str',
      default=[], action='append',
      help='Define variables for use in format strings')

  p.add_option(
      '--as-percent', dest='percent_cols', metavar="COLNAME", type='str',
      default=[], action='append',
      help='Format this floating point column as a percentage string')

  # TODO: We could include this by default, and then change all the HTML to
  # have <div> placeholders instead of <table>.
  p.add_option(
      '--table', dest='table', default=False, action='store_true',
      help='Add <table></table> tags (useful for testing)')

  return p
51*2abb3134SXin Li
52*2abb3134SXin Li
def ParseSpec(arg_list):
  """Given a list of 'NAME VALUE' strings, return a string -> string dictionary.

  Each argument is split at its first space: the text before it is the key
  and everything after it (which may itself contain spaces) is the value.  If
  a name appears more than once, the last value wins.

  Args:
    arg_list: List of strings, e.g. the accumulated values of a repeated
      command line flag.

  Returns:
    A dict mapping each name to its value.

  Raises:
    RuntimeError: If an argument contains no space.
  """
  d = {}
  for s in arg_list:
    name, sep, value = s.partition(' ')
    if not sep:
      # This function parses both --col-format and --def values, so the
      # message does not name a particular flag.
      raise RuntimeError('Invalid spec %r (expected "NAME VALUE")' % s)
    d[name] = value
  return d
64*2abb3134SXin Li
65*2abb3134SXin Li
66*2abb3134SXin Lidef PrintRow(row, col_names, col_formats, defs, percent_cols):
67*2abb3134SXin Li  """Print a CSV row as HTML, using the given formatting.
68*2abb3134SXin Li
69*2abb3134SXin Li  Returns:
70*2abb3134SXin Li    An array of booleans indicating whether each cell is a number.
71*2abb3134SXin Li  """
72*2abb3134SXin Li  is_number_flags = [False] * len(col_names)
73*2abb3134SXin Li
74*2abb3134SXin Li  for i, cell in enumerate(row):
75*2abb3134SXin Li    # The cell as a string.  By default we leave it as is; it may be mutated
76*2abb3134SXin Li    # below.
77*2abb3134SXin Li    cell_str = cell
78*2abb3134SXin Li    css_class = ''  # CSS class for the cell.
79*2abb3134SXin Li    col_name = col_names[i]  # column that the cell is under
80*2abb3134SXin Li
81*2abb3134SXin Li    # Does the cell look like a float?
82*2abb3134SXin Li    try:
83*2abb3134SXin Li      cell_float = float(cell)
84*2abb3134SXin Li      if col_name in percent_cols:  # Floats can be formatted as percentages.
85*2abb3134SXin Li        cell_str = '{:.1f}%'.format(cell_float * 100)
86*2abb3134SXin Li      else:
87*2abb3134SXin Li        # Arbitrarily use 3 digits of precision for display
88*2abb3134SXin Li        cell_str = '{:.3f}'.format(cell_float)
89*2abb3134SXin Li      css_class = 'num'
90*2abb3134SXin Li      is_number_flags[i] = True
91*2abb3134SXin Li    except ValueError:
92*2abb3134SXin Li      pass
93*2abb3134SXin Li
94*2abb3134SXin Li    # Does it look lik an int?
95*2abb3134SXin Li    try:
96*2abb3134SXin Li      cell_int = int(cell)
97*2abb3134SXin Li      cell_str = '{:,}'.format(cell_int)
98*2abb3134SXin Li      css_class = 'num'
99*2abb3134SXin Li      is_number_flags[i] = True
100*2abb3134SXin Li    except ValueError:
101*2abb3134SXin Li      pass
102*2abb3134SXin Li
103*2abb3134SXin Li    # Special CSS class for R NA values.
104*2abb3134SXin Li    if cell_str.strip() == 'NA':
105*2abb3134SXin Li      css_class = 'num na'  # num should right justify; na should make it red
106*2abb3134SXin Li      is_number_flags[i] = True
107*2abb3134SXin Li
108*2abb3134SXin Li    if css_class:
109*2abb3134SXin Li      print '    <td class="{}">'.format(css_class),
110*2abb3134SXin Li    else:
111*2abb3134SXin Li      print '    <td>',
112*2abb3134SXin Li
113*2abb3134SXin Li    cell_safe = cgi.escape(cell_str)
114*2abb3134SXin Li
115*2abb3134SXin Li    # If the cell has a format string, print it this way.
116*2abb3134SXin Li
117*2abb3134SXin Li    fmt = col_formats.get(col_name)  # e.g. "../{date}.html"
118*2abb3134SXin Li    if fmt:
119*2abb3134SXin Li      # Copy variable bindings
120*2abb3134SXin Li      bindings = dict(defs)
121*2abb3134SXin Li
122*2abb3134SXin Li      # Also let the format string use other column names.  TODO: Is there a
123*2abb3134SXin Li      # more efficient way?
124*2abb3134SXin Li      bindings.update(zip(col_names, [cgi.escape(c) for c in row]))
125*2abb3134SXin Li
126*2abb3134SXin Li      bindings[col_name] = cell_safe
127*2abb3134SXin Li
128*2abb3134SXin Li      print fmt.format(**bindings),  # no newline
129*2abb3134SXin Li    else:
130*2abb3134SXin Li      print cell_safe,  # no newline
131*2abb3134SXin Li
132*2abb3134SXin Li    print '</td>'
133*2abb3134SXin Li
134*2abb3134SXin Li  return is_number_flags
135*2abb3134SXin Li
136*2abb3134SXin Li
def ReadCsv(f):
  """Parse CSV data from f into a header row and a list of data rows.

  Args:
    f: Any object accepted by csv.reader, e.g. an open file or a list of lines.

  Returns:
    A (col_names, rows) pair.  col_names is the first CSV row, which is
    assumed to be a header; rows is a list of all remaining rows.  Both are
    empty lists if the input has no rows at all.
  """
  reader = csv.reader(f)
  header = next(reader, [])  # default covers completely empty input
  data_rows = list(reader)
  return header, data_rows
150*2abb3134SXin Li
151*2abb3134SXin Li
152*2abb3134SXin Lidef PrintColGroup(col_names, col_is_numeric):
153*2abb3134SXin Li  """Print HTML colgroup element, used for JavaScript sorting."""
154*2abb3134SXin Li  print '<colgroup>'
155*2abb3134SXin Li  for i, col in enumerate(col_names):
156*2abb3134SXin Li    # CSS class is used for sorting
157*2abb3134SXin Li    if col_is_numeric[i]:
158*2abb3134SXin Li      css_class = 'number'
159*2abb3134SXin Li    else:
160*2abb3134SXin Li      css_class = 'case-insensitive'
161*2abb3134SXin Li
162*2abb3134SXin Li    # NOTE: id is a comment only; not used
163*2abb3134SXin Li    print '  <col id="{}" type="{}" />'.format(col, css_class)
164*2abb3134SXin Li  print '</colgroup>'
165*2abb3134SXin Li
166*2abb3134SXin Li
167*2abb3134SXin Lidef main(argv):
168*2abb3134SXin Li  (opts, argv) = CreateOptionsParser().parse_args(argv)
169*2abb3134SXin Li
170*2abb3134SXin Li  col_formats = ParseSpec(opts.col_formats)
171*2abb3134SXin Li  defs = ParseSpec(opts.defs)
172*2abb3134SXin Li
173*2abb3134SXin Li  col_names, rows = ReadCsv(sys.stdin)
174*2abb3134SXin Li
175*2abb3134SXin Li  for col in opts.percent_cols:
176*2abb3134SXin Li    if col not in col_names:
177*2abb3134SXin Li      raise RuntimeError('--percent-col %s is not a valid column' % col)
178*2abb3134SXin Li
179*2abb3134SXin Li  # By default, we don't print the <table> bit -- that's up to the host page
180*2abb3134SXin Li  if opts.table:
181*2abb3134SXin Li    print '<table>'
182*2abb3134SXin Li
183*2abb3134SXin Li  print '<thead>'
184*2abb3134SXin Li  for col in col_names:
185*2abb3134SXin Li    # change _ to space so long column names can wrap
186*2abb3134SXin Li    print '  <td>%s</td>' % cgi.escape(col.replace('_', ' '))
187*2abb3134SXin Li  print '</thead>'
188*2abb3134SXin Li
189*2abb3134SXin Li  # Assume all columns are numeric at first.  Look at each row for non-numeric
190*2abb3134SXin Li  # values.
191*2abb3134SXin Li  col_is_numeric = [True] * len(col_names)
192*2abb3134SXin Li
193*2abb3134SXin Li  print '<tbody>'
194*2abb3134SXin Li  for row in rows:
195*2abb3134SXin Li    print '  <tr>'
196*2abb3134SXin Li    is_number_flags = PrintRow(row, col_names, col_formats, defs,
197*2abb3134SXin Li                               opts.percent_cols)
198*2abb3134SXin Li
199*2abb3134SXin Li    # If one cell in a column is not a number, then the whole cell isn't.
200*2abb3134SXin Li    for (i, is_number) in enumerate(is_number_flags):
201*2abb3134SXin Li      if not is_number:
202*2abb3134SXin Li        col_is_numeric[i] = False
203*2abb3134SXin Li
204*2abb3134SXin Li    print '  </tr>'
205*2abb3134SXin Li  print '</tbody>'
206*2abb3134SXin Li
207*2abb3134SXin Li  PrintColGroup(col_names, col_is_numeric)
208*2abb3134SXin Li
209*2abb3134SXin Li  if opts.table:
210*2abb3134SXin Li    print '</table>'
211*2abb3134SXin Li
212*2abb3134SXin Li
if __name__ == '__main__':
  # RuntimeError is how the functions above report bad flags or bad input;
  # show those as a one line message instead of a traceback.
  try:
    main(sys.argv)
  except RuntimeError as err:
    sys.stderr.write('FATAL: %s\n' % err)
    sys.exit(1)
219