xref: /aosp_15_r20/external/rappor/bin/decode_dist.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1#!/usr/bin/env Rscript
2#
3# Command line tool to decode a RAPPOR data set.  It is a simple wrapper for
4# Decode() in decode.R.
5
6library(optparse)
7
8#
9# Command line parsing.  Do this first before loading libraries to catch errors
10# quickly.  Loading libraries in R is slow.
11#
12
# Reports a command line usage error and terminates the script.
#
# Args:
#   ...: A format string followed by its values, forwarded to sprintf().
#
# Prints the formatted message and a trailing newline on standard output, then
# exits the R process with status 1.  Never returns.
UsageError <- function(...) {
  formatted <- sprintf(...)
  # cat() separates the elements of a vector with single spaces; collapse the
  # same way so that a vectorized format still prints on a single line.
  writeLines(paste(formatted, collapse = " "))
  quit(status = 1)
}
19
# Flags accepted by this tool, in the form expected by optparse's
# OptionParser().  The three input files default to "" so that ParseOptions()
# can detect when one of them was not supplied.
option_list <- list(
  # Input files.
  make_option("--map", help = "Map file (required)", default = ""),
  make_option("--counts", help = "Counts file (required)", default = ""),
  make_option("--params", help = "Params file (required)", default = ""),

  # Where results.csv, residual.png and metrics.json are written.
  make_option("--output-dir", default = ".", dest = "output_dir",
              help = "Output directory (default .)"),

  # Settings handed to Decode().
  make_option("--correction", help = "Correction method", default = "FDR"),
  make_option("--alpha", help = "Alpha level", default = 0.05),

  # Boolean switch; the multi-line help text is kept exactly as written.
  make_option("--adjust-counts-hack", action = "store_true", default = FALSE,
              dest = "adjust_counts_hack",
              help="Allow the counts file to have more rows than cohorts.
                    Most users should not use this.")
)
36
# Parses the command line against option_list and validates required flags.
#
# Returns:
#   The list produced by parse_args(), with one element per flag.
#
# Calls UsageError() (which exits with status 1) for the first of --map,
# --counts and --params that is still at its empty-string default.
ParseOptions <- function() {
  # NOTE: This API is bad; if you add positional_arguments, the return value
  # changes!
  opts <- parse_args(OptionParser(option_list = option_list))

  # Checked in this order so the first missing flag is the one reported.
  for (required_flag in c("map", "counts", "params")) {
    if (opts[[required_flag]] == "") {
      UsageError("--%s is required.", required_flag)
    }
  }
  return(opts)
}
54
# Parse flags right away when run as a script, so that usage errors are
# reported before the slower library loading below.  Skipped in an interactive
# session, where no `opts` is created here.
if (!interactive()) opts <- ParseOptions()
58
59#
60# Load libraries and source our own code.
61#
62
63library(RJSONIO)
64
# Sources a file of the RAPPOR repository without having to change pwd.
#
# Args:
#   rel_path: Path of the R file relative to the repository root, e.g.
#     "analysis/R/util.R".
#
# The repository root is read from the RAPPOR_REPO environment variable.  When
# it is unset or empty, rel_path is resolved against the current working
# directory.  A root given without a trailing "/" gets one inserted; plain
# concatenation would otherwise build a wrong path such as
# "/path/to/repoanalysis/R/util.R".
#
# Returns:
#   The (invisible) result of source().
source.rappor <- function(rel_path)  {
  repo_root <- Sys.getenv("RAPPOR_REPO", "")
  if (nzchar(repo_root) && !endsWith(repo_root, "/")) {
    repo_root <- paste0(repo_root, "/")
  }
  source(paste0(repo_root, rel_path))
}
70
# Pull in the analysis code, in the same order as before: input readers,
# decoder, utilities, then the alternative implementation.
invisible(lapply(
  c("analysis/R/read_input.R",
    "analysis/R/decode.R",
    "analysis/R/util.R",
    "analysis/R/alternative.R"),
  source.rappor
))

# Keep strings as character vectors when data frames are built.
options(stringsAsFactors = FALSE)
78
79
# Decodes one RAPPOR data set and writes the results to opts$output_dir.
#
# Args:
#   opts: Parsed command line options as returned by ParseOptions().  The
#     fields read here are params, counts, map, output_dir, correction, alpha
#     and adjust_counts_hack.
#
# Files written under opts$output_dir:
#   results.csv:  res$fit as returned by Decode().
#   residual.png: histogram of res$residual, scaled to relative frequencies.
#   metrics.json: res$metrics plus total_elapsed_time.
#
# Exits the R process with status 1 when the fit returned by Decode() has no
# rows.
main <- function(opts) {
  # Draws the residual histogram into png_path.  The graphics device is closed
  # through on.exit() so that it is released even when hist() or plot() fails;
  # otherwise a failed run inside an interactive session would leave the png
  # device open as the current device.
  WriteResidualHistogram <- function(residual, png_path, params) {
    png(png_path)
    on.exit(dev.off(), add = TRUE)

    breaks <- pretty(residual, n = 200)
    histogram <- hist(residual, breaks, plot = FALSE)
    # Convert the histogram counts to frequencies.
    histogram$counts <- histogram$counts / sum(histogram$counts)
    plot(histogram, main = "Histogram of the residual",
         xlab = sprintf("Residual (observed - explained, %d x %d values)",
                        params$m, params$k))
  }

  Log("decode-dist")
  Log("argv:")
  print(commandArgs(TRUE))

  Log("Loading inputs")

  # Run a single model if all inputs are specified.
  params <- ReadParameterFile(opts$params)
  counts <- ReadCountsFile(opts$counts, params,
                           adjust_counts = opts$adjust_counts_hack)
  counts <- AdjustCounts(counts, params)

  # The left-most column has totals.
  num_reports <- sum(counts[, 1])

  map <- LoadMapFile(opts$map, params)

  Log("Decoding %d reports", num_reports)
  res <- Decode(counts, map$map, params, correction = opts$correction,
                alpha = opts$alpha)
  Log("Done decoding")

  if (nrow(res$fit) == 0) {
    Log("FATAL: Analysis returned no strings.")
    quit(status = 1)
  }

  # Write analysis results as CSV.
  results_csv_path <- file.path(opts$output_dir, 'results.csv')
  write.csv(res$fit, file = results_csv_path, row.names = FALSE)

  # Write residual histogram as a png.
  results_png_path <- file.path(opts$output_dir, 'residual.png')
  WriteResidualHistogram(res$residual, results_png_path, params)

  # Elapsed time of the R process so far, recorded just before the metrics are
  # serialized.
  res$metrics$total_elapsed_time <- proc.time()[['elapsed']]

  # Write summary as JSON (scalar values).
  metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
  m <- toJSON(res$metrics)
  writeLines(m, con = metrics_json_path)
  Log("Wrote %s, %s, and %s", results_csv_path, results_png_path,
      metrics_json_path)

  # TODO:
  # - These are in a 2 column 'parameters' and 'values' format.  Should these
  # just be a plain list?
  # - Should any of these privacy params be in metrics.json?

  Log("Privacy summary:")
  print(res$privacy)
  cat("\n")

  Log('DONE')
}
141
# Script entry point: run the decoder with the flags parsed earlier.  Nothing
# runs here in an interactive session, where main() can be called by hand.
if (!interactive()) main(opts)
145