xref: /aosp_15_r20/external/rappor/pipeline/assoc.sh (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1#!/bin/bash
2#
3# Usage:
4#   ./assoc.sh <function name>
5
6set -o nounset
7set -o pipefail
8set -o errexit
9
10readonly THIS_DIR=$(dirname $0)
11readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
12
13source $RAPPOR_SRC/util.sh  # log, banner
14source $RAPPOR_SRC/pipeline/tools-lib.sh
15source $RAPPOR_SRC/pipeline/alarm-lib.sh
16
17# Change the default location of these tools by setting DEP_*
18readonly DECODE_ASSOC=${DEP_DECODE_ASSOC:-$RAPPOR_SRC/bin/decode-assoc}
19readonly FAST_EM=${DEP_FAST_EM:-$RAPPOR_SRC/analysis/cpp/_tmp/fast_em}
20
21# Run a single decode-assoc process, to analyze one variable pair for one
22# metric.  The arguments to this function are one row of the task spec.
23decode-one() {
24  # Job constants, from decode-many
25  local rappor_src=$1
26  local timeout_secs=$2
27  local min_reports=$3
28  local job_dir=$4
29  local sample_size=$5
30
31  # Task spec variables, from task_spec.py
32  local num_reports=$6
33  local metric_name=$7
34  local date=$8  # for output naming only
35  local reports=$9  # file with reports
36  local var1=${10}
37  local var2=${11}
38  local map1=${12}
39  local output_dir=${13}
40
41  local log_file=$output_dir/assoc-log.txt
42  local status_file=$output_dir/assoc-status.txt
43  mkdir --verbose -p $output_dir
44
45  # Flags drived from job constants
46  local schema=$job_dir/config/rappor-vars.csv
47  local params_dir=$job_dir/config
48  local em_executable=$FAST_EM
49
50  # TODO:
51  # - Skip jobs with few reports, like ./backfill.sh analyze-one.
52
53  # Output the spec for combine_status.py.
54  echo "$@" > $output_dir/assoc-spec.txt
55
56  # NOTE: Not passing --num-cores since we're parallelizing already.
57
58  # NOTE: --tmp-dir is the output dir.  Then we just delete all the .bin files
59  # afterward so we don't copy them to x20 (they are big).
60
61  { time \
62      alarm-status $status_file $timeout_secs \
63        $DECODE_ASSOC \
64          --create-bool-map \
65          --remove-bad-rows \
66          --em-executable $em_executable \
67          --schema $schema \
68          --params-dir $params_dir \
69          --metric-name $metric_name \
70          --reports $reports \
71          --var1 $var1 \
72          --var2 $var2 \
73          --map1 $map1 \
74          --reports-sample-size $sample_size \
75          --tmp-dir $output_dir \
76          --output-dir $output_dir
77  } >$log_file 2>&1
78}
79
80test-decode-one() {
81  decode-one $RAPPOR_SRC
82}
83
84readonly DEFAULT_MIN_REPORTS=5000
85
86#readonly DEFAULT_TIMEOUT_SECONDS=300  # 5 minutes as a quick test.
87readonly DEFAULT_TIMEOUT_SECONDS=3600  # 1 hour
88
89readonly DEFAULT_MAX_PROCS=6  # TODO: Share with backfill.sh
90
91# Limit to 1M for now.  Raise it when we have a full run.
92readonly DEFAULT_SAMPLE_SIZE=1000000
93
94readonly NUM_ARGS=8  # number of tokens in the task spec, used for xargs
95
96# Run many decode-assoc processes in parallel.
97decode-many() {
98  local job_dir=$1
99  local spec_list=$2
100
101  # These 3 params affect speed
102  local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS}
103  local sample_size=${4:-$DEFAULT_SAMPLE_SIZE}
104  local max_procs=${5:-$DEFAULT_MAX_PROCS}
105
106  local rappor_src=${6:-$RAPPOR_SRC}
107  local min_reports=${7:-$DEFAULT_MIN_REPORTS}
108
109  time cat $spec_list \
110    | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \
111      $0 decode-one $rappor_src $timeout_secs $min_reports $job_dir $sample_size
112}
113
114# Combine assoc results and render HTML.
115
116combine-and-render-html() {
117  local jobs_base_dir=$1
118  local job_dir=$2
119
120  banner "Combining assoc task status"
121  TOOLS-cook combine-assoc-task-status $jobs_base_dir $job_dir
122
123  banner "Combining assoc results"
124  TOOLS-cook combine-assoc-results $jobs_base_dir $job_dir
125
126  banner "Splitting out status per metric, and writing overview"
127  TOOLS-cook assoc-metric-status $job_dir
128
129  TOOLS-gen-ui symlink-static assoc $job_dir
130
131  banner "Building overview .part.html from CSV"
132  TOOLS-gen-ui assoc-overview-part-html $job_dir
133
134  banner "Building metric .part.html from CSV"
135  TOOLS-gen-ui assoc-metric-part-html $job_dir
136
137  banner "Building pair .part.html from CSV"
138  TOOLS-gen-ui assoc-pair-part-html $job_dir
139
140  banner "Building day .part.html from CSV"
141  TOOLS-gen-ui assoc-day-part-html $job_dir
142}
143
144# Temp files left over by the fast_em R <-> C++.
145list-and-remove-bin() {
146  local job_dir=$1
147  # If everything failed, we might not have anything to list/delete.
148  find $job_dir -name \*.bin | xargs --no-run-if-empty -- ls -l --si
149  find $job_dir -name \*.bin | xargs --no-run-if-empty -- rm -f --verbose
150}
151
152"$@"
153