#!/bin/bash
#
# Usage:
#   ./assoc.sh <function name>

set -o nounset
set -o pipefail
set -o errexit

# Assign first and mark readonly afterwards, so that a failing command
# substitution is not masked by the exit status of 'readonly'.
THIS_DIR=$(dirname "$0")
readonly THIS_DIR
RAPPOR_SRC=$(cd "$THIS_DIR/.." && pwd)
readonly RAPPOR_SRC

source "$RAPPOR_SRC/util.sh"  # log, banner
source "$RAPPOR_SRC/pipeline/tools-lib.sh"
source "$RAPPOR_SRC/pipeline/alarm-lib.sh"

# Change the default location of these tools by setting DEP_*
readonly DECODE_ASSOC=${DEP_DECODE_ASSOC:-$RAPPOR_SRC/bin/decode-assoc}
readonly FAST_EM=${DEP_FAST_EM:-$RAPPOR_SRC/analysis/cpp/_tmp/fast_em}

# Run a single decode-assoc process, to analyze one variable pair for one
# metric.  The arguments to this function are one row of the task spec,
# preceded by the job constants that decode-many prepends.
#
# Arguments:
#   $1      rappor_src    (job constant; not used in this function)
#   $2      timeout_secs  passed to alarm-status
#   $3      min_reports   (job constant; not used yet, see TODO below)
#   $4      job_dir       schema and params are read from $job_dir/config
#   $5      sample_size   passed as --reports-sample-size
#   $6      num_reports   (task spec; not used in this function)
#   $7      metric_name
#   $8      date          for output naming only
#   $9      reports       file with reports
#   ${10}   var1
#   ${11}   var2
#   ${12}   map1
#   ${13}   output_dir    receives assoc-log.txt, assoc-status.txt and
#                         assoc-spec.txt, and doubles as --tmp-dir
decode-one() {
  # Job constants, from decode-many
  local rappor_src=$1
  local timeout_secs=$2
  local min_reports=$3
  local job_dir=$4
  local sample_size=$5

  # Task spec variables, from task_spec.py
  local num_reports=$6
  local metric_name=$7
  local date=$8  # for output naming only
  local reports=$9  # file with reports
  local var1=${10}
  local var2=${11}
  local map1=${12}
  local output_dir=${13}

  local log_file=$output_dir/assoc-log.txt
  local status_file=$output_dir/assoc-status.txt
  mkdir --verbose -p -- "$output_dir"

  # Flags derived from job constants
  local schema=$job_dir/config/rappor-vars.csv
  local params_dir=$job_dir/config
  local em_executable=$FAST_EM

  # TODO:
  # - Skip jobs with few reports, like ./backfill.sh analyze-one.

  # Output the spec for combine_status.py.
  echo "$@" > "$output_dir/assoc-spec.txt"

  # NOTE: Not passing --num-cores since we're parallelizing already.

  # NOTE: --tmp-dir is the output dir.  Then we just delete all the .bin files
  # afterward so we don't copy them to x20 (they are big).

  # Both stdout and stderr of the timed command, including the 'time' report
  # itself, are captured in the log file.
  { time \
    alarm-status "$status_file" "$timeout_secs" \
      "$DECODE_ASSOC" \
        --create-bool-map \
        --remove-bad-rows \
        --em-executable "$em_executable" \
        --schema "$schema" \
        --params-dir "$params_dir" \
        --metric-name "$metric_name" \
        --reports "$reports" \
        --var1 "$var1" \
        --var2 "$var2" \
        --map1 "$map1" \
        --reports-sample-size "$sample_size" \
        --tmp-dir "$output_dir" \
        --output-dir "$output_dir"
  } >"$log_file" 2>&1
}

# Manual smoke test for decode-one.  RAPPOR_SRC is supplied as the first
# argument; the remaining twelve decode-one arguments must be given by the
# caller.  With fewer arguments decode-one aborts under 'nounset'.
test-decode-one() {
  decode-one "$RAPPOR_SRC" "$@"
}

readonly DEFAULT_MIN_REPORTS=5000

#readonly DEFAULT_TIMEOUT_SECONDS=300  # 5 minutes as a quick test.
readonly DEFAULT_TIMEOUT_SECONDS=3600  # 1 hour

readonly DEFAULT_MAX_PROCS=6  # TODO: Share with backfill.sh

# Limit to 1M for now.  Raise it when we have a full run.
readonly DEFAULT_SAMPLE_SIZE=1000000

readonly NUM_ARGS=8  # number of tokens in the task spec, used for xargs

# Run many decode-assoc processes in parallel.
#
# xargs reads the task spec file, takes NUM_ARGS tokens at a time, and
# re-invokes this script ($0) as 'decode-one <job constants> <tokens>', running
# up to max_procs invocations concurrently.
#
# Arguments:
#   $1  job_dir
#   $2  spec_list     file containing the task spec rows
#   $3  timeout_secs  (default DEFAULT_TIMEOUT_SECONDS)
#   $4  sample_size   (default DEFAULT_SAMPLE_SIZE)
#   $5  max_procs     (default DEFAULT_MAX_PROCS)
#   $6  rappor_src    (default RAPPOR_SRC)
#   $7  min_reports   (default DEFAULT_MIN_REPORTS)
decode-many() {
  local job_dir=$1
  local spec_list=$2

  # These 3 params affect speed
  local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS}
  local sample_size=${4:-$DEFAULT_SAMPLE_SIZE}
  local max_procs=${5:-$DEFAULT_MAX_PROCS}

  local rappor_src=${6:-$RAPPOR_SRC}
  local min_reports=${7:-$DEFAULT_MIN_REPORTS}

  time xargs --verbose -n "$NUM_ARGS" -P "$max_procs" --no-run-if-empty -- \
    "$0" decode-one \
      "$rappor_src" "$timeout_secs" "$min_reports" "$job_dir" "$sample_size" \
    < "$spec_list"
}

# Combine assoc results and render HTML.
# Runs the combine steps (TOOLS-cook) and then the HTML generation steps
# (TOOLS-gen-ui) for one assoc job, in a fixed order.
#
# Arguments:
#   $1  jobs_base_dir  passed to the two "combine" steps
#   $2  job_dir        passed to every step
#
# banner, TOOLS-cook and TOOLS-gen-ui are not defined in this part of the
# file; presumably they come from the sourced util.sh / tools-lib.sh.
combine-and-render-html() {
  local jobs_base_dir=$1
  local job_dir=$2

  banner "Combining assoc task status"
  TOOLS-cook combine-assoc-task-status "$jobs_base_dir" "$job_dir"

  banner "Combining assoc results"
  TOOLS-cook combine-assoc-results "$jobs_base_dir" "$job_dir"

  banner "Splitting out status per metric, and writing overview"
  TOOLS-cook assoc-metric-status "$job_dir"

  TOOLS-gen-ui symlink-static assoc "$job_dir"

  banner "Building overview .part.html from CSV"
  TOOLS-gen-ui assoc-overview-part-html "$job_dir"

  banner "Building metric .part.html from CSV"
  TOOLS-gen-ui assoc-metric-part-html "$job_dir"

  banner "Building pair .part.html from CSV"
  TOOLS-gen-ui assoc-pair-part-html "$job_dir"

  banner "Building day .part.html from CSV"
  TOOLS-gen-ui assoc-day-part-html "$job_dir"
}

# Temp files left over by the fast_em R <-> C++.
#
# Lists (ls -l --si) and then deletes every *.bin file under the job dir.
#
# Arguments:
#   $1  job_dir  directory searched recursively
list-and-remove-bin() {
  local job_dir=$1
  # If everything failed, we might not have anything to list/delete.
  # NUL-delimited so that paths containing whitespace are handled intact.
  find "$job_dir" -name '*.bin' -print0 \
    | xargs -0 --no-run-if-empty -- ls -l --si --
  find "$job_dir" -name '*.bin' -print0 \
    | xargs -0 --no-run-if-empty -- rm -f --verbose --
}

# Dispatch: run the function named by the first command-line argument, passing
# it the remaining arguments.
"$@"