1#!/usr/bin/env Rscript 2# 3# Copyright 2015 Google Inc. All rights reserved. 4# 5# Licensed under the Apache License, Version 2.0 (the "License"); 6# you may not use this file except in compliance with the License. 7# You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an "AS IS" BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16 17# Simulates inputs on which association analysis can be run. 18# Currently assoc_sim.R only supports 2 variables but can 19# be easily extended to support more. 20# 21# Usage: 22# $ ./assoc_sim.R -n 1000 23# Inputs: uvals, params, reports, map, num, unif 24# see how options are parsed below for more information 25# Outputs: 26# reports.csv file containing reports 27# map_{1, 2, ...}.csv file(s) containing maps of variables 28 29library("optparse") 30 31options(stringsAsFactors = FALSE) 32 33if(!interactive()) { 34 option_list <- list( 35 make_option(c("--uvals", "-v"), default = "uvals.csv", 36 help = "Filename for list of values over which 37 distributions are simulated. The file is a list of 38 comma-separated strings each line of which refers 39 to a new variable."), 40 make_option(c("--params", "-p"), default = "params.csv", 41 help = "Filename for RAPPOR parameters"), 42 make_option(c("--reports", "-r"), default = "reports.csv", 43 help = "Filename for reports"), 44 make_option(c("--map", "-m"), default = "map", 45 help = "Filename *prefix* for map(s)"), 46 make_option(c("--num", "-n"), default = 1e05, 47 help = "Number of reports"), 48 make_option(c("--unif", "-u"), default = FALSE, 49 help = "Run simulation with uniform distribution") 50 ) 51 opts <- parse_args(OptionParser(option_list = option_list)) 52} 53 54source("../analysis/R/encode.R") 55source("../analysis/R/decode.R") 56source("../analysis/R/simulation.R") 57source("../analysis/R/read_input.R") 58source("../analysis/R/association.R") 59 60# Read unique values of reports from a csv file 61# Inputs: filename. The file is expected to contain two rows of strings 62# (one for each variable): 63# "google.com", "apple.com", ... 64# "ssl", "nossl", ... 65# Returns: a list containing strings 66GetUniqueValsFromFile <- function(filename) { 67 contents <- read.csv(filename, header = FALSE) 68 # Expect 2 rows of unique vals 69 if(nrow(contents) != 2) { 70 stop(paste("Unique vals file", filename, "expected to have 71 two rows of strings.")) 72 } 73 # Removes superfluous "" entries if the lists of unique values 74 # differ in length 75 strip_empty <- function(vec) { 76 vec[!vec %in% c("")] 77 } 78 list(var1 = strip_empty(as.vector(t(contents[1,]))), 79 var2 = strip_empty(as.vector(t(contents[2,])))) 80} 81 82# Simulate correlated reports and write into reportsfile 83# Inputs: N = number of reports 84# uvals = list containing a list of unique values 85# params = list with RAPPOR parameters 86# unif = whether to replace poisson with uniform 87# mapfile = file to write maps into (with .csv suffixes) 88# reportsfile = file to write reports into (with .csv suffix) 89SimulateReports <- function(N, uvals, params, unif, 90 mapfile, reportsfile) { 91 # Compute true distribution 92 m <- params$m 93 94 if (unif) { 95 # Draw uniformly from 1 to 10 96 v1_samples <- as.integer(runif(N, 1, 10)) 97 } else { 98 # Draw from a Poisson random variable 99 v1_samples <- rpois(N, 1) + rep(1, N) 100 } 101 102 # Pr[var2 = N + 1 | var1 = N] = 0.5 103 # Pr[var2 = N | var1 = N] = 0.5 104 v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE) 105 106 tmp_samples <- list(v1_samples, v2_samples) 107 108 # Function to pad strings to uval_vec if sample_vec has 109 # larger support than the number of strings in uval_vec 110 # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals 111 # only have "value1", "value2", and "value3", samples now 112 # over support {"value1", "value2", "value3", "str4", ...} 113 PadStrings <- function(sample_vec, uval_vec) { 114 if (max(sample_vec) > length(uval_vec)) { 115 # Padding uvals to required length 116 len <- length(uval_vec) 117 max_of_samples <- max(sample_vec) 118 uval_vec[(len + 1):max_of_samples] <- apply( 119 as.matrix((len + 1):max_of_samples), 120 1, 121 function(i) sprintf("str%d", i)) 122 } 123 uval_vec 124 } 125 126 # Pad and update uvals 127 uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]], 128 uvals[[i]])) 129 130 # Replace integers in tmp_samples with actual sample strings 131 samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]]) 132 133 # Randomly assign cohorts in each dimension 134 cohorts <- sample(1:m, N, replace = TRUE) 135 136 # Create and write map into mapfile_1.csv and mapfile_2.csv 137 map <- lapply(uvals, function(u) CreateMap(u, params)) 138 write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""), 139 sep = ",", col.names = FALSE, na = "", quote = FALSE) 140 write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""), 141 sep = ",", col.names = FALSE, na = "", quote = FALSE) 142 143 # Write reports into a csv file 144 # Format: 145 # cohort, bloom filter var1, bloom filter var2 146 reports <- lapply(1:2, function(i) 147 EncodeAll(samples[[i]], cohorts, map[[i]]$map, params)) 148 # Organize cohorts and reports into format 149 write_matrix <- cbind(as.matrix(cohorts), 150 as.matrix(lapply(reports[[1]], 151 function(x) paste(x, collapse = ""))), 152 as.matrix(lapply(reports[[2]], 153 function(x) paste(x, collapse = "")))) 154 write.table(write_matrix, file = reportsfile, quote = FALSE, 155 row.names = FALSE, col.names = FALSE, sep = ",") 156} 157 158main <- function(opts) { 159 ptm <- proc.time() 160 161 uvals <- GetUniqueValsFromFile(opts$uvals) 162 params <- ReadParameterFile(opts$params) 163 SimulateReports(opts$num, uvals, params, opts$unif, # inputs 164 opts$map, opts$reports) # outputs 165 166 print("PROC.TIME") 167 print(proc.time() - ptm) 168} 169 170if(!interactive()) { 171 main(opts) 172} 173