1# Copyright 2014 Google Inc. All rights reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14 15# 16# RAPPOR simulation library. Contains code for encoding simulated data and 17# creating the map used to encode and decode reports. 18 19library(glmnet) 20library(parallel) # mclapply 21 22SetOfStrings <- function(num_strings = 100) { 23 # Generates a set of strings for simulation purposes. 24 strs <- paste0("V_", as.character(1:num_strings)) 25 strs 26} 27 28GetSampleProbs <- function(params) { 29 # Generate different underlying distributions for simulations purposes. 30 # Args: 31 # - params: a list describing the shape of the true distribution: 32 # c(num_strings, prop_nonzero_strings, decay_type, 33 # rate_exponetial). 34 nstrs <- params[[1]] 35 nonzero <- params[[2]] 36 decay <- params[[3]] 37 expo <- params[[4]] 38 background <- params[[5]] 39 40 probs <- rep(0, nstrs) 41 ind <- floor(nstrs * nonzero) 42 if (decay == "Linear") { 43 probs[1:ind] <- (ind:1) / sum(1:ind) 44 } else if (decay == "Constant") { 45 probs[1:ind] <- 1 / ind 46 } else if (decay == "Exponential") { 47 temp <- seq(0, nonzero, length.out = ind) 48 temp <- exp(-temp * expo) 49 temp <- temp + background 50 temp <- temp / sum(temp) 51 probs[1:ind] <- temp 52 } else { 53 stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")') 54 } 55 probs 56} 57 58EncodeAll <- function(x, cohorts, map, params, num_cores = 1) { 59 # Encodes the ground truth into RAPPOR reports. 60 # 61 # Args: 62 # x: Observed strings for each report, Nx1 vector 63 # cohort: Cohort assignment for each report, Nx1 vector 64 # map: list of matrices encoding locations of hashes for each 65 # string, for each cohort 66 # params: System parameters 67 # 68 # Returns: 69 # RAPPOR reports for each piece of data. 70 71 p <- params$p 72 q <- params$q 73 f <- params$f 74 k <- params$k 75 76 qstar <- (1 - f / 2) * q + (f / 2) * p 77 pstar <- (1 - f / 2) * p + (f / 2) * q 78 79 candidates <- colnames(map[[1]]) 80 if (!all(x %in% candidates)) { 81 stop("Some strings are not in the map. set(X) - set(candidates): ", 82 paste(setdiff(unique(x), candidates), collapse=" "), "\n") 83 } 84 bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE, 85 USE.NAMES = FALSE) 86 reports <- mclapply(bfs, function(x) { 87 noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar)) 88 ind <- which(x) 89 noise[ind] <- sample(0:1, length(ind), replace = TRUE, 90 prob = c(1 - qstar, qstar)) 91 noise 92 }, mc.cores = num_cores) 93 94 reports 95} 96 97CreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) { 98 # Creates a list of 0/1 matrices corresponding to mapping between the strs and 99 # Bloom filters for each instance of the RAPPOR. 100 # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4, 101 # the result could look this: 102 # [[1]] 103 # 1 0 0 0 104 # 0 1 0 0 105 # 0 0 0 1 106 # [[2]] 107 # 0 1 0 0 108 # 0 0 0 1 109 # 0 0 1 0 110 # 111 # Args: 112 # strs: a vector of strings 113 # params: a list of parameters in the following format: 114 # (k, h, m, p, q, f). 115 # generate_pos: Tells whether to generate an object storing the 116 # positions of the nonzeros in the matrix 117 # basic: Tells whether to use basic RAPPOR (only works if h=1). 118 119 M <- length(strs) 120 map_by_cohort <- list() 121 k <- params$k 122 h <- params$h 123 m <- params$m 124 125 for (i in 1:m) { 126 if (basic && (h == 1) && (k == M)) { 127 ones <- 1:M 128 } else { 129 ones <- sample(1:k, M * h, replace = TRUE) 130 } 131 cols <- rep(1:M, each = h) 132 map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M)) 133 colnames(map_by_cohort[[i]]) <- strs 134 } 135 136 all_cohorts_map <- do.call("rBind", map_by_cohort) 137 if (generate_pos) { 138 map_pos <- t(apply(all_cohorts_map, 2, function(x) { 139 ind <- which(x == 1) 140 n <- length(ind) 141 if (n < h * m) { 142 ind <- c(ind, rep(NA, h * m - n)) 143 } 144 ind 145 })) 146 } else { 147 map_pos <- NULL 148 } 149 150 list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map, 151 map_pos = map_pos) 152} 153 154GetSample <- function(N, strs, probs) { 155 # Sample for the strs population with distribution probs. 156 sample(strs, N, replace = TRUE, prob = probs) 157} 158 159GetTrueBits <- function(samp, map, params) { 160 # Convert sample generated by GetSample() to Bloom filters where mapping 161 # is defined in map. 162 # Output: 163 # - reports: a matrix of size [num_instances x size] where each row 164 # represents the number of times each bit in the Bloom filter 165 # was set for a particular instance. 166 # Note: reports[, 1] contains the same size for each instance. 167 168 N <- length(samp) 169 k <- params$k 170 m <- params$m 171 strs <- colnames(map[[1]]) 172 reports <- matrix(0, m, k + 1) 173 inst <- sample(1:m, N, replace = TRUE) 174 for (i in 1:m) { 175 tab <- table(samp[inst == i]) 176 tab2 <- rep(0, length(strs)) 177 tab2[match(names(tab), strs)] <- tab 178 counts <- apply(map[[i]], 1, function(x) x * tab2) 179 # cat(length(tab2), dim(map[[i]]), dim(counts), "\n") 180 reports[i, ] <- c(sum(tab2), apply(counts, 2, sum)) 181 } 182 reports 183} 184 185GetNoisyBits <- function(truth, params) { 186 # Applies RAPPOR to the Bloom filters. 187 # Args: 188 # - truth: a matrix generated by GetTrueBits(). 189 190 k <- params$k 191 p <- params$p 192 q <- params$q 193 f <- params$f 194 195 rappors <- apply(truth, 1, function(x) { 196 # The following samples considering 4 cases: 197 # 1. Signal and we lie on the bit. 198 # 2. Signal and we tell the truth. 199 # 3. Noise and we lie. 200 # 4. Noise and we tell the truth. 201 202 # Lies when signal sampled from the binomial distribution. 203 lied_signal <- rbinom(k, x[-1], f) 204 205 # Remaining must be the non-lying bits when signal. Sampled with q. 206 truth_signal <- x[-1] - lied_signal 207 208 # Lies when there is no signal which happens x[1] - x[-1] times. 209 lied_nosignal <- rbinom(k, x[1] - x[-1], f) 210 211 # Trtuh when there's no signal. These are sampled with p. 212 truth_nosignal <- x[1] - x[-1] - lied_nosignal 213 214 # Total lies and sampling lies with 50/50 for either p or q. 215 lied <- lied_signal + lied_nosignal 216 lied_p <- rbinom(k, lied, .5) 217 lied_q <- lied - lied_p 218 219 # Generating the report where sampling of either p or q occurs. 220 rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p) 221 }) 222 223 cbind(truth[, 1], t(rappors)) 224} 225 226GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05, 227 prop_missing = 0, 228 correction = "Bonferroni") { 229 # Simulate N reports with pop_params describing the population and 230 # params describing the RAPPOR configuration. 231 num_strings = pop_params[[1]] 232 233 strs <- SetOfStrings(num_strings) 234 probs <- GetSampleProbs(pop_params) 235 samp <- GetSample(N, strs, probs) 236 map <- CreateMap(strs, params) 237 truth <- GetTrueBits(samp, map$map_by_cohort, params) 238 rappors <- GetNoisyBits(truth, params) 239 240 strs_apprx <- strs 241 map_apprx <- map$all_cohorts_map 242 # Remove % of strings to simulate missing variables. 243 if (prop_missing > 0) { 244 ind <- which(probs > 0) 245 removed <- sample(ind, ceiling(prop_missing * length(ind))) 246 map_apprx <- map$all_cohorts_map[, -removed] 247 strs_apprx <- strs[-removed] 248 } 249 250 # Randomize the columns. 251 ind <- sample(1:length(strs_apprx), length(strs_apprx)) 252 map_apprx <- map_apprx[, ind] 253 strs_apprx <- strs_apprx[ind] 254 255 fit <- Decode(rappors, map_apprx, params, alpha = alpha, 256 correction = correction) 257 258 # Add truth column. 259 fit$fit$Truth <- table(samp)[fit$fit$string] 260 fit$fit$Truth[is.na(fit$fit$Truth)] <- 0 261 262 fit$map <- map$map_by_cohort 263 fit$truth <- truth 264 fit$strs <- strs 265 fit$probs <- probs 266 267 fit 268} 269