1# Copyright 2014 Google Inc. All rights reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14 15# Authors: [email protected] (Vasyl Pihur), [email protected] (Giulia Fanti) 16 17library(RUnit) 18source("analysis/R/encode.R") 19source("analysis/R/decode.R") 20source("analysis/R/simulation.R") 21source("analysis/R/association.R") 22source("analysis/R/fast_em.R") 23source("analysis/R/util.R") 24 25SamplePopulations <- function(N, num_variables = 1, params, 26 variable_opts) { 27 # Samples a number of variables. User specifies the number of variables 28 # and some desired properties of those variables. 29 # 30 # Args: 31 # N: Number of reports to generate. 32 # params: RAPPOR parameters, like Bloom filter size, number of 33 # hash bits, etc. 34 # variable_opts: List of options for generating the ground truth: 35 # independent = whether distinct variables should be independently drawn 36 # deterministic = whether the variables should be drawn from a 37 # Poisson distribution or uniformly assigned across the range 38 # of 1:num_strings 39 # num_strings: Only does something if deterministic == TRUE, and 40 # specifies how many strings to use in the uniform assignment 41 # of ground truth strings. 42 # 43 # Returns: 44 # RAPPOR simulated ground truth for each piece of data. 45 46 m <- params$m 47 num_strings <- variable_opts$num_strings 48 49 if (variable_opts$deterministic) { 50 # If a deterministic assignment is desired, evenly distribute 51 # strings across all cohorts. 52 53 reps <- ceiling(N / num_strings) 54 variables <- lapply(1:num_variables, 55 function(i) 56 as.vector(sapply(1:num_strings, function(x) 57 rep(x, reps)))[1:N]) 58 cohorts <- lapply(1:num_variables, 59 function(i) rep(1:m, ceiling(N / m))[1:N]) 60 } else { 61 # Otherwise, draw from a Poisson random variable 62 variables <- lapply(1:num_variables, function(i) rpois(N, 1) + 1) 63 64 # Randomly assign cohorts in each dimension 65 cohorts <- lapply(1:num_variables, 66 function(i) sample(1:params$m, N, replace = TRUE)) 67 68 if (!variable_opts$independent) { 69 # If user wants dependent RVs, subsequent variables are closely correlated 70 # with the first variable in the foll. manner: 71 # variable_i ~ variable_1 + (i-1) Bernoulli(0.5) 72 73 bernoulli_corr <- function(x) { 74 variables[[1]] + (x - 1) * sample(c(0, 1), N, replace = TRUE)} 75 76 variables[2:num_variables] <- lapply(2:num_variables, 77 function(x) bernoulli_corr(x)) 78 } 79 } 80 list(variables = variables, cohorts = cohorts) 81} 82 83Simulate <- function(N, num_variables, params, variable_opts = NULL, 84 truth = NULL, basic = FALSE) { 85 if (is.null(truth)) { 86 truth <- SamplePopulations(N, num_variables, params, 87 variable_opts) 88 } 89 strs <- lapply(truth$variables, function(x) sort(seq(max(x)))) 90 # strs <- lapply(truth$variables, function(x) sort(unique(x))) 91 # strs <- lapply(truth$variables, function(x) 1:length(unique(x))) 92 93 # Construct lists of maps and reports 94 if (variable_opts$deterministic) { 95 # Build the maps 96 map <- CreateMap(strs[[1]], params, FALSE, basic = basic) 97 maps <- lapply(1:num_variables, function(x) map) 98 # Build the reports 99 report <- EncodeAll(truth$variables[[1]], truth$cohorts[[1]], 100 map$map_by_cohort, params) 101 reports <- lapply(1:num_variables, function(x) report) 102 } else { 103 # Build the maps 104 maps <- lapply(1:num_variables, function(x) 105 CreateMap(strs[[x]], params, FALSE, 106 basic = basic)) 107 # Build the reports 108 reports <- lapply(1:num_variables, function(x) 109 EncodeAll(truth$variables[[x]], truth$cohorts[[x]], 110 maps[[x]]$map_by_cohort, params)) 111 } 112 113 list(reports = reports, cohorts = truth$cohorts, 114 truth = truth$variables, maps = maps, strs = strs) 115 116} 117 118# ----------------Actual testing starts here--------------- # 119TestComputeDistributionEM <- function() { 120 # Test various aspects of ComputeDistributionEM in association.R. 121 # Tests include: 122 # Test 1: Compute a joint distribution of uniformly distributed, 123 # perfectly correlated strings 124 # Test 2: Compute a marginal distribution of uniformly distributed strings 125 # Test 3: Check the "other" category estimation works by removing 126 # a string from the known map. 127 # Test 4: Test that the variance from EM algorithm is 1/N when there 128 # is no noise in the system. 129 # Test 5: Check that the right answer is still obtained when f = 0.2. 130 131 num_variables <- 3 132 N <- 100 133 134 # Initialize the parameters 135 params <- list(k = 12, h = 2, m = 4, p = 0, q = 1, f = 0) 136 variable_opts <- list(deterministic = TRUE, num_strings = 2, 137 independent = FALSE) 138 sim <- Simulate(N, num_variables, params, variable_opts) 139 140 # Test 1: Delta function pmf 141 joint_dist <- ComputeDistributionEM(sim$reports, 142 sim$cohorts, sim$maps, 143 ignore_other = TRUE, 144 params = params, 145 marginals = NULL, 146 estimate_var = FALSE) 147 # The recovered distribution should be close to the delta function. 148 checkTrue(abs(joint_dist$fit["1", "1", "1"] - 0.5) < 0.01) 149 checkTrue(abs(joint_dist$fit["2", "2", "2"] - 0.5) < 0.01) 150 151 # Test 2: Now compute a marginal using EM 152 dist <- ComputeDistributionEM(list(sim$reports[[1]]), 153 list(sim$cohorts[[1]]), 154 list(sim$maps[[1]]), 155 ignore_other = TRUE, 156 params = params, 157 marginals = NULL, 158 estimate_var = FALSE) 159 checkTrue(abs(dist$fit["1"] - 0.5) < 0.01) 160 161 # Test 3: Check that the "other" category is correctly computed 162 # Build a modified map with no column 2 (i.e. we only know that string 163 # "1" is a valid string 164 map <- sim$maps[[1]] 165 small_map <- map 166 167 for (i in 1:params$m) { 168 locs <- which(map$map_by_cohort[[i]][, 1]) 169 small_map$map_by_cohort[[i]] <- sparseMatrix(locs, rep(1, length(locs)), 170 dims = c(params$k, 1)) 171 locs <- which(map$all_cohorts_map[, 1]) 172 colnames(small_map$map_by_cohort[[i]]) <- sim$strs[1] 173 } 174 small_map$all_cohorts_map <- do.call("rBind", small_map$map_by_cohort) 175 176 dist <- ComputeDistributionEM(list(sim$reports[[1]]), 177 list(sim$cohorts[[1]]), 178 list(small_map), 179 ignore_other = FALSE, 180 params = params, 181 marginals = NULL, 182 estimate_var = FALSE) 183 184 # The recovered distribution should be uniform over 2 strings. 185 checkTrue(abs(dist$fit[1] - 0.5) < 0.1) 186 187 # Test 4: Test the variance is 1/N 188 variable_opts <- list(deterministic = TRUE, num_strings = 1) 189 sim <- Simulate(N, num_variables = 1, params, variable_opts) 190 dist <- ComputeDistributionEM(sim$reports, sim$cohorts, 191 sim$maps, ignore_other = TRUE, 192 params = params, marginals = NULL, 193 estimate_var = TRUE) 194 195 checkEqualsNumeric(dist$em$var_cov[1, 1], 1 / N) 196 197 # Test 5: Check that when f=0.2, we still get a good estimate 198 params <- list(k = 12, h = 2, m = 2, p = 0, q = 1, f = 0.2) 199 variable_opts <- list(deterministic = TRUE, num_strings = 2) 200 sim <- Simulate(N, num_variables = 2, params, variable_opts) 201 dist <- ComputeDistributionEM(sim$reports, sim$cohorts, 202 sim$maps, ignore_other = TRUE, 203 params = params, marginals = NULL, 204 estimate_var = FALSE) 205 206 checkTrue(abs(dist$fit["1", "1"] - 0.5) < 0.15) 207 checkTrue(abs(dist$fit["2", "2"] - 0.5) < 0.15) 208 209 # Test 6: Check the computed joint distribution with randomized 210 # correlated inputs from the Poisson distribution 211 # Expect to have correlation between strings n and n + 1 212 N <- 1000 213 params <- list(k = 16, h = 2, m = 4, p = 0.1, q = 0.9, f = 0.1) 214 variable_opts <- list(deterministic = FALSE, independent = FALSE) 215 sim <- Simulate(N, num_variables = 2, params, variable_opts) 216 dist <- ComputeDistributionEM(sim$reports, sim$cohorts, 217 sim$maps, ignore_other = TRUE, 218 params = params, marginals = NULL, 219 estimate_var = FALSE) 220 221 print_dist <- TRUE # to print joint distribution, set to TRUE 222 223 if (print_dist) { 224 # dist$fit[dist$fit<1e-4] <- 0 225 # Sort by row names and column names to visually see correlation 226 print(dist$fit[sort(rownames(dist$fit)), sort(colnames(dist$fit))]) 227 } 228 229 # Check for correlations (constants chosen heuristically to get good 230 # test confidence with small # of samples) 231 # Should have mass roughly 1/2e and 1/2e each 232 checkTrue(abs(dist$fit["1", "1"] - dist$fit["1", "2"]) < 0.1) 233 checkTrue(abs(dist$fit["2", "2"] - dist$fit["2", "3"]) < 0.1) 234 235 # Should have mass roughly 1/4e and 1/4e each 236 checkTrue(abs(dist$fit["3", "3"] - dist$fit["3", "4"]) < 0.06) 237 238 # Check for lack of probability mass 239 checkTrue(dist$fit["1", "3"] < 0.02) 240 checkTrue(dist$fit["1", "4"] < 0.02) 241 checkTrue(dist$fit["2", "1"] < 0.02) 242 checkTrue(dist$fit["2", "4"] < 0.02) 243 checkTrue(dist$fit["3", "1"] < 0.02) 244 checkTrue(dist$fit["3", "2"] < 0.02) 245} 246 247MakeCondProb <- function() { 248 d = matrix(c(1,1,2,2,3,3), nrow=3, ncol=2) 249 d = d / sum(d) 250 251 e = matrix(c(3,3,2,2,1,1), nrow=3, ncol=2) 252 e = e / sum(e) 253 254 list(d, e, d) # 3 reports 255} 256 257# Test the slow version in R. 258RunEmFunction <- function(cond_prob, max_em_iters) { 259 cond_prob <- MakeCondProb() 260 261 # Mechanical test of 4 iterations. em$hist has 5 elements. 262 result <- EM(cond_prob, max_em_iters=max_em_iters) 263 result$est 264} 265 266# Run a test of the EM executable 267RunEmExecutable <- function(em_executable, cond_prob, max_em_iters) { 268 print(cond_prob) 269 270 if (!file.exists(em_executable)) { 271 stop(sprintf("EM executable %s doesn't exist (build it?)", em_executable)) 272 } 273 em_iter_func <- ConstructFastEM(em_executable, "/tmp") 274 275 result <- em_iter_func(cond_prob, max_em_iters=max_em_iters) 276 result$est 277} 278 279TestCppImplementation <- function() { 280 cond_prob <- MakeCondProb() 281 max_em_iters <- 10 282 fit1 <- RunEmFunction(cond_prob, max_em_iters) 283 284 # Assume we're in the repo root 285 em_cpp <- file.path(getwd(), "analysis/cpp/_tmp/fast_em") 286 fit2 <- RunEmExecutable(em_cpp, cond_prob, max_em_iters) 287 288 cpp_diff <- abs(fit1 - fit2) 289 print(cpp_diff) 290 Log("C++ implementation difference after %d iterations: %e", max_em_iters, 291 sum(cpp_diff)) 292 293 # After 10 iterations they should be almost indistinguishable. 294 checkTrue(sum(cpp_diff) < 1e-10) 295} 296 297TestTensorFlowImplementation <- function() { 298 cond_prob <- MakeCondProb() 299 max_em_iters <- 10 300 fit1 <- RunEmFunction(cond_prob, max_em_iters) 301 302 em_tf <- file.path(getwd(), "analysis/tensorflow/fast_em.sh") 303 fit2 <- RunEmExecutable(em_tf, cond_prob, max_em_iters) 304 305 tf_diff <- abs(fit1 - fit2) 306 print(tf_diff) 307 Log("TensorFlow implementation difference after %d iterations: %e", 308 max_em_iters, sum(tf_diff)) 309 310 checkTrue(sum(tf_diff) < 1e-10) 311} 312