# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# This library implements the RAPPOR marginal decoding algorithms using LASSO.

library(glmnet)

# So we don't have to change pwd
source.rappor <- function(rel_path) {
  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
  source(abs_path)
}

source.rappor('analysis/R/alternative.R')

EstimateBloomCounts <- function(params, obs_counts) {
  # Estimates the number of times each bit in each cohort was set in the
  # original Bloom filters.
  #
  # Input:
  #   params: a list of RAPPOR parameters:
  #     k - size of a Bloom filter
  #     h - number of hash functions
  #     m - number of cohorts
  #     p - P(IRR = 1 | PRR = 0)
  #     q - P(IRR = 1 | PRR = 1)
  #     f - proportion of bits in the Bloom filter that are set randomly
  #         to 0 or 1 regardless of the underlying true bit value
  #   obs_counts: a matrix of size m by (k + 1). Column one contains sample
  #               sizes for each cohort. The remaining columns indicate how
  #               many times each bit was set in each cohort.
  #
  # Output:
  #   ests: a matrix of size m by k with the estimated proportion of reports
  #         in each cohort whose true Bloom filter bit was set to 1.
  #   stds: standard deviations of the estimates.

  p <- params$p
  q <- params$q
  f <- params$f
  m <- params$m
  k <- params$k

  stopifnot(m == nrow(obs_counts), k + 1 == ncol(obs_counts))

  p11 <- q * (1 - f / 2) + p * f / 2  # probability of a true 1 reported as 1
  p01 <- p * (1 - f / 2) + q * f / 2  # probability of a true 0 reported as 1

  p2 <- p11 - p01  # == (1 - f) * (q - p)

  # When m = 1, obs_counts does not have the right dimensions. Fix this.
  dim(obs_counts) <- c(m, k + 1)

  ests <- apply(obs_counts, 1, function(cohort_row) {
    N <- cohort_row[1]   # sample size for the cohort -- first column is total
    v <- cohort_row[-1]  # counts for individual bits
    (v - p01 * N) / p2   # unbiased estimator for individual bits'
                         # true counts. It can be negative or
                         # exceed the total.
  })

  # NOTE: When k == 1, rows of obs_counts have 2 entries. Then cohort_row[-1]
  # is a singleton vector, and apply() returns a *vector*. When rows have 3
  # entries, cohort_row[-1] is a vector of length 2 and apply() returns a
  # *matrix*.
  #
  # Fix this by explicitly setting dimensions. NOTE: It's k x m, not m x k.
  dim(ests) <- c(k, m)

  total <- sum(obs_counts[, 1])

  variances <- apply(obs_counts, 1, function(cohort_row) {
    N <- cohort_row[1]
    v <- cohort_row[-1]
    p_hats <- (v - p01 * N) / (N * p2)      # expectation of a true 1
    p_hats <- pmax(0, pmin(1, p_hats))      # clamp to [0,1]
    r <- p_hats * p11 + (1 - p_hats) * p01  # expectation of a reported 1
    N * r * (1 - r) / p2^2                  # variance of the binomial
  })

  dim(variances) <- c(k, m)

  # Transform counts from absolute values to fractional, removing bias due to
  # variability of reporting between cohorts.
  ests <- apply(ests, 1, function(x) x / obs_counts[, 1])
  stds <- apply(variances^.5, 1, function(x) x / obs_counts[, 1])

  # Some estimates may be set to infinity, e.g. if f = 1. We want to account
  # for this possibility, and set the corresponding counts to 0.
  ests[abs(ests) == Inf] <- 0

  list(estimates = ests, stds = stds)
}
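
# Example (illustrative only, not executed on source): a minimal sketch of
# calling EstimateBloomCounts() on made-up counts, assuming toy parameters
# k = 4, m = 2 and 100 reports per cohort. All values below are fabricated.
#
#   toy_params <- list(k = 4, h = 2, m = 2, p = 0.5, q = 0.75, f = 0.5)
#   toy_counts <- rbind(c(100, 60, 55, 70, 52),  # cohort 1: N, then bit counts
#                       c(100, 58, 62, 49, 66))  # cohort 2
#   toy_ests <- EstimateBloomCounts(toy_params, toy_counts)
#   # toy_ests$estimates is a 2 x 4 (m x k) matrix of per-bit proportion
#   # estimates; toy_ests$stds holds the corresponding standard deviations.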

FitLasso <- function(X, Y, intercept = TRUE) {
  # Fits a LASSO model to select a subset of columns of X.
  #
  # Input:
  #   X: a design matrix of size km by M (the number of candidate strings).
  #   Y: a vector of size km with estimated counts from EstimateBloomCounts().
  #   intercept: whether or not to fit an intercept.
  #
  # Output:
  #   a vector of size ncol(X) of coefficients.

  # TODO(mironov): Test cv.glmnet instead of glmnet
  mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
                    lower.limits = 0,  # outputs are non-negative
                    # Cap the number of non-zero coefficients to 500 or
                    # 80% of the length of Y, whichever is less. The 500 cap
                    # is for performance reasons, 80% is to avoid overfitting.
                    pmax = min(500, length(Y) * .8)),
             silent = TRUE)

  # If fitting fails, return a named vector of zeros.
  if (class(mod)[1] == "try-error") {
    coefs <- setNames(rep(0, ncol(X)), colnames(X))
  } else {
    coefs <- coef(mod)
    coefs <- coefs[-1, ncol(coefs), drop = FALSE]  # coefs[1] is the intercept
  }
  coefs
}

PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
  m <- params$m
  p <- params$p
  q <- params$q
  f <- params$f
  h <- params$h

  q2 <- .5 * f * (p + q) + (1 - f) * q
  p2 <- .5 * f * (p + q) + (1 - f) * p
  resid_var <- p2 * (1 - p2) * (N / m) / (q2 - p2)^2

  # Total Sum of Squares (TSS).
  TSS <- sum((Y - mean(Y))^2)
  # Error Sum of Squares (ESS).
  ESS <- resid_var * nrow(X)

  betas <- matrix(mod$coefs, ncol = 1)

#   mod_var <- summary(mod$fit)$sigma^2
#   betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas))
#
#   z_values <- betas / betas_sd
#
#   # 1-sided t-test.
#   p_values <- pnorm(z_values, lower = FALSE)

  fit <- data.frame(string = colnames(X), Estimate = betas,
                    SD = mod$stds, # z_stat = z_values, pvalue = p_values,
                    stringsAsFactors = FALSE)

#   if (correction == "FDR") {
#     fit <- fit[order(fit$pvalue, decreasing = FALSE), ]
#     ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit))
#     if (length(ind) > 0) {
#       fit <- fit[1:max(ind), ]
#     } else {
#       fit <- fit[numeric(0), ]
#     }
#   } else {
#     fit <- fit[fit$p < alpha, ]
#   }

  fit <- fit[order(fit$Estimate, decreasing = TRUE), ]

  if (nrow(fit) > 0) {
    str_names <- fit$string
    str_names <- str_names[!is.na(str_names)]
    if (length(str_names) > 0 && length(str_names) < nrow(X)) {
      this_data <- as.data.frame(as.matrix(X[, str_names]))
      Y_hat <- predict(lm(Y ~ ., data = this_data))
      RSS <- sum((Y_hat - mean(Y))^2)
    } else {
      RSS <- NA
    }
  } else {
    RSS <- 0
  }

  USS <- TSS - ESS - RSS
  SS <- c(RSS, USS, ESS) / TSS

  list(fit = fit, SS = SS, resid_sigma = sqrt(resid_var))
}
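
# Worked example (illustrative, with made-up parameters): the "effective"
# probabilities used above collapse the two-stage PRR/IRR randomization into
# a single reporting probability per true bit value. With p = 0.5, q = 0.75
# and f = 0.5:
#
#   q2 = .5 * .5 * (0.5 + 0.75) + (1 - .5) * 0.75 = 0.6875  # true 1 -> reported 1
#   p2 = .5 * .5 * (0.5 + 0.75) + (1 - .5) * 0.50 = 0.5625  # true 0 -> reported 1
#
# These coincide with p11 and p01 in EstimateBloomCounts(), and (q2 - p2)^2 in
# the denominator of resid_var measures how much signal survives the
# randomization.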

ComputePrivacyGuarantees <- function(params, alpha, N) {
  # Computes privacy parameters and guarantees.
  p <- params$p
  q <- params$q
  f <- params$f
  h <- params$h

  q2 <- .5 * f * (p + q) + (1 - f) * q
  p2 <- .5 * f * (p + q) + (1 - f) * p

  exp_e_one <- ((q2 * (1 - p2)) / (p2 * (1 - q2)))^h
  if (exp_e_one < 1) {
    exp_e_one <- 1 / exp_e_one
  }
  e_one <- log(exp_e_one)

  exp_e_inf <- ((1 - .5 * f) / (.5 * f))^(2 * h)
  e_inf <- log(exp_e_inf)

  std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
  detection_freq <- qnorm(1 - alpha) * std_dev_counts / N

  privacy_names <- c("Effective p", "Effective q", "exp(e_1)",
                     "e_1", "exp(e_inf)", "e_inf", "Detection frequency")
  privacy_vals <- c(p2, q2, exp_e_one, e_one, exp_e_inf, e_inf, detection_freq)

  privacy <- data.frame(parameters = privacy_names,
                        values = privacy_vals)
  privacy
}

FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
  # Finds the distribution over columns of map (candidates) that best
  # approximates estimates_stds.
  #
  # Input:
  #   estimates_stds: a list of two m x k matrices, one for estimates, another
  #                   for standard errors
  #   map           : an (m * k) x S boolean matrix
  #
  # Output:
  #   a float vector of length S, so that a distribution over map's columns
  #   sampled according to this vector approximates the estimates

  S <- ncol(map)  # total number of candidates

  support_coefs <- 1:S

  if (S > length(estimates_stds$estimates) * .8) {
    # The system is close to being underdetermined.
    lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))

    # Select non-zero coefficients.
    support_coefs <- which(lasso > 0)

    if (!quiet)
      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
  }

  coefs <- setNames(rep(0, S), colnames(map))

  if (length(support_coefs) > 0) {  # LASSO may return an empty list
    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
                                             estimates_stds)

    coefs[support_coefs] <- constrained_coefs
  }

  coefs
}

Resample <- function(e) {
  # Simulates resampling of the Bloom filter estimates by adding Gaussian
  # noise with the estimated standard deviation.
  estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y),
                             e$estimates, e$stds),
                      nrow = nrow(e$estimates), ncol = ncol(e$estimates))
  stds <- e$stds * 2^.5

  list(estimates = estimates, stds = stds)
}
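
# Note (illustrative only): Resample() perturbs each estimate with zero-mean
# Gaussian noise whose standard deviation equals that estimate's own standard
# error. The resampled value is therefore the sum of two independent
# quantities with equal variance, so its variance doubles, which is why the
# returned standard deviations are scaled by sqrt(2). A made-up call:
#
#   e <- list(estimates = matrix(c(0.1, 0.2, 0.3, 0.4), nrow = 2),
#             stds      = matrix(rep(0.05, 4), nrow = 2))
#   e_noisy <- Resample(e)  # same shape, jittered estimates, stds * sqrt(2)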

# Private function
# Decodes Boolean RAPPOR inputs.
# Returns a list with attribute fit only. (Inference and other aspects are
# currently not incorporated because they're unnecessary for association.)
.DecodeBoolean <- function(counts, params, num_reports) {
  # Boolean variables are reported without cohorts. To estimate counts, first
  # sum up counts across all cohorts and then run EstimateBloomCounts() with
  # the number of cohorts set to 1.
  params$m <- 1                     # set number of cohorts to 1
  summed_counts <- colSums(counts)  # sum counts across cohorts
  es <- EstimateBloomCounts(params, summed_counts)  # estimate boolean counts

  ests <- es$estimates[[1]]
  std <- es$stds[[1]]

  fit <- data.frame(
    string = c("TRUE", "FALSE"),
    estimate = c(ests * num_reports,
                 num_reports - ests * num_reports),
    std_error = c(std * num_reports, std * num_reports),
    proportion = c(ests, 1 - ests),
    prop_std_error = c(std, std))

  low_95 <- fit$proportion - 1.96 * fit$prop_std_error
  high_95 <- fit$proportion + 1.96 * fit$prop_std_error

  fit$prop_low_95 <- pmax(low_95, 0.0)
  fit$prop_high_95 <- pmin(high_95, 1.0)
  rownames(fit) <- fit$string

  return(list(fit = fit))
}

CheckDecodeInputs <- function(counts, map, params) {
  # Returns an error message, or NULL if there is no error.

  if (nrow(map) != (params$m * params$k)) {
    return(sprintf(
      "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
      params$m * params$k, nrow(map)))
  }

  if ((ncol(counts) - 1) != params$k) {
    return(sprintf(paste0(
      "Dimensions of counts file do not match: m = %d, k = %d, ",
      "nrow(counts) = %d, ncol(counts) = %d"), params$m, params$k,
      nrow(counts), ncol(counts)))

  }

  # Numerically robust comparison.
  if (isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) {
    return("Information is lost. Cannot decode.")
  }

  return(NULL)  # no error
}

Decode <- function(counts, map, params, alpha = 0.05,
                   correction = c("Bonferroni"), quiet = FALSE, ...) {

  error_msg <- CheckDecodeInputs(counts, map, params)
  if (!is.null(error_msg)) {
    stop(error_msg)
  }

  k <- params$k
  p <- params$p
  q <- params$q
  f <- params$f
  h <- params$h
  m <- params$m

  S <- ncol(map)  # total number of candidates

  N <- sum(counts[, 1])
  if (k == 1) {
    return(.DecodeBoolean(counts, params, N))
  }

  filter_cohorts <- which(counts[, 1] != 0)  # exclude cohorts with zero reports

  # Stretch cohorts to bits.
  filter_bits <- as.vector(matrix(1:nrow(map), ncol = m)[, filter_cohorts, drop = FALSE])

  map_filtered <- map[filter_bits, , drop = FALSE]

  es <- EstimateBloomCounts(params, counts)

  estimates_stds_filtered <-
    list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
         stds = es$stds[filter_cohorts, , drop = FALSE])

  coefs_all <- vector()

  # Run the fitting procedure several times (5 runs seem to be sufficient
  # without being too costly) to estimate the standard deviation of the output.
  for (r in 1:5) {
    if (r > 1)
      e <- Resample(estimates_stds_filtered)
    else
      e <- estimates_stds_filtered

    coefs_all <- rbind(coefs_all,
                       FitDistribution(e, map_filtered, quiet))
  }

  coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
  coefs_ave <- N * apply(coefs_all, 2, mean)

  # Only select coefficients more than two standard deviations from 0. This
  # may inflate the empirical SD of the estimates.
  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)

  mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

  coefs_ave_zeroed <- coefs_ave
  coefs_ave_zeroed[-reported] <- 0

  residual <- as.vector(t(estimates_stds_filtered$estimates)) -
              map_filtered %*% coefs_ave_zeroed / N

  if (correction == "Bonferroni") {
    alpha <- alpha / S
  }

  inf <- PerformInference(map_filtered[, reported, drop = FALSE],
                          as.vector(t(estimates_stds_filtered$estimates)),
                          N, mod, params, alpha,
                          correction)
  fit <- inf$fit
  # If this is a basic RAPPOR instance, just use the counts for the estimate.
  # (Check whether the map is diagonal to tell if this is basic RAPPOR.)
  if (sum(map) == sum(diag(map))) {
    fit$Estimate <- colSums(counts)[-1]
  }

  # Estimates from the model are per instance, so they must be multiplied
  # by h. Standard errors are also adjusted.
  fit$estimate <- floor(fit$Estimate)
  fit$proportion <- fit$estimate / N

  fit$std_error <- floor(fit$SD)
  fit$prop_std_error <- fit$std_error / N

  # 1.96 standard deviations gives a 95% confidence interval.
  low_95 <- fit$proportion - 1.96 * fit$prop_std_error
  high_95 <- fit$proportion + 1.96 * fit$prop_std_error
  # Clamp estimated proportions. pmin/pmax are vectorized min and max.
  fit$prop_low_95 <- pmax(low_95, 0.0)
  fit$prop_high_95 <- pmin(high_95, 1.0)

  fit <- fit[, c("string", "estimate", "std_error", "proportion",
                 "prop_std_error", "prop_low_95", "prop_high_95")]

  allocated_mass <- sum(fit$proportion)
  num_detected <- nrow(fit)

  ss <- round(inf$SS, digits = 3)
  explained_var <- ss[[1]]
  missing_var <- ss[[2]]
  noise_var <- ss[[3]]

  noise_std_dev <- round(inf$resid_sigma, digits = 3)

  # Compute a summary of the fit.
  parameters <-
    c("Candidate strings", "Detected strings",
      "Sample size (N)", "Discovered Prop (out of N)",
      "Explained Variance", "Missing Variance", "Noise Variance",
      "Theoretical Noise Std. Dev.")
  values <- c(S, num_detected, N, allocated_mass,
              explained_var, missing_var, noise_var, noise_std_dev)

  res_summary <- data.frame(parameters = parameters, values = values)

  privacy <- ComputePrivacyGuarantees(params, alpha, N)
  params <- data.frame(parameters =
                       c("k", "h", "m", "p", "q", "f", "N", "alpha"),
                       values = c(k, h, m, p, q, f, N, alpha))

  # This is a list of decode stats in a better format than 'summary'.
  metrics <- list(sample_size = N,
                  allocated_mass = allocated_mass,
                  num_detected = num_detected,
                  explained_var = explained_var,
                  missing_var = missing_var)

  list(fit = fit, summary = res_summary, privacy = privacy, params = params,
       lasso = NULL, residual = as.vector(residual),
       counts = counts[, -1], resid = NULL, metrics = metrics,
       ests = es$estimates  # ests needed by the Shiny rappor-sim app
       )
}
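
# Example (illustrative only): a minimal sketch of a full decoding call. The
# parameter values and object shapes below are made up; in practice counts
# comes from ComputeCounts() (or a counts file) and map from the candidate
# map file.
#
#   params <- list(k = 16, h = 2, m = 8, p = 0.5, q = 0.75, f = 0.5)
#   # counts: an 8 x 17 matrix (cohort sizes in column 1, per-bit counts after)
#   # map:    a 128 x S 0/1 matrix with candidate strings as column names
#   results <- Decode(counts, map, params, alpha = 0.05)
#   results$fit      # detected strings with estimates and confidence bounds
#   results$summary  # overall decoding diagnostics
#   results$privacy  # privacy guarantees implied by the parameters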

ComputeCounts <- function(reports, cohorts, params) {
  # Counts the number of times each bit in the Bloom filters was set for
  # each cohort.
  #
  # Args:
  #   reports: A list of N elements, each containing the reported bit vector
  #            for a given report
  #   cohorts: A list of N elements, each containing the
  #            cohort number for a given report
  #   params: A list of parameters for the problem
  #
  # Returns:
  #   An m x (k+1) array containing the number of times each bit was set
  #   in each cohort, with cohort sizes in the first column.

  # Check that the cohorts are evenly assigned. We assume that if there
  # are m cohorts, each cohort should have approximately N/m reports.
  # The constraint we impose here simply says that cohort bins should
  # each be within N/m reports of one another. Since the most popular cohort
  # is expected to exceed the average by only about O(logN/loglogN) reports
  # (which we approximate as O(logN) for practical values of N), a
  # discrepancy on the order of N/m reports is significant enough to alter
  # expected behavior. This threshold can be changed to be more sensitive if
  # desired.
  N <- length(reports)
  cohort_freqs <- table(factor(cohorts, levels = 1:params$m))
  imbalance_threshold <- N / params$m
  if ((max(cohort_freqs) - min(cohort_freqs)) > imbalance_threshold) {
    cat("\nNote: You are using unbalanced cohort assignments, which can",
        "significantly degrade estimation quality!\n\n")
  }

  # Count the times each bit was set, and add cohort counts to the first
  # column.
  counts <- lapply(1:params$m, function(i)
    Reduce("+", reports[which(cohorts == i)]))
  counts[which(cohort_freqs == 0)] <- data.frame(rep(0, params$k))
  cbind(cohort_freqs, do.call("rbind", counts))
}
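
# Example (illustrative only): building the counts matrix that Decode()
# expects from per-report bit vectors. The reports, cohort assignments and
# parameter values below are made up.
#
#   params  <- list(k = 4, h = 2, m = 2, p = 0.5, q = 0.75, f = 0.5)
#   reports <- list(c(1, 0, 1, 0), c(0, 1, 1, 0), c(1, 1, 0, 0))
#   cohorts <- c(1, 2, 1)
#   counts  <- ComputeCounts(reports, cohorts, params)
#   # counts is a 2 x 5 matrix: column 1 holds the cohort sizes (2 and 1),
#   # columns 2-5 hold the per-bit sums for each cohort. Together with a
#   # candidate map of matching dimensions, this is the input to Decode().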