analysis/R/simulation.R

# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# RAPPOR simulation library. Contains code for encoding simulated data and
#     creating the map used to encode and decode reports.

library(glmnet)
library(parallel)  # mclapply

SetOfStrings <- function(num_strings = 100) {
  # Generates a set of strings for simulation purposes.
  strs <- paste0("V_", as.character(1:num_strings))
  strs
}

GetSampleProbs <- function(params) {
  # Generate different underlying distributions for simulations purposes.
  # Args:
  #    - params: a list describing the shape of the true distribution:
  #              c(num_strings, prop_nonzero_strings, decay_type,
  #                rate_exponetial).
  nstrs <- params[[1]]
  nonzero <- params[[2]]
  decay <- params[[3]]
  expo <- params[[4]]
  background <- params[[5]]

  probs <- rep(0, nstrs)
  ind <- floor(nstrs * nonzero)
  if (decay == "Linear") {
    probs[1:ind] <- (ind:1) / sum(1:ind)
  } else if (decay == "Constant") {
    probs[1:ind] <- 1 / ind
  } else if (decay == "Exponential") {
    temp <- seq(0, nonzero, length.out = ind)
    temp <- exp(-temp * expo)
    temp <- temp + background
    temp <- temp / sum(temp)
    probs[1:ind] <- temp
  } else {
    stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")')
  }
  probs
}

EncodeAll <- function(x, cohorts, map, params, num_cores = 1) {
  # Encodes the ground truth into RAPPOR reports.
  #
  # Args:
  #   x: Observed strings for each report, Nx1 vector
  #   cohort: Cohort assignment for each report, Nx1 vector
  #   map: list of matrices encoding locations of hashes for each
  #       string, for each cohort
  #   params: System parameters
  #
  # Returns:
  #   RAPPOR reports for each piece of data.

  p <- params$p
  q <- params$q
  f <- params$f
  k <- params$k

  qstar <- (1 - f / 2) * q + (f / 2) * p
  pstar <- (1 - f / 2) * p + (f / 2) * q

  candidates <- colnames(map[[1]])
  if (!all(x %in% candidates)) {
    stop("Some strings are not in the map. set(X) - set(candidates): ",
         paste(setdiff(unique(x), candidates), collapse=" "), "\n")
  }
  bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE,
                USE.NAMES = FALSE)
  reports <- mclapply(bfs, function(x) {
    noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar))
    ind <- which(x)
    noise[ind] <- sample(0:1, length(ind), replace = TRUE,
                         prob = c(1 - qstar, qstar))
    noise
  }, mc.cores = num_cores)

  reports
}

CreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) {
  # Creates a list of 0/1 matrices corresponding to mapping between the strs and
  # Bloom filters for each instance of the RAPPOR.
  # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4,
  # the result could look this:
  # [[1]]
  #   1 0 0 0
  #   0 1 0 0
  #   0 0 0 1
  # [[2]]
  #   0 1 0 0
  #   0 0 0 1
  #   0 0 1 0
  #
  # Args:
  #    strs: a vector of strings
  #    params: a list of parameters in the following format:
  #         (k, h, m, p, q, f).
  #    generate_pos: Tells whether to generate an object storing the
  #        positions of the nonzeros in the matrix
  #    basic: Tells whether to use basic RAPPOR (only works if h=1).

  M <- length(strs)
  map_by_cohort <- list()
  k <- params$k
  h <- params$h
  m <- params$m

  for (i in 1:m) {
    if (basic && (h == 1) && (k == M)) {
      ones <- 1:M
    } else {
      ones <- sample(1:k, M * h, replace = TRUE)
    }
    cols <- rep(1:M, each = h)
    map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M))
    colnames(map_by_cohort[[i]]) <- strs
  }

  all_cohorts_map <- do.call("rBind", map_by_cohort)
  if (generate_pos) {
    map_pos <- t(apply(all_cohorts_map, 2, function(x) {
      ind <- which(x == 1)
      n <- length(ind)
      if (n < h * m) {
        ind <- c(ind, rep(NA, h * m - n))
      }
      ind
    }))
  } else {
    map_pos <- NULL
  }

  list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map,
       map_pos = map_pos)
}

GetSample <- function(N, strs, probs) {
  # Sample for the strs population with distribution probs.
  sample(strs, N, replace = TRUE, prob = probs)
}

GetTrueBits <- function(samp, map, params) {
  # Convert sample generated by GetSample() to Bloom filters where mapping
  # is defined in map.
  # Output:
  #    - reports: a matrix of size [num_instances x size] where each row
  #               represents the number of times each bit in the Bloom filter
  #               was set for a particular instance.
  # Note: reports[, 1] contains the same size for each instance.

  N <- length(samp)
  k <- params$k
  m <- params$m
  strs <- colnames(map[[1]])
  reports <- matrix(0, m, k + 1)
  inst <- sample(1:m, N, replace = TRUE)
  for (i in 1:m) {
    tab <- table(samp[inst == i])
    tab2 <- rep(0, length(strs))
    tab2[match(names(tab), strs)] <- tab
    counts <- apply(map[[i]], 1, function(x) x * tab2)
    # cat(length(tab2), dim(map[[i]]), dim(counts), "\n")
    reports[i, ] <- c(sum(tab2), apply(counts, 2, sum))
  }
  reports
}

GetNoisyBits <- function(truth, params) {
  # Applies RAPPOR to the Bloom filters.
  # Args:
  #     - truth: a matrix generated by GetTrueBits().

  k <- params$k
  p <- params$p
  q <- params$q
  f <- params$f

  rappors <- apply(truth, 1, function(x) {
    # The following samples considering 4 cases:
    # 1. Signal and we lie on the bit.
    # 2. Signal and we tell the truth.
    # 3. Noise and we lie.
    # 4. Noise and we tell the truth.

    # Lies when signal sampled from the binomial distribution.
    lied_signal <- rbinom(k, x[-1], f)

    # Remaining must be the non-lying bits when signal. Sampled with q.
    truth_signal <- x[-1] - lied_signal

    # Lies when there is no signal which happens x[1] - x[-1] times.
    lied_nosignal <- rbinom(k, x[1] - x[-1], f)

    # Trtuh when there's no signal. These are sampled with p.
    truth_nosignal <- x[1] - x[-1] - lied_nosignal

    # Total lies and sampling lies with 50/50 for either p or q.
    lied <- lied_signal + lied_nosignal
    lied_p <- rbinom(k, lied, .5)
    lied_q <- lied - lied_p

    # Generating the report where sampling of either p or q occurs.
    rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p)
  })

  cbind(truth[, 1], t(rappors))
}

GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05,
                            prop_missing = 0,
                            correction = "Bonferroni") {
  # Simulate N reports with pop_params describing the population and
  # params describing the RAPPOR configuration.
  num_strings = pop_params[[1]]

  strs <- SetOfStrings(num_strings)
  probs <- GetSampleProbs(pop_params)
  samp <- GetSample(N, strs, probs)
  map <- CreateMap(strs, params)
  truth <- GetTrueBits(samp, map$map_by_cohort, params)
  rappors <- GetNoisyBits(truth, params)

  strs_apprx <- strs
  map_apprx <- map$all_cohorts_map
  # Remove % of strings to simulate missing variables.
  if (prop_missing > 0) {
    ind <- which(probs > 0)
    removed <- sample(ind, ceiling(prop_missing * length(ind)))
    map_apprx <- map$all_cohorts_map[, -removed]
    strs_apprx <- strs[-removed]
  }

  # Randomize the columns.
  ind <- sample(1:length(strs_apprx), length(strs_apprx))
  map_apprx <- map_apprx[, ind]
  strs_apprx <- strs_apprx[ind]

  fit <- Decode(rappors, map_apprx, params, alpha = alpha,
                correction = correction)

  # Add truth column.
  fit$fit$Truth <- table(samp)[fit$fit$string]
  fit$fit$Truth[is.na(fit$fit$Truth)] <- 0

  fit$map <- map$map_by_cohort
  fit$truth <- truth
  fit$strs <- strs
  fit$probs <- probs

  fit
}