client/cpp/encoder.cc

// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "encoder.h"
#include "openssl_hash_impl.h"

#include <assert.h>
#include <stdio.h>
#include <stdarg.h>  // va_list, etc.
#include <vector>

namespace rappor {

void log(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
  fprintf(stderr, "\n");
}

//
// Functions for debugging
//

static void PrintHex(const std::vector<uint8_t>& h) {
  for (size_t i = 0; i < h.size(); ++i) {
    fprintf(stderr, "%02x", h[i]);
  }
  fprintf(stderr, "\n");
}

// We use 1 *byte* of a HMAC-SHA256 value per BIT to generate the PRR.  SHA256
// has 32 bytes, so the max is 32 bits.
static const int kMaxBits = 32;

// Can't be more than the number of bytes in MD5.
static const int kMaxHashes = 16;

// Probabilities should be in the interval [0.0, 1.0].
static void CheckValidProbability(float prob, const char* var_name) {
  if (prob < 0.0f || prob > 1.0f) {
    log("%s should be between 0.0 and 1.0 inclusive (got %.2f)", var_name,
        prob);
    assert(false);
  }
}

// Used to 1) turn cohort into a string, and 2) Turn raw bits into a string.
// Return by value since it's small.
static std::string ToBigEndian(uint32_t u) {
  std::string result(4, '\0');

  // rely on truncation to char
  result[0] = u >> 24;
  result[1] = u >> 16;
  result[2] = u >> 8;
  result[3] = u;

  return result;
}

static const char* kHmacCohortPrefix = "\x00";
static const char* kHmacPrrPrefix = "\x01";


//
// Encoder
//

uint32_t Encoder::AssignCohort(const Deps& deps, int num_cohorts) {
  std::vector<uint8_t> sha256;
  if (!deps.hmac_func_(deps.client_secret_, kHmacCohortPrefix, &sha256)) {
    log("HMAC failed");
    assert(false);
  }

  // Either we are using SHA256 to have exactly 32 bytes,
  // or we're using HmacDrbg for any number of bytes.
  if ((sha256.size() == kMaxBits)
      || (deps.hmac_func_ == rappor::HmacDrbg)) {
    // Hash size ok.
  } else {
    log("Bad hash size.");
    assert(false);
  }

  // Interpret first 4 bytes of sha256 as a uint32_t.
  uint32_t c = *(reinterpret_cast<uint32_t*>(sha256.data()));
  // e.g. for 128 cohorts, 0x80 - 1 = 0x7f
  uint32_t cohort_mask = num_cohorts - 1;
  return c & cohort_mask;
}

Encoder::Encoder(const std::string& encoder_id, const Params& params,
                 const Deps& deps)
    : encoder_id_(encoder_id),
      params_(params),
      deps_(deps),
      cohort_(AssignCohort(deps, params.num_cohorts_)),
      cohort_str_(ToBigEndian(cohort_)) {

  if (params_.num_bits_ <= 0) {
    log("num_bits must be positive");
    assert(false);
  }
  if (params_.num_hashes_ <= 0) {
    log("num_hashes must be positive");
    assert(false);
  }
  if (params_.num_cohorts_ <= 0) {
    log("num_cohorts must be positive");
    assert(false);
  }

  // Check Maximum values.
  if (deps_.hmac_func_ == rappor::HmacDrbg) {
    // Using HmacDrbg
    if (params_.num_bits_ % 8 != 0) {
      log("num_bits (%d) must be divisible by 8 when using HmacDrbg.",
          params.num_bits_);
      assert(false);
    }
  } else {
    // Using SHA256
    if (params_.num_bits_ > kMaxBits) {
        log("num_bits (%d) can't be greater than %d", params_.num_bits_,
            kMaxBits);
        assert(false);
    }
  }

  if (params_.num_hashes_ > kMaxHashes) {
    log("num_hashes (%d) can't be greater than %d", params_.num_hashes_,
        kMaxHashes);
    assert(false);
  }
  int m = params_.num_cohorts_;
  if ((m & (m - 1)) != 0) {
    log("num_cohorts (%d) must be a power of 2 (and not 0)", m);
    assert(false);
  }
  // TODO: check max cohorts?

  CheckValidProbability(params_.prob_f_, "prob_f");
  CheckValidProbability(params_.prob_p_, "prob_p");
  CheckValidProbability(params_.prob_q_, "prob_q");
}

bool Encoder::MakeBloomFilter(const std::string& value, Bits* bloom_out) const {
  const int num_bits = params_.num_bits_;
  const int num_hashes = params_.num_hashes_;

  Bits bloom = 0;

  // 4 byte cohort string + true value
  std::string hash_input(cohort_str_ + value);

  // First do hashing.
  std::vector<uint8_t> hash_output;
  deps_.hash_func_(hash_input, &hash_output);

  // Error check
  if (hash_output.size() < static_cast<size_t>(num_hashes)) {
    log("Hash function didn't return enough bytes");
    return false;
  }

  // To determine which bit to set in the bloom filter, use a byte of the MD5.
  for (int i = 0; i < num_hashes; ++i) {
    int bit_to_set = hash_output[i] % num_bits;
    bloom |= 1 << bit_to_set;
  }

  *bloom_out = bloom;
  return true;
}

// Write a Bloom filter into a vector of bytes, used for num_bits > 32.
bool Encoder::MakeBloomFilter(const std::string& value,
                              std::vector<uint8_t>* bloom_out) const {
  const int num_bits = params_.num_bits_;
  const int num_hashes = params_.num_hashes_;

  bloom_out->resize(params_.num_bits_ / 8, 0);

  // Generate the hash.
  std::vector<uint8_t> hash_output;
  deps_.hash_func_(std::string(cohort_str_ + value), &hash_output);

  // Check that we have enough bytes of hash available.
  int exponent = 0;
  int bytes_needed = 0;
  while ((1 << exponent) < num_bits) {
    exponent++;
  }
  bytes_needed = ((exponent - 1) / 8) + 1;
  if (bytes_needed > 4) {
    log("Can only use 4 bytes of hash at a time, needed %d "
        "to address %d bits.", bytes_needed, num_bits);
    return false;
  }
  if (hash_output.size() < static_cast<size_t>(bytes_needed * num_hashes)) {
    log("Hash function returned %d bytes, but we needed "
        "%d bytes * %d hashes. Choose lower num_hashes or "
        "a different hash function.",
        hash_output.size(), bytes_needed, num_hashes);
    return false;
  }

  // To determine which bit to set in the Bloom filter, use 1 or more
  // bytes of the MD5.
  int hash_byte = 0;
  for (int i = 0; i < num_hashes; ++i) {
    int bit_to_set = 0;
    for (int j = 0; j < bytes_needed; ++j) {
      bit_to_set |= hash_output[hash_byte] << (j * 8);
      ++hash_byte;
    }
    bit_to_set %= num_bits;
    // Start at end of array to be consistent with the Bits implementation.
    int index = (bloom_out->size() - 1) - (bit_to_set / 8);
    (*bloom_out)[index] |= 1 << (bit_to_set % 8);
  }
  return true;
}

// Helper method for PRR
bool Encoder::GetPrrMasks(const Bits bits, Bits* uniform_out,
                          Bits* f_mask_out) const {
  // Create HMAC(secret, value), and use its bits to construct f_mask and
  // uniform bits.
  std::vector<uint8_t> sha256;

  std::string hmac_value = kHmacPrrPrefix + encoder_id_ + ToBigEndian(bits);

  deps_.hmac_func_(deps_.client_secret_, hmac_value, &sha256);
  if (sha256.size() != kMaxBits) {  // sanity check
    return false;
  }

  // We should have already checked this.
  if (params_.num_bits_ > kMaxBits) {
    log("num_bits exceeds maximum.");
    assert(false);
  }

  uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);

  Bits uniform = 0;
  Bits f_mask = 0;

  for (int i = 0; i < params_.num_bits_; ++i) {
    uint8_t byte = sha256[i];

    uint8_t u_bit = byte & 0x01;  // 1 bit of entropy
    uniform |= (u_bit << i);  // maybe set bit in mask

    uint8_t rand128 = byte >> 1;  // 7 bits of entropy
    uint8_t noise_bit = (rand128 < threshold128);
    f_mask |= (noise_bit << i);  // maybe set bit in mask
  }

  *uniform_out = uniform;
  *f_mask_out = f_mask;
  return true;
}

bool Encoder::_EncodeBitsInternal(const Bits bits, Bits* prr_out,
                                  Bits* irr_out) const {
  // Compute Permanent Randomized Response (PRR).
  Bits uniform;
  Bits f_mask;
  if (!GetPrrMasks(bits, &uniform, &f_mask)) {
    log("GetPrrMasks failed");
    return false;
  }

  Bits prr = (bits & ~f_mask) | (uniform & f_mask);
  *prr_out = prr;

  // Compute Instantaneous Randomized Response (IRR).

  // NOTE: These can fail if say a read() from /dev/urandom fails.
  Bits p_bits;
  Bits q_bits;
  if (!deps_.irr_rand_.GetMask(params_.prob_p_, params_.num_bits_, &p_bits)) {
    log("PMask failed");
    return false;
  }
  if (!deps_.irr_rand_.GetMask(params_.prob_q_, params_.num_bits_, &q_bits)) {
    log("QMask failed");
    return false;
  };

  Bits irr = (p_bits & ~prr) | (q_bits & prr);
  *irr_out = irr;

  return true;
}

bool Encoder::_EncodeStringInternal(const std::string& value, Bits* bloom_out,
    Bits* prr_out, Bits* irr_out) const {
  if (!MakeBloomFilter(value, bloom_out)) {
    log("Bloom filter calculation failed");
    return false;
  }
  return _EncodeBitsInternal(*bloom_out, prr_out, irr_out);
}

bool Encoder::EncodeBits(const Bits bits, Bits* irr_out) const {
  Bits unused_prr;
  return _EncodeBitsInternal(bits, &unused_prr, irr_out);
}

bool Encoder::EncodeString(const std::string& value, Bits* irr_out) const {
  Bits unused_bloom;
  Bits unused_prr;
  return _EncodeStringInternal(value, &unused_bloom, &unused_prr, irr_out);
}

static uint8_t shifted(const Bits& bits, const int& index) {
  // For an array of bytes, select the appopriate byte from a 4-byte
  // integer value. Bytes are enumerated in big-endian order, i.e.
  // index = 0 is the MSB, index = 3 is the LSB.
  int shift = 8 * (3 - (index % 4)); // Byte 0 shifts by 24 bits, 1 by 16, etc.
  return (uint8_t)((bits >> shift) & 0xFF);  // Return the correct byte.
}

bool Encoder::EncodeString(const std::string& value,
                           std::vector<uint8_t>* irr_out) const {
  std::vector<uint8_t> bloom_out;
  std::vector<uint8_t> hmac_out;
  std::vector<uint8_t> uniform;
  std::vector<uint8_t> f_mask;
  const int num_bits = params_.num_bits_;

  uniform.resize(num_bits / 8, 0);
  f_mask.resize(num_bits / 8, 0);
  irr_out->resize(num_bits / 8, 0);

  // Set bloom_out.
  if (!MakeBloomFilter(value, &bloom_out)) {
    log("Bloom filter calculation failed");
    return false;
  }

  // Set hmac_out.
  hmac_out.resize(num_bits);  // Signal to HmacDrbg about desired output size.
  // Call HmacDrbg
  std::string hmac_value =  kHmacPrrPrefix + encoder_id_;
  for (int i = 0; i < bloom_out.size(); ++i) {
    hmac_value.append(reinterpret_cast<char *>(&bloom_out[i]), 1);
  }
  deps_.hmac_func_(deps_.client_secret_, hmac_value, &hmac_out);
  if (hmac_out.size() != num_bits) {
    log("Needed %d bytes from Hmac function, received %d bytes.",
        num_bits, hmac_out.size());
    return false;
  }

  // We'll be using 7 bits of each byte of the MAC as our random
  // number for the f_mask.
  uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);

  // Construct uniform and f_mask bitwise.
  for (int i = 0; i < num_bits; i++) {
    uint8_t byte = hmac_out[i];
    uint8_t u_bit = byte & 0x01;  // 1 bit of entropy.
    int vector_index = (num_bits - 1 - i) / 8;
    uint8_t rand128 = byte >> 1;  // 7 bits of entropy.
    uint8_t noise_bit = (rand128 < threshold128);
    uniform[vector_index] |= (u_bit << (i % 8));
    f_mask[vector_index] |= (noise_bit << (i % 8));
  }

  for (int i = 0; i < bloom_out.size(); i++) {
    Bits p_bits;
    Bits q_bits;
    uint8_t prr;
    prr = (bloom_out[i] & ~f_mask[i]) | (uniform[i] & f_mask[i]);
    // GetMask operates on Uint32, so we generate a new p_bits every 4
    // bytes, and use each of its bytes once.
    if (i % 4 == 0) {
      // Need new p_bits, q_bits values to work with.
      if (!deps_.irr_rand_.GetMask(params_.prob_p_, 32, &p_bits)) {
        log("PMask failed");
        return false;
      }
      if (!deps_.irr_rand_.GetMask(params_.prob_q_, 32, &q_bits)) {
        log("QMask failed");
        return false;
      }
    }
    (*irr_out)[i] = (shifted(p_bits, i) & ~prr)
        | (shifted(q_bits, i) & prr);
  }
}

void Encoder::set_cohort(uint32_t cohort) {
  cohort_ = cohort;
  cohort_str_ = ToBigEndian(cohort_);
}

}  // namespace rappor