1 // Copyright 2015 Google Inc. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "encoder.h"
16 #include "openssl_hash_impl.h"
17
18 #include <assert.h>
19 #include <stdio.h>
20 #include <stdarg.h> // va_list, etc.
21 #include <vector>
22
23 namespace rappor {
24
log(const char * fmt,...)25 void log(const char* fmt, ...) {
26 va_list args;
27 va_start(args, fmt);
28 vfprintf(stderr, fmt, args);
29 va_end(args);
30 fprintf(stderr, "\n");
31 }
32
33 //
34 // Functions for debugging
35 //
36
PrintHex(const std::vector<uint8_t> & h)37 static void PrintHex(const std::vector<uint8_t>& h) {
38 for (size_t i = 0; i < h.size(); ++i) {
39 fprintf(stderr, "%02x", h[i]);
40 }
41 fprintf(stderr, "\n");
42 }
43
44 // We use 1 *byte* of a HMAC-SHA256 value per BIT to generate the PRR. SHA256
45 // has 32 bytes, so the max is 32 bits.
46 static const int kMaxBits = 32;
47
48 // Can't be more than the number of bytes in MD5.
49 static const int kMaxHashes = 16;
50
51 // Probabilities should be in the interval [0.0, 1.0].
CheckValidProbability(float prob,const char * var_name)52 static void CheckValidProbability(float prob, const char* var_name) {
53 if (prob < 0.0f || prob > 1.0f) {
54 log("%s should be between 0.0 and 1.0 inclusive (got %.2f)", var_name,
55 prob);
56 assert(false);
57 }
58 }
59
60 // Used to 1) turn cohort into a string, and 2) Turn raw bits into a string.
61 // Return by value since it's small.
ToBigEndian(uint32_t u)62 static std::string ToBigEndian(uint32_t u) {
63 std::string result(4, '\0');
64
65 // rely on truncation to char
66 result[0] = u >> 24;
67 result[1] = u >> 16;
68 result[2] = u >> 8;
69 result[3] = u;
70
71 return result;
72 }
73
74 static const char* kHmacCohortPrefix = "\x00";
75 static const char* kHmacPrrPrefix = "\x01";
76
77
78 //
79 // Encoder
80 //
81
AssignCohort(const Deps & deps,int num_cohorts)82 uint32_t Encoder::AssignCohort(const Deps& deps, int num_cohorts) {
83 std::vector<uint8_t> sha256;
84 if (!deps.hmac_func_(deps.client_secret_, kHmacCohortPrefix, &sha256)) {
85 log("HMAC failed");
86 assert(false);
87 }
88
89 // Either we are using SHA256 to have exactly 32 bytes,
90 // or we're using HmacDrbg for any number of bytes.
91 if ((sha256.size() == kMaxBits)
92 || (deps.hmac_func_ == rappor::HmacDrbg)) {
93 // Hash size ok.
94 } else {
95 log("Bad hash size.");
96 assert(false);
97 }
98
99 // Interpret first 4 bytes of sha256 as a uint32_t.
100 uint32_t c = *(reinterpret_cast<uint32_t*>(sha256.data()));
101 // e.g. for 128 cohorts, 0x80 - 1 = 0x7f
102 uint32_t cohort_mask = num_cohorts - 1;
103 return c & cohort_mask;
104 }
105
Encoder(const std::string & encoder_id,const Params & params,const Deps & deps)106 Encoder::Encoder(const std::string& encoder_id, const Params& params,
107 const Deps& deps)
108 : encoder_id_(encoder_id),
109 params_(params),
110 deps_(deps),
111 cohort_(AssignCohort(deps, params.num_cohorts_)),
112 cohort_str_(ToBigEndian(cohort_)) {
113
114 if (params_.num_bits_ <= 0) {
115 log("num_bits must be positive");
116 assert(false);
117 }
118 if (params_.num_hashes_ <= 0) {
119 log("num_hashes must be positive");
120 assert(false);
121 }
122 if (params_.num_cohorts_ <= 0) {
123 log("num_cohorts must be positive");
124 assert(false);
125 }
126
127 // Check Maximum values.
128 if (deps_.hmac_func_ == rappor::HmacDrbg) {
129 // Using HmacDrbg
130 if (params_.num_bits_ % 8 != 0) {
131 log("num_bits (%d) must be divisible by 8 when using HmacDrbg.",
132 params.num_bits_);
133 assert(false);
134 }
135 } else {
136 // Using SHA256
137 if (params_.num_bits_ > kMaxBits) {
138 log("num_bits (%d) can't be greater than %d", params_.num_bits_,
139 kMaxBits);
140 assert(false);
141 }
142 }
143
144 if (params_.num_hashes_ > kMaxHashes) {
145 log("num_hashes (%d) can't be greater than %d", params_.num_hashes_,
146 kMaxHashes);
147 assert(false);
148 }
149 int m = params_.num_cohorts_;
150 if ((m & (m - 1)) != 0) {
151 log("num_cohorts (%d) must be a power of 2 (and not 0)", m);
152 assert(false);
153 }
154 // TODO: check max cohorts?
155
156 CheckValidProbability(params_.prob_f_, "prob_f");
157 CheckValidProbability(params_.prob_p_, "prob_p");
158 CheckValidProbability(params_.prob_q_, "prob_q");
159 }
160
MakeBloomFilter(const std::string & value,Bits * bloom_out) const161 bool Encoder::MakeBloomFilter(const std::string& value, Bits* bloom_out) const {
162 const int num_bits = params_.num_bits_;
163 const int num_hashes = params_.num_hashes_;
164
165 Bits bloom = 0;
166
167 // 4 byte cohort string + true value
168 std::string hash_input(cohort_str_ + value);
169
170 // First do hashing.
171 std::vector<uint8_t> hash_output;
172 deps_.hash_func_(hash_input, &hash_output);
173
174 // Error check
175 if (hash_output.size() < static_cast<size_t>(num_hashes)) {
176 log("Hash function didn't return enough bytes");
177 return false;
178 }
179
180 // To determine which bit to set in the bloom filter, use a byte of the MD5.
181 for (int i = 0; i < num_hashes; ++i) {
182 int bit_to_set = hash_output[i] % num_bits;
183 bloom |= 1 << bit_to_set;
184 }
185
186 *bloom_out = bloom;
187 return true;
188 }
189
190 // Write a Bloom filter into a vector of bytes, used for num_bits > 32.
MakeBloomFilter(const std::string & value,std::vector<uint8_t> * bloom_out) const191 bool Encoder::MakeBloomFilter(const std::string& value,
192 std::vector<uint8_t>* bloom_out) const {
193 const int num_bits = params_.num_bits_;
194 const int num_hashes = params_.num_hashes_;
195
196 bloom_out->resize(params_.num_bits_ / 8, 0);
197
198 // Generate the hash.
199 std::vector<uint8_t> hash_output;
200 deps_.hash_func_(std::string(cohort_str_ + value), &hash_output);
201
202 // Check that we have enough bytes of hash available.
203 int exponent = 0;
204 int bytes_needed = 0;
205 while ((1 << exponent) < num_bits) {
206 exponent++;
207 }
208 bytes_needed = ((exponent - 1) / 8) + 1;
209 if (bytes_needed > 4) {
210 log("Can only use 4 bytes of hash at a time, needed %d "
211 "to address %d bits.", bytes_needed, num_bits);
212 return false;
213 }
214 if (hash_output.size() < static_cast<size_t>(bytes_needed * num_hashes)) {
215 log("Hash function returned %d bytes, but we needed "
216 "%d bytes * %d hashes. Choose lower num_hashes or "
217 "a different hash function.",
218 hash_output.size(), bytes_needed, num_hashes);
219 return false;
220 }
221
222 // To determine which bit to set in the Bloom filter, use 1 or more
223 // bytes of the MD5.
224 int hash_byte = 0;
225 for (int i = 0; i < num_hashes; ++i) {
226 int bit_to_set = 0;
227 for (int j = 0; j < bytes_needed; ++j) {
228 bit_to_set |= hash_output[hash_byte] << (j * 8);
229 ++hash_byte;
230 }
231 bit_to_set %= num_bits;
232 // Start at end of array to be consistent with the Bits implementation.
233 int index = (bloom_out->size() - 1) - (bit_to_set / 8);
234 (*bloom_out)[index] |= 1 << (bit_to_set % 8);
235 }
236 return true;
237 }
238
239 // Helper method for PRR
GetPrrMasks(const Bits bits,Bits * uniform_out,Bits * f_mask_out) const240 bool Encoder::GetPrrMasks(const Bits bits, Bits* uniform_out,
241 Bits* f_mask_out) const {
242 // Create HMAC(secret, value), and use its bits to construct f_mask and
243 // uniform bits.
244 std::vector<uint8_t> sha256;
245
246 std::string hmac_value = kHmacPrrPrefix + encoder_id_ + ToBigEndian(bits);
247
248 deps_.hmac_func_(deps_.client_secret_, hmac_value, &sha256);
249 if (sha256.size() != kMaxBits) { // sanity check
250 return false;
251 }
252
253 // We should have already checked this.
254 if (params_.num_bits_ > kMaxBits) {
255 log("num_bits exceeds maximum.");
256 assert(false);
257 }
258
259 uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
260
261 Bits uniform = 0;
262 Bits f_mask = 0;
263
264 for (int i = 0; i < params_.num_bits_; ++i) {
265 uint8_t byte = sha256[i];
266
267 uint8_t u_bit = byte & 0x01; // 1 bit of entropy
268 uniform |= (u_bit << i); // maybe set bit in mask
269
270 uint8_t rand128 = byte >> 1; // 7 bits of entropy
271 uint8_t noise_bit = (rand128 < threshold128);
272 f_mask |= (noise_bit << i); // maybe set bit in mask
273 }
274
275 *uniform_out = uniform;
276 *f_mask_out = f_mask;
277 return true;
278 }
279
_EncodeBitsInternal(const Bits bits,Bits * prr_out,Bits * irr_out) const280 bool Encoder::_EncodeBitsInternal(const Bits bits, Bits* prr_out,
281 Bits* irr_out) const {
282 // Compute Permanent Randomized Response (PRR).
283 Bits uniform;
284 Bits f_mask;
285 if (!GetPrrMasks(bits, &uniform, &f_mask)) {
286 log("GetPrrMasks failed");
287 return false;
288 }
289
290 Bits prr = (bits & ~f_mask) | (uniform & f_mask);
291 *prr_out = prr;
292
293 // Compute Instantaneous Randomized Response (IRR).
294
295 // NOTE: These can fail if say a read() from /dev/urandom fails.
296 Bits p_bits;
297 Bits q_bits;
298 if (!deps_.irr_rand_.GetMask(params_.prob_p_, params_.num_bits_, &p_bits)) {
299 log("PMask failed");
300 return false;
301 }
302 if (!deps_.irr_rand_.GetMask(params_.prob_q_, params_.num_bits_, &q_bits)) {
303 log("QMask failed");
304 return false;
305 };
306
307 Bits irr = (p_bits & ~prr) | (q_bits & prr);
308 *irr_out = irr;
309
310 return true;
311 }
312
_EncodeStringInternal(const std::string & value,Bits * bloom_out,Bits * prr_out,Bits * irr_out) const313 bool Encoder::_EncodeStringInternal(const std::string& value, Bits* bloom_out,
314 Bits* prr_out, Bits* irr_out) const {
315 if (!MakeBloomFilter(value, bloom_out)) {
316 log("Bloom filter calculation failed");
317 return false;
318 }
319 return _EncodeBitsInternal(*bloom_out, prr_out, irr_out);
320 }
321
EncodeBits(const Bits bits,Bits * irr_out) const322 bool Encoder::EncodeBits(const Bits bits, Bits* irr_out) const {
323 Bits unused_prr;
324 return _EncodeBitsInternal(bits, &unused_prr, irr_out);
325 }
326
EncodeString(const std::string & value,Bits * irr_out) const327 bool Encoder::EncodeString(const std::string& value, Bits* irr_out) const {
328 Bits unused_bloom;
329 Bits unused_prr;
330 return _EncodeStringInternal(value, &unused_bloom, &unused_prr, irr_out);
331 }
332
shifted(const Bits & bits,const int & index)333 static uint8_t shifted(const Bits& bits, const int& index) {
334 // For an array of bytes, select the appopriate byte from a 4-byte
335 // integer value. Bytes are enumerated in big-endian order, i.e.
336 // index = 0 is the MSB, index = 3 is the LSB.
337 int shift = 8 * (3 - (index % 4)); // Byte 0 shifts by 24 bits, 1 by 16, etc.
338 return (uint8_t)((bits >> shift) & 0xFF); // Return the correct byte.
339 }
340
EncodeString(const std::string & value,std::vector<uint8_t> * irr_out) const341 bool Encoder::EncodeString(const std::string& value,
342 std::vector<uint8_t>* irr_out) const {
343 std::vector<uint8_t> bloom_out;
344 std::vector<uint8_t> hmac_out;
345 std::vector<uint8_t> uniform;
346 std::vector<uint8_t> f_mask;
347 const int num_bits = params_.num_bits_;
348
349 uniform.resize(num_bits / 8, 0);
350 f_mask.resize(num_bits / 8, 0);
351 irr_out->resize(num_bits / 8, 0);
352
353 // Set bloom_out.
354 if (!MakeBloomFilter(value, &bloom_out)) {
355 log("Bloom filter calculation failed");
356 return false;
357 }
358
359 // Set hmac_out.
360 hmac_out.resize(num_bits); // Signal to HmacDrbg about desired output size.
361 // Call HmacDrbg
362 std::string hmac_value = kHmacPrrPrefix + encoder_id_;
363 for (int i = 0; i < bloom_out.size(); ++i) {
364 hmac_value.append(reinterpret_cast<char *>(&bloom_out[i]), 1);
365 }
366 deps_.hmac_func_(deps_.client_secret_, hmac_value, &hmac_out);
367 if (hmac_out.size() != num_bits) {
368 log("Needed %d bytes from Hmac function, received %d bytes.",
369 num_bits, hmac_out.size());
370 return false;
371 }
372
373 // We'll be using 7 bits of each byte of the MAC as our random
374 // number for the f_mask.
375 uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
376
377 // Construct uniform and f_mask bitwise.
378 for (int i = 0; i < num_bits; i++) {
379 uint8_t byte = hmac_out[i];
380 uint8_t u_bit = byte & 0x01; // 1 bit of entropy.
381 int vector_index = (num_bits - 1 - i) / 8;
382 uint8_t rand128 = byte >> 1; // 7 bits of entropy.
383 uint8_t noise_bit = (rand128 < threshold128);
384 uniform[vector_index] |= (u_bit << (i % 8));
385 f_mask[vector_index] |= (noise_bit << (i % 8));
386 }
387
388 for (int i = 0; i < bloom_out.size(); i++) {
389 Bits p_bits;
390 Bits q_bits;
391 uint8_t prr;
392 prr = (bloom_out[i] & ~f_mask[i]) | (uniform[i] & f_mask[i]);
393 // GetMask operates on Uint32, so we generate a new p_bits every 4
394 // bytes, and use each of its bytes once.
395 if (i % 4 == 0) {
396 // Need new p_bits, q_bits values to work with.
397 if (!deps_.irr_rand_.GetMask(params_.prob_p_, 32, &p_bits)) {
398 log("PMask failed");
399 return false;
400 }
401 if (!deps_.irr_rand_.GetMask(params_.prob_q_, 32, &q_bits)) {
402 log("QMask failed");
403 return false;
404 }
405 }
406 (*irr_out)[i] = (shifted(p_bits, i) & ~prr)
407 | (shifted(q_bits, i) & prr);
408 }
409 }
410
set_cohort(uint32_t cohort)411 void Encoder::set_cohort(uint32_t cohort) {
412 cohort_ = cohort;
413 cohort_str_ = ToBigEndian(cohort_);
414 }
415
416 } // namespace rappor
417