xref: /aosp_15_r20/external/rappor/client/cpp/encoder.cc (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1 // Copyright 2015 Google Inc. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "encoder.h"
16 #include "openssl_hash_impl.h"
17 
18 #include <assert.h>
19 #include <stdio.h>
20 #include <stdarg.h>  // va_list, etc.
21 #include <vector>
22 
23 namespace rappor {
24 
log(const char * fmt,...)25 void log(const char* fmt, ...) {
26   va_list args;
27   va_start(args, fmt);
28   vfprintf(stderr, fmt, args);
29   va_end(args);
30   fprintf(stderr, "\n");
31 }
32 
33 //
34 // Functions for debugging
35 //
36 
PrintHex(const std::vector<uint8_t> & h)37 static void PrintHex(const std::vector<uint8_t>& h) {
38   for (size_t i = 0; i < h.size(); ++i) {
39     fprintf(stderr, "%02x", h[i]);
40   }
41   fprintf(stderr, "\n");
42 }
43 
44 // We use 1 *byte* of a HMAC-SHA256 value per BIT to generate the PRR.  SHA256
45 // has 32 bytes, so the max is 32 bits.
46 static const int kMaxBits = 32;
47 
48 // Can't be more than the number of bytes in MD5.
49 static const int kMaxHashes = 16;
50 
51 // Probabilities should be in the interval [0.0, 1.0].
CheckValidProbability(float prob,const char * var_name)52 static void CheckValidProbability(float prob, const char* var_name) {
53   if (prob < 0.0f || prob > 1.0f) {
54     log("%s should be between 0.0 and 1.0 inclusive (got %.2f)", var_name,
55         prob);
56     assert(false);
57   }
58 }
59 
60 // Used to 1) turn cohort into a string, and 2) Turn raw bits into a string.
61 // Return by value since it's small.
ToBigEndian(uint32_t u)62 static std::string ToBigEndian(uint32_t u) {
63   std::string result(4, '\0');
64 
65   // rely on truncation to char
66   result[0] = u >> 24;
67   result[1] = u >> 16;
68   result[2] = u >> 8;
69   result[3] = u;
70 
71   return result;
72 }
73 
74 static const char* kHmacCohortPrefix = "\x00";
75 static const char* kHmacPrrPrefix = "\x01";
76 
77 
78 //
79 // Encoder
80 //
81 
AssignCohort(const Deps & deps,int num_cohorts)82 uint32_t Encoder::AssignCohort(const Deps& deps, int num_cohorts) {
83   std::vector<uint8_t> sha256;
84   if (!deps.hmac_func_(deps.client_secret_, kHmacCohortPrefix, &sha256)) {
85     log("HMAC failed");
86     assert(false);
87   }
88 
89   // Either we are using SHA256 to have exactly 32 bytes,
90   // or we're using HmacDrbg for any number of bytes.
91   if ((sha256.size() == kMaxBits)
92       || (deps.hmac_func_ == rappor::HmacDrbg)) {
93     // Hash size ok.
94   } else {
95     log("Bad hash size.");
96     assert(false);
97   }
98 
99   // Interpret first 4 bytes of sha256 as a uint32_t.
100   uint32_t c = *(reinterpret_cast<uint32_t*>(sha256.data()));
101   // e.g. for 128 cohorts, 0x80 - 1 = 0x7f
102   uint32_t cohort_mask = num_cohorts - 1;
103   return c & cohort_mask;
104 }
105 
Encoder(const std::string & encoder_id,const Params & params,const Deps & deps)106 Encoder::Encoder(const std::string& encoder_id, const Params& params,
107                  const Deps& deps)
108     : encoder_id_(encoder_id),
109       params_(params),
110       deps_(deps),
111       cohort_(AssignCohort(deps, params.num_cohorts_)),
112       cohort_str_(ToBigEndian(cohort_)) {
113 
114   if (params_.num_bits_ <= 0) {
115     log("num_bits must be positive");
116     assert(false);
117   }
118   if (params_.num_hashes_ <= 0) {
119     log("num_hashes must be positive");
120     assert(false);
121   }
122   if (params_.num_cohorts_ <= 0) {
123     log("num_cohorts must be positive");
124     assert(false);
125   }
126 
127   // Check Maximum values.
128   if (deps_.hmac_func_ == rappor::HmacDrbg) {
129     // Using HmacDrbg
130     if (params_.num_bits_ % 8 != 0) {
131       log("num_bits (%d) must be divisible by 8 when using HmacDrbg.",
132           params.num_bits_);
133       assert(false);
134     }
135   } else {
136     // Using SHA256
137     if (params_.num_bits_ > kMaxBits) {
138         log("num_bits (%d) can't be greater than %d", params_.num_bits_,
139             kMaxBits);
140         assert(false);
141     }
142   }
143 
144   if (params_.num_hashes_ > kMaxHashes) {
145     log("num_hashes (%d) can't be greater than %d", params_.num_hashes_,
146         kMaxHashes);
147     assert(false);
148   }
149   int m = params_.num_cohorts_;
150   if ((m & (m - 1)) != 0) {
151     log("num_cohorts (%d) must be a power of 2 (and not 0)", m);
152     assert(false);
153   }
154   // TODO: check max cohorts?
155 
156   CheckValidProbability(params_.prob_f_, "prob_f");
157   CheckValidProbability(params_.prob_p_, "prob_p");
158   CheckValidProbability(params_.prob_q_, "prob_q");
159 }
160 
MakeBloomFilter(const std::string & value,Bits * bloom_out) const161 bool Encoder::MakeBloomFilter(const std::string& value, Bits* bloom_out) const {
162   const int num_bits = params_.num_bits_;
163   const int num_hashes = params_.num_hashes_;
164 
165   Bits bloom = 0;
166 
167   // 4 byte cohort string + true value
168   std::string hash_input(cohort_str_ + value);
169 
170   // First do hashing.
171   std::vector<uint8_t> hash_output;
172   deps_.hash_func_(hash_input, &hash_output);
173 
174   // Error check
175   if (hash_output.size() < static_cast<size_t>(num_hashes)) {
176     log("Hash function didn't return enough bytes");
177     return false;
178   }
179 
180   // To determine which bit to set in the bloom filter, use a byte of the MD5.
181   for (int i = 0; i < num_hashes; ++i) {
182     int bit_to_set = hash_output[i] % num_bits;
183     bloom |= 1 << bit_to_set;
184   }
185 
186   *bloom_out = bloom;
187   return true;
188 }
189 
190 // Write a Bloom filter into a vector of bytes, used for num_bits > 32.
MakeBloomFilter(const std::string & value,std::vector<uint8_t> * bloom_out) const191 bool Encoder::MakeBloomFilter(const std::string& value,
192                               std::vector<uint8_t>* bloom_out) const {
193   const int num_bits = params_.num_bits_;
194   const int num_hashes = params_.num_hashes_;
195 
196   bloom_out->resize(params_.num_bits_ / 8, 0);
197 
198   // Generate the hash.
199   std::vector<uint8_t> hash_output;
200   deps_.hash_func_(std::string(cohort_str_ + value), &hash_output);
201 
202   // Check that we have enough bytes of hash available.
203   int exponent = 0;
204   int bytes_needed = 0;
205   while ((1 << exponent) < num_bits) {
206     exponent++;
207   }
208   bytes_needed = ((exponent - 1) / 8) + 1;
209   if (bytes_needed > 4) {
210     log("Can only use 4 bytes of hash at a time, needed %d "
211         "to address %d bits.", bytes_needed, num_bits);
212     return false;
213   }
214   if (hash_output.size() < static_cast<size_t>(bytes_needed * num_hashes)) {
215     log("Hash function returned %d bytes, but we needed "
216         "%d bytes * %d hashes. Choose lower num_hashes or "
217         "a different hash function.",
218         hash_output.size(), bytes_needed, num_hashes);
219     return false;
220   }
221 
222   // To determine which bit to set in the Bloom filter, use 1 or more
223   // bytes of the MD5.
224   int hash_byte = 0;
225   for (int i = 0; i < num_hashes; ++i) {
226     int bit_to_set = 0;
227     for (int j = 0; j < bytes_needed; ++j) {
228       bit_to_set |= hash_output[hash_byte] << (j * 8);
229       ++hash_byte;
230     }
231     bit_to_set %= num_bits;
232     // Start at end of array to be consistent with the Bits implementation.
233     int index = (bloom_out->size() - 1) - (bit_to_set / 8);
234     (*bloom_out)[index] |= 1 << (bit_to_set % 8);
235   }
236   return true;
237 }
238 
239 // Helper method for PRR
GetPrrMasks(const Bits bits,Bits * uniform_out,Bits * f_mask_out) const240 bool Encoder::GetPrrMasks(const Bits bits, Bits* uniform_out,
241                           Bits* f_mask_out) const {
242   // Create HMAC(secret, value), and use its bits to construct f_mask and
243   // uniform bits.
244   std::vector<uint8_t> sha256;
245 
246   std::string hmac_value = kHmacPrrPrefix + encoder_id_ + ToBigEndian(bits);
247 
248   deps_.hmac_func_(deps_.client_secret_, hmac_value, &sha256);
249   if (sha256.size() != kMaxBits) {  // sanity check
250     return false;
251   }
252 
253   // We should have already checked this.
254   if (params_.num_bits_ > kMaxBits) {
255     log("num_bits exceeds maximum.");
256     assert(false);
257   }
258 
259   uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
260 
261   Bits uniform = 0;
262   Bits f_mask = 0;
263 
264   for (int i = 0; i < params_.num_bits_; ++i) {
265     uint8_t byte = sha256[i];
266 
267     uint8_t u_bit = byte & 0x01;  // 1 bit of entropy
268     uniform |= (u_bit << i);  // maybe set bit in mask
269 
270     uint8_t rand128 = byte >> 1;  // 7 bits of entropy
271     uint8_t noise_bit = (rand128 < threshold128);
272     f_mask |= (noise_bit << i);  // maybe set bit in mask
273   }
274 
275   *uniform_out = uniform;
276   *f_mask_out = f_mask;
277   return true;
278 }
279 
_EncodeBitsInternal(const Bits bits,Bits * prr_out,Bits * irr_out) const280 bool Encoder::_EncodeBitsInternal(const Bits bits, Bits* prr_out,
281                                   Bits* irr_out) const {
282   // Compute Permanent Randomized Response (PRR).
283   Bits uniform;
284   Bits f_mask;
285   if (!GetPrrMasks(bits, &uniform, &f_mask)) {
286     log("GetPrrMasks failed");
287     return false;
288   }
289 
290   Bits prr = (bits & ~f_mask) | (uniform & f_mask);
291   *prr_out = prr;
292 
293   // Compute Instantaneous Randomized Response (IRR).
294 
295   // NOTE: These can fail if say a read() from /dev/urandom fails.
296   Bits p_bits;
297   Bits q_bits;
298   if (!deps_.irr_rand_.GetMask(params_.prob_p_, params_.num_bits_, &p_bits)) {
299     log("PMask failed");
300     return false;
301   }
302   if (!deps_.irr_rand_.GetMask(params_.prob_q_, params_.num_bits_, &q_bits)) {
303     log("QMask failed");
304     return false;
305   };
306 
307   Bits irr = (p_bits & ~prr) | (q_bits & prr);
308   *irr_out = irr;
309 
310   return true;
311 }
312 
_EncodeStringInternal(const std::string & value,Bits * bloom_out,Bits * prr_out,Bits * irr_out) const313 bool Encoder::_EncodeStringInternal(const std::string& value, Bits* bloom_out,
314     Bits* prr_out, Bits* irr_out) const {
315   if (!MakeBloomFilter(value, bloom_out)) {
316     log("Bloom filter calculation failed");
317     return false;
318   }
319   return _EncodeBitsInternal(*bloom_out, prr_out, irr_out);
320 }
321 
EncodeBits(const Bits bits,Bits * irr_out) const322 bool Encoder::EncodeBits(const Bits bits, Bits* irr_out) const {
323   Bits unused_prr;
324   return _EncodeBitsInternal(bits, &unused_prr, irr_out);
325 }
326 
EncodeString(const std::string & value,Bits * irr_out) const327 bool Encoder::EncodeString(const std::string& value, Bits* irr_out) const {
328   Bits unused_bloom;
329   Bits unused_prr;
330   return _EncodeStringInternal(value, &unused_bloom, &unused_prr, irr_out);
331 }
332 
shifted(const Bits & bits,const int & index)333 static uint8_t shifted(const Bits& bits, const int& index) {
334   // For an array of bytes, select the appopriate byte from a 4-byte
335   // integer value. Bytes are enumerated in big-endian order, i.e.
336   // index = 0 is the MSB, index = 3 is the LSB.
337   int shift = 8 * (3 - (index % 4)); // Byte 0 shifts by 24 bits, 1 by 16, etc.
338   return (uint8_t)((bits >> shift) & 0xFF);  // Return the correct byte.
339 }
340 
EncodeString(const std::string & value,std::vector<uint8_t> * irr_out) const341 bool Encoder::EncodeString(const std::string& value,
342                            std::vector<uint8_t>* irr_out) const {
343   std::vector<uint8_t> bloom_out;
344   std::vector<uint8_t> hmac_out;
345   std::vector<uint8_t> uniform;
346   std::vector<uint8_t> f_mask;
347   const int num_bits = params_.num_bits_;
348 
349   uniform.resize(num_bits / 8, 0);
350   f_mask.resize(num_bits / 8, 0);
351   irr_out->resize(num_bits / 8, 0);
352 
353   // Set bloom_out.
354   if (!MakeBloomFilter(value, &bloom_out)) {
355     log("Bloom filter calculation failed");
356     return false;
357   }
358 
359   // Set hmac_out.
360   hmac_out.resize(num_bits);  // Signal to HmacDrbg about desired output size.
361   // Call HmacDrbg
362   std::string hmac_value =  kHmacPrrPrefix + encoder_id_;
363   for (int i = 0; i < bloom_out.size(); ++i) {
364     hmac_value.append(reinterpret_cast<char *>(&bloom_out[i]), 1);
365   }
366   deps_.hmac_func_(deps_.client_secret_, hmac_value, &hmac_out);
367   if (hmac_out.size() != num_bits) {
368     log("Needed %d bytes from Hmac function, received %d bytes.",
369         num_bits, hmac_out.size());
370     return false;
371   }
372 
373   // We'll be using 7 bits of each byte of the MAC as our random
374   // number for the f_mask.
375   uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
376 
377   // Construct uniform and f_mask bitwise.
378   for (int i = 0; i < num_bits; i++) {
379     uint8_t byte = hmac_out[i];
380     uint8_t u_bit = byte & 0x01;  // 1 bit of entropy.
381     int vector_index = (num_bits - 1 - i) / 8;
382     uint8_t rand128 = byte >> 1;  // 7 bits of entropy.
383     uint8_t noise_bit = (rand128 < threshold128);
384     uniform[vector_index] |= (u_bit << (i % 8));
385     f_mask[vector_index] |= (noise_bit << (i % 8));
386   }
387 
388   for (int i = 0; i < bloom_out.size(); i++) {
389     Bits p_bits;
390     Bits q_bits;
391     uint8_t prr;
392     prr = (bloom_out[i] & ~f_mask[i]) | (uniform[i] & f_mask[i]);
393     // GetMask operates on Uint32, so we generate a new p_bits every 4
394     // bytes, and use each of its bytes once.
395     if (i % 4 == 0) {
396       // Need new p_bits, q_bits values to work with.
397       if (!deps_.irr_rand_.GetMask(params_.prob_p_, 32, &p_bits)) {
398         log("PMask failed");
399         return false;
400       }
401       if (!deps_.irr_rand_.GetMask(params_.prob_q_, 32, &q_bits)) {
402         log("QMask failed");
403         return false;
404       }
405     }
406     (*irr_out)[i] = (shifted(p_bits, i) & ~prr)
407         | (shifted(q_bits, i) & prr);
408   }
409 }
410 
set_cohort(uint32_t cohort)411 void Encoder::set_cohort(uint32_t cohort) {
412   cohort_ = cohort;
413   cohort_str_ = ToBigEndian(cohort_);
414 }
415 
416 }  // namespace rappor
417