1# Copyright 2014 Google Inc. All rights reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14 15"""RAPPOR client library. 16 17Privacy is ensured without a third party by only sending RAPPOR'd data over the 18network (as opposed to raw client data). 19 20Note that we use MD5 for the Bloom filter hash function. (security not required) 21""" 22import csv 23import hashlib 24import hmac 25import json 26import struct 27import sys 28 29from random import SystemRandom 30 31class Error(Exception): 32 pass 33 34 35def log(msg, *args): 36 if args: 37 msg = msg % args 38 print >>sys.stderr, msg 39 40 41class Params(object): 42 """RAPPOR encoding parameters. 43 44 These affect privacy/anonymity. See the paper for details. 45 """ 46 def __init__(self): 47 self.num_bloombits = 16 # Number of bloom filter bits (k) 48 self.num_hashes = 2 # Number of bloom filter hashes (h) 49 self.num_cohorts = 64 # Number of cohorts (m) 50 self.prob_p = 0.50 # Probability p 51 self.prob_q = 0.75 # Probability q 52 self.prob_f = 0.50 # Probability f 53 54 # For testing 55 def __eq__(self, other): 56 return self.__dict__ == other.__dict__ 57 58 def __repr__(self): 59 return repr(self.__dict__) 60 61 def to_json(self): 62 """Convert this instance to JSON. 63 64 The names are be compatible with the apps/api server. 65 """ 66 return json.dumps({ 67 'numBits': self.num_bloombits, 68 'numHashes': self.num_hashes, 69 'numCohorts': self.num_cohorts, 70 'probPrr': self.prob_f, 71 'probIrr0': self.prob_p, 72 'probIrr1': self.prob_q, 73 }) 74 75 # NOTE: 76 # - from_csv is currently used in sum_bits.py 77 # - to_csv is in rappor_sim.print_params 78 @staticmethod 79 def from_csv(f): 80 """Read the RAPPOR parameters from a CSV file. 81 82 Args: 83 f: file handle 84 85 Returns: 86 Params instance. 87 88 Raises: 89 rappor.Error: when the file is malformed. 90 """ 91 c = csv.reader(f) 92 ok = False 93 p = Params() 94 for i, row in enumerate(c): 95 96 if i == 0: 97 if row != ['k', 'h', 'm', 'p', 'q', 'f']: 98 raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row) 99 100 elif i == 1: 101 try: 102 # NOTE: May raise exceptions 103 p.num_bloombits = int(row[0]) 104 p.num_hashes = int(row[1]) 105 p.num_cohorts = int(row[2]) 106 p.prob_p = float(row[3]) 107 p.prob_q = float(row[4]) 108 p.prob_f = float(row[5]) 109 except (ValueError, IndexError) as e: 110 raise Error('Row is malformed: %s' % e) 111 ok = True 112 113 else: 114 raise Error('Params file should only have two rows') 115 116 if not ok: 117 raise Error("Expected second row with params") 118 119 return p 120 121 122class _SecureRandom(object): 123 """Returns an integer where each bit has probability p of being 1.""" 124 125 def __init__(self, prob_one, num_bits): 126 self.prob_one = prob_one 127 self.num_bits = num_bits 128 129 def __call__(self): 130 p = self.prob_one 131 rand = SystemRandom() 132 r = 0 133 134 for i in xrange(self.num_bits): 135 bit = rand.random() < p 136 r |= (bit << i) # using bool as int 137 return r 138 139 140class SecureIrrRand(object): 141 """Python's os.random()""" 142 143 def __init__(self, params): 144 """ 145 Args: 146 params: rappor.Params 147 """ 148 num_bits = params.num_bloombits 149 # IRR probabilities 150 151 self.p_gen = _SecureRandom(params.prob_p, num_bits) 152 self.q_gen = _SecureRandom(params.prob_q, num_bits) 153 154 155def to_big_endian(i): 156 """Convert an integer to a 4 byte big endian string. Used for hashing.""" 157 # https://docs.python.org/2/library/struct.html 158 # - Big Endian (>) for consistent network byte order. 159 # - L means 4 bytes when using > 160 return struct.pack('>L', i) 161 162 163def get_bloom_bits(word, cohort, num_hashes, num_bloombits): 164 """Return an array of bits to set in the bloom filter. 165 166 In the real report, we bitwise-OR them together. In hash candidates, we put 167 them in separate entries in the "map" matrix. 168 """ 169 value = to_big_endian(cohort) + word # Cohort is 4 byte prefix. 170 md5 = hashlib.md5(value) 171 172 digest = md5.digest() 173 174 # Each has is a byte, which means we could have up to 256 bit Bloom filters. 175 # There are 16 bytes in an MD5, in which case we can have up to 16 hash 176 # functions per Bloom filter. 177 if num_hashes > len(digest): 178 raise RuntimeError("Can't have more than %d hashes" % md5) 179 180 #log('hash_input %r', value) 181 #log('Cohort %d', cohort) 182 #log('MD5 %s', md5.hexdigest()) 183 184 return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)] 185 186 187def get_prr_masks(secret, word, prob_f, num_bits): 188 h = hmac.new(secret, word, digestmod=hashlib.sha256) 189 #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest()) 190 191 # Now go through each byte 192 digest_bytes = h.digest() 193 assert len(digest_bytes) == 32 194 195 # Use 32 bits. If we want 64 bits, it may be fine to generate another 32 196 # bytes by repeated HMAC. For arbitrary numbers of bytes it's probably 197 # better to use the HMAC-DRBG algorithm. 198 if num_bits > len(digest_bytes): 199 raise RuntimeError('%d bits is more than the max of %d', num_bits, len(d)) 200 201 threshold128 = prob_f * 128 202 203 uniform = 0 204 f_mask = 0 205 206 for i in xrange(num_bits): 207 ch = digest_bytes[i] 208 byte = ord(ch) 209 210 u_bit = byte & 0x01 # 1 bit of entropy 211 uniform |= (u_bit << i) # maybe set bit in mask 212 213 rand128 = byte >> 1 # 7 bits of entropy 214 noise_bit = (rand128 < threshold128) 215 f_mask |= (noise_bit << i) # maybe set bit in mask 216 217 return uniform, f_mask 218 219 220def bit_string(irr, num_bloombits): 221 """Like bin(), but uses leading zeroes, and no '0b'.""" 222 s = '' 223 bits = [] 224 for bit_num in xrange(num_bloombits): 225 if irr & (1 << bit_num): 226 bits.append('1') 227 else: 228 bits.append('0') 229 return ''.join(reversed(bits)) 230 231 232class Encoder(object): 233 """Obfuscates values for a given user using the RAPPOR privacy algorithm.""" 234 235 def __init__(self, params, cohort, secret, irr_rand): 236 """ 237 Args: 238 params: RAPPOR Params() controlling privacy 239 cohort: integer cohort, for Bloom hashing. 240 secret: secret string, for the PRR to be a deterministic function of the 241 reported value. 242 irr_rand: IRR randomness interface. 243 """ 244 # RAPPOR params. NOTE: num_cohorts isn't used. p and q are used by 245 # irr_rand. 246 self.params = params 247 self.cohort = cohort # associated: MD5 248 self.secret = secret # associated: HMAC-SHA256 249 self.irr_rand = irr_rand # p and q used 250 251 def _internal_encode_bits(self, bits): 252 """Helper function for simulation / testing. 253 254 Returns: 255 The PRR and IRR. The PRR should never be sent over the network. 256 """ 257 # Compute Permanent Randomized Response (PRR). 258 uniform, f_mask = get_prr_masks( 259 self.secret, to_big_endian(bits), self.params.prob_f, 260 self.params.num_bloombits) 261 262 # Suppose bit i of the Bloom filter is B_i. Then bit i of the PRR is 263 # defined as: 264 # 265 # 1 with prob f/2 266 # 0 with prob f/2 267 # B_i with prob 1-f 268 269 # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with 270 # probability f. So in the expression below: 271 # 272 # - Bits in (uniform & f_mask) are 1 with probability f/2. 273 # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability 274 # f, so we get B_i with probability 1-f. 275 # - The remaining bits are 0, with remaining probability f/2. 276 277 prr = (bits & ~f_mask) | (uniform & f_mask) 278 279 #log('U %s / F %s', bit_string(uniform, num_bits), 280 # bit_string(f_mask, num_bits)) 281 282 #log('B %s / PRR %s', bit_string(bloom_bits, num_bits), 283 # bit_string(prr, num_bits)) 284 285 # Compute Instantaneous Randomized Response (IRR). 286 # If PRR bit is 0, IRR bit is 1 with probability p. 287 # If PRR bit is 1, IRR bit is 1 with probability q. 288 p_bits = self.irr_rand.p_gen() 289 q_bits = self.irr_rand.q_gen() 290 291 irr = (p_bits & ~prr) | (q_bits & prr) 292 293 return prr, irr # IRR is the rappor 294 295 def _internal_encode(self, word): 296 """Helper function for simulation / testing. 297 298 Returns: 299 The Bloom filter bits, PRR, and IRR. The first two values should never 300 be sent over the network. 301 """ 302 bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes, 303 self.params.num_bloombits) 304 305 bloom = 0 306 for bit_to_set in bloom_bits: 307 bloom |= (1 << bit_to_set) 308 309 prr, irr = self._internal_encode_bits(bloom) 310 return bloom, prr, irr 311 312 def encode_bits(self, bits): 313 """Encode a string with RAPPOR. 314 315 Args: 316 bits: An integer representing bits to encode. 317 318 Returns: 319 An integer that is the IRR (Instantaneous Randomized Response). 320 """ 321 _, irr = self._internal_encode_bits(bits) 322 return irr 323 324 def encode(self, word): 325 """Encode a string with RAPPOR. 326 327 Args: 328 word: the string that should be privately transmitted. 329 330 Returns: 331 An integer that is the IRR (Instantaneous Randomized Response). 332 """ 333 _, _, irr = self._internal_encode(word) 334 return irr 335