xref: /aosp_15_r20/external/rappor/client/python/rappor.py (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1# Copyright 2014 Google Inc. All rights reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""RAPPOR client library.
16
17Privacy is ensured without a third party by only sending RAPPOR'd data over the
18network (as opposed to raw client data).
19
20Note that we use MD5 for the Bloom filter hash function. (security not required)
21"""
22import csv
23import hashlib
24import hmac
25import json
26import struct
27import sys
28
29from random import SystemRandom
30
31class Error(Exception):
32  pass
33
34
35def log(msg, *args):
36  if args:
37    msg = msg % args
38  print >>sys.stderr, msg
39
40
41class Params(object):
42  """RAPPOR encoding parameters.
43
44  These affect privacy/anonymity.  See the paper for details.
45  """
46  def __init__(self):
47    self.num_bloombits = 16      # Number of bloom filter bits (k)
48    self.num_hashes = 2          # Number of bloom filter hashes (h)
49    self.num_cohorts = 64        # Number of cohorts (m)
50    self.prob_p = 0.50           # Probability p
51    self.prob_q = 0.75           # Probability q
52    self.prob_f = 0.50           # Probability f
53
54  # For testing
55  def __eq__(self, other):
56    return self.__dict__ == other.__dict__
57
58  def __repr__(self):
59    return repr(self.__dict__)
60
61  def to_json(self):
62    """Convert this instance to JSON.
63
64    The names are be compatible with the apps/api server.
65    """
66    return json.dumps({
67        'numBits': self.num_bloombits,
68        'numHashes': self.num_hashes,
69        'numCohorts': self.num_cohorts,
70        'probPrr': self.prob_f,
71        'probIrr0': self.prob_p,
72        'probIrr1': self.prob_q,
73    })
74
75  # NOTE:
76  # - from_csv is currently used in sum_bits.py
77  # - to_csv is in rappor_sim.print_params
78  @staticmethod
79  def from_csv(f):
80    """Read the RAPPOR parameters from a CSV file.
81
82    Args:
83      f: file handle
84
85    Returns:
86      Params instance.
87
88    Raises:
89      rappor.Error: when the file is malformed.
90    """
91    c = csv.reader(f)
92    ok = False
93    p = Params()
94    for i, row in enumerate(c):
95
96      if i == 0:
97        if row != ['k', 'h', 'm', 'p', 'q', 'f']:
98          raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row)
99
100      elif i == 1:
101        try:
102          # NOTE: May raise exceptions
103          p.num_bloombits = int(row[0])
104          p.num_hashes = int(row[1])
105          p.num_cohorts = int(row[2])
106          p.prob_p = float(row[3])
107          p.prob_q = float(row[4])
108          p.prob_f = float(row[5])
109        except (ValueError, IndexError) as e:
110          raise Error('Row is malformed: %s' % e)
111        ok = True
112
113      else:
114        raise Error('Params file should only have two rows')
115
116    if not ok:
117      raise Error("Expected second row with params")
118
119    return p
120
121
122class _SecureRandom(object):
123  """Returns an integer where each bit has probability p of being 1."""
124
125  def __init__(self, prob_one, num_bits):
126    self.prob_one = prob_one
127    self.num_bits = num_bits
128
129  def __call__(self):
130    p = self.prob_one
131    rand = SystemRandom()
132    r = 0
133
134    for i in xrange(self.num_bits):
135      bit = rand.random() < p
136      r |= (bit << i)  # using bool as int
137    return r
138
139
140class SecureIrrRand(object):
141  """Python's os.random()"""
142
143  def __init__(self, params):
144    """
145    Args:
146      params: rappor.Params
147    """
148    num_bits = params.num_bloombits
149    # IRR probabilities
150
151    self.p_gen = _SecureRandom(params.prob_p, num_bits)
152    self.q_gen = _SecureRandom(params.prob_q, num_bits)
153
154
155def to_big_endian(i):
156  """Convert an integer to a 4 byte big endian string.  Used for hashing."""
157  # https://docs.python.org/2/library/struct.html
158  # - Big Endian (>) for consistent network byte order.
159  # - L means 4 bytes when using >
160  return struct.pack('>L', i)
161
162
163def get_bloom_bits(word, cohort, num_hashes, num_bloombits):
164  """Return an array of bits to set in the bloom filter.
165
166  In the real report, we bitwise-OR them together.  In hash candidates, we put
167  them in separate entries in the "map" matrix.
168  """
169  value = to_big_endian(cohort) + word  # Cohort is 4 byte prefix.
170  md5 = hashlib.md5(value)
171
172  digest = md5.digest()
173
174  # Each has is a byte, which means we could have up to 256 bit Bloom filters.
175  # There are 16 bytes in an MD5, in which case we can have up to 16 hash
176  # functions per Bloom filter.
177  if num_hashes > len(digest):
178    raise RuntimeError("Can't have more than %d hashes" % md5)
179
180  #log('hash_input %r', value)
181  #log('Cohort %d', cohort)
182  #log('MD5 %s', md5.hexdigest())
183
184  return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)]
185
186
187def get_prr_masks(secret, word, prob_f, num_bits):
188  h = hmac.new(secret, word, digestmod=hashlib.sha256)
189  #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest())
190
191  # Now go through each byte
192  digest_bytes = h.digest()
193  assert len(digest_bytes) == 32
194
195  # Use 32 bits.  If we want 64 bits, it may be fine to generate another 32
196  # bytes by repeated HMAC.  For arbitrary numbers of bytes it's probably
197  # better to use the HMAC-DRBG algorithm.
198  if num_bits > len(digest_bytes):
199    raise RuntimeError('%d bits is more than the max of %d', num_bits, len(d))
200
201  threshold128 = prob_f * 128
202
203  uniform = 0
204  f_mask = 0
205
206  for i in xrange(num_bits):
207    ch = digest_bytes[i]
208    byte = ord(ch)
209
210    u_bit = byte & 0x01  # 1 bit of entropy
211    uniform |= (u_bit << i)  # maybe set bit in mask
212
213    rand128 = byte >> 1  # 7 bits of entropy
214    noise_bit = (rand128 < threshold128)
215    f_mask |= (noise_bit << i)  # maybe set bit in mask
216
217  return uniform, f_mask
218
219
220def bit_string(irr, num_bloombits):
221  """Like bin(), but uses leading zeroes, and no '0b'."""
222  s = ''
223  bits = []
224  for bit_num in xrange(num_bloombits):
225    if irr & (1 << bit_num):
226      bits.append('1')
227    else:
228      bits.append('0')
229  return ''.join(reversed(bits))
230
231
232class Encoder(object):
233  """Obfuscates values for a given user using the RAPPOR privacy algorithm."""
234
235  def __init__(self, params, cohort, secret, irr_rand):
236    """
237    Args:
238      params: RAPPOR Params() controlling privacy
239      cohort: integer cohort, for Bloom hashing.
240      secret: secret string, for the PRR to be a deterministic function of the
241        reported value.
242      irr_rand: IRR randomness interface.
243    """
244    # RAPPOR params.  NOTE: num_cohorts isn't used.  p and q are used by
245    # irr_rand.
246    self.params = params
247    self.cohort = cohort  # associated: MD5
248    self.secret = secret  # associated: HMAC-SHA256
249    self.irr_rand = irr_rand  # p and q used
250
251  def _internal_encode_bits(self, bits):
252    """Helper function for simulation / testing.
253
254    Returns:
255      The PRR and IRR.  The PRR should never be sent over the network.
256    """
257    # Compute Permanent Randomized Response (PRR).
258    uniform, f_mask = get_prr_masks(
259        self.secret, to_big_endian(bits), self.params.prob_f,
260        self.params.num_bloombits)
261
262    # Suppose bit i of the Bloom filter is B_i.  Then bit i of the PRR is
263    # defined as:
264    #
265    # 1   with prob f/2
266    # 0   with prob f/2
267    # B_i with prob 1-f
268
269    # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with
270    # probability f.  So in the expression below:
271    #
272    # - Bits in (uniform & f_mask) are 1 with probability f/2.
273    # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability
274    # f, so we get B_i with probability 1-f.
275    # - The remaining bits are 0, with remaining probability f/2.
276
277    prr = (bits & ~f_mask) | (uniform & f_mask)
278
279    #log('U %s / F %s', bit_string(uniform, num_bits),
280    #    bit_string(f_mask, num_bits))
281
282    #log('B %s / PRR %s', bit_string(bloom_bits, num_bits),
283    #    bit_string(prr, num_bits))
284
285    # Compute Instantaneous Randomized Response (IRR).
286    # If PRR bit is 0, IRR bit is 1 with probability p.
287    # If PRR bit is 1, IRR bit is 1 with probability q.
288    p_bits = self.irr_rand.p_gen()
289    q_bits = self.irr_rand.q_gen()
290
291    irr = (p_bits & ~prr) | (q_bits & prr)
292
293    return prr, irr  # IRR is the rappor
294
295  def _internal_encode(self, word):
296    """Helper function for simulation / testing.
297
298    Returns:
299      The Bloom filter bits, PRR, and IRR.  The first two values should never
300      be sent over the network.
301    """
302    bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes,
303                                self.params.num_bloombits)
304
305    bloom = 0
306    for bit_to_set in bloom_bits:
307      bloom |= (1 << bit_to_set)
308
309    prr, irr = self._internal_encode_bits(bloom)
310    return bloom, prr, irr
311
312  def encode_bits(self, bits):
313    """Encode a string with RAPPOR.
314
315    Args:
316      bits: An integer representing bits to encode.
317
318    Returns:
319      An integer that is the IRR (Instantaneous Randomized Response).
320    """
321    _, irr = self._internal_encode_bits(bits)
322    return irr
323
324  def encode(self, word):
325    """Encode a string with RAPPOR.
326
327    Args:
328      word: the string that should be privately transmitted.
329
330    Returns:
331      An integer that is the IRR (Instantaneous Randomized Response).
332    """
333    _, _, irr = self._internal_encode(word)
334    return irr
335