xref: /aosp_15_r20/external/private-join-and-compute/private_join_and_compute/data_util.h (revision a6aa18fbfbf9cb5cd47356a9d1b057768998488c)
1 /*
2  * Copyright 2019 Google LLC.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     https://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef PRIVATE_JOIN_AND_COMPUTE_DATA_UTIL_H_
17 #define PRIVATE_JOIN_AND_COMPUTE_DATA_UTIL_H_
18 
19 // Contains utility functions to generate dummy input data for the server and
20 // client, and also to write the data to file and parse it back.
21 
22 #include <string>
23 #include <tuple>
24 #include <utility>
25 #include <vector>
26 
27 #include "absl/strings/string_view.h"
28 #include "private_join_and_compute/crypto/context.h"
29 #include "private_join_and_compute/match.pb.h"
30 #include "private_join_and_compute/util/status.inc"
31 
32 namespace private_join_and_compute {
33 
34 // Length, in bytes, of every random identifier generated by this library.
35 static const int64_t kRandomIdentifierLengthBytes = 32;
36 
37 // Generates random dummy datasets for the server and the client.
38 //
39 // The server dataset holds server_data_size identifiers. The client dataset
40 // holds client_data_size identifiers, each paired with a randomly selected
41 // associated value between 0 and max_associated_value. The two datasets will
42 // have intersection_size identifiers in common. Every identifier is a random
43 // alphanumeric string that is kRandomIdentifierLengthBytes long.
44 //
45 // On success, the returned tuple is interpreted as follows:
46 // First element: the server's data (identifiers only).
47 // Second element: the client's data, as a pair of (identifiers, associated
48 // values).
49 // Third element: the sum of the values associated with the common identifiers
50 // (the "true" intersection-sum).
51 //
52 // NOTE(review): this comment does not say whether the max_associated_value
53 // bound is inclusive -- confirm against the implementation.
54 //
55 // The identifiers are generated and permuted with a
56 // non-cryptographically-secure PRNG. This is fine for dummy data.
57 //
58 // Fails with INVALID_ARGUMENT if intersection_size is larger than either
59 // server_data_size or client_data_size, if max_associated_value is negative, or
60 // if max_associated_value * intersection_size exceeds the max value of int64_t.
61 auto GenerateRandomDatabases(int64_t server_data_size, int64_t client_data_size,
62                              int64_t intersection_size,
63                              int64_t max_associated_value)
64     -> StatusOr<std::tuple<
65         std::vector<std::string>,
66         std::pair<std::vector<std::string>, std::vector<int64_t>>, int64_t>>;
67 
68 // Writes server_data to the file server_data_filename in CSV format.
69 Status WriteServerDatasetToFile(const std::vector<std::string>& server_data,
70                                 absl::string_view server_data_filename);
71 
72 // Writes the client identifiers and values to client_data_filename as CSV.
73 Status WriteClientDatasetToFile(
74     const std::vector<std::string>& client_identifiers,
75     const std::vector<int64_t>& client_associated_values,
76     absl::string_view client_data_filename);
77 
78 // Reads the server dataset from server_data_filename, expected to be CSV.
79 StatusOr<std::vector<std::string>> ReadServerDatasetFromFile(
80     absl::string_view server_data_filename);
81 
82 // Reads the client dataset (identifiers and associated values) from
83 // client_data_filename, which should be in CSV format. The parsed associated
84 // values are returned as BigNums; context is presumably used to build them.
85 StatusOr<std::pair<std::vector<std::string>, std::vector<BigNum>>>
86 ReadClientDatasetFromFile(absl::string_view client_data_filename,
87                           Context* context);
88 
89 // Splits the CSV line `line` on the ',' delimiter and returns the resulting
90 // fields as a vector of strings.
91 std::vector<std::string> SplitCsvLine(const std::string& line);
92 
93 }  // namespace private_join_and_compute
94 #endif  // PRIVATE_JOIN_AND_COMPUTE_DATA_UTIL_H_
95