xref: /aosp_15_r20/external/rappor/analysis/cpp/find_cliques.cc (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1 // Copyright 2014 Google Inc. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <assert.h>
16 #include <stdarg.h>  // va_list, etc.
17 #include <stdio.h>
18 #include <stdint.h>  // uint16_t
19 #include <string>
20 // Using unordered_{set,map} and not the older set,map since they only require
21 // implementing equality, not comparison.  They require a C++ 11 compiler.
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <vector>
25 
26 // find_cliques.cc: Find k-cliques in a k-partite graph.  This is part of the
27 // RAPPOR analysis for unknown dictionaries.
28 //
29 // A clique is a complete subgraph; it has (|N| choose 2) edges.
30 //
31 // This does the same computation as FindFeasibleStrings in
32 // analysis/R/decode_ngrams.R.
33 
34 // Graph format:
35 //
36 // num_partitions 3
37 // 0.ab 1.bc
38 // 0.ab 2.de
39 //
40 // See WriteKPartiteGraph in analysis/R/decode_ngrams.R for details.
41 //
42 // PERFORMANCE
43 //
44 // The code is optimized in terms of memory locality.  Nodes are 4 bytes; Edges
45 // are 8 bytes; PathArray is a contiguous block of memory.
46 
47 using std::unordered_map;
48 using std::unordered_set;
49 using std::string;
50 using std::vector;
51 
52 // TODO: log to stderr.  Add VERBOSE logging.
log(const char * fmt,...)53 void log(const char* fmt, ...) {
54   va_list args;
55   va_start(args, fmt);
56   vprintf(fmt, args);
57   va_end(args);
58   printf("\n");
59 }
60 
61 // Nodes and Edges are value types.  A node is 4 bytes.  2^16 = 65536
62 // partitions is plenty.
63 struct Node {
64   uint16_t partition;
65   // Right now we support bigrams.  We may want to support trigrams or
66   // arbitrary n-grams, although there will be a performance hit.
67   char ngram[2];
68 
69   // for debugging only
ToStringNode70   string ToString() const {
71     char buf[100];
72     snprintf(buf, sizeof(buf), "%d.%c%c", partition, ngram[0], ngram[1]);
73     return string(buf);  // copies buf
74   }
75 };
76 
77 // Implement hash and equality functors for unordered_set.
78 struct NodeHash {
operator ()NodeHash79   int operator() (const Node& node) const {
80     // DJB hash: http://floodyberry.com/noncryptohashzoo/DJB.html
81     int h = 5381;
82     h = (h << 5) + h + node.partition;
83     h = (h << 5) + h + node.ngram[0];
84     h = (h << 5) + h + node.ngram[1];
85     // log("hash %s = %d", node.ToString().c_str(), h);
86     return h;
87   }
88 };
89 
90 struct NodeEq {
operator ()NodeEq91   bool operator() (const Node& x, const Node& y) const {
92     // TODO: optimize to 4 byte comparison with memcmp(&x, &y, sizeof(Node))?
93     // NOTE: x.ngram == y.ngram is wrong; it compares pointers!
94     return x.partition == y.partition &&
95            x.ngram[0] == y.ngram[0] &&
96            x.ngram[1] == y.ngram[1];
97   }
98 };
99 
100 // This is an undirected edge, but we still call them "left" and "right"
101 // because the partition of "left" must be less than that of "right".
102 //
103 // NOTE: To reduce the size further, we could have a NodePool, and then typedef
104 // uint16_t NodeId.  Edge and Path can both use a 2 byte NodeId instead of a 4
105 // byte Node.  ToString() can take the NodePool for pretty printing.
106 //
107 // This will be better for the EnumeratePaths stage, but it will be
108 // worse for the CheckForCliques stage (doing the lookups may reduce memory
109 // locality).
110 
111 struct Edge {
112   Node left;
113   Node right;
114 
115   // for debugging only
ToStringEdge116   string ToString() const {
117     return left.ToString() + " - " + right.ToString();
118   }
119 };
120 
121 // Implement hash and equality functors for unordered_set.
122 struct EdgeHash {
operator ()EdgeHash123   int operator() (const Edge& edge) const {
124     // DJB hash
125     int h = 5381;
126     h = (h << 5) + h + NodeHash()(edge.left);
127     h = (h << 5) + h + NodeHash()(edge.right);
128     return h;
129   }
130 };
131 
132 struct EdgeEq {
operator ()EdgeEq133   bool operator() (const Edge& x, const Edge& y) const {
134     // TODO: optimize to 8 byte comparison with memcmp(&x, &y, sizeof(Edge))?
135     // This is in the inner loop for removing cadidates.
136     return NodeEq()(x.left, y.left) && NodeEq()(x.right, y.right);
137   }
138 };
139 
140 typedef unordered_set<Edge, EdgeHash, EdgeEq> EdgeSet;
141 
142 // The full graph.  It is k-partite, which can be seen by the node naming
143 // convention.
144 struct Graph {
145   int num_partitions;
146   vector<Edge> edges;
147 };
148 
149 // Given a Node, look up Nodes in the adjacent partition that it is connected
150 // to.
151 typedef unordered_map<Node, vector<Node>, NodeHash, NodeEq> Adjacency;
152 
153 // for debugging only
AdjacencyToString(const Adjacency & a)154 string AdjacencyToString(const Adjacency& a) {
155   string s;
156   for (auto& kv : a) {
157     s += kv.first.ToString();
158     s += " : <";
159     for (auto& node : kv.second) {
160       s += node.ToString();
161       s += " ";
162     }
163     s += ">  ";
164   }
165   return s;
166 }
167 
168 // Subgraph where only edges between adjacent partitions are included.
169 //
170 // We have k partitions, numbered 0 to k-1.  This means we have k-1 "columns",
171 // numbered 0 to k-2.
172 //
173 // A column is subgraph containing edges between adjacent partitions of the
174 // k-partite graph.
175 //
176 // The ColumnSubgraph class represents ALL columns (and is itself a subgraph).
177 
178 class ColumnSubgraph {
179  public:
ColumnSubgraph(int num_columns)180   explicit ColumnSubgraph(int num_columns)
181       : num_columns_(num_columns),
182         adj_list_(new Adjacency[num_columns]) {
183   }
~ColumnSubgraph()184   ~ColumnSubgraph() {
185     delete[] adj_list_;
186   }
AddEdge(Edge e)187   void AddEdge(Edge e) {
188     int part = e.left.partition;
189     assert(part < num_columns_);
190 
191     adj_list_[part][e.left].push_back(e.right);
192   }
GetColumn(int part,vector<Edge> * out) const193   void GetColumn(int part, vector<Edge>* out) const {
194     const Adjacency& a = adj_list_[part];
195     for (auto& kv : a) {
196       for (auto& right : kv.second) {
197         Edge e;
198         e.left = kv.first;
199         e.right = right;
200         out->push_back(e);
201       }
202     }
203   }
204   // Get the nodes in the next partition adjacent to node N
GetAdjacentNodes(Node n,vector<Node> * out) const205   void GetAdjacentNodes(Node n, vector<Node>* out) const {
206     int part = n.partition;
207     const Adjacency& a = adj_list_[part];
208 
209     // log("GetAdjacentNodes %s, part %d", n.ToString().c_str(), part);
210 
211     auto it = a.find(n);
212     if (it == a.end()) {
213       return;
214     }
215     // TODO: it would be better not to copy these.
216     for (auto node : it->second) {
217       out->push_back(node);
218     }
219   }
220 
221   // accessor
num_columns() const222   int num_columns() const { return num_columns_; }
223 
224   // for debugging only
ToString() const225   string ToString() const {
226     string s("[\n");
227     char buf[100];
228     for (int i = 0; i < num_columns_; ++i) {
229       const Adjacency& a = adj_list_[i];
230       snprintf(buf, sizeof(buf), "%d (%zu) ", i, a.size());
231       s += string(buf);
232       s += AdjacencyToString(a);
233       s += "\n";
234     }
235     s += " ]";
236     return s;
237   }
238 
239  private:
240   int num_columns_;
241   // Adjacency list.  An array of k-1 maps.
242   // Lookup goes from nodes in partition i to nodes in partition i+1.
243   Adjacency* adj_list_;
244 };
245 
BuildColumnSubgraph(const Graph & g,ColumnSubgraph * a)246 void BuildColumnSubgraph(const Graph& g, ColumnSubgraph* a) {
247   for (const auto& e : g.edges) {
248     if (e.left.partition + 1 == e.right.partition) {
249       a->AddEdge(e);
250     }
251   }
252 }
253 
254 // A 2D array of paths.  It's an array because all paths are the same length.
255 // We use a single vector<> to represent it, to reduce memory allocation.
256 class PathArray {
257  public:
PathArray(int path_length)258   explicit PathArray(int path_length)
259      : path_length_(path_length),
260        num_paths_(0) {
261   }
AddEdgeAsPath(Edge e)262   void AddEdgeAsPath(Edge e) {
263     // Can only initialize PathArray with edges when path length is 2
264     assert(path_length_ == 2);
265 
266     nodes_.push_back(e.left);
267     nodes_.push_back(e.right);
268     num_paths_++;
269   }
LastNodeInPath(int index) const270   Node LastNodeInPath(int index) const {
271     int start = index * path_length_;
272     return nodes_[start + path_length_ -1];
273   }
274   // Pretty print a single path in this array.  For debugging only.
PathDebugString(int index) const275   string PathDebugString(int index) const {
276     string s("[ ");
277     for (int i = index * path_length_; i < (index + 1) * path_length_; ++i) {
278       s += nodes_[i].ToString();
279       s += " - ";
280     }
281     s += " ]";
282     return s;
283   }
284   // Print the word implied by the path.
PathAsString(int index) const285   string PathAsString(int index) const {
286     string s;
287     for (int i = index * path_length_; i < (index + 1) * path_length_; ++i) {
288       s += nodes_[i].ngram[0];
289       s += nodes_[i].ngram[1];
290     }
291     return s;
292   }
GetPathStart(int index) const293   const Node* GetPathStart(int index) const {
294     return &nodes_[index * path_length_];
295   }
AddPath(const Node * start,int prefix_length,Node right)296   void AddPath(const Node* start, int prefix_length, Node right) {
297     // Make sure it is one less
298     assert(prefix_length == path_length_-1);
299 
300     // TODO: replace with memcpy?  Is it faster?
301     for (int i = 0; i < prefix_length; ++i) {
302       nodes_.push_back(start[i]);
303     }
304     nodes_.push_back(right);
305     num_paths_++;
306   }
307 
308   // accessors
num_paths() const309   int num_paths() const { return num_paths_; }
path_length() const310   int path_length() const { return path_length_; }
311 
312  private:
313   int path_length_;
314   int num_paths_;
315   vector<Node> nodes_;
316 };
317 
318 // Given a PathArray of length i, produce one of length i+1.
319 //
320 // NOTE: It would be more efficient to filter 'right_nodes' here, and only add
321 // a new path if it forms a "partial clique" (at step i+1).  This amounts to
322 // doing the membership tests in edge_set for each "column", instead of waiting
323 // until the end.
324 //
325 // This will reduce the exponential blowup of EnumeratePaths (although it
326 // doesn't change the worst case).
327 
EnumerateStep(const ColumnSubgraph & subgraph,const PathArray & in,PathArray * out)328 void EnumerateStep(
329     const ColumnSubgraph& subgraph, const PathArray& in, PathArray* out) {
330 
331   int prefix_length = in.path_length();
332 
333   for (int i = 0; i < in.num_paths(); ++i) {
334     // log("col %d, path %d", col, i);
335 
336     // last node in every path
337     Node last_node = in.LastNodeInPath(i);
338 
339     // TODO: avoid copying of nodes?
340     vector<Node> right_nodes;
341     subgraph.GetAdjacentNodes(last_node, &right_nodes);
342 
343     // Get a pointer to the start of the path
344     const Node* start = in.GetPathStart(i);
345 
346     for (Node right : right_nodes) {
347       out->AddPath(start, prefix_length, right);
348     }
349   }
350 }
351 
352 // Given a the column subgraph, produce an array of all possible paths of
353 // length k.  These will be subsequently checked to see if they are cliques.
EnumeratePaths(const ColumnSubgraph & subgraph,PathArray * candidates)354 void EnumeratePaths(
355     const ColumnSubgraph& subgraph, PathArray* candidates) {
356   // edges between partitions 0 and 1, a "column" of edges
357   vector<Edge> edges0;
358   subgraph.GetColumn(0, &edges0);
359 
360   int num_columns = subgraph.num_columns();
361   PathArray** arrays = new PathArray*[num_columns];
362 
363   // Initialize using column 0.
364   int path_length = 2;
365   arrays[0] = new PathArray(path_length);
366   for (auto& e : edges0) {
367     arrays[0]->AddEdgeAsPath(e);
368   }
369 
370   // Iterate over columns 1 to k-1.
371   for (int i = 1; i < num_columns; ++i) {
372     log("--- Column %d", i);
373 
374     path_length++;
375     if (i == num_columns - 1) {
376       arrays[i] = candidates;  // final result, from output argument!
377     } else {
378       arrays[i] = new PathArray(path_length);  // intermediate result
379     }
380     PathArray* in = arrays[i - 1];
381     PathArray* out = arrays[i];
382 
383     EnumerateStep(subgraph, *in, out);
384 
385     log("in num paths: %d", in->num_paths());
386     log("out num paths: %d", out->num_paths());
387 
388     // We create an destroy a PathArray on every iteration.  On each
389     // iteration, the PathArray grows both rows and columns, so it's hard to
390     // avoid this.
391     delete in;
392   }
393 }
394 
395 // Inserts the path number 'p' in incomplete if the path is not a complete
396 // subgraph.
IsClique(const Node * path,int k,const EdgeSet & edge_set)397 bool IsClique(const Node* path, int k, const EdgeSet& edge_set) {
398   // We need to ensure that (k choose 2) edges are all in edge_set.
399   // We already know that k-1 of them are present, so we need to check (k
400   // choose 2) - (k-1).
401   for (int i = 0; i < k; ++i) {
402     for (int j = i + 1; j < k; ++j) {
403       if (i + 1 == j) {
404         // Already know this edge exists.  NOTE: does this even speed things
405         // up?  It's a branch in the middle of an inner loop.
406         continue;
407       }
408       Edge e;
409       e.left = path[i];
410       e.right = path[j];
411       if (edge_set.find(e) == edge_set.end()) {
412         log("Didn't find edge %s", e.ToString().c_str());
413         return false;
414       }
415     }
416   }
417   return true;
418 }
419 
CheckForCliques(const PathArray & candidates,const EdgeSet & edge_set,unordered_set<int> * incomplete)420 void CheckForCliques(const PathArray& candidates,
421                      const EdgeSet& edge_set,
422                      unordered_set<int>* incomplete) {
423   int k = candidates.path_length();
424   for (int p = 0; p < candidates.num_paths(); ++p) {
425     const Node* path = candidates.GetPathStart(p);
426     // NOTE: We could run many IsClique invocations in parallel.  It reads from
427     // edge_set.  The different 'incomplete' sets can be merged.
428     if (!IsClique(path, k, edge_set)) {
429       incomplete->insert(p);
430       return;  // IMPORTANT: early return
431     }
432   }
433 }
434 
435 // Parse text on stdin into a graph, and do some validation.
ParseGraph(Graph * g,EdgeSet * edge_set)436 bool ParseGraph(Graph* g, EdgeSet* edge_set) {
437   // NOTE: It's possible that there NO k-cliques.
438 
439   int ret = fscanf(stdin, "num_partitions %d\n", &(g->num_partitions));
440   if (ret != 1) {
441     log("ERROR: Expected 'num_partitions <integer>'\n");
442     return false;
443   }
444   log("num_partitions = %d", g->num_partitions);
445 
446   int ngram_size;
447   ret = fscanf(stdin, "ngram_size %d\n", &ngram_size);
448   if (ret != 1) {
449     log("ERROR: Expected 'ngram_size <integer>'\n");
450     return false;
451   }
452   if (ngram_size != 2) {
453     log("ERROR: Only bigrams are currently supported (got n = %d)\n", ngram_size);
454     return false;
455   }
456 
457   int num_edges = 0;
458   while (true) {
459     int part1, part2;
460     char c1, c2, c3, c4;
461     int ret = fscanf(stdin, "edge %d.%c%c %d.%c%c\n",
462                      &part1, &c1, &c2, &part2, &c3, &c4);
463     if (ret == EOF) {
464       log("Read %d edges", num_edges);
465       break;
466     }
467     if (ret != 6) {
468       log("ERROR: Expected 6 values for edge, got %d", ret);
469       return false;
470     }
471     // log("%d -> %d", part1, part2);
472     if (part1 >= part2) {
473       log("ERROR: edge in wrong order (%d >= %d)", part1, part2);
474       return false;
475     }
476 
477     Edge e;
478     e.left.partition = part1;
479     e.left.ngram[0] = c1;
480     e.left.ngram[1] = c2;
481 
482     e.right.partition = part2;
483     e.right.ngram[0] = c3;
484     e.right.ngram[1] = c4;
485 
486     g->edges.push_back(e);
487 
488     // For lookup in CheckForCliques
489     edge_set->insert(e);
490 
491     num_edges++;
492   }
493   return true;
494 }
495 
main()496 int main() {
497   log("sizeof(Node) = %zu", sizeof(Node));
498   log("sizeof(Edge) = %zu", sizeof(Edge));
499   // This should be true no matter what platform we use, e.g. since we use
500   // uint16_t.
501   assert(sizeof(Node) == 4);
502   assert(sizeof(Edge) == 8);
503 
504   Graph g;
505   EdgeSet edge_set;
506 
507   log("ParseGraph");
508   if (!ParseGraph(&g, &edge_set)) {
509     log("Fatal error parsing graph.");
510     return 1;
511   }
512 
513   // If there are k partitions, there are k-1 edge "columns".
514   ColumnSubgraph subgraph(g.num_partitions - 1);
515   log("BuildColumnSubgraph");
516   BuildColumnSubgraph(g, &subgraph);
517   log("%s", subgraph.ToString().c_str());
518 
519   // PathArray candidates(num_partitions);
520   log("EnumeratePaths");
521   PathArray candidates(g.num_partitions);
522   EnumeratePaths(subgraph, &candidates);
523 
524   log("EnumeratePaths produced %d candidates", candidates.num_paths());
525   for (int i = 0; i < candidates.num_paths(); ++i) {
526     log("%d %s", i, candidates.PathDebugString(i).c_str());
527   }
528 
529   // array of indices of incomplete paths, i.e. paths that are not complete
530   // subgraphs
531   log("CheckForCliques");
532   unordered_set<int> incomplete;
533   CheckForCliques(candidates, edge_set, &incomplete);
534   for (auto p : incomplete) {
535     log("Path %d is incomplete", p);
536   }
537 
538   log("Found the following cliques/words:");
539   // Now print all the complete ones to stdout
540   for (int i = 0; i < candidates.num_paths(); i++) {
541     if (incomplete.find(i) == incomplete.end()) {
542       log("%d %s", i, candidates.PathAsString(i).c_str());
543     }
544   }
545   log("Done");
546 }
547