xref: /aosp_15_r20/external/rappor/analysis/cpp/find_cliques.cc (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li // Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li //
3*2abb3134SXin Li // Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li // you may not use this file except in compliance with the License.
5*2abb3134SXin Li // You may obtain a copy of the License at
6*2abb3134SXin Li //
7*2abb3134SXin Li //     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li //
9*2abb3134SXin Li // Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li // distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li // See the License for the specific language governing permissions and
13*2abb3134SXin Li // limitations under the License.
14*2abb3134SXin Li 
15*2abb3134SXin Li #include <assert.h>
16*2abb3134SXin Li #include <stdarg.h>  // va_list, etc.
17*2abb3134SXin Li #include <stdio.h>
18*2abb3134SXin Li #include <stdint.h>  // uint16_t
19*2abb3134SXin Li #include <string>
20*2abb3134SXin Li // Using unordered_{set,map} and not the older set,map since they only require
21*2abb3134SXin Li // implementing equality, not comparison.  They require a C++ 11 compiler.
22*2abb3134SXin Li #include <unordered_map>
23*2abb3134SXin Li #include <unordered_set>
24*2abb3134SXin Li #include <vector>
25*2abb3134SXin Li 
26*2abb3134SXin Li // find_cliques.cc: Find k-cliques in a k-partite graph.  This is part of the
27*2abb3134SXin Li // RAPPOR analysis for unknown dictionaries.
28*2abb3134SXin Li //
29*2abb3134SXin Li // A clique is a complete subgraph; it has (|N| choose 2) edges.
30*2abb3134SXin Li //
31*2abb3134SXin Li // This does the same computation as FindFeasibleStrings in
32*2abb3134SXin Li // analysis/R/decode_ngrams.R.
33*2abb3134SXin Li 
34*2abb3134SXin Li // Graph format:
35*2abb3134SXin Li //
36*2abb3134SXin Li // num_partitions 3
37*2abb3134SXin Li // 0.ab 1.bc
38*2abb3134SXin Li // 0.ab 2.de
39*2abb3134SXin Li //
40*2abb3134SXin Li // See WriteKPartiteGraph in analysis/R/decode_ngrams.R for details.
41*2abb3134SXin Li //
42*2abb3134SXin Li // PERFORMANCE
43*2abb3134SXin Li //
44*2abb3134SXin Li // The code is optimized in terms of memory locality.  Nodes are 4 bytes; Edges
45*2abb3134SXin Li // are 8 bytes; PathArray is a contiguous block of memory.
46*2abb3134SXin Li 
47*2abb3134SXin Li using std::unordered_map;
48*2abb3134SXin Li using std::unordered_set;
49*2abb3134SXin Li using std::string;
50*2abb3134SXin Li using std::vector;
51*2abb3134SXin Li 
52*2abb3134SXin Li // TODO: log to stderr.  Add VERBOSE logging.
log(const char * fmt,...)53*2abb3134SXin Li void log(const char* fmt, ...) {
54*2abb3134SXin Li   va_list args;
55*2abb3134SXin Li   va_start(args, fmt);
56*2abb3134SXin Li   vprintf(fmt, args);
57*2abb3134SXin Li   va_end(args);
58*2abb3134SXin Li   printf("\n");
59*2abb3134SXin Li }
60*2abb3134SXin Li 
61*2abb3134SXin Li // Nodes and Edges are value types.  A node is 4 bytes.  2^16 = 65536
62*2abb3134SXin Li // partitions is plenty.
63*2abb3134SXin Li struct Node {
64*2abb3134SXin Li   uint16_t partition;
65*2abb3134SXin Li   // Right now we support bigrams.  We may want to support trigrams or
66*2abb3134SXin Li   // arbitrary n-grams, although there will be a performance hit.
67*2abb3134SXin Li   char ngram[2];
68*2abb3134SXin Li 
69*2abb3134SXin Li   // for debugging only
ToStringNode70*2abb3134SXin Li   string ToString() const {
71*2abb3134SXin Li     char buf[100];
72*2abb3134SXin Li     snprintf(buf, sizeof(buf), "%d.%c%c", partition, ngram[0], ngram[1]);
73*2abb3134SXin Li     return string(buf);  // copies buf
74*2abb3134SXin Li   }
75*2abb3134SXin Li };
76*2abb3134SXin Li 
77*2abb3134SXin Li // Implement hash and equality functors for unordered_set.
78*2abb3134SXin Li struct NodeHash {
operator ()NodeHash79*2abb3134SXin Li   int operator() (const Node& node) const {
80*2abb3134SXin Li     // DJB hash: http://floodyberry.com/noncryptohashzoo/DJB.html
81*2abb3134SXin Li     int h = 5381;
82*2abb3134SXin Li     h = (h << 5) + h + node.partition;
83*2abb3134SXin Li     h = (h << 5) + h + node.ngram[0];
84*2abb3134SXin Li     h = (h << 5) + h + node.ngram[1];
85*2abb3134SXin Li     // log("hash %s = %d", node.ToString().c_str(), h);
86*2abb3134SXin Li     return h;
87*2abb3134SXin Li   }
88*2abb3134SXin Li };
89*2abb3134SXin Li 
90*2abb3134SXin Li struct NodeEq {
operator ()NodeEq91*2abb3134SXin Li   bool operator() (const Node& x, const Node& y) const {
92*2abb3134SXin Li     // TODO: optimize to 4 byte comparison with memcmp(&x, &y, sizeof(Node))?
93*2abb3134SXin Li     // NOTE: x.ngram == y.ngram is wrong; it compares pointers!
94*2abb3134SXin Li     return x.partition == y.partition &&
95*2abb3134SXin Li            x.ngram[0] == y.ngram[0] &&
96*2abb3134SXin Li            x.ngram[1] == y.ngram[1];
97*2abb3134SXin Li   }
98*2abb3134SXin Li };
99*2abb3134SXin Li 
100*2abb3134SXin Li // This is an undirected edge, but we still call them "left" and "right"
101*2abb3134SXin Li // because the partition of "left" must be less than that of "right".
102*2abb3134SXin Li //
103*2abb3134SXin Li // NOTE: To reduce the size further, we could have a NodePool, and then typedef
104*2abb3134SXin Li // uint16_t NodeId.  Edge and Path can both use a 2 byte NodeId instead of a 4
105*2abb3134SXin Li // byte Node.  ToString() can take the NodePool for pretty printing.
106*2abb3134SXin Li //
107*2abb3134SXin Li // This will be better for the EnumeratePaths stage, but it will be
108*2abb3134SXin Li // worse for the CheckForCliques stage (doing the lookups may reduce memory
109*2abb3134SXin Li // locality).
110*2abb3134SXin Li 
111*2abb3134SXin Li struct Edge {
112*2abb3134SXin Li   Node left;
113*2abb3134SXin Li   Node right;
114*2abb3134SXin Li 
115*2abb3134SXin Li   // for debugging only
ToStringEdge116*2abb3134SXin Li   string ToString() const {
117*2abb3134SXin Li     return left.ToString() + " - " + right.ToString();
118*2abb3134SXin Li   }
119*2abb3134SXin Li };
120*2abb3134SXin Li 
121*2abb3134SXin Li // Implement hash and equality functors for unordered_set.
122*2abb3134SXin Li struct EdgeHash {
operator ()EdgeHash123*2abb3134SXin Li   int operator() (const Edge& edge) const {
124*2abb3134SXin Li     // DJB hash
125*2abb3134SXin Li     int h = 5381;
126*2abb3134SXin Li     h = (h << 5) + h + NodeHash()(edge.left);
127*2abb3134SXin Li     h = (h << 5) + h + NodeHash()(edge.right);
128*2abb3134SXin Li     return h;
129*2abb3134SXin Li   }
130*2abb3134SXin Li };
131*2abb3134SXin Li 
132*2abb3134SXin Li struct EdgeEq {
operator ()EdgeEq133*2abb3134SXin Li   bool operator() (const Edge& x, const Edge& y) const {
134*2abb3134SXin Li     // TODO: optimize to 8 byte comparison with memcmp(&x, &y, sizeof(Edge))?
135*2abb3134SXin Li     // This is in the inner loop for removing cadidates.
136*2abb3134SXin Li     return NodeEq()(x.left, y.left) && NodeEq()(x.right, y.right);
137*2abb3134SXin Li   }
138*2abb3134SXin Li };
139*2abb3134SXin Li 
140*2abb3134SXin Li typedef unordered_set<Edge, EdgeHash, EdgeEq> EdgeSet;
141*2abb3134SXin Li 
142*2abb3134SXin Li // The full graph.  It is k-partite, which can be seen by the node naming
143*2abb3134SXin Li // convention.
144*2abb3134SXin Li struct Graph {
145*2abb3134SXin Li   int num_partitions;
146*2abb3134SXin Li   vector<Edge> edges;
147*2abb3134SXin Li };
148*2abb3134SXin Li 
149*2abb3134SXin Li // Given a Node, look up Nodes in the adjacent partition that it is connected
150*2abb3134SXin Li // to.
151*2abb3134SXin Li typedef unordered_map<Node, vector<Node>, NodeHash, NodeEq> Adjacency;
152*2abb3134SXin Li 
153*2abb3134SXin Li // for debugging only
AdjacencyToString(const Adjacency & a)154*2abb3134SXin Li string AdjacencyToString(const Adjacency& a) {
155*2abb3134SXin Li   string s;
156*2abb3134SXin Li   for (auto& kv : a) {
157*2abb3134SXin Li     s += kv.first.ToString();
158*2abb3134SXin Li     s += " : <";
159*2abb3134SXin Li     for (auto& node : kv.second) {
160*2abb3134SXin Li       s += node.ToString();
161*2abb3134SXin Li       s += " ";
162*2abb3134SXin Li     }
163*2abb3134SXin Li     s += ">  ";
164*2abb3134SXin Li   }
165*2abb3134SXin Li   return s;
166*2abb3134SXin Li }
167*2abb3134SXin Li 
168*2abb3134SXin Li // Subgraph where only edges between adjacent partitions are included.
169*2abb3134SXin Li //
170*2abb3134SXin Li // We have k partitions, numbered 0 to k-1.  This means we have k-1 "columns",
171*2abb3134SXin Li // numbered 0 to k-2.
172*2abb3134SXin Li //
173*2abb3134SXin Li // A column is subgraph containing edges between adjacent partitions of the
174*2abb3134SXin Li // k-partite graph.
175*2abb3134SXin Li //
176*2abb3134SXin Li // The ColumnSubgraph class represents ALL columns (and is itself a subgraph).
177*2abb3134SXin Li 
178*2abb3134SXin Li class ColumnSubgraph {
179*2abb3134SXin Li  public:
ColumnSubgraph(int num_columns)180*2abb3134SXin Li   explicit ColumnSubgraph(int num_columns)
181*2abb3134SXin Li       : num_columns_(num_columns),
182*2abb3134SXin Li         adj_list_(new Adjacency[num_columns]) {
183*2abb3134SXin Li   }
~ColumnSubgraph()184*2abb3134SXin Li   ~ColumnSubgraph() {
185*2abb3134SXin Li     delete[] adj_list_;
186*2abb3134SXin Li   }
AddEdge(Edge e)187*2abb3134SXin Li   void AddEdge(Edge e) {
188*2abb3134SXin Li     int part = e.left.partition;
189*2abb3134SXin Li     assert(part < num_columns_);
190*2abb3134SXin Li 
191*2abb3134SXin Li     adj_list_[part][e.left].push_back(e.right);
192*2abb3134SXin Li   }
GetColumn(int part,vector<Edge> * out) const193*2abb3134SXin Li   void GetColumn(int part, vector<Edge>* out) const {
194*2abb3134SXin Li     const Adjacency& a = adj_list_[part];
195*2abb3134SXin Li     for (auto& kv : a) {
196*2abb3134SXin Li       for (auto& right : kv.second) {
197*2abb3134SXin Li         Edge e;
198*2abb3134SXin Li         e.left = kv.first;
199*2abb3134SXin Li         e.right = right;
200*2abb3134SXin Li         out->push_back(e);
201*2abb3134SXin Li       }
202*2abb3134SXin Li     }
203*2abb3134SXin Li   }
204*2abb3134SXin Li   // Get the nodes in the next partition adjacent to node N
GetAdjacentNodes(Node n,vector<Node> * out) const205*2abb3134SXin Li   void GetAdjacentNodes(Node n, vector<Node>* out) const {
206*2abb3134SXin Li     int part = n.partition;
207*2abb3134SXin Li     const Adjacency& a = adj_list_[part];
208*2abb3134SXin Li 
209*2abb3134SXin Li     // log("GetAdjacentNodes %s, part %d", n.ToString().c_str(), part);
210*2abb3134SXin Li 
211*2abb3134SXin Li     auto it = a.find(n);
212*2abb3134SXin Li     if (it == a.end()) {
213*2abb3134SXin Li       return;
214*2abb3134SXin Li     }
215*2abb3134SXin Li     // TODO: it would be better not to copy these.
216*2abb3134SXin Li     for (auto node : it->second) {
217*2abb3134SXin Li       out->push_back(node);
218*2abb3134SXin Li     }
219*2abb3134SXin Li   }
220*2abb3134SXin Li 
221*2abb3134SXin Li   // accessor
num_columns() const222*2abb3134SXin Li   int num_columns() const { return num_columns_; }
223*2abb3134SXin Li 
224*2abb3134SXin Li   // for debugging only
ToString() const225*2abb3134SXin Li   string ToString() const {
226*2abb3134SXin Li     string s("[\n");
227*2abb3134SXin Li     char buf[100];
228*2abb3134SXin Li     for (int i = 0; i < num_columns_; ++i) {
229*2abb3134SXin Li       const Adjacency& a = adj_list_[i];
230*2abb3134SXin Li       snprintf(buf, sizeof(buf), "%d (%zu) ", i, a.size());
231*2abb3134SXin Li       s += string(buf);
232*2abb3134SXin Li       s += AdjacencyToString(a);
233*2abb3134SXin Li       s += "\n";
234*2abb3134SXin Li     }
235*2abb3134SXin Li     s += " ]";
236*2abb3134SXin Li     return s;
237*2abb3134SXin Li   }
238*2abb3134SXin Li 
239*2abb3134SXin Li  private:
240*2abb3134SXin Li   int num_columns_;
241*2abb3134SXin Li   // Adjacency list.  An array of k-1 maps.
242*2abb3134SXin Li   // Lookup goes from nodes in partition i to nodes in partition i+1.
243*2abb3134SXin Li   Adjacency* adj_list_;
244*2abb3134SXin Li };
245*2abb3134SXin Li 
BuildColumnSubgraph(const Graph & g,ColumnSubgraph * a)246*2abb3134SXin Li void BuildColumnSubgraph(const Graph& g, ColumnSubgraph* a) {
247*2abb3134SXin Li   for (const auto& e : g.edges) {
248*2abb3134SXin Li     if (e.left.partition + 1 == e.right.partition) {
249*2abb3134SXin Li       a->AddEdge(e);
250*2abb3134SXin Li     }
251*2abb3134SXin Li   }
252*2abb3134SXin Li }
253*2abb3134SXin Li 
254*2abb3134SXin Li // A 2D array of paths.  It's an array because all paths are the same length.
255*2abb3134SXin Li // We use a single vector<> to represent it, to reduce memory allocation.
256*2abb3134SXin Li class PathArray {
257*2abb3134SXin Li  public:
PathArray(int path_length)258*2abb3134SXin Li   explicit PathArray(int path_length)
259*2abb3134SXin Li      : path_length_(path_length),
260*2abb3134SXin Li        num_paths_(0) {
261*2abb3134SXin Li   }
AddEdgeAsPath(Edge e)262*2abb3134SXin Li   void AddEdgeAsPath(Edge e) {
263*2abb3134SXin Li     // Can only initialize PathArray with edges when path length is 2
264*2abb3134SXin Li     assert(path_length_ == 2);
265*2abb3134SXin Li 
266*2abb3134SXin Li     nodes_.push_back(e.left);
267*2abb3134SXin Li     nodes_.push_back(e.right);
268*2abb3134SXin Li     num_paths_++;
269*2abb3134SXin Li   }
LastNodeInPath(int index) const270*2abb3134SXin Li   Node LastNodeInPath(int index) const {
271*2abb3134SXin Li     int start = index * path_length_;
272*2abb3134SXin Li     return nodes_[start + path_length_ -1];
273*2abb3134SXin Li   }
274*2abb3134SXin Li   // Pretty print a single path in this array.  For debugging only.
PathDebugString(int index) const275*2abb3134SXin Li   string PathDebugString(int index) const {
276*2abb3134SXin Li     string s("[ ");
277*2abb3134SXin Li     for (int i = index * path_length_; i < (index + 1) * path_length_; ++i) {
278*2abb3134SXin Li       s += nodes_[i].ToString();
279*2abb3134SXin Li       s += " - ";
280*2abb3134SXin Li     }
281*2abb3134SXin Li     s += " ]";
282*2abb3134SXin Li     return s;
283*2abb3134SXin Li   }
284*2abb3134SXin Li   // Print the word implied by the path.
PathAsString(int index) const285*2abb3134SXin Li   string PathAsString(int index) const {
286*2abb3134SXin Li     string s;
287*2abb3134SXin Li     for (int i = index * path_length_; i < (index + 1) * path_length_; ++i) {
288*2abb3134SXin Li       s += nodes_[i].ngram[0];
289*2abb3134SXin Li       s += nodes_[i].ngram[1];
290*2abb3134SXin Li     }
291*2abb3134SXin Li     return s;
292*2abb3134SXin Li   }
GetPathStart(int index) const293*2abb3134SXin Li   const Node* GetPathStart(int index) const {
294*2abb3134SXin Li     return &nodes_[index * path_length_];
295*2abb3134SXin Li   }
AddPath(const Node * start,int prefix_length,Node right)296*2abb3134SXin Li   void AddPath(const Node* start, int prefix_length, Node right) {
297*2abb3134SXin Li     // Make sure it is one less
298*2abb3134SXin Li     assert(prefix_length == path_length_-1);
299*2abb3134SXin Li 
300*2abb3134SXin Li     // TODO: replace with memcpy?  Is it faster?
301*2abb3134SXin Li     for (int i = 0; i < prefix_length; ++i) {
302*2abb3134SXin Li       nodes_.push_back(start[i]);
303*2abb3134SXin Li     }
304*2abb3134SXin Li     nodes_.push_back(right);
305*2abb3134SXin Li     num_paths_++;
306*2abb3134SXin Li   }
307*2abb3134SXin Li 
308*2abb3134SXin Li   // accessors
num_paths() const309*2abb3134SXin Li   int num_paths() const { return num_paths_; }
path_length() const310*2abb3134SXin Li   int path_length() const { return path_length_; }
311*2abb3134SXin Li 
312*2abb3134SXin Li  private:
313*2abb3134SXin Li   int path_length_;
314*2abb3134SXin Li   int num_paths_;
315*2abb3134SXin Li   vector<Node> nodes_;
316*2abb3134SXin Li };
317*2abb3134SXin Li 
318*2abb3134SXin Li // Given a PathArray of length i, produce one of length i+1.
319*2abb3134SXin Li //
320*2abb3134SXin Li // NOTE: It would be more efficient to filter 'right_nodes' here, and only add
321*2abb3134SXin Li // a new path if it forms a "partial clique" (at step i+1).  This amounts to
322*2abb3134SXin Li // doing the membership tests in edge_set for each "column", instead of waiting
323*2abb3134SXin Li // until the end.
324*2abb3134SXin Li //
325*2abb3134SXin Li // This will reduce the exponential blowup of EnumeratePaths (although it
326*2abb3134SXin Li // doesn't change the worst case).
327*2abb3134SXin Li 
EnumerateStep(const ColumnSubgraph & subgraph,const PathArray & in,PathArray * out)328*2abb3134SXin Li void EnumerateStep(
329*2abb3134SXin Li     const ColumnSubgraph& subgraph, const PathArray& in, PathArray* out) {
330*2abb3134SXin Li 
331*2abb3134SXin Li   int prefix_length = in.path_length();
332*2abb3134SXin Li 
333*2abb3134SXin Li   for (int i = 0; i < in.num_paths(); ++i) {
334*2abb3134SXin Li     // log("col %d, path %d", col, i);
335*2abb3134SXin Li 
336*2abb3134SXin Li     // last node in every path
337*2abb3134SXin Li     Node last_node = in.LastNodeInPath(i);
338*2abb3134SXin Li 
339*2abb3134SXin Li     // TODO: avoid copying of nodes?
340*2abb3134SXin Li     vector<Node> right_nodes;
341*2abb3134SXin Li     subgraph.GetAdjacentNodes(last_node, &right_nodes);
342*2abb3134SXin Li 
343*2abb3134SXin Li     // Get a pointer to the start of the path
344*2abb3134SXin Li     const Node* start = in.GetPathStart(i);
345*2abb3134SXin Li 
346*2abb3134SXin Li     for (Node right : right_nodes) {
347*2abb3134SXin Li       out->AddPath(start, prefix_length, right);
348*2abb3134SXin Li     }
349*2abb3134SXin Li   }
350*2abb3134SXin Li }
351*2abb3134SXin Li 
352*2abb3134SXin Li // Given a the column subgraph, produce an array of all possible paths of
353*2abb3134SXin Li // length k.  These will be subsequently checked to see if they are cliques.
EnumeratePaths(const ColumnSubgraph & subgraph,PathArray * candidates)354*2abb3134SXin Li void EnumeratePaths(
355*2abb3134SXin Li     const ColumnSubgraph& subgraph, PathArray* candidates) {
356*2abb3134SXin Li   // edges between partitions 0 and 1, a "column" of edges
357*2abb3134SXin Li   vector<Edge> edges0;
358*2abb3134SXin Li   subgraph.GetColumn(0, &edges0);
359*2abb3134SXin Li 
360*2abb3134SXin Li   int num_columns = subgraph.num_columns();
361*2abb3134SXin Li   PathArray** arrays = new PathArray*[num_columns];
362*2abb3134SXin Li 
363*2abb3134SXin Li   // Initialize using column 0.
364*2abb3134SXin Li   int path_length = 2;
365*2abb3134SXin Li   arrays[0] = new PathArray(path_length);
366*2abb3134SXin Li   for (auto& e : edges0) {
367*2abb3134SXin Li     arrays[0]->AddEdgeAsPath(e);
368*2abb3134SXin Li   }
369*2abb3134SXin Li 
370*2abb3134SXin Li   // Iterate over columns 1 to k-1.
371*2abb3134SXin Li   for (int i = 1; i < num_columns; ++i) {
372*2abb3134SXin Li     log("--- Column %d", i);
373*2abb3134SXin Li 
374*2abb3134SXin Li     path_length++;
375*2abb3134SXin Li     if (i == num_columns - 1) {
376*2abb3134SXin Li       arrays[i] = candidates;  // final result, from output argument!
377*2abb3134SXin Li     } else {
378*2abb3134SXin Li       arrays[i] = new PathArray(path_length);  // intermediate result
379*2abb3134SXin Li     }
380*2abb3134SXin Li     PathArray* in = arrays[i - 1];
381*2abb3134SXin Li     PathArray* out = arrays[i];
382*2abb3134SXin Li 
383*2abb3134SXin Li     EnumerateStep(subgraph, *in, out);
384*2abb3134SXin Li 
385*2abb3134SXin Li     log("in num paths: %d", in->num_paths());
386*2abb3134SXin Li     log("out num paths: %d", out->num_paths());
387*2abb3134SXin Li 
388*2abb3134SXin Li     // We create an destroy a PathArray on every iteration.  On each
389*2abb3134SXin Li     // iteration, the PathArray grows both rows and columns, so it's hard to
390*2abb3134SXin Li     // avoid this.
391*2abb3134SXin Li     delete in;
392*2abb3134SXin Li   }
393*2abb3134SXin Li }
394*2abb3134SXin Li 
395*2abb3134SXin Li // Inserts the path number 'p' in incomplete if the path is not a complete
396*2abb3134SXin Li // subgraph.
IsClique(const Node * path,int k,const EdgeSet & edge_set)397*2abb3134SXin Li bool IsClique(const Node* path, int k, const EdgeSet& edge_set) {
398*2abb3134SXin Li   // We need to ensure that (k choose 2) edges are all in edge_set.
399*2abb3134SXin Li   // We already know that k-1 of them are present, so we need to check (k
400*2abb3134SXin Li   // choose 2) - (k-1).
401*2abb3134SXin Li   for (int i = 0; i < k; ++i) {
402*2abb3134SXin Li     for (int j = i + 1; j < k; ++j) {
403*2abb3134SXin Li       if (i + 1 == j) {
404*2abb3134SXin Li         // Already know this edge exists.  NOTE: does this even speed things
405*2abb3134SXin Li         // up?  It's a branch in the middle of an inner loop.
406*2abb3134SXin Li         continue;
407*2abb3134SXin Li       }
408*2abb3134SXin Li       Edge e;
409*2abb3134SXin Li       e.left = path[i];
410*2abb3134SXin Li       e.right = path[j];
411*2abb3134SXin Li       if (edge_set.find(e) == edge_set.end()) {
412*2abb3134SXin Li         log("Didn't find edge %s", e.ToString().c_str());
413*2abb3134SXin Li         return false;
414*2abb3134SXin Li       }
415*2abb3134SXin Li     }
416*2abb3134SXin Li   }
417*2abb3134SXin Li   return true;
418*2abb3134SXin Li }
419*2abb3134SXin Li 
CheckForCliques(const PathArray & candidates,const EdgeSet & edge_set,unordered_set<int> * incomplete)420*2abb3134SXin Li void CheckForCliques(const PathArray& candidates,
421*2abb3134SXin Li                      const EdgeSet& edge_set,
422*2abb3134SXin Li                      unordered_set<int>* incomplete) {
423*2abb3134SXin Li   int k = candidates.path_length();
424*2abb3134SXin Li   for (int p = 0; p < candidates.num_paths(); ++p) {
425*2abb3134SXin Li     const Node* path = candidates.GetPathStart(p);
426*2abb3134SXin Li     // NOTE: We could run many IsClique invocations in parallel.  It reads from
427*2abb3134SXin Li     // edge_set.  The different 'incomplete' sets can be merged.
428*2abb3134SXin Li     if (!IsClique(path, k, edge_set)) {
429*2abb3134SXin Li       incomplete->insert(p);
430*2abb3134SXin Li       return;  // IMPORTANT: early return
431*2abb3134SXin Li     }
432*2abb3134SXin Li   }
433*2abb3134SXin Li }
434*2abb3134SXin Li 
435*2abb3134SXin Li // Parse text on stdin into a graph, and do some validation.
ParseGraph(Graph * g,EdgeSet * edge_set)436*2abb3134SXin Li bool ParseGraph(Graph* g, EdgeSet* edge_set) {
437*2abb3134SXin Li   // NOTE: It's possible that there NO k-cliques.
438*2abb3134SXin Li 
439*2abb3134SXin Li   int ret = fscanf(stdin, "num_partitions %d\n", &(g->num_partitions));
440*2abb3134SXin Li   if (ret != 1) {
441*2abb3134SXin Li     log("ERROR: Expected 'num_partitions <integer>'\n");
442*2abb3134SXin Li     return false;
443*2abb3134SXin Li   }
444*2abb3134SXin Li   log("num_partitions = %d", g->num_partitions);
445*2abb3134SXin Li 
446*2abb3134SXin Li   int ngram_size;
447*2abb3134SXin Li   ret = fscanf(stdin, "ngram_size %d\n", &ngram_size);
448*2abb3134SXin Li   if (ret != 1) {
449*2abb3134SXin Li     log("ERROR: Expected 'ngram_size <integer>'\n");
450*2abb3134SXin Li     return false;
451*2abb3134SXin Li   }
452*2abb3134SXin Li   if (ngram_size != 2) {
453*2abb3134SXin Li     log("ERROR: Only bigrams are currently supported (got n = %d)\n", ngram_size);
454*2abb3134SXin Li     return false;
455*2abb3134SXin Li   }
456*2abb3134SXin Li 
457*2abb3134SXin Li   int num_edges = 0;
458*2abb3134SXin Li   while (true) {
459*2abb3134SXin Li     int part1, part2;
460*2abb3134SXin Li     char c1, c2, c3, c4;
461*2abb3134SXin Li     int ret = fscanf(stdin, "edge %d.%c%c %d.%c%c\n",
462*2abb3134SXin Li                      &part1, &c1, &c2, &part2, &c3, &c4);
463*2abb3134SXin Li     if (ret == EOF) {
464*2abb3134SXin Li       log("Read %d edges", num_edges);
465*2abb3134SXin Li       break;
466*2abb3134SXin Li     }
467*2abb3134SXin Li     if (ret != 6) {
468*2abb3134SXin Li       log("ERROR: Expected 6 values for edge, got %d", ret);
469*2abb3134SXin Li       return false;
470*2abb3134SXin Li     }
471*2abb3134SXin Li     // log("%d -> %d", part1, part2);
472*2abb3134SXin Li     if (part1 >= part2) {
473*2abb3134SXin Li       log("ERROR: edge in wrong order (%d >= %d)", part1, part2);
474*2abb3134SXin Li       return false;
475*2abb3134SXin Li     }
476*2abb3134SXin Li 
477*2abb3134SXin Li     Edge e;
478*2abb3134SXin Li     e.left.partition = part1;
479*2abb3134SXin Li     e.left.ngram[0] = c1;
480*2abb3134SXin Li     e.left.ngram[1] = c2;
481*2abb3134SXin Li 
482*2abb3134SXin Li     e.right.partition = part2;
483*2abb3134SXin Li     e.right.ngram[0] = c3;
484*2abb3134SXin Li     e.right.ngram[1] = c4;
485*2abb3134SXin Li 
486*2abb3134SXin Li     g->edges.push_back(e);
487*2abb3134SXin Li 
488*2abb3134SXin Li     // For lookup in CheckForCliques
489*2abb3134SXin Li     edge_set->insert(e);
490*2abb3134SXin Li 
491*2abb3134SXin Li     num_edges++;
492*2abb3134SXin Li   }
493*2abb3134SXin Li   return true;
494*2abb3134SXin Li }
495*2abb3134SXin Li 
main()496*2abb3134SXin Li int main() {
497*2abb3134SXin Li   log("sizeof(Node) = %zu", sizeof(Node));
498*2abb3134SXin Li   log("sizeof(Edge) = %zu", sizeof(Edge));
499*2abb3134SXin Li   // This should be true no matter what platform we use, e.g. since we use
500*2abb3134SXin Li   // uint16_t.
501*2abb3134SXin Li   assert(sizeof(Node) == 4);
502*2abb3134SXin Li   assert(sizeof(Edge) == 8);
503*2abb3134SXin Li 
504*2abb3134SXin Li   Graph g;
505*2abb3134SXin Li   EdgeSet edge_set;
506*2abb3134SXin Li 
507*2abb3134SXin Li   log("ParseGraph");
508*2abb3134SXin Li   if (!ParseGraph(&g, &edge_set)) {
509*2abb3134SXin Li     log("Fatal error parsing graph.");
510*2abb3134SXin Li     return 1;
511*2abb3134SXin Li   }
512*2abb3134SXin Li 
513*2abb3134SXin Li   // If there are k partitions, there are k-1 edge "columns".
514*2abb3134SXin Li   ColumnSubgraph subgraph(g.num_partitions - 1);
515*2abb3134SXin Li   log("BuildColumnSubgraph");
516*2abb3134SXin Li   BuildColumnSubgraph(g, &subgraph);
517*2abb3134SXin Li   log("%s", subgraph.ToString().c_str());
518*2abb3134SXin Li 
519*2abb3134SXin Li   // PathArray candidates(num_partitions);
520*2abb3134SXin Li   log("EnumeratePaths");
521*2abb3134SXin Li   PathArray candidates(g.num_partitions);
522*2abb3134SXin Li   EnumeratePaths(subgraph, &candidates);
523*2abb3134SXin Li 
524*2abb3134SXin Li   log("EnumeratePaths produced %d candidates", candidates.num_paths());
525*2abb3134SXin Li   for (int i = 0; i < candidates.num_paths(); ++i) {
526*2abb3134SXin Li     log("%d %s", i, candidates.PathDebugString(i).c_str());
527*2abb3134SXin Li   }
528*2abb3134SXin Li 
529*2abb3134SXin Li   // array of indices of incomplete paths, i.e. paths that are not complete
530*2abb3134SXin Li   // subgraphs
531*2abb3134SXin Li   log("CheckForCliques");
532*2abb3134SXin Li   unordered_set<int> incomplete;
533*2abb3134SXin Li   CheckForCliques(candidates, edge_set, &incomplete);
534*2abb3134SXin Li   for (auto p : incomplete) {
535*2abb3134SXin Li     log("Path %d is incomplete", p);
536*2abb3134SXin Li   }
537*2abb3134SXin Li 
538*2abb3134SXin Li   log("Found the following cliques/words:");
539*2abb3134SXin Li   // Now print all the complete ones to stdout
540*2abb3134SXin Li   for (int i = 0; i < candidates.num_paths(); i++) {
541*2abb3134SXin Li     if (incomplete.find(i) == incomplete.end()) {
542*2abb3134SXin Li       log("%d %s", i, candidates.PathAsString(i).c_str());
543*2abb3134SXin Li     }
544*2abb3134SXin Li   }
545*2abb3134SXin Li   log("Done");
546*2abb3134SXin Li }
547