1#!/usr/bin/env python3 2# Copyright 2014 The Chromium Authors 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6""" 7A Deterministic acyclic finite state automaton (DAFSA) is a compact 8representation of an unordered word list (dictionary). 9 10https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton 11 12This python program converts a list of strings to a byte array in C++. 13This python program fetches strings and return values from a gperf file 14and generates a C++ file with a byte array representing graph that can be 15used as a memory efficient replacement for the perfect hash table. 16 17The input strings are assumed to consist of printable 7-bit ASCII characters 18and the return values are assumed to be one digit integers. 19 20In this program a DAFSA is a diamond shaped graph starting at a common 21source node and ending at a common sink node. All internal nodes contain 22a label and each word is represented by the labels in one path from 23the source node to the sink node. 24 25The following python represention is used for nodes: 26 27 Source node: [ children ] 28 Internal node: (label, [ children ]) 29 Sink node: None 30 31The graph is first compressed by prefixes like a trie. In the next step 32suffixes are compressed so that the graph gets diamond shaped. Finally 33one to one linked nodes are replaced by nodes with the labels joined. 34 35The order of the operations is crucial since lookups will be performed 36starting from the source with no backtracking. Thus a node must have at 37most one child with a label starting by the same character. The output 38is also arranged so that all jumps are to increasing addresses, thus forward 39in memory. 40 41The generated output has suffix free decoding so that the sign of leading 42bits in a link (a reference to a child node) indicate if it has a size of one, 43two or three bytes and if it is the last outgoing link from the actual node. 44A node label is terminated by a byte with the leading bit set. 45 46The generated byte array can described by the following BNF: 47 48<byte> ::= < 8-bit value in range [0x00-0xFF] > 49 50<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > 51<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > 52<return value> ::= < value + 0x80, byte in range [0x80-0x8F] > 53 54<offset1> ::= < byte in range [0x00-0x3F] > 55<offset2> ::= < byte in range [0x40-0x5F] > 56<offset3> ::= < byte in range [0x60-0x7F] > 57 58<end_offset1> ::= < byte in range [0x80-0xBF] > 59<end_offset2> ::= < byte in range [0xC0-0xDF] > 60<end_offset3> ::= < byte in range [0xE0-0xFF] > 61 62<prefix> ::= <char> 63 64<label> ::= <end_char> 65 | <char> <label> 66 67<end_label> ::= <return_value> 68 | <char> <end_label> 69 70<offset> ::= <offset1> 71 | <offset2> <byte> 72 | <offset3> <byte> <byte> 73 74<end_offset> ::= <end_offset1> 75 | <end_offset2> <byte> 76 | <end_offset3> <byte> <byte> 77 78<offsets> ::= <end_offset> 79 | <offset> <offsets> 80 81<source> ::= <offsets> 82 83<node> ::= <label> <offsets> 84 | <prefix> <node> 85 | <end_label> 86 87<dafsa> ::= <source> 88 | <dafsa> <node> 89 90Decoding: 91 92<char> -> printable 7-bit ASCII character 93<end_char> & 0x7F -> printable 7-bit ASCII character 94<return value> & 0x0F -> integer 95<offset1 & 0x3F> -> integer 96((<offset2> & 0x1F>) << 8) + <byte> -> integer 97((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer 98 99end_offset1, end_offset2 and and_offset3 are decoded same as offset1, 100offset2 and offset3 respectively. 101 102The first offset in a list of offsets is the distance in bytes between the 103offset itself and the first child node. Subsequent offsets are the distance 104between previous child node and next child node. Thus each offset links a node 105to a child node. The distance is always counted between start addresses, i.e. 106first byte in decoded offset or first byte in child node. 107 108Example 1: 109 110%% 111aa, 1 112a, 2 113%% 114 115The input is first parsed to a list of words: 116["aa1", "a2"] 117 118A fully expanded graph is created from the words: 119source = [node1, node4] 120node1 = ("a", [node2]) 121node2 = ("a", [node3]) 122node3 = ("\x01", [sink]) 123node4 = ("a", [node5]) 124node5 = ("\x02", [sink]) 125sink = None 126 127Compression results in the following graph: 128source = [node1] 129node1 = ("a", [node2, node3]) 130node2 = ("\x02", [sink]) 131node3 = ("a\x01", [sink]) 132sink = None 133 134A C++ representation of the compressed graph is generated: 135 136const unsigned char dafsa[7] = { 137 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81, 138}; 139 140The bytes in the generated array has the following meaning: 141 142 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1 143 144 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a" 145 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4 146 147 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5 148 4: 0x82 <return_value> 0x82 & 0x0F -> return 2 149 150 5: 0x61 <char> label character 0x61 -> match "a" 151 6: 0x81 <return_value> 0x81 & 0x0F -> return 1 152 153Example 2: 154 155%% 156aa, 1 157bbb, 2 158baa, 1 159%% 160 161The input is first parsed to a list of words: 162["aa1", "bbb2", "baa1"] 163 164Compression results in the following graph: 165source = [node1, node2] 166node1 = ("b", [node2, node3]) 167node2 = ("aa\x01", [sink]) 168node3 = ("bb\x02", [sink]) 169sink = None 170 171A C++ representation of the compressed graph is generated: 172 173const unsigned char dafsa[11] = { 174 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82, 175}; 176 177The bytes in the generated array has the following meaning: 178 179 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2 180 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5 181 182 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b" 183 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5 184 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8 185 186 5: 0x61 <char> label character 0x61 -> match "a" 187 6: 0x61 <char> label character 0x61 -> match "a" 188 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 189 190 8: 0x62 <char> label character 0x62 -> match "b" 191 9: 0x62 <char> label character 0x62 -> match "b" 19210: 0x82 <return_value> 0x82 & 0x0F -> return 2 193""" 194 195import argparse 196import sys 197 198from typing import Any, Dict, FrozenSet, Iterable, List, MutableSequence, Sequence, Tuple, Union 199 200# Use of Any below is because mypy doesn't support recursive types. 201SinkNode = Union[None, None] # weird hack to get around lack of TypeAlias. 202InteriorNode = Tuple[str, List[Any]] 203SourceNode = List[Any] 204 205NonSinkNode = Union[InteriorNode, SourceNode] 206Node = Union[SinkNode, InteriorNode, SourceNode] 207DAFSA = List[Node] 208 209 210class InputError(Exception): 211 """Exception raised for errors in the input file.""" 212 213 214def to_dafsa(words: Iterable[str]) -> DAFSA: 215 """Generates a DAFSA from a word list and returns the source nodes. 216 217 Each word is split into characters so that each character is represented by 218 a unique node. It is assumed the word list is not empty. 219 """ 220 if not words: 221 raise InputError('The domain list must not be empty') 222 223 def ToNodes(word: str) -> Node: 224 """Split words into characters""" 225 if not 0x1F < ord(word[0]) < 0x80: 226 raise InputError('Domain names must be printable 7-bit ASCII') 227 if len(word) == 1: 228 return chr(ord(word[0]) & 0x0F), [None] 229 return word[0], [ToNodes(word[1:])] 230 return [ToNodes(word) for word in words] 231 232 233def to_words(node: Node) -> Iterable[str]: 234 """Generates a word list from all paths starting from an internal node.""" 235 if not node: 236 return [''] 237 return [(node[0] + word) for child in node[1] for word in to_words(child)] 238 239 240def reverse(dafsa: DAFSA) -> DAFSA: 241 """Generates a new DAFSA that is reversed, so that the old sink node becomes 242 the new source node. 243 """ 244 sink: SourceNode = [] 245 nodemap: Dict[int, InteriorNode] = {} 246 247 def dfs(node: Node, parent: Node) -> None: 248 """Creates reverse nodes. 249 250 A new reverse node will be created for each old node. The new node will 251 get a reversed label and the parents of the old node as children. 252 """ 253 if not node: 254 sink.append(parent) 255 elif id(node) not in nodemap: 256 nodemap[id(node)] = (node[0][::-1], [parent]) 257 for child in node[1]: 258 dfs(child, nodemap[id(node)]) 259 else: 260 nodemap[id(node)][1].append(parent) 261 262 for node in dafsa: 263 dfs(node, None) 264 return sink 265 266 267def join_labels(dafsa: DAFSA) -> DAFSA: 268 """Generates a new DAFSA where internal nodes are merged if there is a one to 269 one connection. 270 """ 271 parentcount: Dict[int, int] = {id(None): 2} 272 nodemap: Dict[int, Node] = {id(None): None} 273 274 def count_parents(node: Node) -> None: 275 """Count incoming references""" 276 if id(node) in parentcount: 277 parentcount[id(node)] += 1 278 else: 279 assert node is not None # parentcount statically contains `id(None)` 280 parentcount[id(node)] = 1 281 for child in node[1]: 282 count_parents(child) 283 284 def join(node: Node) -> Node: 285 """Create new nodes""" 286 if id(node) not in nodemap: 287 assert node is not None # nodemap statically contains `id(None)` 288 children = [join(child) for child in node[1]] 289 if len(children) == 1 and parentcount[id(node[1][0])] == 1: 290 child = children[0] 291 # parentcount statically maps `id(None)` to 2, so this child cannot be 292 # the sink. 293 assert child is not None 294 nodemap[id(node)] = (node[0] + child[0], child[1]) 295 else: 296 nodemap[id(node)] = (node[0], children) 297 return nodemap[id(node)] 298 299 for node in dafsa: 300 count_parents(node) 301 return [join(node) for node in dafsa] 302 303 304def join_suffixes(dafsa: DAFSA) -> DAFSA: 305 """Generates a new DAFSA where nodes that represent the same word lists 306 towards the sink are merged. 307 """ 308 nodemap: Dict[FrozenSet[str], Node] = {frozenset(('', )): None} 309 310 def join(node: Node) -> Node: 311 """Returns a matching node. A new node is created if no matching node 312 exists. The graph is accessed in dfs order. 313 """ 314 suffixes = frozenset(to_words(node)) 315 if suffixes not in nodemap: 316 # The only set of suffixes for the sink is {''}, which is statically 317 # contained in nodemap. 318 assert node is not None 319 nodemap[suffixes] = (node[0], [join(child) for child in node[1]]) 320 return nodemap[suffixes] 321 322 return [join(node) for node in dafsa] 323 324 325def top_sort(dafsa: DAFSA) -> Sequence[NonSinkNode]: 326 """Generates list of nodes in topological sort order.""" 327 # `incoming` contains the in-degree of every node except the sink. 328 incoming: Dict[int, int] = {} 329 330 def count_incoming(node: Node) -> None: 331 """Counts incoming references.""" 332 if node: 333 if id(node) not in incoming: 334 incoming[id(node)] = 1 335 for child in node[1]: 336 count_incoming(child) 337 else: 338 incoming[id(node)] += 1 339 340 for node in dafsa: 341 count_incoming(node) 342 343 for node in dafsa: 344 if node: 345 incoming[id(node)] -= 1 346 347 waiting: List[NonSinkNode] = [ 348 node for node in dafsa if node and incoming[id(node)] == 0 349 ] 350 nodes: List[NonSinkNode] = [] 351 352 while waiting: 353 node = waiting.pop() 354 assert incoming[id(node)] == 0 355 nodes.append(node) 356 for child in node[1]: 357 if child: 358 incoming[id(child)] -= 1 359 if incoming[id(child)] == 0: 360 waiting.append(child) 361 return nodes 362 363 364def encode_links(children: Sequence[Node], offsets: Dict[int, int], 365 current: int) -> Iterable[int]: 366 """Encodes a list of children as one, two or three byte offsets.""" 367 if not children[0]: 368 # This is an <end_label> node and no links follow such nodes 369 assert len(children) == 1 370 return [] 371 guess = 3 * len(children) 372 assert children 373 children = sorted(children, key = lambda x: -offsets[id(x)]) 374 while True: 375 offset = current + guess 376 buf: List[int] = [] 377 for child in children: 378 last = len(buf) 379 distance = offset - offsets[id(child)] 380 assert distance > 0 and distance < (1 << 21) 381 382 if distance < (1 << 6): 383 # A 6-bit offset: "s0xxxxxx" 384 buf.append(distance) 385 elif distance < (1 << 13): 386 # A 13-bit offset: "s10xxxxxxxxxxxxx" 387 buf.append(0x40 | (distance >> 8)) 388 buf.append(distance & 0xFF) 389 else: 390 # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx" 391 buf.append(0x60 | (distance >> 16)) 392 buf.append((distance >> 8) & 0xFF) 393 buf.append(distance & 0xFF) 394 # Distance in first link is relative to following record. 395 # Distance in other links are relative to previous link. 396 offset -= distance 397 if len(buf) == guess: 398 break 399 guess = len(buf) 400 # Set most significant bit to mark end of links in this node. 401 buf[last] |= (1 << 7) 402 buf.reverse() 403 return buf 404 405 406def encode_prefix(label: str) -> MutableSequence[int]: 407 """Encodes a node label as a list of bytes without a trailing high byte. 408 409 This method encodes a node if there is exactly one child and the 410 child follows immidiately after so that no jump is needed. This label 411 will then be a prefix to the label in the child node. 412 """ 413 assert label 414 return [ord(c) for c in reversed(label)] 415 416 417def encode_label(label: str) -> Iterable[int]: 418 """Encodes a node label as a list of bytes with a trailing high byte >0x80. 419 """ 420 buf = encode_prefix(label) 421 # Set most significant bit to mark end of label in this node. 422 buf[0] |= (1 << 7) 423 return buf 424 425 426def encode(dafsa: DAFSA) -> Sequence[int]: 427 """Encodes a DAFSA to a list of bytes""" 428 output: List[int] = [] 429 offsets: Dict[int, int] = {} 430 431 for node in reversed(top_sort(dafsa)): 432 if (len(node[1]) == 1 and node[1][0] and 433 (offsets[id(node[1][0])] == len(output))): 434 output.extend(encode_prefix(node[0])) 435 else: 436 output.extend(encode_links(node[1], offsets, len(output))) 437 output.extend(encode_label(node[0])) 438 offsets[id(node)] = len(output) 439 440 output.extend(encode_links(dafsa, offsets, len(output))) 441 output.reverse() 442 return output 443 444 445def to_cxx(data: Sequence[int]) -> str: 446 """Generates C++ code from a list of encoded bytes.""" 447 text = '/* This file is generated. DO NOT EDIT!\n\n' 448 text += 'The byte array encodes effective tld names. See make_dafsa.py for' 449 text += ' documentation.' 450 text += '*/\n\n' 451 text += 'const unsigned char kDafsa[%s] = {\n' % len(data) 452 for i in range(0, len(data), 12): 453 text += ' ' 454 text += ', '.join('0x%02x' % byte for byte in data[i:i + 12]) 455 text += ',\n' 456 text += '};\n' 457 return text 458 459 460def words_to_cxx(words: Iterable[str]) -> str: 461 """Generates C++ code from a word list""" 462 dafsa = to_dafsa(words) 463 for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): 464 dafsa = fun(dafsa) 465 return to_cxx(encode(dafsa)) 466 467 468def parse_gperf(infile: Iterable[str], reverse: bool) -> Iterable[str]: 469 """Parses gperf file and extract strings and return code""" 470 lines = [line.strip() for line in infile] 471 # Extract strings after the first '%%' and before the second '%%'. 472 begin = lines.index('%%') + 1 473 end = lines.index('%%', begin) 474 lines = lines[begin:end] 475 for line in lines: 476 if line[-3:-1] != ', ': 477 raise InputError('Expected "domainname, <digit>", found "%s"' % line) 478 # Technically the DAFSA format can support return values in the range 479 # [0-31], but only the first three bits have any defined meaning. 480 if not line.endswith(('0', '1', '2', '3', '4', '5', '6', '7')): 481 raise InputError('Expected value to be in the range of 0-7, found "%s"' % 482 line[-1]) 483 if reverse: 484 return [line[-4::-1] + line[-1] for line in lines] 485 else: 486 return [line[:-3] + line[-1] for line in lines] 487 488 489def main() -> int: 490 parser = argparse.ArgumentParser() 491 parser.add_argument('--reverse', action='store_const', const=True, 492 default=False) 493 parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), 494 default=sys.stdin) 495 parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), 496 default=sys.stdout) 497 args = parser.parse_args() 498 args.outfile.write(words_to_cxx(parse_gperf(args.infile, args.reverse))) 499 return 0 500 501 502if __name__ == '__main__': 503 sys.exit(main()) 504