1#!/usr/bin/env python3 2# Copyright 2020 The Pigweed Authors 3# 4# Licensed under the Apache License, Version 2.0 (the "License"); you may not 5# use this file except in compliance with the License. You may obtain a copy of 6# the License at 7# 8# https://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13# License for the specific language governing permissions and limitations under 14# the License. 15"""Generates a C macro for the PW tokenizer 65599 fixed length hash.""" 16 17import datetime 18import os 19 20HASH_CONSTANT = 65599 21HASH_NAME = 'pw_tokenizer_65599_fixed_length' 22HASH_LENGTHS = 80, 96, 128, 256 23 24FILE_HEADER = """\ 25// Copyright {year} The Pigweed Authors 26// 27// Licensed under the Apache License, Version 2.0 (the "License"); you may not 28// use this file except in compliance with the License. You may obtain a copy of 29// the License at 30// 31// https://www.apache.org/licenses/LICENSE-2.0 32// 33// Unless required by applicable law or agreed to in writing, software 34// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 35// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 36// License for the specific language governing permissions and limitations under 37// the License. 38 39// AUTOGENERATED - DO NOT EDIT 40// 41// This file was generated by {script}. 42// To make changes, update the script and run it to regenerate the files. 43#pragma once 44 45#include <stdint.h> 46 47// {hash_length}-character version of the tokenizer hash function. 48// 49// The argument must be a string literal. It is concatenated with "" to ensure 50// that this is the case. 51// 52// clang-format off 53 54""" 55 56 57def generate_pw_tokenizer_65599_fixed_length_hash_macro(hash_length): 58 """Generate macro that hashes a string literal using a modified x65599 hash. 59 60 The macros generated by this function only operate on string literals. 61 62 Since macros can only operate on fixed-length strings, the hash macro only 63 hashes up to a fixed length, and characters beyond that length are ignored. 64 To eliminate some collisions, the length of the string is hashed as if it 65 were the first character. 66 67 This hash is calculated with the following equation, where s is the string 68 and k is the maximum hash length: 69 70 H(s, k) = len(s) + 65599 * s[0] + 65599^2 * s[1] + ... + 65599^k * s[k-1] 71 72 The hash algorithm is a modified version of the x65599 hash used by the SDBM 73 open source project. This hash has the following differences from x65599: 74 - Characters are only hashed up to a fixed maximum string length. 75 - Characters are hashed in reverse order. 76 - The string length is hashed as the first character in the string. 77 78 The code generated by this function is intentionally sparse. Each character 79 appears hash_length times per log message, so using fewer characters results 80 in faster compilation times. 81 82 Args: 83 hash_length: maximum string size to hash; extra characters are ignored 84 85 Returns: 86 the macro header file as a string 87 """ 88 89 first_hash_term = ( 90 '(uint32_t)(sizeof(str "") - 1 + ' 91 '/* The argument must be a string literal. */ \\\n' 92 ) 93 94 # Use this to add the aligned backslash at the end of the macro lines. 95 line_format = '{{:<{}}}\\\n'.format(len(first_hash_term)) 96 97 lines = [ 98 FILE_HEADER.format( 99 script=os.path.basename(__file__), 100 hash_length=hash_length, 101 year=datetime.date.today().year, 102 ) 103 ] 104 105 lines.append( 106 line_format.format( 107 '#define {}_{}_HASH(str)'.format(HASH_NAME.upper(), hash_length) 108 ) 109 ) 110 lines.append(' ' + first_hash_term) # add indendation and the macro line 111 112 indent = ' ' * len(' (uint32_t)(') 113 coefficient_format = '0x{coefficient:0>8x}ull' 114 115 # The string will have at least a null terminator 116 lines.append( 117 line_format.format( 118 '{}0x{:0>8x}ull * (uint8_t)str[0] +'.format(indent, HASH_CONSTANT) 119 ) 120 ) 121 122 # Format string to use for the remaining terms. 123 term_format = ( 124 '{indent}{coefficient} * ' 125 '(uint8_t)({index} < sizeof(str) ? str[{index}] : 0) +' 126 ).format( 127 indent=indent, 128 coefficient=coefficient_format, 129 index='{{index:>{}}}'.format(len(str(hash_length - 1))), 130 ) 131 132 for i in range(1, hash_length): 133 coefficient = HASH_CONSTANT ** (i + 1) % 2**32 134 term = term_format.format(index=i, coefficient=coefficient) 135 lines.append(line_format.format(term)) 136 137 # Remove the extra + and \ and add the closing ) 138 lines[-1] = lines[-1].rstrip(' +\\\n') + ')' 139 140 lines.append('\n\n// clang-format on\n') 141 142 return ''.join(lines) 143 144 145def _main(): 146 base = os.path.abspath( 147 os.path.join( 148 os.path.dirname(__file__), 149 '..', 150 'public', 151 'pw_tokenizer', 152 'internal', 153 ) 154 ) 155 156 # Generate macros for hashes of the specified lengths. 157 for hash_length in HASH_LENGTHS: 158 path = os.path.join( 159 base, '{}_{}_hash_macro.h'.format(HASH_NAME, hash_length) 160 ) 161 162 with open(path, 'w') as header_file: 163 header_file.write( 164 generate_pw_tokenizer_65599_fixed_length_hash_macro(hash_length) 165 ) 166 167 print( 168 'Generated {}-character hash macro at {}'.format(hash_length, path) 169 ) 170 171 172if __name__ == '__main__': 173 _main() 174