1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouche// Package tokenizer converts a text into a stream of tokens. 16*46c4c49dSIbrahim Kanouchepackage tokenizer 17*46c4c49dSIbrahim Kanouche 18*46c4c49dSIbrahim Kanoucheimport ( 19*46c4c49dSIbrahim Kanouche "bytes" 20*46c4c49dSIbrahim Kanouche "fmt" 21*46c4c49dSIbrahim Kanouche "hash/crc32" 22*46c4c49dSIbrahim Kanouche "sort" 23*46c4c49dSIbrahim Kanouche "unicode" 24*46c4c49dSIbrahim Kanouche "unicode/utf8" 25*46c4c49dSIbrahim Kanouche) 26*46c4c49dSIbrahim Kanouche 27*46c4c49dSIbrahim Kanouche// Token is a non-whitespace sequence (i.e., word or punctuation) in the 28*46c4c49dSIbrahim Kanouche// original string. This is not meant for use outside of this package. 29*46c4c49dSIbrahim Kanouchetype token struct { 30*46c4c49dSIbrahim Kanouche Text string 31*46c4c49dSIbrahim Kanouche Offset int 32*46c4c49dSIbrahim Kanouche} 33*46c4c49dSIbrahim Kanouche 34*46c4c49dSIbrahim Kanouche// Tokens is a list of Token objects. 35*46c4c49dSIbrahim Kanouchetype Tokens []*token 36*46c4c49dSIbrahim Kanouche 37*46c4c49dSIbrahim Kanouche// newToken creates a new token object with an invalid (negative) offset, which 38*46c4c49dSIbrahim Kanouche// will be set before the token's used. 39*46c4c49dSIbrahim Kanouchefunc newToken() *token { 40*46c4c49dSIbrahim Kanouche return &token{Offset: -1} 41*46c4c49dSIbrahim Kanouche} 42*46c4c49dSIbrahim Kanouche 43*46c4c49dSIbrahim Kanouche// Tokenize converts a string into a stream of tokens. 44*46c4c49dSIbrahim Kanouchefunc Tokenize(s string) (toks Tokens) { 45*46c4c49dSIbrahim Kanouche tok := newToken() 46*46c4c49dSIbrahim Kanouche for i := 0; i < len(s); { 47*46c4c49dSIbrahim Kanouche r, size := utf8.DecodeRuneInString(s[i:]) 48*46c4c49dSIbrahim Kanouche switch { 49*46c4c49dSIbrahim Kanouche case unicode.IsSpace(r): 50*46c4c49dSIbrahim Kanouche if tok.Offset >= 0 { 51*46c4c49dSIbrahim Kanouche toks = append(toks, tok) 52*46c4c49dSIbrahim Kanouche tok = newToken() 53*46c4c49dSIbrahim Kanouche } 54*46c4c49dSIbrahim Kanouche case unicode.IsPunct(r): 55*46c4c49dSIbrahim Kanouche if tok.Offset >= 0 { 56*46c4c49dSIbrahim Kanouche toks = append(toks, tok) 57*46c4c49dSIbrahim Kanouche tok = newToken() 58*46c4c49dSIbrahim Kanouche } 59*46c4c49dSIbrahim Kanouche toks = append(toks, &token{ 60*46c4c49dSIbrahim Kanouche Text: string(r), 61*46c4c49dSIbrahim Kanouche Offset: i, 62*46c4c49dSIbrahim Kanouche }) 63*46c4c49dSIbrahim Kanouche default: 64*46c4c49dSIbrahim Kanouche if tok.Offset == -1 { 65*46c4c49dSIbrahim Kanouche tok.Offset = i 66*46c4c49dSIbrahim Kanouche } 67*46c4c49dSIbrahim Kanouche tok.Text += string(r) 68*46c4c49dSIbrahim Kanouche } 69*46c4c49dSIbrahim Kanouche i += size 70*46c4c49dSIbrahim Kanouche } 71*46c4c49dSIbrahim Kanouche if tok.Offset != -1 { 72*46c4c49dSIbrahim Kanouche // Add any remaining token that wasn't yet included in the list. 73*46c4c49dSIbrahim Kanouche toks = append(toks, tok) 74*46c4c49dSIbrahim Kanouche } 75*46c4c49dSIbrahim Kanouche return toks 76*46c4c49dSIbrahim Kanouche} 77*46c4c49dSIbrahim Kanouche 78*46c4c49dSIbrahim Kanouche// GenerateHashes generates hashes for "size" length substrings. The 79*46c4c49dSIbrahim Kanouche// "stringifyTokens" call takes a long time to run, so not all substrings have 80*46c4c49dSIbrahim Kanouche// hashes, i.e. we skip some of the smaller substrings. 81*46c4c49dSIbrahim Kanouchefunc (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) { 82*46c4c49dSIbrahim Kanouche if size == 0 { 83*46c4c49dSIbrahim Kanouche return nil, nil 84*46c4c49dSIbrahim Kanouche } 85*46c4c49dSIbrahim Kanouche 86*46c4c49dSIbrahim Kanouche var css []uint32 87*46c4c49dSIbrahim Kanouche var tr TokenRanges 88*46c4c49dSIbrahim Kanouche for offset := 0; offset+size <= len(t); offset += size / 2 { 89*46c4c49dSIbrahim Kanouche var b bytes.Buffer 90*46c4c49dSIbrahim Kanouche t.stringifyTokens(&b, offset, size) 91*46c4c49dSIbrahim Kanouche cs := crc32.ChecksumIEEE(b.Bytes()) 92*46c4c49dSIbrahim Kanouche css = append(css, cs) 93*46c4c49dSIbrahim Kanouche tr = append(tr, &TokenRange{offset, offset + size}) 94*46c4c49dSIbrahim Kanouche h.add(cs, offset, offset+size) 95*46c4c49dSIbrahim Kanouche if size <= 1 { 96*46c4c49dSIbrahim Kanouche break 97*46c4c49dSIbrahim Kanouche } 98*46c4c49dSIbrahim Kanouche } 99*46c4c49dSIbrahim Kanouche 100*46c4c49dSIbrahim Kanouche return css, tr 101*46c4c49dSIbrahim Kanouche} 102*46c4c49dSIbrahim Kanouche 103*46c4c49dSIbrahim Kanouche// stringifyTokens serializes a sublist of tokens into a bytes buffer. 104*46c4c49dSIbrahim Kanouchefunc (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) { 105*46c4c49dSIbrahim Kanouche for j := offset; j < offset+size; j++ { 106*46c4c49dSIbrahim Kanouche if j != offset { 107*46c4c49dSIbrahim Kanouche b.WriteRune(' ') 108*46c4c49dSIbrahim Kanouche } 109*46c4c49dSIbrahim Kanouche b.WriteString(t[j].Text) 110*46c4c49dSIbrahim Kanouche } 111*46c4c49dSIbrahim Kanouche} 112*46c4c49dSIbrahim Kanouche 113*46c4c49dSIbrahim Kanouche// TokenRange indicates the range of tokens that map to a particular checksum. 114*46c4c49dSIbrahim Kanouchetype TokenRange struct { 115*46c4c49dSIbrahim Kanouche Start int 116*46c4c49dSIbrahim Kanouche End int 117*46c4c49dSIbrahim Kanouche} 118*46c4c49dSIbrahim Kanouche 119*46c4c49dSIbrahim Kanouchefunc (t *TokenRange) String() string { 120*46c4c49dSIbrahim Kanouche return fmt.Sprintf("[%v, %v)", t.Start, t.End) 121*46c4c49dSIbrahim Kanouche} 122*46c4c49dSIbrahim Kanouche 123*46c4c49dSIbrahim Kanouche// TokenRanges is a list of TokenRange objects. The chance that two different 124*46c4c49dSIbrahim Kanouche// strings map to the same checksum is very small, but unfortunately isn't 125*46c4c49dSIbrahim Kanouche// zero, so we use this instead of making the assumption that they will all be 126*46c4c49dSIbrahim Kanouche// unique. 127*46c4c49dSIbrahim Kanouchetype TokenRanges []*TokenRange 128*46c4c49dSIbrahim Kanouche 129*46c4c49dSIbrahim Kanouchefunc (t TokenRanges) Len() int { return len(t) } 130*46c4c49dSIbrahim Kanouchefunc (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] } 131*46c4c49dSIbrahim Kanouchefunc (t TokenRanges) Less(i, j int) bool { return t[i].Start < t[j].Start } 132*46c4c49dSIbrahim Kanouche 133*46c4c49dSIbrahim Kanouche// CombineUnique returns the combination of both token ranges with no duplicates. 134*46c4c49dSIbrahim Kanouchefunc (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges { 135*46c4c49dSIbrahim Kanouche if len(other) == 0 { 136*46c4c49dSIbrahim Kanouche return t 137*46c4c49dSIbrahim Kanouche } 138*46c4c49dSIbrahim Kanouche if len(t) == 0 { 139*46c4c49dSIbrahim Kanouche return other 140*46c4c49dSIbrahim Kanouche } 141*46c4c49dSIbrahim Kanouche 142*46c4c49dSIbrahim Kanouche cu := append(t, other...) 143*46c4c49dSIbrahim Kanouche sort.Sort(cu) 144*46c4c49dSIbrahim Kanouche 145*46c4c49dSIbrahim Kanouche if len(cu) == 0 { 146*46c4c49dSIbrahim Kanouche return nil 147*46c4c49dSIbrahim Kanouche } 148*46c4c49dSIbrahim Kanouche 149*46c4c49dSIbrahim Kanouche res := TokenRanges{cu[0]} 150*46c4c49dSIbrahim Kanouche for prev, i := cu[0], 1; i < len(cu); i++ { 151*46c4c49dSIbrahim Kanouche if prev.Start != cu[i].Start || prev.End != cu[i].End { 152*46c4c49dSIbrahim Kanouche res = append(res, cu[i]) 153*46c4c49dSIbrahim Kanouche prev = cu[i] 154*46c4c49dSIbrahim Kanouche } 155*46c4c49dSIbrahim Kanouche } 156*46c4c49dSIbrahim Kanouche return res 157*46c4c49dSIbrahim Kanouche} 158*46c4c49dSIbrahim Kanouche 159*46c4c49dSIbrahim Kanouche// Hash is a map of the hashes of a section of text to the token range covering that text. 160*46c4c49dSIbrahim Kanouchetype Hash map[uint32]TokenRanges 161*46c4c49dSIbrahim Kanouche 162*46c4c49dSIbrahim Kanouche// add associates a token range, [start, end], to a checksum. 163*46c4c49dSIbrahim Kanouchefunc (h Hash) add(checksum uint32, start, end int) { 164*46c4c49dSIbrahim Kanouche ntr := &TokenRange{Start: start, End: end} 165*46c4c49dSIbrahim Kanouche if r, ok := h[checksum]; ok { 166*46c4c49dSIbrahim Kanouche for _, tr := range r { 167*46c4c49dSIbrahim Kanouche if tr.Start == ntr.Start && tr.End == ntr.End { 168*46c4c49dSIbrahim Kanouche // The token range already exists at this 169*46c4c49dSIbrahim Kanouche // checksum. No need to re-add it. 170*46c4c49dSIbrahim Kanouche return 171*46c4c49dSIbrahim Kanouche } 172*46c4c49dSIbrahim Kanouche } 173*46c4c49dSIbrahim Kanouche } 174*46c4c49dSIbrahim Kanouche h[checksum] = append(h[checksum], ntr) 175*46c4c49dSIbrahim Kanouche} 176