// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tokenizer converts a text into a stream of tokens.
package tokenizer

import (
	"bytes"
	"fmt"
	"hash/crc32"
	"sort"
	"unicode"
	"unicode/utf8"
)
// token is a non-whitespace sequence (i.e., a word or punctuation mark) in
// the original string. It is not meant for use outside of this package.
type token struct {
	Text   string
	Offset int
}

// Tokens is a list of token objects.
type Tokens []*token

// newToken creates a new token with an invalid (negative) offset, which will
// be set before the token is used.
func newToken() *token {
	return &token{Offset: -1}
}
// Tokenize converts a string into a stream of tokens.
func Tokenize(s string) (toks Tokens) {
	tok := newToken()
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		switch {
		case unicode.IsSpace(r):
			// Whitespace ends the current token.
			if tok.Offset >= 0 {
				toks = append(toks, tok)
				tok = newToken()
			}
		case unicode.IsPunct(r):
			// Punctuation ends the current token and is emitted as
			// its own single-rune token.
			if tok.Offset >= 0 {
				toks = append(toks, tok)
				tok = newToken()
			}
			toks = append(toks, &token{
				Text:   string(r),
				Offset: i,
			})
		default:
			// Any other rune is accumulated into the current token.
			if tok.Offset == -1 {
				tok.Offset = i
			}
			tok.Text += string(r)
		}
		i += size
	}
	if tok.Offset != -1 {
		// Add any remaining token that wasn't yet included in the list.
		toks = append(toks, tok)
	}
	return toks
}
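
// A hedged usage sketch (not part of the original file): punctuation becomes
// its own single-rune token, and offsets are byte offsets into the input.
//
//	toks := Tokenize("Hello, world")
//	for _, tok := range toks {
//		fmt.Printf("%q@%d ", tok.Text, tok.Offset)
//	}
//	// Prints: "Hello"@0 ","@5 "world"@7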

// GenerateHashes generates hashes for substrings of "size" tokens. Because
// stringifying tokens is expensive, we don't hash every possible substring;
// instead the window advances by size/2 tokens, so consecutive hashed
// substrings overlap by half their length. Each checksum is also recorded in
// h along with the token range it covers.
func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) {
	if size == 0 {
		return nil, nil
	}

	var css []uint32
	var tr TokenRanges
	for offset := 0; offset+size <= len(t); offset += size / 2 {
		var b bytes.Buffer
		t.stringifyTokens(&b, offset, size)
		cs := crc32.ChecksumIEEE(b.Bytes())
		css = append(css, cs)
		tr = append(tr, &TokenRange{offset, offset + size})
		h.add(cs, offset, offset+size)
		if size <= 1 {
			// A step of size/2 == 0 would loop forever.
			break
		}
	}

	return css, tr
}
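
// A hedged sketch of the windowing (input invented for illustration): with 8
// tokens and size = 4, the window advances by 2 tokens per step.
//
//	toks := Tokenize("one two three four five six seven eight")
//	h := make(Hash)
//	sums, ranges := toks.GenerateHashes(h, 4)
//	// len(sums) == 3; ranges cover [0, 4), [2, 6), and [4, 8)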

// stringifyTokens writes the text of the tokens in [offset, offset+size) to
// b, separated by single spaces.
func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) {
	for j := offset; j < offset+size; j++ {
		if j != offset {
			b.WriteRune(' ')
		}
		b.WriteString(t[j].Text)
	}
}
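
// A hedged sketch (token texts invented for illustration): the selected
// tokens are joined with single spaces.
//
//	var b bytes.Buffer
//	Tokens{{Text: "MIT"}, {Text: "License"}}.stringifyTokens(&b, 0, 2)
//	// b.String() == "MIT License"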

// TokenRange is the half-open range [Start, End) of tokens that map to a
// particular checksum.
type TokenRange struct {
	Start int
	End   int
}

func (t *TokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small but unfortunately isn't
// zero, so we use this instead of assuming that each checksum maps to a
// single, unique range.
type TokenRanges []*TokenRange

// TokenRanges implements sort.Interface, ordering ranges by Start and then
// End so that identical ranges sort adjacently and can be deduplicated in a
// single pass.
func (t TokenRanges) Len() int      { return len(t) }
func (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
func (t TokenRanges) Less(i, j int) bool {
	return t[i].Start < t[j].Start || (t[i].Start == t[j].Start && t[i].End < t[j].End)
}

// CombineUnique returns the union of both lists of token ranges, sorted with
// exact duplicates removed.
func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges {
	if len(other) == 0 {
		return t
	}
	if len(t) == 0 {
		return other
	}

	// Copy into a fresh slice so that appending and sorting can't clobber
	// the backing arrays of the receiver or the argument.
	cu := make(TokenRanges, 0, len(t)+len(other))
	cu = append(cu, t...)
	cu = append(cu, other...)
	sort.Sort(cu)

	// cu is non-empty here because both inputs were non-empty, so
	// duplicates can be dropped in a single adjacent-comparison pass.
	res := TokenRanges{cu[0]}
	for prev, i := cu[0], 1; i < len(cu); i++ {
		if prev.Start != cu[i].Start || prev.End != cu[i].End {
			res = append(res, cu[i])
			prev = cu[i]
		}
	}
	return res
}
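
// A hedged sketch (ranges invented for illustration): overlapping lists merge
// into one sorted list with exact duplicates dropped.
//
//	a := TokenRanges{{Start: 0, End: 4}, {Start: 2, End: 6}}
//	b := TokenRanges{{Start: 2, End: 6}, {Start: 4, End: 8}}
//	merged := a.CombineUnique(b)
//	// merged covers [0, 4), [2, 6), and [4, 8)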

// Hash is a map from the checksum of a section of text to the token ranges
// covering that text.
type Hash map[uint32]TokenRanges

// add associates the token range [start, end) with a checksum, unless that
// exact range is already recorded for it.
func (h Hash) add(checksum uint32, start, end int) {
	ntr := &TokenRange{Start: start, End: end}
	if r, ok := h[checksum]; ok {
		for _, tr := range r {
			if tr.Start == ntr.Start && tr.End == ntr.End {
				// The token range already exists at this
				// checksum. No need to re-add it.
				return
			}
		}
	}
	h[checksum] = append(h[checksum], ntr)
}
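
// A hedged sketch of collision handling (checksum and ranges invented for
// illustration):
//
//	h := make(Hash)
//	h.add(0xDEADBEEF, 0, 4) // first range recorded for this checksum
//	h.add(0xDEADBEEF, 2, 6) // a colliding string: both ranges are kept
//	h.add(0xDEADBEEF, 0, 4) // exact duplicate: no-op
//	// h[0xDEADBEEF] now holds [0, 4) and [2, 6)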