xref: /aosp_15_r20/external/licenseclassifier/v2/document.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouche// Package classifier provides the implementation of the v2 license classifier.
16*46c4c49dSIbrahim Kanouchepackage classifier
17*46c4c49dSIbrahim Kanouche
18*46c4c49dSIbrahim Kanoucheimport (
19*46c4c49dSIbrahim Kanouche	"bytes"
20*46c4c49dSIbrahim Kanouche	"fmt"
21*46c4c49dSIbrahim Kanouche	"os"
22*46c4c49dSIbrahim Kanouche	"strings"
23*46c4c49dSIbrahim Kanouche)
24*46c4c49dSIbrahim Kanouche
// tokenID identifies an interned token word in the dictionary. A distinct
// type (rather than bare int) ensures safety when manipulating token identifiers.
type tokenID int
26*46c4c49dSIbrahim Kanouche
// token provides detailed information about a single textual token in the document.
type token struct {
	Text     string // normalized text of the token
	Line     int    // 1-based line position of this token in the source
	Previous string // for the first token in a line, any previous (non-token) text on that line
}
33*46c4c49dSIbrahim Kanouche
// indexedToken is the compact, dictionary-encoded form of a token: the text
// is replaced by its tokenID in the corpus dictionary.
type indexedToken struct {
	Line int     // line position of this token in the source
	ID   tokenID // identifier of the text in the dictionary
}
38*46c4c49dSIbrahim Kanouche
// indexedDocument is a tokenized document together with the derived data
// (frequency table, search set) used to match it against the corpus.
type indexedDocument struct {
	Norm    string          // The normalized token sequence
	Tokens  []indexedToken  // ordered tokens of the document
	Matches Matches         // these are matches identified while processing the original, untokenized text via regexp matching
	f       *frequencyTable // frequencies computed for this document
	dict    *dictionary     // The corpus dictionary for this document
	s       *searchSet      // The searchset for this document
	runes   []rune          // NOTE(review): purpose not visible in this file — presumably the original text as runes for position mapping; confirm with tokenizer
}
48*46c4c49dSIbrahim Kanouche
// generateSearchSet builds and attaches the q-gram search set for this
// document, using q as the q-gram length.
func (d *indexedDocument) generateSearchSet(q int) {
	d.s = newSearchSet(d, q)
}
52*46c4c49dSIbrahim Kanouche
// size returns the number of tokens in the document.
func (d *indexedDocument) size() int {
	return len(d.Tokens)
}
56*46c4c49dSIbrahim Kanouche
57*46c4c49dSIbrahim Kanouche// normalized returns a string of the normalized tokens concatenated with a
58*46c4c49dSIbrahim Kanouche// single space. This is used by the diff algorithm.
59*46c4c49dSIbrahim Kanouche// TODO: it'd be more efficient to have the diff algorithm work with the raw tokens directly
60*46c4c49dSIbrahim Kanouche// and avoid these ephemeral allocations.
61*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) normalized() string {
62*46c4c49dSIbrahim Kanouche	var w strings.Builder
63*46c4c49dSIbrahim Kanouche	for i, t := range d.Tokens {
64*46c4c49dSIbrahim Kanouche		w.WriteString(d.dict.getWord(t.ID))
65*46c4c49dSIbrahim Kanouche		if (i + 1) != d.size() {
66*46c4c49dSIbrahim Kanouche			w.WriteString(" ")
67*46c4c49dSIbrahim Kanouche		}
68*46c4c49dSIbrahim Kanouche	}
69*46c4c49dSIbrahim Kanouche	return w.String()
70*46c4c49dSIbrahim Kanouche}
71*46c4c49dSIbrahim Kanouche
// computeQ returns q, the lower bound for token runs (q-grams) that must
// exist in content that can be recognized at the specified threshold.
//
// Imagine a document with 100 tokens and a threshold of 80%. In the
// worst case the 20 errors are evenly distributed, creating the shortest
// possible token runs: a repeating sequence of 4 good tokens and 1 errored
// token, occurring 20 times. In general that minimum run length is
// threshold/(1-threshold), clamped below at 1 since a threshold under 50%
// would otherwise produce a run of length 0, which is meaningless.
func computeQ(threshold float64) int {
	// A perfect-match threshold would divide by zero; use a fixed q instead.
	if threshold == 1.0 {
		return 10
	}

	q := int(threshold / (1.0 - threshold))
	if q < 1 {
		return 1
	}
	return q
}
88*46c4c49dSIbrahim Kanouche
// max returns the larger of two ints.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
95*46c4c49dSIbrahim Kanouche
96*46c4c49dSIbrahim Kanouche// AddContent incorporates the provided textual content into the classifier for
97*46c4c49dSIbrahim Kanouche// matching. This will not modify the supplied content.
98*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddContent(category, name, variant string, content []byte) {
99*46c4c49dSIbrahim Kanouche	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
100*46c4c49dSIbrahim Kanouche	// will never return an error so it's okay to ignore the return value in this
101*46c4c49dSIbrahim Kanouche	// case.
102*46c4c49dSIbrahim Kanouche	doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true)
103*46c4c49dSIbrahim Kanouche	c.addDocument(category, name, variant, doc)
104*46c4c49dSIbrahim Kanouche}
105*46c4c49dSIbrahim Kanouche
106*46c4c49dSIbrahim Kanouche// addDocument takes a textual document and incorporates it into the classifier for matching.
107*46c4c49dSIbrahim Kanouchefunc (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
108*46c4c49dSIbrahim Kanouche	// For documents that are part of the corpus, we add them to the dictionary and
109*46c4c49dSIbrahim Kanouche	// compute their associated search data eagerly so they are ready for matching against
110*46c4c49dSIbrahim Kanouche	// candidates.
111*46c4c49dSIbrahim Kanouche	indexName := c.generateDocName(category, name, variant)
112*46c4c49dSIbrahim Kanouche	id.generateSearchSet(c.q)
113*46c4c49dSIbrahim Kanouche	id.s.origin = indexName
114*46c4c49dSIbrahim Kanouche	c.docs[indexName] = id
115*46c4c49dSIbrahim Kanouche}
116*46c4c49dSIbrahim Kanouche
117*46c4c49dSIbrahim Kanouche// createTargetIndexedDocument creates an indexed document without adding the
118*46c4c49dSIbrahim Kanouche// words to the classifier dictionary. This should be used for matching targets, not
119*46c4c49dSIbrahim Kanouche// populating the corpus.
120*46c4c49dSIbrahim Kanouchefunc (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
121*46c4c49dSIbrahim Kanouche	doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false)
122*46c4c49dSIbrahim Kanouche	return doc
123*46c4c49dSIbrahim Kanouche}
124*46c4c49dSIbrahim Kanouche
125*46c4c49dSIbrahim Kanouchefunc (c *Classifier) generateDocName(category, name, variant string) string {
126*46c4c49dSIbrahim Kanouche	return fmt.Sprintf("%s%c%s%c%s", category, os.PathSeparator, name, os.PathSeparator, variant)
127*46c4c49dSIbrahim Kanouche}
128*46c4c49dSIbrahim Kanouchefunc (c *Classifier) getIndexedDocument(category, name, variant string) *indexedDocument {
129*46c4c49dSIbrahim Kanouche	return c.docs[c.generateDocName(category, name, variant)]
130*46c4c49dSIbrahim Kanouche}
131*46c4c49dSIbrahim Kanouche
// dictionary is used to intern all the token words encountered in the text corpus.
// words and indices form an inverse mapping relationship. It is just a convenience type
// over a pair of correlated maps.
type dictionary struct {
	words   map[tokenID]string // tokenID -> interned word
	indices map[string]tokenID // word -> tokenID (inverse of words)
}
139*46c4c49dSIbrahim Kanouche
140*46c4c49dSIbrahim Kanouchefunc newDictionary() *dictionary {
141*46c4c49dSIbrahim Kanouche	return &dictionary{
142*46c4c49dSIbrahim Kanouche		words:   make(map[tokenID]string),
143*46c4c49dSIbrahim Kanouche		indices: make(map[string]tokenID),
144*46c4c49dSIbrahim Kanouche	}
145*46c4c49dSIbrahim Kanouche}
146*46c4c49dSIbrahim Kanouche
147*46c4c49dSIbrahim Kanouche// add inserts the provided word into the dictionary if it does not already exist.
148*46c4c49dSIbrahim Kanouchefunc (d *dictionary) add(word string) tokenID {
149*46c4c49dSIbrahim Kanouche	if idx := d.getIndex(word); idx != unknownIndex {
150*46c4c49dSIbrahim Kanouche		return idx
151*46c4c49dSIbrahim Kanouche	}
152*46c4c49dSIbrahim Kanouche	// token IDs start from 1, 0 is reserved for the invalid ID
153*46c4c49dSIbrahim Kanouche	idx := tokenID(len(d.words) + 1)
154*46c4c49dSIbrahim Kanouche	d.words[idx] = word
155*46c4c49dSIbrahim Kanouche	d.indices[word] = idx
156*46c4c49dSIbrahim Kanouche	return idx
157*46c4c49dSIbrahim Kanouche}
158*46c4c49dSIbrahim Kanouche
// unknownWord is returned by getWord for IDs not present in the dictionary.
var unknownWord = "UNKNOWN"

// unknownIndex is the reserved sentinel (0) for words not in the dictionary;
// valid token IDs start at 1.
var unknownIndex = tokenID(0)
161*46c4c49dSIbrahim Kanouche
162*46c4c49dSIbrahim Kanouche// getIndex returns the index of the supplied word, or 0 if the word is not in the dictionary.
163*46c4c49dSIbrahim Kanouchefunc (d *dictionary) getIndex(word string) tokenID {
164*46c4c49dSIbrahim Kanouche	if idx, found := d.indices[word]; found {
165*46c4c49dSIbrahim Kanouche		return idx
166*46c4c49dSIbrahim Kanouche	}
167*46c4c49dSIbrahim Kanouche	return unknownIndex
168*46c4c49dSIbrahim Kanouche}
169*46c4c49dSIbrahim Kanouche
170*46c4c49dSIbrahim Kanouche// getWord returns the word associated with the index.
171*46c4c49dSIbrahim Kanouchefunc (d *dictionary) getWord(index tokenID) string {
172*46c4c49dSIbrahim Kanouche	if word, found := d.words[index]; found {
173*46c4c49dSIbrahim Kanouche		return word
174*46c4c49dSIbrahim Kanouche	}
175*46c4c49dSIbrahim Kanouche	return unknownWord
176*46c4c49dSIbrahim Kanouche}
177