1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouche// Package classifier provides the implementation of the v2 license classifier. 16*46c4c49dSIbrahim Kanouchepackage classifier 17*46c4c49dSIbrahim Kanouche 18*46c4c49dSIbrahim Kanoucheimport ( 19*46c4c49dSIbrahim Kanouche "bytes" 20*46c4c49dSIbrahim Kanouche "fmt" 21*46c4c49dSIbrahim Kanouche "os" 22*46c4c49dSIbrahim Kanouche "strings" 23*46c4c49dSIbrahim Kanouche) 24*46c4c49dSIbrahim Kanouche 25*46c4c49dSIbrahim Kanouchetype tokenID int // type to ensure safety when manipulating token identifiers. 26*46c4c49dSIbrahim Kanouche 27*46c4c49dSIbrahim Kanouche// token provides detailed information about a single textual token in the document. 28*46c4c49dSIbrahim Kanouchetype token struct { 29*46c4c49dSIbrahim Kanouche Text string // normalized text of the token 30*46c4c49dSIbrahim Kanouche Line int // line position of this token in the source 31*46c4c49dSIbrahim Kanouche Previous string // for the first token in a line, any previous text. 32*46c4c49dSIbrahim Kanouche} 33*46c4c49dSIbrahim Kanouche 34*46c4c49dSIbrahim Kanouchetype indexedToken struct { 35*46c4c49dSIbrahim Kanouche Line int // line position of this token in the source 36*46c4c49dSIbrahim Kanouche ID tokenID // identifier of the text in the dictionary 37*46c4c49dSIbrahim Kanouche} 38*46c4c49dSIbrahim Kanouche 39*46c4c49dSIbrahim Kanouchetype indexedDocument struct { 40*46c4c49dSIbrahim Kanouche Norm string // The normalized token sequence 41*46c4c49dSIbrahim Kanouche Tokens []indexedToken // ordered tokens of the document 42*46c4c49dSIbrahim Kanouche Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching 43*46c4c49dSIbrahim Kanouche f *frequencyTable // frequencies computed for this document 44*46c4c49dSIbrahim Kanouche dict *dictionary // The corpus dictionary for this document 45*46c4c49dSIbrahim Kanouche s *searchSet // The searchset for this document 46*46c4c49dSIbrahim Kanouche runes []rune 47*46c4c49dSIbrahim Kanouche} 48*46c4c49dSIbrahim Kanouche 49*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) generateSearchSet(q int) { 50*46c4c49dSIbrahim Kanouche d.s = newSearchSet(d, q) 51*46c4c49dSIbrahim Kanouche} 52*46c4c49dSIbrahim Kanouche 53*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) size() int { 54*46c4c49dSIbrahim Kanouche return len(d.Tokens) 55*46c4c49dSIbrahim Kanouche} 56*46c4c49dSIbrahim Kanouche 57*46c4c49dSIbrahim Kanouche// normalized returns a string of the normalized tokens concatenated with a 58*46c4c49dSIbrahim Kanouche// single space. This is used by the diff algorithm. 59*46c4c49dSIbrahim Kanouche// TODO: it'd be more efficient to have the diff algorithm work with the raw tokens directly 60*46c4c49dSIbrahim Kanouche// and avoid these ephemeral allocations. 61*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) normalized() string { 62*46c4c49dSIbrahim Kanouche var w strings.Builder 63*46c4c49dSIbrahim Kanouche for i, t := range d.Tokens { 64*46c4c49dSIbrahim Kanouche w.WriteString(d.dict.getWord(t.ID)) 65*46c4c49dSIbrahim Kanouche if (i + 1) != d.size() { 66*46c4c49dSIbrahim Kanouche w.WriteString(" ") 67*46c4c49dSIbrahim Kanouche } 68*46c4c49dSIbrahim Kanouche } 69*46c4c49dSIbrahim Kanouche return w.String() 70*46c4c49dSIbrahim Kanouche} 71*46c4c49dSIbrahim Kanouche 72*46c4c49dSIbrahim Kanouchefunc computeQ(threshold float64) int { 73*46c4c49dSIbrahim Kanouche // q is the lower bound for token runs (q-grams) that must exist 74*46c4c49dSIbrahim Kanouche // in content that can be recognized at the specified threshold. 75*46c4c49dSIbrahim Kanouche // Imagine a document with 100 tokens, and a threshold of 80%. This means 76*46c4c49dSIbrahim Kanouche // that in a worst-case scenario, the 20 errors are evenly distributed to 77*46c4c49dSIbrahim Kanouche // create the sortest possible token runs. In this case, there would be 78*46c4c49dSIbrahim Kanouche // a repeating sequence of 4 good tokens and 1 errored token, occurring 79*46c4c49dSIbrahim Kanouche // 20 times. This function returns the minimum token length, or returning 80*46c4c49dSIbrahim Kanouche // a value of 1 if necessary (since a threshold level below 50% would generate 81*46c4c49dSIbrahim Kanouche // a run of 0-length, which is meaningless.) 82*46c4c49dSIbrahim Kanouche if threshold == 1.0 { 83*46c4c49dSIbrahim Kanouche return 10 // avoid divide by 0 84*46c4c49dSIbrahim Kanouche } 85*46c4c49dSIbrahim Kanouche 86*46c4c49dSIbrahim Kanouche return max(1, int((threshold)/(1.0-threshold))) 87*46c4c49dSIbrahim Kanouche} 88*46c4c49dSIbrahim Kanouche 89*46c4c49dSIbrahim Kanouchefunc max(a, b int) int { 90*46c4c49dSIbrahim Kanouche if a > b { 91*46c4c49dSIbrahim Kanouche return a 92*46c4c49dSIbrahim Kanouche } 93*46c4c49dSIbrahim Kanouche return b 94*46c4c49dSIbrahim Kanouche} 95*46c4c49dSIbrahim Kanouche 96*46c4c49dSIbrahim Kanouche// AddContent incorporates the provided textual content into the classifier for 97*46c4c49dSIbrahim Kanouche// matching. This will not modify the supplied content. 98*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddContent(category, name, variant string, content []byte) { 99*46c4c49dSIbrahim Kanouche // Since bytes.NewReader().Read() will never return an error, tokenizeStream 100*46c4c49dSIbrahim Kanouche // will never return an error so it's okay to ignore the return value in this 101*46c4c49dSIbrahim Kanouche // case. 102*46c4c49dSIbrahim Kanouche doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true) 103*46c4c49dSIbrahim Kanouche c.addDocument(category, name, variant, doc) 104*46c4c49dSIbrahim Kanouche} 105*46c4c49dSIbrahim Kanouche 106*46c4c49dSIbrahim Kanouche// addDocument takes a textual document and incorporates it into the classifier for matching. 107*46c4c49dSIbrahim Kanouchefunc (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) { 108*46c4c49dSIbrahim Kanouche // For documents that are part of the corpus, we add them to the dictionary and 109*46c4c49dSIbrahim Kanouche // compute their associated search data eagerly so they are ready for matching against 110*46c4c49dSIbrahim Kanouche // candidates. 111*46c4c49dSIbrahim Kanouche indexName := c.generateDocName(category, name, variant) 112*46c4c49dSIbrahim Kanouche id.generateSearchSet(c.q) 113*46c4c49dSIbrahim Kanouche id.s.origin = indexName 114*46c4c49dSIbrahim Kanouche c.docs[indexName] = id 115*46c4c49dSIbrahim Kanouche} 116*46c4c49dSIbrahim Kanouche 117*46c4c49dSIbrahim Kanouche// createTargetIndexedDocument creates an indexed document without adding the 118*46c4c49dSIbrahim Kanouche// words to the classifier dictionary. This should be used for matching targets, not 119*46c4c49dSIbrahim Kanouche// populating the corpus. 120*46c4c49dSIbrahim Kanouchefunc (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument { 121*46c4c49dSIbrahim Kanouche doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false) 122*46c4c49dSIbrahim Kanouche return doc 123*46c4c49dSIbrahim Kanouche} 124*46c4c49dSIbrahim Kanouche 125*46c4c49dSIbrahim Kanouchefunc (c *Classifier) generateDocName(category, name, variant string) string { 126*46c4c49dSIbrahim Kanouche return fmt.Sprintf("%s%c%s%c%s", category, os.PathSeparator, name, os.PathSeparator, variant) 127*46c4c49dSIbrahim Kanouche} 128*46c4c49dSIbrahim Kanouchefunc (c *Classifier) getIndexedDocument(category, name, variant string) *indexedDocument { 129*46c4c49dSIbrahim Kanouche return c.docs[c.generateDocName(category, name, variant)] 130*46c4c49dSIbrahim Kanouche} 131*46c4c49dSIbrahim Kanouche 132*46c4c49dSIbrahim Kanouche// dictionary is used to intern all the token words encountered in the text corpus. 133*46c4c49dSIbrahim Kanouche// words and indices form an inverse mapping relationship. It is just a convenience type 134*46c4c49dSIbrahim Kanouche// over a pair of correlated maps. 135*46c4c49dSIbrahim Kanouchetype dictionary struct { 136*46c4c49dSIbrahim Kanouche words map[tokenID]string 137*46c4c49dSIbrahim Kanouche indices map[string]tokenID 138*46c4c49dSIbrahim Kanouche} 139*46c4c49dSIbrahim Kanouche 140*46c4c49dSIbrahim Kanouchefunc newDictionary() *dictionary { 141*46c4c49dSIbrahim Kanouche return &dictionary{ 142*46c4c49dSIbrahim Kanouche words: make(map[tokenID]string), 143*46c4c49dSIbrahim Kanouche indices: make(map[string]tokenID), 144*46c4c49dSIbrahim Kanouche } 145*46c4c49dSIbrahim Kanouche} 146*46c4c49dSIbrahim Kanouche 147*46c4c49dSIbrahim Kanouche// add inserts the provided word into the dictionary if it does not already exist. 148*46c4c49dSIbrahim Kanouchefunc (d *dictionary) add(word string) tokenID { 149*46c4c49dSIbrahim Kanouche if idx := d.getIndex(word); idx != unknownIndex { 150*46c4c49dSIbrahim Kanouche return idx 151*46c4c49dSIbrahim Kanouche } 152*46c4c49dSIbrahim Kanouche // token IDs start from 1, 0 is reserved for the invalid ID 153*46c4c49dSIbrahim Kanouche idx := tokenID(len(d.words) + 1) 154*46c4c49dSIbrahim Kanouche d.words[idx] = word 155*46c4c49dSIbrahim Kanouche d.indices[word] = idx 156*46c4c49dSIbrahim Kanouche return idx 157*46c4c49dSIbrahim Kanouche} 158*46c4c49dSIbrahim Kanouche 159*46c4c49dSIbrahim Kanouchevar unknownWord = "UNKNOWN" 160*46c4c49dSIbrahim Kanouchevar unknownIndex = tokenID(0) 161*46c4c49dSIbrahim Kanouche 162*46c4c49dSIbrahim Kanouche// getIndex returns the index of the supplied word, or 0 if the word is not in the dictionary. 163*46c4c49dSIbrahim Kanouchefunc (d *dictionary) getIndex(word string) tokenID { 164*46c4c49dSIbrahim Kanouche if idx, found := d.indices[word]; found { 165*46c4c49dSIbrahim Kanouche return idx 166*46c4c49dSIbrahim Kanouche } 167*46c4c49dSIbrahim Kanouche return unknownIndex 168*46c4c49dSIbrahim Kanouche} 169*46c4c49dSIbrahim Kanouche 170*46c4c49dSIbrahim Kanouche// getWord returns the word associated with the index. 171*46c4c49dSIbrahim Kanouchefunc (d *dictionary) getWord(index tokenID) string { 172*46c4c49dSIbrahim Kanouche if word, found := d.words[index]; found { 173*46c4c49dSIbrahim Kanouche return word 174*46c4c49dSIbrahim Kanouche } 175*46c4c49dSIbrahim Kanouche return unknownWord 176*46c4c49dSIbrahim Kanouche} 177