// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// Match is the information about a single instance of a detected match.
type Match struct {
	Name            string
	Confidence      float64
	MatchType       string
	Variant         string
	StartLine       int
	EndLine         int
	StartTokenIndex int
	EndTokenIndex   int
}

// Results captures the summary information and matches detected by the
// classifier.
type Results struct {
	Matches         Matches
	TotalInputLines int
}

// Matches is a sortable slice of Match.
type Matches []*Match

// Swap swaps two elements of Matches.
func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
func (d Matches) Len() int      { return len(d) }
func (d Matches) Less(i, j int) bool {
	di, dj := d[i], d[j]
	// Order matches by decreasing confidence.
	if di.Confidence != dj.Confidence {
		return di.Confidence > dj.Confidence
	}
	// Matches of equal confidence are ordered by their appearance.
	if di.StartTokenIndex != dj.StartTokenIndex {
		return di.StartTokenIndex < dj.StartTokenIndex
	}
	// Should never get here, but tiebreak based on the larger license.
	return di.EndTokenIndex > dj.EndTokenIndex
}
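
// For example (illustrative values only, not output from a real corpus),
// sorting the following Matches with sort.Sort yields Apache-2.0, then MIT,
// then BSD-3-Clause: ties in confidence fall back to earliest appearance.
//
//	sort.Sort(Matches{
//		{Name: "MIT", Confidence: 0.99, StartTokenIndex: 40, EndTokenIndex: 200},
//		{Name: "Apache-2.0", Confidence: 0.99, StartTokenIndex: 10, EndTokenIndex: 180},
//		{Name: "BSD-3-Clause", Confidence: 0.90, StartTokenIndex: 0, EndTokenIndex: 90},
//	})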

// match reports instances of the supplied content in the corpus.
func (c *Classifier) match(in io.Reader) (Results, error) {
	id, err := tokenizeStream(in, true, c.dict, false)
	if err != nil {
		return Results{}, err
	}

	firstPass := make(map[string]*indexedDocument)
	for l, d := range c.docs {
		sim := id.tokenSimilarity(d)

		if c.tc.traceTokenize(l) {
			c.tc.trace("Token similarity for %s: %.2f", l, sim)
		}

		if sim >= c.threshold {
			firstPass[l] = d
		}
	}

	if len(firstPass) == 0 {
		return Results{
			Matches:         nil,
			TotalInputLines: 0,
		}, nil
	}

	// Perform the expensive work of generating a searchset to look for token runs.
	id.generateSearchSet(c.q)

	var candidates Matches
	candidates = append(candidates, id.Matches...)

	for l, d := range firstPass {
		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
		for _, m := range matches {
			startIndex := m.TargetStart
			endIndex := m.TargetEnd
			conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
			if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
				candidates = append(candidates, &Match{
					Name:            LicenseName(l),
					Variant:         variantName(l),
					MatchType:       detectionType(l),
					Confidence:      conf,
					StartLine:       id.Tokens[startIndex+startOffset].Line,
					EndLine:         id.Tokens[endIndex-endOffset-1].Line,
					StartTokenIndex: startIndex + startOffset,
					EndTokenIndex:   endIndex - endOffset - 1,
				})
			}
		}
	}
	sort.Sort(candidates)
	retain := make([]bool, len(candidates))
	for i, c := range candidates {
		// Filter out overlapping licenses based primarily on confidence. Since
		// the candidates slice is ordered by confidence, we look for overlaps
		// and decide whether to retain the record c.

		// For each candidate, only add it to the report if there is no
		// higher-quality hit that contains these lines. When two licenses
		// overlap, we use 'token density' to break ties: if a less confident
		// match of a larger license has more matching tokens than a perfect
		// match of a smaller license, we want to keep the larger one. This
		// handles licenses that include another license as a subtext; the NPL
		// contains the MPL as a concrete example.
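		//
		// As an illustrative, hypothetical example of the density tiebreak:
		// an NPL-1.1 match spanning 1200 tokens at 0.97 confidence weighs
		// 1200*0.97 = 1164 tokens, while the MPL-1.1 text it contains,
		// spanning 900 tokens at a perfect 1.0, weighs only 900, so the
		// larger NPL match is the one retained.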

		keep := true
		proposals := make(map[int]bool)
		for j, o := range candidates {
			if j == i {
				break
			}
			// Make sure to only check containment on licenses that are still in
			// consideration at this point.
			if contains(c, o) && retain[j] {
				// The license here can override a previous detection, but that isn't
				// sufficient for it to be kept on its own. Consider the licenses Xnet,
				// MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1. The confidence
				// rating on NPL-1.1 will cause Xnet to not be retained, which is
				// correct, but NPL-1.1 itself shouldn't be retained if the token
				// confidence for MPL is higher than for NPL, since the NPL-specific
				// bits are missing.

				ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
				otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
				cconf := ctoks * c.Confidence
				oconf := otoks * o.Confidence

				// If the two licenses have exactly the same weighted confidence, we
				// have an ambiguous detection and should retain both, so the caller
				// can see and resolve the situation.
				if cconf > oconf {
					proposals[j] = false
				} else if oconf > cconf {
					keep = false
				}
			} else if overlaps(c, o) && retain[j] {
				// If the end and start lines exactly coincide, it's OK to keep both.
				if c.StartLine != o.EndLine {
					keep = false
				}
			}

			if !keep {
				break
			}
		}
		if keep {
			retain[i] = true
			for p, v := range proposals {
				retain[p] = v
			}
		}
	}

	var out Matches
	for i, keep := range retain {
		if keep {
			out = append(out, candidates[i])
		}
	}
	return Results{
		Matches:         out,
		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
	}, nil
}

// Classifier provides methods for identifying open source licenses in text
// content.
type Classifier struct {
	tc        *TraceConfiguration
	dict      *dictionary
	docs      map[string]*indexedDocument
	threshold float64
	q         int // The value of q for q-grams in this corpus.
}

// NewClassifier creates a classifier with an empty corpus.
func NewClassifier(threshold float64) *Classifier {
	classifier := &Classifier{
		tc:        new(TraceConfiguration),
		dict:      newDictionary(),
		docs:      make(map[string]*indexedDocument),
		threshold: threshold,
		q:         computeQ(threshold),
	}
	return classifier
}
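
// A minimal usage sketch, assuming a corpus directory "./licenses" laid out
// as <category>/<name>/<variant>.txt (the layout LoadLicenses expects); the
// directory path and the 0.8 threshold here are illustrative choices, not
// values prescribed by this package:
//
//	c := NewClassifier(0.8)
//	if err := c.LoadLicenses("./licenses"); err != nil {
//		log.Fatal(err)
//	}
//	results := c.Match(fileContents)
//	for _, m := range results.Matches {
//		fmt.Printf("%s %.2f lines %d-%d\n", m.Name, m.Confidence, m.StartLine, m.EndLine)
//	}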

// Normalize takes input content and applies the following transforms to aid in
// identifying license content. The return value of this function is
// line-separated text, which is the basis for position values returned by the
// classifier.
//
// 1. Breaks up long lines of text. This helps with detecting licenses like in
// TODO(wcn):URL reference
//
// 2. Certain ignorable texts are removed to aid matching blocks of text.
// Introductory lines such as "The MIT License" are removed. Copyright notices
// are removed since the parties are variable and shouldn't impact matching.
//
// It is NOT necessary to call this function to simply identify licenses in a
// file. It should only be called to aid in presenting this information to the
// user in context (for example, creating diffs of differences to canonical
// licenses).
//
// It is an invariant of the classifier that calling Match(Normalize(in)) will
// return the same results as Match(in).
func (c *Classifier) Normalize(in []byte) []byte {
	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
	if err != nil {
		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
	}

	var buf bytes.Buffer

	switch len(doc.Tokens) {
	case 0:
		return nil
	case 1:
		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
		return buf.Bytes()
	}

	prevLine := 1
	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
	for _, t := range doc.Tokens[1:] {
		// Only write out an EOL token that incremented the line.
		if t.Line == prevLine+1 {
			buf.WriteString(eol)
		}

		// Only write tokens that aren't EOL.
		txt := c.dict.getWord(t.ID)

		if txt != eol {
			// Only put a space between tokens if the previous token was on the same
			// line. This prevents spaces after an EOL.
			if t.Line == prevLine {
				buf.WriteString(" ")
			}
			buf.WriteString(txt)
		}

		prevLine = t.Line
	}
	return buf.Bytes()
}
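
// A hedged sketch of Normalize's intent (the exact output depends on the
// tokenizer and its ignorable-text rules, so this result is illustrative
// only, not a guaranteed rendering):
//
//	in := []byte("The MIT License\n\nCopyright (c) 2020 Acme\n\nPermission is hereby granted...")
//	out := c.Normalize(in)
//	// out omits the introductory line and the copyright notice, leaving
//	// normalized text along the lines of "permission is hereby granted...".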

// LoadLicenses adds the contents of the supplied directory to the corpus of the
// classifier.
func (c *Classifier) LoadLicenses(dir string) error {
	var files []string
	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return nil
		}
		if !strings.HasSuffix(path, "txt") {
			return nil
		}
		files = append(files, path)
		return nil
	})
	if err != nil {
		return err
	}

	for _, f := range files {
		relativePath := strings.Replace(f, dir, "", 1)
		sep := fmt.Sprintf("%c", os.PathSeparator)
		segments := strings.Split(relativePath, sep)
		// Indexing segments[1..3] below requires at least four segments.
		if len(segments) < 4 {
			c.tc.trace("Insufficient segment count for path: %s", relativePath)
			continue
		}
		category, name, variant := segments[1], segments[2], segments[3]
		b, err := ioutil.ReadFile(f)
		if err != nil {
			return err
		}

		c.AddContent(category, name, variant, b)
	}
	return nil
}
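
// For reference, LoadLicenses expects the corpus to be laid out along these
// lines (hypothetical paths shown; only the segment structure matters), so
// that the category, name, and variant can be recovered from each file's
// path relative to dir:
//
//	<dir>/license/Apache-2.0/license.txt
//	<dir>/license/Apache-2.0/header.txt
//	<dir>/license/MIT/license.txt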

// SetTraceConfiguration installs a tracing configuration for the classifier.
func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
	c.tc = in
	c.tc.init()
}

// Match finds matches within an unknown text. This will not modify the
// contents of the supplied byte slice.
func (c *Classifier) Match(in []byte) Results {
	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
	// will never return an error, so it's okay to ignore the return value in
	// this case.
	res, _ := c.MatchFrom(bytes.NewReader(in))
	return res
}

// MatchFrom finds matches within the content read from the supplied reader.
func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
	return c.match(in)
}

// detectionType returns the detection category encoded in the first path
// segment of an internal corpus name.
func detectionType(in string) string {
	splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
	return splits[0]
}

// variantName returns the variant encoded in the third path segment of an
// internal corpus name.
func variantName(in string) string {
	splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
	return splits[2]
}

// LicenseName produces the output name for a license, removing the internal
// structure of the filename in use.
func LicenseName(in string) string {
	splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
	return splits[1]
}
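
// For example, given a corpus key of the form "license/Apache-2.0/license.txt"
// (an illustrative value mirroring the layout above), detectionType returns
// "license", LicenseName returns "Apache-2.0", and variantName returns
// "license.txt".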

// contains returns true iff b is completely inside a.
func contains(a, b *Match) bool {
	return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
}

// between returns true iff b <= a <= c.
func between(a, b, c int) bool {
	return b <= a && a <= c
}

// overlaps returns true iff the line ranges covered by a and b overlap.
func overlaps(a, b *Match) bool {
	return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine)
}
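
// For example (illustrative line ranges): a match spanning lines 1-20
// contains a match spanning lines 5-10, and overlaps a match spanning lines
// 15-30 without containing it.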
365