xref: /aosp_15_r20/external/licenseclassifier/v2/classifier.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouchepackage classifier
16*46c4c49dSIbrahim Kanouche
17*46c4c49dSIbrahim Kanoucheimport (
18*46c4c49dSIbrahim Kanouche	"bytes"
19*46c4c49dSIbrahim Kanouche	"fmt"
20*46c4c49dSIbrahim Kanouche	"io"
21*46c4c49dSIbrahim Kanouche	"io/ioutil"
22*46c4c49dSIbrahim Kanouche	"os"
23*46c4c49dSIbrahim Kanouche	"path/filepath"
24*46c4c49dSIbrahim Kanouche	"sort"
25*46c4c49dSIbrahim Kanouche	"strings"
26*46c4c49dSIbrahim Kanouche)
27*46c4c49dSIbrahim Kanouche
// Match is the information about a single instance of a detected match.
type Match struct {
	// Name is the license-name segment of the matched corpus entry.
	Name string

	// Confidence is the score computed for this match.
	Confidence float64

	// MatchType is the detection-type segment of the matched corpus entry.
	MatchType string

	// Variant is the variant segment of the matched corpus entry.
	Variant string

	// StartLine is the input line of the first matched token.
	StartLine int

	// EndLine is the input line of the last matched token.
	EndLine int

	// StartTokenIndex is the index of the first matched token in the input.
	StartTokenIndex int

	// EndTokenIndex is the index of the last matched token in the input
	// (inclusive).
	EndTokenIndex int
}
39*46c4c49dSIbrahim Kanouche
40*46c4c49dSIbrahim Kanouche// Results captures the summary information and matches detected by the
41*46c4c49dSIbrahim Kanouche// classifier.
42*46c4c49dSIbrahim Kanouchetype Results struct {
43*46c4c49dSIbrahim Kanouche	Matches         Matches
44*46c4c49dSIbrahim Kanouche	TotalInputLines int
45*46c4c49dSIbrahim Kanouche}
46*46c4c49dSIbrahim Kanouche
47*46c4c49dSIbrahim Kanouche// Matches is a sortable slice of Match.
48*46c4c49dSIbrahim Kanouchetype Matches []*Match
49*46c4c49dSIbrahim Kanouche
50*46c4c49dSIbrahim Kanouche// Swap two elements of Matches.
51*46c4c49dSIbrahim Kanouchefunc (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
52*46c4c49dSIbrahim Kanouchefunc (d Matches) Len() int      { return len(d) }
53*46c4c49dSIbrahim Kanouchefunc (d Matches) Less(i, j int) bool {
54*46c4c49dSIbrahim Kanouche	di, dj := d[i], d[j]
55*46c4c49dSIbrahim Kanouche	// Return matches ordered by confidence
56*46c4c49dSIbrahim Kanouche	if di.Confidence != dj.Confidence {
57*46c4c49dSIbrahim Kanouche		return di.Confidence > dj.Confidence
58*46c4c49dSIbrahim Kanouche	}
59*46c4c49dSIbrahim Kanouche	// Licenses of same confidence are ordered by their appearance
60*46c4c49dSIbrahim Kanouche	if di.StartTokenIndex != dj.StartTokenIndex {
61*46c4c49dSIbrahim Kanouche		return di.StartTokenIndex < dj.StartTokenIndex
62*46c4c49dSIbrahim Kanouche	}
63*46c4c49dSIbrahim Kanouche	// Should never get here, but tiebreak based on the larger license.
64*46c4c49dSIbrahim Kanouche	return di.EndTokenIndex > dj.EndTokenIndex
65*46c4c49dSIbrahim Kanouche}
66*46c4c49dSIbrahim Kanouche
67*46c4c49dSIbrahim Kanouche// Match reports instances of the supplied content in the corpus.
68*46c4c49dSIbrahim Kanouchefunc (c *Classifier) match(in io.Reader) (Results, error) {
69*46c4c49dSIbrahim Kanouche	id, err := tokenizeStream(in, true, c.dict, false)
70*46c4c49dSIbrahim Kanouche	if err != nil {
71*46c4c49dSIbrahim Kanouche		return Results{}, err
72*46c4c49dSIbrahim Kanouche	}
73*46c4c49dSIbrahim Kanouche
74*46c4c49dSIbrahim Kanouche	firstPass := make(map[string]*indexedDocument)
75*46c4c49dSIbrahim Kanouche	for l, d := range c.docs {
76*46c4c49dSIbrahim Kanouche		sim := id.tokenSimilarity(d)
77*46c4c49dSIbrahim Kanouche
78*46c4c49dSIbrahim Kanouche		if c.tc.traceTokenize(l) {
79*46c4c49dSIbrahim Kanouche			c.tc.trace("Token similarity for %s: %.2f", l, sim)
80*46c4c49dSIbrahim Kanouche		}
81*46c4c49dSIbrahim Kanouche
82*46c4c49dSIbrahim Kanouche		if sim >= c.threshold {
83*46c4c49dSIbrahim Kanouche			firstPass[l] = d
84*46c4c49dSIbrahim Kanouche		}
85*46c4c49dSIbrahim Kanouche	}
86*46c4c49dSIbrahim Kanouche
87*46c4c49dSIbrahim Kanouche	if len(firstPass) == 0 {
88*46c4c49dSIbrahim Kanouche		return Results{
89*46c4c49dSIbrahim Kanouche			Matches:         nil,
90*46c4c49dSIbrahim Kanouche			TotalInputLines: 0,
91*46c4c49dSIbrahim Kanouche		}, nil
92*46c4c49dSIbrahim Kanouche	}
93*46c4c49dSIbrahim Kanouche
94*46c4c49dSIbrahim Kanouche	// Perform the expensive work of generating a searchset to look for token runs.
95*46c4c49dSIbrahim Kanouche	id.generateSearchSet(c.q)
96*46c4c49dSIbrahim Kanouche
97*46c4c49dSIbrahim Kanouche	var candidates Matches
98*46c4c49dSIbrahim Kanouche	candidates = append(candidates, id.Matches...)
99*46c4c49dSIbrahim Kanouche
100*46c4c49dSIbrahim Kanouche	for l, d := range firstPass {
101*46c4c49dSIbrahim Kanouche		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
102*46c4c49dSIbrahim Kanouche		for _, m := range matches {
103*46c4c49dSIbrahim Kanouche			startIndex := m.TargetStart
104*46c4c49dSIbrahim Kanouche			endIndex := m.TargetEnd
105*46c4c49dSIbrahim Kanouche			conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
106*46c4c49dSIbrahim Kanouche			if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
107*46c4c49dSIbrahim Kanouche				candidates = append(candidates, &Match{
108*46c4c49dSIbrahim Kanouche					Name:            LicenseName(l),
109*46c4c49dSIbrahim Kanouche					Variant:         variantName(l),
110*46c4c49dSIbrahim Kanouche					MatchType:       detectionType(l),
111*46c4c49dSIbrahim Kanouche					Confidence:      conf,
112*46c4c49dSIbrahim Kanouche					StartLine:       id.Tokens[startIndex+startOffset].Line,
113*46c4c49dSIbrahim Kanouche					EndLine:         id.Tokens[endIndex-endOffset-1].Line,
114*46c4c49dSIbrahim Kanouche					StartTokenIndex: startIndex + startOffset,
115*46c4c49dSIbrahim Kanouche					EndTokenIndex:   endIndex - endOffset - 1,
116*46c4c49dSIbrahim Kanouche				})
117*46c4c49dSIbrahim Kanouche			}
118*46c4c49dSIbrahim Kanouche
119*46c4c49dSIbrahim Kanouche		}
120*46c4c49dSIbrahim Kanouche	}
121*46c4c49dSIbrahim Kanouche	sort.Sort(candidates)
122*46c4c49dSIbrahim Kanouche	retain := make([]bool, len(candidates))
123*46c4c49dSIbrahim Kanouche	for i, c := range candidates {
124*46c4c49dSIbrahim Kanouche		// Filter out overlapping licenses based primarily on confidence. Since
125*46c4c49dSIbrahim Kanouche		// the candidates slice is ordered by confidence, we look for overlaps and
126*46c4c49dSIbrahim Kanouche		// decide if we retain the record c.
127*46c4c49dSIbrahim Kanouche
128*46c4c49dSIbrahim Kanouche		// For each candidate, only add it to the report unless we have a
129*46c4c49dSIbrahim Kanouche		// higher-quality hit that contains these lines. In the case of two
130*46c4c49dSIbrahim Kanouche		// licenses having overlap, we consider 'token density' to break ties. If a
131*46c4c49dSIbrahim Kanouche		// less confident match of a larger license has more matching tokens than a
132*46c4c49dSIbrahim Kanouche		// perfect match of a smaller license, we want to keep that. This handles
133*46c4c49dSIbrahim Kanouche		// licenses that include another license as a subtext. NPL contains MPL
134*46c4c49dSIbrahim Kanouche		// as a concrete example.
135*46c4c49dSIbrahim Kanouche
136*46c4c49dSIbrahim Kanouche		keep := true
137*46c4c49dSIbrahim Kanouche		proposals := make(map[int]bool)
138*46c4c49dSIbrahim Kanouche		for j, o := range candidates {
139*46c4c49dSIbrahim Kanouche			if j == i {
140*46c4c49dSIbrahim Kanouche				break
141*46c4c49dSIbrahim Kanouche			}
142*46c4c49dSIbrahim Kanouche			// Make sure to only check containment on licenses that are still in consideration at this point.
143*46c4c49dSIbrahim Kanouche			if contains(c, o) && retain[j] {
144*46c4c49dSIbrahim Kanouche				// The license here can override a previous detection, but that isn't sufficient to be kept
145*46c4c49dSIbrahim Kanouche				// on its own. Consider the licenses Xnet, MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1.
146*46c4c49dSIbrahim Kanouche				// The confidence rating on NPL-1.1 will cause Xnet to not be retained, which is correct, but it
147*46c4c49dSIbrahim Kanouche				// shouldn't be retained if the token confidence for MPL is higher than NPL since the NPL-specific
148*46c4c49dSIbrahim Kanouche				// bits are missing.
149*46c4c49dSIbrahim Kanouche
150*46c4c49dSIbrahim Kanouche				ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
151*46c4c49dSIbrahim Kanouche				otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
152*46c4c49dSIbrahim Kanouche				cconf := ctoks * c.Confidence
153*46c4c49dSIbrahim Kanouche				oconf := otoks * o.Confidence
154*46c4c49dSIbrahim Kanouche
155*46c4c49dSIbrahim Kanouche				// If the two licenses are exactly the same confidence, that means we
156*46c4c49dSIbrahim Kanouche				// have an ambiguous detect and should retain both, so the caller can
157*46c4c49dSIbrahim Kanouche				// see and resolve the situation.
158*46c4c49dSIbrahim Kanouche				if cconf > oconf {
159*46c4c49dSIbrahim Kanouche					proposals[j] = false
160*46c4c49dSIbrahim Kanouche				} else if oconf > cconf {
161*46c4c49dSIbrahim Kanouche					keep = false
162*46c4c49dSIbrahim Kanouche				}
163*46c4c49dSIbrahim Kanouche			} else if overlaps(c, o) && retain[j] {
164*46c4c49dSIbrahim Kanouche				// if the ending and start lines exactly overlap, it's OK to keep both
165*46c4c49dSIbrahim Kanouche				if c.StartLine != o.EndLine {
166*46c4c49dSIbrahim Kanouche					keep = false
167*46c4c49dSIbrahim Kanouche				}
168*46c4c49dSIbrahim Kanouche			}
169*46c4c49dSIbrahim Kanouche
170*46c4c49dSIbrahim Kanouche			if !keep {
171*46c4c49dSIbrahim Kanouche				break
172*46c4c49dSIbrahim Kanouche			}
173*46c4c49dSIbrahim Kanouche		}
174*46c4c49dSIbrahim Kanouche		if keep {
175*46c4c49dSIbrahim Kanouche			retain[i] = true
176*46c4c49dSIbrahim Kanouche			for p, v := range proposals {
177*46c4c49dSIbrahim Kanouche				retain[p] = v
178*46c4c49dSIbrahim Kanouche			}
179*46c4c49dSIbrahim Kanouche		}
180*46c4c49dSIbrahim Kanouche	}
181*46c4c49dSIbrahim Kanouche
182*46c4c49dSIbrahim Kanouche	var out Matches
183*46c4c49dSIbrahim Kanouche	for i, keep := range retain {
184*46c4c49dSIbrahim Kanouche		if keep {
185*46c4c49dSIbrahim Kanouche			out = append(out, candidates[i])
186*46c4c49dSIbrahim Kanouche		}
187*46c4c49dSIbrahim Kanouche	}
188*46c4c49dSIbrahim Kanouche	return Results{
189*46c4c49dSIbrahim Kanouche		Matches:         out,
190*46c4c49dSIbrahim Kanouche		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
191*46c4c49dSIbrahim Kanouche	}, nil
192*46c4c49dSIbrahim Kanouche}
193*46c4c49dSIbrahim Kanouche
194*46c4c49dSIbrahim Kanouche// Classifier provides methods for identifying open source licenses in text
195*46c4c49dSIbrahim Kanouche// content.
196*46c4c49dSIbrahim Kanouchetype Classifier struct {
197*46c4c49dSIbrahim Kanouche	tc        *TraceConfiguration
198*46c4c49dSIbrahim Kanouche	dict      *dictionary
199*46c4c49dSIbrahim Kanouche	docs      map[string]*indexedDocument
200*46c4c49dSIbrahim Kanouche	threshold float64
201*46c4c49dSIbrahim Kanouche	q         int // The value of q for q-grams in this corpus
202*46c4c49dSIbrahim Kanouche}
203*46c4c49dSIbrahim Kanouche
204*46c4c49dSIbrahim Kanouche// NewClassifier creates a classifier with an empty corpus.
205*46c4c49dSIbrahim Kanouchefunc NewClassifier(threshold float64) *Classifier {
206*46c4c49dSIbrahim Kanouche	classifier := &Classifier{
207*46c4c49dSIbrahim Kanouche		tc:        new(TraceConfiguration),
208*46c4c49dSIbrahim Kanouche		dict:      newDictionary(),
209*46c4c49dSIbrahim Kanouche		docs:      make(map[string]*indexedDocument),
210*46c4c49dSIbrahim Kanouche		threshold: threshold,
211*46c4c49dSIbrahim Kanouche		q:         computeQ(threshold),
212*46c4c49dSIbrahim Kanouche	}
213*46c4c49dSIbrahim Kanouche	return classifier
214*46c4c49dSIbrahim Kanouche}
215*46c4c49dSIbrahim Kanouche
216*46c4c49dSIbrahim Kanouche// Normalize takes input content and applies the following transforms to aid in
217*46c4c49dSIbrahim Kanouche// identifying license content. The return value of this function is
218*46c4c49dSIbrahim Kanouche// line-separated text which is the basis for position values returned by the
219*46c4c49dSIbrahim Kanouche// classifier.
220*46c4c49dSIbrahim Kanouche//
221*46c4c49dSIbrahim Kanouche// 1. Breaks up long lines of text. This helps with detecting licenses like in
222*46c4c49dSIbrahim Kanouche// TODO(wcn):URL reference
223*46c4c49dSIbrahim Kanouche//
224*46c4c49dSIbrahim Kanouche// 2. Certain ignorable texts are removed to aid matching blocks of text.
225*46c4c49dSIbrahim Kanouche// Introductory lines such as "The MIT License" are removed. Copyright notices
226*46c4c49dSIbrahim Kanouche// are removed since the parties are variable and shouldn't impact matching.
227*46c4c49dSIbrahim Kanouche//
228*46c4c49dSIbrahim Kanouche// It is NOT necessary to call this function to simply identify licenses in a
229*46c4c49dSIbrahim Kanouche// file. It should only be called to aid presenting this information to the user
230*46c4c49dSIbrahim Kanouche// in context (for example, creating diffs of differences to canonical
231*46c4c49dSIbrahim Kanouche// licenses).
232*46c4c49dSIbrahim Kanouche//
233*46c4c49dSIbrahim Kanouche// It is an invariant of the classifier that calling Match(Normalize(in)) will
234*46c4c49dSIbrahim Kanouche// return the same results as Match(in).
235*46c4c49dSIbrahim Kanouchefunc (c *Classifier) Normalize(in []byte) []byte {
236*46c4c49dSIbrahim Kanouche	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
237*46c4c49dSIbrahim Kanouche	if err != nil {
238*46c4c49dSIbrahim Kanouche		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
239*46c4c49dSIbrahim Kanouche	}
240*46c4c49dSIbrahim Kanouche
241*46c4c49dSIbrahim Kanouche	var buf bytes.Buffer
242*46c4c49dSIbrahim Kanouche
243*46c4c49dSIbrahim Kanouche	switch len(doc.Tokens) {
244*46c4c49dSIbrahim Kanouche	case 0:
245*46c4c49dSIbrahim Kanouche		return nil
246*46c4c49dSIbrahim Kanouche	case 1:
247*46c4c49dSIbrahim Kanouche		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
248*46c4c49dSIbrahim Kanouche		return buf.Bytes()
249*46c4c49dSIbrahim Kanouche	}
250*46c4c49dSIbrahim Kanouche
251*46c4c49dSIbrahim Kanouche	prevLine := 1
252*46c4c49dSIbrahim Kanouche	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
253*46c4c49dSIbrahim Kanouche	for _, t := range doc.Tokens[1:] {
254*46c4c49dSIbrahim Kanouche		// Only write out an EOL token that incremented the line
255*46c4c49dSIbrahim Kanouche		if t.Line == prevLine+1 {
256*46c4c49dSIbrahim Kanouche			buf.WriteString(eol)
257*46c4c49dSIbrahim Kanouche		}
258*46c4c49dSIbrahim Kanouche
259*46c4c49dSIbrahim Kanouche		// Only write tokens that aren't EOL
260*46c4c49dSIbrahim Kanouche		txt := c.dict.getWord(t.ID)
261*46c4c49dSIbrahim Kanouche
262*46c4c49dSIbrahim Kanouche		if txt != eol {
263*46c4c49dSIbrahim Kanouche			// Only put a space between tokens if the previous token was on the same
264*46c4c49dSIbrahim Kanouche			// line. This prevents spaces after an EOL
265*46c4c49dSIbrahim Kanouche			if t.Line == prevLine {
266*46c4c49dSIbrahim Kanouche				buf.WriteString(" ")
267*46c4c49dSIbrahim Kanouche			}
268*46c4c49dSIbrahim Kanouche			buf.WriteString(txt)
269*46c4c49dSIbrahim Kanouche		}
270*46c4c49dSIbrahim Kanouche
271*46c4c49dSIbrahim Kanouche		prevLine = t.Line
272*46c4c49dSIbrahim Kanouche	}
273*46c4c49dSIbrahim Kanouche	return buf.Bytes()
274*46c4c49dSIbrahim Kanouche}
275*46c4c49dSIbrahim Kanouche
276*46c4c49dSIbrahim Kanouche// LoadLicenses adds the contents of the supplied directory to the corpus of the
277*46c4c49dSIbrahim Kanouche// classifier.
278*46c4c49dSIbrahim Kanouchefunc (c *Classifier) LoadLicenses(dir string) error {
279*46c4c49dSIbrahim Kanouche	var files []string
280*46c4c49dSIbrahim Kanouche	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
281*46c4c49dSIbrahim Kanouche		if err != nil {
282*46c4c49dSIbrahim Kanouche			return nil
283*46c4c49dSIbrahim Kanouche		}
284*46c4c49dSIbrahim Kanouche		if !strings.HasSuffix(path, "txt") {
285*46c4c49dSIbrahim Kanouche			return nil
286*46c4c49dSIbrahim Kanouche		}
287*46c4c49dSIbrahim Kanouche		files = append(files, path)
288*46c4c49dSIbrahim Kanouche		return nil
289*46c4c49dSIbrahim Kanouche	})
290*46c4c49dSIbrahim Kanouche	if err != nil {
291*46c4c49dSIbrahim Kanouche		return err
292*46c4c49dSIbrahim Kanouche	}
293*46c4c49dSIbrahim Kanouche
294*46c4c49dSIbrahim Kanouche	for _, f := range files {
295*46c4c49dSIbrahim Kanouche		relativePath := strings.Replace(f, dir, "", 1)
296*46c4c49dSIbrahim Kanouche		sep := fmt.Sprintf("%c", os.PathSeparator)
297*46c4c49dSIbrahim Kanouche		segments := strings.Split(relativePath, sep)
298*46c4c49dSIbrahim Kanouche		if len(segments) < 3 {
299*46c4c49dSIbrahim Kanouche			c.tc.trace("Insufficient segment count for path: %s", relativePath)
300*46c4c49dSIbrahim Kanouche			continue
301*46c4c49dSIbrahim Kanouche		}
302*46c4c49dSIbrahim Kanouche		category, name, variant := segments[1], segments[2], segments[3]
303*46c4c49dSIbrahim Kanouche		b, err := ioutil.ReadFile(f)
304*46c4c49dSIbrahim Kanouche		if err != nil {
305*46c4c49dSIbrahim Kanouche			return err
306*46c4c49dSIbrahim Kanouche		}
307*46c4c49dSIbrahim Kanouche
308*46c4c49dSIbrahim Kanouche		c.AddContent(category, name, variant, []byte(string(b)))
309*46c4c49dSIbrahim Kanouche	}
310*46c4c49dSIbrahim Kanouche	return nil
311*46c4c49dSIbrahim Kanouche}
312*46c4c49dSIbrahim Kanouche
313*46c4c49dSIbrahim Kanouche// SetTraceConfiguration installs a tracing configuration for the classifier.
314*46c4c49dSIbrahim Kanouchefunc (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
315*46c4c49dSIbrahim Kanouche	c.tc = in
316*46c4c49dSIbrahim Kanouche	c.tc.init()
317*46c4c49dSIbrahim Kanouche}
318*46c4c49dSIbrahim Kanouche
319*46c4c49dSIbrahim Kanouche// Match finds matches within an unknown text. This will not modify the contents
320*46c4c49dSIbrahim Kanouche// of the supplied byte slice.
321*46c4c49dSIbrahim Kanouchefunc (c *Classifier) Match(in []byte) Results {
322*46c4c49dSIbrahim Kanouche	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
323*46c4c49dSIbrahim Kanouche	// will never return an error so it's okay to ignore the return value in this
324*46c4c49dSIbrahim Kanouche	// case.
325*46c4c49dSIbrahim Kanouche	res, _ := c.MatchFrom(bytes.NewReader(in))
326*46c4c49dSIbrahim Kanouche	return res
327*46c4c49dSIbrahim Kanouche}
328*46c4c49dSIbrahim Kanouche
329*46c4c49dSIbrahim Kanouche// MatchFrom finds matches within the read content.
330*46c4c49dSIbrahim Kanouchefunc (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
331*46c4c49dSIbrahim Kanouche	return c.match(in)
332*46c4c49dSIbrahim Kanouche}
333*46c4c49dSIbrahim Kanouche
// detectionType returns the first segment of in, where segments are separated
// by the OS path separator. If in contains no separator, in itself is returned.
func detectionType(in string) string {
	// Only the text before the first separator is needed, so stop splitting
	// after it. SplitN always yields at least one element here.
	return strings.SplitN(in, string(os.PathSeparator), 2)[0]
}
338*46c4c49dSIbrahim Kanouche
// variantName returns the third segment of in, where segments are separated by
// the OS path separator. It returns the empty string when in has fewer than
// three segments.
func variantName(in string) string {
	splits := strings.Split(in, string(os.PathSeparator))
	if len(splits) < 3 {
		return ""
	}
	return splits[2]
}
343*46c4c49dSIbrahim Kanouche
// LicenseName produces the output name for a license, removing the internal
// structure of the filename in use. The name is the second segment of in, where
// segments are separated by the OS path separator. It returns the empty string
// when in has fewer than two segments.
func LicenseName(in string) string {
	// Three pieces are enough: everything after the second separator is
	// irrelevant to the name.
	splits := strings.SplitN(in, string(os.PathSeparator), 3)
	if len(splits) < 2 {
		return ""
	}
	return splits[1]
}
350*46c4c49dSIbrahim Kanouche
351*46c4c49dSIbrahim Kanouche// contains returns true iff b is completely inside a
352*46c4c49dSIbrahim Kanouchefunc contains(a, b *Match) bool {
353*46c4c49dSIbrahim Kanouche	return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
354*46c4c49dSIbrahim Kanouche}
355*46c4c49dSIbrahim Kanouche
// between returns true iff b <= a <= c, i.e. a lies within the closed interval
// [b, c].
func between(a, b, c int) bool {
	return b <= a && a <= c
}
360*46c4c49dSIbrahim Kanouche
361*46c4c49dSIbrahim Kanouche// returns true iff the ranges covered by a and b overlap.
362*46c4c49dSIbrahim Kanouchefunc overlaps(a, b *Match) bool {
363*46c4c49dSIbrahim Kanouche	return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine)
364*46c4c49dSIbrahim Kanouche}
365