1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouchepackage classifier 16*46c4c49dSIbrahim Kanouche 17*46c4c49dSIbrahim Kanoucheimport ( 18*46c4c49dSIbrahim Kanouche "bytes" 19*46c4c49dSIbrahim Kanouche "fmt" 20*46c4c49dSIbrahim Kanouche "io" 21*46c4c49dSIbrahim Kanouche "io/ioutil" 22*46c4c49dSIbrahim Kanouche "os" 23*46c4c49dSIbrahim Kanouche "path/filepath" 24*46c4c49dSIbrahim Kanouche "sort" 25*46c4c49dSIbrahim Kanouche "strings" 26*46c4c49dSIbrahim Kanouche) 27*46c4c49dSIbrahim Kanouche 28*46c4c49dSIbrahim Kanouche// Match is the information about a single instance of a detected match. 
29*46c4c49dSIbrahim Kanouchetype Match struct { 30*46c4c49dSIbrahim Kanouche Name string 31*46c4c49dSIbrahim Kanouche Confidence float64 32*46c4c49dSIbrahim Kanouche MatchType string 33*46c4c49dSIbrahim Kanouche Variant string 34*46c4c49dSIbrahim Kanouche StartLine int 35*46c4c49dSIbrahim Kanouche EndLine int 36*46c4c49dSIbrahim Kanouche StartTokenIndex int 37*46c4c49dSIbrahim Kanouche EndTokenIndex int 38*46c4c49dSIbrahim Kanouche} 39*46c4c49dSIbrahim Kanouche 40*46c4c49dSIbrahim Kanouche// Results captures the summary information and matches detected by the 41*46c4c49dSIbrahim Kanouche// classifier. 42*46c4c49dSIbrahim Kanouchetype Results struct { 43*46c4c49dSIbrahim Kanouche Matches Matches 44*46c4c49dSIbrahim Kanouche TotalInputLines int 45*46c4c49dSIbrahim Kanouche} 46*46c4c49dSIbrahim Kanouche 47*46c4c49dSIbrahim Kanouche// Matches is a sortable slice of Match. 48*46c4c49dSIbrahim Kanouchetype Matches []*Match 49*46c4c49dSIbrahim Kanouche 50*46c4c49dSIbrahim Kanouche// Swap two elements of Matches. 51*46c4c49dSIbrahim Kanouchefunc (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] } 52*46c4c49dSIbrahim Kanouchefunc (d Matches) Len() int { return len(d) } 53*46c4c49dSIbrahim Kanouchefunc (d Matches) Less(i, j int) bool { 54*46c4c49dSIbrahim Kanouche di, dj := d[i], d[j] 55*46c4c49dSIbrahim Kanouche // Return matches ordered by confidence 56*46c4c49dSIbrahim Kanouche if di.Confidence != dj.Confidence { 57*46c4c49dSIbrahim Kanouche return di.Confidence > dj.Confidence 58*46c4c49dSIbrahim Kanouche } 59*46c4c49dSIbrahim Kanouche // Licenses of same confidence are ordered by their appearance 60*46c4c49dSIbrahim Kanouche if di.StartTokenIndex != dj.StartTokenIndex { 61*46c4c49dSIbrahim Kanouche return di.StartTokenIndex < dj.StartTokenIndex 62*46c4c49dSIbrahim Kanouche } 63*46c4c49dSIbrahim Kanouche // Should never get here, but tiebreak based on the larger license. 
64*46c4c49dSIbrahim Kanouche return di.EndTokenIndex > dj.EndTokenIndex 65*46c4c49dSIbrahim Kanouche} 66*46c4c49dSIbrahim Kanouche 67*46c4c49dSIbrahim Kanouche// Match reports instances of the supplied content in the corpus. 68*46c4c49dSIbrahim Kanouchefunc (c *Classifier) match(in io.Reader) (Results, error) { 69*46c4c49dSIbrahim Kanouche id, err := tokenizeStream(in, true, c.dict, false) 70*46c4c49dSIbrahim Kanouche if err != nil { 71*46c4c49dSIbrahim Kanouche return Results{}, err 72*46c4c49dSIbrahim Kanouche } 73*46c4c49dSIbrahim Kanouche 74*46c4c49dSIbrahim Kanouche firstPass := make(map[string]*indexedDocument) 75*46c4c49dSIbrahim Kanouche for l, d := range c.docs { 76*46c4c49dSIbrahim Kanouche sim := id.tokenSimilarity(d) 77*46c4c49dSIbrahim Kanouche 78*46c4c49dSIbrahim Kanouche if c.tc.traceTokenize(l) { 79*46c4c49dSIbrahim Kanouche c.tc.trace("Token similarity for %s: %.2f", l, sim) 80*46c4c49dSIbrahim Kanouche } 81*46c4c49dSIbrahim Kanouche 82*46c4c49dSIbrahim Kanouche if sim >= c.threshold { 83*46c4c49dSIbrahim Kanouche firstPass[l] = d 84*46c4c49dSIbrahim Kanouche } 85*46c4c49dSIbrahim Kanouche } 86*46c4c49dSIbrahim Kanouche 87*46c4c49dSIbrahim Kanouche if len(firstPass) == 0 { 88*46c4c49dSIbrahim Kanouche return Results{ 89*46c4c49dSIbrahim Kanouche Matches: nil, 90*46c4c49dSIbrahim Kanouche TotalInputLines: 0, 91*46c4c49dSIbrahim Kanouche }, nil 92*46c4c49dSIbrahim Kanouche } 93*46c4c49dSIbrahim Kanouche 94*46c4c49dSIbrahim Kanouche // Perform the expensive work of generating a searchset to look for token runs. 95*46c4c49dSIbrahim Kanouche id.generateSearchSet(c.q) 96*46c4c49dSIbrahim Kanouche 97*46c4c49dSIbrahim Kanouche var candidates Matches 98*46c4c49dSIbrahim Kanouche candidates = append(candidates, id.Matches...) 
99*46c4c49dSIbrahim Kanouche 100*46c4c49dSIbrahim Kanouche for l, d := range firstPass { 101*46c4c49dSIbrahim Kanouche matches := c.findPotentialMatches(d.s, id.s, c.threshold) 102*46c4c49dSIbrahim Kanouche for _, m := range matches { 103*46c4c49dSIbrahim Kanouche startIndex := m.TargetStart 104*46c4c49dSIbrahim Kanouche endIndex := m.TargetEnd 105*46c4c49dSIbrahim Kanouche conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex) 106*46c4c49dSIbrahim Kanouche if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 { 107*46c4c49dSIbrahim Kanouche candidates = append(candidates, &Match{ 108*46c4c49dSIbrahim Kanouche Name: LicenseName(l), 109*46c4c49dSIbrahim Kanouche Variant: variantName(l), 110*46c4c49dSIbrahim Kanouche MatchType: detectionType(l), 111*46c4c49dSIbrahim Kanouche Confidence: conf, 112*46c4c49dSIbrahim Kanouche StartLine: id.Tokens[startIndex+startOffset].Line, 113*46c4c49dSIbrahim Kanouche EndLine: id.Tokens[endIndex-endOffset-1].Line, 114*46c4c49dSIbrahim Kanouche StartTokenIndex: startIndex + startOffset, 115*46c4c49dSIbrahim Kanouche EndTokenIndex: endIndex - endOffset - 1, 116*46c4c49dSIbrahim Kanouche }) 117*46c4c49dSIbrahim Kanouche } 118*46c4c49dSIbrahim Kanouche 119*46c4c49dSIbrahim Kanouche } 120*46c4c49dSIbrahim Kanouche } 121*46c4c49dSIbrahim Kanouche sort.Sort(candidates) 122*46c4c49dSIbrahim Kanouche retain := make([]bool, len(candidates)) 123*46c4c49dSIbrahim Kanouche for i, c := range candidates { 124*46c4c49dSIbrahim Kanouche // Filter out overlapping licenses based primarily on confidence. Since 125*46c4c49dSIbrahim Kanouche // the candidates slice is ordered by confidence, we look for overlaps and 126*46c4c49dSIbrahim Kanouche // decide if we retain the record c. 127*46c4c49dSIbrahim Kanouche 128*46c4c49dSIbrahim Kanouche // For each candidate, only add it to the report unless we have a 129*46c4c49dSIbrahim Kanouche // higher-quality hit that contains these lines. 
In the case of two 130*46c4c49dSIbrahim Kanouche // licenses having overlap, we consider 'token density' to break ties. If a 131*46c4c49dSIbrahim Kanouche // less confident match of a larger license has more matching tokens than a 132*46c4c49dSIbrahim Kanouche // perfect match of a smaller license, we want to keep that. This handles 133*46c4c49dSIbrahim Kanouche // licenses that include another license as a subtext. NPL contains MPL 134*46c4c49dSIbrahim Kanouche // as a concrete example. 135*46c4c49dSIbrahim Kanouche 136*46c4c49dSIbrahim Kanouche keep := true 137*46c4c49dSIbrahim Kanouche proposals := make(map[int]bool) 138*46c4c49dSIbrahim Kanouche for j, o := range candidates { 139*46c4c49dSIbrahim Kanouche if j == i { 140*46c4c49dSIbrahim Kanouche break 141*46c4c49dSIbrahim Kanouche } 142*46c4c49dSIbrahim Kanouche // Make sure to only check containment on licenses that are still in consideration at this point. 143*46c4c49dSIbrahim Kanouche if contains(c, o) && retain[j] { 144*46c4c49dSIbrahim Kanouche // The license here can override a previous detection, but that isn't sufficient to be kept 145*46c4c49dSIbrahim Kanouche // on its own. Consider the licenses Xnet, MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1. 146*46c4c49dSIbrahim Kanouche // The confidence rating on NPL-1.1 will cause Xnet to not be retained, which is correct, but it 147*46c4c49dSIbrahim Kanouche // shouldn't be retained if the token confidence for MPL is higher than NPL since the NPL-specific 148*46c4c49dSIbrahim Kanouche // bits are missing. 
149*46c4c49dSIbrahim Kanouche 150*46c4c49dSIbrahim Kanouche ctoks := float64(c.EndTokenIndex - c.StartTokenIndex) 151*46c4c49dSIbrahim Kanouche otoks := float64(o.EndTokenIndex - o.StartTokenIndex) 152*46c4c49dSIbrahim Kanouche cconf := ctoks * c.Confidence 153*46c4c49dSIbrahim Kanouche oconf := otoks * o.Confidence 154*46c4c49dSIbrahim Kanouche 155*46c4c49dSIbrahim Kanouche // If the two licenses are exactly the same confidence, that means we 156*46c4c49dSIbrahim Kanouche // have an ambiguous detect and should retain both, so the caller can 157*46c4c49dSIbrahim Kanouche // see and resolve the situation. 158*46c4c49dSIbrahim Kanouche if cconf > oconf { 159*46c4c49dSIbrahim Kanouche proposals[j] = false 160*46c4c49dSIbrahim Kanouche } else if oconf > cconf { 161*46c4c49dSIbrahim Kanouche keep = false 162*46c4c49dSIbrahim Kanouche } 163*46c4c49dSIbrahim Kanouche } else if overlaps(c, o) && retain[j] { 164*46c4c49dSIbrahim Kanouche // if the ending and start lines exactly overlap, it's OK to keep both 165*46c4c49dSIbrahim Kanouche if c.StartLine != o.EndLine { 166*46c4c49dSIbrahim Kanouche keep = false 167*46c4c49dSIbrahim Kanouche } 168*46c4c49dSIbrahim Kanouche } 169*46c4c49dSIbrahim Kanouche 170*46c4c49dSIbrahim Kanouche if !keep { 171*46c4c49dSIbrahim Kanouche break 172*46c4c49dSIbrahim Kanouche } 173*46c4c49dSIbrahim Kanouche } 174*46c4c49dSIbrahim Kanouche if keep { 175*46c4c49dSIbrahim Kanouche retain[i] = true 176*46c4c49dSIbrahim Kanouche for p, v := range proposals { 177*46c4c49dSIbrahim Kanouche retain[p] = v 178*46c4c49dSIbrahim Kanouche } 179*46c4c49dSIbrahim Kanouche } 180*46c4c49dSIbrahim Kanouche } 181*46c4c49dSIbrahim Kanouche 182*46c4c49dSIbrahim Kanouche var out Matches 183*46c4c49dSIbrahim Kanouche for i, keep := range retain { 184*46c4c49dSIbrahim Kanouche if keep { 185*46c4c49dSIbrahim Kanouche out = append(out, candidates[i]) 186*46c4c49dSIbrahim Kanouche } 187*46c4c49dSIbrahim Kanouche } 188*46c4c49dSIbrahim Kanouche return Results{ 
189*46c4c49dSIbrahim Kanouche Matches: out, 190*46c4c49dSIbrahim Kanouche TotalInputLines: id.Tokens[len(id.Tokens)-1].Line, 191*46c4c49dSIbrahim Kanouche }, nil 192*46c4c49dSIbrahim Kanouche} 193*46c4c49dSIbrahim Kanouche 194*46c4c49dSIbrahim Kanouche// Classifier provides methods for identifying open source licenses in text 195*46c4c49dSIbrahim Kanouche// content. 196*46c4c49dSIbrahim Kanouchetype Classifier struct { 197*46c4c49dSIbrahim Kanouche tc *TraceConfiguration 198*46c4c49dSIbrahim Kanouche dict *dictionary 199*46c4c49dSIbrahim Kanouche docs map[string]*indexedDocument 200*46c4c49dSIbrahim Kanouche threshold float64 201*46c4c49dSIbrahim Kanouche q int // The value of q for q-grams in this corpus 202*46c4c49dSIbrahim Kanouche} 203*46c4c49dSIbrahim Kanouche 204*46c4c49dSIbrahim Kanouche// NewClassifier creates a classifier with an empty corpus. 205*46c4c49dSIbrahim Kanouchefunc NewClassifier(threshold float64) *Classifier { 206*46c4c49dSIbrahim Kanouche classifier := &Classifier{ 207*46c4c49dSIbrahim Kanouche tc: new(TraceConfiguration), 208*46c4c49dSIbrahim Kanouche dict: newDictionary(), 209*46c4c49dSIbrahim Kanouche docs: make(map[string]*indexedDocument), 210*46c4c49dSIbrahim Kanouche threshold: threshold, 211*46c4c49dSIbrahim Kanouche q: computeQ(threshold), 212*46c4c49dSIbrahim Kanouche } 213*46c4c49dSIbrahim Kanouche return classifier 214*46c4c49dSIbrahim Kanouche} 215*46c4c49dSIbrahim Kanouche 216*46c4c49dSIbrahim Kanouche// Normalize takes input content and applies the following transforms to aid in 217*46c4c49dSIbrahim Kanouche// identifying license content. The return value of this function is 218*46c4c49dSIbrahim Kanouche// line-separated text which is the basis for position values returned by the 219*46c4c49dSIbrahim Kanouche// classifier. 220*46c4c49dSIbrahim Kanouche// 221*46c4c49dSIbrahim Kanouche// 1. Breaks up long lines of text. 
// This helps with detecting licenses like in
// TODO(wcn):URL reference
//
// 2. Certain ignorable texts are removed to aid matching blocks of text.
// Introductory lines such as "The MIT License" are removed. Copyright notices
// are removed since the parties are variable and shouldn't impact matching.
//
// It is NOT necessary to call this function to simply identify licenses in a
// file. It should only be called to aid presenting this information to the user
// in context (for example, creating diffs of differences to canonical
// licenses).
//
// It is an invariant of the classifier that calling Match(Normalize(in)) will
// return the same results as Match(in).
235*46c4c49dSIbrahim Kanouchefunc (c *Classifier) Normalize(in []byte) []byte { 236*46c4c49dSIbrahim Kanouche doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true) 237*46c4c49dSIbrahim Kanouche if err != nil { 238*46c4c49dSIbrahim Kanouche panic("should not be reachable, since bytes.NewReader().Read() should never fail") 239*46c4c49dSIbrahim Kanouche } 240*46c4c49dSIbrahim Kanouche 241*46c4c49dSIbrahim Kanouche var buf bytes.Buffer 242*46c4c49dSIbrahim Kanouche 243*46c4c49dSIbrahim Kanouche switch len(doc.Tokens) { 244*46c4c49dSIbrahim Kanouche case 0: 245*46c4c49dSIbrahim Kanouche return nil 246*46c4c49dSIbrahim Kanouche case 1: 247*46c4c49dSIbrahim Kanouche buf.WriteString(c.dict.getWord(doc.Tokens[0].ID)) 248*46c4c49dSIbrahim Kanouche return buf.Bytes() 249*46c4c49dSIbrahim Kanouche } 250*46c4c49dSIbrahim Kanouche 251*46c4c49dSIbrahim Kanouche prevLine := 1 252*46c4c49dSIbrahim Kanouche buf.WriteString(c.dict.getWord(doc.Tokens[0].ID)) 253*46c4c49dSIbrahim Kanouche for _, t := range doc.Tokens[1:] { 254*46c4c49dSIbrahim Kanouche // Only write out an EOL token that incremented the line 255*46c4c49dSIbrahim Kanouche if t.Line == prevLine+1 { 256*46c4c49dSIbrahim Kanouche buf.WriteString(eol) 257*46c4c49dSIbrahim Kanouche } 258*46c4c49dSIbrahim Kanouche 259*46c4c49dSIbrahim Kanouche // Only write tokens that aren't EOL 260*46c4c49dSIbrahim Kanouche txt := c.dict.getWord(t.ID) 261*46c4c49dSIbrahim Kanouche 262*46c4c49dSIbrahim Kanouche if txt != eol { 263*46c4c49dSIbrahim Kanouche // Only put a space between tokens if the previous token was on the same 264*46c4c49dSIbrahim Kanouche // line. 
This prevents spaces after an EOL 265*46c4c49dSIbrahim Kanouche if t.Line == prevLine { 266*46c4c49dSIbrahim Kanouche buf.WriteString(" ") 267*46c4c49dSIbrahim Kanouche } 268*46c4c49dSIbrahim Kanouche buf.WriteString(txt) 269*46c4c49dSIbrahim Kanouche } 270*46c4c49dSIbrahim Kanouche 271*46c4c49dSIbrahim Kanouche prevLine = t.Line 272*46c4c49dSIbrahim Kanouche } 273*46c4c49dSIbrahim Kanouche return buf.Bytes() 274*46c4c49dSIbrahim Kanouche} 275*46c4c49dSIbrahim Kanouche 276*46c4c49dSIbrahim Kanouche// LoadLicenses adds the contents of the supplied directory to the corpus of the 277*46c4c49dSIbrahim Kanouche// classifier. 278*46c4c49dSIbrahim Kanouchefunc (c *Classifier) LoadLicenses(dir string) error { 279*46c4c49dSIbrahim Kanouche var files []string 280*46c4c49dSIbrahim Kanouche err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 281*46c4c49dSIbrahim Kanouche if err != nil { 282*46c4c49dSIbrahim Kanouche return nil 283*46c4c49dSIbrahim Kanouche } 284*46c4c49dSIbrahim Kanouche if !strings.HasSuffix(path, "txt") { 285*46c4c49dSIbrahim Kanouche return nil 286*46c4c49dSIbrahim Kanouche } 287*46c4c49dSIbrahim Kanouche files = append(files, path) 288*46c4c49dSIbrahim Kanouche return nil 289*46c4c49dSIbrahim Kanouche }) 290*46c4c49dSIbrahim Kanouche if err != nil { 291*46c4c49dSIbrahim Kanouche return err 292*46c4c49dSIbrahim Kanouche } 293*46c4c49dSIbrahim Kanouche 294*46c4c49dSIbrahim Kanouche for _, f := range files { 295*46c4c49dSIbrahim Kanouche relativePath := strings.Replace(f, dir, "", 1) 296*46c4c49dSIbrahim Kanouche sep := fmt.Sprintf("%c", os.PathSeparator) 297*46c4c49dSIbrahim Kanouche segments := strings.Split(relativePath, sep) 298*46c4c49dSIbrahim Kanouche if len(segments) < 3 { 299*46c4c49dSIbrahim Kanouche c.tc.trace("Insufficient segment count for path: %s", relativePath) 300*46c4c49dSIbrahim Kanouche continue 301*46c4c49dSIbrahim Kanouche } 302*46c4c49dSIbrahim Kanouche category, name, variant := segments[1], segments[2], 
segments[3] 303*46c4c49dSIbrahim Kanouche b, err := ioutil.ReadFile(f) 304*46c4c49dSIbrahim Kanouche if err != nil { 305*46c4c49dSIbrahim Kanouche return err 306*46c4c49dSIbrahim Kanouche } 307*46c4c49dSIbrahim Kanouche 308*46c4c49dSIbrahim Kanouche c.AddContent(category, name, variant, []byte(string(b))) 309*46c4c49dSIbrahim Kanouche } 310*46c4c49dSIbrahim Kanouche return nil 311*46c4c49dSIbrahim Kanouche} 312*46c4c49dSIbrahim Kanouche 313*46c4c49dSIbrahim Kanouche// SetTraceConfiguration installs a tracing configuration for the classifier. 314*46c4c49dSIbrahim Kanouchefunc (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) { 315*46c4c49dSIbrahim Kanouche c.tc = in 316*46c4c49dSIbrahim Kanouche c.tc.init() 317*46c4c49dSIbrahim Kanouche} 318*46c4c49dSIbrahim Kanouche 319*46c4c49dSIbrahim Kanouche// Match finds matches within an unknown text. This will not modify the contents 320*46c4c49dSIbrahim Kanouche// of the supplied byte slice. 321*46c4c49dSIbrahim Kanouchefunc (c *Classifier) Match(in []byte) Results { 322*46c4c49dSIbrahim Kanouche // Since bytes.NewReader().Read() will never return an error, tokenizeStream 323*46c4c49dSIbrahim Kanouche // will never return an error so it's okay to ignore the return value in this 324*46c4c49dSIbrahim Kanouche // case. 325*46c4c49dSIbrahim Kanouche res, _ := c.MatchFrom(bytes.NewReader(in)) 326*46c4c49dSIbrahim Kanouche return res 327*46c4c49dSIbrahim Kanouche} 328*46c4c49dSIbrahim Kanouche 329*46c4c49dSIbrahim Kanouche// MatchFrom finds matches within the read content. 
330*46c4c49dSIbrahim Kanouchefunc (c *Classifier) MatchFrom(in io.Reader) (Results, error) { 331*46c4c49dSIbrahim Kanouche return c.match(in) 332*46c4c49dSIbrahim Kanouche} 333*46c4c49dSIbrahim Kanouche 334*46c4c49dSIbrahim Kanouchefunc detectionType(in string) string { 335*46c4c49dSIbrahim Kanouche splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) 336*46c4c49dSIbrahim Kanouche return splits[0] 337*46c4c49dSIbrahim Kanouche} 338*46c4c49dSIbrahim Kanouche 339*46c4c49dSIbrahim Kanouchefunc variantName(in string) string { 340*46c4c49dSIbrahim Kanouche splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) 341*46c4c49dSIbrahim Kanouche return splits[2] 342*46c4c49dSIbrahim Kanouche} 343*46c4c49dSIbrahim Kanouche 344*46c4c49dSIbrahim Kanouche// LicenseName produces the output name for a license, removing the internal structure 345*46c4c49dSIbrahim Kanouche// of the filename in use. 346*46c4c49dSIbrahim Kanouchefunc LicenseName(in string) string { 347*46c4c49dSIbrahim Kanouche splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) 348*46c4c49dSIbrahim Kanouche return splits[1] 349*46c4c49dSIbrahim Kanouche} 350*46c4c49dSIbrahim Kanouche 351*46c4c49dSIbrahim Kanouche// contains returns true iff b is completely inside a 352*46c4c49dSIbrahim Kanouchefunc contains(a, b *Match) bool { 353*46c4c49dSIbrahim Kanouche return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine 354*46c4c49dSIbrahim Kanouche} 355*46c4c49dSIbrahim Kanouche 356*46c4c49dSIbrahim Kanouche// returns true iff b <= a <= c 357*46c4c49dSIbrahim Kanouchefunc between(a, b, c int) bool { 358*46c4c49dSIbrahim Kanouche return b <= a && a <= c 359*46c4c49dSIbrahim Kanouche} 360*46c4c49dSIbrahim Kanouche 361*46c4c49dSIbrahim Kanouche// returns true iff the ranges covered by a and b overlap. 
362*46c4c49dSIbrahim Kanouchefunc overlaps(a, b *Match) bool { 363*46c4c49dSIbrahim Kanouche return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine) 364*46c4c49dSIbrahim Kanouche} 365