// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// Match is the information about a single instance of a detected match.
type Match struct {
	// Name is the public license name, extracted from the second segment of
	// the corpus identifier (see LicenseName).
	Name string
	// Confidence is the score produced for this match; only matches scoring
	// at or above the classifier threshold are reported.
	Confidence float64
	// MatchType is the detection category, the first segment of the corpus
	// identifier (see detectionType).
	MatchType string
	// Variant is the specific license variant, the third segment of the
	// corpus identifier (see variantName).
	Variant string
	// StartLine and EndLine are the line numbers of the first and last
	// matched tokens in the input.
	StartLine int
	EndLine   int
	// StartTokenIndex and EndTokenIndex are offsets into the tokenized
	// input covering the matched run (EndTokenIndex is inclusive).
	StartTokenIndex int
	EndTokenIndex   int
}

// Results captures the summary information and matches detected by the
// classifier.
type Results struct {
	Matches Matches
	// TotalInputLines is the line number of the last input token, or 0 when
	// no license content was detected.
	TotalInputLines int
}

// Matches is a sortable slice of Match.
type Matches []*Match

// Swap two elements of Matches.
func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }

// Len returns the number of matches.
func (d Matches) Len() int { return len(d) }

// Less orders matches by descending confidence, then by position of
// appearance, then by descending match length.
func (d Matches) Less(i, j int) bool {
	di, dj := d[i], d[j]
	// Return matches ordered by confidence
	if di.Confidence != dj.Confidence {
		return di.Confidence > dj.Confidence
	}
	// Licenses of same confidence are ordered by their appearance
	if di.StartTokenIndex != dj.StartTokenIndex {
		return di.StartTokenIndex < dj.StartTokenIndex
	}
	// Should never get here, but tiebreak based on the larger license.
	return di.EndTokenIndex > dj.EndTokenIndex
}

// Match reports instances of the supplied content in the corpus.
func (c *Classifier) match(in io.Reader) (Results, error) {
	id, err := tokenizeStream(in, true, c.dict, false)
	if err != nil {
		return Results{}, err
	}

	// First pass: cheap token-similarity filter to narrow the corpus down to
	// plausible candidate licenses before doing expensive run detection.
	firstPass := make(map[string]*indexedDocument)
	for l, d := range c.docs {
		sim := id.tokenSimilarity(d)

		if c.tc.traceTokenize(l) {
			c.tc.trace("Token similarity for %s: %.2f", l, sim)
		}

		if sim >= c.threshold {
			firstPass[l] = d
		}
	}

	if len(firstPass) == 0 {
		return Results{
			Matches:         nil,
			TotalInputLines: 0,
		}, nil
	}

	// Perform the expensive work of generating a searchset to look for token runs.
	id.generateSearchSet(c.q)

	var candidates Matches
	candidates = append(candidates, id.Matches...)

	// Second pass: score each potential token-run match for the surviving
	// candidate licenses, keeping only spans above threshold.
	for l, d := range firstPass {
		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
		for _, m := range matches {
			startIndex := m.TargetStart
			endIndex := m.TargetEnd
			conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
			// The offsets trim the raw span; reject spans the trim collapses
			// to nothing.
			if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
				candidates = append(candidates, &Match{
					Name:            LicenseName(l),
					Variant:         variantName(l),
					MatchType:       detectionType(l),
					Confidence:      conf,
					StartLine:       id.Tokens[startIndex+startOffset].Line,
					EndLine:         id.Tokens[endIndex-endOffset-1].Line,
					StartTokenIndex: startIndex + startOffset,
					EndTokenIndex:   endIndex - endOffset - 1,
				})
			}

		}
	}
	sort.Sort(candidates)
	retain := make([]bool, len(candidates))
	// NOTE: c below shadows the receiver; the receiver is not used in the
	// remainder of this function.
	for i, c := range candidates {
		// Filter out overlapping licenses based primarily on confidence. Since
		// the candidates slice is ordered by confidence, we look for overlaps and
		// decide if we retain the record c.

		// For each candidate, only add it to the report unless we have a
		// higher-quality hit that contains these lines. In the case of two
		// licenses having overlap, we consider 'token density' to break ties. If a
		// less confident match of a larger license has more matching tokens than a
		// perfect match of a smaller license, we want to keep that. This handles
		// licenses that include another license as a subtext. NPL contains MPL
		// as a concrete example.

		keep := true
		// proposals records earlier candidates this one would demote (the
		// stored value is always false); applied only if c itself is kept.
		proposals := make(map[int]bool)
		for j, o := range candidates {
			if j == i {
				break
			}
			// Make sure to only check containment on licenses that are still in consideration at this point.
			if contains(c, o) && retain[j] {
				// The license here can override a previous detection, but that isn't sufficient to be kept
				// on its own. Consider the licenses Xnet, MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1.
				// The confidence rating on NPL-1.1 will cause Xnet to not be retained, which is correct, but it
				// shouldn't be retained if the token confidence for MPL is higher than NPL since the NPL-specific
				// bits are missing.

				ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
				otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
				cconf := ctoks * c.Confidence
				oconf := otoks * o.Confidence

				// If the two licenses are exactly the same confidence, that means we
				// have an ambiguous detect and should retain both, so the caller can
				// see and resolve the situation.
				if cconf > oconf {
					proposals[j] = false
				} else if oconf > cconf {
					keep = false
				}
			} else if overlaps(c, o) && retain[j] {
				// if the ending and start lines exactly overlap, it's OK to keep both
				if c.StartLine != o.EndLine {
					keep = false
				}
			}

			if !keep {
				break
			}
		}
		if keep {
			retain[i] = true
			for p, v := range proposals {
				retain[p] = v
			}
		}
	}

	var out Matches
	for i, keep := range retain {
		if keep {
			out = append(out, candidates[i])
		}
	}
	return Results{
		Matches:         out,
		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
	}, nil
}

// Classifier provides methods for identifying open source licenses in text
// content.
type Classifier struct {
	tc   *TraceConfiguration
	dict *dictionary
	// docs maps internal license identifiers (category/name/variant paths)
	// to their indexed content.
	docs      map[string]*indexedDocument
	threshold float64
	q         int // The value of q for q-grams in this corpus
}

// NewClassifier creates a classifier with an empty corpus.
func NewClassifier(threshold float64) *Classifier {
	classifier := &Classifier{
		tc:        new(TraceConfiguration),
		dict:      newDictionary(),
		docs:      make(map[string]*indexedDocument),
		threshold: threshold,
		q:         computeQ(threshold),
	}
	return classifier
}

// Normalize takes input content and applies the following transforms to aid in
// identifying license content. The return value of this function is
// line-separated text which is the basis for position values returned by the
// classifier.
//
// 1. Breaks up long lines of text. This helps with detecting licenses like in
// TODO(wcn):URL reference
//
// 2. Certain ignorable texts are removed to aid matching blocks of text.
// Introductory lines such as "The MIT License" are removed. Copyright notices
// are removed since the parties are variable and shouldn't impact matching.
//
// It is NOT necessary to call this function to simply identify licenses in a
// file. It should only be called to aid presenting this information to the user
// in context (for example, creating diffs of differences to canonical
// licenses).
//
// It is an invariant of the classifier that calling Match(Normalize(in)) will
// return the same results as Match(in).
func (c *Classifier) Normalize(in []byte) []byte {
	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
	if err != nil {
		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
	}

	var buf bytes.Buffer

	// Handle the degenerate inputs directly: no tokens yields nil, a single
	// token is emitted with no separators.
	switch len(doc.Tokens) {
	case 0:
		return nil
	case 1:
		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
		return buf.Bytes()
	}

	// Reconstruct the text token by token, inserting line breaks and spaces
	// based on each token's recorded line number.
	prevLine := 1
	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
	for _, t := range doc.Tokens[1:] {
		// Only write out an EOL token that incremented the line
		if t.Line == prevLine+1 {
			buf.WriteString(eol)
		}

		// Only write tokens that aren't EOL
		txt := c.dict.getWord(t.ID)

		if txt != eol {
			// Only put a space between tokens if the previous token was on the same
			// line. This prevents spaces after an EOL
			if t.Line == prevLine {
				buf.WriteString(" ")
			}
			buf.WriteString(txt)
		}

		prevLine = t.Line
	}
	return buf.Bytes()
}

// LoadLicenses adds the contents of the supplied directory to the corpus of the
// classifier.
278func (c *Classifier) LoadLicenses(dir string) error { 279 var files []string 280 err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 281 if err != nil { 282 return nil 283 } 284 if !strings.HasSuffix(path, "txt") { 285 return nil 286 } 287 files = append(files, path) 288 return nil 289 }) 290 if err != nil { 291 return err 292 } 293 294 for _, f := range files { 295 relativePath := strings.Replace(f, dir, "", 1) 296 sep := fmt.Sprintf("%c", os.PathSeparator) 297 segments := strings.Split(relativePath, sep) 298 if len(segments) < 3 { 299 c.tc.trace("Insufficient segment count for path: %s", relativePath) 300 continue 301 } 302 category, name, variant := segments[1], segments[2], segments[3] 303 b, err := ioutil.ReadFile(f) 304 if err != nil { 305 return err 306 } 307 308 c.AddContent(category, name, variant, []byte(string(b))) 309 } 310 return nil 311} 312 313// SetTraceConfiguration installs a tracing configuration for the classifier. 314func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) { 315 c.tc = in 316 c.tc.init() 317} 318 319// Match finds matches within an unknown text. This will not modify the contents 320// of the supplied byte slice. 321func (c *Classifier) Match(in []byte) Results { 322 // Since bytes.NewReader().Read() will never return an error, tokenizeStream 323 // will never return an error so it's okay to ignore the return value in this 324 // case. 325 res, _ := c.MatchFrom(bytes.NewReader(in)) 326 return res 327} 328 329// MatchFrom finds matches within the read content. 
330func (c *Classifier) MatchFrom(in io.Reader) (Results, error) { 331 return c.match(in) 332} 333 334func detectionType(in string) string { 335 splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) 336 return splits[0] 337} 338 339func variantName(in string) string { 340 splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) 341 return splits[2] 342} 343 344// LicenseName produces the output name for a license, removing the internal structure 345// of the filename in use. 346func LicenseName(in string) string { 347 splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) 348 return splits[1] 349} 350 351// contains returns true iff b is completely inside a 352func contains(a, b *Match) bool { 353 return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine 354} 355 356// returns true iff b <= a <= c 357func between(a, b, c int) bool { 358 return b <= a && a <= c 359} 360 361// returns true iff the ranges covered by a and b overlap. 362func overlaps(a, b *Match) bool { 363 return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine) 364} 365