1// Copyright 2017 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package licenseclassifier provides methods to identify the open source 16// license that most closely matches an unknown license. 17package licenseclassifier 18 19import ( 20 "archive/tar" 21 "bytes" 22 "compress/gzip" 23 "fmt" 24 "html" 25 "io" 26 "math" 27 "regexp" 28 "sort" 29 "strings" 30 "sync" 31 "unicode" 32 33 "github.com/google/licenseclassifier/stringclassifier" 34 "github.com/google/licenseclassifier/stringclassifier/searchset" 35) 36 37// DefaultConfidenceThreshold is the minimum confidence percentage we're willing to accept in order 38// to say that a match is good. 39const DefaultConfidenceThreshold = 0.80 40 41var ( 42 // Normalizers is a list of functions that get applied to the strings 43 // before they are registered with the string classifier. 44 Normalizers = []stringclassifier.NormalizeFunc{ 45 html.UnescapeString, 46 removeShebangLine, 47 RemoveNonWords, 48 NormalizeEquivalentWords, 49 NormalizePunctuation, 50 strings.ToLower, 51 removeIgnorableTexts, 52 stringclassifier.FlattenWhitespace, 53 strings.TrimSpace, 54 } 55 56 // commonLicenseWords are words that are common to all known licenses. 57 // If an unknown text doesn't have at least one of these, then we can 58 // ignore it. 59 commonLicenseWords = []*regexp.Regexp{ 60 regexp.MustCompile(`(?i)\bcode\b`), 61 regexp.MustCompile(`(?i)\blicense\b`), 62 regexp.MustCompile(`(?i)\boriginal\b`), 63 regexp.MustCompile(`(?i)\brights\b`), 64 regexp.MustCompile(`(?i)\bsoftware\b`), 65 regexp.MustCompile(`(?i)\bterms\b`), 66 regexp.MustCompile(`(?i)\bversion\b`), 67 regexp.MustCompile(`(?i)\bwork\b`), 68 } 69) 70 71// License is a classifier pre-loaded with known open source licenses. 72type License struct { 73 c *stringclassifier.Classifier 74 75 // Threshold is the lowest confidence percentage acceptable for the 76 // classifier. 77 Threshold float64 78 79 // archive is a function that must return the contents of the license archive. 80 // When archive is nil, ReadLicenseFile(LicenseFile) is used to retrieve the 81 // contents. 82 archive func() ([]byte, error) 83} 84 85// OptionFunc set options on a License struct. 86type OptionFunc func(l *License) error 87 88// Archive is an OptionFunc to specify the location of the license archive file. 89func Archive(f string) OptionFunc { 90 return func(l *License) error { 91 l.archive = func() ([]byte, error) { return ReadLicenseFile(f) } 92 return nil 93 } 94} 95 96// ArchiveBytes is an OptionFunc that provides the contents of the license archive file. 97// The caller must not overwrite the contents of b as it is not copied. 98func ArchiveBytes(b []byte) OptionFunc { 99 return func(l *License) error { 100 l.archive = func() ([]byte, error) { return b, nil } 101 return nil 102 } 103} 104 105// ArchiveFunc is an OptionFunc that provides a function that must return the contents 106// of the license archive file. 107func ArchiveFunc(f func() ([]byte, error)) OptionFunc { 108 return func(l *License) error { 109 l.archive = f 110 return nil 111 } 112} 113 114// New creates a license classifier and pre-loads it with known open source licenses. 115func New(threshold float64, options ...OptionFunc) (*License, error) { 116 classifier := &License{ 117 c: stringclassifier.New(threshold, Normalizers...), 118 Threshold: threshold, 119 } 120 121 for _, o := range options { 122 err := o(classifier) 123 if err != nil { 124 return nil, fmt.Errorf("error setting option %v: %v", o, err) 125 } 126 } 127 128 if err := classifier.registerLicenses(); err != nil { 129 return nil, fmt.Errorf("cannot register licenses from archive: %v", err) 130 } 131 return classifier, nil 132} 133 134// NewWithForbiddenLicenses creates a license classifier and pre-loads it with 135// known open source licenses which are forbidden. 136func NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) { 137 opts := []OptionFunc{Archive(ForbiddenLicenseArchive)} 138 opts = append(opts, options...) 139 return New(threshold, opts...) 140} 141 142// WithinConfidenceThreshold returns true if the confidence value is above or 143// equal to the confidence threshold. 144func (c *License) WithinConfidenceThreshold(conf float64) bool { 145 return conf > c.Threshold || math.Abs(conf-c.Threshold) < math.SmallestNonzeroFloat64 146} 147 148// NearestMatch returns the "nearest" match to the given set of known licenses. 149// Returned are the name of the license, and a confidence percentage indicating 150// how confident the classifier is in the result. 151func (c *License) NearestMatch(contents string) *stringclassifier.Match { 152 if !c.hasCommonLicenseWords(contents) { 153 return nil 154 } 155 m := c.c.NearestMatch(contents) 156 m.Name = strings.TrimSuffix(m.Name, ".header") 157 return m 158} 159 160// MultipleMatch matches all licenses within an unknown text. 161func (c *License) MultipleMatch(contents string, includeHeaders bool) stringclassifier.Matches { 162 norm := normalizeText(contents) 163 if !c.hasCommonLicenseWords(norm) { 164 return nil 165 } 166 167 m := make(map[stringclassifier.Match]bool) 168 var matches stringclassifier.Matches 169 for _, v := range c.c.MultipleMatch(norm) { 170 if !c.WithinConfidenceThreshold(v.Confidence) { 171 continue 172 } 173 174 if !includeHeaders && strings.HasSuffix(v.Name, ".header") { 175 continue 176 } 177 178 v.Name = strings.TrimSuffix(v.Name, ".header") 179 if re, ok := forbiddenRegexps[v.Name]; ok && !re.MatchString(norm) { 180 continue 181 } 182 if _, ok := m[*v]; !ok { 183 m[*v] = true 184 matches = append(matches, v) 185 } 186 } 187 sort.Sort(matches) 188 return matches 189} 190 191func normalizeText(s string) string { 192 for _, n := range Normalizers { 193 s = n(s) 194 } 195 return s 196} 197 198// hasCommonLicenseWords returns true if the unknown text has at least one word 199// that's common to all licenses. 200func (c *License) hasCommonLicenseWords(s string) bool { 201 for _, re := range commonLicenseWords { 202 if re.MatchString(s) { 203 return true 204 } 205 } 206 return false 207} 208 209type archivedValue struct { 210 name string 211 normalized string 212 set *searchset.SearchSet 213} 214 215// registerLicenses loads all known licenses and adds them to c as known values 216// for comparison. The allocated space after ingesting the 'licenses.db' 217// archive is ~167M. 218func (c *License) registerLicenses() error { 219 var contents []byte 220 var err error 221 if c.archive == nil { 222 contents, err = ReadLicenseFile(LicenseArchive) 223 } else { 224 contents, err = c.archive() 225 } 226 if err != nil { 227 return err 228 } 229 230 reader := bytes.NewReader(contents) 231 gr, err := gzip.NewReader(reader) 232 if err != nil { 233 return err 234 } 235 defer gr.Close() 236 237 tr := tar.NewReader(gr) 238 239 var muVals sync.Mutex 240 var vals []archivedValue 241 for i := 0; ; i++ { 242 hdr, err := tr.Next() 243 if err == io.EOF { 244 break 245 } 246 if err != nil { 247 return err 248 } 249 250 name := strings.TrimSuffix(hdr.Name, ".txt") 251 252 // Read normalized value. 253 var b bytes.Buffer 254 if _, err := io.Copy(&b, tr); err != nil { 255 return err 256 } 257 normalized := b.String() 258 b.Reset() 259 260 // Read precomputed hashes. 261 hdr, err = tr.Next() 262 if err != nil { 263 return err 264 } 265 266 if _, err := io.Copy(&b, tr); err != nil { 267 return err 268 } 269 270 var set searchset.SearchSet 271 searchset.Deserialize(&b, &set) 272 273 muVals.Lock() 274 vals = append(vals, archivedValue{name, normalized, &set}) 275 muVals.Unlock() 276 } 277 278 for _, v := range vals { 279 if err = c.c.AddPrecomputedValue(v.name, v.normalized, v.set); err != nil { 280 return err 281 } 282 } 283 return nil 284} 285 286// endOfLicenseText is text commonly associated with the end of a license. We 287// can remove text that occurs after it. 288var endOfLicenseText = []string{ 289 "END OF TERMS AND CONDITIONS", 290} 291 292// TrimExtraneousTrailingText removes text after an obvious end of the license 293// and does not include substantive text of the license. 294func TrimExtraneousTrailingText(s string) string { 295 for _, e := range endOfLicenseText { 296 if i := strings.LastIndex(s, e); i != -1 { 297 return s[:i+len(e)] 298 } 299 } 300 return s 301} 302 303var copyrightRE = regexp.MustCompile(`(?m)(?i:Copyright)\s+(?i:©\s+|\(c\)\s+)?(?:\d{2,4})(?:[-,]\s*\d{2,4})*,?\s*(?i:by)?\s*(.*?(?i:\s+Inc\.)?)[.,]?\s*(?i:All rights reserved\.?)?\s*$`) 304 305// CopyrightHolder finds a copyright notification, if it exists, and returns 306// the copyright holder. 307func CopyrightHolder(contents string) string { 308 matches := copyrightRE.FindStringSubmatch(contents) 309 if len(matches) == 2 { 310 return matches[1] 311 } 312 return "" 313} 314 315var publicDomainRE = regexp.MustCompile("(?i)(this file )?is( in the)? public domain") 316 317// HasPublicDomainNotice performs a simple regex over the contents to see if a 318// public domain notice is in there. As you can imagine, this isn't 100% 319// definitive, but can be useful if a license match isn't found. 320func (c *License) HasPublicDomainNotice(contents string) bool { 321 return publicDomainRE.FindString(contents) != "" 322} 323 324// ignorableTexts is a list of lines at the start of the string we can remove 325// to get a cleaner match. 326var ignorableTexts = []*regexp.Regexp{ 327 regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`), 328 regexp.MustCompile(`(?i)^(?:new )?bsd license$`), 329 regexp.MustCompile(`(?i)^copyright and permission notice$`), 330 regexp.MustCompile(`(?i)^copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]? .*$`), 331 regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`), 332 regexp.MustCompile(`(?i)^@license$`), 333 regexp.MustCompile(`^\s*$`), 334} 335 336// removeIgnorableTexts removes common text, which is not important for 337// classification, that shows up before the body of the license. 338func removeIgnorableTexts(s string) string { 339 lines := strings.Split(strings.TrimRight(s, "\n"), "\n") 340 var start int 341 for ; start < len(lines); start++ { 342 line := strings.TrimSpace(lines[start]) 343 var matches bool 344 for _, re := range ignorableTexts { 345 if re.MatchString(line) { 346 matches = true 347 break 348 } 349 } 350 if !matches { 351 break 352 } 353 } 354 end := len(lines) 355 if start > end { 356 return "\n" 357 } 358 return strings.Join(lines[start:end], "\n") + "\n" 359} 360 361// removeShebangLine removes the '#!...' line if it's the first line in the 362// file. Note that if it's the only line in a comment, it won't be removed. 363func removeShebangLine(s string) string { 364 lines := strings.Split(s, "\n") 365 if len(lines) <= 1 || !strings.HasPrefix(lines[0], "#!") { 366 return s 367 } 368 369 return strings.Join(lines[1:], "\n") 370} 371 372// isDecorative returns true if the line is made up purely of non-letter and 373// non-digit characters. 374func isDecorative(s string) bool { 375 for _, c := range s { 376 if unicode.IsLetter(c) || unicode.IsDigit(c) { 377 return false 378 } 379 } 380 return true 381} 382 383var nonWords = regexp.MustCompile("[[:punct:]]+") 384 385// RemoveNonWords removes non-words from the string. 386func RemoveNonWords(s string) string { 387 return nonWords.ReplaceAllString(s, " ") 388} 389 390// interchangeablePunctutation is punctuation that can be normalized. 391var interchangeablePunctuation = []struct { 392 interchangeable *regexp.Regexp 393 substitute string 394}{ 395 // Hyphen, Dash, En Dash, and Em Dash. 396 {regexp.MustCompile(`[-‒–—]`), "-"}, 397 // Single, Double, Curly Single, and Curly Double. 398 {regexp.MustCompile("['\"`‘’“”]"), "'"}, 399 // Copyright. 400 {regexp.MustCompile("©"), "(c)"}, 401 // Hyphen-separated words. 402 {regexp.MustCompile(`(\S)-\s+(\S)`), "${1}-${2}"}, 403 // Currency and Section. (Different copies of the CDDL use each marker.) 404 {regexp.MustCompile("[§¤]"), "(s)"}, 405 // Middle Dot 406 {regexp.MustCompile("·"), "*"}, 407} 408 409// NormalizePunctuation takes all hyphens and quotes and normalizes them. 410func NormalizePunctuation(s string) string { 411 for _, iw := range interchangeablePunctuation { 412 s = iw.interchangeable.ReplaceAllString(s, iw.substitute) 413 } 414 return s 415} 416 417// interchangeableWords are words we can substitute for a normalized form 418// without changing the meaning of the license. See 419// https://spdx.org/spdx-license-list/matching-guidelines for the list. 420var interchangeableWords = []struct { 421 interchangeable *regexp.Regexp 422 substitute string 423}{ 424 {regexp.MustCompile("(?i)Acknowledgment"), "Acknowledgement"}, 425 {regexp.MustCompile("(?i)Analogue"), "Analog"}, 426 {regexp.MustCompile("(?i)Analyse"), "Analyze"}, 427 {regexp.MustCompile("(?i)Artefact"), "Artifact"}, 428 {regexp.MustCompile("(?i)Authorisation"), "Authorization"}, 429 {regexp.MustCompile("(?i)Authorised"), "Authorized"}, 430 {regexp.MustCompile("(?i)Calibre"), "Caliber"}, 431 {regexp.MustCompile("(?i)Cancelled"), "Canceled"}, 432 {regexp.MustCompile("(?i)Capitalisations"), "Capitalizations"}, 433 {regexp.MustCompile("(?i)Catalogue"), "Catalog"}, 434 {regexp.MustCompile("(?i)Categorise"), "Categorize"}, 435 {regexp.MustCompile("(?i)Centre"), "Center"}, 436 {regexp.MustCompile("(?i)Emphasised"), "Emphasized"}, 437 {regexp.MustCompile("(?i)Favour"), "Favor"}, 438 {regexp.MustCompile("(?i)Favourite"), "Favorite"}, 439 {regexp.MustCompile("(?i)Fulfil"), "Fulfill"}, 440 {regexp.MustCompile("(?i)Fulfilment"), "Fulfillment"}, 441 {regexp.MustCompile("(?i)Initialise"), "Initialize"}, 442 {regexp.MustCompile("(?i)Judgment"), "Judgement"}, 443 {regexp.MustCompile("(?i)Labelling"), "Labeling"}, 444 {regexp.MustCompile("(?i)Labour"), "Labor"}, 445 {regexp.MustCompile("(?i)Licence"), "License"}, 446 {regexp.MustCompile("(?i)Maximise"), "Maximize"}, 447 {regexp.MustCompile("(?i)Modelled"), "Modeled"}, 448 {regexp.MustCompile("(?i)Modelling"), "Modeling"}, 449 {regexp.MustCompile("(?i)Offence"), "Offense"}, 450 {regexp.MustCompile("(?i)Optimise"), "Optimize"}, 451 {regexp.MustCompile("(?i)Organisation"), "Organization"}, 452 {regexp.MustCompile("(?i)Organise"), "Organize"}, 453 {regexp.MustCompile("(?i)Practise"), "Practice"}, 454 {regexp.MustCompile("(?i)Programme"), "Program"}, 455 {regexp.MustCompile("(?i)Realise"), "Realize"}, 456 {regexp.MustCompile("(?i)Recognise"), "Recognize"}, 457 {regexp.MustCompile("(?i)Signalling"), "Signaling"}, 458 {regexp.MustCompile("(?i)Sub[- ]license"), "Sublicense"}, 459 {regexp.MustCompile("(?i)Utilisation"), "Utilization"}, 460 {regexp.MustCompile("(?i)Whilst"), "While"}, 461 {regexp.MustCompile("(?i)Wilful"), "Wilfull"}, 462 {regexp.MustCompile("(?i)Non-commercial"), "Noncommercial"}, 463 {regexp.MustCompile("(?i)Per cent"), "Percent"}, 464} 465 466// NormalizeEquivalentWords normalizes equivalent words that are interchangeable. 467func NormalizeEquivalentWords(s string) string { 468 for _, iw := range interchangeableWords { 469 s = iw.interchangeable.ReplaceAllString(s, iw.substitute) 470 } 471 return s 472} 473