// xref: /aosp_15_r20/external/licenseclassifier/classifier.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package licenseclassifier provides methods to identify the open source
// license that most closely matches an unknown license.
package licenseclassifier

import (
	"archive/tar"
	"bytes"
	"compress/gzip"
	"fmt"
	"html"
	"io"
	"math"
	"regexp"
	"sort"
	"strings"
	"sync"
	"unicode"

	"github.com/google/licenseclassifier/stringclassifier"
	"github.com/google/licenseclassifier/stringclassifier/searchset"
)

// DefaultConfidenceThreshold is the minimum confidence we're willing to
// accept in order to say that a match is good. It is expressed as a fraction
// (0.80 means 80%).
const DefaultConfidenceThreshold = 0.80

41var (
42	// Normalizers is a list of functions that get applied to the strings
43	// before they are registered with the string classifier.
44	Normalizers = []stringclassifier.NormalizeFunc{
45		html.UnescapeString,
46		removeShebangLine,
47		RemoveNonWords,
48		NormalizeEquivalentWords,
49		NormalizePunctuation,
50		strings.ToLower,
51		removeIgnorableTexts,
52		stringclassifier.FlattenWhitespace,
53		strings.TrimSpace,
54	}
55
56	// commonLicenseWords are words that are common to all known licenses.
57	// If an unknown text doesn't have at least one of these, then we can
58	// ignore it.
59	commonLicenseWords = []*regexp.Regexp{
60		regexp.MustCompile(`(?i)\bcode\b`),
61		regexp.MustCompile(`(?i)\blicense\b`),
62		regexp.MustCompile(`(?i)\boriginal\b`),
63		regexp.MustCompile(`(?i)\brights\b`),
64		regexp.MustCompile(`(?i)\bsoftware\b`),
65		regexp.MustCompile(`(?i)\bterms\b`),
66		regexp.MustCompile(`(?i)\bversion\b`),
67		regexp.MustCompile(`(?i)\bwork\b`),
68	}
69)
70
71// License is a classifier pre-loaded with known open source licenses.
72type License struct {
73	c *stringclassifier.Classifier
74
75	// Threshold is the lowest confidence percentage acceptable for the
76	// classifier.
77	Threshold float64
78
79	// archive is a function that must return the contents of the license archive.
80	// When archive is nil, ReadLicenseFile(LicenseFile) is used to retrieve the
81	// contents.
82	archive func() ([]byte, error)
83}
84
85// OptionFunc set options on a License struct.
86type OptionFunc func(l *License) error
87
88// Archive is an OptionFunc to specify the location of the license archive file.
89func Archive(f string) OptionFunc {
90	return func(l *License) error {
91		l.archive = func() ([]byte, error) { return ReadLicenseFile(f) }
92		return nil
93	}
94}
95
96// ArchiveBytes is an OptionFunc that provides the contents of the license archive file.
97// The caller must not overwrite the contents of b as it is not copied.
98func ArchiveBytes(b []byte) OptionFunc {
99	return func(l *License) error {
100		l.archive = func() ([]byte, error) { return b, nil }
101		return nil
102	}
103}
104
105// ArchiveFunc is an OptionFunc that provides a function that must return the contents
106// of the license archive file.
107func ArchiveFunc(f func() ([]byte, error)) OptionFunc {
108	return func(l *License) error {
109		l.archive = f
110		return nil
111	}
112}
113
114// New creates a license classifier and pre-loads it with known open source licenses.
115func New(threshold float64, options ...OptionFunc) (*License, error) {
116	classifier := &License{
117		c:         stringclassifier.New(threshold, Normalizers...),
118		Threshold: threshold,
119	}
120
121	for _, o := range options {
122		err := o(classifier)
123		if err != nil {
124			return nil, fmt.Errorf("error setting option %v: %v", o, err)
125		}
126	}
127
128	if err := classifier.registerLicenses(); err != nil {
129		return nil, fmt.Errorf("cannot register licenses from archive: %v", err)
130	}
131	return classifier, nil
132}
133
134// NewWithForbiddenLicenses creates a license classifier and pre-loads it with
135// known open source licenses which are forbidden.
136func NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) {
137	opts := []OptionFunc{Archive(ForbiddenLicenseArchive)}
138	opts = append(opts, options...)
139	return New(threshold, opts...)
140}
141
142// WithinConfidenceThreshold returns true if the confidence value is above or
143// equal to the confidence threshold.
144func (c *License) WithinConfidenceThreshold(conf float64) bool {
145	return conf > c.Threshold || math.Abs(conf-c.Threshold) < math.SmallestNonzeroFloat64
146}
147
148// NearestMatch returns the "nearest" match to the given set of known licenses.
149// Returned are the name of the license, and a confidence percentage indicating
150// how confident the classifier is in the result.
151func (c *License) NearestMatch(contents string) *stringclassifier.Match {
152	if !c.hasCommonLicenseWords(contents) {
153		return nil
154	}
155	m := c.c.NearestMatch(contents)
156	m.Name = strings.TrimSuffix(m.Name, ".header")
157	return m
158}
159
160// MultipleMatch matches all licenses within an unknown text.
161func (c *License) MultipleMatch(contents string, includeHeaders bool) stringclassifier.Matches {
162	norm := normalizeText(contents)
163	if !c.hasCommonLicenseWords(norm) {
164		return nil
165	}
166
167	m := make(map[stringclassifier.Match]bool)
168	var matches stringclassifier.Matches
169	for _, v := range c.c.MultipleMatch(norm) {
170		if !c.WithinConfidenceThreshold(v.Confidence) {
171			continue
172		}
173
174		if !includeHeaders && strings.HasSuffix(v.Name, ".header") {
175			continue
176		}
177
178		v.Name = strings.TrimSuffix(v.Name, ".header")
179		if re, ok := forbiddenRegexps[v.Name]; ok && !re.MatchString(norm) {
180			continue
181		}
182		if _, ok := m[*v]; !ok {
183			m[*v] = true
184			matches = append(matches, v)
185		}
186	}
187	sort.Sort(matches)
188	return matches
189}
190
191func normalizeText(s string) string {
192	for _, n := range Normalizers {
193		s = n(s)
194	}
195	return s
196}
197
198// hasCommonLicenseWords returns true if the unknown text has at least one word
199// that's common to all licenses.
200func (c *License) hasCommonLicenseWords(s string) bool {
201	for _, re := range commonLicenseWords {
202		if re.MatchString(s) {
203			return true
204		}
205	}
206	return false
207}
208
209type archivedValue struct {
210	name       string
211	normalized string
212	set        *searchset.SearchSet
213}
214
215// registerLicenses loads all known licenses and adds them to c as known values
216// for comparison. The allocated space after ingesting the 'licenses.db'
217// archive is ~167M.
218func (c *License) registerLicenses() error {
219	var contents []byte
220	var err error
221	if c.archive == nil {
222		contents, err = ReadLicenseFile(LicenseArchive)
223	} else {
224		contents, err = c.archive()
225	}
226	if err != nil {
227		return err
228	}
229
230	reader := bytes.NewReader(contents)
231	gr, err := gzip.NewReader(reader)
232	if err != nil {
233		return err
234	}
235	defer gr.Close()
236
237	tr := tar.NewReader(gr)
238
239	var muVals sync.Mutex
240	var vals []archivedValue
241	for i := 0; ; i++ {
242		hdr, err := tr.Next()
243		if err == io.EOF {
244			break
245		}
246		if err != nil {
247			return err
248		}
249
250		name := strings.TrimSuffix(hdr.Name, ".txt")
251
252		// Read normalized value.
253		var b bytes.Buffer
254		if _, err := io.Copy(&b, tr); err != nil {
255			return err
256		}
257		normalized := b.String()
258		b.Reset()
259
260		// Read precomputed hashes.
261		hdr, err = tr.Next()
262		if err != nil {
263			return err
264		}
265
266		if _, err := io.Copy(&b, tr); err != nil {
267			return err
268		}
269
270		var set searchset.SearchSet
271		searchset.Deserialize(&b, &set)
272
273		muVals.Lock()
274		vals = append(vals, archivedValue{name, normalized, &set})
275		muVals.Unlock()
276	}
277
278	for _, v := range vals {
279		if err = c.c.AddPrecomputedValue(v.name, v.normalized, v.set); err != nil {
280			return err
281		}
282	}
283	return nil
284}
285
// endOfLicenseText is text commonly associated with the end of a license. We
// can remove text that occurs after it.
var endOfLicenseText = []string{
	"END OF TERMS AND CONDITIONS",
}

// TrimExtraneousTrailingText removes text that follows an obvious end of the
// license and is not part of the license's substantive text.
//
// The markers in endOfLicenseText are tried in order. For the first marker
// found in s, everything after its last occurrence is dropped; the marker
// itself is kept. If no marker is found, s is returned unchanged.
func TrimExtraneousTrailingText(s string) string {
	for _, marker := range endOfLicenseText {
		if i := strings.LastIndex(s, marker); i != -1 {
			return s[:i+len(marker)]
		}
	}
	return s
}

// copyrightRE matches a single-line copyright notice: the word "Copyright",
// an optional "©" or "(c)", one or more years (separated by "-" or ","), an
// optional "by", and then the holder, which is the only capture group. An
// optional trailing "All rights reserved" is matched but not captured.
var copyrightRE = regexp.MustCompile(`(?m)(?i:Copyright)\s+(?i:©\s+|\(c\)\s+)?(?:\d{2,4})(?:[-,]\s*\d{2,4})*,?\s*(?i:by)?\s*(.*?(?i:\s+Inc\.)?)[.,]?\s*(?i:All rights reserved\.?)?\s*$`)

// CopyrightHolder finds a copyright notification, if it exists, and returns
// the copyright holder. Only the first notice in contents is considered. It
// returns the empty string when contents has no copyright notice.
func CopyrightHolder(contents string) string {
	// A successful match yields the whole match plus the one capture group.
	matches := copyrightRE.FindStringSubmatch(contents)
	if len(matches) == 2 {
		return matches[1]
	}
	return ""
}

315var publicDomainRE = regexp.MustCompile("(?i)(this file )?is( in the)? public domain")
316
317// HasPublicDomainNotice performs a simple regex over the contents to see if a
318// public domain notice is in there. As you can imagine, this isn't 100%
319// definitive, but can be useful if a license match isn't found.
320func (c *License) HasPublicDomainNotice(contents string) bool {
321	return publicDomainRE.FindString(contents) != ""
322}
323
// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match. Each pattern is matched against one whole line
// after surrounding whitespace has been trimmed.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
	regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
	regexp.MustCompile(`(?i)^copyright and permission notice$`),
	regexp.MustCompile(`(?i)^copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]? .*$`),
	regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
	regexp.MustCompile(`(?i)^@license$`),
	regexp.MustCompile(`^\s*$`),
}

// isIgnorableLine reports whether line, with surrounding whitespace trimmed,
// matches any of the ignorableTexts patterns.
func isIgnorableLine(line string) bool {
	line = strings.TrimSpace(line)
	for _, re := range ignorableTexts {
		if re.MatchString(line) {
			return true
		}
	}
	return false
}

// removeIgnorableTexts removes common text, which is not important for
// classification, that shows up before the body of the license.
//
// Only the leading run of ignorable lines is dropped; the same text appearing
// later is kept. Trailing newlines are collapsed, and the result always ends
// in exactly one "\n", so an input made up solely of ignorable lines yields
// "\n".
func removeIgnorableTexts(s string) string {
	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
	start := 0
	for start < len(lines) && isIgnorableLine(lines[start]) {
		start++
	}
	return strings.Join(lines[start:], "\n") + "\n"
}

// removeShebangLine removes the '#!...' line if it's the first line in the
// file. If the shebang line is the only line in s (there is no newline), s is
// returned unchanged.
func removeShebangLine(s string) string {
	if !strings.HasPrefix(s, "#!") {
		return s
	}
	nl := strings.IndexByte(s, '\n')
	if nl < 0 {
		// Single-line input: nothing follows the shebang, so keep it.
		return s
	}
	return s[nl+1:]
}

// isDecorative returns true if the line is made up purely of non-letter and
// non-digit characters. The empty string is therefore decorative.
func isDecorative(s string) bool {
	for _, r := range s {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return false
		}
	}
	return true
}

// nonWords matches a run of one or more ASCII punctuation characters.
var nonWords = regexp.MustCompile("[[:punct:]]+")

// RemoveNonWords removes non-words from the string: every run of ASCII
// punctuation is replaced by a single space. Existing whitespace is left
// alone, so the result may contain consecutive spaces.
func RemoveNonWords(s string) string {
	return nonWords.ReplaceAllString(s, " ")
}

// interchangeablePunctuation is punctuation that can be normalized. The
// rules are applied in the order listed, so a later rule sees the output of
// the earlier ones.
var interchangeablePunctuation = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	// Hyphen, Dash, En Dash, and Em Dash.
	{regexp.MustCompile(`[-‒–—]`), "-"},
	// Single, Double, Curly Single, and Curly Double.
	{regexp.MustCompile("['\"`‘’“”]"), "'"},
	// Copyright.
	{regexp.MustCompile("©"), "(c)"},
	// Hyphen-separated words.
	{regexp.MustCompile(`(\S)-\s+(\S)`), "${1}-${2}"},
	// Currency and Section. (Different copies of the CDDL use each marker.)
	{regexp.MustCompile("[§¤]"), "(s)"},
	// Middle Dot
	{regexp.MustCompile("·"), "*"},
}

// NormalizePunctuation takes all hyphens and quotes and normalizes them. It
// also rewrites the copyright, section, currency and middle-dot symbols to
// ASCII stand-ins, and rejoins a word that was split after a hyphen.
func NormalizePunctuation(s string) string {
	for _, rule := range interchangeablePunctuation {
		s = rule.interchangeable.ReplaceAllString(s, rule.substitute)
	}
	return s
}

// interchangeableWords are words we can substitute for a normalized form
// without changing the meaning of the license. See
// https://spdx.org/spdx-license-list/matching-guidelines for the list.
//
// The patterns are case-insensitive, are not anchored on word boundaries, and
// are applied one after another in the order listed.
//
// NOTE(review): because the rules are unanchored and sequential, a text that
// already uses the target spelling can be rewritten again: "fulfill" contains
// "fulfil" and becomes "Fulfilll". The license archive stores text that has
// already been normalized, presumably with these same rules, so confirm how
// the archive is generated before changing any rule.
var interchangeableWords = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	{regexp.MustCompile("(?i)Acknowledgment"), "Acknowledgement"},
	{regexp.MustCompile("(?i)Analogue"), "Analog"},
	{regexp.MustCompile("(?i)Analyse"), "Analyze"},
	{regexp.MustCompile("(?i)Artefact"), "Artifact"},
	{regexp.MustCompile("(?i)Authorisation"), "Authorization"},
	{regexp.MustCompile("(?i)Authorised"), "Authorized"},
	{regexp.MustCompile("(?i)Calibre"), "Caliber"},
	{regexp.MustCompile("(?i)Cancelled"), "Canceled"},
	{regexp.MustCompile("(?i)Capitalisations"), "Capitalizations"},
	{regexp.MustCompile("(?i)Catalogue"), "Catalog"},
	{regexp.MustCompile("(?i)Categorise"), "Categorize"},
	{regexp.MustCompile("(?i)Centre"), "Center"},
	{regexp.MustCompile("(?i)Emphasised"), "Emphasized"},
	{regexp.MustCompile("(?i)Favour"), "Favor"},
	{regexp.MustCompile("(?i)Favourite"), "Favorite"},
	{regexp.MustCompile("(?i)Fulfil"), "Fulfill"},
	{regexp.MustCompile("(?i)Fulfilment"), "Fulfillment"},
	{regexp.MustCompile("(?i)Initialise"), "Initialize"},
	{regexp.MustCompile("(?i)Judgment"), "Judgement"},
	{regexp.MustCompile("(?i)Labelling"), "Labeling"},
	{regexp.MustCompile("(?i)Labour"), "Labor"},
	{regexp.MustCompile("(?i)Licence"), "License"},
	{regexp.MustCompile("(?i)Maximise"), "Maximize"},
	{regexp.MustCompile("(?i)Modelled"), "Modeled"},
	{regexp.MustCompile("(?i)Modelling"), "Modeling"},
	{regexp.MustCompile("(?i)Offence"), "Offense"},
	{regexp.MustCompile("(?i)Optimise"), "Optimize"},
	{regexp.MustCompile("(?i)Organisation"), "Organization"},
	{regexp.MustCompile("(?i)Organise"), "Organize"},
	{regexp.MustCompile("(?i)Practise"), "Practice"},
	{regexp.MustCompile("(?i)Programme"), "Program"},
	{regexp.MustCompile("(?i)Realise"), "Realize"},
	{regexp.MustCompile("(?i)Recognise"), "Recognize"},
	{regexp.MustCompile("(?i)Signalling"), "Signaling"},
	{regexp.MustCompile("(?i)Sub[- ]license"), "Sublicense"},
	{regexp.MustCompile("(?i)Utilisation"), "Utilization"},
	{regexp.MustCompile("(?i)Whilst"), "While"},
	{regexp.MustCompile("(?i)Wilful"), "Wilfull"},
	{regexp.MustCompile("(?i)Non-commercial"), "Noncommercial"},
	{regexp.MustCompile("(?i)Per cent"), "Percent"},
}

// NormalizeEquivalentWords normalizes equivalent words that are
// interchangeable. Each substitute is inserted with the capitalization given
// in the table, whatever the case of the text it replaces.
func NormalizeEquivalentWords(s string) string {
	for _, rule := range interchangeableWords {
		s = rule.interchangeable.ReplaceAllString(s, rule.substitute)
	}
	return s
}

473