xref: /aosp_15_r20/external/licenseclassifier/v2/scoring.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2020 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package classifier
16
17import (
18	"strings"
19	"unicode"
20
21	"github.com/davecgh/go-spew/spew"
22	"github.com/sergi/go-diff/diffmatchpatch"
23)
24
25// return values for the distance function that explain why a diff
26// is not an acceptable match for the source document.
27const (
28	versionChange          = -1
29	introducedPhraseChange = -2
30	lesserGPLChange        = -3
31)
32
33// score computes a metric of similarity between the known and unknown
34// document, including the offsets into the unknown that yield the content
35// generating the computed similarity.
36func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
37	if c.tc.traceScoring(known.s.origin) {
38		c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
39	}
40
41	knownLength := known.size()
42	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
43
44	start, end := diffRange(known.Norm, diffs)
45	distance := scoreDiffs(id, diffs[start:end])
46
47	if c.tc.traceScoring(known.s.origin) {
48		c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
49	}
50
51	if distance < 0 {
52		// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
53		if c.tc.traceScoring(known.s.origin) {
54			c.tc.trace("Distance result %v, rejected match", distance)
55		}
56		return 0.0, 0, 0
57	}
58
59	// Applying the diffRange-generated offsets provides the run of text from the
60	// target most closely correlated to the source.  This is the process for
61	// compensating for errors from the searchset. With the reduced text, we
62	// compute the final confidence score and exact text locations for the
63	// matched content.
64	// The diff slice consists of three regions: an optional deletion diff for
65	// target text before the source, n elements that represent the diff between
66	// the source and target, and an optional deletion diff for text after the
67	// target.  The helper functions identify the portions of the slice
68	// corresponding to those regions.  This results in a more accurate
69	// confidence score and better position detection of the source in the
70	// target.
71	conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])
72
73	if c.tc.traceScoring(known.s.origin) {
74		c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
75	}
76	return conf, so, eo
77}
78
79// confidencePercentage computes a confidence match score for the lengths,
80// handling the cases where source and target lengths differ.
81func confidencePercentage(klen, distance int) float64 {
82	// No text is matched at 100% confidence (avoid divide by zero).
83	if klen == 0 {
84		return 1.0
85	}
86
87	// Return a computed fractional match against the known text.
88	return 1.0 - float64(distance)/float64(klen)
89}
90
91// diffLevenshteinWord computes word-based Levenshtein count.
92func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
93	levenshtein := 0
94	insertions := 0
95	deletions := 0
96
97	for _, aDiff := range diffs {
98		switch aDiff.Type {
99		case diffmatchpatch.DiffInsert:
100			insertions += wordLen(aDiff.Text)
101		case diffmatchpatch.DiffDelete:
102			deletions += wordLen(aDiff.Text)
103		case diffmatchpatch.DiffEqual:
104			// A deletion and an insertion is one substitution.
105			levenshtein += max(insertions, deletions)
106			insertions = 0
107			deletions = 0
108		}
109	}
110
111	levenshtein += max(insertions, deletions)
112	return levenshtein
113}
114
115func isVersionNumber(in string) bool {
116	for _, r := range in {
117		if !unicode.IsDigit(r) && r != '.' {
118			return false
119		}
120	}
121	return true
122}
123
124// scoreDiffs returns a score rating the acceptability of these diffs.  A
125// negative value means that the changes represented by the diff are not an
126// acceptable transformation since it would change the underlying license.  A
127// positive value indicates the Levenshtein word distance.
128func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
129	// We make a pass looking for unacceptable substitutions
130	// Delete diffs are always ordered before insert diffs. This is leveraged to
131	// analyze a change by checking an insert against the delete text that was
132	// previously cached.
133	prevText := ""
134	prevDelete := ""
135	for i, diff := range diffs {
136		text := diff.Text
137		switch diff.Type {
138		case diffmatchpatch.DiffInsert:
139			num := text
140			if i := strings.Index(num, " "); i != -1 {
141				num = num[0:i]
142			}
143			if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
144				if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
145					return versionChange
146				}
147			}
148			// There are certain phrases that can't be introduced to make a license
149			// hit.  TODO: would like to generate this programmatically. Most of
150			// these are words or phrases that appear in a single/small number of
151			// licenses. Can we leverage frequency analysis to identify these
152			// interesting words/phrases and auto-extract them?
153
154			inducedPhrases := map[string][]string{
155				"AGPL":                             {"affero"},
156				"Atmel":                            {"atmel"},
157				"Apache":                           {"apache"},
158				"BSD":                              {"bsd"},
159				"BSD-3-Clause-Attribution":         {"acknowledgment"},
160				"bzip2":                            {"seward"},
161				"GPL-2.0-with-GCC-exception":       {"gcc linking exception"},
162				"GPL-2.0-with-autoconf-exception":  {"autoconf exception"},
163				"GPL-2.0-with-bison-exception":     {"bison exception"},
164				"GPL-2.0-with-classpath-exception": {"class path exception"},
165				"GPL-2.0-with-font-exception":      {"font exception"},
166				"LGPL-2.0":                         {"library"},
167				"ImageMagick":                      {"imagemagick"},
168				"PHP":                              {"php"},
169				"SISSL":                            {"sun standards"},
170				"SGI-B":                            {"silicon graphics"},
171				"SunPro":                           {"sunpro"},
172				"X11":                              {"x consortium"},
173			}
174
175			for k, ps := range inducedPhrases {
176				if strings.HasPrefix(LicenseName(id), k) {
177					for _, p := range ps {
178						if strings.Index(text, p) != -1 {
179							// Check to make sure there isn't a corresponding diff for this
180							// insert that also contains the text. This prevents against diff
181							// blocks that are too big and force a false hit on this check,
182							// which usually happens with URLs since they are stored in one
183							// token but can happen in other cases as well. We don't look just
184							// for delete diffs because the subsequent text may reference the
185							// content in case a URL was truncated.
186							if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
187								continue
188							}
189							return introducedPhraseChange
190						}
191					}
192				}
193			}
194
195			// Ignore changes between "library" and "lesser" in a GNU context as they
196			// changed the terms, but look for introductions of Lesser that would
197			// otherwise disqualify a match.
198			if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
199				// The LGPL 3.0 doesn't have a standard header, so people tend to craft
200				// their own. As a result, sometimes the warranty clause refers to the
201				// GPL instead of the LGPL. This is fine from a licensing perspective,
202				// but we need to tweak matching to ignore that particular case. In
203				// other circumstances, inserting or removing the word Lesser in the
204				// GPL context is not an acceptable change. There is also a reference to
205				// it when suggesting to use the LGPL.
206				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
207					return lesserGPLChange
208				}
209			}
210		case diffmatchpatch.DiffEqual:
211			prevText = text
212			prevDelete = ""
213
214		case diffmatchpatch.DiffDelete:
215			// Avoid substitution in most cases. The two exceptions are with usage
216			// statements that are talking about *another* license, and don't affect
217			// the detection of the current license.
218			if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") {
219				// Same as above to avoid matching GPL instead of LGPL here.
220				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
221					return lesserGPLChange
222				}
223			}
224			prevDelete = text
225		}
226	}
227	return diffLevenshteinWord(diffs)
228}
229