xref: /aosp_15_r20/external/licenseclassifier/v2/scoring.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouchepackage classifier
16*46c4c49dSIbrahim Kanouche
17*46c4c49dSIbrahim Kanoucheimport (
18*46c4c49dSIbrahim Kanouche	"strings"
19*46c4c49dSIbrahim Kanouche	"unicode"
20*46c4c49dSIbrahim Kanouche
21*46c4c49dSIbrahim Kanouche	"github.com/davecgh/go-spew/spew"
22*46c4c49dSIbrahim Kanouche	"github.com/sergi/go-diff/diffmatchpatch"
23*46c4c49dSIbrahim Kanouche)
24*46c4c49dSIbrahim Kanouche
25*46c4c49dSIbrahim Kanouche// return values for the distance function that explain why a diff
26*46c4c49dSIbrahim Kanouche// is not an acceptable match for the source document.
27*46c4c49dSIbrahim Kanoucheconst (
28*46c4c49dSIbrahim Kanouche	versionChange          = -1
29*46c4c49dSIbrahim Kanouche	introducedPhraseChange = -2
30*46c4c49dSIbrahim Kanouche	lesserGPLChange        = -3
31*46c4c49dSIbrahim Kanouche)
32*46c4c49dSIbrahim Kanouche
33*46c4c49dSIbrahim Kanouche// score computes a metric of similarity between the known and unknown
34*46c4c49dSIbrahim Kanouche// document, including the offsets into the unknown that yield the content
35*46c4c49dSIbrahim Kanouche// generating the computed similarity.
36*46c4c49dSIbrahim Kanouchefunc (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
37*46c4c49dSIbrahim Kanouche	if c.tc.traceScoring(known.s.origin) {
38*46c4c49dSIbrahim Kanouche		c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
39*46c4c49dSIbrahim Kanouche	}
40*46c4c49dSIbrahim Kanouche
41*46c4c49dSIbrahim Kanouche	knownLength := known.size()
42*46c4c49dSIbrahim Kanouche	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
43*46c4c49dSIbrahim Kanouche
44*46c4c49dSIbrahim Kanouche	start, end := diffRange(known.Norm, diffs)
45*46c4c49dSIbrahim Kanouche	distance := scoreDiffs(id, diffs[start:end])
46*46c4c49dSIbrahim Kanouche
47*46c4c49dSIbrahim Kanouche	if c.tc.traceScoring(known.s.origin) {
48*46c4c49dSIbrahim Kanouche		c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
49*46c4c49dSIbrahim Kanouche	}
50*46c4c49dSIbrahim Kanouche
51*46c4c49dSIbrahim Kanouche	if distance < 0 {
52*46c4c49dSIbrahim Kanouche		// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
53*46c4c49dSIbrahim Kanouche		if c.tc.traceScoring(known.s.origin) {
54*46c4c49dSIbrahim Kanouche			c.tc.trace("Distance result %v, rejected match", distance)
55*46c4c49dSIbrahim Kanouche		}
56*46c4c49dSIbrahim Kanouche		return 0.0, 0, 0
57*46c4c49dSIbrahim Kanouche	}
58*46c4c49dSIbrahim Kanouche
59*46c4c49dSIbrahim Kanouche	// Applying the diffRange-generated offsets provides the run of text from the
60*46c4c49dSIbrahim Kanouche	// target most closely correlated to the source.  This is the process for
61*46c4c49dSIbrahim Kanouche	// compensating for errors from the searchset. With the reduced text, we
62*46c4c49dSIbrahim Kanouche	// compute the final confidence score and exact text locations for the
63*46c4c49dSIbrahim Kanouche	// matched content.
64*46c4c49dSIbrahim Kanouche	// The diff slice consists of three regions: an optional deletion diff for
65*46c4c49dSIbrahim Kanouche	// target text before the source, n elements that represent the diff between
66*46c4c49dSIbrahim Kanouche	// the source and target, and an optional deletion diff for text after the
67*46c4c49dSIbrahim Kanouche	// target.  The helper functions identify the portions of the slice
68*46c4c49dSIbrahim Kanouche	// corresponding to those regions.  This results in a more accurate
69*46c4c49dSIbrahim Kanouche	// confidence score and better position detection of the source in the
70*46c4c49dSIbrahim Kanouche	// target.
71*46c4c49dSIbrahim Kanouche	conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])
72*46c4c49dSIbrahim Kanouche
73*46c4c49dSIbrahim Kanouche	if c.tc.traceScoring(known.s.origin) {
74*46c4c49dSIbrahim Kanouche		c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
75*46c4c49dSIbrahim Kanouche	}
76*46c4c49dSIbrahim Kanouche	return conf, so, eo
77*46c4c49dSIbrahim Kanouche}
78*46c4c49dSIbrahim Kanouche
79*46c4c49dSIbrahim Kanouche// confidencePercentage computes a confidence match score for the lengths,
80*46c4c49dSIbrahim Kanouche// handling the cases where source and target lengths differ.
81*46c4c49dSIbrahim Kanouchefunc confidencePercentage(klen, distance int) float64 {
82*46c4c49dSIbrahim Kanouche	// No text is matched at 100% confidence (avoid divide by zero).
83*46c4c49dSIbrahim Kanouche	if klen == 0 {
84*46c4c49dSIbrahim Kanouche		return 1.0
85*46c4c49dSIbrahim Kanouche	}
86*46c4c49dSIbrahim Kanouche
87*46c4c49dSIbrahim Kanouche	// Return a computed fractional match against the known text.
88*46c4c49dSIbrahim Kanouche	return 1.0 - float64(distance)/float64(klen)
89*46c4c49dSIbrahim Kanouche}
90*46c4c49dSIbrahim Kanouche
91*46c4c49dSIbrahim Kanouche// diffLevenshteinWord computes word-based Levenshtein count.
92*46c4c49dSIbrahim Kanouchefunc diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
93*46c4c49dSIbrahim Kanouche	levenshtein := 0
94*46c4c49dSIbrahim Kanouche	insertions := 0
95*46c4c49dSIbrahim Kanouche	deletions := 0
96*46c4c49dSIbrahim Kanouche
97*46c4c49dSIbrahim Kanouche	for _, aDiff := range diffs {
98*46c4c49dSIbrahim Kanouche		switch aDiff.Type {
99*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffInsert:
100*46c4c49dSIbrahim Kanouche			insertions += wordLen(aDiff.Text)
101*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffDelete:
102*46c4c49dSIbrahim Kanouche			deletions += wordLen(aDiff.Text)
103*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffEqual:
104*46c4c49dSIbrahim Kanouche			// A deletion and an insertion is one substitution.
105*46c4c49dSIbrahim Kanouche			levenshtein += max(insertions, deletions)
106*46c4c49dSIbrahim Kanouche			insertions = 0
107*46c4c49dSIbrahim Kanouche			deletions = 0
108*46c4c49dSIbrahim Kanouche		}
109*46c4c49dSIbrahim Kanouche	}
110*46c4c49dSIbrahim Kanouche
111*46c4c49dSIbrahim Kanouche	levenshtein += max(insertions, deletions)
112*46c4c49dSIbrahim Kanouche	return levenshtein
113*46c4c49dSIbrahim Kanouche}
114*46c4c49dSIbrahim Kanouche
115*46c4c49dSIbrahim Kanouchefunc isVersionNumber(in string) bool {
116*46c4c49dSIbrahim Kanouche	for _, r := range in {
117*46c4c49dSIbrahim Kanouche		if !unicode.IsDigit(r) && r != '.' {
118*46c4c49dSIbrahim Kanouche			return false
119*46c4c49dSIbrahim Kanouche		}
120*46c4c49dSIbrahim Kanouche	}
121*46c4c49dSIbrahim Kanouche	return true
122*46c4c49dSIbrahim Kanouche}
123*46c4c49dSIbrahim Kanouche
124*46c4c49dSIbrahim Kanouche// scoreDiffs returns a score rating the acceptability of these diffs.  A
125*46c4c49dSIbrahim Kanouche// negative value means that the changes represented by the diff are not an
126*46c4c49dSIbrahim Kanouche// acceptable transformation since it would change the underlying license.  A
127*46c4c49dSIbrahim Kanouche// positive value indicates the Levenshtein word distance.
128*46c4c49dSIbrahim Kanouchefunc scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
129*46c4c49dSIbrahim Kanouche	// We make a pass looking for unacceptable substitutions
130*46c4c49dSIbrahim Kanouche	// Delete diffs are always ordered before insert diffs. This is leveraged to
131*46c4c49dSIbrahim Kanouche	// analyze a change by checking an insert against the delete text that was
132*46c4c49dSIbrahim Kanouche	// previously cached.
133*46c4c49dSIbrahim Kanouche	prevText := ""
134*46c4c49dSIbrahim Kanouche	prevDelete := ""
135*46c4c49dSIbrahim Kanouche	for i, diff := range diffs {
136*46c4c49dSIbrahim Kanouche		text := diff.Text
137*46c4c49dSIbrahim Kanouche		switch diff.Type {
138*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffInsert:
139*46c4c49dSIbrahim Kanouche			num := text
140*46c4c49dSIbrahim Kanouche			if i := strings.Index(num, " "); i != -1 {
141*46c4c49dSIbrahim Kanouche				num = num[0:i]
142*46c4c49dSIbrahim Kanouche			}
143*46c4c49dSIbrahim Kanouche			if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
144*46c4c49dSIbrahim Kanouche				if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
145*46c4c49dSIbrahim Kanouche					return versionChange
146*46c4c49dSIbrahim Kanouche				}
147*46c4c49dSIbrahim Kanouche			}
148*46c4c49dSIbrahim Kanouche			// There are certain phrases that can't be introduced to make a license
149*46c4c49dSIbrahim Kanouche			// hit.  TODO: would like to generate this programmatically. Most of
150*46c4c49dSIbrahim Kanouche			// these are words or phrases that appear in a single/small number of
151*46c4c49dSIbrahim Kanouche			// licenses. Can we leverage frequency analysis to identify these
152*46c4c49dSIbrahim Kanouche			// interesting words/phrases and auto-extract them?
153*46c4c49dSIbrahim Kanouche
154*46c4c49dSIbrahim Kanouche			inducedPhrases := map[string][]string{
155*46c4c49dSIbrahim Kanouche				"AGPL":                             {"affero"},
156*46c4c49dSIbrahim Kanouche				"Atmel":                            {"atmel"},
157*46c4c49dSIbrahim Kanouche				"Apache":                           {"apache"},
158*46c4c49dSIbrahim Kanouche				"BSD":                              {"bsd"},
159*46c4c49dSIbrahim Kanouche				"BSD-3-Clause-Attribution":         {"acknowledgment"},
160*46c4c49dSIbrahim Kanouche				"bzip2":                            {"seward"},
161*46c4c49dSIbrahim Kanouche				"GPL-2.0-with-GCC-exception":       {"gcc linking exception"},
162*46c4c49dSIbrahim Kanouche				"GPL-2.0-with-autoconf-exception":  {"autoconf exception"},
163*46c4c49dSIbrahim Kanouche				"GPL-2.0-with-bison-exception":     {"bison exception"},
164*46c4c49dSIbrahim Kanouche				"GPL-2.0-with-classpath-exception": {"class path exception"},
165*46c4c49dSIbrahim Kanouche				"GPL-2.0-with-font-exception":      {"font exception"},
166*46c4c49dSIbrahim Kanouche				"LGPL-2.0":                         {"library"},
167*46c4c49dSIbrahim Kanouche				"ImageMagick":                      {"imagemagick"},
168*46c4c49dSIbrahim Kanouche				"PHP":                              {"php"},
169*46c4c49dSIbrahim Kanouche				"SISSL":                            {"sun standards"},
170*46c4c49dSIbrahim Kanouche				"SGI-B":                            {"silicon graphics"},
171*46c4c49dSIbrahim Kanouche				"SunPro":                           {"sunpro"},
172*46c4c49dSIbrahim Kanouche				"X11":                              {"x consortium"},
173*46c4c49dSIbrahim Kanouche			}
174*46c4c49dSIbrahim Kanouche
175*46c4c49dSIbrahim Kanouche			for k, ps := range inducedPhrases {
176*46c4c49dSIbrahim Kanouche				if strings.HasPrefix(LicenseName(id), k) {
177*46c4c49dSIbrahim Kanouche					for _, p := range ps {
178*46c4c49dSIbrahim Kanouche						if strings.Index(text, p) != -1 {
179*46c4c49dSIbrahim Kanouche							// Check to make sure there isn't a corresponding diff for this
180*46c4c49dSIbrahim Kanouche							// insert that also contains the text. This prevents against diff
181*46c4c49dSIbrahim Kanouche							// blocks that are too big and force a false hit on this check,
182*46c4c49dSIbrahim Kanouche							// which usually happens with URLs since they are stored in one
183*46c4c49dSIbrahim Kanouche							// token but can happen in other cases as well. We don't look just
184*46c4c49dSIbrahim Kanouche							// for delete diffs because the subsequent text may reference the
185*46c4c49dSIbrahim Kanouche							// content in case a URL was truncated.
186*46c4c49dSIbrahim Kanouche							if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
187*46c4c49dSIbrahim Kanouche								continue
188*46c4c49dSIbrahim Kanouche							}
189*46c4c49dSIbrahim Kanouche							return introducedPhraseChange
190*46c4c49dSIbrahim Kanouche						}
191*46c4c49dSIbrahim Kanouche					}
192*46c4c49dSIbrahim Kanouche				}
193*46c4c49dSIbrahim Kanouche			}
194*46c4c49dSIbrahim Kanouche
195*46c4c49dSIbrahim Kanouche			// Ignore changes between "library" and "lesser" in a GNU context as they
196*46c4c49dSIbrahim Kanouche			// changed the terms, but look for introductions of Lesser that would
197*46c4c49dSIbrahim Kanouche			// otherwise disqualify a match.
198*46c4c49dSIbrahim Kanouche			if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
199*46c4c49dSIbrahim Kanouche				// The LGPL 3.0 doesn't have a standard header, so people tend to craft
200*46c4c49dSIbrahim Kanouche				// their own. As a result, sometimes the warranty clause refers to the
201*46c4c49dSIbrahim Kanouche				// GPL instead of the LGPL. This is fine from a licensing perspective,
202*46c4c49dSIbrahim Kanouche				// but we need to tweak matching to ignore that particular case. In
203*46c4c49dSIbrahim Kanouche				// other circumstances, inserting or removing the word Lesser in the
204*46c4c49dSIbrahim Kanouche				// GPL context is not an acceptable change. There is also a reference to
205*46c4c49dSIbrahim Kanouche				// it when suggesting to use the LGPL.
206*46c4c49dSIbrahim Kanouche				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
207*46c4c49dSIbrahim Kanouche					return lesserGPLChange
208*46c4c49dSIbrahim Kanouche				}
209*46c4c49dSIbrahim Kanouche			}
210*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffEqual:
211*46c4c49dSIbrahim Kanouche			prevText = text
212*46c4c49dSIbrahim Kanouche			prevDelete = ""
213*46c4c49dSIbrahim Kanouche
214*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffDelete:
215*46c4c49dSIbrahim Kanouche			// Avoid substitution in most cases. The two exceptions are with usage
216*46c4c49dSIbrahim Kanouche			// statements that are talking about *another* license, and don't affect
217*46c4c49dSIbrahim Kanouche			// the detection of the current license.
218*46c4c49dSIbrahim Kanouche			if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") {
219*46c4c49dSIbrahim Kanouche				// Same as above to avoid matching GPL instead of LGPL here.
220*46c4c49dSIbrahim Kanouche				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
221*46c4c49dSIbrahim Kanouche					return lesserGPLChange
222*46c4c49dSIbrahim Kanouche				}
223*46c4c49dSIbrahim Kanouche			}
224*46c4c49dSIbrahim Kanouche			prevDelete = text
225*46c4c49dSIbrahim Kanouche		}
226*46c4c49dSIbrahim Kanouche	}
227*46c4c49dSIbrahim Kanouche	return diffLevenshteinWord(diffs)
228*46c4c49dSIbrahim Kanouche}
229