1// Copyright 2020 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package classifier 16 17import ( 18 "strings" 19 "unicode" 20 21 "github.com/davecgh/go-spew/spew" 22 "github.com/sergi/go-diff/diffmatchpatch" 23) 24 25// return values for the distance function that explain why a diff 26// is not an acceptable match for the source document. 27const ( 28 versionChange = -1 29 introducedPhraseChange = -2 30 lesserGPLChange = -3 31) 32 33// score computes a metric of similarity between the known and unknown 34// document, including the offsets into the unknown that yield the content 35// generating the computed similarity. 36func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) { 37 if c.tc.traceScoring(known.s.origin) { 38 c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd) 39 } 40 41 knownLength := known.size() 42 diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength) 43 44 start, end := diffRange(known.Norm, diffs) 45 distance := scoreDiffs(id, diffs[start:end]) 46 47 if c.tc.traceScoring(known.s.origin) { 48 c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end])) 49 } 50 51 if distance < 0 { 52 // If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match. 53 if c.tc.traceScoring(known.s.origin) { 54 c.tc.trace("Distance result %v, rejected match", distance) 55 } 56 return 0.0, 0, 0 57 } 58 59 // Applying the diffRange-generated offsets provides the run of text from the 60 // target most closely correlated to the source. This is the process for 61 // compensating for errors from the searchset. With the reduced text, we 62 // compute the final confidence score and exact text locations for the 63 // matched content. 64 // The diff slice consists of three regions: an optional deletion diff for 65 // target text before the source, n elements that represent the diff between 66 // the source and target, and an optional deletion diff for text after the 67 // target. The helper functions identify the portions of the slice 68 // corresponding to those regions. This results in a more accurate 69 // confidence score and better position detection of the source in the 70 // target. 71 conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:]) 72 73 if c.tc.traceScoring(known.s.origin) { 74 c.tc.trace("Score result: %v [%d-%d]", conf, so, eo) 75 } 76 return conf, so, eo 77} 78 79// confidencePercentage computes a confidence match score for the lengths, 80// handling the cases where source and target lengths differ. 81func confidencePercentage(klen, distance int) float64 { 82 // No text is matched at 100% confidence (avoid divide by zero). 83 if klen == 0 { 84 return 1.0 85 } 86 87 // Return a computed fractional match against the known text. 88 return 1.0 - float64(distance)/float64(klen) 89} 90 91// diffLevenshteinWord computes word-based Levenshtein count. 92func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int { 93 levenshtein := 0 94 insertions := 0 95 deletions := 0 96 97 for _, aDiff := range diffs { 98 switch aDiff.Type { 99 case diffmatchpatch.DiffInsert: 100 insertions += wordLen(aDiff.Text) 101 case diffmatchpatch.DiffDelete: 102 deletions += wordLen(aDiff.Text) 103 case diffmatchpatch.DiffEqual: 104 // A deletion and an insertion is one substitution. 105 levenshtein += max(insertions, deletions) 106 insertions = 0 107 deletions = 0 108 } 109 } 110 111 levenshtein += max(insertions, deletions) 112 return levenshtein 113} 114 115func isVersionNumber(in string) bool { 116 for _, r := range in { 117 if !unicode.IsDigit(r) && r != '.' { 118 return false 119 } 120 } 121 return true 122} 123 124// scoreDiffs returns a score rating the acceptability of these diffs. A 125// negative value means that the changes represented by the diff are not an 126// acceptable transformation since it would change the underlying license. A 127// positive value indicates the Levenshtein word distance. 128func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int { 129 // We make a pass looking for unacceptable substitutions 130 // Delete diffs are always ordered before insert diffs. This is leveraged to 131 // analyze a change by checking an insert against the delete text that was 132 // previously cached. 133 prevText := "" 134 prevDelete := "" 135 for i, diff := range diffs { 136 text := diff.Text 137 switch diff.Type { 138 case diffmatchpatch.DiffInsert: 139 num := text 140 if i := strings.Index(num, " "); i != -1 { 141 num = num[0:i] 142 } 143 if isVersionNumber(num) && strings.HasSuffix(prevText, "version") { 144 if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") { 145 return versionChange 146 } 147 } 148 // There are certain phrases that can't be introduced to make a license 149 // hit. TODO: would like to generate this programmatically. Most of 150 // these are words or phrases that appear in a single/small number of 151 // licenses. Can we leverage frequency analysis to identify these 152 // interesting words/phrases and auto-extract them? 153 154 inducedPhrases := map[string][]string{ 155 "AGPL": {"affero"}, 156 "Atmel": {"atmel"}, 157 "Apache": {"apache"}, 158 "BSD": {"bsd"}, 159 "BSD-3-Clause-Attribution": {"acknowledgment"}, 160 "bzip2": {"seward"}, 161 "GPL-2.0-with-GCC-exception": {"gcc linking exception"}, 162 "GPL-2.0-with-autoconf-exception": {"autoconf exception"}, 163 "GPL-2.0-with-bison-exception": {"bison exception"}, 164 "GPL-2.0-with-classpath-exception": {"class path exception"}, 165 "GPL-2.0-with-font-exception": {"font exception"}, 166 "LGPL-2.0": {"library"}, 167 "ImageMagick": {"imagemagick"}, 168 "PHP": {"php"}, 169 "SISSL": {"sun standards"}, 170 "SGI-B": {"silicon graphics"}, 171 "SunPro": {"sunpro"}, 172 "X11": {"x consortium"}, 173 } 174 175 for k, ps := range inducedPhrases { 176 if strings.HasPrefix(LicenseName(id), k) { 177 for _, p := range ps { 178 if strings.Index(text, p) != -1 { 179 // Check to make sure there isn't a corresponding diff for this 180 // insert that also contains the text. This prevents against diff 181 // blocks that are too big and force a false hit on this check, 182 // which usually happens with URLs since they are stored in one 183 // token but can happen in other cases as well. We don't look just 184 // for delete diffs because the subsequent text may reference the 185 // content in case a URL was truncated. 186 if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 { 187 continue 188 } 189 return introducedPhraseChange 190 } 191 } 192 } 193 } 194 195 // Ignore changes between "library" and "lesser" in a GNU context as they 196 // changed the terms, but look for introductions of Lesser that would 197 // otherwise disqualify a match. 198 if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" { 199 // The LGPL 3.0 doesn't have a standard header, so people tend to craft 200 // their own. As a result, sometimes the warranty clause refers to the 201 // GPL instead of the LGPL. This is fine from a licensing perspective, 202 // but we need to tweak matching to ignore that particular case. In 203 // other circumstances, inserting or removing the word Lesser in the 204 // GPL context is not an acceptable change. There is also a reference to 205 // it when suggesting to use the LGPL. 206 if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { 207 return lesserGPLChange 208 } 209 } 210 case diffmatchpatch.DiffEqual: 211 prevText = text 212 prevDelete = "" 213 214 case diffmatchpatch.DiffDelete: 215 // Avoid substitution in most cases. The two exceptions are with usage 216 // statements that are talking about *another* license, and don't affect 217 // the detection of the current license. 218 if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") { 219 // Same as above to avoid matching GPL instead of LGPL here. 220 if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { 221 return lesserGPLChange 222 } 223 } 224 prevDelete = text 225 } 226 } 227 return diffLevenshteinWord(diffs) 228} 229