xref: /aosp_15_r20/external/licenseclassifier/stringclassifier/classifier_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2017 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//	http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Select test data comes from
16// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue
17
18package stringclassifier
19
20import (
21	"reflect"
22	"regexp"
23	"sort"
24	"testing"
25
26	"github.com/sergi/go-diff/diffmatchpatch"
27)
28
29var (
30	gettysburg = `Four score and seven years ago our fathers brought forth
31on this continent, a new nation, conceived in Liberty, and dedicated to the
32proposition that all men are created equal.`
33	modifiedGettysburg = `Four score and seven years ago our fathers brought forth
34on this continent, a nation that was new and improved, conceived in Liberty, and
35dedicated to the proposition that all men are created equal.`
36	gettysburgExtraWord = `Four score and seven years ago our fathers brought forth
37on this continent, a new nation, conceived in Liberty, and dedicated to the
38proposition that all men are created equal.Foobar`
39
40	declaration = `When in the Course of human events, it becomes necessary
41for one people to dissolve the political bands which have connected them with
42another, and to assume among the powers of the earth, the separate and equal
43station to which the Laws of Nature and of Nature's God entitle them, a decent
44respect to the opinions of mankind requires that they should declare the causes
45which impel them to the separation.`
46
47	loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
48varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor
49feugiat ultricies. Etiam interdum sit amet nisl in placerat.  Sed vitae enim
50vulputate, tempus leo commodo, accumsan nulla.`
51	modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla
52varius enim mattis, lectus id, aliquet rhoncus  sem. Phasellus eget ex in dolor
53feugiat ultricies. Etiam interdum sit amet sit  nisl in placerat.  Sed vitae enim
54vulputate, tempus leo commodo, accumsan nulla.`
55	lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
56varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor
57feugiat ultricies. Etiam interdum sit amet nisl in placerat.  Sed vitae enim
58vulputate, tempus leo commodo, accumsan nulla.`
59	humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic
60philosophy from liquor; they areelevated,” not depressed, and do not deem
61it essential to the production of a poem that its author should be a cynic or
62an evil prophet. One of the best attributes of Irish poetry is its constant
63expression of the natural emotions. Previous to the close of the
64seventeenth[xvi] century, it is said, drunkenness was not suggested by the
65poets as common in Irelandthe popularity of Bacchanalian songs since that
66date seems to prove that the vice soon became a virtue. Maginn is the
67noisiest of modern revellers, and easily roars the others down.
68`
69	fellowInTheGoatSkin = `There was a poor widow living down there near the Iron
70Forge when the country was all covered with forests, and you might walk on
71the tops of trees from Carnew to the Ladys Island, and she had one boy. She
72was very poor, as I said before, and was not able to buy clothes for her son.
73So when she was going out she fixed him snug and combustible in the ash-pit,
74and piled the warm ashes about him. The boy knew no better, and was as happy
75as the day was long; and he was happier still when a neighbour[10] gave his
76mother a kid to keep him company when herself was abroad. The kid and the lad
77played like two may-boys; and when she was old enough to give milk, wasnt it
78a godsend to the little family? You wont prevent the boy from growing up
79into a young man, but not a screed of clothes had he then no more than when
80he was a gorsoon.
81`
82	oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and
83he said to him, “Now, my son,” says he, “listen to the advice Im going to
84give you. If you see a person coming near you and stooping, mind yourself,
85and be on your keeping; hes stooping for a stone to throw at you.”
86
87But tell me,” says the young crow, “what should I do if he had a stone
88already down in his pocket?”
89
90Musha, golong out of that,” says the old crow, “youve learned enough; the
91devil another learning Im able to give you.”
92`
93	nullifiable = `[[ , _ , _ , _
94? _ : _
95? _ : _
96? _ : _
97]
98}
99`
100	nonWords = regexp.MustCompile("[[:punct:]]+")
101)
102
103// removeNonWords removes non-words from the string, replacing them with empty
104// string. (This is meant to exercise tokenization problems.)
105func removeNonWords(s string) string {
106	return nonWords.ReplaceAllString(s, "")
107}
108
109func TestClassify_NearestMatch(t *testing.T) {
110	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
111	c.AddValue("gettysburg", gettysburg)
112	c.AddValue("declaration", declaration)
113	c.AddValue("loremipsum", loremipsum)
114
115	tests := []struct {
116		description string
117		input       string  // input string to match
118		name        string  // name of expected nearest match
119		minConf     float64 // the lowest confidence accepted for the match
120		maxConf     float64 // the highest confidence we expect for this match
121	}{
122		{
123			description: "Full Declaration",
124			input:       declaration,
125			name:        "declaration",
126			minConf:     1.0,
127			maxConf:     1.0,
128		},
129		{
130			description: "Modified Lorem",
131			input:       modifiedLorem,
132			name:        "loremipsum",
133			minConf:     0.90,
134			maxConf:     0.91,
135		},
136		{
137			description: "Modified Gettysburg",
138			input:       modifiedGettysburg,
139			name:        "gettysburg",
140			minConf:     0.86,
141			maxConf:     0.87,
142		},
143	}
144
145	for _, tt := range tests {
146		m := c.NearestMatch(tt.input)
147
148		if got, want := m.Name, tt.name; got != want {
149			t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
150		}
151		if got, want := m.Confidence, tt.minConf; got < want {
152			t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want)
153		}
154		if got, want := m.Confidence, tt.maxConf; got > want {
155			t.Errorf("NearestMatch(%q) = %v, want maxiumum of %v", tt.description, got, want)
156		}
157	}
158}
159
160type result struct {
161	key    string // key of expected nearest match
162	offset int    // offset of match in unknown string
163
164	// The confidence values are retrieved by simply running the classifier
165	// and noting the output. A value greater than the "max" is fine and
166	// the tests can be adjusted to account for it. A value less than "min"
167	// should be carefully scrutinzed before adjusting the tests.
168	minConf float64 // the lowest confidence accepted for the match
169	maxConf float64 // the highest confidence we expect for this match
170}
171
172func TestClassify_MultipleMatch(t *testing.T) {
173	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
174	c.AddValue("gettysburg", gettysburg)
175	c.AddValue("declaration", declaration)
176	c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:])
177	c.AddValue("loremipsum", loremipsum)
178
179	cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords)
180	cNormalize.AddValue("gettysburg", gettysburg)
181
182	tests := []struct {
183		description string
184		c           *Classifier
185		input       string // input string to match
186		want        []result
187	}{
188		{
189			description: "Exact text match",
190			c:           c,
191			input:       fellowInTheGoatSkin + declaration + humourOfIreland,
192			want: []result{
193				{
194					key:     "declaration",
195					offset:  845,
196					minConf: 1.0,
197					maxConf: 1.0,
198				},
199			},
200		},
201		{
202			description: "Partial text match",
203			c:           c,
204			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland,
205			want: []result{
206				{
207					key:     "loremipsum",
208					offset:  845,
209					minConf: 0.90,
210					maxConf: 0.91,
211				},
212			},
213		},
214		{
215			description: "Two partial matches",
216			c:           c,
217			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow,
218			want: []result{
219				{
220					key:     "loremipsum",
221					offset:  845,
222					minConf: 0.90,
223					maxConf: 0.91,
224				},
225				{
226					key:     "gettysburg",
227					offset:  1750,
228					minConf: 0.86,
229					maxConf: 0.87,
230				},
231			},
232		},
233		{
234			description: "Partial matches of similar text",
235			c:           c,
236			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow,
237			want: []result{
238				{
239					key:     "loremipsum",
240					offset:  1750,
241					minConf: 0.98,
242					maxConf: 0.99,
243				},
244				{
245					key:     "loremipsum",
246					offset:  845,
247					minConf: 0.90,
248					maxConf: 0.91,
249				},
250			},
251		},
252		{
253			description: "Nullifiable text",
254			c:           c,
255			input:       nullifiable,
256			want:        nil,
257		},
258		{
259			description: "No match",
260			c:           c,
261			input:       fellowInTheGoatSkin + humourOfIreland,
262			want:        nil,
263		},
264		{
265			description: "Exact text match, with extra word and non-word normalizer",
266			c:           cNormalize,
267			input:       fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland,
268			want: []result{
269				{
270					key:     "gettysburg",
271					offset:  825,
272					minConf: 1.0,
273					maxConf: 1.0,
274				},
275			},
276		},
277	}
278
279	for _, tt := range tests {
280		matches := tt.c.MultipleMatch(tt.input)
281		if len(matches) != len(tt.want) {
282			t.Errorf("MultipleMatch(%q) not enough matches = %v, want %v", tt.description, len(matches), len(tt.want))
283		}
284
285		for i := 0; i < len(matches); i++ {
286			m := matches[i]
287			w := tt.want[i]
288			if got, want := m.Name, w.key; got != want {
289				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
290			}
291			if got, want := m.Confidence, w.minConf; got < want {
292				t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want)
293			}
294			if got, want := m.Confidence, w.maxConf; got > want {
295				t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want)
296			}
297			if got, want := m.Offset, w.offset; got != want {
298				t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want)
299			}
300		}
301	}
302}
303
304func TestClassify_DiffRatio(t *testing.T) {
305	tests := []struct {
306		x, y string
307		want float64
308	}{
309		{"", "", 1.0},
310		{"a", "b", 1.0},
311		{"", "abc", 0},
312		{"ab", "c", 0.5},
313		{"a", "bc", 0.5},
314		{"a", "bcde", 0.25},
315	}
316
317	for _, tt := range tests {
318		if got, want := diffRatio(tt.x, tt.y), tt.want; got != want {
319			t.Errorf("diffRatio(%q, %q) = %f, want %f", tt.x, tt.y, got, want)
320		}
321	}
322}
323
324func TestClassify_Matches(t *testing.T) {
325	tests := []struct {
326		description string
327		matches     Matches
328		want        Matches
329	}{
330		{
331			description: "Different names, same confidences, same offset",
332			matches: Matches{
333				&Match{
334					Name:       "b",
335					Confidence: 0.42,
336					Offset:     0,
337				},
338				&Match{
339					Name:       "a",
340					Confidence: 0.42,
341					Offset:     0,
342				},
343			},
344			want: Matches{
345				&Match{
346					Name:       "a",
347					Confidence: 0.42,
348					Offset:     0,
349				},
350				&Match{
351					Name:       "b",
352					Confidence: 0.42,
353					Offset:     0,
354				},
355			},
356		},
357		{
358			description: "Same names, different confidences, same offset",
359			matches: Matches{
360				&Match{
361					Name:       "b",
362					Confidence: 0.42,
363					Offset:     0,
364				},
365				&Match{
366					Name:       "b",
367					Confidence: 0.90,
368					Offset:     0,
369				},
370			},
371			want: Matches{
372				&Match{
373					Name:       "b",
374					Confidence: 0.90,
375					Offset:     0,
376				},
377				&Match{
378					Name:       "b",
379					Confidence: 0.42,
380					Offset:     0,
381				},
382			},
383		},
384		{
385			description: "Same names, same confidences, different offsets",
386			matches: Matches{
387				&Match{
388					Name:       "b",
389					Confidence: 0.42,
390					Offset:     42,
391				},
392				&Match{
393					Name:       "b",
394					Confidence: 0.42,
395					Offset:     0,
396				},
397			},
398			want: Matches{
399				&Match{
400					Name:       "b",
401					Confidence: 0.42,
402					Offset:     0,
403				},
404				&Match{
405					Name:       "b",
406					Confidence: 0.42,
407					Offset:     42,
408				},
409			},
410		},
411
412		{
413			description: "Different names, different confidences, same offset",
414			matches: Matches{
415				&Match{
416					Name:       "b",
417					Confidence: 0.42,
418					Offset:     0,
419				},
420				&Match{
421					Name:       "a",
422					Confidence: 0.90,
423					Offset:     0,
424				},
425			},
426			want: Matches{
427				&Match{
428					Name:       "a",
429					Confidence: 0.90,
430					Offset:     0,
431				},
432				&Match{
433					Name:       "b",
434					Confidence: 0.42,
435					Offset:     0,
436				},
437			},
438		},
439		{
440			description: "Different names, same confidences, different offset",
441			matches: Matches{
442				&Match{
443					Name:       "b",
444					Confidence: 0.42,
445					Offset:     37,
446				},
447				&Match{
448					Name:       "a",
449					Confidence: 0.42,
450					Offset:     0,
451				},
452			},
453			want: Matches{
454				&Match{
455					Name:       "a",
456					Confidence: 0.42,
457					Offset:     0,
458				},
459				&Match{
460					Name:       "b",
461					Confidence: 0.42,
462					Offset:     37,
463				},
464			},
465		},
466		{
467			description: "Different names, different confidences, different offset",
468			matches: Matches{
469				&Match{
470					Name:       "a",
471					Confidence: 0.42,
472					Offset:     0,
473				},
474				&Match{
475					Name:       "b",
476					Confidence: 0.90,
477					Offset:     37,
478				},
479			},
480			want: Matches{
481				&Match{
482					Name:       "b",
483					Confidence: 0.90,
484					Offset:     37,
485				},
486				&Match{
487					Name:       "a",
488					Confidence: 0.42,
489					Offset:     0,
490				},
491			},
492		},
493	}
494
495	for _, tt := range tests {
496		sort.Sort(tt.matches)
497		if !reflect.DeepEqual(tt.matches, tt.want) {
498			for _, x := range tt.matches {
499				t.Errorf("got: %v", x)
500			}
501			for _, x := range tt.want {
502				t.Errorf("want: %v", x)
503			}
504			t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want)
505		}
506	}
507}
508
509func TestClassify_DiffRangeEnd(t *testing.T) {
510	dmp := diffmatchpatch.New()
511	tests := []struct {
512		description string
513		unknown     string
514		known       string
515		end         int
516	}{
517		{
518			description: "identical",
519			unknown:     declaration,
520			known:       declaration,
521			end:         1,
522		},
523		{
524			description: "lorem",
525			unknown:     lessModifiedLorem,
526			known:       loremipsum,
527			end:         3,
528		},
529		{
530			description: "gettysburg",
531			unknown:     modifiedGettysburg,
532			known:       gettysburg,
533			end:         19,
534		},
535	}
536
537	for _, tt := range tests {
538		diffs := dmp.DiffMain(tt.unknown, tt.known, true)
539		if e := diffRangeEnd(tt.known, diffs); e != tt.end {
540			t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end)
541		}
542	}
543}
544
545func BenchmarkClassifier(b *testing.B) {
546	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
547	c.AddValue("gettysburg", gettysburg)
548	c.AddValue("declaration", declaration)
549	c.AddValue("loremipsum", loremipsum)
550
551	b.ResetTimer()
552	for i := 0; i < b.N; i++ {
553		c.NearestMatch(modifiedLorem)
554	}
555}
556