1// Copyright 2017 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14// 15// Select test data comes from 16// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue 17 18package stringclassifier 19 20import ( 21 "reflect" 22 "regexp" 23 "sort" 24 "testing" 25 26 "github.com/sergi/go-diff/diffmatchpatch" 27) 28 29var ( 30 gettysburg = `Four score and seven years ago our fathers brought forth 31on this continent, a new nation, conceived in Liberty, and dedicated to the 32proposition that all men are created equal.` 33 modifiedGettysburg = `Four score and seven years ago our fathers brought forth 34on this continent, a nation that was new and improved, conceived in Liberty, and 35dedicated to the proposition that all men are created equal.` 36 gettysburgExtraWord = `Four score and seven years ago our fathers brought forth 37on this continent, a new nation, conceived in Liberty, and dedicated to the 38proposition that all men are created equal.Foobar` 39 40 declaration = `When in the Course of human events, it becomes necessary 41for one people to dissolve the political bands which have connected them with 42another, and to assume among the powers of the earth, the separate and equal 43station to which the Laws of Nature and of Nature's God entitle them, a decent 44respect to the opinions of mankind requires that they should declare the causes 45which impel them to the separation.` 46 47 loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla 48varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor 49feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim 50vulputate, tempus leo commodo, accumsan nulla.` 51 modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla 52varius enim mattis, lectus id, aliquet rhoncus sem. Phasellus eget ex in dolor 53feugiat ultricies. Etiam interdum sit amet sit nisl in placerat. Sed vitae enim 54vulputate, tempus leo commodo, accumsan nulla.` 55 lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla 56varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor 57feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim 58vulputate, tempus leo commodo, accumsan nulla.` 59 humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic 60philosophy from liquor; they are “elevated,” not depressed, and do not deem 61it essential to the production of a poem that its author should be a cynic or 62an evil prophet. One of the best attributes of Irish poetry is its constant 63expression of the natural emotions. Previous to the close of the 64seventeenth[xvi] century, it is said, drunkenness was not suggested by the 65poets as common in Ireland—the popularity of Bacchanalian songs since that 66date seems to prove that the vice soon became a virtue. Maginn is the 67noisiest of modern revellers, and easily roars the others down. 68` 69 fellowInTheGoatSkin = `There was a poor widow living down there near the Iron 70Forge when the country was all covered with forests, and you might walk on 71the tops of trees from Carnew to the Lady’s Island, and she had one boy. She 72was very poor, as I said before, and was not able to buy clothes for her son. 73So when she was going out she fixed him snug and combustible in the ash-pit, 74and piled the warm ashes about him. The boy knew no better, and was as happy 75as the day was long; and he was happier still when a neighbour[10] gave his 76mother a kid to keep him company when herself was abroad. The kid and the lad 77played like two may-boys; and when she was old enough to give milk, wasn’t it 78a godsend to the little family? You won’t prevent the boy from growing up 79into a young man, but not a screed of clothes had he then no more than when 80he was a gorsoon. 81` 82 oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and 83he said to him, “Now, my son,” says he, “listen to the advice I’m going to 84give you. If you see a person coming near you and stooping, mind yourself, 85and be on your keeping; he’s stooping for a stone to throw at you.” 86 87“But tell me,” says the young crow, “what should I do if he had a stone 88already down in his pocket?” 89 90“Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the 91devil another learning I’m able to give you.” 92` 93 nullifiable = `[[ , _ , _ , _ 94? _ : _ 95? _ : _ 96? _ : _ 97] 98} 99` 100 nonWords = regexp.MustCompile("[[:punct:]]+") 101) 102 103// removeNonWords removes non-words from the string, replacing them with empty 104// string. (This is meant to exercise tokenization problems.) 105func removeNonWords(s string) string { 106 return nonWords.ReplaceAllString(s, "") 107} 108 109func TestClassify_NearestMatch(t *testing.T) { 110 c := New(DefaultConfidenceThreshold, FlattenWhitespace) 111 c.AddValue("gettysburg", gettysburg) 112 c.AddValue("declaration", declaration) 113 c.AddValue("loremipsum", loremipsum) 114 115 tests := []struct { 116 description string 117 input string // input string to match 118 name string // name of expected nearest match 119 minConf float64 // the lowest confidence accepted for the match 120 maxConf float64 // the highest confidence we expect for this match 121 }{ 122 { 123 description: "Full Declaration", 124 input: declaration, 125 name: "declaration", 126 minConf: 1.0, 127 maxConf: 1.0, 128 }, 129 { 130 description: "Modified Lorem", 131 input: modifiedLorem, 132 name: "loremipsum", 133 minConf: 0.90, 134 maxConf: 0.91, 135 }, 136 { 137 description: "Modified Gettysburg", 138 input: modifiedGettysburg, 139 name: "gettysburg", 140 minConf: 0.86, 141 maxConf: 0.87, 142 }, 143 } 144 145 for _, tt := range tests { 146 m := c.NearestMatch(tt.input) 147 148 if got, want := m.Name, tt.name; got != want { 149 t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want) 150 } 151 if got, want := m.Confidence, tt.minConf; got < want { 152 t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want) 153 } 154 if got, want := m.Confidence, tt.maxConf; got > want { 155 t.Errorf("NearestMatch(%q) = %v, want maxiumum of %v", tt.description, got, want) 156 } 157 } 158} 159 160type result struct { 161 key string // key of expected nearest match 162 offset int // offset of match in unknown string 163 164 // The confidence values are retrieved by simply running the classifier 165 // and noting the output. A value greater than the "max" is fine and 166 // the tests can be adjusted to account for it. A value less than "min" 167 // should be carefully scrutinzed before adjusting the tests. 168 minConf float64 // the lowest confidence accepted for the match 169 maxConf float64 // the highest confidence we expect for this match 170} 171 172func TestClassify_MultipleMatch(t *testing.T) { 173 c := New(DefaultConfidenceThreshold, FlattenWhitespace) 174 c.AddValue("gettysburg", gettysburg) 175 c.AddValue("declaration", declaration) 176 c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:]) 177 c.AddValue("loremipsum", loremipsum) 178 179 cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords) 180 cNormalize.AddValue("gettysburg", gettysburg) 181 182 tests := []struct { 183 description string 184 c *Classifier 185 input string // input string to match 186 want []result 187 }{ 188 { 189 description: "Exact text match", 190 c: c, 191 input: fellowInTheGoatSkin + declaration + humourOfIreland, 192 want: []result{ 193 { 194 key: "declaration", 195 offset: 845, 196 minConf: 1.0, 197 maxConf: 1.0, 198 }, 199 }, 200 }, 201 { 202 description: "Partial text match", 203 c: c, 204 input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland, 205 want: []result{ 206 { 207 key: "loremipsum", 208 offset: 845, 209 minConf: 0.90, 210 maxConf: 0.91, 211 }, 212 }, 213 }, 214 { 215 description: "Two partial matches", 216 c: c, 217 input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow, 218 want: []result{ 219 { 220 key: "loremipsum", 221 offset: 845, 222 minConf: 0.90, 223 maxConf: 0.91, 224 }, 225 { 226 key: "gettysburg", 227 offset: 1750, 228 minConf: 0.86, 229 maxConf: 0.87, 230 }, 231 }, 232 }, 233 { 234 description: "Partial matches of similar text", 235 c: c, 236 input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow, 237 want: []result{ 238 { 239 key: "loremipsum", 240 offset: 1750, 241 minConf: 0.98, 242 maxConf: 0.99, 243 }, 244 { 245 key: "loremipsum", 246 offset: 845, 247 minConf: 0.90, 248 maxConf: 0.91, 249 }, 250 }, 251 }, 252 { 253 description: "Nullifiable text", 254 c: c, 255 input: nullifiable, 256 want: nil, 257 }, 258 { 259 description: "No match", 260 c: c, 261 input: fellowInTheGoatSkin + humourOfIreland, 262 want: nil, 263 }, 264 { 265 description: "Exact text match, with extra word and non-word normalizer", 266 c: cNormalize, 267 input: fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland, 268 want: []result{ 269 { 270 key: "gettysburg", 271 offset: 825, 272 minConf: 1.0, 273 maxConf: 1.0, 274 }, 275 }, 276 }, 277 } 278 279 for _, tt := range tests { 280 matches := tt.c.MultipleMatch(tt.input) 281 if len(matches) != len(tt.want) { 282 t.Errorf("MultipleMatch(%q) not enough matches = %v, want %v", tt.description, len(matches), len(tt.want)) 283 } 284 285 for i := 0; i < len(matches); i++ { 286 m := matches[i] 287 w := tt.want[i] 288 if got, want := m.Name, w.key; got != want { 289 t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want) 290 } 291 if got, want := m.Confidence, w.minConf; got < want { 292 t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want) 293 } 294 if got, want := m.Confidence, w.maxConf; got > want { 295 t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want) 296 } 297 if got, want := m.Offset, w.offset; got != want { 298 t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want) 299 } 300 } 301 } 302} 303 304func TestClassify_DiffRatio(t *testing.T) { 305 tests := []struct { 306 x, y string 307 want float64 308 }{ 309 {"", "", 1.0}, 310 {"a", "b", 1.0}, 311 {"", "abc", 0}, 312 {"ab", "c", 0.5}, 313 {"a", "bc", 0.5}, 314 {"a", "bcde", 0.25}, 315 } 316 317 for _, tt := range tests { 318 if got, want := diffRatio(tt.x, tt.y), tt.want; got != want { 319 t.Errorf("diffRatio(%q, %q) = %f, want %f", tt.x, tt.y, got, want) 320 } 321 } 322} 323 324func TestClassify_Matches(t *testing.T) { 325 tests := []struct { 326 description string 327 matches Matches 328 want Matches 329 }{ 330 { 331 description: "Different names, same confidences, same offset", 332 matches: Matches{ 333 &Match{ 334 Name: "b", 335 Confidence: 0.42, 336 Offset: 0, 337 }, 338 &Match{ 339 Name: "a", 340 Confidence: 0.42, 341 Offset: 0, 342 }, 343 }, 344 want: Matches{ 345 &Match{ 346 Name: "a", 347 Confidence: 0.42, 348 Offset: 0, 349 }, 350 &Match{ 351 Name: "b", 352 Confidence: 0.42, 353 Offset: 0, 354 }, 355 }, 356 }, 357 { 358 description: "Same names, different confidences, same offset", 359 matches: Matches{ 360 &Match{ 361 Name: "b", 362 Confidence: 0.42, 363 Offset: 0, 364 }, 365 &Match{ 366 Name: "b", 367 Confidence: 0.90, 368 Offset: 0, 369 }, 370 }, 371 want: Matches{ 372 &Match{ 373 Name: "b", 374 Confidence: 0.90, 375 Offset: 0, 376 }, 377 &Match{ 378 Name: "b", 379 Confidence: 0.42, 380 Offset: 0, 381 }, 382 }, 383 }, 384 { 385 description: "Same names, same confidences, different offsets", 386 matches: Matches{ 387 &Match{ 388 Name: "b", 389 Confidence: 0.42, 390 Offset: 42, 391 }, 392 &Match{ 393 Name: "b", 394 Confidence: 0.42, 395 Offset: 0, 396 }, 397 }, 398 want: Matches{ 399 &Match{ 400 Name: "b", 401 Confidence: 0.42, 402 Offset: 0, 403 }, 404 &Match{ 405 Name: "b", 406 Confidence: 0.42, 407 Offset: 42, 408 }, 409 }, 410 }, 411 412 { 413 description: "Different names, different confidences, same offset", 414 matches: Matches{ 415 &Match{ 416 Name: "b", 417 Confidence: 0.42, 418 Offset: 0, 419 }, 420 &Match{ 421 Name: "a", 422 Confidence: 0.90, 423 Offset: 0, 424 }, 425 }, 426 want: Matches{ 427 &Match{ 428 Name: "a", 429 Confidence: 0.90, 430 Offset: 0, 431 }, 432 &Match{ 433 Name: "b", 434 Confidence: 0.42, 435 Offset: 0, 436 }, 437 }, 438 }, 439 { 440 description: "Different names, same confidences, different offset", 441 matches: Matches{ 442 &Match{ 443 Name: "b", 444 Confidence: 0.42, 445 Offset: 37, 446 }, 447 &Match{ 448 Name: "a", 449 Confidence: 0.42, 450 Offset: 0, 451 }, 452 }, 453 want: Matches{ 454 &Match{ 455 Name: "a", 456 Confidence: 0.42, 457 Offset: 0, 458 }, 459 &Match{ 460 Name: "b", 461 Confidence: 0.42, 462 Offset: 37, 463 }, 464 }, 465 }, 466 { 467 description: "Different names, different confidences, different offset", 468 matches: Matches{ 469 &Match{ 470 Name: "a", 471 Confidence: 0.42, 472 Offset: 0, 473 }, 474 &Match{ 475 Name: "b", 476 Confidence: 0.90, 477 Offset: 37, 478 }, 479 }, 480 want: Matches{ 481 &Match{ 482 Name: "b", 483 Confidence: 0.90, 484 Offset: 37, 485 }, 486 &Match{ 487 Name: "a", 488 Confidence: 0.42, 489 Offset: 0, 490 }, 491 }, 492 }, 493 } 494 495 for _, tt := range tests { 496 sort.Sort(tt.matches) 497 if !reflect.DeepEqual(tt.matches, tt.want) { 498 for _, x := range tt.matches { 499 t.Errorf("got: %v", x) 500 } 501 for _, x := range tt.want { 502 t.Errorf("want: %v", x) 503 } 504 t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want) 505 } 506 } 507} 508 509func TestClassify_DiffRangeEnd(t *testing.T) { 510 dmp := diffmatchpatch.New() 511 tests := []struct { 512 description string 513 unknown string 514 known string 515 end int 516 }{ 517 { 518 description: "identical", 519 unknown: declaration, 520 known: declaration, 521 end: 1, 522 }, 523 { 524 description: "lorem", 525 unknown: lessModifiedLorem, 526 known: loremipsum, 527 end: 3, 528 }, 529 { 530 description: "gettysburg", 531 unknown: modifiedGettysburg, 532 known: gettysburg, 533 end: 19, 534 }, 535 } 536 537 for _, tt := range tests { 538 diffs := dmp.DiffMain(tt.unknown, tt.known, true) 539 if e := diffRangeEnd(tt.known, diffs); e != tt.end { 540 t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end) 541 } 542 } 543} 544 545func BenchmarkClassifier(b *testing.B) { 546 c := New(DefaultConfidenceThreshold, FlattenWhitespace) 547 c.AddValue("gettysburg", gettysburg) 548 c.AddValue("declaration", declaration) 549 c.AddValue("loremipsum", loremipsum) 550 551 b.ResetTimer() 552 for i := 0; i < b.N; i++ { 553 c.NearestMatch(modifiedLorem) 554 } 555} 556