1// Copyright 2017 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14package tokenizer 15 16import ( 17 "reflect" 18 "testing" 19) 20 21func TestTokenizer_Tokenize(t *testing.T) { 22 tests := []struct { 23 text string 24 want Tokens 25 }{ 26 { 27 text: "Tokenize", 28 want: Tokens{&token{Text: "Tokenize", Offset: 0}}, 29 }, 30 { 31 text: "Hello world", 32 want: Tokens{ 33 &token{Text: "Hello", Offset: 0}, 34 &token{Text: "world", Offset: 6}, 35 }, 36 }, 37 { 38 text: `Goodnight, 39Irene 40`, 41 want: Tokens{ 42 &token{Text: "Goodnight", Offset: 0}, 43 &token{Text: ",", Offset: 9}, 44 &token{Text: "Irene", Offset: 11}, 45 }, 46 }, 47 { 48 text: "Copyright © 2017 Yoyodyne, Inc.", 49 want: Tokens{ 50 &token{Text: "Copyright", Offset: 0}, 51 &token{Text: "©", Offset: 10}, 52 &token{Text: "2017", Offset: 13}, 53 &token{Text: "Yoyodyne", Offset: 18}, 54 &token{Text: ",", Offset: 26}, 55 &token{Text: "Inc", Offset: 28}, 56 &token{Text: ".", Offset: 31}, 57 }, 58 }, 59 } 60 61 for _, tt := range tests { 62 if got := Tokenize(tt.text); !reflect.DeepEqual(got, tt.want) { 63 t.Errorf("Tokenize(%q) = %+v, want %+v", tt.text, got, tt.want) 64 } 65 } 66} 67 68func TestTokenizer_GenerateHashes(t *testing.T) { 69 tests := []struct { 70 text string 71 sizeFactor int 72 wantHash []uint32 73 wantRanges TokenRanges 74 }{ 75 { 76 text: "", 77 sizeFactor: 1, 78 wantHash: nil, 79 wantRanges: nil, 80 }, 81 { 82 text: "Hashes", 83 sizeFactor: 1, 84 wantHash: []uint32{408116689}, 85 wantRanges: TokenRanges{{Start: 0, End: 1}}, 86 }, 87 { 88 text: "hello world", 89 sizeFactor: 1, 90 wantHash: []uint32{222957957}, 91 wantRanges: TokenRanges{{Start: 0, End: 2}}, 92 }, 93 { 94 text: "Copyright © 2017 Yoyodyne, Inc.", 95 sizeFactor: 3, 96 wantHash: []uint32{2473816729, 966085113, 3025678301, 3199087486, 850352802, 1274745089}, 97 wantRanges: TokenRanges{ 98 {Start: 0, End: 2}, 99 {Start: 1, End: 3}, 100 {Start: 2, End: 4}, 101 {Start: 3, End: 5}, 102 {Start: 4, End: 6}, 103 {Start: 5, End: 7}, 104 }, 105 }, 106 } 107 108 for _, tt := range tests { 109 hash := make(Hash) 110 toks := Tokenize(tt.text) 111 h, tr := toks.GenerateHashes(hash, len(toks)/tt.sizeFactor) 112 if !reflect.DeepEqual(h, tt.wantHash) { 113 t.Errorf("GenerateHashes(hash) = %v, want %v", h, tt.wantHash) 114 } 115 if !reflect.DeepEqual(tr, tt.wantRanges) { 116 t.Errorf("GenerateHashes(ranges) = %v, want %v", tr, tt.wantRanges) 117 } 118 } 119} 120