// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tokenizer

import (
	"reflect"
	"testing"
)

// TestTokenizer_Tokenize verifies that Tokenize splits text into the
// expected tokens with the correct byte offsets.
func TestTokenizer_Tokenize(t *testing.T) {
	tests := []struct {
		text string
		want Tokens
	}{
		{
			text: "Tokenize",
			want: Tokens{&token{Text: "Tokenize", Offset: 0}},
		},
		{
			text: "Hello world",
			want: Tokens{
				&token{Text: "Hello", Offset: 0},
				&token{Text: "world", Offset: 6},
			},
		},
		{
			text: `Goodnight,
Irene
`,
			want: Tokens{
				&token{Text: "Goodnight", Offset: 0},
				&token{Text: ",", Offset: 9},
				&token{Text: "Irene", Offset: 11},
			},
		},
		{
			text: "Copyright © 2017 Yoyodyne, Inc.",
			want: Tokens{
				&token{Text: "Copyright", Offset: 0},
				&token{Text: "©", Offset: 10},
				&token{Text: "2017", Offset: 13},
				&token{Text: "Yoyodyne", Offset: 18},
				&token{Text: ",", Offset: 26},
				&token{Text: "Inc", Offset: 28},
				&token{Text: ".", Offset: 31},
			},
		},
	}

	for _, tt := range tests {
		if got := Tokenize(tt.text); !reflect.DeepEqual(got, tt.want) {
			t.Errorf("Tokenize(%q) = %+v, want %+v", tt.text, got, tt.want)
		}
	}
}

// TestTokenizer_GenerateHashes verifies the hash values and token ranges
// produced by Tokens.GenerateHashes for varying window sizes.
func TestTokenizer_GenerateHashes(t *testing.T) {
	tests := []struct {
		text       string
		sizeFactor int
		wantHash   []uint32
		wantRanges TokenRanges
	}{
		{
			text:       "",
			sizeFactor: 1,
			wantHash:   nil,
			wantRanges: nil,
		},
		{
			text:       "Hashes",
			sizeFactor: 1,
			wantHash:   []uint32{408116689},
			wantRanges: TokenRanges{{Start: 0, End: 1}},
		},
		{
			text:       "hello world",
			sizeFactor: 1,
			wantHash:   []uint32{222957957},
			wantRanges: TokenRanges{{Start: 0, End: 2}},
		},
		{
			text:       "Copyright © 2017 Yoyodyne, Inc.",
			sizeFactor: 3,
			wantHash:   []uint32{2473816729, 966085113, 3025678301, 3199087486, 850352802, 1274745089},
			wantRanges: TokenRanges{
				{Start: 0, End: 2},
				{Start: 1, End: 3},
				{Start: 2, End: 4},
				{Start: 3, End: 5},
				{Start: 4, End: 6},
				{Start: 5, End: 7},
			},
		},
	}

	for _, tt := range tests {
		hash := make(Hash)
		toks := Tokenize(tt.text)
		h, tr := toks.GenerateHashes(hash, len(toks)/tt.sizeFactor)
		if !reflect.DeepEqual(h, tt.wantHash) {
			t.Errorf("GenerateHashes(hash) = %v, want %v", h, tt.wantHash)
		}
		if !reflect.DeepEqual(tr, tt.wantRanges) {
			t.Errorf("GenerateHashes(ranges) = %v, want %v", tr, tt.wantRanges)
		}
	}
}
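
// TestTokenizer_HashesRangesConsistent is an illustrative sketch added for
// documentation purposes and is not part of the original suite. It relies
// only on names already used above (Tokenize, Hash, Tokens.GenerateHashes)
// and on the assumption, exhibited by the table-driven cases above, that
// GenerateHashes returns one token range per hash it emits. It deliberately
// avoids asserting specific hash values.
func TestTokenizer_HashesRangesConsistent(t *testing.T) {
	for _, text := range []string{"", "Hashes", "hello world", "Copyright © 2017 Yoyodyne, Inc."} {
		hash := make(Hash)
		toks := Tokenize(text)
		// Use the full token count as the window size, mirroring the
		// sizeFactor == 1 cases in TestTokenizer_GenerateHashes.
		h, tr := toks.GenerateHashes(hash, len(toks))
		if len(h) != len(tr) {
			t.Errorf("GenerateHashes(%q): got %d hashes and %d ranges; want equal counts", text, len(h), len(tr))
		}
	}
}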