xref: /aosp_15_r20/external/licenseclassifier/stringclassifier/searchset/tokenizer/tokenizer_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*46c4c49dSIbrahim Kanouchepackage tokenizer
15*46c4c49dSIbrahim Kanouche
16*46c4c49dSIbrahim Kanoucheimport (
17*46c4c49dSIbrahim Kanouche	"reflect"
18*46c4c49dSIbrahim Kanouche	"testing"
19*46c4c49dSIbrahim Kanouche)
20*46c4c49dSIbrahim Kanouche
21*46c4c49dSIbrahim Kanouchefunc TestTokenizer_Tokenize(t *testing.T) {
22*46c4c49dSIbrahim Kanouche	tests := []struct {
23*46c4c49dSIbrahim Kanouche		text string
24*46c4c49dSIbrahim Kanouche		want Tokens
25*46c4c49dSIbrahim Kanouche	}{
26*46c4c49dSIbrahim Kanouche		{
27*46c4c49dSIbrahim Kanouche			text: "Tokenize",
28*46c4c49dSIbrahim Kanouche			want: Tokens{&token{Text: "Tokenize", Offset: 0}},
29*46c4c49dSIbrahim Kanouche		},
30*46c4c49dSIbrahim Kanouche		{
31*46c4c49dSIbrahim Kanouche			text: "Hello world",
32*46c4c49dSIbrahim Kanouche			want: Tokens{
33*46c4c49dSIbrahim Kanouche				&token{Text: "Hello", Offset: 0},
34*46c4c49dSIbrahim Kanouche				&token{Text: "world", Offset: 6},
35*46c4c49dSIbrahim Kanouche			},
36*46c4c49dSIbrahim Kanouche		},
37*46c4c49dSIbrahim Kanouche		{
38*46c4c49dSIbrahim Kanouche			text: `Goodnight,
39*46c4c49dSIbrahim KanoucheIrene
40*46c4c49dSIbrahim Kanouche`,
41*46c4c49dSIbrahim Kanouche			want: Tokens{
42*46c4c49dSIbrahim Kanouche				&token{Text: "Goodnight", Offset: 0},
43*46c4c49dSIbrahim Kanouche				&token{Text: ",", Offset: 9},
44*46c4c49dSIbrahim Kanouche				&token{Text: "Irene", Offset: 11},
45*46c4c49dSIbrahim Kanouche			},
46*46c4c49dSIbrahim Kanouche		},
47*46c4c49dSIbrahim Kanouche		{
48*46c4c49dSIbrahim Kanouche			text: "Copyright © 2017 Yoyodyne, Inc.",
49*46c4c49dSIbrahim Kanouche			want: Tokens{
50*46c4c49dSIbrahim Kanouche				&token{Text: "Copyright", Offset: 0},
51*46c4c49dSIbrahim Kanouche				&token{Text: "©", Offset: 10},
52*46c4c49dSIbrahim Kanouche				&token{Text: "2017", Offset: 13},
53*46c4c49dSIbrahim Kanouche				&token{Text: "Yoyodyne", Offset: 18},
54*46c4c49dSIbrahim Kanouche				&token{Text: ",", Offset: 26},
55*46c4c49dSIbrahim Kanouche				&token{Text: "Inc", Offset: 28},
56*46c4c49dSIbrahim Kanouche				&token{Text: ".", Offset: 31},
57*46c4c49dSIbrahim Kanouche			},
58*46c4c49dSIbrahim Kanouche		},
59*46c4c49dSIbrahim Kanouche	}
60*46c4c49dSIbrahim Kanouche
61*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
62*46c4c49dSIbrahim Kanouche		if got := Tokenize(tt.text); !reflect.DeepEqual(got, tt.want) {
63*46c4c49dSIbrahim Kanouche			t.Errorf("Tokenize(%q) = %+v, want %+v", tt.text, got, tt.want)
64*46c4c49dSIbrahim Kanouche		}
65*46c4c49dSIbrahim Kanouche	}
66*46c4c49dSIbrahim Kanouche}
67*46c4c49dSIbrahim Kanouche
68*46c4c49dSIbrahim Kanouchefunc TestTokenizer_GenerateHashes(t *testing.T) {
69*46c4c49dSIbrahim Kanouche	tests := []struct {
70*46c4c49dSIbrahim Kanouche		text       string
71*46c4c49dSIbrahim Kanouche		sizeFactor int
72*46c4c49dSIbrahim Kanouche		wantHash   []uint32
73*46c4c49dSIbrahim Kanouche		wantRanges TokenRanges
74*46c4c49dSIbrahim Kanouche	}{
75*46c4c49dSIbrahim Kanouche		{
76*46c4c49dSIbrahim Kanouche			text:       "",
77*46c4c49dSIbrahim Kanouche			sizeFactor: 1,
78*46c4c49dSIbrahim Kanouche			wantHash:   nil,
79*46c4c49dSIbrahim Kanouche			wantRanges: nil,
80*46c4c49dSIbrahim Kanouche		},
81*46c4c49dSIbrahim Kanouche		{
82*46c4c49dSIbrahim Kanouche			text:       "Hashes",
83*46c4c49dSIbrahim Kanouche			sizeFactor: 1,
84*46c4c49dSIbrahim Kanouche			wantHash:   []uint32{408116689},
85*46c4c49dSIbrahim Kanouche			wantRanges: TokenRanges{{Start: 0, End: 1}},
86*46c4c49dSIbrahim Kanouche		},
87*46c4c49dSIbrahim Kanouche		{
88*46c4c49dSIbrahim Kanouche			text:       "hello world",
89*46c4c49dSIbrahim Kanouche			sizeFactor: 1,
90*46c4c49dSIbrahim Kanouche			wantHash:   []uint32{222957957},
91*46c4c49dSIbrahim Kanouche			wantRanges: TokenRanges{{Start: 0, End: 2}},
92*46c4c49dSIbrahim Kanouche		},
93*46c4c49dSIbrahim Kanouche		{
94*46c4c49dSIbrahim Kanouche			text:       "Copyright © 2017 Yoyodyne, Inc.",
95*46c4c49dSIbrahim Kanouche			sizeFactor: 3,
96*46c4c49dSIbrahim Kanouche			wantHash:   []uint32{2473816729, 966085113, 3025678301, 3199087486, 850352802, 1274745089},
97*46c4c49dSIbrahim Kanouche			wantRanges: TokenRanges{
98*46c4c49dSIbrahim Kanouche				{Start: 0, End: 2},
99*46c4c49dSIbrahim Kanouche				{Start: 1, End: 3},
100*46c4c49dSIbrahim Kanouche				{Start: 2, End: 4},
101*46c4c49dSIbrahim Kanouche				{Start: 3, End: 5},
102*46c4c49dSIbrahim Kanouche				{Start: 4, End: 6},
103*46c4c49dSIbrahim Kanouche				{Start: 5, End: 7},
104*46c4c49dSIbrahim Kanouche			},
105*46c4c49dSIbrahim Kanouche		},
106*46c4c49dSIbrahim Kanouche	}
107*46c4c49dSIbrahim Kanouche
108*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
109*46c4c49dSIbrahim Kanouche		hash := make(Hash)
110*46c4c49dSIbrahim Kanouche		toks := Tokenize(tt.text)
111*46c4c49dSIbrahim Kanouche		h, tr := toks.GenerateHashes(hash, len(toks)/tt.sizeFactor)
112*46c4c49dSIbrahim Kanouche		if !reflect.DeepEqual(h, tt.wantHash) {
113*46c4c49dSIbrahim Kanouche			t.Errorf("GenerateHashes(hash) = %v, want %v", h, tt.wantHash)
114*46c4c49dSIbrahim Kanouche		}
115*46c4c49dSIbrahim Kanouche		if !reflect.DeepEqual(tr, tt.wantRanges) {
116*46c4c49dSIbrahim Kanouche			t.Errorf("GenerateHashes(ranges) = %v, want %v", tr, tt.wantRanges)
117*46c4c49dSIbrahim Kanouche		}
118*46c4c49dSIbrahim Kanouche	}
119*46c4c49dSIbrahim Kanouche}
120