xref: /aosp_15_r20/external/licenseclassifier/stringclassifier/searchset/tokenizer/tokenizer_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14package tokenizer
15
16import (
17	"reflect"
18	"testing"
19)
20
21func TestTokenizer_Tokenize(t *testing.T) {
22	tests := []struct {
23		text string
24		want Tokens
25	}{
26		{
27			text: "Tokenize",
28			want: Tokens{&token{Text: "Tokenize", Offset: 0}},
29		},
30		{
31			text: "Hello world",
32			want: Tokens{
33				&token{Text: "Hello", Offset: 0},
34				&token{Text: "world", Offset: 6},
35			},
36		},
37		{
38			text: `Goodnight,
39Irene
40`,
41			want: Tokens{
42				&token{Text: "Goodnight", Offset: 0},
43				&token{Text: ",", Offset: 9},
44				&token{Text: "Irene", Offset: 11},
45			},
46		},
47		{
48			text: "Copyright © 2017 Yoyodyne, Inc.",
49			want: Tokens{
50				&token{Text: "Copyright", Offset: 0},
51				&token{Text: "©", Offset: 10},
52				&token{Text: "2017", Offset: 13},
53				&token{Text: "Yoyodyne", Offset: 18},
54				&token{Text: ",", Offset: 26},
55				&token{Text: "Inc", Offset: 28},
56				&token{Text: ".", Offset: 31},
57			},
58		},
59	}
60
61	for _, tt := range tests {
62		if got := Tokenize(tt.text); !reflect.DeepEqual(got, tt.want) {
63			t.Errorf("Tokenize(%q) = %+v, want %+v", tt.text, got, tt.want)
64		}
65	}
66}
67
68func TestTokenizer_GenerateHashes(t *testing.T) {
69	tests := []struct {
70		text       string
71		sizeFactor int
72		wantHash   []uint32
73		wantRanges TokenRanges
74	}{
75		{
76			text:       "",
77			sizeFactor: 1,
78			wantHash:   nil,
79			wantRanges: nil,
80		},
81		{
82			text:       "Hashes",
83			sizeFactor: 1,
84			wantHash:   []uint32{408116689},
85			wantRanges: TokenRanges{{Start: 0, End: 1}},
86		},
87		{
88			text:       "hello world",
89			sizeFactor: 1,
90			wantHash:   []uint32{222957957},
91			wantRanges: TokenRanges{{Start: 0, End: 2}},
92		},
93		{
94			text:       "Copyright © 2017 Yoyodyne, Inc.",
95			sizeFactor: 3,
96			wantHash:   []uint32{2473816729, 966085113, 3025678301, 3199087486, 850352802, 1274745089},
97			wantRanges: TokenRanges{
98				{Start: 0, End: 2},
99				{Start: 1, End: 3},
100				{Start: 2, End: 4},
101				{Start: 3, End: 5},
102				{Start: 4, End: 6},
103				{Start: 5, End: 7},
104			},
105		},
106	}
107
108	for _, tt := range tests {
109		hash := make(Hash)
110		toks := Tokenize(tt.text)
111		h, tr := toks.GenerateHashes(hash, len(toks)/tt.sizeFactor)
112		if !reflect.DeepEqual(h, tt.wantHash) {
113			t.Errorf("GenerateHashes(hash) = %v, want %v", h, tt.wantHash)
114		}
115		if !reflect.DeepEqual(tr, tt.wantRanges) {
116			t.Errorf("GenerateHashes(ranges) = %v, want %v", tr, tt.wantRanges)
117		}
118	}
119}
120