xref: /aosp_15_r20/external/libtextclassifier/native/utils/tokenizer.fbs (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker//
2*993b0882SAndroid Build Coastguard Worker// Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker//
4*993b0882SAndroid Build Coastguard Worker// Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker// you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker// You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker//
8*993b0882SAndroid Build Coastguard Worker//      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker//
10*993b0882SAndroid Build Coastguard Worker// Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker// distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker// See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker// limitations under the License.
15*993b0882SAndroid Build Coastguard Worker//
16*993b0882SAndroid Build Coastguard Worker
// Selects which tokenization strategy the model applies to the input text.
namespace libtextclassifier3;
enum TokenizationType : int {
  // Zero value; marks an invalid tokenization type.
  INVALID_TOKENIZATION_TYPE = 0,

  // Tokenize with the internal tokenizer.
  INTERNAL_TOKENIZER = 1,

  // Tokenize with ICU.
  ICU = 2,

  // Run ICU tokenization first. Afterwards, find stretches of tokens made up
  // solely of codepoints from internal_tokenizer_codepoint_ranges and
  // tokenize those stretches again with the internal tokenizer.
  MIXED = 3,

  // Tokenizer that parses out numbers, words and separators.
  LETTER_DIGIT = 4,
}
36*993b0882SAndroid Build Coastguard Worker
// The role that the codepoints of a range play during tokenization.
namespace libtextclassifier3.TokenizationCodepointRange_;
enum Role : int {
  // The codepoint is appended to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // The run of codepoints is split before the current codepoint.
  SPLIT_BEFORE = 1,

  // The run of codepoints is split after the current codepoint.
  SPLIT_AFTER = 2,

  // Every codepoint becomes its own token. Useful e.g. for Chinese
  // characters. Numerically equal to SPLIT_BEFORE | SPLIT_AFTER.
  TOKEN_SEPARATOR = 3,

  // The codepoint is discarded.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Split on the characters and discard them. Useful e.g. for the space
  // character. Numerically equal to
  // SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT.
  // NOTE(review): presumably the values are consumed as a bitmask; confirm
  // against the tokenizer implementation.
  WHITESPACE_SEPARATOR = 7,
}
61*993b0882SAndroid Build Coastguard Worker
// A half-open codepoint range [start, end) paired with its tokenization role.
namespace libtextclassifier3;
table TokenizationCodepointRange {
  // Inclusive lower bound of the codepoint range.
  start:int;

  // Exclusive upper bound of the codepoint range.
  end:int;

  // Role assigned to the codepoints that fall inside the range.
  role:TokenizationCodepointRange_.Role;

  // Integer identifier of the script denoted by this range. Negative values
  // are reserved for the Tokenizer's internal use.
  script_id:int;
}
73*993b0882SAndroid Build Coastguard Worker
74