//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier3;
enum TokenizationType : int {
  // Zero value; marks a tokenization type that has not been set to a valid
  // option.
  INVALID_TOKENIZATION_TYPE = 0,

  // Use the internal tokenizer for tokenization.
  INTERNAL_TOKENIZER = 1,

  // Use ICU for tokenization.
  ICU = 2,

  // First apply ICU tokenization. Then identify stretches of tokens
  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
  // and re-tokenize them using the internal tokenizer.
  MIXED = 3,

  // Tokenizer parsing out numbers, words and separators.
  LETTER_DIGIT = 4,
}

// Role of the codepoints in the range.
//
// The enum is declared in its own sub-namespace; the table below refers to it
// as TokenizationCodepointRange_.Role.
//
// NOTE(review): the numeric values compose like bit flags (SPLIT_BEFORE = 1,
// SPLIT_AFTER = 2, DISCARD_CODEPOINT = 4), with TOKEN_SEPARATOR (3) and
// WHITESPACE_SEPARATOR (7) being combinations of them. Confirm against the
// tokenizer implementation before relying on bitwise tests.
namespace libtextclassifier3.TokenizationCodepointRange_;
enum Role : int {
  // Concatenates the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Splits a run of codepoints before the current codepoint.
  SPLIT_BEFORE = 1,

  // Splits a run of codepoints after the current codepoint.
  SPLIT_AFTER = 2,

  // Each codepoint will be a separate token. Good e.g. for Chinese
  // characters.
  // Numerically equal to SPLIT_BEFORE | SPLIT_AFTER.
  TOKEN_SEPARATOR = 3,

  // Discards the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the characters and discards them. Good e.g. for the space
  // character.
  // Numerically equal to SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT.
  WHITESPACE_SEPARATOR = 7,
}

// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier3;
table TokenizationCodepointRange {
  // First codepoint of the range (inclusive).
  start:int;

  // End of the range (exclusive).
  end:int;

  // How codepoints inside [start, end) affect token boundaries.
  role:TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}