1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_RELEVANT_SCRIPT_FEATURE_H_ 18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_RELEVANT_SCRIPT_FEATURE_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <memory> 21*993b0882SAndroid Build Coastguard Worker 22*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/feature-extractor.h" 23*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/task-context.h" 24*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/workspace.h" 25*993b0882SAndroid Build Coastguard Worker #include "lang_id/features/light-sentence-features.h" 26*993b0882SAndroid Build Coastguard Worker #include "lang_id/light-sentence.h" 27*993b0882SAndroid Build Coastguard Worker #include "lang_id/script/script-detector.h" 28*993b0882SAndroid Build Coastguard Worker 29*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 30*993b0882SAndroid Build Coastguard Worker namespace mobile { 31*993b0882SAndroid Build Coastguard Worker namespace lang_id { 32*993b0882SAndroid Build Coastguard Worker 33*993b0882SAndroid Build Coastguard Worker // Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode 34*993b0882SAndroid Build Coastguard Worker // script (see below): each such feature indicates the script and the ratio of 35*993b0882SAndroid Build Coastguard Worker // UTF8 characters in that script, in the given sentence. 36*993b0882SAndroid Build Coastguard Worker // 37*993b0882SAndroid Build Coastguard Worker // What is a relevant script? Recognizing all 100+ Unicode scripts would 38*993b0882SAndroid Build Coastguard Worker // require too much code size and runtime. Instead, we focus only on a few 39*993b0882SAndroid Build Coastguard Worker // scripts that communicate a lot of language information: e.g., the use of 40*993b0882SAndroid Build Coastguard Worker // Hiragana characters almost always indicates Japanese, so Hiragana is a 41*993b0882SAndroid Build Coastguard Worker // "relevant" script for us. The Latin script is used by dozens of language, so 42*993b0882SAndroid Build Coastguard Worker // Latin is not relevant in this context. 43*993b0882SAndroid Build Coastguard Worker class RelevantScriptFeature : public LightSentenceFeature { 44*993b0882SAndroid Build Coastguard Worker public: 45*993b0882SAndroid Build Coastguard Worker bool Setup(TaskContext *context) override; 46*993b0882SAndroid Build Coastguard Worker bool Init(TaskContext *context) override; 47*993b0882SAndroid Build Coastguard Worker 48*993b0882SAndroid Build Coastguard Worker // Appends the features computed from the sentence to the feature vector. 49*993b0882SAndroid Build Coastguard Worker void Evaluate(const WorkspaceSet &workspaces, 50*993b0882SAndroid Build Coastguard Worker const LightSentence &sentence, 51*993b0882SAndroid Build Coastguard Worker FeatureVector *result) const override; 52*993b0882SAndroid Build Coastguard Worker 53*993b0882SAndroid Build Coastguard Worker SAFTM_DEFINE_REGISTRATION_METHOD("continuous-bag-of-relevant-scripts", 54*993b0882SAndroid Build Coastguard Worker RelevantScriptFeature); 55*993b0882SAndroid Build Coastguard Worker 56*993b0882SAndroid Build Coastguard Worker private: 57*993b0882SAndroid Build Coastguard Worker // Detects script of individual UTF8 characters. 58*993b0882SAndroid Build Coastguard Worker std::unique_ptr<ScriptDetector> script_detector_; 59*993b0882SAndroid Build Coastguard Worker 60*993b0882SAndroid Build Coastguard Worker // Current model supports scripts in [0, num_supported_scripts_). 61*993b0882SAndroid Build Coastguard Worker int num_supported_scripts_ = 0; 62*993b0882SAndroid Build Coastguard Worker }; 63*993b0882SAndroid Build Coastguard Worker 64*993b0882SAndroid Build Coastguard Worker } // namespace lang_id 65*993b0882SAndroid Build Coastguard Worker } // namespace mobile 66*993b0882SAndroid Build Coastguard Worker } // namespace nlp_saft 67*993b0882SAndroid Build Coastguard Worker 68*993b0882SAndroid Build Coastguard Worker #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_RELEVANT_SCRIPT_FEATURE_H_ 69