xref: /aosp_15_r20/external/libtextclassifier/native/actions/feature-processor.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "actions/feature-processor.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
20*993b0882SAndroid Build Coastguard Worker namespace {
BuildTokenFeatureExtractorOptions(const ActionsTokenFeatureProcessorOptions * const options)21*993b0882SAndroid Build Coastguard Worker TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
22*993b0882SAndroid Build Coastguard Worker     const ActionsTokenFeatureProcessorOptions* const options) {
23*993b0882SAndroid Build Coastguard Worker   TokenFeatureExtractorOptions extractor_options;
24*993b0882SAndroid Build Coastguard Worker   extractor_options.num_buckets = options->num_buckets();
25*993b0882SAndroid Build Coastguard Worker   if (options->chargram_orders() != nullptr) {
26*993b0882SAndroid Build Coastguard Worker     for (int order : *options->chargram_orders()) {
27*993b0882SAndroid Build Coastguard Worker       extractor_options.chargram_orders.push_back(order);
28*993b0882SAndroid Build Coastguard Worker     }
29*993b0882SAndroid Build Coastguard Worker   }
30*993b0882SAndroid Build Coastguard Worker   extractor_options.max_word_length = options->max_token_length();
31*993b0882SAndroid Build Coastguard Worker   extractor_options.extract_case_feature = options->extract_case_feature();
32*993b0882SAndroid Build Coastguard Worker   extractor_options.unicode_aware_features = options->unicode_aware_features();
33*993b0882SAndroid Build Coastguard Worker   extractor_options.extract_selection_mask_feature = false;
34*993b0882SAndroid Build Coastguard Worker   if (options->regexp_features() != nullptr) {
35*993b0882SAndroid Build Coastguard Worker     for (const auto regexp_feature : *options->regexp_features()) {
36*993b0882SAndroid Build Coastguard Worker       extractor_options.regexp_features.push_back(regexp_feature->str());
37*993b0882SAndroid Build Coastguard Worker     }
38*993b0882SAndroid Build Coastguard Worker   }
39*993b0882SAndroid Build Coastguard Worker   extractor_options.remap_digits = options->remap_digits();
40*993b0882SAndroid Build Coastguard Worker   extractor_options.lowercase_tokens = options->lowercase_tokens();
41*993b0882SAndroid Build Coastguard Worker   return extractor_options;
42*993b0882SAndroid Build Coastguard Worker }
43*993b0882SAndroid Build Coastguard Worker }  // namespace
44*993b0882SAndroid Build Coastguard Worker 
CreateTokenizer(const ActionsTokenizerOptions * options,const UniLib * unilib)45*993b0882SAndroid Build Coastguard Worker std::unique_ptr<Tokenizer> CreateTokenizer(
46*993b0882SAndroid Build Coastguard Worker     const ActionsTokenizerOptions* options, const UniLib* unilib) {
47*993b0882SAndroid Build Coastguard Worker   std::vector<const TokenizationCodepointRange*> codepoint_config;
48*993b0882SAndroid Build Coastguard Worker   if (options->tokenization_codepoint_config() != nullptr) {
49*993b0882SAndroid Build Coastguard Worker     codepoint_config.insert(codepoint_config.end(),
50*993b0882SAndroid Build Coastguard Worker                             options->tokenization_codepoint_config()->begin(),
51*993b0882SAndroid Build Coastguard Worker                             options->tokenization_codepoint_config()->end());
52*993b0882SAndroid Build Coastguard Worker   }
53*993b0882SAndroid Build Coastguard Worker   std::vector<const CodepointRange*> internal_codepoint_config;
54*993b0882SAndroid Build Coastguard Worker   if (options->internal_tokenizer_codepoint_ranges() != nullptr) {
55*993b0882SAndroid Build Coastguard Worker     internal_codepoint_config.insert(
56*993b0882SAndroid Build Coastguard Worker         internal_codepoint_config.end(),
57*993b0882SAndroid Build Coastguard Worker         options->internal_tokenizer_codepoint_ranges()->begin(),
58*993b0882SAndroid Build Coastguard Worker         options->internal_tokenizer_codepoint_ranges()->end());
59*993b0882SAndroid Build Coastguard Worker   }
60*993b0882SAndroid Build Coastguard Worker   const bool tokenize_on_script_change =
61*993b0882SAndroid Build Coastguard Worker       options->tokenization_codepoint_config() != nullptr &&
62*993b0882SAndroid Build Coastguard Worker       options->tokenize_on_script_change();
63*993b0882SAndroid Build Coastguard Worker   return std::unique_ptr<Tokenizer>(new Tokenizer(
64*993b0882SAndroid Build Coastguard Worker       options->type(), unilib, codepoint_config, internal_codepoint_config,
65*993b0882SAndroid Build Coastguard Worker       tokenize_on_script_change, options->icu_preserve_whitespace_tokens()));
66*993b0882SAndroid Build Coastguard Worker }
67*993b0882SAndroid Build Coastguard Worker 
ActionsFeatureProcessor(const ActionsTokenFeatureProcessorOptions * options,const UniLib * unilib)68*993b0882SAndroid Build Coastguard Worker ActionsFeatureProcessor::ActionsFeatureProcessor(
69*993b0882SAndroid Build Coastguard Worker     const ActionsTokenFeatureProcessorOptions* options, const UniLib* unilib)
70*993b0882SAndroid Build Coastguard Worker     : options_(options),
71*993b0882SAndroid Build Coastguard Worker       tokenizer_(CreateTokenizer(options->tokenizer_options(), unilib)),
72*993b0882SAndroid Build Coastguard Worker       token_feature_extractor_(BuildTokenFeatureExtractorOptions(options),
73*993b0882SAndroid Build Coastguard Worker                                unilib) {}
74*993b0882SAndroid Build Coastguard Worker 
GetTokenEmbeddingSize() const75*993b0882SAndroid Build Coastguard Worker int ActionsFeatureProcessor::GetTokenEmbeddingSize() const {
76*993b0882SAndroid Build Coastguard Worker   return options_->embedding_size() +
77*993b0882SAndroid Build Coastguard Worker          token_feature_extractor_.DenseFeaturesCount();
78*993b0882SAndroid Build Coastguard Worker }
79*993b0882SAndroid Build Coastguard Worker 
AppendFeatures(const std::vector<int> & sparse_features,const std::vector<float> & dense_features,const EmbeddingExecutor * embedding_executor,std::vector<float> * output_features) const80*993b0882SAndroid Build Coastguard Worker bool ActionsFeatureProcessor::AppendFeatures(
81*993b0882SAndroid Build Coastguard Worker     const std::vector<int>& sparse_features,
82*993b0882SAndroid Build Coastguard Worker     const std::vector<float>& dense_features,
83*993b0882SAndroid Build Coastguard Worker     const EmbeddingExecutor* embedding_executor,
84*993b0882SAndroid Build Coastguard Worker     std::vector<float>* output_features) const {
85*993b0882SAndroid Build Coastguard Worker   // Embed the sparse features, appending them directly to the output.
86*993b0882SAndroid Build Coastguard Worker   const int embedding_size = options_->embedding_size();
87*993b0882SAndroid Build Coastguard Worker   output_features->resize(output_features->size() + embedding_size);
88*993b0882SAndroid Build Coastguard Worker   float* output_features_end =
89*993b0882SAndroid Build Coastguard Worker       output_features->data() + output_features->size();
90*993b0882SAndroid Build Coastguard Worker   if (!embedding_executor->AddEmbedding(
91*993b0882SAndroid Build Coastguard Worker           TensorView<int>(sparse_features.data(),
92*993b0882SAndroid Build Coastguard Worker                           {static_cast<int>(sparse_features.size())}),
93*993b0882SAndroid Build Coastguard Worker           /*dest=*/output_features_end - embedding_size,
94*993b0882SAndroid Build Coastguard Worker           /*dest_size=*/embedding_size)) {
95*993b0882SAndroid Build Coastguard Worker     TC3_LOG(ERROR) << "Could not embed token's sparse features.";
96*993b0882SAndroid Build Coastguard Worker     return false;
97*993b0882SAndroid Build Coastguard Worker   }
98*993b0882SAndroid Build Coastguard Worker 
99*993b0882SAndroid Build Coastguard Worker   // Append the dense features to the output.
100*993b0882SAndroid Build Coastguard Worker   output_features->insert(output_features->end(), dense_features.begin(),
101*993b0882SAndroid Build Coastguard Worker                           dense_features.end());
102*993b0882SAndroid Build Coastguard Worker   return true;
103*993b0882SAndroid Build Coastguard Worker }
104*993b0882SAndroid Build Coastguard Worker 
AppendTokenFeatures(const Token & token,const EmbeddingExecutor * embedding_executor,std::vector<float> * output_features) const105*993b0882SAndroid Build Coastguard Worker bool ActionsFeatureProcessor::AppendTokenFeatures(
106*993b0882SAndroid Build Coastguard Worker     const Token& token, const EmbeddingExecutor* embedding_executor,
107*993b0882SAndroid Build Coastguard Worker     std::vector<float>* output_features) const {
108*993b0882SAndroid Build Coastguard Worker   // Extract the sparse and dense features.
109*993b0882SAndroid Build Coastguard Worker   std::vector<int> sparse_features;
110*993b0882SAndroid Build Coastguard Worker   std::vector<float> dense_features;
111*993b0882SAndroid Build Coastguard Worker   if (!token_feature_extractor_.Extract(token, /*(unused) is_in_span=*/false,
112*993b0882SAndroid Build Coastguard Worker                                         &sparse_features, &dense_features)) {
113*993b0882SAndroid Build Coastguard Worker     TC3_LOG(ERROR) << "Could not extract token's features.";
114*993b0882SAndroid Build Coastguard Worker     return false;
115*993b0882SAndroid Build Coastguard Worker   }
116*993b0882SAndroid Build Coastguard Worker   return AppendFeatures(sparse_features, dense_features, embedding_executor,
117*993b0882SAndroid Build Coastguard Worker                         output_features);
118*993b0882SAndroid Build Coastguard Worker }
119*993b0882SAndroid Build Coastguard Worker 
AppendTokenFeatures(const std::vector<Token> & tokens,const EmbeddingExecutor * embedding_executor,std::vector<float> * output_features) const120*993b0882SAndroid Build Coastguard Worker bool ActionsFeatureProcessor::AppendTokenFeatures(
121*993b0882SAndroid Build Coastguard Worker     const std::vector<Token>& tokens,
122*993b0882SAndroid Build Coastguard Worker     const EmbeddingExecutor* embedding_executor,
123*993b0882SAndroid Build Coastguard Worker     std::vector<float>* output_features) const {
124*993b0882SAndroid Build Coastguard Worker   for (const Token& token : tokens) {
125*993b0882SAndroid Build Coastguard Worker     if (!AppendTokenFeatures(token, embedding_executor, output_features)) {
126*993b0882SAndroid Build Coastguard Worker       return false;
127*993b0882SAndroid Build Coastguard Worker     }
128*993b0882SAndroid Build Coastguard Worker   }
129*993b0882SAndroid Build Coastguard Worker   return true;
130*993b0882SAndroid Build Coastguard Worker }
131*993b0882SAndroid Build Coastguard Worker 
132*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
133