1 // Copyright (C) 2020 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_ 16 #define ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_ 17 18 #include <cstdint> 19 #include <limits> 20 #include <type_traits> 21 22 #include "icing/legacy/core/icing-packed-pod.h" 23 24 namespace icing { 25 namespace lib { 26 27 // This is the cache entity of corpus-associated scores. The ground-truth data 28 // is stored somewhere else. The cache includes: 29 // 1. Number of documents contained in the corpus. 30 // Positive values are required. 31 // 2. The sum of the documents' lengths, in number of tokens. 32 class CorpusAssociatedScoreData { 33 public: 34 explicit CorpusAssociatedScoreData(int num_docs = 0, 35 int64_t sum_length_in_tokens = 0) sum_length_in_tokens_(sum_length_in_tokens)36 : sum_length_in_tokens_(sum_length_in_tokens), num_docs_(num_docs) {} 37 38 bool operator==(const CorpusAssociatedScoreData& other) const { 39 return num_docs_ == other.num_docs() && 40 sum_length_in_tokens_ == other.sum_length_in_tokens(); 41 } 42 num_docs()43 uint32_t num_docs() const { return num_docs_; } set_num_docs(uint32_t val)44 void set_num_docs(uint32_t val) { num_docs_ = val; } 45 sum_length_in_tokens()46 uint64_t sum_length_in_tokens() const { return sum_length_in_tokens_; } set_sum_length_in_tokens(uint64_t val)47 void set_sum_length_in_tokens(uint64_t val) { sum_length_in_tokens_ = val; } 48 average_doc_length_in_tokens()49 float average_doc_length_in_tokens() const { 50 return sum_length_in_tokens_ / (1.0f + num_docs_); 51 } 52 53 // Adds a new document. 54 // Adds the document's length to the total length of the corpus, 55 // sum_length_in_tokens_. AddDocument(uint32_t doc_length_in_tokens)56 void AddDocument(uint32_t doc_length_in_tokens) { 57 ++num_docs_; 58 sum_length_in_tokens_ = 59 (std::numeric_limits<int>::max() - doc_length_in_tokens < 60 sum_length_in_tokens_) 61 ? std::numeric_limits<int>::max() 62 : sum_length_in_tokens_ + doc_length_in_tokens; 63 } 64 65 private: 66 // The sum total of the length of all documents in the corpus. 67 int sum_length_in_tokens_; 68 int num_docs_; 69 } __attribute__((packed)); 70 71 static_assert(sizeof(CorpusAssociatedScoreData) == 8, 72 "Size of CorpusAssociatedScoreData should be 8"); 73 static_assert(icing_is_packed_pod<CorpusAssociatedScoreData>::value, 74 "go/icing-ubsan"); 75 76 } // namespace lib 77 } // namespace icing 78 79 #endif // ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_ 80