xref: /aosp_15_r20/external/icing/icing/testing/random-string.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TESTING_RANDOM_STRING_H_
16 #define ICING_TESTING_RANDOM_STRING_H_
17 
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <string>
#include <string_view>
#include <vector>
21 
22 namespace icing {
23 namespace lib {
24 
// Alphabet of 62 ASCII alphanumeric characters: the digits '0'-'9', followed by
// the lowercase letters 'a'-'z', followed by the uppercase letters 'A'-'Z'.
// CreateLanguages() draws the characters of every generated word from it.
inline constexpr std::string_view kAlNumAlphabet =
    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

// Average length of word in English is 4.7 characters.
// Used as the mean of the normal distribution from which CreateLanguages()
// draws word lengths.
inline constexpr int kAvgTokenLen = 5;
// Made up value. This results in a fairly reasonable language - the majority of
// generated words are 3-9 characters, ~3% of words are >=20 chars, and the
// longest ones are 27 chars, (roughly consistent with the longest,
// non-contrived English words
// https://en.wikipedia.org/wiki/Longest_word_in_English)
// Used as the standard deviation of the normal distribution from which
// CreateLanguages() draws word lengths.
inline constexpr int kTokenStdDev = 7;
36 
// Returns a string of `len` characters, each drawn uniformly at random (with
// replacement) from `alphabet` using the random engine `*gen`.
//
// alphabet: the set of characters to draw from. If it is empty no character
//           can be drawn, so an empty string is returned regardless of `len`.
// len:      number of characters in the result.
// gen:      random engine; must be non-null because it is dereferenced for
//           every generated character.
template <typename Gen>
std::string RandomString(const std::string_view alphabet, size_t len,
                         Gen* gen) {
  // Guard the empty alphabet: `alphabet.size() - 1` would otherwise wrap
  // around to SIZE_MAX and the distribution would produce out-of-range
  // indices.
  if (alphabet.empty()) {
    return std::string();
  }

  // Inclusive range [0, size - 1] covers every valid index of `alphabet`.
  std::uniform_int_distribution<size_t> index_dist(0u, alphabet.size() - 1);
  std::string result(len, '\0');
  std::generate(result.begin(), result.end(),
                [&]() { return alphabet[index_dist(*gen)]; });
  return result;
}
48 
49 // Creates a vector containing num_words randomly-generated words for use by
50 // documents.
51 template <typename Rand>
CreateLanguages(int num_words,Rand * r)52 std::vector<std::string> CreateLanguages(int num_words, Rand* r) {
53   std::vector<std::string> language;
54   std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
55   while (--num_words >= 0) {
56     int word_length = 0;
57     while (word_length < 1) {
58       word_length = std::round(norm_dist(*r));
59     }
60     language.push_back(RandomString(kAlNumAlphabet, word_length, r));
61   }
62   return language;
63 }
64 
// Returns a vector containing num_terms unique terms. Terms are created in
// non-random order starting with "a" to "z" to "aa" to "zz", etc.
// Only declared in this header; the definition is provided elsewhere.
std::vector<std::string> GenerateUniqueTerms(int num_terms);
68 
69 }  // namespace lib
70 }  // namespace icing
71 
72 #endif  // ICING_TESTING_RANDOM_STRING_H_
73