1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef ICING_TESTING_RANDOM_STRING_H_
16 #define ICING_TESTING_RANDOM_STRING_H_
17
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <string>
#include <string_view>
#include <vector>
21
22 namespace icing {
23 namespace lib {
24
// Alphabet of the 62 ASCII alphanumeric characters: the digits, then the
// lowercase letters, then the uppercase letters.
inline constexpr std::string_view kAlNumAlphabet =
    "0123456789"
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

// Mean token length. The average length of a word in English is 4.7
// characters.
inline constexpr int kAvgTokenLen = 5;

// Standard deviation of the token length. This is a made-up value, but it
// results in a fairly reasonable language: the majority of generated words are
// 3-9 characters, ~3% of words are >=20 chars, and the longest ones are 27
// chars (roughly consistent with the longest non-contrived English words, see
// https://en.wikipedia.org/wiki/Longest_word_in_English).
inline constexpr int kTokenStdDev = 7;
36
// Returns a string of `len` characters, each one picked independently and
// uniformly at random (with replacement) from `alphabet`.
//
// `gen` must point to a valid random number engine usable with
// std::uniform_int_distribution (e.g. std::mt19937); it is dereferenced for
// every generated character.
//
// If `alphabet` is empty there is nothing to pick from, so an empty string is
// returned regardless of `len`.
template <typename Gen>
std::string RandomString(const std::string_view alphabet, size_t len,
                         Gen* gen) {
  // Without this guard `alphabet.size() - 1` would wrap around to SIZE_MAX,
  // and the alphabet lookup below would read far out of bounds.
  if (alphabet.empty()) {
    return std::string();
  }

  std::uniform_int_distribution<size_t> uniform(0u, alphabet.size() - 1);
  std::string result(len, '\0');
  // The pointer and the view are cheap to copy, so capture them by value; the
  // distribution is stateful and is captured by reference.
  std::generate(
      result.begin(), result.end(),
      [gen, alphabet, &uniform]() { return alphabet[uniform(*gen)]; });

  return result;
}
48
49 // Creates a vector containing num_words randomly-generated words for use by
50 // documents.
51 template <typename Rand>
CreateLanguages(int num_words,Rand * r)52 std::vector<std::string> CreateLanguages(int num_words, Rand* r) {
53 std::vector<std::string> language;
54 std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
55 while (--num_words >= 0) {
56 int word_length = 0;
57 while (word_length < 1) {
58 word_length = std::round(norm_dist(*r));
59 }
60 language.push_back(RandomString(kAlNumAlphabet, word_length, r));
61 }
62 return language;
63 }
64
// Returns a vector containing num_terms unique terms. Terms are created in
// non-random order, starting with "a" through "z", then "aa" through "zz",
// etc.
//
// Only the declaration appears in this header; the definition is not shown
// here.
// NOTE(review): behavior for num_terms <= 0 is not visible from this
// declaration - presumably an empty vector; confirm against the definition.
std::vector<std::string> GenerateUniqueTerms(int num_terms);
68
69 } // namespace lib
70 } // namespace icing
71
72 #endif // ICING_TESTING_RANDOM_STRING_H_
73