// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_MONKEY_TEST_MONKEY_TEST_COMMON_WORDS_H_
#define ICING_MONKEY_TEST_MONKEY_TEST_COMMON_WORDS_H_

#include <array>
#include <string_view>

namespace icing {
namespace lib {

// A bag of 1000 English words used for creating random documents. Only words
// that are at least 3 letters long are included (that's kPrefixLength) so that
// a prefix query can easily be formed from any word in a random document.
//
// The variable is declared `inline constexpr` rather than `static constexpr`
// so that every translation unit including this header shares one definition
// of the table instead of carrying its own private copy.
//
// Data source:
// https://chromium.googlesource.com/chromium/src/+/HEAD/components/url_formatter/spoof_checks/common_words/data/
inline constexpr std::array<std::string_view, 1000> kCommonWords = {
    "the",          "and",           "for",           "that",
    "this",         "with",          "you",           "not",
    "are",          "from",          "your",          "all",
    "have",         "new",           "more",          "was",
    "will",         "home",          "can",           "about",
    "page",         "has",           "search",        "free",
    "but",          "our",           "one",           "other",
    "information",  "time",          "they",          "site",
    "may",          "what",          "which",         "their",
    "news",         "out",           "use",           "any",
    "there",        "see",           "only",          "his",
    "when",         "contact",       "here",          "business",
    "who",          "web",           "also",          "now",
    "help",         "get",           "view",          "online",
    "first",        "been",          "would",         "how",
    "were",         "services",      "some",          "these",
    "click",        "its",           "like",          "service",
    "than",         "find",          "price",         "date",
    "back",         "top",           "people",        "had",
    "list",         "name",          "just",          "over",
    "state",        "year",          "day",           "into",
    "email",        "two",           "health",        "world",
    "next",         "used",          "work",          "last",
    "most",         "products",      "music",         "buy",
    "data",         "make",          "them",          "should",
    "product",      "system",        "post",          "her",
    "city",         "add",           "policy",        "number",
    "such",         "please",        "available",     "copyright",
    "support",      "message",       "after",         "best",
    "software",     "then",          "jan",           "good",
    "video",        "well",          "where",         "info",
    "rights",       "public",        "books",         "high",
    "school",       "through",       "each",          "links",
    "she",          "review",        "years",         "order",
    "very",         "privacy",       "book",          "items",
    "company",      "read",          "group",         "sex",
    "need",         "many",          "user",          "said",
    "does",         "set",           "under",         "general",
    "research",     "university",    "january",       "mail",
    "full",         "map",           "reviews",       "program",
    "life",         "know",          "games",         "way",
    "days",         "management",    "part",          "could",
    "great",        "united",        "hotel",         "real",
    "item",         "international", "center",        "must",
    "store",        "travel",        "comments",      "made",
    "development",  "report",        "off",           "member",
    "details",      "line",          "terms",         "before",
    "hotels",       "did",           "send",          "right",
    "type",         "because",       "local",         "those",
    "using",        "results",       "office",        "education",
    "national",     "car",           "design",        "take",
    "posted",       "internet",      "address",       "community",
    "within",       "states",        "area",          "want",
    "phone",        "dvd",           "shipping",      "reserved",
    "subject",      "between",       "forum",         "family",
    "long",         "based",         "code",          "show",
    "even",         "black",         "check",         "special",
    "prices",       "website",       "index",         "being",
    "women",        "much",          "sign",          "file",
    "link",         "open",          "today",         "technology",
    "south",        "case",          "project",       "same",
    "pages",        "version",       "section",       "own",
    "found",        "sports",        "house",         "related",
    "security",     "both",          "county",        "american",
    "photo",        "game",          "members",       "power",
    "while",        "care",          "network",       "down",
    "computer",     "systems",       "three",         "total",
    "place",        "end",           "following",     "download",
    "him",          "without",       "per",           "access",
    "think",        "north",         "resources",     "current",
    "posts",        "big",           "media",         "law",
    "control",      "water",         "history",       "pictures",
    "size",         "art",           "personal",      "since",
    "including",    "guide",         "shop",          "directory",
    "board",        "location",      "change",        "white",
    "text",         "small",         "rating",        "rate",
    "government",   "children",      "during",        "usa",
    "return",       "students",      "shopping",      "account",
    "times",        "sites",         "level",         "digital",
    "profile",      "previous",      "form",          "events",
    "love",         "old",           "john",          "main",
    "call",         "hours",         "image",         "department",
    "title",        "description",   "non",           "insurance",
    "another",      "why",           "shall",         "property",
    "class",        "still",         "money",         "quality",
    "every",        "listing",       "content",       "country",
    "private",      "little",        "visit",         "save",
    "tools",        "low",           "reply",         "customer",
    "december",     "compare",       "movies",        "include",
    "college",      "value",         "article",       "york",
    "man",          "card",          "jobs",          "provide",
    "food",         "source",        "author",        "different",
    "press",        "learn",         "sale",          "around",
    "print",        "course",        "job",           "canada",
    "process",      "teen",          "room",          "stock",
    "training",     "too",           "credit",        "point",
    "join",         "science",       "men",           "categories",
    "advanced",     "west",          "sales",         "look",
    "english",      "left",          "team",          "estate",
    "box",          "conditions",    "select",        "windows",
    "photos",       "gay",           "thread",        "week",
    "category",     "note",          "live",          "large",
    "gallery",      "table",         "register",      "however",
    "june",         "october",       "november",      "market",
    "library",      "really",        "action",        "start",
    "series",       "model",         "features",      "air",
    "industry",     "plan",          "human",         "provided",
    "yes",          "required",      "second",        "hot",
    "accessories",  "cost",          "movie",         "forums",
    "march",        "september",     "better",        "say",
    "questions",    "july",          "going",         "medical",
    "test",         "friend",        "come",          "dec",
    "server",       "study",         "application",   "cart",
    "staff",        "articles",      "san",           "feedback",
    "again",        "play",          "looking",       "issues",
    "april",        "never",         "users",         "complete",
    "street",       "topic",         "comment",       "financial",
    "things",       "working",       "against",       "standard",
    "tax",          "person",        "below",         "mobile",
    "less",         "got",           "blog",          "party",
    "payment",      "equipment",     "login",         "student",
    "let",          "programs",      "offers",        "legal",
    "above",        "recent",        "park",          "stores",
    "side",         "act",           "problem",       "red",
    "give",         "memory",        "performance",   "social",
    "august",       "quote",         "language",      "story",
    "sell",         "options",       "experience",    "rates",
    "create",       "key",           "body",          "young",
    "america",      "important",     "field",         "few",
    "east",         "paper",         "single",        "age",
    "activities",   "club",          "example",       "girls",
    "additional",   "password",      "latest",        "something",
    "road",         "gift",          "question",      "changes",
    "night",        "hard",          "texas",         "oct",
    "pay",          "four",          "poker",         "status",
    "browse",       "issue",         "range",         "building",
    "seller",       "court",         "february",      "always",
    "result",       "audio",         "light",         "write",
    "war",          "nov",           "offer",         "blue",
    "groups",       "easy",          "given",         "files",
    "event",        "release",       "analysis",      "request",
    "fax",          "china",         "making",        "picture",
    "needs",        "possible",      "might",         "professional",
    "yet",          "month",         "major",         "star",
    "areas",        "future",        "space",         "committee",
    "hand",         "sun",           "cards",         "problems",
    "london",       "washington",    "meeting",       "rss",
    "become",       "interest",      "child",         "keep",
    "enter",        "california",    "porn",          "share",
    "similar",      "garden",        "schools",       "million",
    "added",        "reference",     "companies",     "listed",
    "baby",         "learning",      "energy",        "run",
    "delivery",     "net",           "popular",       "term",
    "film",         "stories",       "put",           "computers",
    "journal",      "reports",       "try",           "welcome",
    "central",      "images",        "president",     "notice",
    "god",          "original",      "head",          "radio",
    "until",        "cell",          "color",         "self",
    "council",      "away",          "includes",      "track",
    "australia",    "discussion",    "archive",       "once",
    "others",       "entertainment", "agreement",     "format",
    "least",        "society",       "months",        "log",
    "safety",       "friends",       "sure",          "faq",
    "trade",        "edition",       "cars",          "messages",
    "marketing",    "tell",          "further",       "updated",
    "association",  "able",          "having",        "provides",
    "david",        "fun",           "already",       "green",
    "studies",      "close",         "common",        "drive",
    "specific",     "several",       "gold",          "feb",
    "living",       "sep",           "collection",    "called",
    "short",        "arts",          "lot",           "ask",
    "display",      "limited",       "powered",       "solutions",
    "means",        "director",      "daily",         "beach",
    "past",         "natural",       "whether",       "due",
    "electronics",  "five",          "upon",          "period",
    "planning",     "database",      "says",          "official",
    "weather",      "mar",           "land",          "average",
    "done",         "technical",     "window",        "france",
    "pro",          "region",        "island",        "record",
    "direct",       "conference",    "environment",   "records",
    "district",     "calendar",      "costs",         "style",
    "url",          "front",         "statement",     "update",
    "parts",        "aug",           "ever",          "downloads",
    "early",        "miles",         "sound",         "resource",
    "present",      "applications",  "either",        "ago",
    "document",     "word",          "works",         "material",
    "bill",         "apr",           "written",       "talk",
    "federal",      "hosting",       "rules",         "final",
    "adult",        "tickets",       "thing",         "centre",
    "requirements", "via",           "cheap",         "nude",
    "kids",         "finance",       "true",          "minutes",
    "else",         "mark",          "third",         "rock",
    "gifts",        "europe",        "reading",       "topics",
    "bad",          "individual",    "tips",          "plus",
    "auto",         "cover",         "usually",       "edit",
    "together",     "videos",        "percent",       "fast",
    "function",     "fact",          "unit",          "getting",
    "global",       "tech",          "meet",          "far",
    "economic",     "player",        "projects",      "lyrics",
    "often",        "subscribe",     "submit",        "germany",
    "amount",       "watch",         "included",      "feel",
    "though",       "bank",          "risk",          "thanks",
    "everything",   "deals",         "various",       "words",
    "linux",        "jul",           "production",    "commercial",
    "james",        "weight",        "town",          "heart",
    "advertising",  "received",      "choose",        "treatment",
    "newsletter",   "archives",      "points",        "knowledge",
    "magazine",     "error",         "camera",        "jun",
    "girl",         "currently",     "construction",  "toys",
    "registered",   "clear",         "golf",          "receive",
    "domain",       "methods",       "chapter",       "makes",
    "protection",   "policies",      "loan",          "wide",
    "beauty",       "manager",       "india",         "position",
    "taken",        "sort",          "listings",      "models",
    "michael",      "known",         "half",          "cases",
    "step",         "engineering",   "florida",       "simple",
    "quick",        "none",          "wireless",      "license",
    "paul",         "friday",        "lake",          "whole",
    "annual",       "published",     "later",         "basic",
    "shows",        "corporate",     "church",        "method",
    "purchase",     "customers",     "active",        "response",
    "practice",     "hardware",      "figure",        "materials",
    "fire",         "holiday",       "chat",          "enough",
    "designed",     "along",         "among",         "death",
    "writing",      "speed",         "html",          "countries",
    "loss",         "face",          "brand",         "discount",
    "higher",       "effects",       "created",       "remember",
    "standards",    "oil",           "bit",           "yellow",
    "political",    "increase",      "advertise",     "kingdom",
    "base",         "near",          "environmental", "thought",
    "stuff",        "french",        "storage",       "japan",
    "doing",        "loans",         "shoes",         "entry",
    "stay",         "nature",        "orders",        "availability",
    "africa",       "summary",       "turn",          "mean",
    "growth",       "notes",         "agency",        "king",
    "monday",       "european",      "activity",      "copy",
    "although",     "drug",          "pics",          "western",
    "income",       "force",         "cash",          "employment",
    "overall",      "bay",           "river",         "commission",
    "package",      "contents",      "seen",          "players",
    "engine",       "port",          "album",         "regional",
    "stop",         "supplies",      "started",       "administration",
    "bar",          "institute",     "views",         "plans",
    "double",       "dog",           "build",         "screen",
    "exchange",     "types",         "soon",          "sponsored",
    "lines",        "electronic",    "continue",      "across",
    "benefits",     "needed",        "season",        "apply",
    "someone",      "held",          "anything",      "printer",
    "condition",    "effective",     "believe",       "organization",
    "effect",       "asked",         "eur",           "mind"};

// If fewer initializers than the declared array size are supplied, the
// trailing slots are value-initialized to empty string_views, which would
// silently violate the minimum-length guarantee documented above. Checking the
// final slot catches that mismatch at compile time.
static_assert(!kCommonWords.back().empty(),
              "kCommonWords has fewer initializers than its declared size");

}  // namespace lib
}  // namespace icing

#endif  // ICING_MONKEY_TEST_MONKEY_TEST_COMMON_WORDS_H_