xref: /aosp_15_r20/external/zstd/programs/lorem.c (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722)
1*01826a49SYabin Cui /*
2*01826a49SYabin Cui  * Copyright (c) Meta Platforms, Inc. and affiliates.
3*01826a49SYabin Cui  * All rights reserved.
4*01826a49SYabin Cui  *
5*01826a49SYabin Cui  * This source code is licensed under both the BSD-style license (found in the
6*01826a49SYabin Cui  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*01826a49SYabin Cui  * in the COPYING file in the root directory of this source tree).
8*01826a49SYabin Cui  * You may select, at your option, one of the above-listed licenses.
9*01826a49SYabin Cui  */
10*01826a49SYabin Cui 
/* Implementation notes:
 *
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another, in random order,
 * with a fake sentence / paragraph structure.
 *
 * The goal is to generate a printable text
 * that can be used to fake a text compression scenario.
 * The resulting compression / ratio curve of the lorem ipsum generator
 * is more satisfying than that of the previous statistical generator,
 * which was initially designed for entropy compression,
 * and lacks the regularity that is more representative of text.
 *
 * The compression ratio achievable on the generated lorem ipsum
 * is still a bit too good, presumably because the dictionary is a bit too
 * small. It would be possible to create some more complex scheme, notably by
 * enlarging the dictionary with a word generator, and adding grammatical rules
 * (composition) and syntax rules. But that's probably overkill for the intended
 * goal.
 */
32*01826a49SYabin Cui 
33*01826a49SYabin Cui #include "lorem.h"
34*01826a49SYabin Cui #include <assert.h>
35*01826a49SYabin Cui #include <limits.h> /* INT_MAX */
36*01826a49SYabin Cui #include <string.h> /* memcpy */
37*01826a49SYabin Cui 
/* Every entry of the word pool below is shorter than this many characters.
 * The constant is not referenced elsewhere in this file. */
#define WORD_MAX_SIZE 20

/* Word pool.
 * Order matters: generateFirstSentence() emits entries 0..18 verbatim,
 * and the sampling table stores indices into this array. */
static const char* kWords[] = {
    "lorem", "ipsum", "dolor", "sit", "amet",
    "consectetur", "adipiscing", "elit", "sed", "do",
    "eiusmod", "tempor", "incididunt", "ut", "labore",
    "et", "dolore", "magna", "aliqua", "dis",
    "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
    "commodo", "a", "lacus", "arcu", "magnis",
    "parturient", "montes", "nascetur", "ridiculus", "mus",
    "mauris", "nulla", "malesuada", "pellentesque", "eget",
    "gravida", "in", "dictum", "non", "erat",
    "nam", "voluptat", "maecenas", "blandit", "aliquam",
    "etiam", "enim", "lobortis", "scelerisque", "fermentum",
    "dui", "faucibus", "ornare", "at", "elementum",
    "eu", "facilisis", "odio", "morbi", "quis",
    "eros", "donec", "ac", "orci", "purus",
    "turpis", "cursus", "leo", "vel", "porta",
    "consequat", "interdum", "varius", "vulputate", "aliquet",
    "pharetra", "nunc", "auctor", "urna", "id",
    "metus", "viverra", "nibh", "cras", "mi",
    "unde", "omnis", "iste", "natus", "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
    "totam", "rem", "aperiam", "eaque", "ipsa",
    "quae", "ab", "illo", "inventore", "veritatis",
    "quasi", "architecto", "beatae", "vitae", "dicta",
    "sunt", "explicabo", "nemo", "ipsam", "quia",
    "voluptas", "aspernatur", "aut", "odit", "fugit",
    "consequuntur", "magni", "dolores", "eos", "qui",
    "ratione", "sequi", "nesciunt", "neque", "porro",
    "quisquam", "est", "dolorem", "adipisci", "numquam",
    "eius", "modi", "tempora", "incidunt", "magnam",
    "quaerat", "ad", "minima", "veniam", "nostrum",
    "ullam", "corporis", "suscipit", "laboriosam", "nisi",
    "aliquid", "ex", "ea", "commodi", "consequatur",
    "autem", "eum", "iure", "voluptate", "esse",
    "quam", "nihil", "molestiae", "illum", "fugiat",
    "quo", "pariatur", "vero", "accusamus", "iusto",
    "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
    "deleniti", "atque", "corrupti", "quos", "quas",
    "molestias", "excepturi", "sint", "occaecati", "cupiditate",
    "provident", "similique", "culpa", "officia", "deserunt",
    "mollitia", "animi", "laborum", "dolorum", "fuga",
    "harum", "quidem", "rerum", "facilis", "expedita",
    "distinctio", "libero", "tempore", "cum", "soluta",
    "nobis", "eligendi", "optio", "cumque", "impedit",
    "minus", "quod", "maxime", "placeat", "facere",
    "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
    "officiis", "debitis", "saepe", "eveniet", "voluptates",
    "repudiandae", "recusandae", "itaque", "earum", "hic",
    "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
    "maiores", "alias", "perferendis", "doloribus", "asperiores",
    "repellat", "minim", "nostrud", "exercitation", "ullamco",
    "laboris", "aliquip", "duis", "aute", "irure",
};
/* Number of entries in kWords. */
static const unsigned kNbWords = sizeof(kWords) / sizeof(*kWords);
95*01826a49SYabin Cui 
/* One-dimensional weighting keyed on word length, favoring small words:
 * kWeights[len] is the number of slots a word of length `len` receives in the
 * sampling table. Lengths past the last index reuse the final entry. */
static const int kWeights[]    = { 0, 8, 6, 4, 3, 2 };
static const size_t kNbWeights = sizeof(kWeights) / sizeof(*kWeights);

/* Sampling table: each slot holds an index into the word pool.
 * Only the first g_distribCount slots are meaningful; a count of 0 means the
 * table has not been built yet. */
#define DISTRIB_SIZE_MAX 650
static int g_distrib[DISTRIB_SIZE_MAX];
static unsigned g_distribCount;
104*01826a49SYabin Cui 
countFreqs(const char * words[],size_t nbWords,const int * weights,size_t nbWeights)105*01826a49SYabin Cui static void countFreqs(
106*01826a49SYabin Cui         const char* words[],
107*01826a49SYabin Cui         size_t nbWords,
108*01826a49SYabin Cui         const int* weights,
109*01826a49SYabin Cui         size_t nbWeights)
110*01826a49SYabin Cui {
111*01826a49SYabin Cui     unsigned total = 0;
112*01826a49SYabin Cui     size_t w;
113*01826a49SYabin Cui     for (w = 0; w < nbWords; w++) {
114*01826a49SYabin Cui         size_t len = strlen(words[w]);
115*01826a49SYabin Cui         int lmax;
116*01826a49SYabin Cui         if (len >= nbWeights)
117*01826a49SYabin Cui             len = nbWeights - 1;
118*01826a49SYabin Cui         lmax = weights[len];
119*01826a49SYabin Cui         total += (unsigned)lmax;
120*01826a49SYabin Cui     }
121*01826a49SYabin Cui     g_distribCount = total;
122*01826a49SYabin Cui     assert(g_distribCount <= DISTRIB_SIZE_MAX);
123*01826a49SYabin Cui }
124*01826a49SYabin Cui 
init_word_distrib(const char * words[],size_t nbWords,const int * weights,size_t nbWeights)125*01826a49SYabin Cui static void init_word_distrib(
126*01826a49SYabin Cui         const char* words[],
127*01826a49SYabin Cui         size_t nbWords,
128*01826a49SYabin Cui         const int* weights,
129*01826a49SYabin Cui         size_t nbWeights)
130*01826a49SYabin Cui {
131*01826a49SYabin Cui     size_t w, d = 0;
132*01826a49SYabin Cui     countFreqs(words, nbWords, weights, nbWeights);
133*01826a49SYabin Cui     for (w = 0; w < nbWords; w++) {
134*01826a49SYabin Cui         size_t len = strlen(words[w]);
135*01826a49SYabin Cui         int l, lmax;
136*01826a49SYabin Cui         if (len >= nbWeights)
137*01826a49SYabin Cui             len = nbWeights - 1;
138*01826a49SYabin Cui         lmax = weights[len];
139*01826a49SYabin Cui         for (l = 0; l < lmax; l++) {
140*01826a49SYabin Cui             g_distrib[d++] = (int)w;
141*01826a49SYabin Cui         }
142*01826a49SYabin Cui     }
143*01826a49SYabin Cui }
144*01826a49SYabin Cui 
/* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed: the whole generator state lives in the
 * file-scope variables below. */
static char* g_ptr         = NULL;     /* output buffer; set by LOREM_genBlock() for the duration of a call */
static size_t g_nbChars    = 0;        /* number of bytes written to g_ptr so far */
static size_t g_maxChars   = 10000000; /* capacity of g_ptr; overwritten by LOREM_genBlock() */
static unsigned g_randRoot = 0;        /* current state of the pseudo-random generator */

/* Rotates the 32-bit unsigned value `x` left by `r` bits.
 * `r` must be in the range 1..31: a shift by 32 is undefined behavior.
 * Both parameters are parenthesized, so compound expressions may be passed. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* Advances the generator state and returns a pseudo-random value derived
 * from it.
 *
 * The state is multiplied by one constant, xored with another, then rotated;
 * the new state is mapped onto the requested range with a multiply-and-shift
 * rather than a modulo.
 *
 * @param range  exclusive upper bound of the result
 * @return a value in [0, range), assuming `unsigned` is 32 bits wide;
 *         always 0 when `range` is 0 or 1 */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32              = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32     = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
164*01826a49SYabin Cui 
writeLastCharacters(void)165*01826a49SYabin Cui static void writeLastCharacters(void)
166*01826a49SYabin Cui {
167*01826a49SYabin Cui     size_t lastChars = g_maxChars - g_nbChars;
168*01826a49SYabin Cui     assert(g_maxChars >= g_nbChars);
169*01826a49SYabin Cui     if (lastChars == 0)
170*01826a49SYabin Cui         return;
171*01826a49SYabin Cui     g_ptr[g_nbChars++] = '.';
172*01826a49SYabin Cui     if (lastChars > 2) {
173*01826a49SYabin Cui         memset(g_ptr + g_nbChars, ' ', lastChars - 2);
174*01826a49SYabin Cui     }
175*01826a49SYabin Cui     if (lastChars > 1) {
176*01826a49SYabin Cui         g_ptr[g_maxChars - 1] = '\n';
177*01826a49SYabin Cui     }
178*01826a49SYabin Cui     g_nbChars = g_maxChars;
179*01826a49SYabin Cui }
180*01826a49SYabin Cui 
generateWord(const char * word,const char * separator,int upCase)181*01826a49SYabin Cui static void generateWord(const char* word, const char* separator, int upCase)
182*01826a49SYabin Cui {
183*01826a49SYabin Cui     size_t const len = strlen(word) + strlen(separator);
184*01826a49SYabin Cui     if (g_nbChars + len > g_maxChars) {
185*01826a49SYabin Cui         writeLastCharacters();
186*01826a49SYabin Cui         return;
187*01826a49SYabin Cui     }
188*01826a49SYabin Cui     memcpy(g_ptr + g_nbChars, word, strlen(word));
189*01826a49SYabin Cui     if (upCase) {
190*01826a49SYabin Cui         static const char toUp = 'A' - 'a';
191*01826a49SYabin Cui         g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
192*01826a49SYabin Cui     }
193*01826a49SYabin Cui     g_nbChars += strlen(word);
194*01826a49SYabin Cui     memcpy(g_ptr + g_nbChars, separator, strlen(separator));
195*01826a49SYabin Cui     g_nbChars += strlen(separator);
196*01826a49SYabin Cui }
197*01826a49SYabin Cui 
/* Returns a pseudo-random count "around" `target`: the sum of two draws from
 * LOREM_rand(target), plus one. The result is therefore at least 1. */
static int about(unsigned target)
{
    unsigned const firstDraw  = LOREM_rand(target);
    unsigned const secondDraw = LOREM_rand(target);
    return (int)(firstDraw + secondDraw + 1);
}
202*01826a49SYabin Cui 
203*01826a49SYabin Cui /* Function to generate a random sentence */
generateSentence(int nbWords)204*01826a49SYabin Cui static void generateSentence(int nbWords)
205*01826a49SYabin Cui {
206*01826a49SYabin Cui     int commaPos       = about(9);
207*01826a49SYabin Cui     int comma2         = commaPos + about(7);
208*01826a49SYabin Cui     int qmark          = (LOREM_rand(11) == 7);
209*01826a49SYabin Cui     const char* endSep = qmark ? "? " : ". ";
210*01826a49SYabin Cui     int i;
211*01826a49SYabin Cui     for (i = 0; i < nbWords; i++) {
212*01826a49SYabin Cui         int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
213*01826a49SYabin Cui         const char* const word = kWords[wordID];
214*01826a49SYabin Cui         const char* sep        = " ";
215*01826a49SYabin Cui         if (i == commaPos)
216*01826a49SYabin Cui             sep = ", ";
217*01826a49SYabin Cui         if (i == comma2)
218*01826a49SYabin Cui             sep = ", ";
219*01826a49SYabin Cui         if (i == nbWords - 1)
220*01826a49SYabin Cui             sep = endSep;
221*01826a49SYabin Cui         generateWord(word, sep, i == 0);
222*01826a49SYabin Cui     }
223*01826a49SYabin Cui }
224*01826a49SYabin Cui 
generateParagraph(int nbSentences)225*01826a49SYabin Cui static void generateParagraph(int nbSentences)
226*01826a49SYabin Cui {
227*01826a49SYabin Cui     int i;
228*01826a49SYabin Cui     for (i = 0; i < nbSentences; i++) {
229*01826a49SYabin Cui         int wordsPerSentence = about(11);
230*01826a49SYabin Cui         generateSentence(wordsPerSentence);
231*01826a49SYabin Cui     }
232*01826a49SYabin Cui     if (g_nbChars < g_maxChars) {
233*01826a49SYabin Cui         g_ptr[g_nbChars++] = '\n';
234*01826a49SYabin Cui     }
235*01826a49SYabin Cui     if (g_nbChars < g_maxChars) {
236*01826a49SYabin Cui         g_ptr[g_nbChars++] = '\n';
237*01826a49SYabin Cui     }
238*01826a49SYabin Cui }
239*01826a49SYabin Cui 
240*01826a49SYabin Cui /* It's "common" for lorem ipsum generators to start with the same first
241*01826a49SYabin Cui  * pre-defined sentence */
generateFirstSentence(void)242*01826a49SYabin Cui static void generateFirstSentence(void)
243*01826a49SYabin Cui {
244*01826a49SYabin Cui     int i;
245*01826a49SYabin Cui     for (i = 0; i < 18; i++) {
246*01826a49SYabin Cui         const char* word      = kWords[i];
247*01826a49SYabin Cui         const char* separator = " ";
248*01826a49SYabin Cui         if (i == 4)
249*01826a49SYabin Cui             separator = ", ";
250*01826a49SYabin Cui         if (i == 7)
251*01826a49SYabin Cui             separator = ", ";
252*01826a49SYabin Cui         generateWord(word, separator, i == 0);
253*01826a49SYabin Cui     }
254*01826a49SYabin Cui     generateWord(kWords[18], ". ", 0);
255*01826a49SYabin Cui }
256*01826a49SYabin Cui 
257*01826a49SYabin Cui size_t
LOREM_genBlock(void * buffer,size_t size,unsigned seed,int first,int fill)258*01826a49SYabin Cui LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
259*01826a49SYabin Cui {
260*01826a49SYabin Cui     g_ptr = (char*)buffer;
261*01826a49SYabin Cui     assert(size < INT_MAX);
262*01826a49SYabin Cui     g_maxChars = size;
263*01826a49SYabin Cui     g_nbChars  = 0;
264*01826a49SYabin Cui     g_randRoot = seed;
265*01826a49SYabin Cui     if (g_distribCount == 0) {
266*01826a49SYabin Cui         init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
267*01826a49SYabin Cui     }
268*01826a49SYabin Cui 
269*01826a49SYabin Cui     if (first) {
270*01826a49SYabin Cui         generateFirstSentence();
271*01826a49SYabin Cui     }
272*01826a49SYabin Cui     while (g_nbChars < g_maxChars) {
273*01826a49SYabin Cui         int sentencePerParagraph = about(7);
274*01826a49SYabin Cui         generateParagraph(sentencePerParagraph);
275*01826a49SYabin Cui         if (!fill)
276*01826a49SYabin Cui             break; /* only generate one paragraph in not-fill mode */
277*01826a49SYabin Cui     }
278*01826a49SYabin Cui     g_ptr = NULL;
279*01826a49SYabin Cui     return g_nbChars;
280*01826a49SYabin Cui }
281*01826a49SYabin Cui 
/* Fills the whole of `buffer` (`size` bytes) with lorem ipsum text generated
 * from `seed`, starting with the fixed first sentence.
 * Equivalent to LOREM_genBlock() with `first` and `fill` both enabled; the
 * returned byte count is discarded. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const withFirstSentence = 1;
    int const fillWholeBuffer   = 1;
    (void)LOREM_genBlock(
            buffer, size, seed, withFirstSentence, fillWholeBuffer);
}
286