xref: /aosp_15_r20/external/lz4/programs/lorem.c (revision 27162e4e17433d5aa7cb38e7b6a433a09405fc7f)
1 /*
2     lorem.c - lorem ipsum generator
3     Copyright (C) Yann Collet 2024
4 
5     GPL v2 License
6 
7     This program is free software; you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation; either version 2 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License along
18     with this program; if not, write to the Free Software Foundation, Inc.,
19     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 
21     You can contact the author at :
22    - LZ4 source repository : https://github.com/lz4/lz4
23    - Public forum : https://groups.google.com/forum/#!forum/lz4c
24 */
25 
26 /* Implementation notes:
27  *
28  * This is a very simple lorem ipsum generator
29  * which features a static list of words
 * and prints them one after another randomly
31  * with a fake sentence / paragraph structure.
32  *
33  * The goal is to generate a printable text
34  * that can be used to fake a text compression scenario.
35  * The resulting compression / ratio curve of the lorem ipsum generator
36  * is more satisfying than the previous statistical generator,
37  * which was initially designed for entropy compression,
38  * and lacks a regularity more representative of text.
39  *
40  * The compression ratio achievable on the generated lorem ipsum
41  * is still a bit too good, presumably because the dictionary is a bit too
42  * small. It would be possible to create some more complex scheme, notably by
43  * enlarging the dictionary with a word generator, and adding grammatical rules
44  * (composition) and syntax rules. But that's probably overkill for the intended
45  * goal.
46  */
47 
48 #include "lorem.h"
49 #include <assert.h>
50 #include <limits.h> /* INT_MAX */
51 #include <stdlib.h> /* malloc, abort */
52 #include <string.h> /* memcpy */
53 
/* Word pool used by the generator.
 * Constraint: every word must be at most 16 characters long.
 * The first 19 entries, in this order, are the words emitted by
 * generateFirstSentence(); entries are addressed by index, so the order
 * of the table determines the generated text.
 * The leading comment on each row is the index of its first word. */
static const char* kWords[] = {
    /*   0 */ "lorem", "ipsum", "dolor", "sit", "amet",
    /*   5 */ "consectetur", "adipiscing", "elit", "sed", "do",
    /*  10 */ "eiusmod", "tempor", "incididunt", "ut", "labore",
    /*  15 */ "et", "dolore", "magna", "aliqua", "dis",
    /*  20 */ "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
    /*  25 */ "commodo", "a", "lacus", "arcu", "magnis",
    /*  30 */ "parturient", "montes", "nascetur", "ridiculus", "mus",
    /*  35 */ "mauris", "nulla", "malesuada", "pellentesque", "eget",
    /*  40 */ "gravida", "in", "dictum", "non", "erat",
    /*  45 */ "nam", "voluptat", "maecenas", "blandit", "aliquam",
    /*  50 */ "etiam", "enim", "lobortis", "scelerisque", "fermentum",
    /*  55 */ "dui", "faucibus", "ornare", "at", "elementum",
    /*  60 */ "eu", "facilisis", "odio", "morbi", "quis",
    /*  65 */ "eros", "donec", "ac", "orci", "purus",
    /*  70 */ "turpis", "cursus", "leo", "vel", "porta",
    /*  75 */ "consequat", "interdum", "varius", "vulputate", "aliquet",
    /*  80 */ "pharetra", "nunc", "auctor", "urna", "id",
    /*  85 */ "metus", "viverra", "nibh", "cras", "mi",
    /*  90 */ "unde", "omnis", "iste", "natus", "error",
    /*  95 */ "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
    /* 100 */ "totam", "rem", "aperiam", "eaque", "ipsa",
    /* 105 */ "quae", "ab", "illo", "inventore", "veritatis",
    /* 110 */ "quasi", "architecto", "beatae", "vitae", "dicta",
    /* 115 */ "sunt", "explicabo", "nemo", "ipsam", "quia",
    /* 120 */ "voluptas", "aspernatur", "aut", "odit", "fugit",
    /* 125 */ "consequuntur", "magni", "dolores", "eos", "qui",
    /* 130 */ "ratione", "sequi", "nesciunt", "neque", "porro",
    /* 135 */ "quisquam", "est", "dolorem", "adipisci", "numquam",
    /* 140 */ "eius", "modi", "tempora", "incidunt", "magnam",
    /* 145 */ "quaerat", "ad", "minima", "veniam", "nostrum",
    /* 150 */ "ullam", "corporis", "suscipit", "laboriosam", "nisi",
    /* 155 */ "aliquid", "ex", "ea", "commodi", "consequatur",
    /* 160 */ "autem", "eum", "iure", "voluptate", "esse",
    /* 165 */ "quam", "nihil", "molestiae", "illum", "fugiat",
    /* 170 */ "quo", "pariatur", "vero", "accusamus", "iusto",
    /* 175 */ "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
    /* 180 */ "deleniti", "atque", "corrupti", "quos", "quas",
    /* 185 */ "molestias", "excepturi", "sint", "occaecati", "cupiditate",
    /* 190 */ "provident", "similique", "culpa", "officia", "deserunt",
    /* 195 */ "mollitia", "animi", "laborum", "dolorum", "fuga",
    /* 200 */ "harum", "quidem", "rerum", "facilis", "expedita",
    /* 205 */ "distinctio", "libero", "tempore", "cum", "soluta",
    /* 210 */ "nobis", "eligendi", "optio", "cumque", "impedit",
    /* 215 */ "minus", "quod", "maxime", "placeat", "facere",
    /* 220 */ "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
    /* 225 */ "officiis", "debitis", "saepe", "eveniet", "voluptates",
    /* 230 */ "repudiandae", "recusandae", "itaque", "earum", "hic",
    /* 235 */ "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
    /* 240 */ "maiores", "alias", "perferendis", "doloribus", "asperiores",
    /* 245 */ "repellat", "minim", "nostrud", "exercitation", "ullamco",
    /* 250 */ "laboris", "aliquip", "duis", "aute", "irure",
};
/* Number of entries in kWords, usable as a compile-time array dimension */
#define KNBWORDS (sizeof(kWords) / sizeof(kWords[0]))
/* Same count, as a typed constant */
static const unsigned kNbWords = KNBWORDS;
111 
112 static const char* g_words[KNBWORDS] = { NULL };
113 static unsigned g_wordLen[KNBWORDS] = {0};
114 static char* g_wordBuffer = NULL;
115 
116 /* simple 1-dimension distribution, based on word's length, favors small words
117  */
118 static const int kWeights[]      = { 0, 8, 6, 4, 3, 2 };
119 static const unsigned kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);
120 
121 #define DISTRIB_SIZE_MAX 650
122 static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
123 static unsigned g_distribCount         = 0;
124 
countFreqs(const unsigned wordLen[],size_t nbWords,const int * weights,unsigned long nbWeights)125 static void countFreqs(
126         const unsigned wordLen[],
127         size_t nbWords,
128         const int* weights,
129         unsigned long nbWeights)
130 {
131     unsigned total = 0;
132     size_t w;
133     for (w = 0; w < nbWords; w++) {
134         size_t len = wordLen[w];
135         int lmax;
136         if (len >= nbWeights)
137             len = nbWeights - 1;
138         lmax = weights[len];
139         total += (unsigned)lmax;
140     }
141     g_distribCount = total;
142     assert(g_distribCount <= DISTRIB_SIZE_MAX);
143 }
144 
init_word_len(const char * words[],size_t nbWords)145 static void init_word_len(
146         const char* words[],
147         size_t nbWords)
148 {
149     size_t n;
150     assert(words != NULL);
151     for (n=0; n<nbWords; n++) {
152         assert(words[n] != NULL);
153         assert(strlen(words[n]) < 256);
154         g_wordLen[n] = (unsigned char)strlen(words[n]);
155     }
156 
157 }
158 
/* sumLen() :
 * @return : the sum of the first s entries of sizes[] (0 when s == 0).
 * sizes must not be NULL (checked by assert() only).
 */
static size_t sumLen(const unsigned* sizes, size_t s)
{
    size_t acc = 0;
    assert(sizes != NULL);
    /* walk the array from its end; addition order does not affect the sum */
    while (s > 0) {
        s--;
        acc += sizes[s];
    }
    return acc;
}
169 
init_word_buffer(void)170 static void init_word_buffer(void)
171 {
172     size_t n;
173     size_t const bufSize = sumLen(g_wordLen, kNbWords) + 16;
174     char* ptr;
175     assert(g_wordBuffer == NULL);
176     g_wordBuffer = (char*)calloc(1, bufSize);
177     if (g_wordBuffer == NULL) abort();
178     ptr = g_wordBuffer;
179     for (n=0; n<kNbWords; n++) {
180         memcpy(ptr, kWords[n], g_wordLen[n]);
181         g_words[n] = ptr;
182         ptr += g_wordLen[n];
183     }
184 }
185 
init_word_distrib(const unsigned wordLen[],size_t nbWords,const int * weights,unsigned long nbWeights)186 static void init_word_distrib(
187         const unsigned wordLen[],
188         size_t nbWords,
189         const int* weights,
190         unsigned long nbWeights)
191 {
192     size_t w, d = 0;
193     countFreqs(wordLen, nbWords, weights, nbWeights);
194     for (w = 0; w < nbWords; w++) {
195         size_t len = wordLen[w];
196         int l, lmax;
197         if (len >= nbWeights)
198             len = nbWeights - 1;
199         lmax = weights[len];
200         for (l = 0; l < lmax; l++) {
201             g_distrib[d++] = (int)w;
202         }
203     }
204 }
205 
/* Generation state, shared by all the writer functions of this unit.
 * Note: since this state is global, the unit only works when invoked
 * sequentially. No concurrent access is allowed.
 * - g_ptr      : destination buffer of the generation in progress
 * - g_nbChars  : number of characters written so far
 * - g_maxChars : capacity of the destination buffer
 * - g_randRoot : state of the pseudo-random generator */
static char* g_ptr         = NULL;
static size_t g_nbChars    = 0;
static size_t g_maxChars   = 10000000;
static unsigned g_randRoot = 0;
212 
213 #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r)))
LOREM_rand(unsigned range)214 static unsigned LOREM_rand(unsigned range)
215 {
216     static const unsigned prime1 = 2654435761U;
217     static const unsigned prime2 = 2246822519U;
218     unsigned rand32              = g_randRoot;
219     rand32 *= prime1;
220     rand32 ^= prime2;
221     rand32     = RDG_rotl32(rand32, 13);
222     g_randRoot = rand32;
223     return (unsigned)(((unsigned long long)rand32 * range) >> 32);
224 }
225 
writeLastCharacters(void)226 static void writeLastCharacters(void)
227 {
228     size_t lastChars = g_maxChars - g_nbChars;
229     assert(g_maxChars >= g_nbChars);
230     if (lastChars == 0)
231         return;
232     g_ptr[g_nbChars++] = '.';
233     if (lastChars > 2) {
234         memset(g_ptr + g_nbChars, ' ', lastChars - 2);
235     }
236     if (lastChars > 1) {
237         g_ptr[g_maxChars - 1] = '\n';
238     }
239     g_nbChars = g_maxChars;
240 }
241 
generateLastWord(const char * word,size_t wordLen,int upCase)242 static void generateLastWord(const char* word, size_t wordLen, int upCase)
243 {
244     if (g_nbChars + wordLen + 2 > g_maxChars) {
245         writeLastCharacters();
246         return;
247     }
248     memcpy(g_ptr + g_nbChars, word, wordLen);
249     if (upCase) {
250         static const char toUp = 'A' - 'a';
251         g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
252     }
253     g_nbChars += wordLen;
254     writeLastCharacters();
255 }
256 
257 #define MAX(a,b)  ((a)<(b)?(b):(a))
generateWord(const char * word,size_t wordLen,const char * separator,size_t sepLen,int upCase)258 static void generateWord(const char* word, size_t wordLen, const char* separator, size_t sepLen, int upCase)
259 {
260     size_t const wlen = MAX(16, wordLen + 2);
261     if (g_nbChars + wlen > g_maxChars) {
262         generateLastWord(word, wordLen, upCase);
263         return;
264     }
265     assert(wordLen <= 16);
266     memcpy(g_ptr + g_nbChars, word, 16);
267     if (upCase) {
268         static const char toUp = 'A' - 'a';
269         g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
270     }
271     g_nbChars += wordLen;
272     assert(sepLen <= 2);
273     memcpy(g_ptr + g_nbChars, separator, 2);
274     g_nbChars += sepLen;
275 }
276 
/* about() :
 * Draws two random values with LOREM_rand(target) and
 * @return : their sum plus one.
 */
static int about(unsigned target)
{
    unsigned const first  = LOREM_rand(target);
    unsigned const second = LOREM_rand(target);
    return (int)(first + second + 1);
}
281 
282 /* Function to generate a random sentence */
generateSentence(int nbWords)283 static void generateSentence(int nbWords)
284 {
285     int commaPos       = about(9);
286     int comma2         = commaPos + about(7);
287     int qmark          = (LOREM_rand(11) == 7);
288     const char* endSep = qmark ? "? " : ". ";
289     int i;
290     for (i = 0; i < nbWords; i++) {
291         int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
292         const char* sep        = " ";
293         size_t sepLen = 1;
294         if (i == commaPos)
295             sep = ", ", sepLen=2;
296         if (i == comma2)
297             sep = ", ", sepLen=2;
298         if (i == nbWords - 1)
299             sep = endSep, sepLen=2;
300         generateWord(g_words[wordID], g_wordLen[wordID], sep, sepLen, i == 0);
301     }
302 }
303 
generateParagraph(int nbSentences)304 static void generateParagraph(int nbSentences)
305 {
306     int i;
307     for (i = 0; i < nbSentences; i++) {
308         int wordsPerSentence = about(11);
309         generateSentence(wordsPerSentence);
310     }
311     if (g_nbChars < g_maxChars) {
312         g_ptr[g_nbChars++] = '\n';
313     }
314     if (g_nbChars < g_maxChars) {
315         g_ptr[g_nbChars++] = '\n';
316     }
317 }
318 
319 /* It's "common" for lorem ipsum generators to start with the same first
320  * pre-defined sentence */
generateFirstSentence(void)321 static void generateFirstSentence(void)
322 {
323     int i;
324     for (i = 0; i < 18; i++) {
325         const char* separator = " ";
326         size_t sepLen = 1;
327         if (i == 4)
328             separator = ", ", sepLen=2;
329         if (i == 7)
330             separator = ", ", sepLen=2;
331         generateWord(g_words[i], g_wordLen[i], separator, sepLen, i == 0);
332     }
333     generateWord(g_words[18], g_wordLen[18], ". ", 2, 0);
334 }
335 
336 size_t
LOREM_genBlock(void * buffer,size_t size,unsigned seed,int first,int fill)337 LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
338 {
339     g_ptr = (char*)buffer;
340     assert(size < INT_MAX);
341     g_maxChars = size;
342     g_nbChars  = 0;
343     g_randRoot = seed;
344     if (g_distribCount == 0) {
345         init_word_len(kWords, kNbWords);
346         init_word_buffer();
347         init_word_distrib(g_wordLen, kNbWords, kWeights, kNbWeights);
348     }
349 
350     if (first) {
351         generateFirstSentence();
352     }
353     while (g_nbChars < g_maxChars) {
354         int sentencePerParagraph = about(7);
355         generateParagraph(sentencePerParagraph);
356         if (!fill)
357             break; /* only generate one paragraph in not-fill mode */
358     }
359     g_ptr = NULL;
360     return g_nbChars;
361 }
362 
/* LOREM_genBuffer() :
 * Fills @buffer with exactly @size bytes of lorem ipsum text,
 * starting with the pre-defined first sentence.
 * Shortcut for LOREM_genBlock() with first == 1 and fill == 1.
 */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const startWithFirstSentence = 1;
    int const fillWholeBuffer        = 1;
    (void)LOREM_genBlock(buffer, size, seed, startWithFirstSentence, fillWholeBuffer);
}
367