1 /*
2 lorem.c - lorem ipsum generator
3 Copyright (C) Yann Collet 2024
4
5 GPL v2 License
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation, Inc.,
19 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
21 You can contact the author at :
22 - LZ4 source repository : https://github.com/lz4/lz4
23 - Public forum : https://groups.google.com/forum/#!forum/lz4c
24 */
25
26 /* Implementation notes:
27 *
28 * This is a very simple lorem ipsum generator
29 * which features a static list of words
 * and prints them one after another randomly
31 * with a fake sentence / paragraph structure.
32 *
33 * The goal is to generate a printable text
34 * that can be used to fake a text compression scenario.
35 * The resulting compression / ratio curve of the lorem ipsum generator
36 * is more satisfying than the previous statistical generator,
37 * which was initially designed for entropy compression,
38 * and lacks a regularity more representative of text.
39 *
40 * The compression ratio achievable on the generated lorem ipsum
41 * is still a bit too good, presumably because the dictionary is a bit too
42 * small. It would be possible to create some more complex scheme, notably by
43 * enlarging the dictionary with a word generator, and adding grammatical rules
44 * (composition) and syntax rules. But that's probably overkill for the intended
45 * goal.
46 */
47
48 #include "lorem.h"
49 #include <assert.h>
50 #include <limits.h> /* INT_MAX */
51 #include <stdlib.h> /* malloc, abort */
52 #include <string.h> /* memcpy */
53
/* Define the word pool.
 *
 * Notes:
 * - All words must have a length <= 16: generateWord() asserts this limit
 *   and always copies a fixed 16 bytes per word.
 * - Order matters for the first 19 entries ("lorem" .. "aliqua"):
 *   generateFirstSentence() emits words 0..18 in table order to produce the
 *   traditional opening sentence. Words after that are only picked randomly.
 */
static const char* kWords[] = {
    "lorem", "ipsum", "dolor", "sit", "amet",
    "consectetur", "adipiscing", "elit", "sed", "do",
    "eiusmod", "tempor", "incididunt", "ut", "labore",
    "et", "dolore", "magna", "aliqua", "dis",
    "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
    "commodo", "a", "lacus", "arcu", "magnis",
    "parturient", "montes", "nascetur", "ridiculus", "mus",
    "mauris", "nulla", "malesuada", "pellentesque", "eget",
    "gravida", "in", "dictum", "non", "erat",
    "nam", "voluptat", "maecenas", "blandit", "aliquam",
    "etiam", "enim", "lobortis", "scelerisque", "fermentum",
    "dui", "faucibus", "ornare", "at", "elementum",
    "eu", "facilisis", "odio", "morbi", "quis",
    "eros", "donec", "ac", "orci", "purus",
    "turpis", "cursus", "leo", "vel", "porta",
    "consequat", "interdum", "varius", "vulputate", "aliquet",
    "pharetra", "nunc", "auctor", "urna", "id",
    "metus", "viverra", "nibh", "cras", "mi",
    "unde", "omnis", "iste", "natus", "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
    "totam", "rem", "aperiam", "eaque", "ipsa",
    "quae", "ab", "illo", "inventore", "veritatis",
    "quasi", "architecto", "beatae", "vitae", "dicta",
    "sunt", "explicabo", "nemo", "ipsam", "quia",
    "voluptas", "aspernatur", "aut", "odit", "fugit",
    "consequuntur", "magni", "dolores", "eos", "qui",
    "ratione", "sequi", "nesciunt", "neque", "porro",
    "quisquam", "est", "dolorem", "adipisci", "numquam",
    "eius", "modi", "tempora", "incidunt", "magnam",
    "quaerat", "ad", "minima", "veniam", "nostrum",
    "ullam", "corporis", "suscipit", "laboriosam", "nisi",
    "aliquid", "ex", "ea", "commodi", "consequatur",
    "autem", "eum", "iure", "voluptate", "esse",
    "quam", "nihil", "molestiae", "illum", "fugiat",
    "quo", "pariatur", "vero", "accusamus", "iusto",
    "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
    "deleniti", "atque", "corrupti", "quos", "quas",
    "molestias", "excepturi", "sint", "occaecati", "cupiditate",
    "provident", "similique", "culpa", "officia", "deserunt",
    "mollitia", "animi", "laborum", "dolorum", "fuga",
    "harum", "quidem", "rerum", "facilis", "expedita",
    "distinctio", "libero", "tempore", "cum", "soluta",
    "nobis", "eligendi", "optio", "cumque", "impedit",
    "minus", "quod", "maxime", "placeat", "facere",
    "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
    "officiis", "debitis", "saepe", "eveniet", "voluptates",
    "repudiandae", "recusandae", "itaque", "earum", "hic",
    "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
    "maiores", "alias", "perferendis", "doloribus", "asperiores",
    "repellat", "minim", "nostrud", "exercitation", "ullamco",
    "laboris", "aliquip", "duis", "aute", "irure",
};
/* Number of entries in kWords, as a compile-time constant
 * (usable as an array dimension). */
#define KNBWORDS (sizeof(kWords) / sizeof(kWords[0]))
/* Same count, as a typed constant for use in expressions. */
static const unsigned kNbWords = KNBWORDS;
111
/* Runtime word tables, filled once on first use by LOREM_genBlock():
 * - g_words[n]   : start of word n inside g_wordBuffer (set by init_word_buffer()).
 *                  Words are stored back to back, WITHOUT a terminating '\0',
 *                  so g_wordLen[n] is required to know where a word ends.
 * - g_wordLen[n] : length of word n in bytes (set by init_word_len()).
 * - g_wordBuffer : single heap allocation holding every word plus 16 spare
 *                  zeroed bytes; it is allocated once and never freed here. */
static const char* g_words[KNBWORDS] = { NULL };
static unsigned g_wordLen[KNBWORDS] = {0};
static char* g_wordBuffer = NULL;
115
/* Simple 1-dimension distribution, based on word length; favors small words.
 * kWeights[len] is the number of slots a word of length `len` receives in
 * g_distrib. Lengths beyond the last index are clamped to the last entry,
 * so every word of 5 or more characters gets a weight of 2.
 * Index 0 (empty word) has weight 0.
 */
static const int kWeights[] = { 0, 8, 6, 4, 3, 2 };
static const unsigned kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);
120
/* Weighted pick table.
 * g_distrib holds word indices, each one repeated as many times as its weight;
 * picking a uniformly random slot therefore picks a word with a probability
 * proportional to its weight.
 * DISTRIB_SIZE_MAX is the table capacity; countFreqs() asserts that the sum of
 * all weights fits.
 * g_distribCount is the number of valid slots. It is 0 until the tables are
 * built, and LOREM_genBlock() uses that as its "not initialized yet" test. */
#define DISTRIB_SIZE_MAX 650
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
static unsigned g_distribCount = 0;
124
countFreqs(const unsigned wordLen[],size_t nbWords,const int * weights,unsigned long nbWeights)125 static void countFreqs(
126 const unsigned wordLen[],
127 size_t nbWords,
128 const int* weights,
129 unsigned long nbWeights)
130 {
131 unsigned total = 0;
132 size_t w;
133 for (w = 0; w < nbWords; w++) {
134 size_t len = wordLen[w];
135 int lmax;
136 if (len >= nbWeights)
137 len = nbWeights - 1;
138 lmax = weights[len];
139 total += (unsigned)lmax;
140 }
141 g_distribCount = total;
142 assert(g_distribCount <= DISTRIB_SIZE_MAX);
143 }
144
init_word_len(const char * words[],size_t nbWords)145 static void init_word_len(
146 const char* words[],
147 size_t nbWords)
148 {
149 size_t n;
150 assert(words != NULL);
151 for (n=0; n<nbWords; n++) {
152 assert(words[n] != NULL);
153 assert(strlen(words[n]) < 256);
154 g_wordLen[n] = (unsigned char)strlen(words[n]);
155 }
156
157 }
158
/* sumLen():
 * @return the sum of the `s` values stored in `sizes`
 *         (0 when `s` is 0). `sizes` must not be NULL.
 */
static size_t sumLen(const unsigned* sizes, size_t s)
{
    size_t acc = 0;
    size_t remaining = s;
    assert(sizes != NULL);
    /* addition is commutative: walking backward gives the same total */
    while (remaining > 0) {
        remaining--;
        acc += sizes[remaining];
    }
    return acc;
}
169
init_word_buffer(void)170 static void init_word_buffer(void)
171 {
172 size_t n;
173 size_t const bufSize = sumLen(g_wordLen, kNbWords) + 16;
174 char* ptr;
175 assert(g_wordBuffer == NULL);
176 g_wordBuffer = (char*)calloc(1, bufSize);
177 if (g_wordBuffer == NULL) abort();
178 ptr = g_wordBuffer;
179 for (n=0; n<kNbWords; n++) {
180 memcpy(ptr, kWords[n], g_wordLen[n]);
181 g_words[n] = ptr;
182 ptr += g_wordLen[n];
183 }
184 }
185
init_word_distrib(const unsigned wordLen[],size_t nbWords,const int * weights,unsigned long nbWeights)186 static void init_word_distrib(
187 const unsigned wordLen[],
188 size_t nbWords,
189 const int* weights,
190 unsigned long nbWeights)
191 {
192 size_t w, d = 0;
193 countFreqs(wordLen, nbWords, weights, nbWeights);
194 for (w = 0; w < nbWords; w++) {
195 size_t len = wordLen[w];
196 int l, lmax;
197 if (len >= nbWeights)
198 len = nbWeights - 1;
199 lmax = weights[len];
200 for (l = 0; l < lmax; l++) {
201 g_distrib[d++] = (int)w;
202 }
203 }
204 }
205
/* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed.
 *
 * Generation state, reset at the start of each LOREM_genBlock() call:
 * - g_ptr      : destination buffer (NULL outside of a generation)
 * - g_nbChars  : number of characters written so far
 * - g_maxChars : capacity of the destination buffer
 * - g_randRoot : current state of the pseudo-random generator
 */
static char* g_ptr = NULL;
static size_t g_nbChars = 0;
static size_t g_maxChars = 10000000;
static unsigned g_randRoot = 0;

/* Rotate-left by `r` bits; only meaningful for 0 < r < 32.
 * Arguments are parenthesized so that expressions can be passed safely.
 * Note: this is a true 32-bit rotation only when `x` is a 32-bit unsigned. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* LOREM_rand():
 * Advances the generator state g_randRoot (multiply, xor, rotate) and maps
 * the new state onto [0, range) with a multiply-and-shift, which avoids a
 * modulo operation.
 * The sequence is fully determined by the value of g_randRoot.
 *
 * @param range : exclusive upper bound of the result
 * @return a value < range; always 0 when range is 0 or 1
 *
 * Note: the result bound assumes `unsigned` is 32 bits wide.
 */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32 = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32 = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    /* (rand32 / 2^32) * range, computed in 64 bits */
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
225
writeLastCharacters(void)226 static void writeLastCharacters(void)
227 {
228 size_t lastChars = g_maxChars - g_nbChars;
229 assert(g_maxChars >= g_nbChars);
230 if (lastChars == 0)
231 return;
232 g_ptr[g_nbChars++] = '.';
233 if (lastChars > 2) {
234 memset(g_ptr + g_nbChars, ' ', lastChars - 2);
235 }
236 if (lastChars > 1) {
237 g_ptr[g_maxChars - 1] = '\n';
238 }
239 g_nbChars = g_maxChars;
240 }
241
generateLastWord(const char * word,size_t wordLen,int upCase)242 static void generateLastWord(const char* word, size_t wordLen, int upCase)
243 {
244 if (g_nbChars + wordLen + 2 > g_maxChars) {
245 writeLastCharacters();
246 return;
247 }
248 memcpy(g_ptr + g_nbChars, word, wordLen);
249 if (upCase) {
250 static const char toUp = 'A' - 'a';
251 g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
252 }
253 g_nbChars += wordLen;
254 writeLastCharacters();
255 }
256
257 #define MAX(a,b) ((a)<(b)?(b):(a))
generateWord(const char * word,size_t wordLen,const char * separator,size_t sepLen,int upCase)258 static void generateWord(const char* word, size_t wordLen, const char* separator, size_t sepLen, int upCase)
259 {
260 size_t const wlen = MAX(16, wordLen + 2);
261 if (g_nbChars + wlen > g_maxChars) {
262 generateLastWord(word, wordLen, upCase);
263 return;
264 }
265 assert(wordLen <= 16);
266 memcpy(g_ptr + g_nbChars, word, 16);
267 if (upCase) {
268 static const char toUp = 'A' - 'a';
269 g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
270 }
271 g_nbChars += wordLen;
272 assert(sepLen <= 2);
273 memcpy(g_ptr + g_nbChars, separator, 2);
274 g_nbChars += sepLen;
275 }
276
/* about():
 * @return a pseudo-random value centered on `target`:
 *         the sum of two LOREM_rand(target) draws, plus 1.
 * Consumes two values from the random generator.
 */
static int about(unsigned target)
{
    unsigned const draw1 = LOREM_rand(target);
    unsigned const draw2 = LOREM_rand(target);
    return (int)(draw1 + draw2 + 1);
}
281
282 /* Function to generate a random sentence */
generateSentence(int nbWords)283 static void generateSentence(int nbWords)
284 {
285 int commaPos = about(9);
286 int comma2 = commaPos + about(7);
287 int qmark = (LOREM_rand(11) == 7);
288 const char* endSep = qmark ? "? " : ". ";
289 int i;
290 for (i = 0; i < nbWords; i++) {
291 int const wordID = g_distrib[LOREM_rand(g_distribCount)];
292 const char* sep = " ";
293 size_t sepLen = 1;
294 if (i == commaPos)
295 sep = ", ", sepLen=2;
296 if (i == comma2)
297 sep = ", ", sepLen=2;
298 if (i == nbWords - 1)
299 sep = endSep, sepLen=2;
300 generateWord(g_words[wordID], g_wordLen[wordID], sep, sepLen, i == 0);
301 }
302 }
303
generateParagraph(int nbSentences)304 static void generateParagraph(int nbSentences)
305 {
306 int i;
307 for (i = 0; i < nbSentences; i++) {
308 int wordsPerSentence = about(11);
309 generateSentence(wordsPerSentence);
310 }
311 if (g_nbChars < g_maxChars) {
312 g_ptr[g_nbChars++] = '\n';
313 }
314 if (g_nbChars < g_maxChars) {
315 g_ptr[g_nbChars++] = '\n';
316 }
317 }
318
319 /* It's "common" for lorem ipsum generators to start with the same first
320 * pre-defined sentence */
generateFirstSentence(void)321 static void generateFirstSentence(void)
322 {
323 int i;
324 for (i = 0; i < 18; i++) {
325 const char* separator = " ";
326 size_t sepLen = 1;
327 if (i == 4)
328 separator = ", ", sepLen=2;
329 if (i == 7)
330 separator = ", ", sepLen=2;
331 generateWord(g_words[i], g_wordLen[i], separator, sepLen, i == 0);
332 }
333 generateWord(g_words[18], g_wordLen[18], ". ", 2, 0);
334 }
335
336 size_t
LOREM_genBlock(void * buffer,size_t size,unsigned seed,int first,int fill)337 LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
338 {
339 g_ptr = (char*)buffer;
340 assert(size < INT_MAX);
341 g_maxChars = size;
342 g_nbChars = 0;
343 g_randRoot = seed;
344 if (g_distribCount == 0) {
345 init_word_len(kWords, kNbWords);
346 init_word_buffer();
347 init_word_distrib(g_wordLen, kNbWords, kWeights, kNbWeights);
348 }
349
350 if (first) {
351 generateFirstSentence();
352 }
353 while (g_nbChars < g_maxChars) {
354 int sentencePerParagraph = about(7);
355 generateParagraph(sentencePerParagraph);
356 if (!fill)
357 break; /* only generate one paragraph in not-fill mode */
358 }
359 g_ptr = NULL;
360 return g_nbChars;
361 }
362
/* LOREM_genBuffer():
 * Fills `buffer` entirely (`size` bytes) with lorem ipsum text,
 * starting with the traditional first sentence.
 * Same as LOREM_genBlock() with `first` and `fill` both enabled,
 * the returned size being ignored.
 */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const withFirstSentence = 1;
    int const fillWholeBuffer = 1;
    (void)LOREM_genBlock(buffer, size, seed, withFirstSentence, fillWholeBuffer);
}
367