1*01826a49SYabin Cui /*
2*01826a49SYabin Cui * Copyright (c) Meta Platforms, Inc. and affiliates.
3*01826a49SYabin Cui * All rights reserved.
4*01826a49SYabin Cui *
5*01826a49SYabin Cui * This source code is licensed under both the BSD-style license (found in the
6*01826a49SYabin Cui * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*01826a49SYabin Cui * in the COPYING file in the root directory of this source tree).
8*01826a49SYabin Cui * You may select, at your option, one of the above-listed licenses.
9*01826a49SYabin Cui */
10*01826a49SYabin Cui
/* Implementation notes:
 *
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another randomly
 * with a fake sentence / paragraph structure.
 *
 * The goal is to generate a printable text
 * that can be used to fake a text compression scenario.
 * The resulting compression / ratio curve of the lorem ipsum generator
 * is more satisfying than that of the previous statistical generator,
 * which was initially designed for entropy compression
 * and lacks the regularity that is more representative of text.
 *
 * The compression ratio achievable on the generated lorem ipsum
 * is still a bit too good, presumably because the dictionary is a bit too
 * small. It would be possible to create a more complex scheme, notably by
 * enlarging the dictionary with a word generator, and adding grammatical rules
 * (composition) and syntax rules. But that's probably overkill for the intended
 * goal.
 */
32*01826a49SYabin Cui
33*01826a49SYabin Cui #include "lorem.h"
34*01826a49SYabin Cui #include <assert.h>
35*01826a49SYabin Cui #include <limits.h> /* INT_MAX */
36*01826a49SYabin Cui #include <string.h> /* memcpy */
37*01826a49SYabin Cui
/* Upper bound on the length of a single word of the pool.
 * NOTE(review): not referenced by the code visible in this file. */
#define WORD_MAX_SIZE 20

/* Word pool.
 * Entry order matters: the first 19 entries (indices 0..18) are emitted
 * verbatim, in order, by generateFirstSentence() to form the fixed opening
 * sentence. All other entries are only ever picked at random. */
static const char* kWords[] = {
    "lorem",        "ipsum",       "dolor",       "sit",          "amet",
    "consectetur",  "adipiscing",  "elit",        "sed",          "do",
    "eiusmod",      "tempor",      "incididunt",  "ut",           "labore",
    "et",           "dolore",      "magna",       "aliqua",
    /* --- end of the fixed opening sentence (indices 0..18) --- */
    "dis",
    "lectus",       "vestibulum",  "mattis",      "ullamcorper",  "velit",
    "commodo",      "a",           "lacus",       "arcu",         "magnis",
    "parturient",   "montes",      "nascetur",    "ridiculus",    "mus",
    "mauris",       "nulla",       "malesuada",   "pellentesque", "eget",
    "gravida",      "in",          "dictum",      "non",          "erat",
    "nam",          "voluptat",    "maecenas",    "blandit",      "aliquam",
    "etiam",        "enim",        "lobortis",    "scelerisque",  "fermentum",
    "dui",          "faucibus",    "ornare",      "at",           "elementum",
    "eu",           "facilisis",   "odio",        "morbi",        "quis",
    "eros",         "donec",       "ac",          "orci",         "purus",
    "turpis",       "cursus",      "leo",         "vel",          "porta",
    "consequat",    "interdum",    "varius",      "vulputate",    "aliquet",
    "pharetra",     "nunc",        "auctor",      "urna",         "id",
    "metus",        "viverra",     "nibh",        "cras",         "mi",
    "unde",         "omnis",       "iste",        "natus",        "error",
    "perspiciatis", "voluptatem",  "accusantium", "doloremque",   "laudantium",
    "totam",        "rem",         "aperiam",     "eaque",        "ipsa",
    "quae",         "ab",          "illo",        "inventore",    "veritatis",
    "quasi",        "architecto",  "beatae",      "vitae",        "dicta",
    "sunt",         "explicabo",   "nemo",        "ipsam",        "quia",
    "voluptas",     "aspernatur",  "aut",         "odit",         "fugit",
    "consequuntur", "magni",       "dolores",     "eos",          "qui",
    "ratione",      "sequi",       "nesciunt",    "neque",        "porro",
    "quisquam",     "est",         "dolorem",     "adipisci",     "numquam",
    "eius",         "modi",        "tempora",     "incidunt",     "magnam",
    "quaerat",      "ad",          "minima",      "veniam",       "nostrum",
    "ullam",        "corporis",    "suscipit",    "laboriosam",   "nisi",
    "aliquid",      "ex",          "ea",          "commodi",      "consequatur",
    "autem",        "eum",         "iure",        "voluptate",    "esse",
    "quam",         "nihil",       "molestiae",   "illum",        "fugiat",
    "quo",          "pariatur",    "vero",        "accusamus",    "iusto",
    "dignissimos",  "ducimus",     "blanditiis",  "praesentium",  "voluptatum",
    "deleniti",     "atque",       "corrupti",    "quos",         "quas",
    "molestias",    "excepturi",   "sint",        "occaecati",    "cupiditate",
    "provident",    "similique",   "culpa",       "officia",      "deserunt",
    "mollitia",     "animi",       "laborum",     "dolorum",      "fuga",
    "harum",        "quidem",      "rerum",       "facilis",      "expedita",
    "distinctio",   "libero",      "tempore",     "cum",          "soluta",
    "nobis",        "eligendi",    "optio",       "cumque",       "impedit",
    "minus",        "quod",        "maxime",      "placeat",      "facere",
    "possimus",     "assumenda",   "repellendus", "temporibus",   "quibusdam",
    "officiis",     "debitis",     "saepe",       "eveniet",      "voluptates",
    "repudiandae",  "recusandae",  "itaque",      "earum",        "hic",
    "tenetur",      "sapiente",    "delectus",    "reiciendis",   "cillum",
    "maiores",      "alias",       "perferendis", "doloribus",    "asperiores",
    "repellat",     "minim",       "nostrud",     "exercitation", "ullamco",
    "laboris",      "aliquip",     "duis",        "aute",         "irure",
};

/* Number of entries in kWords, derived from the array itself. */
static const unsigned kNbWords = (unsigned)(sizeof(kWords) / sizeof(kWords[0]));
95*01826a49SYabin Cui
/* Simple 1-dimension distribution, based on word length; favors small words.
 * kWeights[len] is the weight of a word of `len` characters; any word whose
 * length is >= kNbWeights uses the last entry. */
static const int kWeights[]    = { 0, 8, 6, 4, 3, 2 };
static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);

/* Selection table: each slot holds an index into the word list.
 * A word of weight k occupies k consecutive slots, so drawing a slot uniformly
 * in [0, g_distribCount) selects words proportionally to their weight.
 * g_distribCount == 0 means the table has not been built yet. */
#define DISTRIB_SIZE_MAX 650
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
static unsigned g_distribCount         = 0;

/* Weight (number of table slots) granted to `word`.
 * The word length is clamped to the last entry of `weights`.
 * An empty weight table or a negative weight yields 0, so that counting
 * and filling always agree on the number of slots. */
static unsigned
wordWeight(const char* word, const int* weights, size_t nbWeights)
{
    size_t len = strlen(word);
    int weight;
    if (nbWeights == 0)
        return 0;
    if (len >= nbWeights)
        len = nbWeights - 1;
    weight = weights[len];
    return (weight > 0) ? (unsigned)weight : 0;
}

/* Sum the weights of all `nbWords` entries of `words` and store the total
 * into g_distribCount.
 * Asserts that the total fits within DISTRIB_SIZE_MAX. */
static void countFreqs(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    unsigned total = 0;
    size_t w;
    for (w = 0; w < nbWords; w++) {
        total += wordWeight(words[w], weights, nbWeights);
    }
    g_distribCount = total;
    assert(g_distribCount <= DISTRIB_SIZE_MAX);
}

/* Build the selection table g_distrib for `words`, and leave in
 * g_distribCount the number of slots actually filled.
 *
 * words     : word list; slot values are indices into it
 * nbWords   : number of entries in `words`
 * weights   : weight per word length (see kWeights)
 * nbWeights : number of entries in `weights`
 *
 * The fill is bounded by DISTRIB_SIZE_MAX even when assertions are compiled
 * out (NDEBUG), so an oversized word list can never write past g_distrib. */
static void init_word_distrib(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    size_t w, d = 0;
    countFreqs(words, nbWords, weights, nbWeights);
    for (w = 0; w < nbWords; w++) {
        unsigned const weight = wordWeight(words[w], weights, nbWeights);
        unsigned n;
        for (n = 0; n < weight && d < DISTRIB_SIZE_MAX; n++) {
            g_distrib[d++] = (int)w;
        }
    }
    /* keep the advertised count consistent with what was really written */
    g_distribCount = (unsigned)d;
}
144*01826a49SYabin Cui
/* Output cursor shared by the generation helpers of this unit.
 * Because this state is file-global, the unit only works when invoked
 * sequentially: no concurrent access is allowed.
 *
 * g_maxChars : capacity of the destination buffer, in characters
 * g_nbChars  : number of characters written so far
 * g_ptr      : destination buffer (NULL when no generation is in progress) */
static size_t g_maxChars = 10000000;
static size_t g_nbChars  = 0;
static char* g_ptr       = NULL;
/* State of the pseudo-random generator.
 * LOREM_rand() reads and updates it on every call. */
static unsigned g_randRoot = 0;

/* Rotate `x` left by `r` bits, treating `x` as a 32-bit value (0 < r < 32).
 * Both parameters are fully parenthesized so that compound arguments
 * (e.g. `a | b`) expand with the intended precedence. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* Advance the generator state and return a pseudo-random value
 * scaled into [0, range). Returns 0 when range == 0.
 * NOTE: the arithmetic assumes `unsigned` is 32 bits wide. */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32              = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32     = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    /* multiply-and-shift maps the 32-bit value onto [0, range)
     * without a modulo */
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
164*01826a49SYabin Cui
writeLastCharacters(void)165*01826a49SYabin Cui static void writeLastCharacters(void)
166*01826a49SYabin Cui {
167*01826a49SYabin Cui size_t lastChars = g_maxChars - g_nbChars;
168*01826a49SYabin Cui assert(g_maxChars >= g_nbChars);
169*01826a49SYabin Cui if (lastChars == 0)
170*01826a49SYabin Cui return;
171*01826a49SYabin Cui g_ptr[g_nbChars++] = '.';
172*01826a49SYabin Cui if (lastChars > 2) {
173*01826a49SYabin Cui memset(g_ptr + g_nbChars, ' ', lastChars - 2);
174*01826a49SYabin Cui }
175*01826a49SYabin Cui if (lastChars > 1) {
176*01826a49SYabin Cui g_ptr[g_maxChars - 1] = '\n';
177*01826a49SYabin Cui }
178*01826a49SYabin Cui g_nbChars = g_maxChars;
179*01826a49SYabin Cui }
180*01826a49SYabin Cui
generateWord(const char * word,const char * separator,int upCase)181*01826a49SYabin Cui static void generateWord(const char* word, const char* separator, int upCase)
182*01826a49SYabin Cui {
183*01826a49SYabin Cui size_t const len = strlen(word) + strlen(separator);
184*01826a49SYabin Cui if (g_nbChars + len > g_maxChars) {
185*01826a49SYabin Cui writeLastCharacters();
186*01826a49SYabin Cui return;
187*01826a49SYabin Cui }
188*01826a49SYabin Cui memcpy(g_ptr + g_nbChars, word, strlen(word));
189*01826a49SYabin Cui if (upCase) {
190*01826a49SYabin Cui static const char toUp = 'A' - 'a';
191*01826a49SYabin Cui g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
192*01826a49SYabin Cui }
193*01826a49SYabin Cui g_nbChars += strlen(word);
194*01826a49SYabin Cui memcpy(g_ptr + g_nbChars, separator, strlen(separator));
195*01826a49SYabin Cui g_nbChars += strlen(separator);
196*01826a49SYabin Cui }
197*01826a49SYabin Cui
/* Return a pseudo-random count in [1, 2*target - 1] (for target >= 1),
 * computed as the sum of two draws in [0, target) plus one. */
static int about(unsigned target)
{
    unsigned const draw1 = LOREM_rand(target);
    unsigned const draw2 = LOREM_rand(target);
    return (int)(draw1 + draw2 + 1);
}
202*01826a49SYabin Cui
203*01826a49SYabin Cui /* Function to generate a random sentence */
generateSentence(int nbWords)204*01826a49SYabin Cui static void generateSentence(int nbWords)
205*01826a49SYabin Cui {
206*01826a49SYabin Cui int commaPos = about(9);
207*01826a49SYabin Cui int comma2 = commaPos + about(7);
208*01826a49SYabin Cui int qmark = (LOREM_rand(11) == 7);
209*01826a49SYabin Cui const char* endSep = qmark ? "? " : ". ";
210*01826a49SYabin Cui int i;
211*01826a49SYabin Cui for (i = 0; i < nbWords; i++) {
212*01826a49SYabin Cui int const wordID = g_distrib[LOREM_rand(g_distribCount)];
213*01826a49SYabin Cui const char* const word = kWords[wordID];
214*01826a49SYabin Cui const char* sep = " ";
215*01826a49SYabin Cui if (i == commaPos)
216*01826a49SYabin Cui sep = ", ";
217*01826a49SYabin Cui if (i == comma2)
218*01826a49SYabin Cui sep = ", ";
219*01826a49SYabin Cui if (i == nbWords - 1)
220*01826a49SYabin Cui sep = endSep;
221*01826a49SYabin Cui generateWord(word, sep, i == 0);
222*01826a49SYabin Cui }
223*01826a49SYabin Cui }
224*01826a49SYabin Cui
generateParagraph(int nbSentences)225*01826a49SYabin Cui static void generateParagraph(int nbSentences)
226*01826a49SYabin Cui {
227*01826a49SYabin Cui int i;
228*01826a49SYabin Cui for (i = 0; i < nbSentences; i++) {
229*01826a49SYabin Cui int wordsPerSentence = about(11);
230*01826a49SYabin Cui generateSentence(wordsPerSentence);
231*01826a49SYabin Cui }
232*01826a49SYabin Cui if (g_nbChars < g_maxChars) {
233*01826a49SYabin Cui g_ptr[g_nbChars++] = '\n';
234*01826a49SYabin Cui }
235*01826a49SYabin Cui if (g_nbChars < g_maxChars) {
236*01826a49SYabin Cui g_ptr[g_nbChars++] = '\n';
237*01826a49SYabin Cui }
238*01826a49SYabin Cui }
239*01826a49SYabin Cui
240*01826a49SYabin Cui /* It's "common" for lorem ipsum generators to start with the same first
241*01826a49SYabin Cui * pre-defined sentence */
generateFirstSentence(void)242*01826a49SYabin Cui static void generateFirstSentence(void)
243*01826a49SYabin Cui {
244*01826a49SYabin Cui int i;
245*01826a49SYabin Cui for (i = 0; i < 18; i++) {
246*01826a49SYabin Cui const char* word = kWords[i];
247*01826a49SYabin Cui const char* separator = " ";
248*01826a49SYabin Cui if (i == 4)
249*01826a49SYabin Cui separator = ", ";
250*01826a49SYabin Cui if (i == 7)
251*01826a49SYabin Cui separator = ", ";
252*01826a49SYabin Cui generateWord(word, separator, i == 0);
253*01826a49SYabin Cui }
254*01826a49SYabin Cui generateWord(kWords[18], ". ", 0);
255*01826a49SYabin Cui }
256*01826a49SYabin Cui
257*01826a49SYabin Cui size_t
LOREM_genBlock(void * buffer,size_t size,unsigned seed,int first,int fill)258*01826a49SYabin Cui LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
259*01826a49SYabin Cui {
260*01826a49SYabin Cui g_ptr = (char*)buffer;
261*01826a49SYabin Cui assert(size < INT_MAX);
262*01826a49SYabin Cui g_maxChars = size;
263*01826a49SYabin Cui g_nbChars = 0;
264*01826a49SYabin Cui g_randRoot = seed;
265*01826a49SYabin Cui if (g_distribCount == 0) {
266*01826a49SYabin Cui init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
267*01826a49SYabin Cui }
268*01826a49SYabin Cui
269*01826a49SYabin Cui if (first) {
270*01826a49SYabin Cui generateFirstSentence();
271*01826a49SYabin Cui }
272*01826a49SYabin Cui while (g_nbChars < g_maxChars) {
273*01826a49SYabin Cui int sentencePerParagraph = about(7);
274*01826a49SYabin Cui generateParagraph(sentencePerParagraph);
275*01826a49SYabin Cui if (!fill)
276*01826a49SYabin Cui break; /* only generate one paragraph in not-fill mode */
277*01826a49SYabin Cui }
278*01826a49SYabin Cui g_ptr = NULL;
279*01826a49SYabin Cui return g_nbChars;
280*01826a49SYabin Cui }
281*01826a49SYabin Cui
/* Fill `buffer` entirely with `size` characters of lorem ipsum text,
 * starting with the fixed opening sentence.
 * Equivalent to LOREM_genBlock() with first=1 and fill=1; the returned
 * character count is discarded. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    (void)LOREM_genBlock(buffer, size, seed, /* first */ 1, /* fill */ 1);
}
286