xref: /aosp_15_r20/external/icu/icu4c/source/i18n/messageformat2_parser.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2024 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 #if !UCONFIG_NO_MF2
9 
10 #include "messageformat2_errors.h"
11 #include "messageformat2_macros.h"
12 #include "messageformat2_parser.h"
13 #include "uvector.h" // U_ASSERT
14 
15 U_NAMESPACE_BEGIN
16 
17 namespace message2 {
18 
19 using namespace pluralimpl;
20 
21 using namespace data_model;
22 
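/*
    The `ERROR()` macro records a syntax error (via `errors.addSyntaxError()`)
    and sets the offset in `parseError` from `index` (via `setParseError()`, which
    makes it relative to the current line). It does nothing if a syntax error has
    already been recorded. It does not alter control flow.
*/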
27 #define ERROR(parseError, errorCode, index)                                                             \
28     if (!errors.hasSyntaxError()) {                                                                     \
29         setParseError(parseError, index);                                                               \
30         errors.addSyntaxError(errorCode);                                                               \
31     }
32 
33 // Returns true iff `index` is a valid index for the string `source`
inBounds(const UnicodeString & source,uint32_t index)34 static bool inBounds(const UnicodeString &source, uint32_t index) {
35     return (((int32_t)index) < source.length());
36 }
37 
38 // Increments the line number and updates the "characters seen before
39 // current line" count in `parseError`, iff `source[index]` is a newline
maybeAdvanceLine()40 void Parser::maybeAdvanceLine() {
41     if (source[index] == LF) {
42         parseError.line++;
43         // add 1 to index to get the number of characters seen so far
44         // (including the newline)
45         parseError.lengthBeforeCurrentLine = index + 1;
46     }
47 }
48 
/*
    If `index` is out of bounds for the string `source`, signals a syntax error
    (via `ERROR()`) and returns from the enclosing function; otherwise does nothing.
    The bare `return;` means this can only be used in functions returning `void`.
*/
53 #define CHECK_BOUNDS(source, index, parseError, errorCode)                                              \
54     if (!inBounds(source, index)) {                                                                     \
55         ERROR(parseError, errorCode, index);                                                            \
56         return;                                                                                         \
57     }
58 
59 // -------------------------------------
60 // Helper functions
61 
copyContext(const UChar in[U_PARSE_CONTEXT_LEN],UChar out[U_PARSE_CONTEXT_LEN])62 static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
63     for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) {
64         out[i] = in[i];
65         if (in[i] == '\0') {
66             break;
67         }
68     }
69 }
70 
translateParseError(const MessageParseError & messageParseError,UParseError & parseError)71 /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
72     parseError.line = messageParseError.line;
73     parseError.offset = messageParseError.offset;
74     copyContext(messageParseError.preContext, parseError.preContext);
75     copyContext(messageParseError.postContext, parseError.postContext);
76 }
77 
setParseError(MessageParseError & parseError,uint32_t index)78 /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
79     // Translate absolute to relative offset
80     parseError.offset = index                               // Start with total number of characters seen
81                       - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line
82     // TODO: Fill this in with actual pre and post-context
83     parseError.preContext[0] = 0;
84     parseError.postContext[0] = 0;
85 }
86 
87 // -------------------------------------
88 // Predicates
89 
90 // Returns true if `c` is in the interval [`first`, `last`]
inRange(UChar32 c,UChar32 first,UChar32 last)91 static bool inRange(UChar32 c, UChar32 first, UChar32 last) {
92     U_ASSERT(first < last);
93     return c >= first && c <= last;
94 }
95 
96 /*
97   The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:
98 
99   `isContentChar()`   : `content-char`
100   `isTextChar()`      : `text-char`
101   `isReservedStart()` : `reserved-start`
102   `isReservedChar()`  : `reserved-char`
103   `isAlpha()`         : `ALPHA`
104   `isDigit()`         : `DIGIT`
105   `isNameStart()`     : `name-start`
106   `isNameChar()`      : `name-char`
107   `isUnquotedStart()` : `unquoted-start`
108   `isQuotedChar()`    : `quoted-char`
109   `isWhitespace()`    : `s`
110 */
111 
isContentChar(UChar32 c)112 static bool isContentChar(UChar32 c) {
113     return inRange(c, 0x0001, 0x0008)    // Omit NULL, HTAB and LF
114            || inRange(c, 0x000B, 0x000C) // Omit CR
115            || inRange(c, 0x000E, 0x001F) // Omit SP
116            || inRange(c, 0x0021, 0x002D) // Omit '.'
117            || inRange(c, 0x002F, 0x003F) // Omit '@'
118            || inRange(c, 0x0041, 0x005B) // Omit '\'
119            || inRange(c, 0x005D, 0x007A) // Omit { | }
120            || inRange(c, 0x007E, 0xD7FF) // Omit surrogates
121            || inRange(c, 0xE000, 0x10FFFF);
122 }
123 
124 // See `s` in the MessageFormat 2 grammar
isWhitespace(UChar32 c)125 inline bool isWhitespace(UChar32 c) {
126     switch (c) {
127     case SPACE:
128     case HTAB:
129     case CR:
130     case LF:
131     case IDEOGRAPHIC_SPACE:
132         return true;
133     default:
134         return false;
135     }
136 }
137 
isTextChar(UChar32 c)138 static bool isTextChar(UChar32 c) {
139     return isContentChar(c)
140         || isWhitespace(c)
141         || c == PERIOD
142         || c == AT
143         || c == PIPE;
144 }
145 
146 // Note: this doesn't distinguish between private-use
147 // and reserved, since the data model doesn't
isReservedStart(UChar32 c)148 static bool isReservedStart(UChar32 c) {
149     switch (c) {
150     case BANG:
151     case PERCENT:
152     case ASTERISK:
153     case PLUS:
154     case LESS_THAN:
155     case GREATER_THAN:
156     case QUESTION:
157     case TILDE:
158     // Private-use
159     case CARET:
160     case AMPERSAND:
161         return true;
162     default:
163         return false;
164     }
165 }
166 
isReservedChar(UChar32 c)167 static bool isReservedChar(UChar32 c) {
168     return isContentChar(c) || c == PERIOD;
169 }
170 
isReservedBodyStart(UChar32 c)171 static bool isReservedBodyStart(UChar32 c) {
172     return isReservedChar(c) || c == BACKSLASH || c == PIPE;
173 }
174 
isAlpha(UChar32 c)175 static bool isAlpha(UChar32 c) { return inRange(c, 0x0041, 0x005A) || inRange(c, 0x0061, 0x007A); }
176 
isDigit(UChar32 c)177 static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }
178 
isNameStart(UChar32 c)179 static bool isNameStart(UChar32 c) {
180     return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) ||
181            inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) ||
182            inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
183            inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) ||
184            inRange(c, 0x10000, 0xEFFFF);
185 }
186 
isNameChar(UChar32 c)187 static bool isNameChar(UChar32 c) {
188     return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
189            inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
190 }
191 
isUnquotedStart(UChar32 c)192 static bool isUnquotedStart(UChar32 c) {
193     return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
194            inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
195 }
196 
isQuotedChar(UChar32 c)197 static bool isQuotedChar(UChar32 c) {
198     return isContentChar(c)
199         || isWhitespace(c)
200         || c == PERIOD
201         || c == AT
202         || c == LEFT_CURLY_BRACE
203         || c == RIGHT_CURLY_BRACE;
204 }
205 
206 // Returns true iff `c` can begin a `function` nonterminal
isFunctionStart(UChar32 c)207 static bool isFunctionStart(UChar32 c) {
208     switch (c) {
209     case COLON: {
210         return true;
211     }
212     default: {
213         return false;
214     }
215     }
216 }
217 
218 // Returns true iff `c` can begin an `annotation` nonterminal
isAnnotationStart(UChar32 c)219 static bool isAnnotationStart(UChar32 c) {
220     return isFunctionStart(c) || isReservedStart(c);
221 }
222 
223 // Returns true iff `c` can begin either a `reserved-char` or `reserved-escape`
224 // literal
reservedChunkFollows(UChar32 c)225 static bool reservedChunkFollows(UChar32 c) {
226    switch(c) {
227        // reserved-escape
228        case BACKSLASH:
229        // literal
230        case PIPE: {
231            return true;
232        }
233        default: {
234            // reserved-char
235            return (isReservedChar(c));
236        }
237     }
238 }
239 
240 // Returns true iff `c` can begin a `literal` nonterminal
isLiteralStart(UChar32 c)241 static bool isLiteralStart(UChar32 c) {
242     return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c));
243 }
244 
245 // Returns true iff `c` can begin a `key` nonterminal
isKeyStart(UChar32 c)246 static bool isKeyStart(UChar32 c) {
247     return (c == ASTERISK || isLiteralStart(c));
248 }
249 
isDeclarationStart(const UnicodeString & source,int32_t index)250 inline bool isDeclarationStart(const UnicodeString& source, int32_t index) {
251     int32_t len = source.length();
252     int32_t next = index + 1;
253     return (source[index] == ID_LOCAL[0]
254             && next < len
255             && source[next] == ID_LOCAL[1])
256         || (source[index] == ID_INPUT[0]
257             && next < len
258             && source[next] == ID_INPUT[1]);
259 }
260 
261 // -------------------------------------
262 // Parsing functions
263 
264 
265 /*
266   TODO: Since handling the whitespace ambiguities needs to be repeated
267   in several different places and is hard to factor out,
268   it probably would be better to replace the parser with a lexer + parser
269   to separate tokenizing from parsing, which would simplify the code significantly.
270   This has the disadvantage that there is no token grammar for MessageFormat,
271   so one would have to be invented that isn't a component of the spec.
272  */
273 
274 /*
275     This is a recursive-descent scannerless parser that,
276     with a few exceptions, uses 1 character of lookahead.
277 
278     This may not be an exhaustive list, as the additions of attributes and reserved
279     statements introduced several new ambiguities.
280 
281 All but three of the exceptions involve ambiguities about the meaning of whitespace.
282 One ambiguity not involving whitespace is:
283 identifier -> namespace ":" name
284 vs.
285 identifier -> name
286 
287 `namespace` and `name` can't be distinguished without arbitrary lookahead.
288 (For how this is handled, see parseIdentifier())
289 
290 The second ambiguity not involving whitespace is:
291 complex-message -> *(declaration[s]) complex-body
292                 -> declaration *(declaration[s]) complex-body
293                 -> declaration complex-body
294                 -> reserved-statement complex-body
295                 -> .foo {$x} .match // ...
296 When processing the '.', arbitrary lookahead is required to distinguish the
297 arbitrary-length unsupported keyword from `.match`.
298 (For how this is handled, see parseDeclarations()).
299 
300 The third ambiguity not involving whitespace is:
301 complex-message -> *(declaration [s]) complex-body
302                 -> reserved-statement *(declaration [s]) complex-body
303                 -> reserved-statement complex-body
304                 -> reserved-statement quotedPattern
305                 -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
306                 -> reserved-keyword expression quoted-pattern
307  Example: .foo {1} {{1}}
308 
309  Without lookahead, the opening '{' of the quoted pattern can't be distinguished
310  from the opening '{' of another expression in the unsupported statement.
311  (Though this only requires 1 character of lookahead.)
312 
313  Otherwise:
314 
315 There are at least seven ambiguities in the grammar that can't be resolved with finite
316 lookahead (since whitespace sequences can be arbitrarily long). They are resolved
317 with a form of backtracking (early exit). No state needs to be saved/restored
318 since whitespace doesn't affect the shape of the resulting parse tree, so it's
319 not true backtracking.
320 
321 In addition, the grammar has been refactored
322 in a semantics-preserving way in some cases to make the code easier to structure.
323 
324 First: variant = when 1*(s key) [s] pattern
325    Example: when k     {a}
326    When reading the first space after 'k', it's ambiguous whether it's the
327    required space before another key, or the optional space before `pattern`.
328  (See comments in parseNonEmptyKeys())
329 
330 Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
331         annotation = (function *(s option)) / reserved
332    Example: {:f    }
333    When reading the first space after 'f', it's ambiguous whether it's the
334    required space before an option, or the optional trailing space after an options list
335    (in this case, the options list is empty).
336  (See comments in parseOptions() -- handling this case also meant it was easier to base
337   the code on a slightly refactored grammar, which should be semantically equivalent.)
338 
339 Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
340         annotation = (function *(s option)) / reserved
341    Example: {@a }
342    Similar to the previous case; see comments in parseReserved()
343 
344 Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
345    Example: {|foo|   }
346    When reading the first space after the '|', it's ambiguous whether it's the required
347    space before an annotation, or the optional trailing space before the '}'.
348   (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
349   the same grammar refactoring as the second exception.)
350 
351     Most functions match a non-terminal in the grammar, except as explained
352     in comments.
353 
354 Fifth: matcher = match-statement 1*([s] variant)
355                -> match 1 *([s] selector) 1*([s] variant)
356     Example: match {42} * {{_}}
357  When reading the space after the first '}', it's unclear whether
358  it's the optional space before another selector, or the optional space
359  before a variant.
360 
361 Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
362        -> "{" [s] function *(s attribute) [s] "}"
363        -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
364        -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"
365 
366      Example: {:func @foo}
367 (Note: the same ambiguity is present with variable-expression and literal-expression)
368 
369 Seventh:
370 
371 
372 When parsing the space, it's unclear whether it's the optional space before an
373 option, or the optional space before an attribute.
374 
 Unless otherwise noted in a comment, all helper functions that take
    a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
    have the precondition:
      `index` < `source.length()`
    and the postcondition:
      `U_FAILURE(errorCode)` || `index` < `source.length()`
381 */
382 
383 /*
384   No pre, no post.
385   A message may end with whitespace, so `index` may equal `source.length()` on exit.
386 */
parseWhitespaceMaybeRequired(bool required,UErrorCode & errorCode)387 void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) {
388     bool sawWhitespace = false;
389 
390     // The loop exits either when we consume all the input,
391     // or when we see a non-whitespace character.
392     while (true) {
393         // Check if all input has been consumed
394         if (!inBounds(source, index)) {
395             // If whitespace isn't required -- or if we saw it already --
396             // then the caller is responsible for checking this case and
397             // setting an error if necessary.
398             if (!required || sawWhitespace) {
399                 // Not an error.
400                 return;
401             }
402             // Otherwise, whitespace is required; the end of the input has
403             // been reached without whitespace. This is an error.
404             ERROR(parseError, errorCode, index);
405             return;
406         }
407 
408         // Input remains; process the next character if it's whitespace,
409         // exit the loop otherwise
410         if (isWhitespace(source[index])) {
411             sawWhitespace = true;
412             // Increment line number in parse error if we consume a newline
413             maybeAdvanceLine();
414             index++;
415         } else {
416             break;
417         }
418     }
419 
420     if (!sawWhitespace && required) {
421         ERROR(parseError, errorCode, index);
422     }
423 }
424 
425 /*
426   No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
427 */
parseRequiredWhitespace(UErrorCode & errorCode)428 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
429     parseWhitespaceMaybeRequired(true, errorCode);
430     normalizedInput += SPACE;
431 }
432 
433 /*
434   No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
435 */
parseOptionalWhitespace(UErrorCode & errorCode)436 void Parser::parseOptionalWhitespace(UErrorCode& errorCode) {
437     parseWhitespaceMaybeRequired(false, errorCode);
438 }
439 
440 // Consumes a single character, signaling an error if `source[index]` != `c`
441 // No postcondition -- a message can end with a '}' token
parseToken(UChar32 c,UErrorCode & errorCode)442 void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
443     CHECK_BOUNDS(source, index, parseError, errorCode);
444 
445     if (source[index] == c) {
446         index++;
447         normalizedInput += c;
448         return;
449     }
450     // Next character didn't match -- error out
451     ERROR(parseError, errorCode, index);
452 }
453 
454 /*
455    Consumes a fixed-length token, signaling an error if the token isn't a prefix of
456    the string beginning at `source[index]`
457    No postcondition -- a message can end with a '}' token
458 */
459 template <int32_t N>
parseToken(const UChar32 (& token)[N],UErrorCode & errorCode)460 void Parser::parseToken(const UChar32 (&token)[N], UErrorCode& errorCode) {
461     U_ASSERT(inBounds(source, index));
462 
463     int32_t tokenPos = 0;
464     while (tokenPos < N - 1) {
465         if (source[index] != token[tokenPos]) {
466             ERROR(parseError, errorCode, index);
467             return;
468         }
469         normalizedInput += token[tokenPos];
470         index++;
471         tokenPos++;
472     }
473 }
474 
475 /*
476    Consumes optional whitespace, possibly advancing `index` to `index'`,
477    then consumes a fixed-length token (signaling an error if the token isn't a prefix of
478    the string beginning at `source[index']`),
479    then consumes optional whitespace again
480 */
481 template <int32_t N>
parseTokenWithWhitespace(const UChar32 (& token)[N],UErrorCode & errorCode)482 void Parser::parseTokenWithWhitespace(const UChar32 (&token)[N], UErrorCode& errorCode) {
483     // No need for error check or bounds check before parseOptionalWhitespace
484     parseOptionalWhitespace(errorCode);
485     // Establish precondition
486     CHECK_BOUNDS(source, index, parseError, errorCode);
487     parseToken(token);
488     parseOptionalWhitespace(errorCode);
489     // Guarantee postcondition
490     CHECK_BOUNDS(source, index, parseError, errorCode);
491 }
492 
493 /*
494    Consumes optional whitespace, possibly advancing `index` to `index'`,
495    then consumes a single character (signaling an error if it doesn't match
496    `source[index']`),
497    then consumes optional whitespace again
498 */
parseTokenWithWhitespace(UChar32 c,UErrorCode & errorCode)499 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
500     // No need for error check or bounds check before parseOptionalWhitespace(errorCode)
501     parseOptionalWhitespace(errorCode);
502     // Establish precondition
503     CHECK_BOUNDS(source, index, parseError, errorCode);
504     parseToken(c, errorCode);
505     parseOptionalWhitespace(errorCode);
506     // Guarantee postcondition
507     CHECK_BOUNDS(source, index, parseError, errorCode);
508 }
509 
510 /*
511   Consumes a non-empty sequence of `name-char`s, the first of which is
512   also a `name-start`.
513   that begins with a character `start` such that `isNameStart(start)`.
514 
515   Returns this sequence.
516 
517   (Matches the `name` nonterminal in the grammar.)
518 */
parseName(UErrorCode & errorCode)519 UnicodeString Parser::parseName(UErrorCode& errorCode) {
520     UnicodeString name;
521 
522     U_ASSERT(inBounds(source, index));
523 
524     if (!isNameStart(source[index])) {
525         ERROR(parseError, errorCode, index);
526         return name;
527     }
528 
529     while (isNameChar(source[index])) {
530         name += source[index];
531         normalizedInput += source[index];
532         index++;
533         if (!inBounds(source, index)) {
534             ERROR(parseError, errorCode, index);
535             break;
536         }
537     }
538     return name;
539 }
540 
541 /*
542   Consumes a '$' followed by a `name`, returning a VariableName
543   with `name` as its name
544 
545   (Matches the `variable` nonterminal in the grammar.)
546 */
parseVariableName(UErrorCode & errorCode)547 VariableName Parser::parseVariableName(UErrorCode& errorCode) {
548     VariableName result;
549 
550     U_ASSERT(inBounds(source, index));
551     // If the '$' is missing, we don't want a binding
552     // for this variable to be created.
553     bool valid = source[index] == DOLLAR;
554     parseToken(DOLLAR, errorCode);
555     if (!inBounds(source, index)) {
556         ERROR(parseError, errorCode, index);
557         return result;
558     }
559     UnicodeString varName = parseName(errorCode);
560     // Set the name to "" if the variable wasn't
561     // declared correctly
562     if (!valid) {
563         varName.remove();
564     }
565     return VariableName(varName);
566 }
567 
568 /*
569   Corresponds to the `identifier` nonterminal in the grammar
570 */
parseIdentifier(UErrorCode & errorCode)571 UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
572     U_ASSERT(inBounds(source, index));
573 
574     UnicodeString result;
575     // The following is a hack to get around ambiguity in the grammar:
576     // identifier -> namespace ":" name
577     // vs.
578     // identifier -> name
579     // can't be distinguished without arbitrary lookahead.
580     // Instead, we treat the production as:
581     // identifier -> namespace *(":"name)
582     // and then check for multiple colons.
583 
584     // Parse namespace
585     result += parseName(errorCode);
586     int32_t firstColon = -1;
587     while (inBounds(source, index) && source[index] == COLON) {
588         // Parse ':' separator
589         if (firstColon == -1) {
590             firstColon = index;
591         }
592         parseToken(COLON, errorCode);
593         result += COLON;
594         // Check for message ending with something like "foo:"
595         if (!inBounds(source, index)) {
596             ERROR(parseError, errorCode, index);
597         } else {
598             // Parse name part
599             result += parseName(errorCode);
600         }
601     }
602 
603     // If there's at least one ':', scan from the first ':'
604     // to the end of the name to check for multiple ':'s
605     if (firstColon != -1) {
606         for (int32_t i = firstColon + 1; i < result.length(); i++) {
607             if (result[i] == COLON) {
608                 ERROR(parseError, errorCode, i);
609                 return {};
610             }
611         }
612     }
613 
614     return result;
615 }
616 
617 /*
618   Consumes a reference to a function, matching the ": identifier"
619   in the `function` nonterminal in the grammar.
620 
621   Returns the function name.
622 */
parseFunction(UErrorCode & errorCode)623 FunctionName Parser::parseFunction(UErrorCode& errorCode) {
624     U_ASSERT(inBounds(source, index));
625     if (!isFunctionStart(source[index])) {
626         ERROR(parseError, errorCode, index);
627         return FunctionName();
628     }
629 
630     normalizedInput += source[index];
631     index++; // Consume the function start character
632     if (!inBounds(source, index)) {
633         ERROR(parseError, errorCode, index);
634         return FunctionName();
635     }
636     return parseIdentifier(errorCode);
637 }
638 
639 
640 /*
641   Precondition: source[index] == BACKSLASH
642 
643   Consume an escaped character.
644 
645   Generalized to handle `reserved-escape`, `text-escape`,
646   or `literal-escape`, depending on the `kind` argument.
647 
648   Appends result to `str`
649 */
parseEscapeSequence(EscapeKind kind,UnicodeString & str,UErrorCode & errorCode)650 void Parser::parseEscapeSequence(EscapeKind kind,
651                                  UnicodeString &str,
652                                  UErrorCode& errorCode) {
653     U_ASSERT(inBounds(source, index));
654     U_ASSERT(source[index] == BACKSLASH);
655     normalizedInput += BACKSLASH;
656     index++; // Skip the initial backslash
657     CHECK_BOUNDS(source, index, parseError, errorCode);
658 
659     #define SUCCEED \
660        /* Append to the output string */                    \
661        str += source[index];                                \
662        /* Update normalizedInput */                         \
663        normalizedInput += source[index];                    \
664        /* Consume the character */                          \
665        index++;                                             \
666        /* Guarantee postcondition */                        \
667        CHECK_BOUNDS(source, index, parseError, errorCode);  \
668        return;
669 
670     // Expect a '{', '|' or '}'
671     switch (source[index]) {
672     case LEFT_CURLY_BRACE:
673     case RIGHT_CURLY_BRACE: {
674         // Allowed in a `text-escape` or `reserved-escape`
675         switch (kind) {
676         case TEXT:
677         case RESERVED: {
678             SUCCEED;
679         }
680         default: {
681             break;
682         }
683         }
684         break;
685     }
686     case PIPE: {
687         // Allowed in a `literal-escape` or `reserved-escape`
688         switch (kind) {
689            case LITERAL:
690            case RESERVED: {
691                SUCCEED;
692            }
693            default: {
694                break;
695            }
696         }
697         break;
698     }
699    case BACKSLASH: {
700        // Allowed in any escape sequence
701        SUCCEED;
702    }
703    default: {
704         // No other characters are allowed here
705         break;
706     }
707    }
708    // If control reaches here, there was an error
709    ERROR(parseError, errorCode, index);
710 }
711 
712 /*
713   Consume an escaped pipe or backslash, matching the `literal-escape`
714   nonterminal in the grammar
715 */
parseLiteralEscape(UnicodeString & str,UErrorCode & errorCode)716 void Parser::parseLiteralEscape(UnicodeString &str, UErrorCode& errorCode) {
717     parseEscapeSequence(LITERAL, str, errorCode);
718 }
719 
720 
721 /*
722   Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.
723 */
parseQuotedLiteral(UErrorCode & errorCode)724 Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
725     bool error = false;
726 
727     UnicodeString contents;
728     if (U_SUCCESS(errorCode)) {
729         // Parse the opening '|'
730         parseToken(PIPE, errorCode);
731         if (!inBounds(source, index)) {
732             ERROR(parseError, errorCode, index);
733             error = true;
734         } else {
735             // Parse the contents
736             bool done = false;
737             while (!done) {
738                 if (source[index] == BACKSLASH) {
739                     parseLiteralEscape(contents, errorCode);
740                 } else if (isQuotedChar(source[index])) {
741                     contents += source[index];
742                     normalizedInput += source[index];
743                     index++; // Consume this character
744                     maybeAdvanceLine();
745                 } else {
746                     // Assume the sequence of literal characters ends here
747                     done = true;
748                 }
749                 if (!inBounds(source, index)) {
750                     ERROR(parseError, errorCode, index);
751                     error = true;
752                     break;
753                 }
754             }
755         }
756     }
757 
758     if (error) {
759         return {};
760     }
761 
762     // Parse the closing '|'
763     parseToken(PIPE, errorCode);
764 
765     return Literal(true, contents);
766 }
767 
768 // Parse (1*DIGIT)
parseDigits(UErrorCode & errorCode)769 UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
770     if (U_FAILURE(errorCode)) {
771         return {};
772     }
773 
774     U_ASSERT(isDigit(source[index]));
775 
776     UnicodeString contents;
777     do {
778         contents += source[index];
779         normalizedInput += source[index];
780         index++;
781         if (!inBounds(source, index)) {
782             ERROR(parseError, errorCode, index);
783             return {};
784         }
785     } while (isDigit(source[index]));
786 
787     return contents;
788 }
789 /*
790   Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.
791 */
parseUnquotedLiteral(UErrorCode & errorCode)792 Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
793     if (U_FAILURE(errorCode)) {
794         return {};
795     }
796 
797     // unquoted -> name
798     if (isNameStart(source[index])) {
799         return Literal(false, parseName(errorCode));
800     }
801 
802     // unquoted -> number
803     // Parse the contents
804     UnicodeString contents;
805 
806     // Parse the sign
807     if (source[index] == HYPHEN) {
808         contents += source[index];
809         normalizedInput += source[index];
810         index++;
811     }
812     if (!inBounds(source, index)) {
813         ERROR(parseError, errorCode, index);
814         return {};
815     }
816 
817     // Parse the integer part
818     if (source[index] == ((UChar32)0x0030) /* 0 */) {
819         contents += source[index];
820         normalizedInput += source[index];
821         index++;
822     } else if (isDigit(source[index])) {
823         contents += parseDigits(errorCode);
824     } else {
825         // Error -- nothing else can start a number literal
826         ERROR(parseError, errorCode, index);
827         return {};
828     }
829 
830     // Parse the decimal point if present
831     if (source[index] == PERIOD) {
832         contents += source[index];
833         normalizedInput += source[index];
834         index++;
835         if (!inBounds(source, index)) {
836             ERROR(parseError, errorCode, index);
837             return {};
838         }
839         // Parse the fraction part
840         if (isDigit(source[index])) {
841             contents += parseDigits(errorCode);
842         } else {
843             // '.' not followed by digit is a parse error
844             ERROR(parseError, errorCode, index);
845             return {};
846         }
847     }
848 
849     if (!inBounds(source, index)) {
850         ERROR(parseError, errorCode, index);
851         return {};
852     }
853 
854     // Parse the exponent part if present
855     if (source[index] == UPPERCASE_E || source[index] == LOWERCASE_E) {
856         contents += source[index];
857         normalizedInput += source[index];
858         index++;
859         if (!inBounds(source, index)) {
860             ERROR(parseError, errorCode, index);
861             return {};
862         }
863         // Parse sign if present
864         if (source[index] == PLUS || source[index] == HYPHEN) {
865             contents += source[index];
866             normalizedInput += source[index];
867             index++;
868             if (!inBounds(source, index)) {
869                 ERROR(parseError, errorCode, index);
870                 return {};
871             }
872         }
873         // Parse exponent digits
874         if (!isDigit(source[index])) {
875             ERROR(parseError, errorCode, index);
876             return {};
877         }
878         contents += parseDigits(errorCode);
879     }
880 
881     return Literal(false, contents);
882 }
883 
884 /*
885   Consume and return a literal, matching the `literal` nonterminal in the grammar.
886 */
parseLiteral(UErrorCode & errorCode)887 Literal Parser::parseLiteral(UErrorCode& errorCode) {
888     Literal result;
889     if (!inBounds(source, index)) {
890         ERROR(parseError, errorCode, index);
891     } else {
892         if (source[index] == PIPE) {
893             result = parseQuotedLiteral(errorCode);
894         } else {
895             result = parseUnquotedLiteral(errorCode);
896         }
897         // Guarantee postcondition
898         if (!inBounds(source, index)) {
899             ERROR(parseError, errorCode, index);
900         }
901     }
902 
903     return result;
904 }
905 
906 /*
907   Consume a @name-value pair, matching the `attribute` nonterminal in the grammar.
908 
909   Adds the option to `options`
910 */
911 template<class T>
parseAttribute(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)912 void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
913     U_ASSERT(inBounds(source, index));
914 
915     U_ASSERT(source[index] == AT);
916     // Consume the '@'
917     parseToken(AT, errorCode);
918 
919     // Parse LHS
920     UnicodeString lhs = parseIdentifier(errorCode);
921 
922     // Prepare to "backtrack" to resolve ambiguity
923     // about whether whitespace precedes another
924     // attribute, or the '=' sign
925     int32_t savedIndex = index;
926     parseOptionalWhitespace(errorCode);
927 
928     Operand rand;
929     if (source[index] == EQUALS) {
930         // Parse '='
931         parseTokenWithWhitespace(EQUALS, errorCode);
932 
933         UnicodeString rhsStr;
934         // Parse RHS, which is either a literal or variable
935         switch (source[index]) {
936         case DOLLAR: {
937             rand = Operand(parseVariableName(errorCode));
938             break;
939         }
940         default: {
941             // Must be a literal
942             rand = Operand(parseLiteral(errorCode));
943             break;
944         }
945         }
946         U_ASSERT(!rand.isNull());
947     } else {
948         // attribute -> "@" identifier [[s] "=" [s]]
949         // Use null operand, which `rand` is already set to
950         // "Backtrack" by restoring the whitespace (if there was any)
951         index = savedIndex;
952     }
953 
954     attrAdder.addAttribute(lhs, std::move(rand), errorCode);
955 }
956 
957 /*
958   Consume a name-value pair, matching the `option` nonterminal in the grammar.
959 
960   Adds the option to `optionList`
961 */
962 template<class T>
parseOption(OptionAdder<T> & addOption,UErrorCode & errorCode)963 void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
964     U_ASSERT(inBounds(source, index));
965 
966     // Parse LHS
967     UnicodeString lhs = parseIdentifier(errorCode);
968 
969     // Parse '='
970     parseTokenWithWhitespace(EQUALS, errorCode);
971 
972     UnicodeString rhsStr;
973     Operand rand;
974     // Parse RHS, which is either a literal or variable
975     switch (source[index]) {
976     case DOLLAR: {
977         rand = Operand(parseVariableName(errorCode));
978         break;
979     }
980     default: {
981         // Must be a literal
982         rand = Operand(parseLiteral(errorCode));
983         break;
984     }
985     }
986     U_ASSERT(!rand.isNull());
987 
988     // Finally, add the key=value mapping
989     // Use a local error code, check for duplicate option error and
990     // record it as with other errors
991     UErrorCode status = U_ZERO_ERROR;
992     addOption.addOption(lhs, std::move(rand), status);
993     if (U_FAILURE(status)) {
994       U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
995       errors.setDuplicateOptionName(errorCode);
996     }
997 }
998 
999 /*
1000   Note: there are multiple overloads of parseOptions() for parsing
1001   options within markup, vs. within an expression, vs. parsing
1002   attributes. This should be refactored. TODO
1003  */
1004 
1005 /*
1006   Consume optional whitespace followed by a sequence of options
1007   (possibly empty), separated by whitespace
1008 */
1009 template <class T>
parseOptions(OptionAdder<T> & addOption,UErrorCode & errorCode)1010 void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1011     // Early exit if out of bounds -- no more work is possible
1012     CHECK_BOUNDS(source, index, parseError, errorCode);
1013 
1014 /*
1015 Arbitrary lookahead is required to parse option lists. To see why, consider
1016 these rules from the grammar:
1017 
1018 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1019 annotation = (function *(s option)) / reserved
1020 
1021 And this example:
1022 {:foo  }
1023 
1024 Derivation:
1025 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1026            -> "{" [s] annotation [s] "}"
1027            -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1028            -> "{" [s] function *(s option) [s] "}"
1029 
1030 In this example, knowing whether to expect a '}' or the start of another option
1031 after the whitespace would require arbitrary lookahead -- in other words, which
1032 rule should we apply?
1033     *(s option) -> s option *(s option)
1034   or
1035     *(s option) ->
1036 
1037 The same would apply to the example {:foo k=v } (note the trailing space after "v").
1038 
1039 This is addressed using a form of backtracking and (to make the backtracking easier
1040 to apply) a slight refactoring to the grammar.
1041 
1042 This code is written as if the grammar is:
1043   expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1044   annotation = (function *(s option) [s]) / (reserved [s])
1045 
1046 Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1047 that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1048 
1049 Note that when "backtracking" really just means early exit, since only whitespace
1050 is involved and there's no state to save.
1051 
1052 There is a separate but similar ambiguity as to whether the space precedes
1053 an option or an attribute.
1054 */
1055 
1056     while(true) {
1057         // If the next character is not whitespace, that means we've already
1058         // parsed the entire options list (which may have been empty) and there's
1059         // no trailing whitespace. In that case, exit.
1060         if (!isWhitespace(source[index])) {
1061             break;
1062         }
1063         int32_t firstWhitespace = index;
1064 
1065         // In any case other than an empty options list, there must be at least
1066         // one whitespace character.
1067         parseRequiredWhitespace(errorCode);
1068         // Restore precondition
1069         CHECK_BOUNDS(source, index, parseError, errorCode);
1070 
1071         // If a name character follows, then at least one more option remains
1072         // in the list.
1073         // Otherwise, we've consumed all the options and any trailing whitespace,
1074         // and can exit.
1075         // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1076         // so we back out to [s].
1077         if (!isNameStart(source[index])) {
1078             // We've consumed all the options (meaning that either we consumed non-empty
1079             // whitespace, or consumed at least one option.)
1080             // Done.
1081             // Remove the required whitespace from normalizedInput
1082             normalizedInput.truncate(normalizedInput.length() - 1);
1083             // "Backtrack" so as to leave the optional whitespace there
1084             // when parsing attributes
1085             index = firstWhitespace;
1086             break;
1087         }
1088         parseOption(addOption, errorCode);
1089     }
1090 }
1091 
1092 /*
1093   Consume optional whitespace followed by a sequence of attributes
1094   (possibly empty), separated by whitespace
1095 */
1096 template<class T>
parseAttributes(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)1097 void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1098 
1099     // Early exit if out of bounds -- no more work is possible
1100     if (!inBounds(source, index)) {
1101         ERROR(parseError, errorCode, index);
1102         return;
1103     }
1104 
1105 /*
1106 Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1107 (See comment in parseOptions()).
1108 */
1109 
1110     while(true) {
1111         // If the next character is not whitespace, that means we've already
1112         // parsed the entire attributes list (which may have been empty) and there's
1113         // no trailing whitespace. In that case, exit.
1114         if (!isWhitespace(source[index])) {
1115             break;
1116         }
1117 
1118         // In any case other than an empty attributes list, there must be at least
1119         // one whitespace character.
1120         parseRequiredWhitespace(errorCode);
1121         // Restore precondition
1122         if (!inBounds(source, index)) {
1123             ERROR(parseError, errorCode, index);
1124             break;
1125         }
1126 
1127         // If an '@' follows, then at least one more attribute remains
1128         // in the list.
1129         // Otherwise, we've consumed all the attributes and any trailing whitespace,
1130         // and can exit.
1131         // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1132         // so we back out to [s].
1133         if (source[index] != AT) {
1134             // We've consumed all the attributes (meaning that either we consumed non-empty
1135             // whitespace, or consumed at least one attribute.)
1136             // Done.
1137             // Remove the whitespace from normalizedInput
1138             normalizedInput.truncate(normalizedInput.length() - 1);
1139             break;
1140         }
1141         parseAttribute(attrAdder, errorCode);
1142     }
1143 }
1144 
parseReservedEscape(UnicodeString & str,UErrorCode & errorCode)1145 void Parser::parseReservedEscape(UnicodeString &str, UErrorCode& errorCode) {
1146     parseEscapeSequence(RESERVED, str, errorCode);
1147 }
1148 
1149 /*
1150   Consumes a non-empty sequence of reserved-chars, reserved-escapes, and
1151   literals (as in 1*(reserved-char / reserved-escape / literal) in the `reserved-body` rule)
1152 
1153   Appends it to `str`
1154 */
parseReservedChunk(Reserved::Builder & result,UErrorCode & status)1155 void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) {
1156     CHECK_ERROR(status);
1157 
1158     bool empty = true;
1159     UnicodeString chunk;
1160     while(reservedChunkFollows(source[index])) {
1161         empty = false;
1162         // reserved-char
1163         if (isReservedChar(source[index])) {
1164             chunk += source[index];
1165             normalizedInput += source[index];
1166             // consume the char
1167             index++;
1168             // Restore precondition
1169             CHECK_BOUNDS(source, index, parseError, status);
1170             continue;
1171         }
1172 
1173         if (chunk.length() > 0) {
1174           result.add(Literal(false, chunk), status);
1175           chunk.setTo(u"", 0);
1176         }
1177 
1178         if (source[index] == BACKSLASH) {
1179             // reserved-escape
1180             parseReservedEscape(chunk, status);
1181             result.add(Literal(false, chunk), status);
1182             chunk.setTo(u"", 0);
1183         } else if (source[index] == PIPE || isUnquotedStart(source[index])) {
1184             result.add(parseLiteral(status), status);
1185         } else {
1186             // The reserved chunk ends here
1187             break;
1188         }
1189 
1190         CHECK_ERROR(status); // Avoid looping infinitely
1191     }
1192 
1193     // Add the last chunk if necessary
1194     if (chunk.length() > 0) {
1195         result.add(Literal(false, chunk), status);
1196     }
1197 
1198     if (empty) {
1199         ERROR(parseError, status, index);
1200     }
1201 }
1202 
1203 /*
1204   Consume a `reserved-start` character followed by a possibly-empty sequence
1205   of non-empty sequences of reserved characters, separated by whitespace.
1206   Matches the `reserved` nonterminal in the grammar
1207 
1208 */
parseReserved(UErrorCode & status)1209 Reserved Parser::parseReserved(UErrorCode& status) {
1210     Reserved::Builder builder(status);
1211 
1212     if (U_FAILURE(status)) {
1213         return {};
1214     }
1215 
1216     U_ASSERT(inBounds(source, index));
1217 
1218     // Require a `reservedStart` character
1219     if (!isReservedStart(source[index])) {
1220         ERROR(parseError, status, index);
1221         return Reserved();
1222     }
1223 
1224     // Add the start char as a separate text chunk
1225     UnicodeString firstCharString(source[index]);
1226     builder.add(Literal(false, firstCharString), status);
1227     if (U_FAILURE(status)) {
1228         return {};
1229     }
1230     // Consume reservedStart
1231     normalizedInput += source[index];
1232     index++;
1233     return parseReservedBody(builder, status);
1234 }
1235 
parseReservedBody(Reserved::Builder & builder,UErrorCode & status)1236 Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& status) {
1237     if (U_FAILURE(status)) {
1238         return {};
1239     }
1240 
1241 /*
1242   Arbitrary lookahead is required to parse a `reserved`, for similar reasons
1243   to why it's required for parsing function annotations.
1244 
1245   In the grammar:
1246 
1247   annotation = (function *(s option)) / reserved
1248   expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1249   reserved       = reserved-start reserved-body
1250   reserved-body  = *( [s] 1*(reserved-char / reserved-escape / literal))
1251 
1252   When reading a whitespace character, it's ambiguous whether it's the optional
1253   whitespace in this rule, or the optional whitespace that precedes a '}' in an
1254   expression.
1255 
1256   The ambiguity is resolved using the same grammar refactoring as shown in
1257   the comment in `parseOptions()`.
1258 */
1259     // Consume reserved characters / literals / reserved escapes
1260     // until a character that can't be in a `reserved-body` is seen
1261     while (true) {
1262         /*
1263           First, if there is whitespace, it means either a chunk follows it,
1264           or this is the trailing whitespace before the '}' that terminates an
1265           expression.
1266 
1267           Next, if the next character can start a reserved-char, reserved-escape,
1268           or literal, then parse a "chunk" of reserved things.
1269           In any other case, we exit successfully, since per the refactored
1270           grammar rule:
1271                annotation = (function *(s option) [s]) / (reserved [s])
1272           it's valid to consume whitespace after a `reserved`.
1273           (`parseExpression()` is responsible for checking that the next
1274           character is in fact a '}'.)
1275          */
1276         if (!inBounds(source, index)) {
1277             break;
1278         }
1279         int32_t numWhitespaceChars = 0;
1280         int32_t savedIndex = index;
1281         if (isWhitespace(source[index])) {
1282             parseOptionalWhitespace(status);
1283             numWhitespaceChars = index - savedIndex;
1284             // Restore precondition
1285             if (!inBounds(source, index)) {
1286                 break;
1287             }
1288         }
1289 
1290         if (reservedChunkFollows(source[index])) {
1291             parseReservedChunk(builder, status);
1292 
1293             // Avoid looping infinitely
1294             if (U_FAILURE(status) || !inBounds(source, index)) {
1295                 break;
1296             }
1297         } else {
1298             if (numWhitespaceChars > 0) {
1299                 if (source[index] == LEFT_CURLY_BRACE) {
1300                     // Resolve even more ambiguity (space preceding another piece of
1301                     // a `reserved-body`, vs. space preceding an expression in `reserved-statement`
1302                     // "Backtrack"
1303                     index -= numWhitespaceChars;
1304                     break;
1305                 }
1306                 if (source[index] == RIGHT_CURLY_BRACE) {
1307                     // Not an error: just means there's no trailing whitespace
1308                     // after this `reserved`
1309                     break;
1310                 }
1311                 if (source[index] == AT) {
1312                     // Not an error, but we have to "backtrack" due to the ambiguity
1313                     // between an `s` preceding another reserved chunk
1314                     // and an `s` preceding an attribute list
1315                     index -= numWhitespaceChars;
1316                     break;
1317                 }
1318                 // Error: if there's whitespace, it must either be followed
1319                 // by a non-empty sequence or by '}'
1320                 ERROR(parseError, status, index);
1321                 break;
1322             }
1323             // If there was no whitespace, it's not an error,
1324             // just the end of the reserved string
1325             break;
1326         }
1327     }
1328 
1329     return builder.build(status);
1330 }
1331 
1332 /*
1333   Consume a function call or reserved string, matching the `annotation`
1334   nonterminal in the grammar
1335 
1336   Returns an `Operator` representing this (a reserved is a parse error)
1337 */
parseAnnotation(UErrorCode & status)1338 Operator Parser::parseAnnotation(UErrorCode& status) {
1339     U_ASSERT(inBounds(source, index));
1340     Operator::Builder ratorBuilder(status);
1341     if (U_FAILURE(status)) {
1342         return {};
1343     }
1344     if (isFunctionStart(source[index])) {
1345         // Consume the function name
1346         FunctionName func = parseFunction(status);
1347         ratorBuilder.setFunctionName(std::move(func));
1348 
1349         OptionAdder<Operator::Builder> addOptions(ratorBuilder);
1350         // Consume the options (which may be empty)
1351         parseOptions(addOptions, status);
1352     } else {
1353       // Must be reserved
1354       // A reserved sequence is not a parse error, but might be a formatting error
1355       Reserved rator = parseReserved(status);
1356       ratorBuilder.setReserved(std::move(rator));
1357     }
1358     UErrorCode localStatus = U_ZERO_ERROR;
1359     Operator result = ratorBuilder.build(localStatus);
1360     // Either `setReserved` or `setFunctionName` was called,
1361     // so there shouldn't be an error.
1362     U_ASSERT(U_SUCCESS(localStatus));
1363     return result;
1364 }
1365 
1366 /*
1367   Consume a literal or variable (depending on `isVariable`),
1368   followed by either required whitespace followed by an annotation,
1369   or optional whitespace.
1370 */
parseLiteralOrVariableWithAnnotation(bool isVariable,Expression::Builder & builder,UErrorCode & status)1371 void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
1372                                                   Expression::Builder& builder,
1373                                                   UErrorCode& status) {
1374     CHECK_ERROR(status);
1375 
1376     U_ASSERT(inBounds(source, index));
1377 
1378     Operand rand;
1379     if (isVariable) {
1380         rand = Operand(parseVariableName(status));
1381     } else {
1382         rand = Operand(parseLiteral(status));
1383     }
1384 
1385     builder.setOperand(std::move(rand));
1386 
1387 /*
1388 Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
1389 To see why, consider this rule from the grammar:
1390 
1391 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1392 
1393 And this example:
1394 
1395 {|foo|   }
1396 
1397 Derivation:
1398 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1399            -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
1400            -> "{" [s] (literal [s annotation]) [s] "}"
1401 
1402 When reading the ' ' after the second '|', it's ambiguous whether that's the required
1403 space before an annotation, or the optional space before the '}'.
1404 
1405 To make this ambiguity easier to handle, this code is based on the same grammar
1406 refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
1407 the comment in `parseOptions()` for details.
1408 */
1409 
1410     if (isWhitespace(source[index])) {
1411       int32_t firstWhitespace = index;
1412 
1413       // If the next character is whitespace, either [s annotation] or [s] applies
1414       // (the character is either the required space before an annotation, or optional
1415       // trailing space after the literal or variable). It's still ambiguous which
1416       // one does apply.
1417       parseOptionalWhitespace(status);
1418       // Restore precondition
1419       CHECK_BOUNDS(source, index, parseError, status);
1420 
1421       // This next check resolves the ambiguity between [s annotation] and [s]
1422       bool isSAnnotation = isAnnotationStart(source[index]);
1423 
1424       if (isSAnnotation) {
1425         normalizedInput += SPACE;
1426       }
1427 
1428       if (isSAnnotation) {
1429         // The previously consumed whitespace precedes an annotation
1430         builder.setOperator(parseAnnotation(status));
1431       } else {
1432           // Either there's a right curly brace (will be consumed by the caller),
1433           // or there's an error and the trailing whitespace should be
1434           // handled by the caller. However, this is not an error
1435           // here because we're just parsing `literal [s annotation]`.
1436           index = firstWhitespace;
1437       }
1438     } else {
1439       // Either there was never whitespace, or
1440       // the previously consumed whitespace is the optional trailing whitespace;
1441       // either the next character is '}' or the error will be handled by parseExpression.
1442       // Do nothing, since the operand was already set
1443     }
1444 
1445     // At the end of this code, the next character should either be '}',
1446     // whitespace followed by a '}',
1447     // or end-of-input
1448 }
1449 
1450 /*
1451   Consume an expression, matching the `expression` nonterminal in the grammar
1452 */
1453 
exprFallback(Expression::Builder & exprBuilder)1454 static void exprFallback(Expression::Builder& exprBuilder) {
1455     // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
1456     // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1457     exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1458 }
1459 
exprFallback(UErrorCode & status)1460 static Expression exprFallback(UErrorCode& status) {
1461     Expression result;
1462     if (U_SUCCESS(status)) {
1463         Expression::Builder exprBuilder(status);
1464         if (U_SUCCESS(status)) {
1465             // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
1466             // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1467             exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1468             UErrorCode status = U_ZERO_ERROR;
1469             result = exprBuilder.build(status);
1470             // An operand was set, so there can't be an error
1471             U_ASSERT(U_SUCCESS(status));
1472         }
1473     }
1474     return result;
1475 }
1476 
parseExpression(UErrorCode & status)1477 Expression Parser::parseExpression(UErrorCode& status) {
1478     if (U_FAILURE(status)) {
1479         return {};
1480     }
1481 
1482     // Early return if out of input -- no more work is possible
1483     U_ASSERT(inBounds(source, index));
1484 
1485     // Parse opening brace
1486     parseToken(LEFT_CURLY_BRACE, status);
1487     // Optional whitespace after opening brace
1488     parseOptionalWhitespace(status);
1489 
1490     Expression::Builder exprBuilder(status);
1491     // Restore precondition
1492     if (!inBounds(source, index)) {
1493         exprFallback(exprBuilder);
1494     } else {
1495         // literal '|', variable '$' or annotation
1496         switch (source[index]) {
1497         case PIPE: {
1498             // Quoted literal
1499             parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1500             break;
1501         }
1502         case DOLLAR: {
1503             // Variable
1504             parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
1505             break;
1506         }
1507         default: {
1508             if (isAnnotationStart(source[index])) {
1509                 Operator rator = parseAnnotation(status);
1510                 exprBuilder.setOperator(std::move(rator));
1511             } else if (isUnquotedStart(source[index])) {
1512                 // Unquoted literal
1513                 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1514             } else {
1515                 // Not a literal, variable or annotation -- error out
1516                 ERROR(parseError, status, index);
1517                 exprFallback(exprBuilder);
1518                 break;
1519             }
1520             break;
1521         }
1522         }
1523     }
1524 
1525     // Parse attributes
1526     AttributeAdder attrAdder(exprBuilder);
1527     parseAttributes(attrAdder, status);
1528 
1529     // Parse optional space
1530     // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
1531     parseOptionalWhitespace(status);
1532 
1533     // Either an operand or operator (or both) must have been set already,
1534     // so there can't be an error
1535     UErrorCode localStatus = U_ZERO_ERROR;
1536     Expression result = exprBuilder.build(localStatus);
1537     U_ASSERT(U_SUCCESS(localStatus));
1538 
1539     // Check for end-of-input and missing '}'
1540     if (!inBounds(source, index)) {
1541         ERROR(parseError, status, index);
1542     } else {
1543         // Otherwise, it's safe to check for the '}'
1544         parseToken(RIGHT_CURLY_BRACE, status);
1545     }
1546     return result;
1547 }
1548 
1549 /*
1550   Parse a .local declaration, matching the `local-declaration`
1551   production in the grammar
1552 */
parseLocalDeclaration(UErrorCode & status)1553 void Parser::parseLocalDeclaration(UErrorCode& status) {
1554     // End-of-input here would be an error; even empty
1555     // declarations must be followed by a body
1556     CHECK_BOUNDS(source, index, parseError, status);
1557 
1558     parseToken(ID_LOCAL, status);
1559     parseRequiredWhitespace(status);
1560 
1561     // Restore precondition
1562     CHECK_BOUNDS(source, index, parseError, status);
1563     VariableName lhs = parseVariableName(status);
1564     parseTokenWithWhitespace(EQUALS, status);
1565     // Restore precondition before calling parseExpression()
1566     CHECK_BOUNDS(source, index, parseError, status);
1567 
1568     Expression rhs = parseExpression(status);
1569 
1570     // Add binding from lhs to rhs, unless there was an error
1571     // (This ensures that if there was a correct lhs but a
1572     // parse error in rhs, the fallback for uses of the
1573     // lhs will be its own name rather than the rhs)
1574     /* This affects the behavior of this test case, which the spec
1575        is ambiguous about:
1576 
1577        .local $bar {|foo|} {{{$bar}}}
1578 
1579        Should `$bar` still be bound to a value although
1580        its declaration is syntactically incorrect (missing the '=')?
1581        This code says no, but it needs to change if
1582        https://github.com/unicode-org/message-format-wg/issues/703
1583        is resolved differently.
1584     */
1585     CHECK_ERROR(status);
1586     if (!errors.hasSyntaxError()) {
1587         dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status);
1588         // Check if status is U_DUPLICATE_DECLARATION_ERROR
1589         // and add that as an internal error if so
1590         if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1591             status = U_ZERO_ERROR;
1592             errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1593         }
1594     }
1595 }
1596 
1597 /*
1598   Parse an .input declaration, matching the `local-declaration`
1599   production in the grammar
1600 */
parseInputDeclaration(UErrorCode & status)1601 void Parser::parseInputDeclaration(UErrorCode& status) {
1602     // End-of-input here would be an error; even empty
1603     // declarations must be followed by a body
1604     CHECK_BOUNDS(source, index, parseError, status);
1605 
1606     parseToken(ID_INPUT, status);
1607     parseOptionalWhitespace(status);
1608 
1609     // Restore precondition before calling parseExpression()
1610     CHECK_BOUNDS(source, index, parseError, status);
1611 
1612     // Save the index for error diagnostics
1613     int32_t exprIndex = index;
1614     Expression rhs = parseExpression(status);
1615 
1616     // Here we have to check that the rhs is a variable-expression
1617     if (!rhs.getOperand().isVariable()) {
1618         // This case is a syntax error; report it at the beginning
1619         // of the expression
1620         ERROR(parseError, status, exprIndex);
1621         return;
1622     }
1623 
1624     VariableName lhs = rhs.getOperand().asVariable();
1625 
1626     // Add binding from lhs to rhs
1627     // This just adds a new local variable that shadows the message
1628     // argument referred to, which is harmless.
1629     // When evaluating the RHS, the new local is not in scope
1630     // and the message argument will be correctly referred to.
1631     CHECK_ERROR(status);
1632     if (!errors.hasSyntaxError()) {
1633         dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status);
1634         // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR
1635         // and add that as an internal error if so
1636         if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1637             status = U_ZERO_ERROR;
1638             errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1639         }
1640     }
1641 }
1642 
1643 /*
1644   Parses a `reserved-statement` per the grammar
1645  */
parseUnsupportedStatement(UErrorCode & status)1646 void Parser::parseUnsupportedStatement(UErrorCode& status) {
1647     U_ASSERT(inBounds(source, index) && source[index] == PERIOD);
1648 
1649     UnsupportedStatement::Builder builder(status);
1650     CHECK_ERROR(status);
1651 
1652     // Parse the keyword
1653     UnicodeString keyword(PERIOD);
1654     normalizedInput += UnicodeString(PERIOD);
1655     index++;
1656     keyword += parseName(status);
1657     builder.setKeyword(keyword);
1658 
1659     // Parse the body, which is optional
1660     // Lookahead is required to distinguish the `s` in reserved-body
1661     // from the `s` in `[s] expression`
1662     // Next character may be:
1663     // * whitespace (followed by either a reserved-body start or
1664     //   a '{')
1665     // * a '{'
1666 
1667     CHECK_BOUNDS(source, index, parseError, status);
1668 
1669     if (source[index] != LEFT_CURLY_BRACE) {
1670         if (!isWhitespace(source[index])) {
1671             ERROR(parseError, status, index);
1672             return;
1673         }
1674         // Expect a reserved-body start
1675         int32_t savedIndex = index;
1676         parseRequiredWhitespace(status);
1677         CHECK_BOUNDS(source, index, parseError, status);
1678         if (isReservedBodyStart(source[index])) {
1679             // There is a reserved body
1680             Reserved::Builder r(status);
1681             builder.setBody(parseReservedBody(r, status));
1682         } else {
1683             // No body -- backtrack so we can parse 1*([s] expression)
1684             index = savedIndex;
1685             normalizedInput.truncate(normalizedInput.length() - 1);
1686         }
1687         // Otherwise, the next character must be a '{'
1688         // to open the required expression (or optional whitespace)
1689         if (source[index] != LEFT_CURLY_BRACE && !isWhitespace(source[index])) {
1690             ERROR(parseError, status, index);
1691             return;
1692         }
1693     }
1694 
1695     // Finally, parse the expressions
1696 
1697     // Need to look ahead to disambiguate a '{' beginning
1698     // an expression from one beginning with a quoted pattern
1699     int32_t expressionCount = 0;
1700     while (source[index] == LEFT_CURLY_BRACE || isWhitespace(source[index])) {
1701         parseOptionalWhitespace(status);
1702 
1703         bool nextIsLbrace = source[index] == LEFT_CURLY_BRACE;
1704         bool nextIsQuotedPattern = nextIsLbrace && inBounds(source, index + 1)
1705             && source[index + 1] == LEFT_CURLY_BRACE;
1706         if (nextIsQuotedPattern) {
1707             break;
1708         }
1709 
1710         builder.addExpression(parseExpression(status), status);
1711         expressionCount++;
1712     }
1713     if (expressionCount <= 0) {
1714         // At least one expression is required
1715         ERROR(parseError, status, index);
1716         return;
1717     }
1718     dataModel.addUnsupportedStatement(builder.build(status), status);
1719 }
1720 
1721 // Terrible hack to get around the ambiguity between `matcher` and `reserved-statement`
nextIsMatch() const1722 bool Parser::nextIsMatch() const {
1723     for(int32_t i = 0; i < 6; i++) {
1724         if (!inBounds(source, index + i) || source[index + i] != ID_MATCH[i]) {
1725             return false;
1726         }
1727     }
1728     return true;
1729 }
1730 /*
1731   Consume a possibly-empty sequence of declarations separated by whitespace;
1732   each declaration matches the `declaration` nonterminal in the grammar
1733 
1734   Builds up an environment representing those declarations
1735 */
parseDeclarations(UErrorCode & status)1736 void Parser::parseDeclarations(UErrorCode& status) {
1737     // End-of-input here would be an error; even empty
1738     // declarations must be followed by a body
1739     CHECK_BOUNDS(source, index, parseError, status);
1740 
1741     while (source[index] == PERIOD) {
1742         CHECK_BOUNDS(source, index + 1, parseError, status);
1743         if (source[index + 1] == ID_LOCAL[1]) {
1744             parseLocalDeclaration(status);
1745         } else if (source[index + 1] == ID_INPUT[1]) {
1746             parseInputDeclaration(status);
1747         } else {
1748             // Unsupported statement
1749             // Lookahead is needed to disambiguate this from a `match`
1750             if (!nextIsMatch()) {
1751                 parseUnsupportedStatement(status);
1752             } else {
1753                 // Done parsing declarations
1754                 break;
1755             }
1756         }
1757 
1758         // Avoid looping infinitely
1759         CHECK_ERROR(status);
1760 
1761         parseOptionalWhitespace(status);
1762         // Restore precondition
1763         CHECK_BOUNDS(source, index, parseError, status);
1764     }
1765 }
1766 
1767 /*
1768   Consume an escaped curly brace, or backslash, matching the `text-escape`
1769   nonterminal in the grammar
1770 */
parseTextEscape(UnicodeString & str,UErrorCode & status)1771 void Parser::parseTextEscape(UnicodeString &str, UErrorCode& status) {
1772     parseEscapeSequence(TEXT, str, status);
1773 }
1774 
1775 /*
1776   Consume a non-empty sequence of text characters and escaped text characters,
1777   matching the `text` nonterminal in the grammar
1778 
1779   No postcondition (a message can end with a text)
1780 */
parseText(UErrorCode & status)1781 UnicodeString Parser::parseText(UErrorCode& status) {
1782     UnicodeString str;
1783     if (!inBounds(source, index)) {
1784         // Text can be empty
1785         return str;
1786     }
1787 
1788     if (!(isTextChar(source[index] || source[index] == BACKSLASH))) {
1789         // Error -- text is expected here
1790         ERROR(parseError, status, index);
1791         return str;
1792     }
1793 
1794     while (true) {
1795         if (source[index] == BACKSLASH) {
1796             parseTextEscape(str, status);
1797         } else if (isTextChar(source[index])) {
1798             normalizedInput += source[index];
1799             str += source[index];
1800             index++;
1801             maybeAdvanceLine();
1802         } else {
1803             break;
1804         }
1805         if (!inBounds(source, index)) {
1806             // OK for text to end a message
1807             break;
1808         }
1809     }
1810 
1811     return str;
1812 }
1813 
1814 /*
1815   Consume an `nmtoken`, `literal`, or the string "*", matching
1816   the `key` nonterminal in the grammar
1817 */
parseKey(UErrorCode & status)1818 Key Parser::parseKey(UErrorCode& status) {
1819     U_ASSERT(inBounds(source, index));
1820 
1821     Key k; // wildcard by default
1822     // Literal | '*'
1823     switch (source[index]) {
1824     case ASTERISK: {
1825         index++;
1826         normalizedInput += ASTERISK;
1827         // Guarantee postcondition
1828         if (!inBounds(source, index)) {
1829             ERROR(parseError, status, index);
1830             return k;
1831         }
1832         break;
1833     }
1834     default: {
1835         // Literal
1836         k = Key(parseLiteral(status));
1837         break;
1838     }
1839     }
1840     return k;
1841 }
1842 
1843 /*
1844   Consume a non-empty sequence of `key`s separated by whitespace
1845 
1846   Takes ownership of `keys`
1847 */
parseNonEmptyKeys(UErrorCode & status)1848 SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
1849     SelectorKeys result;
1850 
1851     if (U_FAILURE(status)) {
1852         return result;
1853     }
1854 
1855     U_ASSERT(inBounds(source, index));
1856 
1857 /*
1858 Arbitrary lookahead is required to parse key lists. To see why, consider
1859 this rule from the grammar:
1860 
1861 variant = key *(s key) [s] quoted-pattern
1862 
1863 And this example:
1864 when k1 k2   {a}
1865 
1866 Derivation:
1867    variant -> key *(s key) [s] quoted-pattern
1868            -> key s key *(s key) quoted-pattern
1869 
1870 After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
1871 to know whether to expect the start of a pattern or the start of another key.
1872 In other words: is the second whitespace sequence the required space in *(s key),
1873 or the optional space in [s] quoted-pattern?
1874 
1875 This is addressed using "backtracking" (similarly to `parseOptions()`).
1876 */
1877 
1878     SelectorKeys::Builder keysBuilder(status);
1879     if (U_FAILURE(status)) {
1880         return result;
1881     }
1882 
1883     // Since the first key is required, it's simplest to parse it separately.
1884     keysBuilder.add(parseKey(status), status);
1885 
1886     // Restore precondition
1887     if (!inBounds(source, index)) {
1888         ERROR(parseError, status, index);
1889         return result;
1890     }
1891 
1892     // We've seen at least one whitespace-key pair, so now we can parse
1893     // *(s key) [s]
1894     while (source[index] != LEFT_CURLY_BRACE || isWhitespace(source[index])) { // Try to recover from errors
1895         bool wasWhitespace = isWhitespace(source[index]);
1896         parseRequiredWhitespace(status);
1897         if (!wasWhitespace) {
1898             // Avoid infinite loop when parsing something like:
1899             // when * @{!...
1900             index++;
1901         }
1902 
1903         // Restore precondition
1904         if (!inBounds(source, index)) {
1905             ERROR(parseError, status, index);
1906             return result;
1907         }
1908 
1909         // At this point, it's ambiguous whether we are inside (s key) or [s].
1910         // This check resolves that ambiguity.
1911         if (source[index] == LEFT_CURLY_BRACE) {
1912             // A pattern follows, so what we just parsed was the optional
1913             // trailing whitespace. All the keys have been parsed.
1914 
1915             // Unpush the whitespace from `normalizedInput`
1916             normalizedInput.truncate(normalizedInput.length() - 1);
1917             break;
1918         }
1919         keysBuilder.add(parseKey(status), status);
1920     }
1921 
1922     return keysBuilder.build(status);
1923 }
1924 
parseQuotedPattern(UErrorCode & status)1925 Pattern Parser::parseQuotedPattern(UErrorCode& status) {
1926     U_ASSERT(inBounds(source, index));
1927 
1928     parseToken(LEFT_CURLY_BRACE, status);
1929     parseToken(LEFT_CURLY_BRACE, status);
1930     Pattern p = parseSimpleMessage(status);
1931     parseToken(RIGHT_CURLY_BRACE, status);
1932     parseToken(RIGHT_CURLY_BRACE, status);
1933     return p;
1934 }
1935 
1936 /*
1937   Consume a `placeholder`, matching the nonterminal in the grammar
1938   No postcondition (a markup can end a message)
1939 */
parseMarkup(UErrorCode & status)1940 Markup Parser::parseMarkup(UErrorCode& status) {
1941     U_ASSERT(inBounds(source, index + 1));
1942 
1943     U_ASSERT(source[index] == LEFT_CURLY_BRACE);
1944 
1945     Markup::Builder builder(status);
1946     if (U_FAILURE(status)) {
1947         return {};
1948     }
1949 
1950     // Consume the '{'
1951     index++;
1952     normalizedInput += LEFT_CURLY_BRACE;
1953     parseOptionalWhitespace(status);
1954     bool closing = false;
1955     switch (source[index]) {
1956     case NUMBER_SIGN: {
1957         // Open or standalone; consume the '#'
1958         normalizedInput += source[index];
1959         index++;
1960         break;
1961     }
1962     case SLASH: {
1963         // Closing
1964         normalizedInput += source[index];
1965         closing = true;
1966         index++;
1967         break;
1968     }
1969     default: {
1970         ERROR(parseError, status, index);
1971         return {};
1972     }
1973     }
1974 
1975     // Parse the markup identifier
1976     builder.setName(parseIdentifier(status));
1977 
1978     // Parse the options, which must begin with a ' '
1979     // if present
1980     if (inBounds(source, index) && isWhitespace(source[index])) {
1981         OptionAdder<Markup::Builder> optionAdder(builder);
1982         parseOptions(optionAdder, status);
1983     }
1984 
1985     // Parse the attributes, which also must begin
1986     // with a ' '
1987     if (inBounds(source, index) && isWhitespace(source[index])) {
1988         AttributeAdder attrAdder(builder);
1989         parseAttributes(attrAdder, status);
1990     }
1991 
1992     parseOptionalWhitespace(status);
1993 
1994     bool standalone = false;
1995     // Check if this is a standalone or not
1996     if (!closing) {
1997         if (inBounds(source, index) && source[index] == SLASH) {
1998             standalone = true;
1999             normalizedInput += SLASH;
2000             index++;
2001         }
2002     }
2003 
2004     parseToken(RIGHT_CURLY_BRACE, status);
2005 
2006     if (standalone) {
2007         builder.setStandalone();
2008     } else if (closing) {
2009         builder.setClose();
2010     } else {
2011         builder.setOpen();
2012     }
2013 
2014     return builder.build(status);
2015 }
2016 
2017 /*
2018   Consume a `placeholder`, matching the nonterminal in the grammar
2019   No postcondition (a placeholder can end a message)
2020 */
parsePlaceholder(UErrorCode & status)2021 std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
2022     U_ASSERT(source[index] == LEFT_CURLY_BRACE);
2023 
2024     if (!inBounds(source, index)) {
2025         ERROR(parseError, status, index);
2026         return exprFallback(status);
2027     }
2028 
2029     // Check if it's markup or an expression
2030     if (source[index + 1] == NUMBER_SIGN || source[index + 1] == SLASH) {
2031         // Markup
2032         return parseMarkup(status);
2033     }
2034     return parseExpression(status);
2035 }
2036 
2037 /*
2038   Consume a `simple-message`, matching the nonterminal in the grammar
2039   Postcondition: `index == source.length()` or U_FAILURE(status);
2040   for a syntactically correct message, this will consume the entire input
2041 */
parseSimpleMessage(UErrorCode & status)2042 Pattern Parser::parseSimpleMessage(UErrorCode& status) {
2043     Pattern::Builder result(status);
2044 
2045     if (U_SUCCESS(status)) {
2046         Expression expression;
2047         while (inBounds(source, index)) {
2048             switch (source[index]) {
2049             case LEFT_CURLY_BRACE: {
2050                 // Must be placeholder
2051                 std::variant<Expression, Markup> piece = parsePlaceholder(status);
2052                 if (std::holds_alternative<Expression>(piece)) {
2053                     Expression expr = *std::get_if<Expression>(&piece);
2054                     result.add(std::move(expr), status);
2055                 } else {
2056                     Markup markup = *std::get_if<Markup>(&piece);
2057                     result.add(std::move(markup), status);
2058                 }
2059                 break;
2060             }
2061             default: {
2062                 // Must be text
2063                 result.add(parseText(status), status);
2064                 break;
2065             }
2066             }
2067             if (source[index] == RIGHT_CURLY_BRACE) {
2068                 // End of quoted pattern
2069                 break;
2070             }
2071             // Don't loop infinitely
2072             if (errors.hasSyntaxError()) {
2073                 break;
2074             }
2075         }
2076     }
2077     return result.build(status);
2078 }
2079 
2080 
2081 /*
2082   Consume a `selectors` (matching the nonterminal in the grammar),
2083   followed by a non-empty sequence of `variant`s (matching the nonterminal
2084   in the grammar) preceded by whitespace
2085   No postcondition (on return, `index` might equal `source.length()` with no syntax error
2086   because a message can end with a variant)
2087 */
parseSelectors(UErrorCode & status)2088 void Parser::parseSelectors(UErrorCode& status) {
2089     CHECK_ERROR(status);
2090 
2091     U_ASSERT(inBounds(source, index));
2092 
2093     parseToken(ID_MATCH, status);
2094 
2095     bool empty = true;
2096     // Parse selectors
2097     // "Backtracking" is required here. It's not clear if whitespace is
2098     // (`[s]` selector) or (`[s]` variant)
2099     while (isWhitespace(source[index]) || source[index] == LEFT_CURLY_BRACE) {
2100         parseOptionalWhitespace(status);
2101         // Restore precondition
2102         CHECK_BOUNDS(source, index, parseError, status);
2103         if (source[index] != LEFT_CURLY_BRACE) {
2104             // This is not necessarily an error, but rather,
2105             // means the whitespace we parsed was the optional
2106             // whitespace preceding the first variant, not the
2107             // optional whitespace preceding a subsequent expression.
2108             break;
2109         }
2110         Expression expression;
2111         expression = parseExpression(status);
2112         empty = false;
2113 
2114         dataModel.addSelector(std::move(expression), status);
2115         CHECK_ERROR(status);
2116     }
2117 
2118     // At least one selector is required
2119     if (empty) {
2120         ERROR(parseError, status, index);
2121         return;
2122     }
2123 
2124     #define CHECK_END_OF_INPUT                     \
2125         if (((int32_t)index) >= source.length()) { \
2126             break;                                 \
2127         }                                          \
2128 
2129     // Parse variants
2130     while (isWhitespace(source[index]) || isKeyStart(source[index])) {
2131         if (isWhitespace(source[index])) {
2132             int32_t whitespaceStart = index;
2133             parseOptionalWhitespace(status);
2134             // Restore the precondition.
2135             // Error out if we reached the end of input. The message
2136             // cannot end with trailing whitespace if there are variants.
2137             if (!inBounds(source, index)) {
2138                 // Use index of first whitespace for error message
2139                 index = whitespaceStart;
2140                 ERROR(parseError, status, index);
2141                 return;
2142             }
2143         }
2144 
2145         // At least one key is required
2146         SelectorKeys keyList(parseNonEmptyKeys(status));
2147 
2148         CHECK_ERROR(status);
2149 
2150         // parseNonEmptyKeys() consumes any trailing whitespace,
2151         // so the pattern can be consumed next.
2152 
2153         // Restore precondition before calling parsePattern()
2154         // (which must return a non-null value)
2155         CHECK_BOUNDS(source, index, parseError, status);
2156         Pattern rhs = parseQuotedPattern(status);
2157 
2158         dataModel.addVariant(std::move(keyList), std::move(rhs), status);
2159 
2160         // Restore the precondition, *without* erroring out if we've
2161         // reached the end of input. That's because it's valid for the
2162         // message to end with a variant that has no trailing whitespace.
2163         // Why do we need to check this condition twice inside the loop?
2164         // Because if we don't check it here, the `isWhitespace()` call in
2165         // the loop head will read off the end of the input string.
2166         CHECK_END_OF_INPUT
2167     }
2168 }
2169 
/*
  Documentation for parseBody(), which is defined after the errorPattern()
  helper below:

  Consume a `body` (matching the nonterminal in the grammar).
  No postcondition (on return, `index` might equal `source.length()` with no syntax error,
  because a message can end with a body; trailing whitespace is optional)
*/
2175 
errorPattern(UErrorCode & status)2176 void Parser::errorPattern(UErrorCode& status) {
2177     errors.addSyntaxError(status);
2178     // Set to empty pattern
2179     Pattern::Builder result = Pattern::Builder(status);
2180     CHECK_ERROR(status);
2181 
2182     // If still in bounds, then add the remaining input as a single text part
2183     // to the pattern
2184     /*
2185       TODO: this behavior isn't documented in the spec, but it comes from
2186       https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
2187       and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
2188       whether this is the intent behind the spec
2189      */
2190     UnicodeString partStr(LEFT_CURLY_BRACE);
2191     while (inBounds(source, index)) {
2192         partStr += source[index++];
2193     }
2194     // Add curly braces around the entire output (same comment as above)
2195     partStr += RIGHT_CURLY_BRACE;
2196     result.add(std::move(partStr), status);
2197     dataModel.setPattern(result.build(status));
2198 }
2199 
parseBody(UErrorCode & status)2200 void Parser::parseBody(UErrorCode& status) {
2201     CHECK_ERROR(status);
2202 
2203     // Out-of-input is a syntax warning
2204     if (!inBounds(source, index)) {
2205         errorPattern(status);
2206         return;
2207     }
2208 
2209     // Body must be either a pattern or selectors
2210     switch (source[index]) {
2211     case LEFT_CURLY_BRACE: {
2212         // Pattern
2213         dataModel.setPattern(parseQuotedPattern(status));
2214         break;
2215     }
2216     case ID_MATCH[0]: {
2217         // Selectors
2218         parseSelectors(status);
2219         return;
2220     }
2221     default: {
2222         ERROR(parseError, status, index);
2223         errorPattern(status);
2224         return;
2225     }
2226     }
2227 }
2228 
2229 // -------------------------------------
2230 // Parses the source pattern.
2231 
parse(UParseError & parseErrorResult,UErrorCode & status)2232 void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
2233     CHECK_ERROR(status);
2234 
2235     bool simple = true;
2236     // Message can be empty, so we need to only look ahead
2237     // if we know it's non-empty
2238     if (inBounds(source, index)) {
2239         if (source[index] == PERIOD
2240             || (index < ((uint32_t) source.length() + 1)
2241                 && source[index] == LEFT_CURLY_BRACE
2242                 && source[index + 1] == LEFT_CURLY_BRACE)) {
2243             // A complex message begins with a '.' or '{'
2244             parseDeclarations(status);
2245             parseBody(status);
2246             simple = false;
2247         }
2248     }
2249     if (simple) {
2250         // Simple message
2251         // For normalization, quote the pattern
2252         normalizedInput += LEFT_CURLY_BRACE;
2253         normalizedInput += LEFT_CURLY_BRACE;
2254         dataModel.setPattern(parseSimpleMessage(status));
2255         normalizedInput += RIGHT_CURLY_BRACE;
2256         normalizedInput += RIGHT_CURLY_BRACE;
2257     }
2258 
2259     CHECK_ERROR(status);
2260 
2261     // There are no errors; finally, check that the entire input was consumed
2262     if (((int32_t)index) != source.length()) {
2263       ERROR(parseError, status, index);
2264     }
2265 
2266     // Finally, copy the relevant fields of the internal `MessageParseError`
2267     // into the `UParseError` argument
2268     translateParseError(parseError, parseErrorResult);
2269 }
2270 
~Parser()2271 Parser::~Parser() {}
2272 
2273 } // namespace message2
2274 U_NAMESPACE_END
2275 
2276 #endif /* #if !UCONFIG_NO_MF2 */
2277 
2278 #endif /* #if !UCONFIG_NO_FORMATTING */
2279 
2280