1 // © 2024 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 #if !UCONFIG_NO_MF2
9
10 #include "messageformat2_errors.h"
11 #include "messageformat2_macros.h"
12 #include "messageformat2_parser.h"
13 #include "uvector.h" // U_ASSERT
14
15 U_NAMESPACE_BEGIN
16
17 namespace message2 {
18
19 using namespace pluralimpl;
20
21 using namespace data_model;
22
/*
    `ERROR()` records a syntax error, unless one has already been recorded:
    it stores the position derived from `index` in `parseError` (via
    `setParseError()`) and then adds a syntax error to `errors`.

    Only the first syntax error is kept; later invocations are no-ops.
    Control flow is not altered -- the caller decides whether to return.

    The expansion refers to `errors` and `setParseError` by name, so both
    must be in scope wherever the macro is used. It expands to a bare `if`
    statement, so it should be used inside braces when it appears in an
    `if`/`else` branch.
*/
#define ERROR(parseError, errorCode, index)                                                             \
    if (!errors.hasSyntaxError()) {                                                                     \
        setParseError(parseError, index);                                                               \
        errors.addSyntaxError(errorCode);                                                               \
    }
32
33 // Returns true iff `index` is a valid index for the string `source`
inBounds(const UnicodeString & source,uint32_t index)34 static bool inBounds(const UnicodeString &source, uint32_t index) {
35 return (((int32_t)index) < source.length());
36 }
37
38 // Increments the line number and updates the "characters seen before
39 // current line" count in `parseError`, iff `source[index]` is a newline
maybeAdvanceLine()40 void Parser::maybeAdvanceLine() {
41 if (source[index] == LF) {
42 parseError.line++;
43 // add 1 to index to get the number of characters seen so far
44 // (including the newline)
45 parseError.lengthBeforeCurrentLine = index + 1;
46 }
47 }
48
/*
    `CHECK_BOUNDS()` guards against running off the end of the input:
    if `index` is not a valid position in `source`, it signals a syntax
    error through `ERROR()` and returns from the enclosing function.
    When `index` is in bounds it has no effect.

    Because the expansion contains a plain `return;`, the macro can only be
    used inside functions returning `void`.
*/
#define CHECK_BOUNDS(source, index, parseError, errorCode)                                              \
    if (!inBounds(source, index)) {                                                                     \
        ERROR(parseError, errorCode, index);                                                            \
        return;                                                                                         \
    }
58
59 // -------------------------------------
60 // Helper functions
61
copyContext(const UChar in[U_PARSE_CONTEXT_LEN],UChar out[U_PARSE_CONTEXT_LEN])62 static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
63 for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) {
64 out[i] = in[i];
65 if (in[i] == '\0') {
66 break;
67 }
68 }
69 }
70
translateParseError(const MessageParseError & messageParseError,UParseError & parseError)71 /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
72 parseError.line = messageParseError.line;
73 parseError.offset = messageParseError.offset;
74 copyContext(messageParseError.preContext, parseError.preContext);
75 copyContext(messageParseError.postContext, parseError.postContext);
76 }
77
setParseError(MessageParseError & parseError,uint32_t index)78 /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
79 // Translate absolute to relative offset
80 parseError.offset = index // Start with total number of characters seen
81 - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line
82 // TODO: Fill this in with actual pre and post-context
83 parseError.preContext[0] = 0;
84 parseError.postContext[0] = 0;
85 }
86
87 // -------------------------------------
88 // Predicates
89
90 // Returns true if `c` is in the interval [`first`, `last`]
inRange(UChar32 c,UChar32 first,UChar32 last)91 static bool inRange(UChar32 c, UChar32 first, UChar32 last) {
92 U_ASSERT(first < last);
93 return c >= first && c <= last;
94 }
95
96 /*
97 The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:
98
99 `isContentChar()` : `content-char`
100 `isTextChar()` : `text-char`
101 `isReservedStart()` : `reserved-start`
102 `isReservedChar()` : `reserved-char`
103 `isAlpha()` : `ALPHA`
104 `isDigit()` : `DIGIT`
105 `isNameStart()` : `name-start`
106 `isNameChar()` : `name-char`
107 `isUnquotedStart()` : `unquoted-start`
108 `isQuotedChar()` : `quoted-char`
109 `isWhitespace()` : `s`
110 */
111
isContentChar(UChar32 c)112 static bool isContentChar(UChar32 c) {
113 return inRange(c, 0x0001, 0x0008) // Omit NULL, HTAB and LF
114 || inRange(c, 0x000B, 0x000C) // Omit CR
115 || inRange(c, 0x000E, 0x001F) // Omit SP
116 || inRange(c, 0x0021, 0x002D) // Omit '.'
117 || inRange(c, 0x002F, 0x003F) // Omit '@'
118 || inRange(c, 0x0041, 0x005B) // Omit '\'
119 || inRange(c, 0x005D, 0x007A) // Omit { | }
120 || inRange(c, 0x007E, 0xD7FF) // Omit surrogates
121 || inRange(c, 0xE000, 0x10FFFF);
122 }
123
124 // See `s` in the MessageFormat 2 grammar
isWhitespace(UChar32 c)125 inline bool isWhitespace(UChar32 c) {
126 switch (c) {
127 case SPACE:
128 case HTAB:
129 case CR:
130 case LF:
131 case IDEOGRAPHIC_SPACE:
132 return true;
133 default:
134 return false;
135 }
136 }
137
isTextChar(UChar32 c)138 static bool isTextChar(UChar32 c) {
139 return isContentChar(c)
140 || isWhitespace(c)
141 || c == PERIOD
142 || c == AT
143 || c == PIPE;
144 }
145
146 // Note: this doesn't distinguish between private-use
147 // and reserved, since the data model doesn't
isReservedStart(UChar32 c)148 static bool isReservedStart(UChar32 c) {
149 switch (c) {
150 case BANG:
151 case PERCENT:
152 case ASTERISK:
153 case PLUS:
154 case LESS_THAN:
155 case GREATER_THAN:
156 case QUESTION:
157 case TILDE:
158 // Private-use
159 case CARET:
160 case AMPERSAND:
161 return true;
162 default:
163 return false;
164 }
165 }
166
isReservedChar(UChar32 c)167 static bool isReservedChar(UChar32 c) {
168 return isContentChar(c) || c == PERIOD;
169 }
170
isReservedBodyStart(UChar32 c)171 static bool isReservedBodyStart(UChar32 c) {
172 return isReservedChar(c) || c == BACKSLASH || c == PIPE;
173 }
174
isAlpha(UChar32 c)175 static bool isAlpha(UChar32 c) { return inRange(c, 0x0041, 0x005A) || inRange(c, 0x0061, 0x007A); }
176
isDigit(UChar32 c)177 static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }
178
isNameStart(UChar32 c)179 static bool isNameStart(UChar32 c) {
180 return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) ||
181 inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) ||
182 inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
183 inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) ||
184 inRange(c, 0x10000, 0xEFFFF);
185 }
186
isNameChar(UChar32 c)187 static bool isNameChar(UChar32 c) {
188 return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
189 inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
190 }
191
isUnquotedStart(UChar32 c)192 static bool isUnquotedStart(UChar32 c) {
193 return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
194 inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
195 }
196
isQuotedChar(UChar32 c)197 static bool isQuotedChar(UChar32 c) {
198 return isContentChar(c)
199 || isWhitespace(c)
200 || c == PERIOD
201 || c == AT
202 || c == LEFT_CURLY_BRACE
203 || c == RIGHT_CURLY_BRACE;
204 }
205
206 // Returns true iff `c` can begin a `function` nonterminal
isFunctionStart(UChar32 c)207 static bool isFunctionStart(UChar32 c) {
208 switch (c) {
209 case COLON: {
210 return true;
211 }
212 default: {
213 return false;
214 }
215 }
216 }
217
218 // Returns true iff `c` can begin an `annotation` nonterminal
isAnnotationStart(UChar32 c)219 static bool isAnnotationStart(UChar32 c) {
220 return isFunctionStart(c) || isReservedStart(c);
221 }
222
223 // Returns true iff `c` can begin either a `reserved-char` or `reserved-escape`
224 // literal
reservedChunkFollows(UChar32 c)225 static bool reservedChunkFollows(UChar32 c) {
226 switch(c) {
227 // reserved-escape
228 case BACKSLASH:
229 // literal
230 case PIPE: {
231 return true;
232 }
233 default: {
234 // reserved-char
235 return (isReservedChar(c));
236 }
237 }
238 }
239
240 // Returns true iff `c` can begin a `literal` nonterminal
isLiteralStart(UChar32 c)241 static bool isLiteralStart(UChar32 c) {
242 return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c));
243 }
244
245 // Returns true iff `c` can begin a `key` nonterminal
isKeyStart(UChar32 c)246 static bool isKeyStart(UChar32 c) {
247 return (c == ASTERISK || isLiteralStart(c));
248 }
249
isDeclarationStart(const UnicodeString & source,int32_t index)250 inline bool isDeclarationStart(const UnicodeString& source, int32_t index) {
251 int32_t len = source.length();
252 int32_t next = index + 1;
253 return (source[index] == ID_LOCAL[0]
254 && next < len
255 && source[next] == ID_LOCAL[1])
256 || (source[index] == ID_INPUT[0]
257 && next < len
258 && source[next] == ID_INPUT[1]);
259 }
260
261 // -------------------------------------
262 // Parsing functions
263
264
265 /*
266 TODO: Since handling the whitespace ambiguities needs to be repeated
267 in several different places and is hard to factor out,
268 it probably would be better to replace the parser with a lexer + parser
269 to separate tokenizing from parsing, which would simplify the code significantly.
270 This has the disadvantage that there is no token grammar for MessageFormat,
271 so one would have to be invented that isn't a component of the spec.
272 */
273
274 /*
275 This is a recursive-descent scannerless parser that,
276 with a few exceptions, uses 1 character of lookahead.
277
278 This may not be an exhaustive list, as the additions of attributes and reserved
279 statements introduced several new ambiguities.
280
281 All but three of the exceptions involve ambiguities about the meaning of whitespace.
282 One ambiguity not involving whitespace is:
283 identifier -> namespace ":" name
284 vs.
285 identifier -> name
286
287 `namespace` and `name` can't be distinguished without arbitrary lookahead.
288 (For how this is handled, see parseIdentifier())
289
290 The second ambiguity not involving whitespace is:
291 complex-message -> *(declaration[s]) complex-body
292 -> declaration *(declaration[s]) complex-body
293 -> declaration complex-body
294 -> reserved-statement complex-body
295 -> .foo {$x} .match // ...
296 When processing the '.', arbitrary lookahead is required to distinguish the
297 arbitrary-length unsupported keyword from `.match`.
298 (For how this is handled, see parseDeclarations()).
299
300 The third ambiguity not involving whitespace is:
301 complex-message -> *(declaration [s]) complex-body
302 -> reserved-statement *(declaration [s]) complex-body
303 -> reserved-statement complex-body
304 -> reserved-statement quotedPattern
305 -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
306 -> reserved-keyword expression quoted-pattern
307 Example: .foo {1} {{1}}
308
309 Without lookahead, the opening '{' of the quoted pattern can't be distinguished
310 from the opening '{' of another expression in the unsupported statement.
311 (Though this only requires 1 character of lookahead.)
312
313 Otherwise:
314
315 There are at least seven ambiguities in the grammar that can't be resolved with finite
316 lookahead (since whitespace sequences can be arbitrarily long). They are resolved
317 with a form of backtracking (early exit). No state needs to be saved/restored
318 since whitespace doesn't affect the shape of the resulting parse tree, so it's
319 not true backtracking.
320
321 In addition, the grammar has been refactored
322 in a semantics-preserving way in some cases to make the code easier to structure.
323
324 First: variant = when 1*(s key) [s] pattern
325 Example: when k {a}
326 When reading the first space after 'k', it's ambiguous whether it's the
327 required space before another key, or the optional space before `pattern`.
328 (See comments in parseNonEmptyKeys())
329
330 Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
331 annotation = (function *(s option)) / reserved
332 Example: {:f }
333 When reading the first space after 'f', it's ambiguous whether it's the
334 required space before an option, or the optional trailing space after an options list
335 (in this case, the options list is empty).
336 (See comments in parseOptions() -- handling this case also meant it was easier to base
337 the code on a slightly refactored grammar, which should be semantically equivalent.)
338
339 Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
340 annotation = (function *(s option)) / reserved
341 Example: {@a }
342 Similar to the previous case; see comments in parseReserved()
343
344 Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
345 Example: {|foo| }
346 When reading the first space after the '|', it's ambiguous whether it's the required
347 space before an annotation, or the optional trailing space before the '}'.
348 (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
349 the same grammar refactoring as the second exception.)
350
351 Most functions match a non-terminal in the grammar, except as explained
352 in comments.
353
354 Fifth: matcher = match-statement 1*([s] variant)
355 -> match 1 *([s] selector) 1*([s] variant)
356 Example: match {42} * {{_}}
357 When reading the space after the first '}', it's unclear whether
358 it's the optional space before another selector, or the optional space
359 before a variant.
360
361 Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
362 -> "{" [s] function *(s attribute) [s] "}"
363 -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
364 -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"
365
366 Example: {:func @foo}
367 (Note: the same ambiguity is present with variable-expression and literal-expression)
368
369 Seventh:
370
371
372 When parsing the space, it's unclear whether it's the optional space before an
373 option, or the optional space before an attribute.
374
375 Unless otherwise noted in a comment, all helper functions that take
376 a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
377 have the precondition:
378 `index` < `source.length()`
379 and the postcondition:
    `U_FAILURE(errorCode)` || `index` < `source.length()`
381 */
382
383 /*
384 No pre, no post.
385 A message may end with whitespace, so `index` may equal `source.length()` on exit.
386 */
parseWhitespaceMaybeRequired(bool required,UErrorCode & errorCode)387 void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) {
388 bool sawWhitespace = false;
389
390 // The loop exits either when we consume all the input,
391 // or when we see a non-whitespace character.
392 while (true) {
393 // Check if all input has been consumed
394 if (!inBounds(source, index)) {
395 // If whitespace isn't required -- or if we saw it already --
396 // then the caller is responsible for checking this case and
397 // setting an error if necessary.
398 if (!required || sawWhitespace) {
399 // Not an error.
400 return;
401 }
402 // Otherwise, whitespace is required; the end of the input has
403 // been reached without whitespace. This is an error.
404 ERROR(parseError, errorCode, index);
405 return;
406 }
407
408 // Input remains; process the next character if it's whitespace,
409 // exit the loop otherwise
410 if (isWhitespace(source[index])) {
411 sawWhitespace = true;
412 // Increment line number in parse error if we consume a newline
413 maybeAdvanceLine();
414 index++;
415 } else {
416 break;
417 }
418 }
419
420 if (!sawWhitespace && required) {
421 ERROR(parseError, errorCode, index);
422 }
423 }
424
425 /*
426 No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
427 */
parseRequiredWhitespace(UErrorCode & errorCode)428 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
429 parseWhitespaceMaybeRequired(true, errorCode);
430 normalizedInput += SPACE;
431 }
432
433 /*
434 No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
435 */
parseOptionalWhitespace(UErrorCode & errorCode)436 void Parser::parseOptionalWhitespace(UErrorCode& errorCode) {
437 parseWhitespaceMaybeRequired(false, errorCode);
438 }
439
440 // Consumes a single character, signaling an error if `source[index]` != `c`
441 // No postcondition -- a message can end with a '}' token
parseToken(UChar32 c,UErrorCode & errorCode)442 void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
443 CHECK_BOUNDS(source, index, parseError, errorCode);
444
445 if (source[index] == c) {
446 index++;
447 normalizedInput += c;
448 return;
449 }
450 // Next character didn't match -- error out
451 ERROR(parseError, errorCode, index);
452 }
453
454 /*
455 Consumes a fixed-length token, signaling an error if the token isn't a prefix of
456 the string beginning at `source[index]`
457 No postcondition -- a message can end with a '}' token
458 */
459 template <int32_t N>
parseToken(const UChar32 (& token)[N],UErrorCode & errorCode)460 void Parser::parseToken(const UChar32 (&token)[N], UErrorCode& errorCode) {
461 U_ASSERT(inBounds(source, index));
462
463 int32_t tokenPos = 0;
464 while (tokenPos < N - 1) {
465 if (source[index] != token[tokenPos]) {
466 ERROR(parseError, errorCode, index);
467 return;
468 }
469 normalizedInput += token[tokenPos];
470 index++;
471 tokenPos++;
472 }
473 }
474
475 /*
476 Consumes optional whitespace, possibly advancing `index` to `index'`,
477 then consumes a fixed-length token (signaling an error if the token isn't a prefix of
478 the string beginning at `source[index']`),
479 then consumes optional whitespace again
480 */
481 template <int32_t N>
parseTokenWithWhitespace(const UChar32 (& token)[N],UErrorCode & errorCode)482 void Parser::parseTokenWithWhitespace(const UChar32 (&token)[N], UErrorCode& errorCode) {
483 // No need for error check or bounds check before parseOptionalWhitespace
484 parseOptionalWhitespace(errorCode);
485 // Establish precondition
486 CHECK_BOUNDS(source, index, parseError, errorCode);
487 parseToken(token);
488 parseOptionalWhitespace(errorCode);
489 // Guarantee postcondition
490 CHECK_BOUNDS(source, index, parseError, errorCode);
491 }
492
493 /*
494 Consumes optional whitespace, possibly advancing `index` to `index'`,
495 then consumes a single character (signaling an error if it doesn't match
496 `source[index']`),
497 then consumes optional whitespace again
498 */
parseTokenWithWhitespace(UChar32 c,UErrorCode & errorCode)499 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
500 // No need for error check or bounds check before parseOptionalWhitespace(errorCode)
501 parseOptionalWhitespace(errorCode);
502 // Establish precondition
503 CHECK_BOUNDS(source, index, parseError, errorCode);
504 parseToken(c, errorCode);
505 parseOptionalWhitespace(errorCode);
506 // Guarantee postcondition
507 CHECK_BOUNDS(source, index, parseError, errorCode);
508 }
509
510 /*
511 Consumes a non-empty sequence of `name-char`s, the first of which is
512 also a `name-start`.
513 that begins with a character `start` such that `isNameStart(start)`.
514
515 Returns this sequence.
516
517 (Matches the `name` nonterminal in the grammar.)
518 */
parseName(UErrorCode & errorCode)519 UnicodeString Parser::parseName(UErrorCode& errorCode) {
520 UnicodeString name;
521
522 U_ASSERT(inBounds(source, index));
523
524 if (!isNameStart(source[index])) {
525 ERROR(parseError, errorCode, index);
526 return name;
527 }
528
529 while (isNameChar(source[index])) {
530 name += source[index];
531 normalizedInput += source[index];
532 index++;
533 if (!inBounds(source, index)) {
534 ERROR(parseError, errorCode, index);
535 break;
536 }
537 }
538 return name;
539 }
540
541 /*
542 Consumes a '$' followed by a `name`, returning a VariableName
543 with `name` as its name
544
545 (Matches the `variable` nonterminal in the grammar.)
546 */
parseVariableName(UErrorCode & errorCode)547 VariableName Parser::parseVariableName(UErrorCode& errorCode) {
548 VariableName result;
549
550 U_ASSERT(inBounds(source, index));
551 // If the '$' is missing, we don't want a binding
552 // for this variable to be created.
553 bool valid = source[index] == DOLLAR;
554 parseToken(DOLLAR, errorCode);
555 if (!inBounds(source, index)) {
556 ERROR(parseError, errorCode, index);
557 return result;
558 }
559 UnicodeString varName = parseName(errorCode);
560 // Set the name to "" if the variable wasn't
561 // declared correctly
562 if (!valid) {
563 varName.remove();
564 }
565 return VariableName(varName);
566 }
567
568 /*
569 Corresponds to the `identifier` nonterminal in the grammar
570 */
parseIdentifier(UErrorCode & errorCode)571 UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
572 U_ASSERT(inBounds(source, index));
573
574 UnicodeString result;
575 // The following is a hack to get around ambiguity in the grammar:
576 // identifier -> namespace ":" name
577 // vs.
578 // identifier -> name
579 // can't be distinguished without arbitrary lookahead.
580 // Instead, we treat the production as:
581 // identifier -> namespace *(":"name)
582 // and then check for multiple colons.
583
584 // Parse namespace
585 result += parseName(errorCode);
586 int32_t firstColon = -1;
587 while (inBounds(source, index) && source[index] == COLON) {
588 // Parse ':' separator
589 if (firstColon == -1) {
590 firstColon = index;
591 }
592 parseToken(COLON, errorCode);
593 result += COLON;
594 // Check for message ending with something like "foo:"
595 if (!inBounds(source, index)) {
596 ERROR(parseError, errorCode, index);
597 } else {
598 // Parse name part
599 result += parseName(errorCode);
600 }
601 }
602
603 // If there's at least one ':', scan from the first ':'
604 // to the end of the name to check for multiple ':'s
605 if (firstColon != -1) {
606 for (int32_t i = firstColon + 1; i < result.length(); i++) {
607 if (result[i] == COLON) {
608 ERROR(parseError, errorCode, i);
609 return {};
610 }
611 }
612 }
613
614 return result;
615 }
616
617 /*
618 Consumes a reference to a function, matching the ": identifier"
619 in the `function` nonterminal in the grammar.
620
621 Returns the function name.
622 */
parseFunction(UErrorCode & errorCode)623 FunctionName Parser::parseFunction(UErrorCode& errorCode) {
624 U_ASSERT(inBounds(source, index));
625 if (!isFunctionStart(source[index])) {
626 ERROR(parseError, errorCode, index);
627 return FunctionName();
628 }
629
630 normalizedInput += source[index];
631 index++; // Consume the function start character
632 if (!inBounds(source, index)) {
633 ERROR(parseError, errorCode, index);
634 return FunctionName();
635 }
636 return parseIdentifier(errorCode);
637 }
638
639
640 /*
641 Precondition: source[index] == BACKSLASH
642
643 Consume an escaped character.
644
645 Generalized to handle `reserved-escape`, `text-escape`,
646 or `literal-escape`, depending on the `kind` argument.
647
648 Appends result to `str`
649 */
parseEscapeSequence(EscapeKind kind,UnicodeString & str,UErrorCode & errorCode)650 void Parser::parseEscapeSequence(EscapeKind kind,
651 UnicodeString &str,
652 UErrorCode& errorCode) {
653 U_ASSERT(inBounds(source, index));
654 U_ASSERT(source[index] == BACKSLASH);
655 normalizedInput += BACKSLASH;
656 index++; // Skip the initial backslash
657 CHECK_BOUNDS(source, index, parseError, errorCode);
658
659 #define SUCCEED \
660 /* Append to the output string */ \
661 str += source[index]; \
662 /* Update normalizedInput */ \
663 normalizedInput += source[index]; \
664 /* Consume the character */ \
665 index++; \
666 /* Guarantee postcondition */ \
667 CHECK_BOUNDS(source, index, parseError, errorCode); \
668 return;
669
670 // Expect a '{', '|' or '}'
671 switch (source[index]) {
672 case LEFT_CURLY_BRACE:
673 case RIGHT_CURLY_BRACE: {
674 // Allowed in a `text-escape` or `reserved-escape`
675 switch (kind) {
676 case TEXT:
677 case RESERVED: {
678 SUCCEED;
679 }
680 default: {
681 break;
682 }
683 }
684 break;
685 }
686 case PIPE: {
687 // Allowed in a `literal-escape` or `reserved-escape`
688 switch (kind) {
689 case LITERAL:
690 case RESERVED: {
691 SUCCEED;
692 }
693 default: {
694 break;
695 }
696 }
697 break;
698 }
699 case BACKSLASH: {
700 // Allowed in any escape sequence
701 SUCCEED;
702 }
703 default: {
704 // No other characters are allowed here
705 break;
706 }
707 }
708 // If control reaches here, there was an error
709 ERROR(parseError, errorCode, index);
710 }
711
712 /*
713 Consume an escaped pipe or backslash, matching the `literal-escape`
714 nonterminal in the grammar
715 */
parseLiteralEscape(UnicodeString & str,UErrorCode & errorCode)716 void Parser::parseLiteralEscape(UnicodeString &str, UErrorCode& errorCode) {
717 parseEscapeSequence(LITERAL, str, errorCode);
718 }
719
720
721 /*
722 Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.
723 */
parseQuotedLiteral(UErrorCode & errorCode)724 Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
725 bool error = false;
726
727 UnicodeString contents;
728 if (U_SUCCESS(errorCode)) {
729 // Parse the opening '|'
730 parseToken(PIPE, errorCode);
731 if (!inBounds(source, index)) {
732 ERROR(parseError, errorCode, index);
733 error = true;
734 } else {
735 // Parse the contents
736 bool done = false;
737 while (!done) {
738 if (source[index] == BACKSLASH) {
739 parseLiteralEscape(contents, errorCode);
740 } else if (isQuotedChar(source[index])) {
741 contents += source[index];
742 normalizedInput += source[index];
743 index++; // Consume this character
744 maybeAdvanceLine();
745 } else {
746 // Assume the sequence of literal characters ends here
747 done = true;
748 }
749 if (!inBounds(source, index)) {
750 ERROR(parseError, errorCode, index);
751 error = true;
752 break;
753 }
754 }
755 }
756 }
757
758 if (error) {
759 return {};
760 }
761
762 // Parse the closing '|'
763 parseToken(PIPE, errorCode);
764
765 return Literal(true, contents);
766 }
767
768 // Parse (1*DIGIT)
parseDigits(UErrorCode & errorCode)769 UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
770 if (U_FAILURE(errorCode)) {
771 return {};
772 }
773
774 U_ASSERT(isDigit(source[index]));
775
776 UnicodeString contents;
777 do {
778 contents += source[index];
779 normalizedInput += source[index];
780 index++;
781 if (!inBounds(source, index)) {
782 ERROR(parseError, errorCode, index);
783 return {};
784 }
785 } while (isDigit(source[index]));
786
787 return contents;
788 }
789 /*
790 Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.
791 */
parseUnquotedLiteral(UErrorCode & errorCode)792 Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
793 if (U_FAILURE(errorCode)) {
794 return {};
795 }
796
797 // unquoted -> name
798 if (isNameStart(source[index])) {
799 return Literal(false, parseName(errorCode));
800 }
801
802 // unquoted -> number
803 // Parse the contents
804 UnicodeString contents;
805
806 // Parse the sign
807 if (source[index] == HYPHEN) {
808 contents += source[index];
809 normalizedInput += source[index];
810 index++;
811 }
812 if (!inBounds(source, index)) {
813 ERROR(parseError, errorCode, index);
814 return {};
815 }
816
817 // Parse the integer part
818 if (source[index] == ((UChar32)0x0030) /* 0 */) {
819 contents += source[index];
820 normalizedInput += source[index];
821 index++;
822 } else if (isDigit(source[index])) {
823 contents += parseDigits(errorCode);
824 } else {
825 // Error -- nothing else can start a number literal
826 ERROR(parseError, errorCode, index);
827 return {};
828 }
829
830 // Parse the decimal point if present
831 if (source[index] == PERIOD) {
832 contents += source[index];
833 normalizedInput += source[index];
834 index++;
835 if (!inBounds(source, index)) {
836 ERROR(parseError, errorCode, index);
837 return {};
838 }
839 // Parse the fraction part
840 if (isDigit(source[index])) {
841 contents += parseDigits(errorCode);
842 } else {
843 // '.' not followed by digit is a parse error
844 ERROR(parseError, errorCode, index);
845 return {};
846 }
847 }
848
849 if (!inBounds(source, index)) {
850 ERROR(parseError, errorCode, index);
851 return {};
852 }
853
854 // Parse the exponent part if present
855 if (source[index] == UPPERCASE_E || source[index] == LOWERCASE_E) {
856 contents += source[index];
857 normalizedInput += source[index];
858 index++;
859 if (!inBounds(source, index)) {
860 ERROR(parseError, errorCode, index);
861 return {};
862 }
863 // Parse sign if present
864 if (source[index] == PLUS || source[index] == HYPHEN) {
865 contents += source[index];
866 normalizedInput += source[index];
867 index++;
868 if (!inBounds(source, index)) {
869 ERROR(parseError, errorCode, index);
870 return {};
871 }
872 }
873 // Parse exponent digits
874 if (!isDigit(source[index])) {
875 ERROR(parseError, errorCode, index);
876 return {};
877 }
878 contents += parseDigits(errorCode);
879 }
880
881 return Literal(false, contents);
882 }
883
884 /*
885 Consume and return a literal, matching the `literal` nonterminal in the grammar.
886 */
parseLiteral(UErrorCode & errorCode)887 Literal Parser::parseLiteral(UErrorCode& errorCode) {
888 Literal result;
889 if (!inBounds(source, index)) {
890 ERROR(parseError, errorCode, index);
891 } else {
892 if (source[index] == PIPE) {
893 result = parseQuotedLiteral(errorCode);
894 } else {
895 result = parseUnquotedLiteral(errorCode);
896 }
897 // Guarantee postcondition
898 if (!inBounds(source, index)) {
899 ERROR(parseError, errorCode, index);
900 }
901 }
902
903 return result;
904 }
905
906 /*
907 Consume a @name-value pair, matching the `attribute` nonterminal in the grammar.
908
909 Adds the option to `options`
910 */
911 template<class T>
parseAttribute(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)912 void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
913 U_ASSERT(inBounds(source, index));
914
915 U_ASSERT(source[index] == AT);
916 // Consume the '@'
917 parseToken(AT, errorCode);
918
919 // Parse LHS
920 UnicodeString lhs = parseIdentifier(errorCode);
921
922 // Prepare to "backtrack" to resolve ambiguity
923 // about whether whitespace precedes another
924 // attribute, or the '=' sign
925 int32_t savedIndex = index;
926 parseOptionalWhitespace(errorCode);
927
928 Operand rand;
929 if (source[index] == EQUALS) {
930 // Parse '='
931 parseTokenWithWhitespace(EQUALS, errorCode);
932
933 UnicodeString rhsStr;
934 // Parse RHS, which is either a literal or variable
935 switch (source[index]) {
936 case DOLLAR: {
937 rand = Operand(parseVariableName(errorCode));
938 break;
939 }
940 default: {
941 // Must be a literal
942 rand = Operand(parseLiteral(errorCode));
943 break;
944 }
945 }
946 U_ASSERT(!rand.isNull());
947 } else {
948 // attribute -> "@" identifier [[s] "=" [s]]
949 // Use null operand, which `rand` is already set to
950 // "Backtrack" by restoring the whitespace (if there was any)
951 index = savedIndex;
952 }
953
954 attrAdder.addAttribute(lhs, std::move(rand), errorCode);
955 }
956
957 /*
958 Consume a name-value pair, matching the `option` nonterminal in the grammar.
959
960 Adds the option to `optionList`
961 */
962 template<class T>
parseOption(OptionAdder<T> & addOption,UErrorCode & errorCode)963 void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
964 U_ASSERT(inBounds(source, index));
965
966 // Parse LHS
967 UnicodeString lhs = parseIdentifier(errorCode);
968
969 // Parse '='
970 parseTokenWithWhitespace(EQUALS, errorCode);
971
972 UnicodeString rhsStr;
973 Operand rand;
974 // Parse RHS, which is either a literal or variable
975 switch (source[index]) {
976 case DOLLAR: {
977 rand = Operand(parseVariableName(errorCode));
978 break;
979 }
980 default: {
981 // Must be a literal
982 rand = Operand(parseLiteral(errorCode));
983 break;
984 }
985 }
986 U_ASSERT(!rand.isNull());
987
988 // Finally, add the key=value mapping
989 // Use a local error code, check for duplicate option error and
990 // record it as with other errors
991 UErrorCode status = U_ZERO_ERROR;
992 addOption.addOption(lhs, std::move(rand), status);
993 if (U_FAILURE(status)) {
994 U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
995 errors.setDuplicateOptionName(errorCode);
996 }
997 }
998
999 /*
1000 Note: there are multiple overloads of parseOptions() for parsing
1001 options within markup, vs. within an expression, vs. parsing
1002 attributes. This should be refactored. TODO
1003 */
1004
1005 /*
1006 Consume optional whitespace followed by a sequence of options
1007 (possibly empty), separated by whitespace
1008 */
1009 template <class T>
parseOptions(OptionAdder<T> & addOption,UErrorCode & errorCode)1010 void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1011 // Early exit if out of bounds -- no more work is possible
1012 CHECK_BOUNDS(source, index, parseError, errorCode);
1013
1014 /*
1015 Arbitrary lookahead is required to parse option lists. To see why, consider
1016 these rules from the grammar:
1017
1018 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1019 annotation = (function *(s option)) / reserved
1020
1021 And this example:
1022 {:foo }
1023
1024 Derivation:
1025 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1026 -> "{" [s] annotation [s] "}"
1027 -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1028 -> "{" [s] function *(s option) [s] "}"
1029
1030 In this example, knowing whether to expect a '}' or the start of another option
1031 after the whitespace would require arbitrary lookahead -- in other words, which
1032 rule should we apply?
1033 *(s option) -> s option *(s option)
1034 or
1035 *(s option) ->
1036
1037 The same would apply to the example {:foo k=v } (note the trailing space after "v").
1038
1039 This is addressed using a form of backtracking and (to make the backtracking easier
1040 to apply) a slight refactoring to the grammar.
1041
1042 This code is written as if the grammar is:
1043 expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1044 annotation = (function *(s option) [s]) / (reserved [s])
1045
1046 Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1047 that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1048
1049 Note that when "backtracking" really just means early exit, since only whitespace
1050 is involved and there's no state to save.
1051
1052 There is a separate but similar ambiguity as to whether the space precedes
1053 an option or an attribute.
1054 */
1055
1056 while(true) {
1057 // If the next character is not whitespace, that means we've already
1058 // parsed the entire options list (which may have been empty) and there's
1059 // no trailing whitespace. In that case, exit.
1060 if (!isWhitespace(source[index])) {
1061 break;
1062 }
1063 int32_t firstWhitespace = index;
1064
1065 // In any case other than an empty options list, there must be at least
1066 // one whitespace character.
1067 parseRequiredWhitespace(errorCode);
1068 // Restore precondition
1069 CHECK_BOUNDS(source, index, parseError, errorCode);
1070
1071 // If a name character follows, then at least one more option remains
1072 // in the list.
1073 // Otherwise, we've consumed all the options and any trailing whitespace,
1074 // and can exit.
1075 // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1076 // so we back out to [s].
1077 if (!isNameStart(source[index])) {
1078 // We've consumed all the options (meaning that either we consumed non-empty
1079 // whitespace, or consumed at least one option.)
1080 // Done.
1081 // Remove the required whitespace from normalizedInput
1082 normalizedInput.truncate(normalizedInput.length() - 1);
1083 // "Backtrack" so as to leave the optional whitespace there
1084 // when parsing attributes
1085 index = firstWhitespace;
1086 break;
1087 }
1088 parseOption(addOption, errorCode);
1089 }
1090 }
1091
1092 /*
1093 Consume optional whitespace followed by a sequence of attributes
1094 (possibly empty), separated by whitespace
1095 */
1096 template<class T>
parseAttributes(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)1097 void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1098
1099 // Early exit if out of bounds -- no more work is possible
1100 if (!inBounds(source, index)) {
1101 ERROR(parseError, errorCode, index);
1102 return;
1103 }
1104
1105 /*
1106 Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1107 (See comment in parseOptions()).
1108 */
1109
1110 while(true) {
1111 // If the next character is not whitespace, that means we've already
1112 // parsed the entire attributes list (which may have been empty) and there's
1113 // no trailing whitespace. In that case, exit.
1114 if (!isWhitespace(source[index])) {
1115 break;
1116 }
1117
1118 // In any case other than an empty attributes list, there must be at least
1119 // one whitespace character.
1120 parseRequiredWhitespace(errorCode);
1121 // Restore precondition
1122 if (!inBounds(source, index)) {
1123 ERROR(parseError, errorCode, index);
1124 break;
1125 }
1126
1127 // If an '@' follows, then at least one more attribute remains
1128 // in the list.
1129 // Otherwise, we've consumed all the attributes and any trailing whitespace,
1130 // and can exit.
1131 // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1132 // so we back out to [s].
1133 if (source[index] != AT) {
1134 // We've consumed all the attributes (meaning that either we consumed non-empty
1135 // whitespace, or consumed at least one attribute.)
1136 // Done.
1137 // Remove the whitespace from normalizedInput
1138 normalizedInput.truncate(normalizedInput.length() - 1);
1139 break;
1140 }
1141 parseAttribute(attrAdder, errorCode);
1142 }
1143 }
1144
parseReservedEscape(UnicodeString & str,UErrorCode & errorCode)1145 void Parser::parseReservedEscape(UnicodeString &str, UErrorCode& errorCode) {
1146 parseEscapeSequence(RESERVED, str, errorCode);
1147 }
1148
1149 /*
1150 Consumes a non-empty sequence of reserved-chars, reserved-escapes, and
1151 literals (as in 1*(reserved-char / reserved-escape / literal) in the `reserved-body` rule)
1152
1153 Appends it to `str`
1154 */
parseReservedChunk(Reserved::Builder & result,UErrorCode & status)1155 void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) {
1156 CHECK_ERROR(status);
1157
1158 bool empty = true;
1159 UnicodeString chunk;
1160 while(reservedChunkFollows(source[index])) {
1161 empty = false;
1162 // reserved-char
1163 if (isReservedChar(source[index])) {
1164 chunk += source[index];
1165 normalizedInput += source[index];
1166 // consume the char
1167 index++;
1168 // Restore precondition
1169 CHECK_BOUNDS(source, index, parseError, status);
1170 continue;
1171 }
1172
1173 if (chunk.length() > 0) {
1174 result.add(Literal(false, chunk), status);
1175 chunk.setTo(u"", 0);
1176 }
1177
1178 if (source[index] == BACKSLASH) {
1179 // reserved-escape
1180 parseReservedEscape(chunk, status);
1181 result.add(Literal(false, chunk), status);
1182 chunk.setTo(u"", 0);
1183 } else if (source[index] == PIPE || isUnquotedStart(source[index])) {
1184 result.add(parseLiteral(status), status);
1185 } else {
1186 // The reserved chunk ends here
1187 break;
1188 }
1189
1190 CHECK_ERROR(status); // Avoid looping infinitely
1191 }
1192
1193 // Add the last chunk if necessary
1194 if (chunk.length() > 0) {
1195 result.add(Literal(false, chunk), status);
1196 }
1197
1198 if (empty) {
1199 ERROR(parseError, status, index);
1200 }
1201 }
1202
1203 /*
1204 Consume a `reserved-start` character followed by a possibly-empty sequence
1205 of non-empty sequences of reserved characters, separated by whitespace.
1206 Matches the `reserved` nonterminal in the grammar
1207
1208 */
parseReserved(UErrorCode & status)1209 Reserved Parser::parseReserved(UErrorCode& status) {
1210 Reserved::Builder builder(status);
1211
1212 if (U_FAILURE(status)) {
1213 return {};
1214 }
1215
1216 U_ASSERT(inBounds(source, index));
1217
1218 // Require a `reservedStart` character
1219 if (!isReservedStart(source[index])) {
1220 ERROR(parseError, status, index);
1221 return Reserved();
1222 }
1223
1224 // Add the start char as a separate text chunk
1225 UnicodeString firstCharString(source[index]);
1226 builder.add(Literal(false, firstCharString), status);
1227 if (U_FAILURE(status)) {
1228 return {};
1229 }
1230 // Consume reservedStart
1231 normalizedInput += source[index];
1232 index++;
1233 return parseReservedBody(builder, status);
1234 }
1235
parseReservedBody(Reserved::Builder & builder,UErrorCode & status)1236 Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& status) {
1237 if (U_FAILURE(status)) {
1238 return {};
1239 }
1240
1241 /*
1242 Arbitrary lookahead is required to parse a `reserved`, for similar reasons
1243 to why it's required for parsing function annotations.
1244
1245 In the grammar:
1246
1247 annotation = (function *(s option)) / reserved
1248 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1249 reserved = reserved-start reserved-body
1250 reserved-body = *( [s] 1*(reserved-char / reserved-escape / literal))
1251
1252 When reading a whitespace character, it's ambiguous whether it's the optional
1253 whitespace in this rule, or the optional whitespace that precedes a '}' in an
1254 expression.
1255
1256 The ambiguity is resolved using the same grammar refactoring as shown in
1257 the comment in `parseOptions()`.
1258 */
1259 // Consume reserved characters / literals / reserved escapes
1260 // until a character that can't be in a `reserved-body` is seen
1261 while (true) {
1262 /*
1263 First, if there is whitespace, it means either a chunk follows it,
1264 or this is the trailing whitespace before the '}' that terminates an
1265 expression.
1266
1267 Next, if the next character can start a reserved-char, reserved-escape,
1268 or literal, then parse a "chunk" of reserved things.
1269 In any other case, we exit successfully, since per the refactored
1270 grammar rule:
1271 annotation = (function *(s option) [s]) / (reserved [s])
1272 it's valid to consume whitespace after a `reserved`.
1273 (`parseExpression()` is responsible for checking that the next
1274 character is in fact a '}'.)
1275 */
1276 if (!inBounds(source, index)) {
1277 break;
1278 }
1279 int32_t numWhitespaceChars = 0;
1280 int32_t savedIndex = index;
1281 if (isWhitespace(source[index])) {
1282 parseOptionalWhitespace(status);
1283 numWhitespaceChars = index - savedIndex;
1284 // Restore precondition
1285 if (!inBounds(source, index)) {
1286 break;
1287 }
1288 }
1289
1290 if (reservedChunkFollows(source[index])) {
1291 parseReservedChunk(builder, status);
1292
1293 // Avoid looping infinitely
1294 if (U_FAILURE(status) || !inBounds(source, index)) {
1295 break;
1296 }
1297 } else {
1298 if (numWhitespaceChars > 0) {
1299 if (source[index] == LEFT_CURLY_BRACE) {
1300 // Resolve even more ambiguity (space preceding another piece of
1301 // a `reserved-body`, vs. space preceding an expression in `reserved-statement`
1302 // "Backtrack"
1303 index -= numWhitespaceChars;
1304 break;
1305 }
1306 if (source[index] == RIGHT_CURLY_BRACE) {
1307 // Not an error: just means there's no trailing whitespace
1308 // after this `reserved`
1309 break;
1310 }
1311 if (source[index] == AT) {
1312 // Not an error, but we have to "backtrack" due to the ambiguity
1313 // between an `s` preceding another reserved chunk
1314 // and an `s` preceding an attribute list
1315 index -= numWhitespaceChars;
1316 break;
1317 }
1318 // Error: if there's whitespace, it must either be followed
1319 // by a non-empty sequence or by '}'
1320 ERROR(parseError, status, index);
1321 break;
1322 }
1323 // If there was no whitespace, it's not an error,
1324 // just the end of the reserved string
1325 break;
1326 }
1327 }
1328
1329 return builder.build(status);
1330 }
1331
1332 /*
1333 Consume a function call or reserved string, matching the `annotation`
1334 nonterminal in the grammar
1335
1336 Returns an `Operator` representing this (a reserved is a parse error)
1337 */
parseAnnotation(UErrorCode & status)1338 Operator Parser::parseAnnotation(UErrorCode& status) {
1339 U_ASSERT(inBounds(source, index));
1340 Operator::Builder ratorBuilder(status);
1341 if (U_FAILURE(status)) {
1342 return {};
1343 }
1344 if (isFunctionStart(source[index])) {
1345 // Consume the function name
1346 FunctionName func = parseFunction(status);
1347 ratorBuilder.setFunctionName(std::move(func));
1348
1349 OptionAdder<Operator::Builder> addOptions(ratorBuilder);
1350 // Consume the options (which may be empty)
1351 parseOptions(addOptions, status);
1352 } else {
1353 // Must be reserved
1354 // A reserved sequence is not a parse error, but might be a formatting error
1355 Reserved rator = parseReserved(status);
1356 ratorBuilder.setReserved(std::move(rator));
1357 }
1358 UErrorCode localStatus = U_ZERO_ERROR;
1359 Operator result = ratorBuilder.build(localStatus);
1360 // Either `setReserved` or `setFunctionName` was called,
1361 // so there shouldn't be an error.
1362 U_ASSERT(U_SUCCESS(localStatus));
1363 return result;
1364 }
1365
1366 /*
1367 Consume a literal or variable (depending on `isVariable`),
1368 followed by either required whitespace followed by an annotation,
1369 or optional whitespace.
1370 */
parseLiteralOrVariableWithAnnotation(bool isVariable,Expression::Builder & builder,UErrorCode & status)1371 void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
1372 Expression::Builder& builder,
1373 UErrorCode& status) {
1374 CHECK_ERROR(status);
1375
1376 U_ASSERT(inBounds(source, index));
1377
1378 Operand rand;
1379 if (isVariable) {
1380 rand = Operand(parseVariableName(status));
1381 } else {
1382 rand = Operand(parseLiteral(status));
1383 }
1384
1385 builder.setOperand(std::move(rand));
1386
1387 /*
1388 Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
1389 To see why, consider this rule from the grammar:
1390
1391 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1392
1393 And this example:
1394
1395 {|foo| }
1396
1397 Derivation:
1398 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1399 -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
1400 -> "{" [s] (literal [s annotation]) [s] "}"
1401
1402 When reading the ' ' after the second '|', it's ambiguous whether that's the required
1403 space before an annotation, or the optional space before the '}'.
1404
1405 To make this ambiguity easier to handle, this code is based on the same grammar
1406 refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
1407 the comment in `parseOptions()` for details.
1408 */
1409
1410 if (isWhitespace(source[index])) {
1411 int32_t firstWhitespace = index;
1412
1413 // If the next character is whitespace, either [s annotation] or [s] applies
1414 // (the character is either the required space before an annotation, or optional
1415 // trailing space after the literal or variable). It's still ambiguous which
1416 // one does apply.
1417 parseOptionalWhitespace(status);
1418 // Restore precondition
1419 CHECK_BOUNDS(source, index, parseError, status);
1420
1421 // This next check resolves the ambiguity between [s annotation] and [s]
1422 bool isSAnnotation = isAnnotationStart(source[index]);
1423
1424 if (isSAnnotation) {
1425 normalizedInput += SPACE;
1426 }
1427
1428 if (isSAnnotation) {
1429 // The previously consumed whitespace precedes an annotation
1430 builder.setOperator(parseAnnotation(status));
1431 } else {
1432 // Either there's a right curly brace (will be consumed by the caller),
1433 // or there's an error and the trailing whitespace should be
1434 // handled by the caller. However, this is not an error
1435 // here because we're just parsing `literal [s annotation]`.
1436 index = firstWhitespace;
1437 }
1438 } else {
1439 // Either there was never whitespace, or
1440 // the previously consumed whitespace is the optional trailing whitespace;
1441 // either the next character is '}' or the error will be handled by parseExpression.
1442 // Do nothing, since the operand was already set
1443 }
1444
1445 // At the end of this code, the next character should either be '}',
1446 // whitespace followed by a '}',
1447 // or end-of-input
1448 }
1449
1450 /*
1451 Consume an expression, matching the `expression` nonterminal in the grammar
1452 */
1453
exprFallback(Expression::Builder & exprBuilder)1454 static void exprFallback(Expression::Builder& exprBuilder) {
1455 // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER
1456 // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1457 exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1458 }
1459
exprFallback(UErrorCode & status)1460 static Expression exprFallback(UErrorCode& status) {
1461 Expression result;
1462 if (U_SUCCESS(status)) {
1463 Expression::Builder exprBuilder(status);
1464 if (U_SUCCESS(status)) {
1465 // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER
1466 // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1467 exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1468 UErrorCode status = U_ZERO_ERROR;
1469 result = exprBuilder.build(status);
1470 // An operand was set, so there can't be an error
1471 U_ASSERT(U_SUCCESS(status));
1472 }
1473 }
1474 return result;
1475 }
1476
parseExpression(UErrorCode & status)1477 Expression Parser::parseExpression(UErrorCode& status) {
1478 if (U_FAILURE(status)) {
1479 return {};
1480 }
1481
1482 // Early return if out of input -- no more work is possible
1483 U_ASSERT(inBounds(source, index));
1484
1485 // Parse opening brace
1486 parseToken(LEFT_CURLY_BRACE, status);
1487 // Optional whitespace after opening brace
1488 parseOptionalWhitespace(status);
1489
1490 Expression::Builder exprBuilder(status);
1491 // Restore precondition
1492 if (!inBounds(source, index)) {
1493 exprFallback(exprBuilder);
1494 } else {
1495 // literal '|', variable '$' or annotation
1496 switch (source[index]) {
1497 case PIPE: {
1498 // Quoted literal
1499 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1500 break;
1501 }
1502 case DOLLAR: {
1503 // Variable
1504 parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
1505 break;
1506 }
1507 default: {
1508 if (isAnnotationStart(source[index])) {
1509 Operator rator = parseAnnotation(status);
1510 exprBuilder.setOperator(std::move(rator));
1511 } else if (isUnquotedStart(source[index])) {
1512 // Unquoted literal
1513 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1514 } else {
1515 // Not a literal, variable or annotation -- error out
1516 ERROR(parseError, status, index);
1517 exprFallback(exprBuilder);
1518 break;
1519 }
1520 break;
1521 }
1522 }
1523 }
1524
1525 // Parse attributes
1526 AttributeAdder attrAdder(exprBuilder);
1527 parseAttributes(attrAdder, status);
1528
1529 // Parse optional space
1530 // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
1531 parseOptionalWhitespace(status);
1532
1533 // Either an operand or operator (or both) must have been set already,
1534 // so there can't be an error
1535 UErrorCode localStatus = U_ZERO_ERROR;
1536 Expression result = exprBuilder.build(localStatus);
1537 U_ASSERT(U_SUCCESS(localStatus));
1538
1539 // Check for end-of-input and missing '}'
1540 if (!inBounds(source, index)) {
1541 ERROR(parseError, status, index);
1542 } else {
1543 // Otherwise, it's safe to check for the '}'
1544 parseToken(RIGHT_CURLY_BRACE, status);
1545 }
1546 return result;
1547 }
1548
1549 /*
1550 Parse a .local declaration, matching the `local-declaration`
1551 production in the grammar
1552 */
parseLocalDeclaration(UErrorCode & status)1553 void Parser::parseLocalDeclaration(UErrorCode& status) {
1554 // End-of-input here would be an error; even empty
1555 // declarations must be followed by a body
1556 CHECK_BOUNDS(source, index, parseError, status);
1557
1558 parseToken(ID_LOCAL, status);
1559 parseRequiredWhitespace(status);
1560
1561 // Restore precondition
1562 CHECK_BOUNDS(source, index, parseError, status);
1563 VariableName lhs = parseVariableName(status);
1564 parseTokenWithWhitespace(EQUALS, status);
1565 // Restore precondition before calling parseExpression()
1566 CHECK_BOUNDS(source, index, parseError, status);
1567
1568 Expression rhs = parseExpression(status);
1569
1570 // Add binding from lhs to rhs, unless there was an error
1571 // (This ensures that if there was a correct lhs but a
1572 // parse error in rhs, the fallback for uses of the
1573 // lhs will be its own name rather than the rhs)
1574 /* This affects the behavior of this test case, which the spec
1575 is ambiguous about:
1576
1577 .local $bar {|foo|} {{{$bar}}}
1578
1579 Should `$bar` still be bound to a value although
1580 its declaration is syntactically incorrect (missing the '=')?
1581 This code says no, but it needs to change if
1582 https://github.com/unicode-org/message-format-wg/issues/703
1583 is resolved differently.
1584 */
1585 CHECK_ERROR(status);
1586 if (!errors.hasSyntaxError()) {
1587 dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status);
1588 // Check if status is U_DUPLICATE_DECLARATION_ERROR
1589 // and add that as an internal error if so
1590 if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1591 status = U_ZERO_ERROR;
1592 errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1593 }
1594 }
1595 }
1596
1597 /*
1598 Parse an .input declaration, matching the `local-declaration`
1599 production in the grammar
1600 */
parseInputDeclaration(UErrorCode & status)1601 void Parser::parseInputDeclaration(UErrorCode& status) {
1602 // End-of-input here would be an error; even empty
1603 // declarations must be followed by a body
1604 CHECK_BOUNDS(source, index, parseError, status);
1605
1606 parseToken(ID_INPUT, status);
1607 parseOptionalWhitespace(status);
1608
1609 // Restore precondition before calling parseExpression()
1610 CHECK_BOUNDS(source, index, parseError, status);
1611
1612 // Save the index for error diagnostics
1613 int32_t exprIndex = index;
1614 Expression rhs = parseExpression(status);
1615
1616 // Here we have to check that the rhs is a variable-expression
1617 if (!rhs.getOperand().isVariable()) {
1618 // This case is a syntax error; report it at the beginning
1619 // of the expression
1620 ERROR(parseError, status, exprIndex);
1621 return;
1622 }
1623
1624 VariableName lhs = rhs.getOperand().asVariable();
1625
1626 // Add binding from lhs to rhs
1627 // This just adds a new local variable that shadows the message
1628 // argument referred to, which is harmless.
1629 // When evaluating the RHS, the new local is not in scope
1630 // and the message argument will be correctly referred to.
1631 CHECK_ERROR(status);
1632 if (!errors.hasSyntaxError()) {
1633 dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status);
1634 // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR
1635 // and add that as an internal error if so
1636 if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1637 status = U_ZERO_ERROR;
1638 errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1639 }
1640 }
1641 }
1642
1643 /*
1644 Parses a `reserved-statement` per the grammar
1645 */
parseUnsupportedStatement(UErrorCode & status)1646 void Parser::parseUnsupportedStatement(UErrorCode& status) {
1647 U_ASSERT(inBounds(source, index) && source[index] == PERIOD);
1648
1649 UnsupportedStatement::Builder builder(status);
1650 CHECK_ERROR(status);
1651
1652 // Parse the keyword
1653 UnicodeString keyword(PERIOD);
1654 normalizedInput += UnicodeString(PERIOD);
1655 index++;
1656 keyword += parseName(status);
1657 builder.setKeyword(keyword);
1658
1659 // Parse the body, which is optional
1660 // Lookahead is required to distinguish the `s` in reserved-body
1661 // from the `s` in `[s] expression`
1662 // Next character may be:
1663 // * whitespace (followed by either a reserved-body start or
1664 // a '{')
1665 // * a '{'
1666
1667 CHECK_BOUNDS(source, index, parseError, status);
1668
1669 if (source[index] != LEFT_CURLY_BRACE) {
1670 if (!isWhitespace(source[index])) {
1671 ERROR(parseError, status, index);
1672 return;
1673 }
1674 // Expect a reserved-body start
1675 int32_t savedIndex = index;
1676 parseRequiredWhitespace(status);
1677 CHECK_BOUNDS(source, index, parseError, status);
1678 if (isReservedBodyStart(source[index])) {
1679 // There is a reserved body
1680 Reserved::Builder r(status);
1681 builder.setBody(parseReservedBody(r, status));
1682 } else {
1683 // No body -- backtrack so we can parse 1*([s] expression)
1684 index = savedIndex;
1685 normalizedInput.truncate(normalizedInput.length() - 1);
1686 }
1687 // Otherwise, the next character must be a '{'
1688 // to open the required expression (or optional whitespace)
1689 if (source[index] != LEFT_CURLY_BRACE && !isWhitespace(source[index])) {
1690 ERROR(parseError, status, index);
1691 return;
1692 }
1693 }
1694
1695 // Finally, parse the expressions
1696
1697 // Need to look ahead to disambiguate a '{' beginning
1698 // an expression from one beginning with a quoted pattern
1699 int32_t expressionCount = 0;
1700 while (source[index] == LEFT_CURLY_BRACE || isWhitespace(source[index])) {
1701 parseOptionalWhitespace(status);
1702
1703 bool nextIsLbrace = source[index] == LEFT_CURLY_BRACE;
1704 bool nextIsQuotedPattern = nextIsLbrace && inBounds(source, index + 1)
1705 && source[index + 1] == LEFT_CURLY_BRACE;
1706 if (nextIsQuotedPattern) {
1707 break;
1708 }
1709
1710 builder.addExpression(parseExpression(status), status);
1711 expressionCount++;
1712 }
1713 if (expressionCount <= 0) {
1714 // At least one expression is required
1715 ERROR(parseError, status, index);
1716 return;
1717 }
1718 dataModel.addUnsupportedStatement(builder.build(status), status);
1719 }
1720
1721 // Terrible hack to get around the ambiguity between `matcher` and `reserved-statement`
nextIsMatch() const1722 bool Parser::nextIsMatch() const {
1723 for(int32_t i = 0; i < 6; i++) {
1724 if (!inBounds(source, index + i) || source[index + i] != ID_MATCH[i]) {
1725 return false;
1726 }
1727 }
1728 return true;
1729 }
1730 /*
1731 Consume a possibly-empty sequence of declarations separated by whitespace;
1732 each declaration matches the `declaration` nonterminal in the grammar
1733
1734 Builds up an environment representing those declarations
1735 */
parseDeclarations(UErrorCode & status)1736 void Parser::parseDeclarations(UErrorCode& status) {
1737 // End-of-input here would be an error; even empty
1738 // declarations must be followed by a body
1739 CHECK_BOUNDS(source, index, parseError, status);
1740
1741 while (source[index] == PERIOD) {
1742 CHECK_BOUNDS(source, index + 1, parseError, status);
1743 if (source[index + 1] == ID_LOCAL[1]) {
1744 parseLocalDeclaration(status);
1745 } else if (source[index + 1] == ID_INPUT[1]) {
1746 parseInputDeclaration(status);
1747 } else {
1748 // Unsupported statement
1749 // Lookahead is needed to disambiguate this from a `match`
1750 if (!nextIsMatch()) {
1751 parseUnsupportedStatement(status);
1752 } else {
1753 // Done parsing declarations
1754 break;
1755 }
1756 }
1757
1758 // Avoid looping infinitely
1759 CHECK_ERROR(status);
1760
1761 parseOptionalWhitespace(status);
1762 // Restore precondition
1763 CHECK_BOUNDS(source, index, parseError, status);
1764 }
1765 }
1766
1767 /*
1768 Consume an escaped curly brace, or backslash, matching the `text-escape`
1769 nonterminal in the grammar
1770 */
parseTextEscape(UnicodeString & str,UErrorCode & status)1771 void Parser::parseTextEscape(UnicodeString &str, UErrorCode& status) {
1772 parseEscapeSequence(TEXT, str, status);
1773 }
1774
1775 /*
1776 Consume a non-empty sequence of text characters and escaped text characters,
1777 matching the `text` nonterminal in the grammar
1778
1779 No postcondition (a message can end with a text)
1780 */
parseText(UErrorCode & status)1781 UnicodeString Parser::parseText(UErrorCode& status) {
1782 UnicodeString str;
1783 if (!inBounds(source, index)) {
1784 // Text can be empty
1785 return str;
1786 }
1787
1788 if (!(isTextChar(source[index] || source[index] == BACKSLASH))) {
1789 // Error -- text is expected here
1790 ERROR(parseError, status, index);
1791 return str;
1792 }
1793
1794 while (true) {
1795 if (source[index] == BACKSLASH) {
1796 parseTextEscape(str, status);
1797 } else if (isTextChar(source[index])) {
1798 normalizedInput += source[index];
1799 str += source[index];
1800 index++;
1801 maybeAdvanceLine();
1802 } else {
1803 break;
1804 }
1805 if (!inBounds(source, index)) {
1806 // OK for text to end a message
1807 break;
1808 }
1809 }
1810
1811 return str;
1812 }
1813
1814 /*
1815 Consume an `nmtoken`, `literal`, or the string "*", matching
1816 the `key` nonterminal in the grammar
1817 */
parseKey(UErrorCode & status)1818 Key Parser::parseKey(UErrorCode& status) {
1819 U_ASSERT(inBounds(source, index));
1820
1821 Key k; // wildcard by default
1822 // Literal | '*'
1823 switch (source[index]) {
1824 case ASTERISK: {
1825 index++;
1826 normalizedInput += ASTERISK;
1827 // Guarantee postcondition
1828 if (!inBounds(source, index)) {
1829 ERROR(parseError, status, index);
1830 return k;
1831 }
1832 break;
1833 }
1834 default: {
1835 // Literal
1836 k = Key(parseLiteral(status));
1837 break;
1838 }
1839 }
1840 return k;
1841 }
1842
1843 /*
1844 Consume a non-empty sequence of `key`s separated by whitespace
1845
1846 Takes ownership of `keys`
1847 */
parseNonEmptyKeys(UErrorCode & status)1848 SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
1849 SelectorKeys result;
1850
1851 if (U_FAILURE(status)) {
1852 return result;
1853 }
1854
1855 U_ASSERT(inBounds(source, index));
1856
1857 /*
1858 Arbitrary lookahead is required to parse key lists. To see why, consider
1859 this rule from the grammar:
1860
1861 variant = key *(s key) [s] quoted-pattern
1862
1863 And this example:
1864 when k1 k2 {a}
1865
1866 Derivation:
1867 variant -> key *(s key) [s] quoted-pattern
1868 -> key s key *(s key) quoted-pattern
1869
1870 After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
1871 to know whether to expect the start of a pattern or the start of another key.
1872 In other words: is the second whitespace sequence the required space in *(s key),
1873 or the optional space in [s] quoted-pattern?
1874
1875 This is addressed using "backtracking" (similarly to `parseOptions()`).
1876 */
1877
1878 SelectorKeys::Builder keysBuilder(status);
1879 if (U_FAILURE(status)) {
1880 return result;
1881 }
1882
1883 // Since the first key is required, it's simplest to parse it separately.
1884 keysBuilder.add(parseKey(status), status);
1885
1886 // Restore precondition
1887 if (!inBounds(source, index)) {
1888 ERROR(parseError, status, index);
1889 return result;
1890 }
1891
1892 // We've seen at least one whitespace-key pair, so now we can parse
1893 // *(s key) [s]
1894 while (source[index] != LEFT_CURLY_BRACE || isWhitespace(source[index])) { // Try to recover from errors
1895 bool wasWhitespace = isWhitespace(source[index]);
1896 parseRequiredWhitespace(status);
1897 if (!wasWhitespace) {
1898 // Avoid infinite loop when parsing something like:
1899 // when * @{!...
1900 index++;
1901 }
1902
1903 // Restore precondition
1904 if (!inBounds(source, index)) {
1905 ERROR(parseError, status, index);
1906 return result;
1907 }
1908
1909 // At this point, it's ambiguous whether we are inside (s key) or [s].
1910 // This check resolves that ambiguity.
1911 if (source[index] == LEFT_CURLY_BRACE) {
1912 // A pattern follows, so what we just parsed was the optional
1913 // trailing whitespace. All the keys have been parsed.
1914
1915 // Unpush the whitespace from `normalizedInput`
1916 normalizedInput.truncate(normalizedInput.length() - 1);
1917 break;
1918 }
1919 keysBuilder.add(parseKey(status), status);
1920 }
1921
1922 return keysBuilder.build(status);
1923 }
1924
parseQuotedPattern(UErrorCode & status)1925 Pattern Parser::parseQuotedPattern(UErrorCode& status) {
1926 U_ASSERT(inBounds(source, index));
1927
1928 parseToken(LEFT_CURLY_BRACE, status);
1929 parseToken(LEFT_CURLY_BRACE, status);
1930 Pattern p = parseSimpleMessage(status);
1931 parseToken(RIGHT_CURLY_BRACE, status);
1932 parseToken(RIGHT_CURLY_BRACE, status);
1933 return p;
1934 }
1935
1936 /*
1937 Consume a `placeholder`, matching the nonterminal in the grammar
1938 No postcondition (a markup can end a message)
1939 */
parseMarkup(UErrorCode & status)1940 Markup Parser::parseMarkup(UErrorCode& status) {
1941 U_ASSERT(inBounds(source, index + 1));
1942
1943 U_ASSERT(source[index] == LEFT_CURLY_BRACE);
1944
1945 Markup::Builder builder(status);
1946 if (U_FAILURE(status)) {
1947 return {};
1948 }
1949
1950 // Consume the '{'
1951 index++;
1952 normalizedInput += LEFT_CURLY_BRACE;
1953 parseOptionalWhitespace(status);
1954 bool closing = false;
1955 switch (source[index]) {
1956 case NUMBER_SIGN: {
1957 // Open or standalone; consume the '#'
1958 normalizedInput += source[index];
1959 index++;
1960 break;
1961 }
1962 case SLASH: {
1963 // Closing
1964 normalizedInput += source[index];
1965 closing = true;
1966 index++;
1967 break;
1968 }
1969 default: {
1970 ERROR(parseError, status, index);
1971 return {};
1972 }
1973 }
1974
1975 // Parse the markup identifier
1976 builder.setName(parseIdentifier(status));
1977
1978 // Parse the options, which must begin with a ' '
1979 // if present
1980 if (inBounds(source, index) && isWhitespace(source[index])) {
1981 OptionAdder<Markup::Builder> optionAdder(builder);
1982 parseOptions(optionAdder, status);
1983 }
1984
1985 // Parse the attributes, which also must begin
1986 // with a ' '
1987 if (inBounds(source, index) && isWhitespace(source[index])) {
1988 AttributeAdder attrAdder(builder);
1989 parseAttributes(attrAdder, status);
1990 }
1991
1992 parseOptionalWhitespace(status);
1993
1994 bool standalone = false;
1995 // Check if this is a standalone or not
1996 if (!closing) {
1997 if (inBounds(source, index) && source[index] == SLASH) {
1998 standalone = true;
1999 normalizedInput += SLASH;
2000 index++;
2001 }
2002 }
2003
2004 parseToken(RIGHT_CURLY_BRACE, status);
2005
2006 if (standalone) {
2007 builder.setStandalone();
2008 } else if (closing) {
2009 builder.setClose();
2010 } else {
2011 builder.setOpen();
2012 }
2013
2014 return builder.build(status);
2015 }
2016
2017 /*
2018 Consume a `placeholder`, matching the nonterminal in the grammar
2019 No postcondition (a placeholder can end a message)
2020 */
parsePlaceholder(UErrorCode & status)2021 std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
2022 U_ASSERT(source[index] == LEFT_CURLY_BRACE);
2023
2024 if (!inBounds(source, index)) {
2025 ERROR(parseError, status, index);
2026 return exprFallback(status);
2027 }
2028
2029 // Check if it's markup or an expression
2030 if (source[index + 1] == NUMBER_SIGN || source[index + 1] == SLASH) {
2031 // Markup
2032 return parseMarkup(status);
2033 }
2034 return parseExpression(status);
2035 }
2036
2037 /*
2038 Consume a `simple-message`, matching the nonterminal in the grammar
2039 Postcondition: `index == source.length()` or U_FAILURE(status);
2040 for a syntactically correct message, this will consume the entire input
2041 */
parseSimpleMessage(UErrorCode & status)2042 Pattern Parser::parseSimpleMessage(UErrorCode& status) {
2043 Pattern::Builder result(status);
2044
2045 if (U_SUCCESS(status)) {
2046 Expression expression;
2047 while (inBounds(source, index)) {
2048 switch (source[index]) {
2049 case LEFT_CURLY_BRACE: {
2050 // Must be placeholder
2051 std::variant<Expression, Markup> piece = parsePlaceholder(status);
2052 if (std::holds_alternative<Expression>(piece)) {
2053 Expression expr = *std::get_if<Expression>(&piece);
2054 result.add(std::move(expr), status);
2055 } else {
2056 Markup markup = *std::get_if<Markup>(&piece);
2057 result.add(std::move(markup), status);
2058 }
2059 break;
2060 }
2061 default: {
2062 // Must be text
2063 result.add(parseText(status), status);
2064 break;
2065 }
2066 }
2067 if (source[index] == RIGHT_CURLY_BRACE) {
2068 // End of quoted pattern
2069 break;
2070 }
2071 // Don't loop infinitely
2072 if (errors.hasSyntaxError()) {
2073 break;
2074 }
2075 }
2076 }
2077 return result.build(status);
2078 }
2079
2080
2081 /*
2082 Consume a `selectors` (matching the nonterminal in the grammar),
2083 followed by a non-empty sequence of `variant`s (matching the nonterminal
2084 in the grammar) preceded by whitespace
2085 No postcondition (on return, `index` might equal `source.length()` with no syntax error
2086 because a message can end with a variant)
2087 */
parseSelectors(UErrorCode & status)2088 void Parser::parseSelectors(UErrorCode& status) {
2089 CHECK_ERROR(status);
2090
2091 U_ASSERT(inBounds(source, index));
2092
2093 parseToken(ID_MATCH, status);
2094
2095 bool empty = true;
2096 // Parse selectors
2097 // "Backtracking" is required here. It's not clear if whitespace is
2098 // (`[s]` selector) or (`[s]` variant)
2099 while (isWhitespace(source[index]) || source[index] == LEFT_CURLY_BRACE) {
2100 parseOptionalWhitespace(status);
2101 // Restore precondition
2102 CHECK_BOUNDS(source, index, parseError, status);
2103 if (source[index] != LEFT_CURLY_BRACE) {
2104 // This is not necessarily an error, but rather,
2105 // means the whitespace we parsed was the optional
2106 // whitespace preceding the first variant, not the
2107 // optional whitespace preceding a subsequent expression.
2108 break;
2109 }
2110 Expression expression;
2111 expression = parseExpression(status);
2112 empty = false;
2113
2114 dataModel.addSelector(std::move(expression), status);
2115 CHECK_ERROR(status);
2116 }
2117
2118 // At least one selector is required
2119 if (empty) {
2120 ERROR(parseError, status, index);
2121 return;
2122 }
2123
2124 #define CHECK_END_OF_INPUT \
2125 if (((int32_t)index) >= source.length()) { \
2126 break; \
2127 } \
2128
2129 // Parse variants
2130 while (isWhitespace(source[index]) || isKeyStart(source[index])) {
2131 if (isWhitespace(source[index])) {
2132 int32_t whitespaceStart = index;
2133 parseOptionalWhitespace(status);
2134 // Restore the precondition.
2135 // Error out if we reached the end of input. The message
2136 // cannot end with trailing whitespace if there are variants.
2137 if (!inBounds(source, index)) {
2138 // Use index of first whitespace for error message
2139 index = whitespaceStart;
2140 ERROR(parseError, status, index);
2141 return;
2142 }
2143 }
2144
2145 // At least one key is required
2146 SelectorKeys keyList(parseNonEmptyKeys(status));
2147
2148 CHECK_ERROR(status);
2149
2150 // parseNonEmptyKeys() consumes any trailing whitespace,
2151 // so the pattern can be consumed next.
2152
2153 // Restore precondition before calling parsePattern()
2154 // (which must return a non-null value)
2155 CHECK_BOUNDS(source, index, parseError, status);
2156 Pattern rhs = parseQuotedPattern(status);
2157
2158 dataModel.addVariant(std::move(keyList), std::move(rhs), status);
2159
2160 // Restore the precondition, *without* erroring out if we've
2161 // reached the end of input. That's because it's valid for the
2162 // message to end with a variant that has no trailing whitespace.
2163 // Why do we need to check this condition twice inside the loop?
2164 // Because if we don't check it here, the `isWhitespace()` call in
2165 // the loop head will read off the end of the input string.
2166 CHECK_END_OF_INPUT
2167 }
2168 }
2169
/*
  (This comment documents `parseBody()`, which is defined after the helper
  `errorPattern()` below.)
  Consume a `body` (matching the nonterminal in the grammar).
  No postcondition (on return, `index` might equal `source.length()` with no syntax error,
  because a message can end with a body; trailing whitespace is optional)
*/
2175
errorPattern(UErrorCode & status)2176 void Parser::errorPattern(UErrorCode& status) {
2177 errors.addSyntaxError(status);
2178 // Set to empty pattern
2179 Pattern::Builder result = Pattern::Builder(status);
2180 CHECK_ERROR(status);
2181
2182 // If still in bounds, then add the remaining input as a single text part
2183 // to the pattern
2184 /*
2185 TODO: this behavior isn't documented in the spec, but it comes from
2186 https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
2187 and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
2188 whether this is the intent behind the spec
2189 */
2190 UnicodeString partStr(LEFT_CURLY_BRACE);
2191 while (inBounds(source, index)) {
2192 partStr += source[index++];
2193 }
2194 // Add curly braces around the entire output (same comment as above)
2195 partStr += RIGHT_CURLY_BRACE;
2196 result.add(std::move(partStr), status);
2197 dataModel.setPattern(result.build(status));
2198 }
2199
parseBody(UErrorCode & status)2200 void Parser::parseBody(UErrorCode& status) {
2201 CHECK_ERROR(status);
2202
2203 // Out-of-input is a syntax warning
2204 if (!inBounds(source, index)) {
2205 errorPattern(status);
2206 return;
2207 }
2208
2209 // Body must be either a pattern or selectors
2210 switch (source[index]) {
2211 case LEFT_CURLY_BRACE: {
2212 // Pattern
2213 dataModel.setPattern(parseQuotedPattern(status));
2214 break;
2215 }
2216 case ID_MATCH[0]: {
2217 // Selectors
2218 parseSelectors(status);
2219 return;
2220 }
2221 default: {
2222 ERROR(parseError, status, index);
2223 errorPattern(status);
2224 return;
2225 }
2226 }
2227 }
2228
2229 // -------------------------------------
2230 // Parses the source pattern.
2231
parse(UParseError & parseErrorResult,UErrorCode & status)2232 void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
2233 CHECK_ERROR(status);
2234
2235 bool simple = true;
2236 // Message can be empty, so we need to only look ahead
2237 // if we know it's non-empty
2238 if (inBounds(source, index)) {
2239 if (source[index] == PERIOD
2240 || (index < ((uint32_t) source.length() + 1)
2241 && source[index] == LEFT_CURLY_BRACE
2242 && source[index + 1] == LEFT_CURLY_BRACE)) {
2243 // A complex message begins with a '.' or '{'
2244 parseDeclarations(status);
2245 parseBody(status);
2246 simple = false;
2247 }
2248 }
2249 if (simple) {
2250 // Simple message
2251 // For normalization, quote the pattern
2252 normalizedInput += LEFT_CURLY_BRACE;
2253 normalizedInput += LEFT_CURLY_BRACE;
2254 dataModel.setPattern(parseSimpleMessage(status));
2255 normalizedInput += RIGHT_CURLY_BRACE;
2256 normalizedInput += RIGHT_CURLY_BRACE;
2257 }
2258
2259 CHECK_ERROR(status);
2260
2261 // There are no errors; finally, check that the entire input was consumed
2262 if (((int32_t)index) != source.length()) {
2263 ERROR(parseError, status, index);
2264 }
2265
2266 // Finally, copy the relevant fields of the internal `MessageParseError`
2267 // into the `UParseError` argument
2268 translateParseError(parseError, parseErrorResult);
2269 }
2270
~Parser()2271 Parser::~Parser() {}
2272
2273 } // namespace message2
2274 U_NAMESPACE_END
2275
2276 #endif /* #if !UCONFIG_NO_MF2 */
2277
2278 #endif /* #if !UCONFIG_NO_FORMATTING */
2279
2280