xref: /aosp_15_r20/external/deqp/framework/xexml/xeXMLParser.cpp (revision 35238bce31c2a825756842865a792f8cf7f89930)
1 /*-------------------------------------------------------------------------
2  * drawElements Quality Program Test Executor
3  * ------------------------------------------
4  *
5  * Copyright 2014 The Android Open Source Project
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *      http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  *//*!
20  * \file
21  * \brief XML Parser.
22  *//*--------------------------------------------------------------------*/
23 
24 #include "xeXMLParser.hpp"
25 #include "deInt32.h"
26 
27 namespace xe
28 {
29 namespace xml
30 {
31 
// Tuning constants for the tokenizer.
enum
{
    //! Size passed to the tokenizer's buffer when the tokenizer is constructed.
    TOKENIZER_INITIAL_BUFFER_SIZE = 1024
};
36 
/*--------------------------------------------------------------------*//*!
 * \brief Check whether a character may begin an identifier.
 * \param ch Character code to classify.
 * \return true if ch lies in 'a'..'z' or 'A'..'Z' (both bounds inclusive).
 *//*--------------------------------------------------------------------*/
static inline bool isIdentifierStartChar(int ch)
{
    const bool isLowerCase = ('a' <= ch) && (ch <= 'z');
    const bool isUpperCase = ('A' <= ch) && (ch <= 'Z');
    return isLowerCase || isUpperCase;
}
41 
isIdentifierChar(int ch)42 static inline bool isIdentifierChar(int ch)
43 {
44     return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_');
45 }
46 
/*--------------------------------------------------------------------*//*!
 * \brief Check whether a character is treated as whitespace.
 * \param ch Character code to classify.
 * \return true for space, tab, carriage return and line feed.
 *//*--------------------------------------------------------------------*/
static inline bool isWhitespaceChar(int ch)
{
    switch (ch)
    {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;

    default:
        return false;
    }
}
51 
getNextBufferSize(int curSize,int minNewSize)52 static int getNextBufferSize(int curSize, int minNewSize)
53 {
54     return de::max(curSize * 2, 1 << deLog2Ceil32(minNewSize));
55 }
56 
Tokenizer(void)57 Tokenizer::Tokenizer(void)
58     : m_curToken(TOKEN_INCOMPLETE)
59     , m_curTokenLen(0)
60     , m_state(STATE_DATA)
61     , m_buf(TOKENIZER_INITIAL_BUFFER_SIZE)
62 {
63 }
64 
~Tokenizer(void)65 Tokenizer::~Tokenizer(void)
66 {
67 }
68 
clear(void)69 void Tokenizer::clear(void)
70 {
71     m_curToken    = TOKEN_INCOMPLETE;
72     m_curTokenLen = 0;
73     m_state       = STATE_DATA;
74     m_buf.clear();
75 }
76 
error(const std::string & what)77 void Tokenizer::error(const std::string &what)
78 {
79     throw ParseError(what);
80 }
81 
feed(const uint8_t * bytes,int numBytes)82 void Tokenizer::feed(const uint8_t *bytes, int numBytes)
83 {
84     // Grow buffer if necessary.
85     if (m_buf.getNumFree() < numBytes)
86     {
87         m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements() + numBytes));
88     }
89 
90     // Append to front.
91     m_buf.pushFront(bytes, numBytes);
92 
93     // If we haven't parsed complete token, re-try after data feed.
94     if (m_curToken == TOKEN_INCOMPLETE)
95         advance();
96 }
97 
getChar(int offset) const98 int Tokenizer::getChar(int offset) const
99 {
100     DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements()));
101 
102     if (offset < m_buf.getNumElements())
103         return m_buf.peekBack(offset);
104     else
105         return END_OF_BUFFER;
106 }
107 
advance(void)108 void Tokenizer::advance(void)
109 {
110     if (m_curToken != TOKEN_INCOMPLETE)
111     {
112         // Parser should not try to advance beyond end of string.
113         DE_ASSERT(m_curToken != TOKEN_END_OF_STRING);
114 
115         // If current token is tag end, change state to data.
116         if (m_curToken == TOKEN_TAG_END || m_curToken == TOKEN_EMPTY_ELEMENT_END ||
117             m_curToken == TOKEN_PROCESSING_INSTRUCTION_END || m_curToken == TOKEN_COMMENT || m_curToken == TOKEN_ENTITY)
118             m_state = STATE_DATA;
119 
120         // Advance buffer by length of last token.
121         m_buf.popBack(m_curTokenLen);
122 
123         // Reset state.
124         m_curToken    = TOKEN_INCOMPLETE;
125         m_curTokenLen = 0;
126 
127         // If we hit end of string here, report it as end of string.
128         if (getChar(0) == END_OF_STRING)
129         {
130             m_curToken    = TOKEN_END_OF_STRING;
131             m_curTokenLen = 1;
132             return;
133         }
134     }
135 
136     int curChar = getChar(m_curTokenLen);
137 
138     for (;;)
139     {
140         if (m_state == STATE_DATA)
141         {
142             // Advance until we hit end of buffer or tag start and treat that as data token.
143             if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&')
144             {
145                 if (curChar == '<')
146                     m_state = STATE_TAG;
147                 else if (curChar == '&')
148                     m_state = STATE_ENTITY;
149 
150                 if (m_curTokenLen > 0)
151                 {
152                     // Report data token.
153                     m_curToken = TOKEN_DATA;
154                     return;
155                 }
156                 else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER)
157                 {
158                     // Just return incomplete token, no data parsed.
159                     return;
160                 }
161                 else
162                 {
163                     DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY);
164                     continue;
165                 }
166             }
167         }
168         else
169         {
170             // Eat all whitespace if present.
171             if (m_curTokenLen == 0)
172             {
173                 while (isWhitespaceChar(curChar))
174                 {
175                     m_buf.popBack();
176                     curChar = getChar(0);
177                 }
178             }
179 
180             // Handle end of string / buffer.
181             if (curChar == END_OF_STRING)
182                 error("Unexpected end of string");
183             else if (curChar == (int)END_OF_BUFFER)
184             {
185                 DE_ASSERT(m_curToken == TOKEN_INCOMPLETE);
186                 return;
187             }
188 
189             if (m_curTokenLen == 0)
190             {
191                 // Expect start of identifier, value or special tag token.
192                 if (curChar == '\'' || curChar == '"')
193                     m_state = STATE_VALUE;
194                 else if (isIdentifierStartChar(curChar))
195                     m_state = STATE_IDENTIFIER;
196                 else if (curChar == '<' || curChar == '?' || curChar == '/')
197                     m_state = STATE_TAG;
198                 else if (curChar == '&')
199                     DE_ASSERT(m_state == STATE_ENTITY);
200                 else if (curChar == '=')
201                 {
202                     m_curToken    = TOKEN_EQUAL;
203                     m_curTokenLen = 1;
204                     return;
205                 }
206                 else if (curChar == '>')
207                 {
208                     m_curToken    = TOKEN_TAG_END;
209                     m_curTokenLen = 1;
210                     return;
211                 }
212                 else
213                     error("Unexpected character");
214             }
215             else if (m_state == STATE_IDENTIFIER)
216             {
217                 if (!isIdentifierChar(curChar))
218                 {
219                     m_curToken = TOKEN_IDENTIFIER;
220                     return;
221                 }
222             }
223             else if (m_state == STATE_VALUE)
224             {
225                 // \todo [2012-06-07 pyry] Escapes.
226                 if (curChar == '\'' || curChar == '"')
227                 {
228                     // \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)?
229                     if (curChar != getChar(0))
230                         error("Mismatched quote");
231                     m_curToken = TOKEN_STRING;
232                     m_curTokenLen += 1;
233                     return;
234                 }
235             }
236             else if (m_state == STATE_COMMENT)
237             {
238                 DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state.
239 
240                 if (m_curTokenLen <= 3)
241                 {
242                     if (curChar != '-')
243                         error("Invalid comment start");
244                 }
245                 else
246                 {
247                     int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen - 2) : 0;
248                     int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen - 1) : 0;
249 
250                     if (prev2 == '-' && prev1 == '-')
251                     {
252                         if (curChar != '>')
253                             error("Invalid comment end");
254                         m_curToken = TOKEN_COMMENT;
255                         m_curTokenLen += 1;
256                         return;
257                     }
258                 }
259             }
260             else if (m_state == STATE_ENTITY)
261             {
262                 if (m_curTokenLen >= 1)
263                 {
264                     if (curChar == ';')
265                     {
266                         m_curToken = TOKEN_ENTITY;
267                         m_curTokenLen += 1;
268                         return;
269                     }
270                     else if (!de::inRange<int>(curChar, '0', '9') && !de::inRange<int>(curChar, 'a', 'z') &&
271                              !de::inRange<int>(curChar, 'A', 'Z'))
272                         error("Invalid entity");
273                 }
274             }
275             else
276             {
277                 // Special tokens are at most 2 characters.
278                 DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1);
279 
280                 int prevChar = getChar(m_curTokenLen - 1);
281 
282                 if (prevChar == '<')
283                 {
284                     // Tag start.
285                     if (curChar == '/')
286                     {
287                         m_curToken    = TOKEN_END_TAG_START;
288                         m_curTokenLen = 2;
289                         return;
290                     }
291                     else if (curChar == '?')
292                     {
293                         m_curToken    = TOKEN_PROCESSING_INSTRUCTION_START;
294                         m_curTokenLen = 2;
295                         return;
296                     }
297                     else if (curChar == '!')
298                     {
299                         m_state = STATE_COMMENT;
300                     }
301                     else
302                     {
303                         m_curToken    = TOKEN_TAG_START;
304                         m_curTokenLen = 1;
305                         return;
306                     }
307                 }
308                 else if (prevChar == '?')
309                 {
310                     if (curChar != '>')
311                         error("Invalid processing instruction end");
312                     m_curToken    = TOKEN_PROCESSING_INSTRUCTION_END;
313                     m_curTokenLen = 2;
314                     return;
315                 }
316                 else if (prevChar == '/')
317                 {
318                     if (curChar != '>')
319                         error("Invalid empty element end");
320                     m_curToken    = TOKEN_EMPTY_ELEMENT_END;
321                     m_curTokenLen = 2;
322                     return;
323                 }
324                 else
325                     error("Could not parse special token");
326             }
327         }
328 
329         m_curTokenLen += 1;
330         curChar = getChar(m_curTokenLen);
331     }
332 }
333 
getString(std::string & dst) const334 void Tokenizer::getString(std::string &dst) const
335 {
336     DE_ASSERT(m_curToken == TOKEN_STRING);
337     dst.resize(m_curTokenLen - 2);
338     for (int ndx = 0; ndx < m_curTokenLen - 2; ndx++)
339         dst[ndx] = m_buf.peekBack(ndx + 1);
340 }
341 
Parser(void)342 Parser::Parser(void) : m_element(ELEMENT_INCOMPLETE), m_state(STATE_DATA)
343 {
344 }
345 
~Parser(void)346 Parser::~Parser(void)
347 {
348 }
349 
clear(void)350 void Parser::clear(void)
351 {
352     m_tokenizer.clear();
353     m_elementName.clear();
354     m_attributes.clear();
355     m_attribName.clear();
356     m_entityValue.clear();
357 
358     m_element = ELEMENT_INCOMPLETE;
359     m_state   = STATE_DATA;
360 }
361 
error(const std::string & what)362 void Parser::error(const std::string &what)
363 {
364     throw ParseError(what);
365 }
366 
feed(const uint8_t * bytes,int numBytes)367 void Parser::feed(const uint8_t *bytes, int numBytes)
368 {
369     m_tokenizer.feed(bytes, numBytes);
370 
371     if (m_element == ELEMENT_INCOMPLETE)
372         advance();
373 }
374 
advance(void)375 void Parser::advance(void)
376 {
377     if (m_element == ELEMENT_START)
378         m_attributes.clear();
379 
380     // \note No token is advanced when element end is reported.
381     if (m_state == STATE_YIELD_EMPTY_ELEMENT_END)
382     {
383         DE_ASSERT(m_element == ELEMENT_START);
384         m_element = ELEMENT_END;
385         m_state   = STATE_DATA;
386         return;
387     }
388 
389     if (m_element != ELEMENT_INCOMPLETE)
390     {
391         m_tokenizer.advance();
392         m_element = ELEMENT_INCOMPLETE;
393     }
394 
395     for (;;)
396     {
397         Token curToken = m_tokenizer.getToken();
398 
399         // Skip comments.
400         while (curToken == TOKEN_COMMENT)
401         {
402             m_tokenizer.advance();
403             curToken = m_tokenizer.getToken();
404         }
405 
406         if (curToken == TOKEN_INCOMPLETE)
407         {
408             DE_ASSERT(m_element == ELEMENT_INCOMPLETE);
409             return;
410         }
411 
412         switch (m_state)
413         {
414         case STATE_ENTITY:
415             m_state = STATE_DATA;
416             // Fall-through
417 
418         case STATE_DATA:
419             switch (curToken)
420             {
421             case TOKEN_DATA:
422                 m_element = ELEMENT_DATA;
423                 return;
424 
425             case TOKEN_END_OF_STRING:
426                 m_element = ELEMENT_END_OF_STRING;
427                 return;
428 
429             case TOKEN_TAG_START:
430                 m_state = STATE_START_TAG_OPEN;
431                 break;
432 
433             case TOKEN_END_TAG_START:
434                 m_state = STATE_END_TAG_OPEN;
435                 break;
436 
437             case TOKEN_PROCESSING_INSTRUCTION_START:
438                 m_state = STATE_IN_PROCESSING_INSTRUCTION;
439                 break;
440 
441             case TOKEN_ENTITY:
442                 m_state   = STATE_ENTITY;
443                 m_element = ELEMENT_DATA;
444                 parseEntityValue();
445                 return;
446 
447             default:
448                 error("Unexpected token");
449             }
450             break;
451 
452         case STATE_IN_PROCESSING_INSTRUCTION:
453             if (curToken == TOKEN_PROCESSING_INSTRUCTION_END)
454                 m_state = STATE_DATA;
455             else if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING)
456                 error("Unexpected token in processing instruction");
457             break;
458 
459         case STATE_START_TAG_OPEN:
460             if (curToken != TOKEN_IDENTIFIER)
461                 error("Expected identifier");
462             m_tokenizer.getTokenStr(m_elementName);
463             m_state = STATE_ATTRIBUTE_LIST;
464             break;
465 
466         case STATE_END_TAG_OPEN:
467             if (curToken != TOKEN_IDENTIFIER)
468                 error("Expected identifier");
469             m_tokenizer.getTokenStr(m_elementName);
470             m_state = STATE_EXPECTING_END_TAG_CLOSE;
471             break;
472 
473         case STATE_EXPECTING_END_TAG_CLOSE:
474             if (curToken != TOKEN_TAG_END)
475                 error("Expected tag end");
476             m_state   = STATE_DATA;
477             m_element = ELEMENT_END;
478             return;
479 
480         case STATE_ATTRIBUTE_LIST:
481             if (curToken == TOKEN_IDENTIFIER)
482             {
483                 m_tokenizer.getTokenStr(m_attribName);
484                 m_state = STATE_EXPECTING_ATTRIBUTE_EQ;
485             }
486             else if (curToken == TOKEN_EMPTY_ELEMENT_END)
487             {
488                 m_state   = STATE_YIELD_EMPTY_ELEMENT_END;
489                 m_element = ELEMENT_START;
490                 return;
491             }
492             else if (curToken == TOKEN_TAG_END)
493             {
494                 m_state   = STATE_DATA;
495                 m_element = ELEMENT_START;
496                 return;
497             }
498             else
499                 error("Unexpected token");
500             break;
501 
502         case STATE_EXPECTING_ATTRIBUTE_EQ:
503             if (curToken != TOKEN_EQUAL)
504                 error("Expected '='");
505             m_state = STATE_EXPECTING_ATTRIBUTE_VALUE;
506             break;
507 
508         case STATE_EXPECTING_ATTRIBUTE_VALUE:
509             if (curToken != TOKEN_STRING)
510                 error("Expected value");
511             if (hasAttribute(m_attribName.c_str()))
512                 error("Duplicate attribute");
513 
514             m_tokenizer.getString(m_attributes[m_attribName]);
515             m_state = STATE_ATTRIBUTE_LIST;
516             break;
517 
518         default:
519             DE_ASSERT(false);
520         }
521 
522         m_tokenizer.advance();
523     }
524 }
525 
/*--------------------------------------------------------------------*//*!
 * \brief Map an entity reference to the character it stands for.
 * \param entity Full entity text including the leading '&' and the
 *               trailing ';' (for example "&lt;").
 * \return The character for &lt; &gt; &amp; &apos; or &quot;, or 0 if
 *         the entity is not one of those.
 *//*--------------------------------------------------------------------*/
static char getEntityValue(const std::string &entity)
{
    static const struct
    {
        const char *name;
        char value;
    } s_knownEntities[] = {
        {"&lt;", '<'}, {"&gt;", '>'}, {"&amp;", '&'}, {"&apos;", '\''}, {"&quot;", '"'},
    };

    for (const auto &candidate : s_knownEntities)
    {
        if (entity == candidate.name)
            return candidate.value;
    }

    // Unknown entity.
    return 0;
}
544 
parseEntityValue(void)545 void Parser::parseEntityValue(void)
546 {
547     DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY);
548 
549     std::string entity;
550     m_tokenizer.getTokenStr(entity);
551 
552     const char value = getEntityValue(entity);
553     if (value == 0)
554         error("Invalid entity '" + entity + "'");
555 
556     m_entityValue.resize(1);
557     m_entityValue[0] = value;
558 }
559 
560 } // namespace xml
561 } // namespace xe
562