1 /*-------------------------------------------------------------------------
2 * drawElements Quality Program Test Executor
3 * ------------------------------------------
4 *
5 * Copyright 2014 The Android Open Source Project
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 *//*!
20 * \file
21 * \brief XML Parser.
22 *//*--------------------------------------------------------------------*/
23
24 #include "xeXMLParser.hpp"
25 #include "deInt32.h"
26
27 namespace xe
28 {
29 namespace xml
30 {
31
// Size handed to the tokenizer's input buffer when a Tokenizer is constructed.
enum
{
    TOKENIZER_INITIAL_BUFFER_SIZE = 1 << 10
};
36
isIdentifierStartChar(int ch)37 static inline bool isIdentifierStartChar(int ch)
38 {
39 return de::inRange<int>(ch, 'a', 'z') || de::inRange<int>(ch, 'A', 'Z');
40 }
41
isIdentifierChar(int ch)42 static inline bool isIdentifierChar(int ch)
43 {
44 return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_');
45 }
46
// Tells whether ch is one of the whitespace characters the tokenizer skips between
// markup tokens: space, horizontal tab, carriage return or line feed.
static inline bool isWhitespaceChar(int ch)
{
    switch (ch)
    {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;

    default:
        return false;
    }
}
51
getNextBufferSize(int curSize,int minNewSize)52 static int getNextBufferSize(int curSize, int minNewSize)
53 {
54 return de::max(curSize * 2, 1 << deLog2Ceil32(minNewSize));
55 }
56
Tokenizer(void)57 Tokenizer::Tokenizer(void)
58 : m_curToken(TOKEN_INCOMPLETE)
59 , m_curTokenLen(0)
60 , m_state(STATE_DATA)
61 , m_buf(TOKENIZER_INITIAL_BUFFER_SIZE)
62 {
63 }
64
~Tokenizer(void)65 Tokenizer::~Tokenizer(void)
66 {
67 }
68
clear(void)69 void Tokenizer::clear(void)
70 {
71 m_curToken = TOKEN_INCOMPLETE;
72 m_curTokenLen = 0;
73 m_state = STATE_DATA;
74 m_buf.clear();
75 }
76
error(const std::string & what)77 void Tokenizer::error(const std::string &what)
78 {
79 throw ParseError(what);
80 }
81
feed(const uint8_t * bytes,int numBytes)82 void Tokenizer::feed(const uint8_t *bytes, int numBytes)
83 {
84 // Grow buffer if necessary.
85 if (m_buf.getNumFree() < numBytes)
86 {
87 m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements() + numBytes));
88 }
89
90 // Append to front.
91 m_buf.pushFront(bytes, numBytes);
92
93 // If we haven't parsed complete token, re-try after data feed.
94 if (m_curToken == TOKEN_INCOMPLETE)
95 advance();
96 }
97
getChar(int offset) const98 int Tokenizer::getChar(int offset) const
99 {
100 DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements()));
101
102 if (offset < m_buf.getNumElements())
103 return m_buf.peekBack(offset);
104 else
105 return END_OF_BUFFER;
106 }
107
advance(void)108 void Tokenizer::advance(void)
109 {
110 if (m_curToken != TOKEN_INCOMPLETE)
111 {
112 // Parser should not try to advance beyond end of string.
113 DE_ASSERT(m_curToken != TOKEN_END_OF_STRING);
114
115 // If current token is tag end, change state to data.
116 if (m_curToken == TOKEN_TAG_END || m_curToken == TOKEN_EMPTY_ELEMENT_END ||
117 m_curToken == TOKEN_PROCESSING_INSTRUCTION_END || m_curToken == TOKEN_COMMENT || m_curToken == TOKEN_ENTITY)
118 m_state = STATE_DATA;
119
120 // Advance buffer by length of last token.
121 m_buf.popBack(m_curTokenLen);
122
123 // Reset state.
124 m_curToken = TOKEN_INCOMPLETE;
125 m_curTokenLen = 0;
126
127 // If we hit end of string here, report it as end of string.
128 if (getChar(0) == END_OF_STRING)
129 {
130 m_curToken = TOKEN_END_OF_STRING;
131 m_curTokenLen = 1;
132 return;
133 }
134 }
135
136 int curChar = getChar(m_curTokenLen);
137
138 for (;;)
139 {
140 if (m_state == STATE_DATA)
141 {
142 // Advance until we hit end of buffer or tag start and treat that as data token.
143 if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&')
144 {
145 if (curChar == '<')
146 m_state = STATE_TAG;
147 else if (curChar == '&')
148 m_state = STATE_ENTITY;
149
150 if (m_curTokenLen > 0)
151 {
152 // Report data token.
153 m_curToken = TOKEN_DATA;
154 return;
155 }
156 else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER)
157 {
158 // Just return incomplete token, no data parsed.
159 return;
160 }
161 else
162 {
163 DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY);
164 continue;
165 }
166 }
167 }
168 else
169 {
170 // Eat all whitespace if present.
171 if (m_curTokenLen == 0)
172 {
173 while (isWhitespaceChar(curChar))
174 {
175 m_buf.popBack();
176 curChar = getChar(0);
177 }
178 }
179
180 // Handle end of string / buffer.
181 if (curChar == END_OF_STRING)
182 error("Unexpected end of string");
183 else if (curChar == (int)END_OF_BUFFER)
184 {
185 DE_ASSERT(m_curToken == TOKEN_INCOMPLETE);
186 return;
187 }
188
189 if (m_curTokenLen == 0)
190 {
191 // Expect start of identifier, value or special tag token.
192 if (curChar == '\'' || curChar == '"')
193 m_state = STATE_VALUE;
194 else if (isIdentifierStartChar(curChar))
195 m_state = STATE_IDENTIFIER;
196 else if (curChar == '<' || curChar == '?' || curChar == '/')
197 m_state = STATE_TAG;
198 else if (curChar == '&')
199 DE_ASSERT(m_state == STATE_ENTITY);
200 else if (curChar == '=')
201 {
202 m_curToken = TOKEN_EQUAL;
203 m_curTokenLen = 1;
204 return;
205 }
206 else if (curChar == '>')
207 {
208 m_curToken = TOKEN_TAG_END;
209 m_curTokenLen = 1;
210 return;
211 }
212 else
213 error("Unexpected character");
214 }
215 else if (m_state == STATE_IDENTIFIER)
216 {
217 if (!isIdentifierChar(curChar))
218 {
219 m_curToken = TOKEN_IDENTIFIER;
220 return;
221 }
222 }
223 else if (m_state == STATE_VALUE)
224 {
225 // \todo [2012-06-07 pyry] Escapes.
226 if (curChar == '\'' || curChar == '"')
227 {
228 // \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)?
229 if (curChar != getChar(0))
230 error("Mismatched quote");
231 m_curToken = TOKEN_STRING;
232 m_curTokenLen += 1;
233 return;
234 }
235 }
236 else if (m_state == STATE_COMMENT)
237 {
238 DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state.
239
240 if (m_curTokenLen <= 3)
241 {
242 if (curChar != '-')
243 error("Invalid comment start");
244 }
245 else
246 {
247 int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen - 2) : 0;
248 int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen - 1) : 0;
249
250 if (prev2 == '-' && prev1 == '-')
251 {
252 if (curChar != '>')
253 error("Invalid comment end");
254 m_curToken = TOKEN_COMMENT;
255 m_curTokenLen += 1;
256 return;
257 }
258 }
259 }
260 else if (m_state == STATE_ENTITY)
261 {
262 if (m_curTokenLen >= 1)
263 {
264 if (curChar == ';')
265 {
266 m_curToken = TOKEN_ENTITY;
267 m_curTokenLen += 1;
268 return;
269 }
270 else if (!de::inRange<int>(curChar, '0', '9') && !de::inRange<int>(curChar, 'a', 'z') &&
271 !de::inRange<int>(curChar, 'A', 'Z'))
272 error("Invalid entity");
273 }
274 }
275 else
276 {
277 // Special tokens are at most 2 characters.
278 DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1);
279
280 int prevChar = getChar(m_curTokenLen - 1);
281
282 if (prevChar == '<')
283 {
284 // Tag start.
285 if (curChar == '/')
286 {
287 m_curToken = TOKEN_END_TAG_START;
288 m_curTokenLen = 2;
289 return;
290 }
291 else if (curChar == '?')
292 {
293 m_curToken = TOKEN_PROCESSING_INSTRUCTION_START;
294 m_curTokenLen = 2;
295 return;
296 }
297 else if (curChar == '!')
298 {
299 m_state = STATE_COMMENT;
300 }
301 else
302 {
303 m_curToken = TOKEN_TAG_START;
304 m_curTokenLen = 1;
305 return;
306 }
307 }
308 else if (prevChar == '?')
309 {
310 if (curChar != '>')
311 error("Invalid processing instruction end");
312 m_curToken = TOKEN_PROCESSING_INSTRUCTION_END;
313 m_curTokenLen = 2;
314 return;
315 }
316 else if (prevChar == '/')
317 {
318 if (curChar != '>')
319 error("Invalid empty element end");
320 m_curToken = TOKEN_EMPTY_ELEMENT_END;
321 m_curTokenLen = 2;
322 return;
323 }
324 else
325 error("Could not parse special token");
326 }
327 }
328
329 m_curTokenLen += 1;
330 curChar = getChar(m_curTokenLen);
331 }
332 }
333
getString(std::string & dst) const334 void Tokenizer::getString(std::string &dst) const
335 {
336 DE_ASSERT(m_curToken == TOKEN_STRING);
337 dst.resize(m_curTokenLen - 2);
338 for (int ndx = 0; ndx < m_curTokenLen - 2; ndx++)
339 dst[ndx] = m_buf.peekBack(ndx + 1);
340 }
341
Parser(void)342 Parser::Parser(void) : m_element(ELEMENT_INCOMPLETE), m_state(STATE_DATA)
343 {
344 }
345
~Parser(void)346 Parser::~Parser(void)
347 {
348 }
349
clear(void)350 void Parser::clear(void)
351 {
352 m_tokenizer.clear();
353 m_elementName.clear();
354 m_attributes.clear();
355 m_attribName.clear();
356 m_entityValue.clear();
357
358 m_element = ELEMENT_INCOMPLETE;
359 m_state = STATE_DATA;
360 }
361
error(const std::string & what)362 void Parser::error(const std::string &what)
363 {
364 throw ParseError(what);
365 }
366
feed(const uint8_t * bytes,int numBytes)367 void Parser::feed(const uint8_t *bytes, int numBytes)
368 {
369 m_tokenizer.feed(bytes, numBytes);
370
371 if (m_element == ELEMENT_INCOMPLETE)
372 advance();
373 }
374
advance(void)375 void Parser::advance(void)
376 {
377 if (m_element == ELEMENT_START)
378 m_attributes.clear();
379
380 // \note No token is advanced when element end is reported.
381 if (m_state == STATE_YIELD_EMPTY_ELEMENT_END)
382 {
383 DE_ASSERT(m_element == ELEMENT_START);
384 m_element = ELEMENT_END;
385 m_state = STATE_DATA;
386 return;
387 }
388
389 if (m_element != ELEMENT_INCOMPLETE)
390 {
391 m_tokenizer.advance();
392 m_element = ELEMENT_INCOMPLETE;
393 }
394
395 for (;;)
396 {
397 Token curToken = m_tokenizer.getToken();
398
399 // Skip comments.
400 while (curToken == TOKEN_COMMENT)
401 {
402 m_tokenizer.advance();
403 curToken = m_tokenizer.getToken();
404 }
405
406 if (curToken == TOKEN_INCOMPLETE)
407 {
408 DE_ASSERT(m_element == ELEMENT_INCOMPLETE);
409 return;
410 }
411
412 switch (m_state)
413 {
414 case STATE_ENTITY:
415 m_state = STATE_DATA;
416 // Fall-through
417
418 case STATE_DATA:
419 switch (curToken)
420 {
421 case TOKEN_DATA:
422 m_element = ELEMENT_DATA;
423 return;
424
425 case TOKEN_END_OF_STRING:
426 m_element = ELEMENT_END_OF_STRING;
427 return;
428
429 case TOKEN_TAG_START:
430 m_state = STATE_START_TAG_OPEN;
431 break;
432
433 case TOKEN_END_TAG_START:
434 m_state = STATE_END_TAG_OPEN;
435 break;
436
437 case TOKEN_PROCESSING_INSTRUCTION_START:
438 m_state = STATE_IN_PROCESSING_INSTRUCTION;
439 break;
440
441 case TOKEN_ENTITY:
442 m_state = STATE_ENTITY;
443 m_element = ELEMENT_DATA;
444 parseEntityValue();
445 return;
446
447 default:
448 error("Unexpected token");
449 }
450 break;
451
452 case STATE_IN_PROCESSING_INSTRUCTION:
453 if (curToken == TOKEN_PROCESSING_INSTRUCTION_END)
454 m_state = STATE_DATA;
455 else if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING)
456 error("Unexpected token in processing instruction");
457 break;
458
459 case STATE_START_TAG_OPEN:
460 if (curToken != TOKEN_IDENTIFIER)
461 error("Expected identifier");
462 m_tokenizer.getTokenStr(m_elementName);
463 m_state = STATE_ATTRIBUTE_LIST;
464 break;
465
466 case STATE_END_TAG_OPEN:
467 if (curToken != TOKEN_IDENTIFIER)
468 error("Expected identifier");
469 m_tokenizer.getTokenStr(m_elementName);
470 m_state = STATE_EXPECTING_END_TAG_CLOSE;
471 break;
472
473 case STATE_EXPECTING_END_TAG_CLOSE:
474 if (curToken != TOKEN_TAG_END)
475 error("Expected tag end");
476 m_state = STATE_DATA;
477 m_element = ELEMENT_END;
478 return;
479
480 case STATE_ATTRIBUTE_LIST:
481 if (curToken == TOKEN_IDENTIFIER)
482 {
483 m_tokenizer.getTokenStr(m_attribName);
484 m_state = STATE_EXPECTING_ATTRIBUTE_EQ;
485 }
486 else if (curToken == TOKEN_EMPTY_ELEMENT_END)
487 {
488 m_state = STATE_YIELD_EMPTY_ELEMENT_END;
489 m_element = ELEMENT_START;
490 return;
491 }
492 else if (curToken == TOKEN_TAG_END)
493 {
494 m_state = STATE_DATA;
495 m_element = ELEMENT_START;
496 return;
497 }
498 else
499 error("Unexpected token");
500 break;
501
502 case STATE_EXPECTING_ATTRIBUTE_EQ:
503 if (curToken != TOKEN_EQUAL)
504 error("Expected '='");
505 m_state = STATE_EXPECTING_ATTRIBUTE_VALUE;
506 break;
507
508 case STATE_EXPECTING_ATTRIBUTE_VALUE:
509 if (curToken != TOKEN_STRING)
510 error("Expected value");
511 if (hasAttribute(m_attribName.c_str()))
512 error("Duplicate attribute");
513
514 m_tokenizer.getString(m_attributes[m_attribName]);
515 m_state = STATE_ATTRIBUTE_LIST;
516 break;
517
518 default:
519 DE_ASSERT(false);
520 }
521
522 m_tokenizer.advance();
523 }
524 }
525
// Maps the text of a predefined XML entity reference to the character it stands for.
//
// \param entity Full entity token text, including the leading '&' and the trailing ';'
//               (the span the tokenizer reports for an entity token), e.g. "&amp;".
// \return The decoded character, or 0 if the entity is not one of the five predefined
//         ones (&lt; &gt; &amp; &apos; &quot;).
static char getEntityValue(const std::string &entity)
{
    static const struct
    {
        const char *name;
        char value;
    } s_entities[] = {
        {"&lt;", '<'}, {"&gt;", '>'}, {"&amp;", '&'}, {"&apos;", '\''}, {"&quot;", '"'},
    };

    for (const auto &candidate : s_entities)
    {
        if (entity == candidate.name)
            return candidate.value;
    }

    // Unknown entity; callers treat 0 as "no such entity".
    return 0;
}
544
parseEntityValue(void)545 void Parser::parseEntityValue(void)
546 {
547 DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY);
548
549 std::string entity;
550 m_tokenizer.getTokenStr(entity);
551
552 const char value = getEntityValue(entity);
553 if (value == 0)
554 error("Invalid entity '" + entity + "'");
555
556 m_entityValue.resize(1);
557 m_entityValue[0] = value;
558 }
559
560 } // namespace xml
561 } // namespace xe
562