1 /**
2 * Test the UTF-8 decoding routines
3 *
4 * author: Daniel Veillard
5 * copy: see Copyright for the status of this software.
6 */
7
8 #define XML_DEPRECATED
9 #define XML_DEPRECATED_MEMBER
10
11 #include <stdio.h>
12 #include <string.h>
13 #include <libxml/tree.h>
14 #include <libxml/parser.h>
15 #include <libxml/parserInternals.h>
16
/*
 * Code of the first error reported to errorHandler since this variable was
 * last reset to 0 by a test.
 */
int lastError;
18
errorHandler(void * unused,const xmlError * err)19 static void errorHandler(void *unused, const xmlError *err) {
20 if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
21 lastError = err->code;
22 }
23 }
24
/*
 * Document templates used by testDocumentRanges. The four 'X' placeholder
 * bytes (offset 5 in document1, offset 10 in document2) are overwritten with
 * the bytes under test: document1 holds them in element content, document2
 * in an attribute value.
 */
char document1[100] = "<doc>XXXX</doc>";
char document2[100] = "<doc foo='XXXX'/>";
27
/*
 * Inject every possible single byte value at data[0] (which points inside
 * document) and parse the document with xmlReadMemory.
 *
 * ctxt:     parser context, reset before each parse
 * document: start of the document buffer
 * len:      number of bytes of document to parse
 * data:     location inside document where the byte is written
 * forbid1, forbid2: byte values that must be rejected at this location
 *                   (-1 for none)
 *
 * Returns 0 if all 256 values behave as expected, 1 on the first mismatch.
 * The parsed document is released on every path, including failures.
 */
static int testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
                  int len, char *data, int forbid1, int forbid2) {
    int i;

    for (i = 0; i <= 0xFF; i++) {
        xmlDocPtr res;
        int failed = 0;

        lastError = 0;
        xmlCtxtReset(ctxt);

        data[0] = (char) i;

        res = xmlReadMemory(document, len, "test", NULL, 0);

        if ((i == forbid1) || (i == forbid2)) {
            /* explicitly forbidden byte: must raise an error, no document */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X: %c\n",
                        i, i);
                failed = 1;
            }
        } else if ((i == '<') || (i == '&')) {
            /* markup characters are illegal as raw data here */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
                failed = 1;
            }
        } else if (((i < 0x20) || (i >= 0x80)) &&
                   (i != 0x9) && (i != 0xA) && (i != 0xD)) {
            /* control characters and lone high bytes */
            if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", i);
                failed = 1;
            }
        } else if (res == NULL) {
            /* everything else is a valid character and must parse */
            fprintf(stderr,
                "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
            failed = 1;
        }

        /* free before a possible early return so failures do not leak */
        if (res != NULL)
            xmlFreeDoc(res);
        if (failed)
            return(1);
    }
    return(0);
}
75
/*
 * Inject every two-byte combination starting with a lead byte in
 * 0x80..0xFF at data[0..1] (which point inside document) and parse the
 * document with xmlReadMemory.
 *
 * ctxt:     parser context, reset before each parse
 * document: start of the document buffer
 * len:      number of bytes of document to parse
 * data:     location inside document where the two bytes are written
 *
 * Returns 0 if every combination behaves as expected, 1 on the first
 * mismatch. The parsed document is released on every path, including
 * failures.
 */
static int testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
                  int len, char *data) {
    int i, j;

    for (i = 0x80; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            xmlDocPtr res;
            int failed = 0;

            lastError = 0;
            xmlCtxtReset(ctxt);

            data[0] = (char) i;
            data[1] = (char) j;

            res = xmlReadMemory(document, len, "test", NULL, 0);

            /* if first bit of first char is set, then second bit must too */
            if ((i & 0x80) && ((i & 0x40) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if first bit of first char is set, then second char first
             * bits must be 10
             */
            else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if using a 2 byte encoding then the value must be greater
             * than 0x80, i.e. one of bits 5 to 1 of i must be set
             */
            else if ((i & 0x80) && ((i & 0x1E) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if third bit of first char is set, then the sequence would need
             * at least 3 bytes, but we give only 2 !
             */
            else if ((i & 0xE0) == 0xE0) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * We should see no error in remaining cases
             */
            else if ((lastError != 0) || (res == NULL)) {
                fprintf(stderr,
                    "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
                failed = 1;
            }

            /* free before a possible early return so failures do not leak */
            if (res != NULL)
                xmlFreeDoc(res);
            if (failed)
                return(1);
        }
    }
    return(0);
}
154
155 /**
156 * testDocumentRanges:
157 *
158 * Test the correct UTF8 character parsing in context of XML documents
159 * Those are in-context injection tests checking the parser behaviour on
160 * edge case values at different point in content, beginning and end of
161 * CDATA in text or in attribute values.
162 */
163
testDocumentRanges(void)164 static int testDocumentRanges(void) {
165 xmlParserCtxtPtr ctxt;
166 char *data;
167 int test_ret = 0;
168
169 /*
170 * Set up a parsing context using the first document as
171 * the current input source.
172 */
173 ctxt = xmlNewParserCtxt();
174 if (ctxt == NULL) {
175 fprintf(stderr, "Failed to allocate parser context\n");
176 return(1);
177 }
178
179 printf("testing 1 byte char in document: 1");
180 fflush(stdout);
181 data = &document1[5];
182 data[0] = ' ';
183 data[1] = ' ';
184 data[2] = ' ';
185 data[3] = ' ';
186 /* test 1 byte injection at beginning of area */
187 test_ret += testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
188 data, -1, -1);
189 printf(" 2");
190 fflush(stdout);
191 data[0] = ' ';
192 data[1] = ' ';
193 data[2] = ' ';
194 data[3] = ' ';
195 /* test 1 byte injection at end of area */
196 test_ret += testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
197 data + 3, -1, -1);
198
199 printf(" 3");
200 fflush(stdout);
201 data = &document2[10];
202 data[0] = ' ';
203 data[1] = ' ';
204 data[2] = ' ';
205 data[3] = ' ';
206 /* test 1 byte injection at beginning of area */
207 test_ret += testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
208 data, '\'', -1);
209 printf(" 4");
210 fflush(stdout);
211 data[0] = ' ';
212 data[1] = ' ';
213 data[2] = ' ';
214 data[3] = ' ';
215 /* test 1 byte injection at end of area */
216 test_ret += testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
217 data + 3, '\'', -1);
218 printf(" done\n");
219
220 printf("testing 2 byte char in document: 1");
221 fflush(stdout);
222 data = &document1[5];
223 data[0] = ' ';
224 data[1] = ' ';
225 data[2] = ' ';
226 data[3] = ' ';
227 /* test 2 byte injection at beginning of area */
228 test_ret += testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
229 data);
230 printf(" 2");
231 fflush(stdout);
232 data[0] = ' ';
233 data[1] = ' ';
234 data[2] = ' ';
235 data[3] = ' ';
236 /* test 2 byte injection at end of area */
237 test_ret += testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
238 data + 2);
239
240 printf(" 3");
241 fflush(stdout);
242 data = &document2[10];
243 data[0] = ' ';
244 data[1] = ' ';
245 data[2] = ' ';
246 data[3] = ' ';
247 /* test 2 byte injection at beginning of area */
248 test_ret += testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
249 data);
250 printf(" 4");
251 fflush(stdout);
252 data[0] = ' ';
253 data[1] = ' ';
254 data[2] = ' ';
255 data[3] = ' ';
256 /* test 2 byte injection at end of area */
257 test_ret += testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
258 data + 2);
259 printf(" done\n");
260
261 xmlFreeParserCtxt(ctxt);
262 return(test_ret);
263 }
264
265 static int
testCurrentChar(xmlParserCtxtPtr ctxt,int * len)266 testCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
267 const xmlChar *oldcur;
268 int c, err, len2;
269
270 lastError = 0;
271 c = xmlCurrentChar(ctxt, len);
272 ctxt->input->flags = 0;
273 err = lastError;
274
275 oldcur = ctxt->input->cur;
276 lastError = 0;
277 xmlNextChar(ctxt);
278 ctxt->input->flags = 0;
279 len2 = ctxt->input->cur - oldcur;
280 ctxt->input->cur = oldcur;
281
282 if ((*ctxt->input->cur != 0) && (err != lastError)) {
283 fprintf(stderr, "xmlCurrentChar and xmlNextChar report different "
284 "errors: %d %d\n", err, lastError);
285 return(-1);
286 }
287
288 if ((err == 0) && (*len != len2)) {
289 fprintf(stderr, "xmlCurrentChar and xmlNextChar report different "
290 "lengths: %d %d\n", *len, len2);
291 return(-1);
292 }
293
294 lastError = err;
295
296 return(c);
297 }
298
/*
 * Check decoding of every single byte value followed by zero bytes.
 * Bytes >= 0x80 must raise XML_ERR_INVALID_ENCODING, 0xD must be
 * returned as 0xA, and every other byte must decode to itself with
 * length 1.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte1(xmlParserCtxtPtr ctxt) {
    char *data = (char *) ctxt->input->cur;
    int len, c;
    int byte;

    memset(data + 1, 0, 3);

    for (byte = 0; byte <= 0xFF; byte++) {
        data[0] = (char) byte;
        ctxt->nbErrors = 0;

        c = testCurrentChar(ctxt, &len);
        /*
         * NOTE(review): a negative result means testCurrentChar found an
         * inconsistency and printed it; the byte is skipped here rather
         * than failing the test - confirm this is intended.
         */
        if (c < 0)
            continue;

        if (byte >= 0x80) {
            /* we must see an error there */
            if (lastError == XML_ERR_INVALID_ENCODING)
                continue;
            fprintf(stderr,
                "Failed to detect invalid char for Byte 0x%02X\n", byte);
            return(1);
        }

        if (byte == 0xD) {
            if ((c == 0xA) && (len == 1))
                continue;
            fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", byte);
            return(1);
        }

        if ((c == byte) && (len == 1))
            continue;
        fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", byte);
        return(1);
    }
    return(0);
}
333
/*
 * Check decoding of every two-byte sequence whose lead byte is in
 * 0x80..0xFF, followed by zero bytes. Malformed or overlong sequences
 * must raise XML_ERR_INVALID_ENCODING; well-formed ones must decode with
 * length 2 to the expected code point.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte2(xmlParserCtxtPtr ctxt) {
    char *data = (char *) ctxt->input->cur;
    int lead, trail;
    int len, c;

    data[2] = 0;
    data[3] = 0;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            int expected = (trail & 0x3F) + ((lead & 0x1F) << 6);

            data[0] = (char) lead;
            data[1] = (char) trail;
            ctxt->nbErrors = 0;

            c = testCurrentChar(ctxt, &len);
            if (c < 0)
                continue;

            /* a lead byte with the top bit set needs the second bit too */
            if ((lead & 0x80) && ((lead & 0x40) == 0)) {
                if (lastError == XML_ERR_INVALID_ENCODING)
                    continue;
                fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
                return(1);
            }

            /* the continuation byte must start with the bits 10 */
            if ((lead & 0x80) && ((trail & 0xC0) != 0x80)) {
                if (lastError == XML_ERR_INVALID_ENCODING)
                    continue;
                fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                        lead, trail, c);
                return(1);
            }

            /*
             * a 2 byte encoding must carry a value of at least 0x80,
             * i.e. one of bits 5 to 1 of the lead byte must be set
             */
            if ((lead & 0x80) && ((lead & 0x1E) == 0)) {
                if (lastError == XML_ERR_INVALID_ENCODING)
                    continue;
                fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                        lead, trail, c);
                return(1);
            }

            /*
             * with the third bit set the sequence would need at least
             * 3 bytes, but only 2 are supplied
             */
            if ((lead & 0xE0) == 0xE0) {
                if (lastError == XML_ERR_INVALID_ENCODING)
                    continue;
                fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                        lead, trail);
                return(1);
            }

            /* every remaining sequence is valid: no error, length 2 */
            if ((lastError != 0) || (len != 2)) {
                fprintf(stderr,
                    "Failed to parse char for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
                return(1);
            }

            /* and the decoded value must match */
            if (c != expected) {
                fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
                        lead, trail, expected, c);
                return(1);
            }
        }
    }
    return(0);
}
422
/*
 * Check decoding of three-byte sequences: every lead byte in 0xE0..0xFF,
 * every second byte, and a sample of third bytes (lows[]), followed by a
 * zero byte. Malformed, overlong and surrogate encodings must raise
 * XML_ERR_INVALID_ENCODING; valid ones must decode with length 3 to the
 * expected code point.
 *
 * Diagnostics print the integer byte values (never a plain char, which
 * would be sign-extended on platforms where char is signed).
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte3(xmlParserCtxtPtr ctxt) {
    int i, j, k, K;
    int len, c;
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[3] = 0;
    for (i = 0xE0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                data[0] = (char) i;
                data[1] = (char) j;
                K = lows[k];
                data[2] = (char) K;
                value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
                ctxt->nbErrors = 0;

                c = testCurrentChar(ctxt, &len);
                if (c < 0)
                    continue;

                /*
                 * if fourth bit of first char is set, then the sequence
                 * would need at least 4 bytes, but we give only 3 !
                 */
                if ((i & 0xF0) == 0xF0) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, (unsigned char) data[3]);
                        return(1);
                    }
                }

                /*
                 * The second and the third bytes must start with 10
                 */
                else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return(1);
                    }
                }

                /*
                 * if using a 3 byte encoding then the value must be greater
                 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
                 * the 6th bit of data[1] must be set
                 */
                else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return(1);
                    }
                }

                /*
                 * There are values that are not allowed in UTF-8
                 */
                else if ((value > 0xD7FF) && (value < 0xE000)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
        "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
                                value, i, j, K);
                        return(1);
                    }
                }

                /*
                 * We should see no error in remaining cases
                 */
                else if ((lastError != 0) || (len != 3)) {
                    fprintf(stderr,
                        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
                            i, j, K);
                    return(1);
                }

                /*
                 * Finally check the value is right
                 */
                else if (c != value) {
                    fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                            i, j, K, value, c);
                    return(1);
                }
            }
        }
    }
    return(0);
}
520
/*
 * Check decoding of four-byte sequences: every lead byte in 0xF0..0xFF,
 * every second byte, and a sample of third and fourth bytes (lows[]),
 * followed by a zero byte. Malformed, overlong, surrogate and
 * out-of-range (> 0x10FFFF) encodings must raise
 * XML_ERR_INVALID_ENCODING; valid ones must decode with length 4 to the
 * expected code point.
 *
 * Diagnostics print all four integer byte values (never a plain char,
 * which would be sign-extended on platforms where char is signed).
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte4(xmlParserCtxtPtr ctxt) {
    int i, j, k, K, l, L;
    int len, c;
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[4] = 0;
    for (i = 0xF0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                for (l = 0; l < 6; l++) {
                    data[0] = (char) i;
                    data[1] = (char) j;
                    K = lows[k];
                    data[2] = (char) K;
                    L = lows[l];
                    data[3] = (char) L;
                    value = (L & 0x3F) + ((K & 0x3F) << 6) +
                            ((j & 0x3F) << 12) + ((i & 0x7) << 18);
                    ctxt->nbErrors = 0;

                    c = testCurrentChar(ctxt, &len);
                    if (c < 0)
                        continue;

                    /*
                     * if fifth bit of first char is set, then the sequence
                     * would need at least 5 bytes, but we give only 4 !
                     */
                    if ((i & 0xF8) == 0xF8) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * The second, third and fourth bytes must start with 10
                     */
                    else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
                             ((L & 0xC0) != 0x80)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * if using a 4 byte encoding then the value must be
                     * greater than 0x10000, i.e. one of bits 3 to 0 of i
                     * must be set or the 6th or 5th bit of j must be set
                     */
                    else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * There are values that are not allowed in UTF-8
                     */
                    else if (((value > 0xD7FF) && (value < 0xE000)) ||
                             (value > 0x10FFFF)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    value, i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * We should see no error in remaining cases
                     */
                    else if ((lastError != 0) || (len != 4)) {
                        fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, L);
                        return(1);
                    }

                    /*
                     * Finally check the value is right
                     */
                    else if (c != value) {
                        fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                                i, j, K, L, value, c);
                        return(1);
                    }
                }
            }
        }
    }
    return(0);
}
625
626 /**
627 * testCharRanges:
628 *
629 * Test the correct UTF8 character parsing in isolation i.e.
630 * not when parsing a full document, this is less expensive and we can
631 * cover the full range of UTF-8 chars accepted by XML-1.0
632 */
633
testCharRanges(void)634 static int testCharRanges(void) {
635 char data[5];
636 xmlParserCtxtPtr ctxt;
637 xmlParserInputBufferPtr buf;
638 xmlParserInputPtr input;
639 int test_ret = 0;
640
641 memset(data, 0, 5);
642
643 /*
644 * Set up a parsing context using the above data buffer as
645 * the current input source.
646 */
647 ctxt = xmlNewParserCtxt();
648 if (ctxt == NULL) {
649 fprintf(stderr, "Failed to allocate parser context\n");
650 return(1);
651 }
652 buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
653 XML_CHAR_ENCODING_NONE);
654 if (buf == NULL) {
655 fprintf(stderr, "Failed to allocate input buffer\n");
656 test_ret = 1;
657 goto error;
658 }
659 input = xmlNewInputStream(ctxt);
660 if (input == NULL) {
661 xmlFreeParserInputBuffer(buf);
662 test_ret = 1;
663 goto error;
664 }
665 input->filename = NULL;
666 input->buf = buf;
667 input->cur =
668 input->base = xmlBufContent(input->buf->buffer);
669 input->end = input->base + 4;
670 inputPush(ctxt, input);
671
672 printf("testing char range: 1");
673 fflush(stdout);
674 test_ret += testCharRangeByte1(ctxt);
675 printf(" 2");
676 fflush(stdout);
677 test_ret += testCharRangeByte2(ctxt);
678 printf(" 3");
679 fflush(stdout);
680 test_ret += testCharRangeByte3(ctxt);
681 printf(" 4");
682 fflush(stdout);
683 test_ret += testCharRangeByte4(ctxt);
684 printf(" done\n");
685 fflush(stdout);
686
687 error:
688 xmlFreeParserCtxt(ctxt);
689 return(test_ret);
690 }
691
692 static int
testUserEncoding(void)693 testUserEncoding(void) {
694 /*
695 * Create a document encoded as UTF-16LE with an ISO-8859-1 encoding
696 * declaration, then parse it with xmlReadMemory and the encoding
697 * argument set to UTF-16LE.
698 */
699 xmlDocPtr doc = NULL;
700 const char *start = "<?xml version='1.0' encoding='ISO-8859-1'?><d>";
701 const char *end = "</d>";
702 char *buf = NULL;
703 xmlChar *text;
704 int startSize = strlen(start);
705 int textSize = 100000; /* Make sure to exceed internal buffer sizes. */
706 int endSize = strlen(end);
707 int totalSize = startSize + textSize + endSize;
708 int k = 0;
709 int i;
710 int ret = 1;
711
712 buf = xmlMalloc(2 * totalSize);
713 for (i = 0; start[i] != 0; i++) {
714 buf[k++] = start[i];
715 buf[k++] = 0;
716 }
717 for (i = 0; i < textSize; i++) {
718 buf[k++] = 'x';
719 buf[k++] = 0;
720 }
721 for (i = 0; end[i] != 0; i++) {
722 buf[k++] = end[i];
723 buf[k++] = 0;
724 }
725
726 doc = xmlReadMemory(buf, 2 * totalSize, NULL, "UTF-16LE", 0);
727 if (doc == NULL) {
728 fprintf(stderr, "failed to parse document\n");
729 goto error;
730 }
731
732 text = doc->children->children->content;
733 for (i = 0; i < textSize; i++) {
734 if (text[i] != 'x') {
735 fprintf(stderr, "text node has wrong content at offset %d\n", k);
736 goto error;
737 }
738 }
739
740 ret = 0;
741
742 error:
743 xmlFreeDoc(doc);
744 xmlFree(buf);
745
746 return ret;
747 }
748
749 #if defined(LIBXML_PUSH_ENABLED) && defined(LIBXML_OUTPUT_ENABLED)
750
751 static char *
convert(xmlCharEncodingHandlerPtr handler,const char * utf8,int size,int * outSize)752 convert(xmlCharEncodingHandlerPtr handler, const char *utf8, int size,
753 int *outSize) {
754 xmlBufferPtr in, out;
755 char *ret;
756
757 in = xmlBufferCreate();
758 xmlBufferAdd(in, BAD_CAST utf8, size);
759 out = xmlBufferCreate();
760 xmlCharEncOutFunc(handler, out, in);
761
762 if (outSize)
763 *outSize = out->use;
764 ret = (char *) xmlBufferDetach(out);
765
766 xmlBufferFree(out);
767 xmlBufferFree(in);
768 return(ret);
769 }
770
771 static int
testUserEncodingPush(void)772 testUserEncodingPush(void) {
773 xmlCharEncodingHandlerPtr handler;
774 xmlParserCtxtPtr ctxt;
775 xmlDocPtr doc;
776 char buf[] =
777 "\xEF\xBB\xBF"
778 "<?xml version='1.0' encoding='ISO-8859-1'?>\n"
779 "<d>text</d>\n";
780 char *utf16;
781 int utf16Size;
782 int ret = 1;
783
784 handler = xmlGetCharEncodingHandler(XML_CHAR_ENCODING_UTF16LE);
785 utf16 = convert(handler, buf, sizeof(buf) - 1, &utf16Size);
786 ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
787 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF16LE);
788 xmlParseChunk(ctxt, utf16, utf16Size, 0);
789 xmlParseChunk(ctxt, NULL, 0, 1);
790 doc = ctxt->myDoc;
791
792 if ((doc != NULL) &&
793 (doc->children != NULL) &&
794 (doc->children->children != NULL) &&
795 (xmlStrcmp(doc->children->children->content, BAD_CAST "text") == 0))
796 ret = 0;
797
798 xmlFreeDoc(doc);
799 xmlFreeParserCtxt(ctxt);
800 xmlFree(utf16);
801
802 return(ret);
803 }
804
805 static int
testUTF8Chunks(void)806 testUTF8Chunks(void) {
807 xmlParserCtxtPtr ctxt;
808 xmlChar *out;
809 int outSize;
810 char *buf;
811 int i;
812 int ret = 0;
813
814 ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
815
816 xmlParseChunk(ctxt, "<d>", 3, 0);
817 xmlParseChunk(ctxt, "\xF0", 1, 0);
818 xmlParseChunk(ctxt, "\x9F", 1, 0);
819 xmlParseChunk(ctxt, "\x98", 1, 0);
820 xmlParseChunk(ctxt, "\x8A", 1, 0);
821 xmlParseChunk(ctxt, "</d>", 4, 1);
822
823 xmlDocDumpMemory(ctxt->myDoc, &out, &outSize);
824 if (strcmp((char *) out,
825 "<?xml version=\"1.0\"?>\n<d>😊</d>\n") != 0) {
826 fprintf(stderr, "failed UTF-8 chunk test 1\n");
827 ret += 1;
828 }
829
830 xmlFree(out);
831 xmlFreeDoc(ctxt->myDoc);
832 xmlFreeParserCtxt(ctxt);
833
834 ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
835
836 xmlParseChunk(ctxt, "<d>", 3, 0);
837
838 /*
839 * Create a chunk longer than XML_PARSER_BIG_BUFFER_SIZE (300) ending
840 * with an incomplete UTF-8 sequence.
841 */
842 buf = xmlMalloc(1000 * 2 + 1);
843 for (i = 0; i < 2000; i += 2)
844 memcpy(buf + i, "\xCE\xB1", 2);
845 buf[i] = '\xCE';
846 xmlParseChunk(ctxt, buf, 2001, 0);
847 xmlFree(buf);
848
849 xmlParseChunk(ctxt, "\xB1</d>", 4, 0);
850 xmlParseChunk(ctxt, NULL, 0, 0);
851
852 xmlDocDumpMemory(ctxt->myDoc, &out, &outSize);
853 if (strncmp((char *) out, "<?xml version=\"1.0\"?>\n<d>", 25) != 0) {
854 fprintf(stderr, "failed UTF-8 chunk test 2-1\n");
855 ret += 1;
856 goto error;
857 }
858 for (i = 25; i < 25 + 1001 * 7; i += 7) {
859 if (memcmp(out + i, "α", 7) != 0) {
860 fprintf(stderr, "failed UTF-8 chunk test 2-2 %d\n", i);
861 ret += 1;
862 goto error;
863 }
864 }
865 if (strcmp((char *) out + i, "</d>\n") != 0) {
866 fprintf(stderr, "failed UTF-8 chunk test 2-3\n");
867 ret += 1;
868 goto error;
869 }
870
871 error:
872 xmlFree(out);
873 xmlFreeDoc(ctxt->myDoc);
874 xmlFreeParserCtxt(ctxt);
875
876 return(ret);
877 return(0);
878 }
879
880 #endif
881
882 static void
bufDump(const char * prefix,const xmlChar * content,int len)883 bufDump(const char *prefix, const xmlChar *content, int len) {
884 int i;
885
886 fprintf(stderr, "%s", prefix);
887 for (i = 0; i < len; i++) {
888 fprintf(stderr, " %02X", content[i]);
889 }
890 fprintf(stderr, "\n");
891 }
892
893 static int
bufCompare(xmlBufferPtr got,const xmlChar * expectContent,int expectLen)894 bufCompare(xmlBufferPtr got, const xmlChar *expectContent, int expectLen) {
895 const xmlChar *gotContent = xmlBufferContent(got);
896 int gotLen = xmlBufferLength(got);
897
898 if ((gotLen == expectLen) &&
899 (memcmp(gotContent, expectContent, gotLen) == 0))
900 return(0);
901
902 bufDump("got: ", gotContent, gotLen);
903 bufDump("expected:", expectContent, expectLen);
904
905 return(-1);
906 }
907
908 static int
testEncHandler(xmlCharEncodingHandlerPtr handler,const xmlChar * dec,int decSize,const xmlChar * enc,int encSize)909 testEncHandler(xmlCharEncodingHandlerPtr handler, const xmlChar *dec,
910 int decSize, const xmlChar *enc, int encSize) {
911 xmlBufferPtr encBuf = xmlBufferCreate();
912 xmlBufferPtr decBuf = xmlBufferCreate();
913 int ret = 0;
914
915 xmlBufferAdd(encBuf, enc, encSize);
916 xmlCharEncInFunc(handler, decBuf, encBuf);
917 if (bufCompare(decBuf, dec, decSize) != 0) {
918 fprintf(stderr, "Decoding %s failed\n", handler->name);
919 ret = -1;
920 }
921
922 #ifdef LIBXML_OUTPUT_ENABLED
923 xmlBufferEmpty(decBuf);
924 xmlBufferAdd(decBuf, dec, decSize);
925 xmlCharEncOutFunc(handler, encBuf, decBuf);
926 if (bufCompare(encBuf, enc, encSize) != 0) {
927 fprintf(stderr, "Encoding %s failed\n", handler->name);
928 ret = -1;
929 }
930 #endif
931
932 xmlBufferFree(decBuf);
933 xmlBufferFree(encBuf);
934 return(ret);
935 }
936
937 static int
testUTF16(void)938 testUTF16(void) {
939 static const xmlChar utf8[] =
940 "\x01"
941 "\x7F"
942 "\xC2\x80"
943 "\xDF\xBF"
944 "\xE0\xA0\x80"
945 "\xEF\xBF\xBF"
946 "\xF0\x90\x80\x80"
947 "\xF4\x8F\xBF\xBF";
948 static const xmlChar utf16LE[] =
949 "\x01\x00"
950 "\x7F\x00"
951 "\x80\x00"
952 "\xFF\x07"
953 "\x00\x08"
954 "\xFF\xFF"
955 "\x00\xD8\x00\xDC"
956 "\xFF\xDB\xFF\xDF";
957 static const xmlChar utf16BE[] =
958 "\x00\x01"
959 "\x00\x7F"
960 "\x00\x80"
961 "\x07\xFF"
962 "\x08\x00"
963 "\xFF\xFF"
964 "\xD8\x00\xDC\x00"
965 "\xDB\xFF\xDF\xFF";
966
967 xmlCharEncodingHandlerPtr handler16LE, handler16BE;
968 int ret = 0;
969
970 handler16LE = xmlFindCharEncodingHandler("UTF-16LE");
971 handler16BE = xmlFindCharEncodingHandler("UTF-16BE");
972
973 if (testEncHandler(handler16LE,
974 utf8, sizeof(utf8) - 1,
975 utf16LE, sizeof(utf16LE) - 1) != 0)
976 ret = -1;
977 if (testEncHandler(handler16BE,
978 utf8, sizeof(utf8) - 1,
979 utf16BE, sizeof(utf16BE) - 1) != 0)
980 ret = -1;
981
982 return(ret);
983 }
984
main(void)985 int main(void) {
986 int ret = 0;
987
988 /*
989 * this initialize the library and check potential ABI mismatches
990 * between the version it was compiled for and the actual shared
991 * library used.
992 */
993 LIBXML_TEST_VERSION
994
995 /*
996 * Catch errors separately
997 */
998
999 xmlSetStructuredErrorFunc(NULL, errorHandler);
1000
1001 /*
1002 * Run the tests
1003 */
1004 ret += testCharRanges();
1005 ret += testDocumentRanges();
1006 ret += testUserEncoding();
1007 #if defined(LIBXML_PUSH_ENABLED) && defined(LIBXML_OUTPUT_ENABLED)
1008 ret += testUserEncodingPush();
1009 ret += testUTF8Chunks();
1010 #endif
1011 ret += testUTF16();
1012
1013 /*
1014 * Cleanup function for the XML library.
1015 */
1016 xmlCleanupParser();
1017 return(ret ? 1 : 0);
1018 }
1019