xref: /aosp_15_r20/external/libxml2/testchar.c (revision 7c5688314b92172186c154356a6374bf7684c3ca)
1 /**
2  * Test the UTF-8 decoding routines
3  *
4  * author: Daniel Veillard
5  * copy: see Copyright for the status of this software.
6  */
7 
8 #define XML_DEPRECATED
9 #define XML_DEPRECATED_MEMBER
10 
11 #include <stdio.h>
12 #include <string.h>
13 #include <libxml/tree.h>
14 #include <libxml/parser.h>
15 #include <libxml/parserInternals.h>
16 
/*
 * Code of the first error reported to errorHandler() since this variable
 * was last reset to 0. Only used inside this file, hence static.
 */
static int lastError;
18 
errorHandler(void * unused,const xmlError * err)19 static void errorHandler(void *unused, const xmlError *err) {
20     if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
21         lastError = err->code;
22     }
23 }
24 
/*
 * Document templates for the injection tests. The four 'X' bytes mark the
 * area that testDocumentRanges() overwrites: offset 5 in document1
 * (element content) and offset 10 in document2 (attribute value).
 * Only used inside this file, hence static.
 */
static char document1[100] = "<doc>XXXX</doc>";
static char document2[100] = "<doc foo='XXXX'/>";
27 
/*
 * Write each byte value 0x00..0xFF to data[0], parse document (len bytes)
 * with xmlReadMemory() and compare the outcome with what is expected for
 * that byte. forbid1 and forbid2 name bytes that must be rejected in
 * addition to the generic rules (callers pass -1 for "none").
 *
 * Returns 0 if every byte behaved as expected, 1 on the first mismatch.
 * The parsed document is released on every path, including failures.
 */
static int testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
                  int len,  char *data, int forbid1, int forbid2) {
    int i;
    int failed;
    xmlDocPtr res;

    for (i = 0; i <= 0xFF; i++) {
        failed = 0;
        lastError = 0;
        xmlCtxtReset(ctxt);

        data[0] = (char) i;

        res = xmlReadMemory(document, len, "test", NULL, 0);

        if ((i == forbid1) || (i == forbid2)) {
            /* Explicitly forbidden byte: an error and no document. */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X: %c\n",
                        i, i);
                failed = 1;
            }
        } else if ((i == '<') || (i == '&')) {
            /* '<' and '&' must raise an error and yield no document. */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect illegal char %c for Byte 0x%02X\n",
                        i, i);
                failed = 1;
            }
        } else if (((i < 0x20) || (i >= 0x80)) &&
                   (i != 0x9) && (i != 0xA) && (i != 0xD)) {
            /*
             * Control characters (other than tab, LF, CR) and bytes
             * >= 0x80: a returned document only counts as a failure when
             * XML_ERR_INVALID_CHAR was not the recorded error.
             */
            if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", i);
                failed = 1;
            }
        } else if (res == NULL) {
            /* Every remaining byte must parse. */
            fprintf(stderr,
                "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
            failed = 1;
        }

        /* Free before returning so a failing iteration does not leak. */
        if (res != NULL)
            xmlFreeDoc(res);
        if (failed)
            return(1);
    }
    return(0);
}
75 
/*
 * Write every pair (i, j) with i in 0x80..0xFF and j in 0x00..0xFF to
 * data[0] and data[1], parse document (len bytes) with xmlReadMemory()
 * and check that invalid two byte sequences are rejected while the
 * remaining ones parse without error.
 *
 * Returns 0 if every pair behaved as expected, 1 on the first mismatch.
 * The parsed document is released on every path, including failures.
 */
static int testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
                  int len,  char *data) {
    int i, j;
    int failed;
    xmlDocPtr res;

    for (i = 0x80; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            failed = 0;
            lastError = 0;
            xmlCtxtReset(ctxt);

            data[0] = (char) i;
            data[1] = (char) j;

            res = xmlReadMemory(document, len, "test", NULL, 0);

            /* if first bit of first char is set, then second bit must too */
            if ((i & 0x80) && ((i & 0x40) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if first bit of first char is set, then second char first
             * bits must be 10
             */
            else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if using a 2 byte encoding then the value must be at least
             * 0x80, i.e. one of bits 4 to 1 of i must be set
             */
            else if ((i & 0x80) && ((i & 0x1E) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if third bit of first char is set, then the sequence would
             * need at least 3 bytes, but we give only 2 !
             */
            else if ((i & 0xE0) == 0xE0) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * We should see no error in remaining cases
             */
            else if ((lastError != 0) || (res == NULL)) {
                fprintf(stderr,
                    "Failed to parse document for Bytes 0x%02X 0x%02X\n",
                        i, j);
                failed = 1;
            }

            /* Free before returning so a failing iteration does not leak. */
            if (res != NULL)
                xmlFreeDoc(res);
            if (failed)
                return(1);
        }
    }
    return(0);
}
154 
155 /**
156  * testDocumentRanges:
157  *
158  * Test the correct UTF8 character parsing in context of XML documents
159  * Those are in-context injection tests checking the parser behaviour on
160  * edge case values at different point in content, beginning and end of
161  * CDATA in text or in attribute values.
162  */
163 
testDocumentRanges(void)164 static int testDocumentRanges(void) {
165     xmlParserCtxtPtr ctxt;
166     char *data;
167     int test_ret = 0;
168 
169     /*
170      * Set up a parsing context using the first document as
171      * the current input source.
172      */
173     ctxt = xmlNewParserCtxt();
174     if (ctxt == NULL) {
175         fprintf(stderr, "Failed to allocate parser context\n");
176 	return(1);
177     }
178 
179     printf("testing 1 byte char in document: 1");
180     fflush(stdout);
181     data = &document1[5];
182     data[0] = ' ';
183     data[1] = ' ';
184     data[2] = ' ';
185     data[3] = ' ';
186     /* test 1 byte injection at beginning of area */
187     test_ret += testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
188                            data, -1, -1);
189     printf(" 2");
190     fflush(stdout);
191     data[0] = ' ';
192     data[1] = ' ';
193     data[2] = ' ';
194     data[3] = ' ';
195     /* test 1 byte injection at end of area */
196     test_ret += testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
197                            data + 3, -1, -1);
198 
199     printf(" 3");
200     fflush(stdout);
201     data = &document2[10];
202     data[0] = ' ';
203     data[1] = ' ';
204     data[2] = ' ';
205     data[3] = ' ';
206     /* test 1 byte injection at beginning of area */
207     test_ret += testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
208                            data, '\'', -1);
209     printf(" 4");
210     fflush(stdout);
211     data[0] = ' ';
212     data[1] = ' ';
213     data[2] = ' ';
214     data[3] = ' ';
215     /* test 1 byte injection at end of area */
216     test_ret += testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
217                            data + 3, '\'', -1);
218     printf(" done\n");
219 
220     printf("testing 2 byte char in document: 1");
221     fflush(stdout);
222     data = &document1[5];
223     data[0] = ' ';
224     data[1] = ' ';
225     data[2] = ' ';
226     data[3] = ' ';
227     /* test 2 byte injection at beginning of area */
228     test_ret += testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
229                            data);
230     printf(" 2");
231     fflush(stdout);
232     data[0] = ' ';
233     data[1] = ' ';
234     data[2] = ' ';
235     data[3] = ' ';
236     /* test 2 byte injection at end of area */
237     test_ret += testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
238                            data + 2);
239 
240     printf(" 3");
241     fflush(stdout);
242     data = &document2[10];
243     data[0] = ' ';
244     data[1] = ' ';
245     data[2] = ' ';
246     data[3] = ' ';
247     /* test 2 byte injection at beginning of area */
248     test_ret += testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
249                            data);
250     printf(" 4");
251     fflush(stdout);
252     data[0] = ' ';
253     data[1] = ' ';
254     data[2] = ' ';
255     data[3] = ' ';
256     /* test 2 byte injection at end of area */
257     test_ret += testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
258                            data + 2);
259     printf(" done\n");
260 
261     xmlFreeParserCtxt(ctxt);
262     return(test_ret);
263 }
264 
265 static int
testCurrentChar(xmlParserCtxtPtr ctxt,int * len)266 testCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
267     const xmlChar *oldcur;
268     int c, err, len2;
269 
270     lastError = 0;
271     c = xmlCurrentChar(ctxt, len);
272     ctxt->input->flags = 0;
273     err = lastError;
274 
275     oldcur = ctxt->input->cur;
276     lastError = 0;
277     xmlNextChar(ctxt);
278     ctxt->input->flags = 0;
279     len2 = ctxt->input->cur - oldcur;
280     ctxt->input->cur = oldcur;
281 
282     if ((*ctxt->input->cur != 0) && (err != lastError)) {
283         fprintf(stderr, "xmlCurrentChar and xmlNextChar report different "
284                 "errors: %d %d\n", err, lastError);
285         return(-1);
286     }
287 
288     if ((err == 0) && (*len != len2)) {
289         fprintf(stderr, "xmlCurrentChar and xmlNextChar report different "
290                 "lengths: %d %d\n", *len, len2);
291         return(-1);
292     }
293 
294     lastError = err;
295 
296     return(c);
297 }
298 
/*
 * Decode every single byte value followed by zero bytes. Bytes >= 0x80
 * must raise XML_ERR_INVALID_ENCODING, 0xD must be returned as 0xA, and
 * every other byte must be returned unchanged with a length of 1.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte1(xmlParserCtxtPtr ctxt) {
    char *raw = (char *) ctxt->input->cur;
    int byte = 0;
    int width, ch;

    /* Bytes after the one under test are always zero. */
    raw[1] = 0;
    raw[2] = 0;
    raw[3] = 0;

    for (byte = 0; byte <= 0xFF; byte++) {
        raw[0] = (char) byte;
        ctxt->nbErrors = 0;

        ch = testCurrentChar(ctxt, &width);
        /*
         * A negative result is skipped, not counted as a failure.
         * NOTE(review): testCurrentChar() returns -1 after printing a
         * mismatch message - confirm that ignoring it is intended.
         */
        if (ch < 0)
            continue;

        if (byte >= 0x80) {
            /* we must see an error there */
            if (lastError != XML_ERR_INVALID_ENCODING) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
                return(1);
            }
            continue;
        }

        if (byte == 0xD) {
            /* carriage return is expected back as line feed */
            if ((ch != 0xA) || (width != 1)) {
                fprintf(stderr, "Failed to convert char for Byte 0x%02X\n",
                        byte);
                return(1);
            }
            continue;
        }

        if ((ch != byte) || (width != 1)) {
            fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", byte);
            return(1);
        }
    }
    return(0);
}
333 
/*
 * Decode every two byte sequence whose first byte is in 0x80..0xFF,
 * followed by zero bytes. Invalid sequences must raise
 * XML_ERR_INVALID_ENCODING; the others must decode without error to a
 * two byte character with the expected value.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte2(xmlParserCtxtPtr ctxt) {
    char *raw = (char *) ctxt->input->cur;
    int lead, trail;
    int width, ch;
    int expected;

    raw[2] = 0;
    raw[3] = 0;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            raw[0] = (char) lead;
            raw[1] = (char) trail;
            ctxt->nbErrors = 0;

            ch = testCurrentChar(ctxt, &width);
            /*
             * A negative result is skipped, not counted as a failure.
             * NOTE(review): confirm that ignoring it is intended.
             */
            if (ch < 0)
                continue;

            /* Lead byte 10xxxxxx: the second bit must be set as well. */
            if ((lead & 0x80) && ((lead & 0x40) == 0)) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            lead, trail);
                    return(1);
                }
                continue;
            }

            /* The second byte must start with the bits 10. */
            if ((lead & 0x80) && ((trail & 0xC0) != 0x80)) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, ch);
                    return(1);
                }
                continue;
            }

            /*
             * A two byte encoding must carry a value of at least 0x80,
             * so one of bits 4 to 1 of the lead byte has to be set.
             */
            if ((lead & 0x80) && ((lead & 0x1E) == 0)) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, ch);
                    return(1);
                }
                continue;
            }

            /*
             * Lead byte 111xxxxx announces at least three bytes, but only
             * two are supplied.
             */
            if ((lead & 0xE0) == 0xE0) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            lead, trail);
                    return(1);
                }
                continue;
            }

            /* Everything left is valid: no error, two bytes consumed. */
            if ((lastError != 0) || (width != 2)) {
                fprintf(stderr,
                    "Failed to parse char for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
                return(1);
            }

            /* And the decoded code point must match. */
            expected = (trail & 0x3F) + ((lead & 0x1F) << 6);
            if (ch != expected) {
                fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
                        lead, trail, expected, ch);
                return(1);
            }
        }
    }
    return(0);
}
422 
/*
 * Decode three byte sequences: every first byte in 0xE0..0xFF, every
 * second byte, and a third byte taken from a small table of edge values,
 * followed by a zero byte. Invalid sequences must raise
 * XML_ERR_INVALID_ENCODING; the others must decode without error to a
 * three byte character with the expected value.
 *
 * Diagnostics print the int copies of the bytes (or an unsigned char
 * cast) so that bytes >= 0x80 are not sign-extended where char is signed.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte3(xmlParserCtxtPtr ctxt) {
    int i, j, k, K;
    int len, c;
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[3] = 0;
    for (i = 0xE0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                data[0] = (char) i;
                data[1] = (char) j;
                K = lows[k];
                data[2] = (char) K;
                value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
                ctxt->nbErrors = 0;

                c = testCurrentChar(ctxt, &len);
                /*
                 * A negative result is skipped, not counted as a failure.
                 * NOTE(review): confirm that ignoring it is intended.
                 */
                if (c < 0)
                    continue;

                /*
                 * if fourth bit of first char is set, then the sequence
                 * would need at least 4 bytes, but we give only 3 !
                 */
                if ((i & 0xF0) == 0xF0) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, (unsigned char) data[3]);
                        return(1);
                    }
                }

                /*
                 * The second and the third bytes must start with 10
                 */
                else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return(1);
                    }
                }

                /*
                 * if using a 3 byte encoding then the value must be at
                 * least 0x800, i.e. one of bits 3 to 0 of i must be set
                 * or bit 5 of the second byte must be set
                 */
                else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return(1);
                    }
                }

                /*
                 * Values 0xD800..0xDFFF are not allowed in UTF-8
                 */
                else if ((value > 0xD7FF) && (value < 0xE000)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
        "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
                                value, i, j, K);
                        return(1);
                    }
                }

                /*
                 * We should see no error in remaining cases
                 */
                else if ((lastError != 0) || (len != 3)) {
                    fprintf(stderr,
                        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
                            i, j, K);
                    return(1);
                }

                /*
                 * Finally check the value is right
                 */
                else if (c != value) {
                    fprintf(stderr,
    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                            i, j, K, value, c);
                    return(1);
                }
            }
        }
    }
    return(0);
}
520 
/*
 * Decode four byte sequences: every first byte in 0xF0..0xFF, every
 * second byte, and third and fourth bytes taken from a small table of
 * edge values, followed by a zero byte. Invalid sequences must raise
 * XML_ERR_INVALID_ENCODING; the others must decode without error to a
 * four byte character with the expected value.
 *
 * Diagnostics print all four bytes from their int copies so that bytes
 * >= 0x80 are not sign-extended where char is signed.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int testCharRangeByte4(xmlParserCtxtPtr ctxt) {
    int i, j, k, K, l, L;
    int len, c;
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[4] = 0;
    for (i = 0xF0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                for (l = 0; l < 6; l++) {
                    data[0] = (char) i;
                    data[1] = (char) j;
                    K = lows[k];
                    data[2] = (char) K;
                    L = lows[l];
                    data[3] = (char) L;
                    value = (L & 0x3F) + ((K & 0x3F) << 6) +
                            ((j & 0x3F) << 12) + ((i & 0x7) << 18);
                    ctxt->nbErrors = 0;

                    c = testCurrentChar(ctxt, &len);
                    /*
                     * A negative result is skipped, not counted as a
                     * failure. NOTE(review): confirm this is intended.
                     */
                    if (c < 0)
                        continue;

                    /*
                     * if fifth bit of first char is set, then the sequence
                     * would need at least 5 bytes, but we give only 4 !
                     */
                    if ((i & 0xF8) == 0xF8) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
  "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * The second, third and fourth bytes must start with 10
                     */
                    else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
                             ((L & 0xC0) != 0x80)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * if using a 4 byte encoding then the value must be at
                     * least 0x10000, i.e. one of bits 2 to 0 of i must be
                     * set or bit 5 or bit 4 of j must be set
                     */
                    else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * Values 0xD800..0xDFFF and values above 0x10FFFF are
                     * not allowed in UTF-8
                     */
                    else if (((value > 0xD7FF) && (value < 0xE000)) ||
                             (value > 0x10FFFF)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    value, i, j, K, L);
                            return(1);
                        }
                    }

                    /*
                     * We should see no error in remaining cases
                     */
                    else if ((lastError != 0) || (len != 4)) {
                        fprintf(stderr,
            "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, L);
                        return(1);
                    }

                    /*
                     * Finally check the value is right
                     */
                    else if (c != value) {
                        fprintf(stderr,
"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                                i, j, K, L, value, c);
                        return(1);
                    }
                }
            }
        }
    }
    return(0);
}
625 
626 /**
627  * testCharRanges:
628  *
629  * Test the correct UTF8 character parsing in isolation i.e.
630  * not when parsing a full document, this is less expensive and we can
631  * cover the full range of UTF-8 chars accepted by XML-1.0
632  */
633 
testCharRanges(void)634 static int testCharRanges(void) {
635     char data[5];
636     xmlParserCtxtPtr ctxt;
637     xmlParserInputBufferPtr buf;
638     xmlParserInputPtr input;
639     int test_ret = 0;
640 
641     memset(data, 0, 5);
642 
643     /*
644      * Set up a parsing context using the above data buffer as
645      * the current input source.
646      */
647     ctxt = xmlNewParserCtxt();
648     if (ctxt == NULL) {
649         fprintf(stderr, "Failed to allocate parser context\n");
650 	return(1);
651     }
652     buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
653                                            XML_CHAR_ENCODING_NONE);
654     if (buf == NULL) {
655         fprintf(stderr, "Failed to allocate input buffer\n");
656 	test_ret = 1;
657 	goto error;
658     }
659     input = xmlNewInputStream(ctxt);
660     if (input == NULL) {
661         xmlFreeParserInputBuffer(buf);
662 	test_ret = 1;
663 	goto error;
664     }
665     input->filename = NULL;
666     input->buf = buf;
667     input->cur =
668     input->base = xmlBufContent(input->buf->buffer);
669     input->end = input->base + 4;
670     inputPush(ctxt, input);
671 
672     printf("testing char range: 1");
673     fflush(stdout);
674     test_ret += testCharRangeByte1(ctxt);
675     printf(" 2");
676     fflush(stdout);
677     test_ret += testCharRangeByte2(ctxt);
678     printf(" 3");
679     fflush(stdout);
680     test_ret += testCharRangeByte3(ctxt);
681     printf(" 4");
682     fflush(stdout);
683     test_ret += testCharRangeByte4(ctxt);
684     printf(" done\n");
685     fflush(stdout);
686 
687 error:
688     xmlFreeParserCtxt(ctxt);
689     return(test_ret);
690 }
691 
692 static int
testUserEncoding(void)693 testUserEncoding(void) {
694     /*
695      * Create a document encoded as UTF-16LE with an ISO-8859-1 encoding
696      * declaration, then parse it with xmlReadMemory and the encoding
697      * argument set to UTF-16LE.
698      */
699     xmlDocPtr doc = NULL;
700     const char *start = "<?xml version='1.0' encoding='ISO-8859-1'?><d>";
701     const char *end = "</d>";
702     char *buf = NULL;
703     xmlChar *text;
704     int startSize = strlen(start);
705     int textSize = 100000; /* Make sure to exceed internal buffer sizes. */
706     int endSize = strlen(end);
707     int totalSize = startSize + textSize + endSize;
708     int k = 0;
709     int i;
710     int ret = 1;
711 
712     buf = xmlMalloc(2 * totalSize);
713     for (i = 0; start[i] != 0; i++) {
714         buf[k++] = start[i];
715         buf[k++] = 0;
716     }
717     for (i = 0; i < textSize; i++) {
718         buf[k++] = 'x';
719         buf[k++] = 0;
720     }
721     for (i = 0; end[i] != 0; i++) {
722         buf[k++] = end[i];
723         buf[k++] = 0;
724     }
725 
726     doc = xmlReadMemory(buf, 2 * totalSize, NULL, "UTF-16LE", 0);
727     if (doc == NULL) {
728         fprintf(stderr, "failed to parse document\n");
729         goto error;
730     }
731 
732     text = doc->children->children->content;
733     for (i = 0; i < textSize; i++) {
734         if (text[i] != 'x') {
735             fprintf(stderr, "text node has wrong content at offset %d\n", k);
736             goto error;
737         }
738     }
739 
740     ret = 0;
741 
742 error:
743     xmlFreeDoc(doc);
744     xmlFree(buf);
745 
746     return ret;
747 }
748 
749 #if defined(LIBXML_PUSH_ENABLED) && defined(LIBXML_OUTPUT_ENABLED)
750 
751 static char *
convert(xmlCharEncodingHandlerPtr handler,const char * utf8,int size,int * outSize)752 convert(xmlCharEncodingHandlerPtr handler, const char *utf8, int size,
753         int *outSize) {
754     xmlBufferPtr in, out;
755     char *ret;
756 
757     in = xmlBufferCreate();
758     xmlBufferAdd(in, BAD_CAST utf8, size);
759     out = xmlBufferCreate();
760     xmlCharEncOutFunc(handler, out, in);
761 
762     if (outSize)
763         *outSize = out->use;
764     ret = (char *) xmlBufferDetach(out);
765 
766     xmlBufferFree(out);
767     xmlBufferFree(in);
768     return(ret);
769 }
770 
771 static int
testUserEncodingPush(void)772 testUserEncodingPush(void) {
773     xmlCharEncodingHandlerPtr handler;
774     xmlParserCtxtPtr ctxt;
775     xmlDocPtr doc;
776     char buf[] =
777         "\xEF\xBB\xBF"
778         "<?xml version='1.0' encoding='ISO-8859-1'?>\n"
779         "<d>text</d>\n";
780     char *utf16;
781     int utf16Size;
782     int ret = 1;
783 
784     handler = xmlGetCharEncodingHandler(XML_CHAR_ENCODING_UTF16LE);
785     utf16 = convert(handler, buf, sizeof(buf) - 1, &utf16Size);
786     ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
787     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF16LE);
788     xmlParseChunk(ctxt, utf16, utf16Size, 0);
789     xmlParseChunk(ctxt, NULL, 0, 1);
790     doc = ctxt->myDoc;
791 
792     if ((doc != NULL) &&
793         (doc->children != NULL) &&
794         (doc->children->children != NULL) &&
795         (xmlStrcmp(doc->children->children->content, BAD_CAST "text") == 0))
796         ret = 0;
797 
798     xmlFreeDoc(doc);
799     xmlFreeParserCtxt(ctxt);
800     xmlFree(utf16);
801 
802     return(ret);
803 }
804 
805 static int
testUTF8Chunks(void)806 testUTF8Chunks(void) {
807     xmlParserCtxtPtr ctxt;
808     xmlChar *out;
809     int outSize;
810     char *buf;
811     int i;
812     int ret = 0;
813 
814     ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
815 
816     xmlParseChunk(ctxt, "<d>", 3, 0);
817     xmlParseChunk(ctxt, "\xF0", 1, 0);
818     xmlParseChunk(ctxt, "\x9F", 1, 0);
819     xmlParseChunk(ctxt, "\x98", 1, 0);
820     xmlParseChunk(ctxt, "\x8A", 1, 0);
821     xmlParseChunk(ctxt, "</d>", 4, 1);
822 
823     xmlDocDumpMemory(ctxt->myDoc, &out, &outSize);
824     if (strcmp((char *) out,
825                "<?xml version=\"1.0\"?>\n<d>&#x1F60A;</d>\n") != 0) {
826         fprintf(stderr, "failed UTF-8 chunk test 1\n");
827         ret += 1;
828     }
829 
830     xmlFree(out);
831     xmlFreeDoc(ctxt->myDoc);
832     xmlFreeParserCtxt(ctxt);
833 
834     ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
835 
836     xmlParseChunk(ctxt, "<d>", 3, 0);
837 
838     /*
839      * Create a chunk longer than XML_PARSER_BIG_BUFFER_SIZE (300) ending
840      * with an incomplete UTF-8 sequence.
841      */
842     buf = xmlMalloc(1000 * 2 + 1);
843     for (i = 0; i < 2000; i += 2)
844         memcpy(buf + i, "\xCE\xB1", 2);
845     buf[i] = '\xCE';
846     xmlParseChunk(ctxt, buf, 2001, 0);
847     xmlFree(buf);
848 
849     xmlParseChunk(ctxt, "\xB1</d>", 4, 0);
850     xmlParseChunk(ctxt, NULL, 0, 0);
851 
852     xmlDocDumpMemory(ctxt->myDoc, &out, &outSize);
853     if (strncmp((char *) out, "<?xml version=\"1.0\"?>\n<d>", 25) != 0) {
854         fprintf(stderr, "failed UTF-8 chunk test 2-1\n");
855         ret += 1;
856         goto error;
857     }
858     for (i = 25; i < 25 + 1001 * 7; i += 7) {
859         if (memcmp(out + i, "&#x3B1;", 7) != 0) {
860             fprintf(stderr, "failed UTF-8 chunk test 2-2 %d\n", i);
861             ret += 1;
862             goto error;
863         }
864     }
865     if (strcmp((char *) out + i, "</d>\n") != 0) {
866         fprintf(stderr, "failed UTF-8 chunk test 2-3\n");
867         ret += 1;
868         goto error;
869     }
870 
871 error:
872     xmlFree(out);
873     xmlFreeDoc(ctxt->myDoc);
874     xmlFreeParserCtxt(ctxt);
875 
876     return(ret);
877     return(0);
878 }
879 
880 #endif
881 
882 static void
bufDump(const char * prefix,const xmlChar * content,int len)883 bufDump(const char *prefix, const xmlChar *content, int len) {
884     int i;
885 
886     fprintf(stderr, "%s", prefix);
887     for (i = 0; i < len; i++) {
888         fprintf(stderr, " %02X", content[i]);
889     }
890     fprintf(stderr, "\n");
891 }
892 
893 static int
bufCompare(xmlBufferPtr got,const xmlChar * expectContent,int expectLen)894 bufCompare(xmlBufferPtr got, const xmlChar *expectContent, int expectLen) {
895     const xmlChar *gotContent = xmlBufferContent(got);
896     int gotLen = xmlBufferLength(got);
897 
898     if ((gotLen == expectLen) &&
899         (memcmp(gotContent, expectContent, gotLen) == 0))
900         return(0);
901 
902     bufDump("got:     ", gotContent, gotLen);
903     bufDump("expected:", expectContent, expectLen);
904 
905     return(-1);
906 }
907 
908 static int
testEncHandler(xmlCharEncodingHandlerPtr handler,const xmlChar * dec,int decSize,const xmlChar * enc,int encSize)909 testEncHandler(xmlCharEncodingHandlerPtr handler, const xmlChar *dec,
910                 int decSize, const xmlChar *enc, int encSize) {
911     xmlBufferPtr encBuf = xmlBufferCreate();
912     xmlBufferPtr decBuf = xmlBufferCreate();
913     int ret = 0;
914 
915     xmlBufferAdd(encBuf, enc, encSize);
916     xmlCharEncInFunc(handler, decBuf, encBuf);
917     if (bufCompare(decBuf, dec, decSize) != 0) {
918         fprintf(stderr, "Decoding %s failed\n", handler->name);
919         ret = -1;
920     }
921 
922 #ifdef LIBXML_OUTPUT_ENABLED
923     xmlBufferEmpty(decBuf);
924     xmlBufferAdd(decBuf, dec, decSize);
925     xmlCharEncOutFunc(handler, encBuf, decBuf);
926     if (bufCompare(encBuf, enc, encSize) != 0) {
927         fprintf(stderr, "Encoding %s failed\n", handler->name);
928         ret = -1;
929     }
930 #endif
931 
932     xmlBufferFree(decBuf);
933     xmlBufferFree(encBuf);
934     return(ret);
935 }
936 
937 static int
testUTF16(void)938 testUTF16(void) {
939     static const xmlChar utf8[] =
940         "\x01"
941         "\x7F"
942         "\xC2\x80"
943         "\xDF\xBF"
944         "\xE0\xA0\x80"
945         "\xEF\xBF\xBF"
946         "\xF0\x90\x80\x80"
947         "\xF4\x8F\xBF\xBF";
948     static const xmlChar utf16LE[] =
949         "\x01\x00"
950         "\x7F\x00"
951         "\x80\x00"
952         "\xFF\x07"
953         "\x00\x08"
954         "\xFF\xFF"
955         "\x00\xD8\x00\xDC"
956         "\xFF\xDB\xFF\xDF";
957     static const xmlChar utf16BE[] =
958         "\x00\x01"
959         "\x00\x7F"
960         "\x00\x80"
961         "\x07\xFF"
962         "\x08\x00"
963         "\xFF\xFF"
964         "\xD8\x00\xDC\x00"
965         "\xDB\xFF\xDF\xFF";
966 
967     xmlCharEncodingHandlerPtr handler16LE, handler16BE;
968     int ret = 0;
969 
970     handler16LE = xmlFindCharEncodingHandler("UTF-16LE");
971     handler16BE = xmlFindCharEncodingHandler("UTF-16BE");
972 
973     if (testEncHandler(handler16LE,
974                        utf8, sizeof(utf8) - 1,
975                        utf16LE, sizeof(utf16LE) - 1) != 0)
976         ret = -1;
977     if (testEncHandler(handler16BE,
978                        utf8, sizeof(utf8) - 1,
979                        utf16BE, sizeof(utf16BE) - 1) != 0)
980         ret = -1;
981 
982     return(ret);
983 }
984 
main(void)985 int main(void) {
986     int ret = 0;
987 
988     /*
989      * this initialize the library and check potential ABI mismatches
990      * between the version it was compiled for and the actual shared
991      * library used.
992      */
993     LIBXML_TEST_VERSION
994 
995     /*
996      * Catch errors separately
997      */
998 
999     xmlSetStructuredErrorFunc(NULL, errorHandler);
1000 
1001     /*
1002      * Run the tests
1003      */
1004     ret += testCharRanges();
1005     ret += testDocumentRanges();
1006     ret += testUserEncoding();
1007 #if defined(LIBXML_PUSH_ENABLED) && defined(LIBXML_OUTPUT_ENABLED)
1008     ret += testUserEncodingPush();
1009     ret += testUTF8Chunks();
1010 #endif
1011     ret += testUTF16();
1012 
1013     /*
1014      * Cleanup function for the XML library.
1015      */
1016     xmlCleanupParser();
1017     return(ret ? 1 : 0);
1018 }
1019