1 /*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
16 * See Copyright for the status of this software.
17 *
18 * [email protected]
19 *
20 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <[email protected]>
21 */
22
23 #define IN_LIBXML
24 #include "libxml.h"
25
26 #include <string.h>
27 #include <limits.h>
28 #include <ctype.h>
29 #include <stdlib.h>
30
31 #ifdef LIBXML_ICONV_ENABLED
32 #include <iconv.h>
33 #include <errno.h>
34 #endif
35
36 #include <libxml/encoding.h>
37 #include <libxml/xmlmemory.h>
38 #include <libxml/parser.h>
39 #ifdef LIBXML_HTML_ENABLED
40 #include <libxml/HTMLparser.h>
41 #endif
42 #include <libxml/xmlerror.h>
43
44 #include "private/buf.h"
45 #include "private/enc.h"
46 #include "private/entities.h"
47 #include "private/error.h"
48
49 #ifdef LIBXML_ICU_ENABLED
50 #include <unicode/ucnv.h>
51 #endif
52
53 #define XML_HANDLER_STATIC 1
54
/*
 * One entry of the user-registered alias table: @alias is the alternate
 * spelling and @name the encoding name it resolves to. Both strings are
 * heap copies made by xmlAddEncodingAlias and released by
 * xmlDelEncodingAlias / xmlCleanupEncodingAliases.
 */
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
struct _xmlCharEncodingAlias {
    const char *name;   /* encoding name returned by xmlGetEncodingAlias */
    const char *alias;  /* lookup key, stored upper-cased by xmlAddEncodingAlias */
};

/* Alias table, grown on demand by xmlAddEncodingAlias. */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;   /* number of entries in use */
static int xmlCharEncodingAliasesMax = 0;  /* number of entries allocated */

/*
 * Host byte order: 1 for little-endian, 0 for big-endian. The real value
 * is computed by xmlInitEncodingInternal.
 */
static int xmlLittleEndian = 1;
67
/* Maps an accepted encoding name to its xmlCharEncoding value. */
typedef struct {
    const char *name;
    xmlCharEncoding enc;
} xmlEncTableEntry;

/*
 * Known encoding names and spelling variants.
 *
 * xmlParseCharEncodingInternal searches this table with bsearch() using a
 * case-insensitive comparison (xmlCompareEncTableEntries), so the entries
 * must be kept sorted in that order when adding new names.
 */
static const xmlEncTableEntry xmlEncTable[] = {
    { "ASCII", XML_CHAR_ENCODING_ASCII },
    { "EUC-JP", XML_CHAR_ENCODING_EUC_JP },
    { "HTML", XML_CHAR_ENCODING_HTML },
    { "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 },
    { "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 },
    { "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 },
    { "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE },
    { "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP },
    { "ISO-8859-1", XML_CHAR_ENCODING_8859_1 },
    { "ISO-8859-10", XML_CHAR_ENCODING_8859_10 },
    { "ISO-8859-11", XML_CHAR_ENCODING_8859_11 },
    { "ISO-8859-13", XML_CHAR_ENCODING_8859_13 },
    { "ISO-8859-14", XML_CHAR_ENCODING_8859_14 },
    { "ISO-8859-15", XML_CHAR_ENCODING_8859_15 },
    { "ISO-8859-16", XML_CHAR_ENCODING_8859_16 },
    { "ISO-8859-2", XML_CHAR_ENCODING_8859_2 },
    { "ISO-8859-3", XML_CHAR_ENCODING_8859_3 },
    { "ISO-8859-4", XML_CHAR_ENCODING_8859_4 },
    { "ISO-8859-5", XML_CHAR_ENCODING_8859_5 },
    { "ISO-8859-6", XML_CHAR_ENCODING_8859_6 },
    { "ISO-8859-7", XML_CHAR_ENCODING_8859_7 },
    { "ISO-8859-8", XML_CHAR_ENCODING_8859_8 },
    { "ISO-8859-9", XML_CHAR_ENCODING_8859_9 },
    { "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 },
    { "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 },
    { "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS },
    { "UCS-2", XML_CHAR_ENCODING_UCS2 },
    { "UCS-4", XML_CHAR_ENCODING_UCS4LE },
    { "UCS2", XML_CHAR_ENCODING_UCS2 },
    { "UCS4", XML_CHAR_ENCODING_UCS4LE },
    { "US-ASCII", XML_CHAR_ENCODING_ASCII },
    { "UTF-16", XML_CHAR_ENCODING_UTF16 },
    { "UTF-16BE", XML_CHAR_ENCODING_UTF16BE },
    { "UTF-16LE", XML_CHAR_ENCODING_UTF16LE },
    { "UTF-8", XML_CHAR_ENCODING_UTF8 },
    { "UTF16", XML_CHAR_ENCODING_UTF16 },
    { "UTF8", XML_CHAR_ENCODING_UTF8 }
};
112
113 static int
114 asciiToAscii(unsigned char* out, int *outlen,
115 const unsigned char* in, int *inlen, void *vctxt);
116 static int
117 UTF8ToUTF8(unsigned char* out, int *outlen,
118 const unsigned char* inb, int *inlenb, void *vctxt);
119 static int
120 latin1ToUTF8(unsigned char* out, int *outlen,
121 const unsigned char* in, int *inlen, void *vctxt);
122 static int
123 UTF16LEToUTF8(unsigned char* out, int *outlen,
124 const unsigned char* inb, int *inlenb, void *vctxt);
125 static int
126 UTF16BEToUTF8(unsigned char* out, int *outlen,
127 const unsigned char* inb, int *inlenb, void *vctxt);
128
129 #ifdef LIBXML_OUTPUT_ENABLED
130
131 static int
132 UTF8ToLatin1(unsigned char* outb, int *outlen,
133 const unsigned char* in, int *inlen, void *vctxt);
134 static int
135 UTF8ToUTF16(unsigned char* outb, int *outlen,
136 const unsigned char* in, int *inlen, void *vctxt);
137 static int
138 UTF8ToUTF16LE(unsigned char* outb, int *outlen,
139 const unsigned char* in, int *inlen, void *vctxt);
140 static int
141 UTF8ToUTF16BE(unsigned char* outb, int *outlen,
142 const unsigned char* in, int *inlen, void *vctxt);
143
144 #else /* LIBXML_OUTPUT_ENABLED */
145
146 #define UTF8ToLatin1 NULL
147 #define UTF8ToUTF16 NULL
148 #define UTF8ToUTF16LE NULL
149 #define UTF8ToUTF16BE NULL
150
151 #endif /* LIBXML_OUTPUT_ENABLED */
152
153 #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
154 static int
155 UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
156 const unsigned char *in, int *inlen, void *vctxt);
157 #else
158 #define UTF8ToHtmlWrapper NULL
159 #endif
160
161 #ifdef LIBXML_ICONV_ENABLED
162 #define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0
163 #else
164 #define EMPTY_ICONV
165 #endif
166
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
    defined(LIBXML_ISO8859X_ENABLED)

#include "iso8859x.inc"

static int
ISO8859xToUTF8(unsigned char* out, int *outlen,
               const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToISO8859x(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen, void *vctxt);

/*
 * Initializer for a built-in ISO-8859-<n> handler backed by the generated
 * tables from iso8859x.inc. The input slot is cast to the input function
 * type and the output slot to the output function type, matching
 * MAKE_HANDLER.
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) UTF8ToISO8859x \
      EMPTY_ICONV, \
      (void *) xmlunicodetable_ISO8859_##n, \
      (void *) xmltranscodetable_ISO8859_##n, \
      NULL, XML_HANDLER_STATIC }

#else /* LIBXML_ISO8859X_ENABLED */

/*
 * Without the built-in tables the entry only carries the name; a converter
 * has to be found elsewhere (see xmlFindExtraHandler).
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \
      XML_HANDLER_STATIC }

#endif /* LIBXML_ISO8859X_ENABLED */
195
/*
 * Initializer for a statically allocated handler with the given name and
 * input/output conversion functions (either may be NULL).
 */
#define MAKE_HANDLER(name, in, out) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) in, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) out \
      EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC }

/*
 * Built-in handlers, indexed directly by xmlCharEncoding value.
 *
 * The layout must match enum xmlCharEncoding.
 *
 * Names should match the IANA registry if possible:
 * https://www.iana.org/assignments/character-sets/character-sets.xhtml
 */
static const xmlCharEncodingHandler defaultHandlers[31] = {
    MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
    MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
    MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
    MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
    MAKE_HANDLER("UCS-4LE", NULL, NULL),
    MAKE_HANDLER("UCS-4BE", NULL, NULL),
    MAKE_HANDLER("IBM037", NULL, NULL),
    MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
    MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_3412 */
    MAKE_HANDLER("ISO-10646-UCS-2", NULL, NULL),
    MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
    MAKE_ISO_HANDLER("ISO-8859-2", 2),
    MAKE_ISO_HANDLER("ISO-8859-3", 3),
    MAKE_ISO_HANDLER("ISO-8859-4", 4),
    MAKE_ISO_HANDLER("ISO-8859-5", 5),
    MAKE_ISO_HANDLER("ISO-8859-6", 6),
    MAKE_ISO_HANDLER("ISO-8859-7", 7),
    MAKE_ISO_HANDLER("ISO-8859-8", 8),
    MAKE_ISO_HANDLER("ISO-8859-9", 9),
    MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
    MAKE_HANDLER("Shift_JIS", NULL, NULL),
    MAKE_HANDLER("EUC-JP", NULL, NULL),
    MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
    MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
    MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
    MAKE_ISO_HANDLER("ISO-8859-10", 10),
    MAKE_ISO_HANDLER("ISO-8859-11", 11),
    MAKE_ISO_HANDLER("ISO-8859-13", 13),
    MAKE_ISO_HANDLER("ISO-8859-14", 14),
    MAKE_ISO_HANDLER("ISO-8859-15", 15),
    MAKE_ISO_HANDLER("ISO-8859-16", 16),
};

/* Number of entries in defaultHandlers, used to range-check enum values. */
#define NUM_DEFAULT_HANDLERS \
    (sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))

/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
/*
 * Registry of user-supplied handlers (deprecated API): a lazily allocated
 * array of MAX_ENCODING_HANDLERS slots, of which the first
 * nbCharEncodingHandler are in use.
 */
static xmlCharEncodingHandlerPtr *globalHandlers = NULL;
static int nbCharEncodingHandler = 0;
249
250 #ifdef LIBXML_ICONV_ENABLED
251 static int
252 xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
253 #endif
254
255 #ifdef LIBXML_ICU_ENABLED
256 static int
257 xmlCharEncUconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
258 #endif
259
260 /************************************************************************
261 * *
262 * Generic encoding handling routines *
263 * *
264 ************************************************************************/
265
266 /**
267 * xmlDetectCharEncoding:
268 * @in: a pointer to the first bytes of the XML entity, must be at least
269 * 2 bytes long (at least 4 if encoding is UTF4 variant).
270 * @len: pointer to the length of the buffer
271 *
272 * Guess the encoding of the entity using the first bytes of the entity content
273 * according to the non-normative appendix F of the XML-1.0 recommendation.
274 *
275 * Returns one of the XML_CHAR_ENCODING_... values.
276 */
277 xmlCharEncoding
xmlDetectCharEncoding(const unsigned char * in,int len)278 xmlDetectCharEncoding(const unsigned char* in, int len)
279 {
280 if (in == NULL)
281 return(XML_CHAR_ENCODING_NONE);
282 if (len >= 4) {
283 if ((in[0] == 0x00) && (in[1] == 0x00) &&
284 (in[2] == 0x00) && (in[3] == 0x3C))
285 return(XML_CHAR_ENCODING_UCS4BE);
286 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
287 (in[2] == 0x00) && (in[3] == 0x00))
288 return(XML_CHAR_ENCODING_UCS4LE);
289 if ((in[0] == 0x00) && (in[1] == 0x00) &&
290 (in[2] == 0x3C) && (in[3] == 0x00))
291 return(XML_CHAR_ENCODING_UCS4_2143);
292 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
293 (in[2] == 0x00) && (in[3] == 0x00))
294 return(XML_CHAR_ENCODING_UCS4_3412);
295 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
296 (in[2] == 0xA7) && (in[3] == 0x94))
297 return(XML_CHAR_ENCODING_EBCDIC);
298 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
299 (in[2] == 0x78) && (in[3] == 0x6D))
300 return(XML_CHAR_ENCODING_UTF8);
301 /*
302 * Although not part of the recommendation, we also
303 * attempt an "auto-recognition" of UTF-16LE and
304 * UTF-16BE encodings.
305 */
306 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
307 (in[2] == 0x3F) && (in[3] == 0x00))
308 return(XML_CHAR_ENCODING_UTF16LE);
309 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
310 (in[2] == 0x00) && (in[3] == 0x3F))
311 return(XML_CHAR_ENCODING_UTF16BE);
312 }
313 if (len >= 3) {
314 /*
315 * Errata on XML-1.0 June 20 2001
316 * We now allow an UTF8 encoded BOM
317 */
318 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
319 (in[2] == 0xBF))
320 return(XML_CHAR_ENCODING_UTF8);
321 }
322 /* For UTF-16 we can recognize by the BOM */
323 if (len >= 2) {
324 if ((in[0] == 0xFE) && (in[1] == 0xFF))
325 return(XML_CHAR_ENCODING_UTF16BE);
326 if ((in[0] == 0xFF) && (in[1] == 0xFE))
327 return(XML_CHAR_ENCODING_UTF16LE);
328 }
329 return(XML_CHAR_ENCODING_NONE);
330 }
331
332 /**
333 * xmlCleanupEncodingAliases:
334 *
335 * DEPRECATED: This function modifies global state and is not
336 * thread-safe.
337 *
338 * Unregisters all aliases
339 */
340 void
xmlCleanupEncodingAliases(void)341 xmlCleanupEncodingAliases(void) {
342 int i;
343
344 if (xmlCharEncodingAliases == NULL)
345 return;
346
347 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
348 if (xmlCharEncodingAliases[i].name != NULL)
349 xmlFree((char *) xmlCharEncodingAliases[i].name);
350 if (xmlCharEncodingAliases[i].alias != NULL)
351 xmlFree((char *) xmlCharEncodingAliases[i].alias);
352 }
353 xmlCharEncodingAliasesNb = 0;
354 xmlCharEncodingAliasesMax = 0;
355 xmlFree(xmlCharEncodingAliases);
356 xmlCharEncodingAliases = NULL;
357 }
358
359 /**
360 * xmlGetEncodingAlias:
361 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
362 *
363 * DEPRECATED: This function is not thread-safe.
364 *
365 * Lookup an encoding name for the given alias.
366 *
367 * Returns NULL if not found, otherwise the original name
368 */
369 const char *
xmlGetEncodingAlias(const char * alias)370 xmlGetEncodingAlias(const char *alias) {
371 int i;
372 char upper[100];
373
374 if (alias == NULL)
375 return(NULL);
376
377 if (xmlCharEncodingAliases == NULL)
378 return(NULL);
379
380 for (i = 0;i < 99;i++) {
381 upper[i] = (char) toupper((unsigned char) alias[i]);
382 if (upper[i] == 0) break;
383 }
384 upper[i] = 0;
385
386 /*
387 * Walk down the list looking for a definition of the alias
388 */
389 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
390 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
391 return(xmlCharEncodingAliases[i].name);
392 }
393 }
394 return(NULL);
395 }
396
397 /**
398 * xmlAddEncodingAlias:
399 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
400 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
401 *
402 * DEPRECATED: This function modifies global state and is not
403 * thread-safe.
404 *
405 * Registers an alias @alias for an encoding named @name. Existing alias
406 * will be overwritten.
407 *
408 * Returns 0 in case of success, -1 in case of error
409 */
410 int
xmlAddEncodingAlias(const char * name,const char * alias)411 xmlAddEncodingAlias(const char *name, const char *alias) {
412 int i;
413 char upper[100];
414 char *nameCopy, *aliasCopy;
415
416 if ((name == NULL) || (alias == NULL))
417 return(-1);
418
419 for (i = 0;i < 99;i++) {
420 upper[i] = (char) toupper((unsigned char) alias[i]);
421 if (upper[i] == 0) break;
422 }
423 upper[i] = 0;
424
425 if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
426 xmlCharEncodingAliasPtr tmp;
427 size_t newSize = xmlCharEncodingAliasesMax ?
428 xmlCharEncodingAliasesMax * 2 :
429 20;
430
431 tmp = (xmlCharEncodingAliasPtr)
432 xmlRealloc(xmlCharEncodingAliases,
433 newSize * sizeof(xmlCharEncodingAlias));
434 if (tmp == NULL)
435 return(-1);
436 xmlCharEncodingAliases = tmp;
437 xmlCharEncodingAliasesMax = newSize;
438 }
439
440 /*
441 * Walk down the list looking for a definition of the alias
442 */
443 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
444 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
445 /*
446 * Replace the definition.
447 */
448 nameCopy = xmlMemStrdup(name);
449 if (nameCopy == NULL)
450 return(-1);
451 xmlFree((char *) xmlCharEncodingAliases[i].name);
452 xmlCharEncodingAliases[i].name = nameCopy;
453 return(0);
454 }
455 }
456 /*
457 * Add the definition
458 */
459 nameCopy = xmlMemStrdup(name);
460 if (nameCopy == NULL)
461 return(-1);
462 aliasCopy = xmlMemStrdup(upper);
463 if (aliasCopy == NULL) {
464 xmlFree(nameCopy);
465 return(-1);
466 }
467 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy;
468 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy;
469 xmlCharEncodingAliasesNb++;
470 return(0);
471 }
472
473 /**
474 * xmlDelEncodingAlias:
475 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
476 *
477 * DEPRECATED: This function modifies global state and is not
478 * thread-safe.
479 *
480 * Unregisters an encoding alias @alias
481 *
482 * Returns 0 in case of success, -1 in case of error
483 */
484 int
xmlDelEncodingAlias(const char * alias)485 xmlDelEncodingAlias(const char *alias) {
486 int i;
487
488 if (alias == NULL)
489 return(-1);
490
491 if (xmlCharEncodingAliases == NULL)
492 return(-1);
493 /*
494 * Walk down the list looking for a definition of the alias
495 */
496 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
497 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
498 xmlFree((char *) xmlCharEncodingAliases[i].name);
499 xmlFree((char *) xmlCharEncodingAliases[i].alias);
500 xmlCharEncodingAliasesNb--;
501 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
502 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
503 return(0);
504 }
505 }
506 return(-1);
507 }
508
509 static int
xmlCompareEncTableEntries(const void * vkey,const void * ventry)510 xmlCompareEncTableEntries(const void *vkey, const void *ventry) {
511 const char *key = vkey;
512 const xmlEncTableEntry *entry = ventry;
513
514 return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name));
515 }
516
517 static xmlCharEncoding
xmlParseCharEncodingInternal(const char * name)518 xmlParseCharEncodingInternal(const char *name)
519 {
520 const xmlEncTableEntry *entry;
521
522 if (name == NULL)
523 return(XML_CHAR_ENCODING_NONE);
524
525 entry = bsearch(name, xmlEncTable,
526 sizeof(xmlEncTable) / sizeof(xmlEncTable[0]),
527 sizeof(xmlEncTable[0]), xmlCompareEncTableEntries);
528 if (entry != NULL)
529 return(entry->enc);
530
531 return(XML_CHAR_ENCODING_ERROR);
532 }
533
534 /**
535 * xmlParseCharEncoding:
536 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
537 *
538 * Compare the string to the encoding schemes already known. Note
539 * that the comparison is case insensitive accordingly to the section
540 * [XML] 4.3.3 Character Encoding in Entities.
541 *
542 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
543 * if not recognized.
544 */
545 xmlCharEncoding
xmlParseCharEncoding(const char * name)546 xmlParseCharEncoding(const char *name)
547 {
548 xmlCharEncoding enc = xmlParseCharEncodingInternal(name);
549
550 /* Backward compatibility */
551 if (enc == XML_CHAR_ENCODING_UTF16)
552 enc = XML_CHAR_ENCODING_UTF16LE;
553
554 return(enc);
555 }
556
557 /**
558 * xmlGetCharEncodingName:
559 * @enc: the encoding
560 *
561 * The "canonical" name for XML encoding.
562 * C.f. http://www.w3.org/TR/REC-xml#charencoding
563 * Section 4.3.3 Character Encoding in Entities
564 *
565 * Returns the canonical name for the given encoding
566 */
567 const char*
xmlGetCharEncodingName(xmlCharEncoding enc)568 xmlGetCharEncodingName(xmlCharEncoding enc) {
569 switch (enc) {
570 case XML_CHAR_ENCODING_UTF16LE:
571 return("UTF-16");
572 case XML_CHAR_ENCODING_UTF16BE:
573 return("UTF-16");
574 case XML_CHAR_ENCODING_UCS4LE:
575 return("ISO-10646-UCS-4");
576 case XML_CHAR_ENCODING_UCS4BE:
577 return("ISO-10646-UCS-4");
578 default:
579 break;
580 }
581
582 if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
583 return(NULL);
584
585 return(defaultHandlers[enc].name);
586 }
587
588 /************************************************************************
589 * *
590 * Char encoding handlers *
591 * *
592 ************************************************************************/
593
594 /**
595 * xmlNewCharEncodingHandler:
596 * @name: the encoding name, in UTF-8 format (ASCII actually)
597 * @input: the xmlCharEncodingInputFunc to read that encoding
598 * @output: the xmlCharEncodingOutputFunc to write that encoding
599 *
600 * DEPRECATED: This function modifies global state and is not
601 * thread-safe.
602 *
603 * Create and registers an xmlCharEncodingHandler.
604 *
605 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
606 */
607 xmlCharEncodingHandlerPtr
xmlNewCharEncodingHandler(const char * name,xmlCharEncodingInputFunc input,xmlCharEncodingOutputFunc output)608 xmlNewCharEncodingHandler(const char *name,
609 xmlCharEncodingInputFunc input,
610 xmlCharEncodingOutputFunc output) {
611 xmlCharEncodingHandlerPtr handler;
612 const char *alias;
613 char upper[500];
614 int i;
615 char *up = NULL;
616
617 /*
618 * Do the alias resolution
619 */
620 alias = xmlGetEncodingAlias(name);
621 if (alias != NULL)
622 name = alias;
623
624 /*
625 * Keep only the uppercase version of the encoding.
626 */
627 if (name == NULL)
628 return(NULL);
629 for (i = 0;i < 499;i++) {
630 upper[i] = (char) toupper((unsigned char) name[i]);
631 if (upper[i] == 0) break;
632 }
633 upper[i] = 0;
634 up = xmlMemStrdup(upper);
635 if (up == NULL)
636 return(NULL);
637
638 /*
639 * allocate and fill-up an handler block.
640 */
641 handler = (xmlCharEncodingHandlerPtr)
642 xmlMalloc(sizeof(xmlCharEncodingHandler));
643 if (handler == NULL) {
644 xmlFree(up);
645 return(NULL);
646 }
647 memset(handler, 0, sizeof(xmlCharEncodingHandler));
648 handler->input = input;
649 handler->output = output;
650 handler->name = up;
651 handler->flags = XML_HANDLER_STATIC;
652
653 #ifdef LIBXML_ICONV_ENABLED
654 handler->iconv_in = NULL;
655 handler->iconv_out = NULL;
656 #endif
657
658 /*
659 * registers and returns the handler.
660 */
661 xmlRegisterCharEncodingHandler(handler);
662 return(handler);
663 }
664
/**
 * xmlInitCharEncodingHandlers:
 *
 * DEPRECATED: Alias for xmlInitParser.
 *
 * Kept for source compatibility; it simply forwards to xmlInitParser().
 */
void
xmlInitCharEncodingHandlers(void) {
    xmlInitParser();
}
674
675 /**
676 * xmlInitEncodingInternal:
677 *
678 * Initialize the char encoding support.
679 */
680 void
xmlInitEncodingInternal(void)681 xmlInitEncodingInternal(void) {
682 unsigned short int tst = 0x1234;
683 unsigned char *ptr = (unsigned char *) &tst;
684
685 if (*ptr == 0x12) xmlLittleEndian = 0;
686 else xmlLittleEndian = 1;
687 }
688
689 /**
690 * xmlCleanupCharEncodingHandlers:
691 *
692 * DEPRECATED: This function will be made private. Call xmlCleanupParser
693 * to free global state but see the warnings there. xmlCleanupParser
694 * should be only called once at program exit. In most cases, you don't
695 * have call cleanup functions at all.
696 *
697 * Cleanup the memory allocated for the char encoding support, it
698 * unregisters all the encoding handlers and the aliases.
699 */
700 void
xmlCleanupCharEncodingHandlers(void)701 xmlCleanupCharEncodingHandlers(void) {
702 xmlCleanupEncodingAliases();
703
704 if (globalHandlers == NULL) return;
705
706 for (;nbCharEncodingHandler > 0;) {
707 xmlCharEncodingHandler *handler;
708
709 nbCharEncodingHandler--;
710 handler = globalHandlers[nbCharEncodingHandler];
711 if (handler != NULL) {
712 if (handler->name != NULL)
713 xmlFree(handler->name);
714 xmlFree(handler);
715 }
716 }
717 xmlFree(globalHandlers);
718 globalHandlers = NULL;
719 nbCharEncodingHandler = 0;
720 }
721
722 /**
723 * xmlRegisterCharEncodingHandler:
724 * @handler: the xmlCharEncodingHandlerPtr handler block
725 *
726 * DEPRECATED: This function modifies global state and is not
727 * thread-safe.
728 *
729 * Register the char encoding handler.
730 */
731 void
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler)732 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
733 if (handler == NULL)
734 return;
735 if (globalHandlers == NULL) {
736 globalHandlers = xmlMalloc(
737 MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));
738 if (globalHandlers == NULL)
739 goto free_handler;
740 }
741
742 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS)
743 goto free_handler;
744 globalHandlers[nbCharEncodingHandler++] = handler;
745 return;
746
747 free_handler:
748 if (handler != NULL) {
749 if (handler->name != NULL) {
750 xmlFree(handler->name);
751 }
752 xmlFree(handler);
753 }
754 }
755
756 static int
xmlInvokeConvImpl(xmlCharEncConvImpl impl,void * implCtxt,const char * name,xmlCharEncodingHandler * handler)757 xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt,
758 const char *name, xmlCharEncodingHandler *handler) {
759 xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL };
760 int ret;
761
762 ret = impl(implCtxt, name, &conv);
763
764 if (ret == XML_ERR_OK) {
765 handler->input =
766 (xmlCharEncodingInputFunc) (void (*)(void)) conv.input;
767 handler->output =
768 (xmlCharEncodingOutputFunc) (void (*)(void)) conv.output;
769 handler->ctxtDtor = conv.ctxtDtor;
770 handler->inputCtxt = conv.inputCtxt;
771 handler->outputCtxt = conv.outputCtxt;
772 }
773
774 return(ret);
775 }
776
777 /**
778 * xmlFindExtraHandler:
779 * @norig: name of the char encoding
780 * @name: potentially aliased name of the encoding
781 * @output: boolean, use handler for output
782 * @impl: a conversion implementation (optional)
783 * @implCtxt: user data for conversion implementation (optional)
784 * @out: pointer to resulting handler
785 *
786 * Search the non-default handlers for an exact match.
787 *
788 * Returns an xmlParserErrors error code.
789 */
790 static int
xmlFindExtraHandler(const char * norig,const char * name,int output,xmlCharEncConvImpl impl,void * implCtxt,xmlCharEncodingHandler ** out)791 xmlFindExtraHandler(const char *norig, const char *name, int output,
792 xmlCharEncConvImpl impl, void *implCtxt,
793 xmlCharEncodingHandler **out) {
794 xmlCharEncodingHandler *handler;
795 int ret;
796 int i;
797
798 handler = xmlMalloc(sizeof(*handler));
799 if (handler == NULL)
800 return(XML_ERR_NO_MEMORY);
801 memset(handler, 0, sizeof(*handler));
802
803 handler->name = xmlMemStrdup(name);
804 if (handler->name == NULL) {
805 ret = XML_ERR_NO_MEMORY;
806 goto done;
807 }
808
809 /*
810 * Try custom implementation before deprecated global handlers.
811 *
812 * Note that we pass the original name without deprecated
813 * alias resolution.
814 */
815 if (impl != NULL) {
816 ret = xmlInvokeConvImpl(impl, implCtxt, norig, handler);
817 if (ret != XML_ERR_OK)
818 goto done;
819
820 *out = handler;
821 return(XML_ERR_OK);
822 }
823
824 /*
825 * Deprecated
826 */
827 if (globalHandlers != NULL) {
828 for (i = 0; i < nbCharEncodingHandler; i++) {
829 xmlCharEncodingHandler *h = globalHandlers[i];
830
831 if (!xmlStrcasecmp((const xmlChar *) name,
832 (const xmlChar *) h->name)) {
833 if ((output ? h->output : h->input) != NULL) {
834 *out = h;
835 ret = XML_ERR_OK;
836 goto done;
837 }
838 }
839 }
840 }
841
842 #ifdef LIBXML_ICONV_ENABLED
843 ret = xmlInvokeConvImpl(xmlCharEncIconv, handler, name, handler);
844 if (ret == XML_ERR_OK) {
845 *out = handler;
846 return(XML_ERR_OK);
847 }
848 if (ret != XML_ERR_UNSUPPORTED_ENCODING)
849 goto done;
850 #endif /* LIBXML_ICONV_ENABLED */
851
852 #ifdef LIBXML_ICU_ENABLED
853 ret = xmlInvokeConvImpl(xmlCharEncUconv, handler, name, handler);
854 if (ret == XML_ERR_OK) {
855 *out = handler;
856 return(XML_ERR_OK);
857 }
858 if (ret != XML_ERR_UNSUPPORTED_ENCODING)
859 goto done;
860 #endif /* LIBXML_ICU_ENABLED */
861
862 ret = XML_ERR_UNSUPPORTED_ENCODING;
863
864 done:
865 if (handler != NULL) {
866 xmlFree(handler->name);
867 xmlFree(handler);
868 }
869
870 return(ret);
871 }
872
873 /**
874 * xmlLookupCharEncodingHandler:
875 * @enc: an xmlCharEncoding value.
876 * @out: pointer to result
877 *
878 * Find or create a handler matching the encoding. The following
879 * converters are looked up in order:
880 *
881 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
882 * - User-registered global handler (deprecated)
883 * - iconv if enabled
884 * - ICU if enabled
885 *
886 * The handler must be closed with xmlCharEncCloseFunc.
887 *
888 * If the encoding is UTF-8, a NULL handler and no error code will
889 * be returned.
890 *
891 * Available since 2.13.0.
892 *
893 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
894 * xmlParserErrors error code.
895 */
896 int
xmlLookupCharEncodingHandler(xmlCharEncoding enc,xmlCharEncodingHandler ** out)897 xmlLookupCharEncodingHandler(xmlCharEncoding enc,
898 xmlCharEncodingHandler **out) {
899 const xmlCharEncodingHandler *handler;
900
901 if (out == NULL)
902 return(XML_ERR_ARGUMENT);
903 *out = NULL;
904
905 if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
906 return(XML_ERR_UNSUPPORTED_ENCODING);
907
908 /* Return NULL handler for UTF-8 */
909 if ((enc == XML_CHAR_ENCODING_UTF8) ||
910 (enc == XML_CHAR_ENCODING_NONE))
911 return(XML_ERR_OK);
912
913 handler = &defaultHandlers[enc];
914 if ((handler->input != NULL) || (handler->output != NULL)) {
915 *out = (xmlCharEncodingHandler *) handler;
916 return(XML_ERR_OK);
917 }
918
919 if (handler->name != NULL)
920 return(xmlFindExtraHandler(handler->name, handler->name, 0,
921 NULL, NULL, out));
922
923 return(XML_ERR_UNSUPPORTED_ENCODING);
924 }
925
926 /**
927 * xmlGetCharEncodingHandler:
928 * @enc: an xmlCharEncoding value.
929 *
930 * DEPRECATED: Use xmlLookupCharEncodingHandler which has better error
931 * reporting.
932 *
933 * Returns the handler or NULL if no handler was found or an error
934 * occurred.
935 */
936 xmlCharEncodingHandlerPtr
xmlGetCharEncodingHandler(xmlCharEncoding enc)937 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
938 xmlCharEncodingHandler *ret;
939
940 xmlLookupCharEncodingHandler(enc, &ret);
941 return(ret);
942 }
943
944 /**
945 * xmlCreateCharEncodingHandler:
946 * @name: a string describing the char encoding.
947 * @output: boolean, use handler for output
948 * @impl: a conversion implementation (optional)
949 * @implCtxt: user data for conversion implementation (optional)
950 * @out: pointer to result
951 *
952 * Find or create a handler matching the encoding. The following
953 * converters are looked up in order:
954 *
955 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
956 * - Custom implementation if provided
957 * - User-registered global handler (deprecated)
958 * - iconv if enabled
959 * - ICU if enabled
960 *
961 * The handler must be closed with xmlCharEncCloseFunc.
962 *
963 * If the encoding is UTF-8, a NULL handler and no error code will
964 * be returned.
965 *
966 * Available since 2.14.0.
967 *
968 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
969 * xmlParserErrors error code.
970 */
971 int
xmlCreateCharEncodingHandler(const char * name,int output,xmlCharEncConvImpl impl,void * implCtxt,xmlCharEncodingHandler ** out)972 xmlCreateCharEncodingHandler(const char *name, int output,
973 xmlCharEncConvImpl impl, void *implCtxt,
974 xmlCharEncodingHandler **out) {
975 const xmlCharEncodingHandler *handler;
976 const char *norig, *nalias;
977 xmlCharEncoding enc;
978
979 if (out == NULL)
980 return(XML_ERR_ARGUMENT);
981 *out = NULL;
982
983 if (name == NULL)
984 return(XML_ERR_ARGUMENT);
985
986 norig = name;
987 nalias = xmlGetEncodingAlias(name);
988 if (nalias != NULL)
989 name = nalias;
990
991 enc = xmlParseCharEncodingInternal(name);
992
993 /* Return NULL handler for UTF-8 */
994 if (enc == XML_CHAR_ENCODING_UTF8)
995 return(XML_ERR_OK);
996
997 if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
998 handler = &defaultHandlers[enc];
999 if ((output ? handler->output : handler->input) != NULL) {
1000 *out = (xmlCharEncodingHandler *) handler;
1001 return(XML_ERR_OK);
1002 }
1003 }
1004
1005 return(xmlFindExtraHandler(norig, name, output, impl, implCtxt, out));
1006 }
1007
1008 /**
1009 * xmlOpenCharEncodingHandler:
1010 * @name: a string describing the char encoding.
1011 * @output: boolean, use handler for output
1012 * @out: pointer to result
1013 *
1014 * Find or create a handler matching the encoding. The following
1015 * converters are looked up in order:
1016 *
1017 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
1018 * - User-registered global handler (deprecated)
1019 * - iconv if enabled
1020 * - ICU if enabled
1021 *
1022 * The handler must be closed with xmlCharEncCloseFunc.
1023 *
1024 * If the encoding is UTF-8, a NULL handler and no error code will
1025 * be returned.
1026 *
1027 * Available since 2.13.0.
1028 *
1029 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
1030 * xmlParserErrors error code.
1031 */
1032 int
xmlOpenCharEncodingHandler(const char * name,int output,xmlCharEncodingHandler ** out)1033 xmlOpenCharEncodingHandler(const char *name, int output,
1034 xmlCharEncodingHandler **out) {
1035 return(xmlCreateCharEncodingHandler(name, output, NULL, NULL, out));
1036 }
1037
1038 /**
1039 * xmlFindCharEncodingHandler:
1040 * @name: a string describing the char encoding.
1041 *
1042 * DEPRECATED: Use xmlOpenCharEncodingHandler which has better error
1043 * reporting.
1044 *
1045 * If the encoding is UTF-8, this will return a no-op handler that
1046 * shouldn't be used.
1047 *
1048 * Returns the handler or NULL if no handler was found or an error
1049 * occurred.
1050 */
1051 xmlCharEncodingHandlerPtr
xmlFindCharEncodingHandler(const char * name)1052 xmlFindCharEncodingHandler(const char *name) {
1053 xmlCharEncodingHandler *ret;
1054
1055 /*
1056 * This handler shouldn't be used, but we must return a non-NULL
1057 * handler.
1058 */
1059 if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) ||
1060 (xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0))
1061 return((xmlCharEncodingHandlerPtr)
1062 &defaultHandlers[XML_CHAR_ENCODING_UTF8]);
1063
1064 xmlOpenCharEncodingHandler(name, 0, &ret);
1065 return(ret);
1066 }
1067
1068 /************************************************************************
1069 * *
1070 * ICONV based generic conversion functions *
1071 * *
1072 ************************************************************************/
1073
1074 #ifdef LIBXML_ICONV_ENABLED
/* Converter context wrapping a single iconv conversion descriptor. */
typedef struct _xmlIconvCtxt xmlIconvCtxt;
struct _xmlIconvCtxt {
    iconv_t cd; /* (iconv_t) -1 while no descriptor is open */
};
1078
1079 /**
1080 * xmlIconvConvert:
1081 * @vctxt: conversion context
1082 * @out: a pointer to an array of bytes to store the result
1083 * @outlen: the length of @out
1084 * @in: a pointer to an array of input bytes
1085 * @inlen: the length of @in
1086 *
1087 * Returns an XML_ENC_ERR code.
1088 *
1089 * The value of @inlen after return is the number of octets consumed
1090 * as the return value is positive, else unpredictable.
1091 * The value of @outlen after return is the number of octets produced.
1092 */
1093 static int
xmlIconvConvert(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)1094 xmlIconvConvert(unsigned char *out, int *outlen,
1095 const unsigned char *in, int *inlen, void *vctxt) {
1096 xmlIconvCtxt *ctxt = vctxt;
1097 size_t icv_inlen, icv_outlen;
1098 const char *icv_in = (const char *) in;
1099 char *icv_out = (char *) out;
1100 size_t ret;
1101
1102 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1103 if (outlen != NULL) *outlen = 0;
1104 return(XML_ENC_ERR_INTERNAL);
1105 }
1106 icv_inlen = *inlen;
1107 icv_outlen = *outlen;
1108 /*
1109 * Some versions take const, other versions take non-const input.
1110 */
1111 ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
1112 *inlen -= icv_inlen;
1113 *outlen -= icv_outlen;
1114 if (ret == (size_t) -1) {
1115 if (errno == EILSEQ)
1116 return(XML_ENC_ERR_INPUT);
1117 if (errno == E2BIG)
1118 return(XML_ENC_ERR_SPACE);
1119 /*
1120 * EINVAL means a truncated multi-byte sequence at the end
1121 * of the input buffer. We treat this as success.
1122 */
1123 if (errno == EINVAL)
1124 return(XML_ENC_ERR_SUCCESS);
1125 return(XML_ENC_ERR_INTERNAL);
1126 }
1127 return(XML_ENC_ERR_SUCCESS);
1128 }
1129
1130 static void
xmlIconvFree(void * vctxt)1131 xmlIconvFree(void *vctxt) {
1132 xmlIconvCtxt *ctxt = vctxt;
1133
1134 if (ctxt->cd != (iconv_t) -1)
1135 iconv_close(ctxt->cd);
1136
1137 xmlFree(ctxt);
1138 }
1139
1140 static int
xmlCharEncIconv(void * vctxt,const char * name,xmlCharEncConverter * conv)1141 xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv) {
1142 xmlCharEncodingHandler *handler = vctxt;
1143 xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL;
1144 iconv_t icv_in;
1145 iconv_t icv_out;
1146 int ret;
1147
1148 inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1149 if (inputCtxt == NULL) {
1150 ret = XML_ERR_NO_MEMORY;
1151 goto error;
1152 }
1153 inputCtxt->cd = (iconv_t) -1;
1154
1155 icv_in = iconv_open("UTF-8", name);
1156 if (icv_in == (iconv_t) -1) {
1157 if (errno == EINVAL)
1158 ret = XML_ERR_UNSUPPORTED_ENCODING;
1159 else if (errno == ENOMEM)
1160 ret = XML_ERR_NO_MEMORY;
1161 else
1162 ret = XML_ERR_SYSTEM;
1163 goto error;
1164 }
1165 inputCtxt->cd = icv_in;
1166
1167 outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1168 if (outputCtxt == NULL) {
1169 ret = XML_ERR_NO_MEMORY;
1170 goto error;
1171 }
1172 outputCtxt->cd = (iconv_t) -1;
1173
1174 icv_out = iconv_open(name, "UTF-8");
1175 if (icv_out == (iconv_t) -1) {
1176 if (errno == EINVAL)
1177 ret = XML_ERR_UNSUPPORTED_ENCODING;
1178 else if (errno == ENOMEM)
1179 ret = XML_ERR_NO_MEMORY;
1180 else
1181 ret = XML_ERR_SYSTEM;
1182 goto error;
1183 }
1184 outputCtxt->cd = icv_out;
1185
1186 conv->input = xmlIconvConvert;
1187 conv->output = xmlIconvConvert;
1188 conv->ctxtDtor = xmlIconvFree;
1189 conv->inputCtxt = inputCtxt;
1190 conv->outputCtxt = outputCtxt;
1191
1192 /* Backward compatibility */
1193 if (handler != NULL) {
1194 handler->iconv_in = icv_in;
1195 handler->iconv_out = icv_out;
1196 }
1197
1198 return(XML_ERR_OK);
1199
1200 error:
1201 if (inputCtxt != NULL)
1202 xmlIconvFree(inputCtxt);
1203 if (outputCtxt != NULL)
1204 xmlIconvFree(outputCtxt);
1205 return(ret);
1206 }
1207 #endif /* LIBXML_ICONV_ENABLED */
1208
1209 /************************************************************************
1210 * *
1211 * ICU based generic conversion functions *
1212 * *
1213 ************************************************************************/
1214
1215 #ifdef LIBXML_ICU_ENABLED
1216 /* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
1217 #define ICU_PIVOT_BUF_SIZE 1024
1218
1219 typedef struct _uconv_t xmlUconvCtxt;
1220 struct _uconv_t {
1221 UConverter *uconv; /* for conversion between an encoding and UTF-16 */
1222 UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
1223 UChar *pivot_source;
1224 UChar *pivot_target;
1225 int isInput;
1226 UChar pivot_buf[ICU_PIVOT_BUF_SIZE];
1227 };
1228
1229 /**
1230 * xmlUconvConvert:
1231 * @vctxt: conversion context
1232 * @out: a pointer to an array of bytes to store the result
1233 * @outlen: the length of @out
1234 * @in: a pointer to an array of input bytes
1235 * @inlen: the length of @in
1236 *
1237 * Returns an XML_ENC_ERR code.
1238 *
1239 * The value of @inlen after return is the number of octets consumed
1240 * as the return value is positive, else unpredictable.
1241 * The value of @outlen after return is the number of octets produced.
1242 */
1243 static int
xmlUconvConvert(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)1244 xmlUconvConvert(unsigned char *out, int *outlen,
1245 const unsigned char *in, int *inlen, void *vctxt) {
1246 xmlUconvCtxt *cd = vctxt;
1247 const char *ucv_in = (const char *) in;
1248 char *ucv_out = (char *) out;
1249 UConverter *target, *source;
1250 UErrorCode err = U_ZERO_ERROR;
1251 int ret;
1252
1253 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1254 if (outlen != NULL)
1255 *outlen = 0;
1256 return(XML_ENC_ERR_INTERNAL);
1257 }
1258
1259 /*
1260 * Note that the ICU API is stateful. It can always consume a certain
1261 * amount of input even if the output buffer would overflow. The
1262 * remaining input must be processed by calling ucnv_convertEx with a
1263 * possibly empty input buffer.
1264 *
1265 * ucnv_convertEx is always called with reset and flush set to 0,
1266 * so we don't mess up the state. This should never generate
1267 * U_TRUNCATED_CHAR_FOUND errors.
1268 */
1269 if (cd->isInput) {
1270 source = cd->uconv;
1271 target = cd->utf8;
1272 } else {
1273 source = cd->utf8;
1274 target = cd->uconv;
1275 }
1276
1277 ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
1278 &ucv_in, ucv_in + *inlen, cd->pivot_buf,
1279 &cd->pivot_source, &cd->pivot_target,
1280 cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
1281
1282 *inlen = ucv_in - (const char*) in;
1283 *outlen = ucv_out - (char *) out;
1284
1285 if (U_SUCCESS(err)) {
1286 ret = XML_ENC_ERR_SUCCESS;
1287 } else {
1288 switch (err) {
1289 case U_TRUNCATED_CHAR_FOUND:
1290 /* Shouldn't happen without flush */
1291 ret = XML_ENC_ERR_SUCCESS;
1292 break;
1293
1294 case U_BUFFER_OVERFLOW_ERROR:
1295 ret = XML_ENC_ERR_SPACE;
1296 break;
1297
1298 case U_INVALID_CHAR_FOUND:
1299 case U_ILLEGAL_CHAR_FOUND:
1300 case U_ILLEGAL_ESCAPE_SEQUENCE:
1301 case U_UNSUPPORTED_ESCAPE_SEQUENCE:
1302 ret = XML_ENC_ERR_INPUT;
1303 break;
1304
1305 case U_MEMORY_ALLOCATION_ERROR:
1306 ret = XML_ENC_ERR_MEMORY;
1307 break;
1308
1309 default:
1310 ret = XML_ENC_ERR_INTERNAL;
1311 break;
1312 }
1313 }
1314
1315 return(ret);
1316 }
1317
1318 static int
openIcuConverter(const char * name,int isInput,xmlUconvCtxt ** out)1319 openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out)
1320 {
1321 UErrorCode status;
1322 xmlUconvCtxt *conv;
1323
1324 *out = NULL;
1325
1326 conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt));
1327 if (conv == NULL)
1328 return(XML_ERR_NO_MEMORY);
1329
1330 conv->isInput = isInput;
1331 conv->pivot_source = conv->pivot_buf;
1332 conv->pivot_target = conv->pivot_buf;
1333
1334 status = U_ZERO_ERROR;
1335 conv->uconv = ucnv_open(name, &status);
1336 if (U_FAILURE(status))
1337 goto error;
1338
1339 status = U_ZERO_ERROR;
1340 if (isInput) {
1341 ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
1342 NULL, NULL, NULL, &status);
1343 }
1344 else {
1345 ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
1346 NULL, NULL, NULL, &status);
1347 }
1348 if (U_FAILURE(status))
1349 goto error;
1350
1351 status = U_ZERO_ERROR;
1352 conv->utf8 = ucnv_open("UTF-8", &status);
1353 if (U_FAILURE(status))
1354 goto error;
1355
1356 *out = conv;
1357 return(0);
1358
1359 error:
1360 if (conv->uconv)
1361 ucnv_close(conv->uconv);
1362 xmlFree(conv);
1363
1364 if (status == U_FILE_ACCESS_ERROR)
1365 return(XML_ERR_UNSUPPORTED_ENCODING);
1366 if (status == U_MEMORY_ALLOCATION_ERROR)
1367 return(XML_ERR_NO_MEMORY);
1368 return(XML_ERR_SYSTEM);
1369 }
1370
1371 static void
closeIcuConverter(xmlUconvCtxt * conv)1372 closeIcuConverter(xmlUconvCtxt *conv)
1373 {
1374 if (conv == NULL)
1375 return;
1376 ucnv_close(conv->uconv);
1377 ucnv_close(conv->utf8);
1378 xmlFree(conv);
1379 }
1380
1381 static void
xmlUconvFree(void * vctxt)1382 xmlUconvFree(void *vctxt) {
1383 closeIcuConverter(vctxt);
1384 }
1385
1386 static int
xmlCharEncUconv(void * vctxt ATTRIBUTE_UNUSED,const char * name,xmlCharEncConverter * conv)1387 xmlCharEncUconv(void *vctxt ATTRIBUTE_UNUSED, const char *name,
1388 xmlCharEncConverter *conv) {
1389 xmlUconvCtxt *ucv_in = NULL;
1390 xmlUconvCtxt *ucv_out = NULL;
1391 int ret;
1392
1393 ret = openIcuConverter(name, 1, &ucv_in);
1394 if (ret != 0)
1395 goto error;
1396 ret = openIcuConverter(name, 0, &ucv_out);
1397 if (ret != 0)
1398 goto error;
1399
1400 conv->input = xmlUconvConvert;
1401 conv->output = xmlUconvConvert;
1402 conv->ctxtDtor = xmlUconvFree;
1403 conv->inputCtxt = ucv_in;
1404 conv->outputCtxt = ucv_out;
1405
1406 return(XML_ERR_OK);
1407
1408 error:
1409 if (ucv_in != NULL)
1410 closeIcuConverter(ucv_in);
1411 if (ucv_out != NULL)
1412 closeIcuConverter(ucv_out);
1413 return(ret);
1414 }
1415 #endif /* LIBXML_ICU_ENABLED */
1416
1417 /************************************************************************
1418 * *
1419 * The real API used by libxml for on-the-fly conversion *
1420 * *
1421 ************************************************************************/
1422
1423 /**
1424 * xmlEncConvertError:
1425 * @code: XML_ENC_ERR code
1426 *
1427 * Convert XML_ENC_ERR to libxml2 error codes.
1428 */
1429 static int
xmlEncConvertError(int code)1430 xmlEncConvertError(int code) {
1431 int ret;
1432
1433 switch (code) {
1434 case XML_ENC_ERR_SUCCESS:
1435 ret = XML_ERR_OK;
1436 break;
1437 case XML_ENC_ERR_INPUT:
1438 ret = XML_ERR_INVALID_ENCODING;
1439 break;
1440 case XML_ENC_ERR_MEMORY:
1441 ret = XML_ERR_NO_MEMORY;
1442 break;
1443 default:
1444 ret = XML_ERR_INTERNAL_ERROR;
1445 break;
1446 }
1447
1448 return(ret);
1449 }
1450
1451 /**
1452 * xmlEncInputChunk:
1453 * @handler: encoding handler
1454 * @out: a pointer to an array of bytes to store the result
1455 * @outlen: the length of @out
1456 * @in: a pointer to an array of input bytes
1457 * @inlen: the length of @in
1458 *
1459 * The value of @inlen after return is the number of octets consumed
1460 * as the return value is 0, else unpredictable.
1461 * The value of @outlen after return is the number of octets produced.
1462 *
1463 * Returns an XML_ENC_ERR code.
1464 */
1465 int
xmlEncInputChunk(xmlCharEncodingHandler * handler,unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1466 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1467 int *outlen, const unsigned char *in, int *inlen) {
1468 int ret;
1469
1470 if (handler->input != NULL) {
1471 xmlCharEncConvFunc conv =
1472 (xmlCharEncConvFunc) (void (*)(void)) handler->input;
1473
1474 ret = conv(out, outlen, in, inlen, handler->inputCtxt);
1475 if (ret > 0)
1476 ret = XML_ENC_ERR_SUCCESS;
1477 }
1478 else {
1479 *outlen = 0;
1480 *inlen = 0;
1481 ret = XML_ENC_ERR_INTERNAL;
1482 }
1483
1484 return(ret);
1485 }
1486
1487 /**
1488 * xmlEncOutputChunk:
1489 * @handler: encoding handler
1490 * @out: a pointer to an array of bytes to store the result
1491 * @outlen: the length of @out
1492 * @in: a pointer to an array of input bytes
1493 * @inlen: the length of @in
1494 *
1495 * Returns an XML_ENC_ERR code.
1496 *
1497 * The value of @inlen after return is the number of octets consumed
1498 * as the return value is 0, else unpredictable.
1499 * The value of @outlen after return is the number of octets produced.
1500 */
1501 static int
xmlEncOutputChunk(xmlCharEncodingHandler * handler,unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1502 xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1503 int *outlen, const unsigned char *in, int *inlen) {
1504 int ret;
1505
1506 if (handler->output != NULL) {
1507 xmlCharEncConvFunc conv =
1508 (xmlCharEncConvFunc) (void (*)(void)) handler->output;
1509
1510 ret = conv(out, outlen, in, inlen, handler->outputCtxt);
1511 if (ret > 0)
1512 ret = XML_ENC_ERR_SUCCESS;
1513 }
1514 else {
1515 *outlen = 0;
1516 *inlen = 0;
1517 ret = XML_ENC_ERR_INTERNAL;
1518 }
1519
1520 return(ret);
1521 }
1522
1523 /**
1524 * xmlCharEncFirstLine:
1525 * @handler: char encoding transformation data structure
1526 * @out: an xmlBuffer for the output.
1527 * @in: an xmlBuffer for the input
1528 *
1529 * DEPRECATED: Don't use.
1530 *
1531 * Returns the number of bytes written or an XML_ENC_ERR code.
1532 */
1533 int
xmlCharEncFirstLine(xmlCharEncodingHandler * handler,xmlBufferPtr out,xmlBufferPtr in)1534 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1535 xmlBufferPtr in) {
1536 return(xmlCharEncInFunc(handler, out, in));
1537 }
1538
1539 /**
1540 * xmlCharEncInput:
1541 * @input: a parser input buffer
1542 * @sizeOut: pointer to output size
1543 *
1544 * @sizeOut should be set to the maximum output size (or SIZE_MAX).
1545 * After return, it is set to the number of bytes written.
1546 *
1547 * Generic front-end for the encoding handler on parser input
1548 *
1549 * Returns an XML_ENC_ERR code.
1550 */
1551 int
xmlCharEncInput(xmlParserInputBufferPtr input,size_t * sizeOut)1552 xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
1553 {
1554 xmlBufPtr out, in;
1555 const xmlChar *dataIn;
1556 size_t availIn;
1557 size_t maxOut;
1558 size_t totalIn, totalOut;
1559 int ret;
1560
1561 out = input->buffer;
1562 in = input->raw;
1563
1564 maxOut = *sizeOut;
1565 totalOut = 0;
1566
1567 *sizeOut = 0;
1568
1569 availIn = xmlBufUse(in);
1570 if (availIn == 0)
1571 return(0);
1572 dataIn = xmlBufContent(in);
1573 totalIn = 0;
1574
1575 while (1) {
1576 size_t availOut;
1577 int completeOut, completeIn;
1578 int c_out, c_in;
1579
1580 availOut = xmlBufAvail(out);
1581 if (availOut > INT_MAX / 2)
1582 availOut = INT_MAX / 2;
1583
1584 if (availOut < maxOut) {
1585 c_out = availOut;
1586 completeOut = 0;
1587 } else {
1588 c_out = maxOut;
1589 completeOut = 1;
1590 }
1591
1592 if (availIn > INT_MAX / 2) {
1593 c_in = INT_MAX / 2;
1594 completeIn = 0;
1595 } else {
1596 c_in = availIn;
1597 completeIn = 1;
1598 }
1599
1600 ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
1601 dataIn, &c_in);
1602
1603 totalIn += c_in;
1604 dataIn += c_in;
1605 availIn -= c_in;
1606
1607 totalOut += c_out;
1608 maxOut -= c_out;
1609 xmlBufAddLen(out, c_out);
1610
1611 if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) {
1612 input->error = xmlEncConvertError(ret);
1613 return(ret);
1614 }
1615
1616 if ((completeOut) && (completeIn))
1617 break;
1618 if ((completeOut) && (ret == XML_ENC_ERR_SPACE))
1619 break;
1620 if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS))
1621 break;
1622
1623 if (ret == XML_ENC_ERR_SPACE) {
1624 if (xmlBufGrow(out, 4096) < 0) {
1625 input->error = XML_ERR_NO_MEMORY;
1626 return(XML_ENC_ERR_MEMORY);
1627 }
1628 }
1629 }
1630
1631 xmlBufShrink(in, totalIn);
1632
1633 if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn)
1634 input->rawconsumed = ULONG_MAX;
1635 else
1636 input->rawconsumed += totalIn;
1637
1638 *sizeOut = totalOut;
1639 return(XML_ERR_OK);
1640 }
1641
1642 /**
1643 * xmlCharEncInFunc:
1644 * @handler: char encoding transformation data structure
1645 * @out: an xmlBuffer for the output.
1646 * @in: an xmlBuffer for the input
1647 *
1648 * Generic front-end for the encoding handler input function
1649 *
1650 * Returns the number of bytes written or an XML_ENC_ERR code.
1651 */
1652 int
xmlCharEncInFunc(xmlCharEncodingHandler * handler,xmlBufferPtr out,xmlBufferPtr in)1653 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
1654 xmlBufferPtr in)
1655 {
1656 int ret;
1657 int written;
1658 int toconv;
1659
1660 if (handler == NULL)
1661 return(XML_ENC_ERR_INTERNAL);
1662 if (out == NULL)
1663 return(XML_ENC_ERR_INTERNAL);
1664 if (in == NULL)
1665 return(XML_ENC_ERR_INTERNAL);
1666
1667 toconv = in->use;
1668 if (toconv == 0)
1669 return (0);
1670 written = out->size - out->use -1; /* count '\0' */
1671 if (toconv * 2 >= written) {
1672 xmlBufferGrow(out, out->size + toconv * 2);
1673 written = out->size - out->use - 1;
1674 }
1675 ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
1676 in->content, &toconv);
1677 xmlBufferShrink(in, toconv);
1678 out->use += written;
1679 out->content[out->use] = 0;
1680
1681 return (written? written : ret);
1682 }
1683
1684 #ifdef LIBXML_OUTPUT_ENABLED
1685 /**
1686 * xmlCharEncOutput:
1687 * @output: a parser output buffer
1688 * @init: is this an initialization call without data
1689 *
1690 * Generic front-end for the encoding handler on parser output
1691 * a first call with @init == 1 has to be made first to initiate the
1692 * output in case of non-stateless encoding needing to initiate their
1693 * state or the output (like the BOM in UTF16).
1694 * In case of UTF8 sequence conversion errors for the given encoder,
1695 * the content will be automatically remapped to a CharRef sequence.
1696 *
1697 * Returns the number of bytes written or an XML_ENC_ERR code.
1698 */
1699 int
xmlCharEncOutput(xmlOutputBufferPtr output,int init)1700 xmlCharEncOutput(xmlOutputBufferPtr output, int init)
1701 {
1702 int ret;
1703 size_t written;
1704 int writtentot = 0;
1705 size_t toconv;
1706 int c_in;
1707 int c_out;
1708 xmlBufPtr in;
1709 xmlBufPtr out;
1710
1711 if ((output == NULL) || (output->encoder == NULL) ||
1712 (output->buffer == NULL) || (output->conv == NULL))
1713 return(XML_ENC_ERR_INTERNAL);
1714 out = output->conv;
1715 in = output->buffer;
1716
1717 retry:
1718
1719 written = xmlBufAvail(out);
1720
1721 /*
1722 * First specific handling of the initialization call
1723 */
1724 if (init) {
1725 c_in = 0;
1726 c_out = written;
1727 /* TODO: Check return value. */
1728 xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1729 NULL, &c_in);
1730 xmlBufAddLen(out, c_out);
1731 return(c_out);
1732 }
1733
1734 /*
1735 * Conversion itself.
1736 */
1737 toconv = xmlBufUse(in);
1738 if (toconv > 64 * 1024)
1739 toconv = 64 * 1024;
1740 if (toconv * 4 >= written) {
1741 if (xmlBufGrow(out, toconv * 4) < 0) {
1742 ret = XML_ENC_ERR_MEMORY;
1743 goto error;
1744 }
1745 written = xmlBufAvail(out);
1746 }
1747 if (written > 256 * 1024)
1748 written = 256 * 1024;
1749
1750 c_in = toconv;
1751 c_out = written;
1752 ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1753 xmlBufContent(in), &c_in);
1754 xmlBufShrink(in, c_in);
1755 xmlBufAddLen(out, c_out);
1756 writtentot += c_out;
1757
1758 if (ret == XML_ENC_ERR_SPACE)
1759 goto retry;
1760
1761 /*
1762 * Attempt to handle error cases
1763 */
1764 if (ret == XML_ENC_ERR_INPUT) {
1765 xmlChar charref[20];
1766 int len = xmlBufUse(in);
1767 xmlChar *content = xmlBufContent(in);
1768 int cur, charrefLen;
1769
1770 cur = xmlGetUTF8Char(content, &len);
1771 if (cur <= 0)
1772 goto error;
1773
1774 /*
1775 * Removes the UTF8 sequence, and replace it by a charref
1776 * and continue the transcoding phase, hoping the error
1777 * did not mangle the encoder state.
1778 */
1779 charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
1780 xmlBufGrow(out, charrefLen * 4);
1781 c_out = xmlBufAvail(out);
1782 c_in = charrefLen;
1783 ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1784 charref, &c_in);
1785 if ((ret < 0) || (c_in != charrefLen)) {
1786 ret = XML_ENC_ERR_INTERNAL;
1787 goto error;
1788 }
1789
1790 xmlBufShrink(in, len);
1791 xmlBufAddLen(out, c_out);
1792 writtentot += c_out;
1793 goto retry;
1794 }
1795
1796 error:
1797 if (((writtentot <= 0) && (ret != 0)) ||
1798 (ret == XML_ENC_ERR_MEMORY)) {
1799 if (output->error == 0)
1800 output->error = xmlEncConvertError(ret);
1801 return(ret);
1802 }
1803
1804 return(writtentot);
1805 }
1806 #endif
1807
1808 /**
1809 * xmlCharEncOutFunc:
1810 * @handler: char encoding transformation data structure
1811 * @out: an xmlBuffer for the output.
1812 * @in: an xmlBuffer for the input
1813 *
1814 * Generic front-end for the encoding handler output function
1815 * a first call with @in == NULL has to be made first to initiate the
1816 * output in case of non-stateless encoding needing to initiate their
1817 * state or the output (like the BOM in UTF16).
1818 * In case of UTF8 sequence conversion errors for the given encoder,
1819 * the content will be automatically remapped to a CharRef sequence.
1820 *
1821 * Returns the number of bytes written or an XML_ENC_ERR code.
1822 */
1823 int
xmlCharEncOutFunc(xmlCharEncodingHandler * handler,xmlBufferPtr out,xmlBufferPtr in)1824 xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1825 xmlBufferPtr in) {
1826 int ret;
1827 int written;
1828 int writtentot = 0;
1829 int toconv;
1830
1831 if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
1832 if (out == NULL) return(XML_ENC_ERR_INTERNAL);
1833
1834 retry:
1835
1836 written = out->size - out->use;
1837
1838 if (written > 0)
1839 written--; /* Gennady: count '/0' */
1840
1841 /*
1842 * First specific handling of in = NULL, i.e. the initialization call
1843 */
1844 if (in == NULL) {
1845 toconv = 0;
1846 /* TODO: Check return value. */
1847 xmlEncOutputChunk(handler, &out->content[out->use], &written,
1848 NULL, &toconv);
1849 out->use += written;
1850 out->content[out->use] = 0;
1851 return(0);
1852 }
1853
1854 /*
1855 * Conversion itself.
1856 */
1857 toconv = in->use;
1858 if (toconv * 4 >= written) {
1859 xmlBufferGrow(out, toconv * 4);
1860 written = out->size - out->use - 1;
1861 }
1862 ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
1863 in->content, &toconv);
1864 xmlBufferShrink(in, toconv);
1865 out->use += written;
1866 writtentot += written;
1867 out->content[out->use] = 0;
1868
1869 if (ret == XML_ENC_ERR_SPACE)
1870 goto retry;
1871
1872 /*
1873 * Attempt to handle error cases
1874 */
1875 if (ret == XML_ENC_ERR_INPUT) {
1876 xmlChar charref[20];
1877 int len = in->use;
1878 const xmlChar *utf = (const xmlChar *) in->content;
1879 int cur, charrefLen;
1880
1881 cur = xmlGetUTF8Char(utf, &len);
1882 if (cur <= 0)
1883 return(ret);
1884
1885 /*
1886 * Removes the UTF8 sequence, and replace it by a charref
1887 * and continue the transcoding phase, hoping the error
1888 * did not mangle the encoder state.
1889 */
1890 charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
1891 xmlBufferShrink(in, len);
1892 xmlBufferGrow(out, charrefLen * 4);
1893 written = out->size - out->use - 1;
1894 toconv = charrefLen;
1895 ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
1896 charref, &toconv);
1897 if ((ret < 0) || (toconv != charrefLen))
1898 return(XML_ENC_ERR_INTERNAL);
1899
1900 out->use += written;
1901 writtentot += written;
1902 out->content[out->use] = 0;
1903 goto retry;
1904 }
1905 return(writtentot ? writtentot : ret);
1906 }
1907
1908 /**
1909 * xmlCharEncCloseFunc:
1910 * @handler: char encoding transformation data structure
1911 *
1912 * Releases an xmlCharEncodingHandler. Must be called after
1913 * a handler is no longer in use.
1914 *
1915 * Returns 0.
1916 */
1917 int
xmlCharEncCloseFunc(xmlCharEncodingHandler * handler)1918 xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1919 if (handler == NULL)
1920 return(0);
1921
1922 if (handler->flags & XML_HANDLER_STATIC)
1923 return(0);
1924
1925 xmlFree(handler->name);
1926 if (handler->ctxtDtor != NULL) {
1927 handler->ctxtDtor(handler->inputCtxt);
1928 handler->ctxtDtor(handler->outputCtxt);
1929 }
1930 xmlFree(handler);
1931 return(0);
1932 }
1933
1934 /**
1935 * xmlByteConsumed:
1936 * @ctxt: an XML parser context
1937 *
1938 * DEPRECATED: Don't use.
1939 *
1940 * This function provides the current index of the parser relative
1941 * to the start of the current entity. This function is computed in
1942 * bytes from the beginning starting at zero and finishing at the
1943 * size in byte of the file if parsing a file. The function is
1944 * of constant cost if the input is UTF-8 but can be costly if run
1945 * on non-UTF-8 input.
1946 *
1947 * Returns the index in bytes from the beginning of the entity or -1
1948 * in case the index could not be computed.
1949 */
1950 long
xmlByteConsumed(xmlParserCtxtPtr ctxt)1951 xmlByteConsumed(xmlParserCtxtPtr ctxt) {
1952 xmlParserInputPtr in;
1953
1954 if (ctxt == NULL)
1955 return(-1);
1956 in = ctxt->input;
1957 if (in == NULL)
1958 return(-1);
1959
1960 if ((in->buf != NULL) && (in->buf->encoder != NULL)) {
1961 int unused = 0;
1962 xmlCharEncodingHandler * handler = in->buf->encoder;
1963
1964 /*
1965 * Encoding conversion, compute the number of unused original
1966 * bytes from the input not consumed and subtract that from
1967 * the raw consumed value, this is not a cheap operation
1968 */
1969 if (in->end - in->cur > 0) {
1970 unsigned char *convbuf;
1971 const unsigned char *cur = (const unsigned char *)in->cur;
1972 int toconv, ret;
1973
1974 convbuf = xmlMalloc(32000);
1975 if (convbuf == NULL)
1976 return(-1);
1977
1978 toconv = in->end - cur;
1979 unused = 32000;
1980 ret = xmlEncOutputChunk(handler, convbuf, &unused, cur, &toconv);
1981
1982 xmlFree(convbuf);
1983
1984 if (ret != XML_ENC_ERR_SUCCESS)
1985 return(-1);
1986 }
1987
1988 if (in->buf->rawconsumed < (unsigned long) unused)
1989 return(-1);
1990 return(in->buf->rawconsumed - unused);
1991 }
1992
1993 return(in->consumed + (in->cur - in->base));
1994 }
1995
1996 /************************************************************************
1997 * *
1998 * Conversions To/From UTF8 encoding *
1999 * *
2000 ************************************************************************/
2001
2002 static int
asciiToAscii(unsigned char * out,int * poutlen,const unsigned char * in,int * pinlen,void * vctxt ATTRIBUTE_UNUSED)2003 asciiToAscii(unsigned char* out, int *poutlen,
2004 const unsigned char* in, int *pinlen,
2005 void *vctxt ATTRIBUTE_UNUSED) {
2006 const unsigned char *inend;
2007 const unsigned char *instart = in;
2008 int inlen, outlen, ret;
2009
2010 if (in == NULL) {
2011 *pinlen = 0;
2012 *poutlen = 0;
2013 return(XML_ENC_ERR_SUCCESS);
2014 }
2015
2016 inlen = *pinlen;
2017 outlen = *poutlen;
2018
2019 if (outlen < inlen) {
2020 inlen = outlen;
2021 ret = XML_ENC_ERR_SPACE;
2022 } else {
2023 ret = inlen;
2024 }
2025
2026 inend = in + inlen;
2027 *poutlen = inlen;
2028 *pinlen = inlen;
2029
2030 while (in < inend) {
2031 unsigned c = *in;
2032
2033 if (c >= 0x80) {
2034 *poutlen = in - instart;
2035 *pinlen = in - instart;
2036 return(XML_ENC_ERR_INPUT);
2037 }
2038
2039 in++;
2040 *out++ = c;
2041 }
2042
2043 return(ret);
2044 }
2045
2046 static int
latin1ToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2047 latin1ToUTF8(unsigned char* out, int *outlen,
2048 const unsigned char* in, int *inlen,
2049 void *vctxt ATTRIBUTE_UNUSED) {
2050 unsigned char* outstart = out;
2051 const unsigned char* instart = in;
2052 unsigned char* outend;
2053 const unsigned char* inend;
2054 int ret = XML_ENC_ERR_SPACE;
2055
2056 if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
2057 return(XML_ENC_ERR_INTERNAL);
2058
2059 outend = out + *outlen;
2060 inend = in + *inlen;
2061
2062 while (in < inend) {
2063 unsigned c = *in;
2064
2065 if (c < 0x80) {
2066 if (out >= outend)
2067 goto done;
2068 *out++ = c;
2069 } else {
2070 if (outend - out < 2)
2071 goto done;
2072 *out++ = (c >> 6) | 0xC0;
2073 *out++ = (c & 0x3F) | 0x80;
2074 }
2075
2076 in++;
2077 }
2078
2079 ret = out - outstart;
2080
2081 done:
2082 *outlen = out - outstart;
2083 *inlen = in - instart;
2084 return(ret);
2085 }
2086
2087 /**
2088 * isolat1ToUTF8:
2089 * @out: a pointer to an array of bytes to store the result
2090 * @outlen: the length of @out
2091 * @in: a pointer to an array of ISO Latin 1 chars
2092 * @inlen: the length of @in
2093 *
2094 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
2095 * block of chars out.
2096 *
2097 * Returns the number of bytes written or an XML_ENC_ERR code.
2098 *
2099 * The value of @inlen after return is the number of octets consumed
2100 * if the return value is positive, else unpredictable.
2101 * The value of @outlen after return is the number of octets produced.
2102 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    int res;

    /* Public entry point for the internal converter; no context needed. */
    res = latin1ToUTF8(out, outlen, in, inlen, NULL);

    return(res);
}
2108
2109 static int
UTF8ToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2110 UTF8ToUTF8(unsigned char* out, int *outlen,
2111 const unsigned char* in, int *inlen,
2112 void *vctxt ATTRIBUTE_UNUSED) {
2113 int len;
2114 int ret;
2115
2116 if (in == NULL) {
2117 *inlen = 0;
2118 *outlen = 0;
2119 return(XML_ENC_ERR_SUCCESS);
2120 }
2121
2122 if (*outlen < *inlen) {
2123 len = *outlen;
2124 ret = XML_ENC_ERR_SPACE;
2125 } else {
2126 len = *inlen;
2127 ret = len;
2128 }
2129
2130 memcpy(out, in, len);
2131
2132 *outlen = len;
2133 *inlen = len;
2134 return(ret);
2135 }
2136
2137
2138 #ifdef LIBXML_OUTPUT_ENABLED
2139 static int
UTF8ToLatin1(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2140 UTF8ToLatin1(unsigned char* out, int *outlen,
2141 const unsigned char* in, int *inlen,
2142 void *vctxt ATTRIBUTE_UNUSED) {
2143 const unsigned char* outend;
2144 const unsigned char* outstart = out;
2145 const unsigned char* instart = in;
2146 const unsigned char* inend;
2147 unsigned c;
2148 int ret = XML_ENC_ERR_SPACE;
2149
2150 if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2151 return(XML_ENC_ERR_INTERNAL);
2152
2153 if (in == NULL) {
2154 *inlen = 0;
2155 *outlen = 0;
2156 return(XML_ENC_ERR_SUCCESS);
2157 }
2158
2159 inend = in + *inlen;
2160 outend = out + *outlen;
2161 while (in < inend) {
2162 if (out >= outend)
2163 goto done;
2164
2165 c = *in;
2166
2167 if (c < 0x80) {
2168 *out++ = c;
2169 } else if ((c >= 0xC2) && (c <= 0xC3)) {
2170 if (inend - in < 2)
2171 break;
2172 in++;
2173 *out++ = (unsigned char) ((c << 6) | (*in & 0x3F));
2174 } else {
2175 ret = XML_ENC_ERR_INPUT;
2176 goto done;
2177 }
2178
2179 in++;
2180 }
2181
2182 ret = out - outstart;
2183
2184 done:
2185 *outlen = out - outstart;
2186 *inlen = in - instart;
2187 return(ret);
2188 }
2189
2190 /**
2191 * UTF8Toisolat1:
2192 * @out: a pointer to an array of bytes to store the result
2193 * @outlen: the length of @out
2194 * @in: a pointer to an array of UTF-8 chars
2195 * @inlen: the length of @in
2196 *
2197 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
2198 * block of chars out.
2199 *
2200 * Returns the number of bytes written or an XML_ENC_ERR code.
2201 *
2202 * The value of @inlen after return is the number of octets consumed
2203 * if the return value is positive, else unpredictable.
2204 * The value of @outlen after return is the number of octets produced.
2205 */
2206 int
UTF8Toisolat1(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2207 UTF8Toisolat1(unsigned char* out, int *outlen,
2208 const unsigned char* in, int *inlen) {
2209 if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
2210 return(XML_ENC_ERR_INTERNAL);
2211
2212 return(UTF8ToLatin1(out, outlen, in, inlen, NULL));
2213 }
2214 #endif /* LIBXML_OUTPUT_ENABLED */
2215
2216 static int
UTF16LEToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2217 UTF16LEToUTF8(unsigned char *out, int *outlen,
2218 const unsigned char *in, int *inlen,
2219 void *vctxt ATTRIBUTE_UNUSED) {
2220 const unsigned char *instart = in;
2221 const unsigned char *inend = in + (*inlen & ~1);
2222 unsigned char *outstart = out;
2223 unsigned char *outend = out + *outlen;
2224 unsigned c, d;
2225 int ret = XML_ENC_ERR_SPACE;
2226
2227 while (in < inend) {
2228 c = in[0] | (in[1] << 8);
2229
2230 if (c < 0x80) {
2231 if (out >= outend)
2232 goto done;
2233 out[0] = c;
2234 in += 2;
2235 out += 1;
2236 } else if (c < 0x800) {
2237 if (outend - out < 2)
2238 goto done;
2239 out[0] = (c >> 6) | 0xC0;
2240 out[1] = (c & 0x3F) | 0x80;
2241 in += 2;
2242 out += 2;
2243 } else if ((c & 0xF800) != 0xD800) {
2244 if (outend - out < 3)
2245 goto done;
2246 out[0] = (c >> 12) | 0xE0;
2247 out[1] = ((c >> 6) & 0x3F) | 0x80;
2248 out[2] = (c & 0x3F) | 0x80;
2249 in += 2;
2250 out += 3;
2251 } else {
2252 /* Surrogate pair */
2253 if ((c & 0xFC00) != 0xD800) {
2254 ret = XML_ENC_ERR_INPUT;
2255 goto done;
2256 }
2257 if (inend - in < 4)
2258 break;
2259 d = in[2] | (in[3] << 8);
2260 if ((d & 0xFC00) != 0xDC00) {
2261 ret = XML_ENC_ERR_INPUT;
2262 goto done;
2263 }
2264 if (outend - out < 4)
2265 goto done;
2266 c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2267 out[0] = (c >> 18) | 0xF0;
2268 out[1] = ((c >> 12) & 0x3F) | 0x80;
2269 out[2] = ((c >> 6) & 0x3F) | 0x80;
2270 out[3] = (c & 0x3F) | 0x80;
2271 in += 4;
2272 out += 4;
2273 }
2274 }
2275
2276 ret = out - outstart;
2277
2278 done:
2279 *outlen = out - outstart;
2280 *inlen = in - instart;
2281 return(ret);
2282 }
2283
2284 #ifdef LIBXML_OUTPUT_ENABLED
2285 static int
UTF8ToUTF16LE(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2286 UTF8ToUTF16LE(unsigned char *out, int *outlen,
2287 const unsigned char *in, int *inlen,
2288 void *vctxt ATTRIBUTE_UNUSED) {
2289 const unsigned char *instart = in;
2290 const unsigned char *inend;
2291 unsigned char *outstart = out;
2292 unsigned char *outend;
2293 unsigned c, d;
2294 int ret = XML_ENC_ERR_SPACE;
2295
2296 /* UTF16LE encoding has no BOM */
2297 if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2298 return(XML_ENC_ERR_INTERNAL);
2299 if (in == NULL) {
2300 *outlen = 0;
2301 *inlen = 0;
2302 return(0);
2303 }
2304 inend = in + *inlen;
2305 outend = out + (*outlen & ~1);
2306 while (in < inend) {
2307 c = in[0];
2308
2309 if (c < 0x80) {
2310 if (out >= outend)
2311 goto done;
2312 out[0] = c;
2313 out[1] = 0;
2314 in += 1;
2315 out += 2;
2316 } else {
2317 int i, len;
2318 unsigned min;
2319
2320 if (c < 0xE0) {
2321 if (c < 0xC2) {
2322 ret = XML_ENC_ERR_INPUT;
2323 goto done;
2324 }
2325 c &= 0x1F;
2326 len = 2;
2327 min = 0x80;
2328 } else if (c < 0xF0) {
2329 c &= 0x0F;
2330 len = 3;
2331 min = 0x800;
2332 } else {
2333 c &= 0x0F;
2334 len = 4;
2335 min = 0x10000;
2336 }
2337
2338 if (inend - in < len)
2339 break;
2340
2341 for (i = 1; i < len; i++) {
2342 if ((in[i] & 0xC0) != 0x80) {
2343 ret = XML_ENC_ERR_INPUT;
2344 goto done;
2345 }
2346 c = (c << 6) | (in[i] & 0x3F);
2347 }
2348
2349 if ((c < min) ||
2350 ((c >= 0xD800) && (c <= 0xDFFF)) ||
2351 (c > 0x10FFFF)) {
2352 ret = XML_ENC_ERR_INPUT;
2353 goto done;
2354 }
2355
2356 if (c < 0x10000) {
2357 if (out >= outend)
2358 goto done;
2359 out[0] = c & 0xFF;
2360 out[1] = c >> 8;
2361 out += 2;
2362 } else {
2363 if (outend - out < 4)
2364 goto done;
2365 c -= 0x10000;
2366 d = (c & 0x03FF) | 0xDC00;
2367 c = (c >> 10) | 0xD800;
2368 out[0] = c & 0xFF;
2369 out[1] = c >> 8;
2370 out[2] = d & 0xFF;
2371 out[3] = d >> 8;
2372 out += 4;
2373 }
2374
2375 in += len;
2376 }
2377 }
2378
2379 ret = out - outstart;
2380
2381 done:
2382 *outlen = out - outstart;
2383 *inlen = in - instart;
2384 return(ret);
2385 }
2386
2387 static int
UTF8ToUTF16(unsigned char * outb,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2388 UTF8ToUTF16(unsigned char* outb, int *outlen,
2389 const unsigned char* in, int *inlen,
2390 void *vctxt ATTRIBUTE_UNUSED) {
2391 if (in == NULL) {
2392 /*
2393 * initialization, add the Byte Order Mark for UTF-16LE
2394 */
2395 if (*outlen >= 2) {
2396 outb[0] = 0xFF;
2397 outb[1] = 0xFE;
2398 *outlen = 2;
2399 *inlen = 0;
2400 return(2);
2401 }
2402 *outlen = 0;
2403 *inlen = 0;
2404 return(0);
2405 }
2406 return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));
2407 }
2408 #endif /* LIBXML_OUTPUT_ENABLED */
2409
2410 static int
UTF16BEToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2411 UTF16BEToUTF8(unsigned char *out, int *outlen,
2412 const unsigned char *in, int *inlen,
2413 void *vctxt ATTRIBUTE_UNUSED) {
2414 const unsigned char *instart = in;
2415 const unsigned char *inend = in + (*inlen & ~1);
2416 unsigned char *outstart = out;
2417 unsigned char *outend = out + *outlen;
2418 unsigned c, d;
2419 int ret = XML_ENC_ERR_SPACE;
2420
2421 while (in < inend) {
2422 c = (in[0] << 8) | in[1];
2423
2424 if (c < 0x80) {
2425 if (out >= outend)
2426 goto done;
2427 out[0] = c;
2428 in += 2;
2429 out += 1;
2430 } else if (c < 0x800) {
2431 if (outend - out < 2)
2432 goto done;
2433 out[0] = (c >> 6) | 0xC0;
2434 out[1] = (c & 0x3F) | 0x80;
2435 in += 2;
2436 out += 2;
2437 } else if ((c & 0xF800) != 0xD800) {
2438 if (outend - out < 3)
2439 goto done;
2440 out[0] = (c >> 12) | 0xE0;
2441 out[1] = ((c >> 6) & 0x3F) | 0x80;
2442 out[2] = (c & 0x3F) | 0x80;
2443 in += 2;
2444 out += 3;
2445 } else {
2446 /* Surrogate pair */
2447 if ((c & 0xFC00) != 0xD800) {
2448 ret = XML_ENC_ERR_INPUT;
2449 goto done;
2450 }
2451 if (inend - in < 4)
2452 break;
2453 d = (in[2] << 8) | in[3];
2454 if ((d & 0xFC00) != 0xDC00) {
2455 ret = XML_ENC_ERR_INPUT;
2456 goto done;
2457 }
2458 if (outend - out < 4)
2459 goto done;
2460 c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2461 out[0] = (c >> 18) | 0xF0;
2462 out[1] = ((c >> 12) & 0x3F) | 0x80;
2463 out[2] = ((c >> 6) & 0x3F) | 0x80;
2464 out[3] = (c & 0x3F) | 0x80;
2465 in += 4;
2466 out += 4;
2467 }
2468 }
2469
2470 ret = out - outstart;
2471
2472 done:
2473 *outlen = out - outstart;
2474 *inlen = in - instart;
2475 return(ret);
2476 }
2477
2478 #ifdef LIBXML_OUTPUT_ENABLED
2479 static int
UTF8ToUTF16BE(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2480 UTF8ToUTF16BE(unsigned char *out, int *outlen,
2481 const unsigned char *in, int *inlen,
2482 void *vctxt ATTRIBUTE_UNUSED) {
2483 const unsigned char *instart = in;
2484 const unsigned char *inend;
2485 unsigned char *outstart = out;
2486 unsigned char *outend;
2487 unsigned c, d;
2488 int ret = XML_ENC_ERR_SPACE;
2489
2490 /* UTF-16BE has no BOM */
2491 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2492 if (in == NULL) {
2493 *outlen = 0;
2494 *inlen = 0;
2495 return(0);
2496 }
2497 inend = in + *inlen;
2498 outend = out + (*outlen & ~1);
2499 while (in < inend) {
2500 c = in[0];
2501
2502 if (c < 0x80) {
2503 if (out >= outend)
2504 goto done;
2505 out[0] = 0;
2506 out[1] = c;
2507 in += 1;
2508 out += 2;
2509 } else {
2510 int i, len;
2511 unsigned min;
2512
2513 if (c < 0xE0) {
2514 if (c < 0xC2) {
2515 ret = XML_ENC_ERR_INPUT;
2516 goto done;
2517 }
2518 c &= 0x1F;
2519 len = 2;
2520 min = 0x80;
2521 } else if (c < 0xF0) {
2522 c &= 0x0F;
2523 len = 3;
2524 min = 0x800;
2525 } else {
2526 c &= 0x0F;
2527 len = 4;
2528 min = 0x10000;
2529 }
2530
2531 if (inend - in < len)
2532 break;
2533
2534 for (i = 1; i < len; i++) {
2535 if ((in[i] & 0xC0) != 0x80) {
2536 ret = XML_ENC_ERR_INPUT;
2537 goto done;
2538 }
2539 c = (c << 6) | (in[i] & 0x3F);
2540 }
2541
2542 if ((c < min) ||
2543 ((c >= 0xD800) && (c <= 0xDFFF)) ||
2544 (c > 0x10FFFF)) {
2545 ret = XML_ENC_ERR_INPUT;
2546 goto done;
2547 }
2548
2549 if (c < 0x10000) {
2550 if (out >= outend)
2551 goto done;
2552 out[0] = c >> 8;
2553 out[1] = c & 0xFF;
2554 out += 2;
2555 } else {
2556 if (outend - out < 4)
2557 goto done;
2558 c -= 0x10000;
2559 d = (c & 0x03FF) | 0xDC00;
2560 c = (c >> 10) | 0xD800;
2561 out[0] = c >> 8;
2562 out[1] = c & 0xFF;
2563 out[2] = d >> 8;
2564 out[3] = d & 0xFF;
2565 out += 4;
2566 }
2567
2568 in += len;
2569 }
2570 }
2571
2572 ret = out - outstart;
2573
2574 done:
2575 *outlen = out - outstart;
2576 *inlen = in - instart;
2577 return(ret);
2578 }
2579 #endif /* LIBXML_OUTPUT_ENABLED */
2580
2581 #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
2582 static int
UTF8ToHtmlWrapper(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2583 UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
2584 const unsigned char *in, int *inlen,
2585 void *vctxt ATTRIBUTE_UNUSED) {
2586 return(UTF8ToHtml(out, outlen, in, inlen));
2587 }
2588 #endif
2589
2590 #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
2591 defined(LIBXML_ISO8859X_ENABLED)
2592
2593 static int
UTF8ToISO8859x(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)2594 UTF8ToISO8859x(unsigned char *out, int *outlen,
2595 const unsigned char *in, int *inlen, void *vctxt) {
2596 const unsigned char *xlattable = vctxt;
2597 const unsigned char *instart = in;
2598 const unsigned char *inend;
2599 unsigned char *outstart = out;
2600 unsigned char *outend;
2601 int ret = XML_ENC_ERR_SPACE;
2602
2603 if (in == NULL) {
2604 /*
2605 * initialization nothing to do
2606 */
2607 *outlen = 0;
2608 *inlen = 0;
2609 return(XML_ENC_ERR_SUCCESS);
2610 }
2611
2612 inend = in + *inlen;
2613 outend = out + *outlen;
2614 while (in < inend) {
2615 unsigned d = *in;
2616
2617 if (d < 0x80) {
2618 if (out >= outend)
2619 goto done;
2620 in += 1;
2621 } else if (d < 0xE0) {
2622 unsigned c;
2623
2624 if (inend - in < 2)
2625 break;
2626 c = in[1] & 0x3F;
2627 d = d & 0x1F;
2628 d = xlattable [48 + c + xlattable [d] * 64];
2629 if (d == 0) {
2630 /* not in character set */
2631 ret = XML_ENC_ERR_INPUT;
2632 goto done;
2633 }
2634 if (out >= outend)
2635 goto done;
2636 in += 2;
2637 } else if (d < 0xF0) {
2638 unsigned c1;
2639 unsigned c2;
2640
2641 if (inend - in < 3)
2642 break;
2643 c1 = in[1] & 0x3F;
2644 c2 = in[2] & 0x3F;
2645 d = d & 0x0F;
2646 d = xlattable [48 + c2 + xlattable [48 + c1 +
2647 xlattable [32 + d] * 64] * 64];
2648 if (d == 0) {
2649 /* not in character set */
2650 ret = XML_ENC_ERR_INPUT;
2651 goto done;
2652 }
2653 if (out >= outend)
2654 goto done;
2655 in += 3;
2656 } else {
2657 /* cannot transcode >= U+010000 */
2658 ret = XML_ENC_ERR_INPUT;
2659 goto done;
2660 }
2661
2662 *out++ = d;
2663 }
2664
2665 ret = out - outstart;
2666
2667 done:
2668 *outlen = out - outstart;
2669 *inlen = in - instart;
2670 return(ret);
2671 }
2672
2673 static int
ISO8859xToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)2674 ISO8859xToUTF8(unsigned char* out, int *outlen,
2675 const unsigned char* in, int *inlen, void *vctxt) {
2676 unsigned short const *unicodetable = vctxt;
2677 const unsigned char* instart = in;
2678 const unsigned char* inend;
2679 unsigned char* outstart = out;
2680 unsigned char* outend;
2681 int ret = XML_ENC_ERR_SPACE;
2682
2683 outend = out + *outlen;
2684 inend = in + *inlen;
2685
2686 while (in < inend) {
2687 unsigned c = *in;
2688
2689 if (c < 0x80) {
2690 if (out >= outend)
2691 goto done;
2692 *out++ = c;
2693 } else {
2694 c = unicodetable[c - 0x80];
2695 if (c == 0) {
2696 /* undefined code point */
2697 ret = XML_ENC_ERR_INPUT;
2698 goto done;
2699 }
2700 if (c < 0x800) {
2701 if (outend - out < 2)
2702 goto done;
2703 *out++ = ((c >> 6) & 0x1F) | 0xC0;
2704 *out++ = (c & 0x3F) | 0x80;
2705 } else {
2706 if (outend - out < 3)
2707 goto done;
2708 *out++ = ((c >> 12) & 0x0F) | 0xE0;
2709 *out++ = ((c >> 6) & 0x3F) | 0x80;
2710 *out++ = (c & 0x3F) | 0x80;
2711 }
2712 }
2713
2714 in += 1;
2715 }
2716
2717 ret = out - outstart;
2718
2719 done:
2720 *outlen = out - outstart;
2721 *inlen = in - instart;
2722 return(ret);
2723 }
2724
2725 #endif
2726
2727