xref: /aosp_15_r20/external/libxml2/encoding.c (revision 7c5688314b92172186c154356a6374bf7684c3ca)
1 /*
2  * encoding.c : implements the encoding conversion functions needed for XML
3  *
4  * Related specs:
5  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6  * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
8  * [ISO-8859-1]   ISO Latin-1 characters codes.
9  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
10  *                Worldwide Character Encoding -- Version 1.0", Addison-
11  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
12  *                described in Unicode Technical Report #4.
13  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
14  *                Information Interchange, ANSI X3.4-1986.
15  *
16  * See Copyright for the status of this software.
17  *
18  * [email protected]
19  *
20  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <[email protected]>
21  */
22 
23 #define IN_LIBXML
24 #include "libxml.h"
25 
26 #include <string.h>
27 #include <limits.h>
28 #include <ctype.h>
29 #include <stdlib.h>
30 
31 #ifdef LIBXML_ICONV_ENABLED
32 #include <iconv.h>
33 #include <errno.h>
34 #endif
35 
36 #include <libxml/encoding.h>
37 #include <libxml/xmlmemory.h>
38 #include <libxml/parser.h>
39 #ifdef LIBXML_HTML_ENABLED
40 #include <libxml/HTMLparser.h>
41 #endif
42 #include <libxml/xmlerror.h>
43 
44 #include "private/buf.h"
45 #include "private/enc.h"
46 #include "private/entities.h"
47 #include "private/error.h"
48 
49 #ifdef LIBXML_ICU_ENABLED
50 #include <unicode/ucnv.h>
51 #endif
52 
53 #define XML_HANDLER_STATIC 1
54 
/*
 * One entry of the alias table. Both strings are heap copies owned by
 * the table; the alias is stored in upper case.
 */
struct _xmlCharEncodingAlias {
    const char *name;   /* encoding name the alias resolves to */
    const char *alias;  /* lookup key */
};
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
61 
62 static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
63 static int xmlCharEncodingAliasesNb = 0;
64 static int xmlCharEncodingAliasesMax = 0;
65 
66 static int xmlLittleEndian = 1;
67 
68 typedef struct {
69     const char *name;
70     xmlCharEncoding enc;
71 } xmlEncTableEntry;
72 
73 static const xmlEncTableEntry xmlEncTable[] = {
74     { "ASCII", XML_CHAR_ENCODING_ASCII },
75     { "EUC-JP", XML_CHAR_ENCODING_EUC_JP },
76     { "HTML", XML_CHAR_ENCODING_HTML },
77     { "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 },
78     { "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 },
79     { "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 },
80     { "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE },
81     { "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP },
82     { "ISO-8859-1", XML_CHAR_ENCODING_8859_1 },
83     { "ISO-8859-10", XML_CHAR_ENCODING_8859_10 },
84     { "ISO-8859-11", XML_CHAR_ENCODING_8859_11 },
85     { "ISO-8859-13", XML_CHAR_ENCODING_8859_13 },
86     { "ISO-8859-14", XML_CHAR_ENCODING_8859_14 },
87     { "ISO-8859-15", XML_CHAR_ENCODING_8859_15 },
88     { "ISO-8859-16", XML_CHAR_ENCODING_8859_16 },
89     { "ISO-8859-2", XML_CHAR_ENCODING_8859_2 },
90     { "ISO-8859-3", XML_CHAR_ENCODING_8859_3 },
91     { "ISO-8859-4", XML_CHAR_ENCODING_8859_4 },
92     { "ISO-8859-5", XML_CHAR_ENCODING_8859_5 },
93     { "ISO-8859-6", XML_CHAR_ENCODING_8859_6 },
94     { "ISO-8859-7", XML_CHAR_ENCODING_8859_7 },
95     { "ISO-8859-8", XML_CHAR_ENCODING_8859_8 },
96     { "ISO-8859-9", XML_CHAR_ENCODING_8859_9 },
97     { "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 },
98     { "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 },
99     { "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS },
100     { "UCS-2", XML_CHAR_ENCODING_UCS2 },
101     { "UCS-4", XML_CHAR_ENCODING_UCS4LE },
102     { "UCS2", XML_CHAR_ENCODING_UCS2 },
103     { "UCS4", XML_CHAR_ENCODING_UCS4LE },
104     { "US-ASCII", XML_CHAR_ENCODING_ASCII },
105     { "UTF-16", XML_CHAR_ENCODING_UTF16 },
106     { "UTF-16BE", XML_CHAR_ENCODING_UTF16BE },
107     { "UTF-16LE", XML_CHAR_ENCODING_UTF16LE },
108     { "UTF-8", XML_CHAR_ENCODING_UTF8 },
109     { "UTF16", XML_CHAR_ENCODING_UTF16 },
110     { "UTF8", XML_CHAR_ENCODING_UTF8 }
111 };
112 
113 static int
114 asciiToAscii(unsigned char* out, int *outlen,
115              const unsigned char* in, int *inlen, void *vctxt);
116 static int
117 UTF8ToUTF8(unsigned char* out, int *outlen,
118            const unsigned char* inb, int *inlenb, void *vctxt);
119 static int
120 latin1ToUTF8(unsigned char* out, int *outlen,
121              const unsigned char* in, int *inlen, void *vctxt);
122 static int
123 UTF16LEToUTF8(unsigned char* out, int *outlen,
124               const unsigned char* inb, int *inlenb, void *vctxt);
125 static int
126 UTF16BEToUTF8(unsigned char* out, int *outlen,
127               const unsigned char* inb, int *inlenb, void *vctxt);
128 
129 #ifdef LIBXML_OUTPUT_ENABLED
130 
131 static int
132 UTF8ToLatin1(unsigned char* outb, int *outlen,
133              const unsigned char* in, int *inlen, void *vctxt);
134 static int
135 UTF8ToUTF16(unsigned char* outb, int *outlen,
136             const unsigned char* in, int *inlen, void *vctxt);
137 static int
138 UTF8ToUTF16LE(unsigned char* outb, int *outlen,
139               const unsigned char* in, int *inlen, void *vctxt);
140 static int
141 UTF8ToUTF16BE(unsigned char* outb, int *outlen,
142               const unsigned char* in, int *inlen, void *vctxt);
143 
144 #else /* LIBXML_OUTPUT_ENABLED */
145 
146 #define UTF8ToLatin1 NULL
147 #define UTF8ToUTF16 NULL
148 #define UTF8ToUTF16LE NULL
149 #define UTF8ToUTF16BE NULL
150 
151 #endif /* LIBXML_OUTPUT_ENABLED */
152 
153 #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
154 static int
155 UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
156                   const unsigned char *in, int *inlen, void *vctxt);
157 #else
158 #define UTF8ToHtmlWrapper NULL
159 #endif
160 
161 #ifdef LIBXML_ICONV_ENABLED
162   #define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0
163 #else
164   #define EMPTY_ICONV
165 #endif
166 
167 #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
168     defined(LIBXML_ISO8859X_ENABLED)
169 
170 #include "iso8859x.inc"
171 
172 static int
173 ISO8859xToUTF8(unsigned char* out, int *outlen,
174                const unsigned char* in, int *inlen, void *vctxt);
175 static int
176 UTF8ToISO8859x(unsigned char *out, int *outlen,
177                const unsigned char *in, int *inlen, void *vctxt);
178 
/*
 * Static handler initializer for the built-in ISO-8859-n converters.
 * The two generated tables for charset @n follow the converter callbacks
 * in the initializer. The output callback is cast to the output function
 * type, mirroring MAKE_HANDLER.
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) UTF8ToISO8859x \
      EMPTY_ICONV, \
      (void *) xmlunicodetable_ISO8859_##n, \
      (void *) xmltranscodetable_ISO8859_##n, \
      NULL, XML_HANDLER_STATIC }
187 
188 #else /* LIBXML_ISO8859X_ENABLED */
189 
190 #define MAKE_ISO_HANDLER(name, n) \
191     { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \
192       XML_HANDLER_STATIC }
193 
194 #endif /* LIBXML_ISO8859X_ENABLED */
195 
/*
 * Static handler initializer: name, input converter, output converter,
 * then empty iconv slots (when iconv is compiled in), three NULL slots
 * and the XML_HANDLER_STATIC flag.
 */
#define MAKE_HANDLER(name, in, out)                         \
    { (char *) name,                                        \
      (xmlCharEncodingInputFunc) (void (*)(void)) in,       \
      (xmlCharEncodingOutputFunc) (void (*)(void)) out      \
      EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC }
201 
202 /*
203  * The layout must match enum xmlCharEncoding.
204  *
205  * Names should match the IANA registry if possible:
206  * https://www.iana.org/assignments/character-sets/character-sets.xhtml
207  */
208 static const xmlCharEncodingHandler defaultHandlers[31] = {
209     MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
210     MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
211     MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
212     MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
213     MAKE_HANDLER("UCS-4LE", NULL, NULL),
214     MAKE_HANDLER("UCS-4BE", NULL, NULL),
215     MAKE_HANDLER("IBM037", NULL, NULL),
216     MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
217     MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
218     MAKE_HANDLER("ISO-10646-UCS-2", NULL, NULL),
219     MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
220     MAKE_ISO_HANDLER("ISO-8859-2", 2),
221     MAKE_ISO_HANDLER("ISO-8859-3", 3),
222     MAKE_ISO_HANDLER("ISO-8859-4", 4),
223     MAKE_ISO_HANDLER("ISO-8859-5", 5),
224     MAKE_ISO_HANDLER("ISO-8859-6", 6),
225     MAKE_ISO_HANDLER("ISO-8859-7", 7),
226     MAKE_ISO_HANDLER("ISO-8859-8", 8),
227     MAKE_ISO_HANDLER("ISO-8859-9", 9),
228     MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
229     MAKE_HANDLER("Shift_JIS", NULL, NULL),
230     MAKE_HANDLER("EUC-JP", NULL, NULL),
231     MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
232     MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
233     MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
234     MAKE_ISO_HANDLER("ISO-8859-10", 10),
235     MAKE_ISO_HANDLER("ISO-8859-11", 11),
236     MAKE_ISO_HANDLER("ISO-8859-13", 13),
237     MAKE_ISO_HANDLER("ISO-8859-14", 14),
238     MAKE_ISO_HANDLER("ISO-8859-15", 15),
239     MAKE_ISO_HANDLER("ISO-8859-16", 16),
240 };
241 
242 #define NUM_DEFAULT_HANDLERS \
243     (sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))
244 
245 /* the size should be growable, but it's not a big deal ... */
246 #define MAX_ENCODING_HANDLERS 50
247 static xmlCharEncodingHandlerPtr *globalHandlers = NULL;
248 static int nbCharEncodingHandler = 0;
249 
250 #ifdef LIBXML_ICONV_ENABLED
251 static int
252 xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
253 #endif
254 
255 #ifdef LIBXML_ICU_ENABLED
256 static int
257 xmlCharEncUconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
258 #endif
259 
260 /************************************************************************
261  *									*
262  *		Generic encoding handling routines			*
263  *									*
264  ************************************************************************/
265 
266 /**
267  * xmlDetectCharEncoding:
268  * @in:  a pointer to the first bytes of the XML entity, must be at least
269  *       2 bytes long (at least 4 if encoding is UTF4 variant).
270  * @len:  pointer to the length of the buffer
271  *
272  * Guess the encoding of the entity using the first bytes of the entity content
273  * according to the non-normative appendix F of the XML-1.0 recommendation.
274  *
275  * Returns one of the XML_CHAR_ENCODING_... values.
276  */
277 xmlCharEncoding
xmlDetectCharEncoding(const unsigned char * in,int len)278 xmlDetectCharEncoding(const unsigned char* in, int len)
279 {
280     if (in == NULL)
281         return(XML_CHAR_ENCODING_NONE);
282     if (len >= 4) {
283 	if ((in[0] == 0x00) && (in[1] == 0x00) &&
284 	    (in[2] == 0x00) && (in[3] == 0x3C))
285 	    return(XML_CHAR_ENCODING_UCS4BE);
286 	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
287 	    (in[2] == 0x00) && (in[3] == 0x00))
288 	    return(XML_CHAR_ENCODING_UCS4LE);
289 	if ((in[0] == 0x00) && (in[1] == 0x00) &&
290 	    (in[2] == 0x3C) && (in[3] == 0x00))
291 	    return(XML_CHAR_ENCODING_UCS4_2143);
292 	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
293 	    (in[2] == 0x00) && (in[3] == 0x00))
294 	    return(XML_CHAR_ENCODING_UCS4_3412);
295 	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
296 	    (in[2] == 0xA7) && (in[3] == 0x94))
297 	    return(XML_CHAR_ENCODING_EBCDIC);
298 	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
299 	    (in[2] == 0x78) && (in[3] == 0x6D))
300 	    return(XML_CHAR_ENCODING_UTF8);
301 	/*
302 	 * Although not part of the recommendation, we also
303 	 * attempt an "auto-recognition" of UTF-16LE and
304 	 * UTF-16BE encodings.
305 	 */
306 	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
307 	    (in[2] == 0x3F) && (in[3] == 0x00))
308 	    return(XML_CHAR_ENCODING_UTF16LE);
309 	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
310 	    (in[2] == 0x00) && (in[3] == 0x3F))
311 	    return(XML_CHAR_ENCODING_UTF16BE);
312     }
313     if (len >= 3) {
314 	/*
315 	 * Errata on XML-1.0 June 20 2001
316 	 * We now allow an UTF8 encoded BOM
317 	 */
318 	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
319 	    (in[2] == 0xBF))
320 	    return(XML_CHAR_ENCODING_UTF8);
321     }
322     /* For UTF-16 we can recognize by the BOM */
323     if (len >= 2) {
324 	if ((in[0] == 0xFE) && (in[1] == 0xFF))
325 	    return(XML_CHAR_ENCODING_UTF16BE);
326 	if ((in[0] == 0xFF) && (in[1] == 0xFE))
327 	    return(XML_CHAR_ENCODING_UTF16LE);
328     }
329     return(XML_CHAR_ENCODING_NONE);
330 }
331 
332 /**
333  * xmlCleanupEncodingAliases:
334  *
335  * DEPRECATED: This function modifies global state and is not
336  * thread-safe.
337  *
338  * Unregisters all aliases
339  */
340 void
xmlCleanupEncodingAliases(void)341 xmlCleanupEncodingAliases(void) {
342     int i;
343 
344     if (xmlCharEncodingAliases == NULL)
345 	return;
346 
347     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
348 	if (xmlCharEncodingAliases[i].name != NULL)
349 	    xmlFree((char *) xmlCharEncodingAliases[i].name);
350 	if (xmlCharEncodingAliases[i].alias != NULL)
351 	    xmlFree((char *) xmlCharEncodingAliases[i].alias);
352     }
353     xmlCharEncodingAliasesNb = 0;
354     xmlCharEncodingAliasesMax = 0;
355     xmlFree(xmlCharEncodingAliases);
356     xmlCharEncodingAliases = NULL;
357 }
358 
359 /**
360  * xmlGetEncodingAlias:
361  * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
362  *
363  * DEPRECATED: This function is not thread-safe.
364  *
365  * Lookup an encoding name for the given alias.
366  *
367  * Returns NULL if not found, otherwise the original name
368  */
369 const char *
xmlGetEncodingAlias(const char * alias)370 xmlGetEncodingAlias(const char *alias) {
371     int i;
372     char upper[100];
373 
374     if (alias == NULL)
375 	return(NULL);
376 
377     if (xmlCharEncodingAliases == NULL)
378 	return(NULL);
379 
380     for (i = 0;i < 99;i++) {
381         upper[i] = (char) toupper((unsigned char) alias[i]);
382 	if (upper[i] == 0) break;
383     }
384     upper[i] = 0;
385 
386     /*
387      * Walk down the list looking for a definition of the alias
388      */
389     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
390 	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
391 	    return(xmlCharEncodingAliases[i].name);
392 	}
393     }
394     return(NULL);
395 }
396 
397 /**
398  * xmlAddEncodingAlias:
399  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
400  * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
401  *
402  * DEPRECATED: This function modifies global state and is not
403  * thread-safe.
404  *
405  * Registers an alias @alias for an encoding named @name. Existing alias
406  * will be overwritten.
407  *
408  * Returns 0 in case of success, -1 in case of error
409  */
410 int
xmlAddEncodingAlias(const char * name,const char * alias)411 xmlAddEncodingAlias(const char *name, const char *alias) {
412     int i;
413     char upper[100];
414     char *nameCopy, *aliasCopy;
415 
416     if ((name == NULL) || (alias == NULL))
417 	return(-1);
418 
419     for (i = 0;i < 99;i++) {
420         upper[i] = (char) toupper((unsigned char) alias[i]);
421 	if (upper[i] == 0) break;
422     }
423     upper[i] = 0;
424 
425     if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
426         xmlCharEncodingAliasPtr tmp;
427         size_t newSize = xmlCharEncodingAliasesMax ?
428                          xmlCharEncodingAliasesMax * 2 :
429                          20;
430 
431         tmp = (xmlCharEncodingAliasPtr)
432               xmlRealloc(xmlCharEncodingAliases,
433                          newSize * sizeof(xmlCharEncodingAlias));
434         if (tmp == NULL)
435             return(-1);
436         xmlCharEncodingAliases = tmp;
437         xmlCharEncodingAliasesMax = newSize;
438     }
439 
440     /*
441      * Walk down the list looking for a definition of the alias
442      */
443     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
444 	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
445 	    /*
446 	     * Replace the definition.
447 	     */
448 	    nameCopy = xmlMemStrdup(name);
449             if (nameCopy == NULL)
450                 return(-1);
451 	    xmlFree((char *) xmlCharEncodingAliases[i].name);
452 	    xmlCharEncodingAliases[i].name = nameCopy;
453 	    return(0);
454 	}
455     }
456     /*
457      * Add the definition
458      */
459     nameCopy = xmlMemStrdup(name);
460     if (nameCopy == NULL)
461         return(-1);
462     aliasCopy = xmlMemStrdup(upper);
463     if (aliasCopy == NULL) {
464         xmlFree(nameCopy);
465         return(-1);
466     }
467     xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy;
468     xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy;
469     xmlCharEncodingAliasesNb++;
470     return(0);
471 }
472 
473 /**
474  * xmlDelEncodingAlias:
475  * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
476  *
477  * DEPRECATED: This function modifies global state and is not
478  * thread-safe.
479  *
480  * Unregisters an encoding alias @alias
481  *
482  * Returns 0 in case of success, -1 in case of error
483  */
484 int
xmlDelEncodingAlias(const char * alias)485 xmlDelEncodingAlias(const char *alias) {
486     int i;
487 
488     if (alias == NULL)
489 	return(-1);
490 
491     if (xmlCharEncodingAliases == NULL)
492 	return(-1);
493     /*
494      * Walk down the list looking for a definition of the alias
495      */
496     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
497 	if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
498 	    xmlFree((char *) xmlCharEncodingAliases[i].name);
499 	    xmlFree((char *) xmlCharEncodingAliases[i].alias);
500 	    xmlCharEncodingAliasesNb--;
501 	    memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
502 		    sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
503 	    return(0);
504 	}
505     }
506     return(-1);
507 }
508 
509 static int
xmlCompareEncTableEntries(const void * vkey,const void * ventry)510 xmlCompareEncTableEntries(const void *vkey, const void *ventry) {
511     const char *key = vkey;
512     const xmlEncTableEntry *entry = ventry;
513 
514     return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name));
515 }
516 
517 static xmlCharEncoding
xmlParseCharEncodingInternal(const char * name)518 xmlParseCharEncodingInternal(const char *name)
519 {
520     const xmlEncTableEntry *entry;
521 
522     if (name == NULL)
523        return(XML_CHAR_ENCODING_NONE);
524 
525     entry = bsearch(name, xmlEncTable,
526                     sizeof(xmlEncTable) / sizeof(xmlEncTable[0]),
527                     sizeof(xmlEncTable[0]), xmlCompareEncTableEntries);
528     if (entry != NULL)
529         return(entry->enc);
530 
531     return(XML_CHAR_ENCODING_ERROR);
532 }
533 
534 /**
535  * xmlParseCharEncoding:
536  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
537  *
538  * Compare the string to the encoding schemes already known. Note
539  * that the comparison is case insensitive accordingly to the section
540  * [XML] 4.3.3 Character Encoding in Entities.
541  *
542  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
543  * if not recognized.
544  */
545 xmlCharEncoding
xmlParseCharEncoding(const char * name)546 xmlParseCharEncoding(const char *name)
547 {
548     xmlCharEncoding enc = xmlParseCharEncodingInternal(name);
549 
550     /* Backward compatibility */
551     if (enc == XML_CHAR_ENCODING_UTF16)
552         enc = XML_CHAR_ENCODING_UTF16LE;
553 
554     return(enc);
555 }
556 
557 /**
558  * xmlGetCharEncodingName:
559  * @enc:  the encoding
560  *
561  * The "canonical" name for XML encoding.
562  * C.f. http://www.w3.org/TR/REC-xml#charencoding
563  * Section 4.3.3  Character Encoding in Entities
564  *
565  * Returns the canonical name for the given encoding
566  */
567 const char*
xmlGetCharEncodingName(xmlCharEncoding enc)568 xmlGetCharEncodingName(xmlCharEncoding enc) {
569     switch (enc) {
570         case XML_CHAR_ENCODING_UTF16LE:
571 	    return("UTF-16");
572         case XML_CHAR_ENCODING_UTF16BE:
573 	    return("UTF-16");
574         case XML_CHAR_ENCODING_UCS4LE:
575             return("ISO-10646-UCS-4");
576         case XML_CHAR_ENCODING_UCS4BE:
577             return("ISO-10646-UCS-4");
578         default:
579             break;
580     }
581 
582     if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
583         return(NULL);
584 
585     return(defaultHandlers[enc].name);
586 }
587 
588 /************************************************************************
589  *									*
590  *			Char encoding handlers				*
591  *									*
592  ************************************************************************/
593 
594 /**
595  * xmlNewCharEncodingHandler:
596  * @name:  the encoding name, in UTF-8 format (ASCII actually)
597  * @input:  the xmlCharEncodingInputFunc to read that encoding
598  * @output:  the xmlCharEncodingOutputFunc to write that encoding
599  *
600  * DEPRECATED: This function modifies global state and is not
601  * thread-safe.
602  *
603  * Create and registers an xmlCharEncodingHandler.
604  *
605  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
606  */
607 xmlCharEncodingHandlerPtr
xmlNewCharEncodingHandler(const char * name,xmlCharEncodingInputFunc input,xmlCharEncodingOutputFunc output)608 xmlNewCharEncodingHandler(const char *name,
609                           xmlCharEncodingInputFunc input,
610                           xmlCharEncodingOutputFunc output) {
611     xmlCharEncodingHandlerPtr handler;
612     const char *alias;
613     char upper[500];
614     int i;
615     char *up = NULL;
616 
617     /*
618      * Do the alias resolution
619      */
620     alias = xmlGetEncodingAlias(name);
621     if (alias != NULL)
622 	name = alias;
623 
624     /*
625      * Keep only the uppercase version of the encoding.
626      */
627     if (name == NULL)
628 	return(NULL);
629     for (i = 0;i < 499;i++) {
630         upper[i] = (char) toupper((unsigned char) name[i]);
631 	if (upper[i] == 0) break;
632     }
633     upper[i] = 0;
634     up = xmlMemStrdup(upper);
635     if (up == NULL)
636 	return(NULL);
637 
638     /*
639      * allocate and fill-up an handler block.
640      */
641     handler = (xmlCharEncodingHandlerPtr)
642               xmlMalloc(sizeof(xmlCharEncodingHandler));
643     if (handler == NULL) {
644         xmlFree(up);
645 	return(NULL);
646     }
647     memset(handler, 0, sizeof(xmlCharEncodingHandler));
648     handler->input = input;
649     handler->output = output;
650     handler->name = up;
651     handler->flags = XML_HANDLER_STATIC;
652 
653 #ifdef LIBXML_ICONV_ENABLED
654     handler->iconv_in = NULL;
655     handler->iconv_out = NULL;
656 #endif
657 
658     /*
659      * registers and returns the handler.
660      */
661     xmlRegisterCharEncodingHandler(handler);
662     return(handler);
663 }
664 
/**
 * xmlInitCharEncodingHandlers:
 *
 * DEPRECATED: Alias for xmlInitParser.
 *
 * Kept for source compatibility; it only forwards to xmlInitParser.
 */
void
xmlInitCharEncodingHandlers(void)
{
    xmlInitParser();
}
674 
675 /**
676  * xmlInitEncodingInternal:
677  *
678  * Initialize the char encoding support.
679  */
680 void
xmlInitEncodingInternal(void)681 xmlInitEncodingInternal(void) {
682     unsigned short int tst = 0x1234;
683     unsigned char *ptr = (unsigned char *) &tst;
684 
685     if (*ptr == 0x12) xmlLittleEndian = 0;
686     else xmlLittleEndian = 1;
687 }
688 
689 /**
690  * xmlCleanupCharEncodingHandlers:
691  *
692  * DEPRECATED: This function will be made private. Call xmlCleanupParser
693  * to free global state but see the warnings there. xmlCleanupParser
694  * should be only called once at program exit. In most cases, you don't
695  * have call cleanup functions at all.
696  *
697  * Cleanup the memory allocated for the char encoding support, it
698  * unregisters all the encoding handlers and the aliases.
699  */
700 void
xmlCleanupCharEncodingHandlers(void)701 xmlCleanupCharEncodingHandlers(void) {
702     xmlCleanupEncodingAliases();
703 
704     if (globalHandlers == NULL) return;
705 
706     for (;nbCharEncodingHandler > 0;) {
707         xmlCharEncodingHandler *handler;
708 
709         nbCharEncodingHandler--;
710         handler = globalHandlers[nbCharEncodingHandler];
711 	if (handler != NULL) {
712 	    if (handler->name != NULL)
713 		xmlFree(handler->name);
714 	    xmlFree(handler);
715 	}
716     }
717     xmlFree(globalHandlers);
718     globalHandlers = NULL;
719     nbCharEncodingHandler = 0;
720 }
721 
722 /**
723  * xmlRegisterCharEncodingHandler:
724  * @handler:  the xmlCharEncodingHandlerPtr handler block
725  *
726  * DEPRECATED: This function modifies global state and is not
727  * thread-safe.
728  *
729  * Register the char encoding handler.
730  */
731 void
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler)732 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
733     if (handler == NULL)
734         return;
735     if (globalHandlers == NULL) {
736         globalHandlers = xmlMalloc(
737                 MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));
738         if (globalHandlers == NULL)
739             goto free_handler;
740     }
741 
742     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS)
743         goto free_handler;
744     globalHandlers[nbCharEncodingHandler++] = handler;
745     return;
746 
747 free_handler:
748     if (handler != NULL) {
749         if (handler->name != NULL) {
750             xmlFree(handler->name);
751         }
752         xmlFree(handler);
753     }
754 }
755 
756 static int
xmlInvokeConvImpl(xmlCharEncConvImpl impl,void * implCtxt,const char * name,xmlCharEncodingHandler * handler)757 xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt,
758                   const char *name, xmlCharEncodingHandler *handler) {
759     xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL };
760     int ret;
761 
762     ret = impl(implCtxt, name, &conv);
763 
764     if (ret == XML_ERR_OK) {
765         handler->input =
766             (xmlCharEncodingInputFunc) (void (*)(void)) conv.input;
767         handler->output =
768             (xmlCharEncodingOutputFunc) (void (*)(void)) conv.output;
769         handler->ctxtDtor = conv.ctxtDtor;
770         handler->inputCtxt = conv.inputCtxt;
771         handler->outputCtxt = conv.outputCtxt;
772     }
773 
774     return(ret);
775 }
776 
777 /**
778  * xmlFindExtraHandler:
779  * @norig:  name of the char encoding
780  * @name:  potentially aliased name of the encoding
781  * @output:  boolean, use handler for output
782  * @impl:  a conversion implementation (optional)
783  * @implCtxt:  user data for conversion implementation (optional)
784  * @out:  pointer to resulting handler
785  *
786  * Search the non-default handlers for an exact match.
787  *
788  * Returns an xmlParserErrors error code.
789  */
790 static int
xmlFindExtraHandler(const char * norig,const char * name,int output,xmlCharEncConvImpl impl,void * implCtxt,xmlCharEncodingHandler ** out)791 xmlFindExtraHandler(const char *norig, const char *name, int output,
792                     xmlCharEncConvImpl impl, void *implCtxt,
793                     xmlCharEncodingHandler **out) {
794     xmlCharEncodingHandler *handler;
795     int ret;
796     int i;
797 
798     handler = xmlMalloc(sizeof(*handler));
799     if (handler == NULL)
800         return(XML_ERR_NO_MEMORY);
801     memset(handler, 0, sizeof(*handler));
802 
803     handler->name = xmlMemStrdup(name);
804     if (handler->name == NULL) {
805         ret = XML_ERR_NO_MEMORY;
806         goto done;
807     }
808 
809     /*
810      * Try custom implementation before deprecated global handlers.
811      *
812      * Note that we pass the original name without deprecated
813      * alias resolution.
814      */
815     if (impl != NULL) {
816         ret = xmlInvokeConvImpl(impl, implCtxt, norig, handler);
817         if (ret != XML_ERR_OK)
818             goto done;
819 
820         *out = handler;
821         return(XML_ERR_OK);
822     }
823 
824     /*
825      * Deprecated
826      */
827     if (globalHandlers != NULL) {
828         for (i = 0; i < nbCharEncodingHandler; i++) {
829             xmlCharEncodingHandler *h = globalHandlers[i];
830 
831             if (!xmlStrcasecmp((const xmlChar *) name,
832                                (const xmlChar *) h->name)) {
833                 if ((output ? h->output : h->input) != NULL) {
834                     *out = h;
835                     ret = XML_ERR_OK;
836                     goto done;
837                 }
838             }
839         }
840     }
841 
842 #ifdef LIBXML_ICONV_ENABLED
843     ret = xmlInvokeConvImpl(xmlCharEncIconv, handler, name, handler);
844     if (ret == XML_ERR_OK) {
845         *out = handler;
846         return(XML_ERR_OK);
847     }
848     if (ret != XML_ERR_UNSUPPORTED_ENCODING)
849         goto done;
850 #endif /* LIBXML_ICONV_ENABLED */
851 
852 #ifdef LIBXML_ICU_ENABLED
853     ret = xmlInvokeConvImpl(xmlCharEncUconv, handler, name, handler);
854     if (ret == XML_ERR_OK) {
855         *out = handler;
856         return(XML_ERR_OK);
857     }
858     if (ret != XML_ERR_UNSUPPORTED_ENCODING)
859         goto done;
860 #endif /* LIBXML_ICU_ENABLED */
861 
862     ret = XML_ERR_UNSUPPORTED_ENCODING;
863 
864 done:
865     if (handler != NULL) {
866         xmlFree(handler->name);
867         xmlFree(handler);
868     }
869 
870     return(ret);
871 }
872 
873 /**
874  * xmlLookupCharEncodingHandler:
875  * @enc:  an xmlCharEncoding value.
876  * @out:  pointer to result
877  *
878  * Find or create a handler matching the encoding. The following
879  * converters are looked up in order:
880  *
881  * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
882  * - User-registered global handler (deprecated)
883  * - iconv if enabled
884  * - ICU if enabled
885  *
886  * The handler must be closed with xmlCharEncCloseFunc.
887  *
888  * If the encoding is UTF-8, a NULL handler and no error code will
889  * be returned.
890  *
891  * Available since 2.13.0.
892  *
893  * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
894  * xmlParserErrors error code.
895  */
896 int
xmlLookupCharEncodingHandler(xmlCharEncoding enc,xmlCharEncodingHandler ** out)897 xmlLookupCharEncodingHandler(xmlCharEncoding enc,
898                              xmlCharEncodingHandler **out) {
899     const xmlCharEncodingHandler *handler;
900 
901     if (out == NULL)
902         return(XML_ERR_ARGUMENT);
903     *out = NULL;
904 
905     if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
906         return(XML_ERR_UNSUPPORTED_ENCODING);
907 
908     /* Return NULL handler for UTF-8 */
909     if ((enc == XML_CHAR_ENCODING_UTF8) ||
910         (enc == XML_CHAR_ENCODING_NONE))
911         return(XML_ERR_OK);
912 
913     handler = &defaultHandlers[enc];
914     if ((handler->input != NULL) || (handler->output != NULL)) {
915         *out = (xmlCharEncodingHandler *) handler;
916         return(XML_ERR_OK);
917     }
918 
919     if (handler->name != NULL)
920         return(xmlFindExtraHandler(handler->name, handler->name, 0,
921                                    NULL, NULL, out));
922 
923     return(XML_ERR_UNSUPPORTED_ENCODING);
924 }
925 
926 /**
927  * xmlGetCharEncodingHandler:
928  * @enc:  an xmlCharEncoding value.
929  *
930  * DEPRECATED: Use xmlLookupCharEncodingHandler which has better error
931  * reporting.
932  *
933  * Returns the handler or NULL if no handler was found or an error
934  * occurred.
935  */
936 xmlCharEncodingHandlerPtr
xmlGetCharEncodingHandler(xmlCharEncoding enc)937 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
938     xmlCharEncodingHandler *ret;
939 
940     xmlLookupCharEncodingHandler(enc, &ret);
941     return(ret);
942 }
943 
944 /**
945  * xmlCreateCharEncodingHandler:
946  * @name:  a string describing the char encoding.
947  * @output:  boolean, use handler for output
948  * @impl:  a conversion implementation (optional)
949  * @implCtxt:  user data for conversion implementation (optional)
950  * @out:  pointer to result
951  *
952  * Find or create a handler matching the encoding. The following
953  * converters are looked up in order:
954  *
955  * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
956  * - Custom implementation if provided
957  * - User-registered global handler (deprecated)
958  * - iconv if enabled
959  * - ICU if enabled
960  *
961  * The handler must be closed with xmlCharEncCloseFunc.
962  *
963  * If the encoding is UTF-8, a NULL handler and no error code will
964  * be returned.
965  *
966  * Available since 2.14.0.
967  *
968  * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
969  * xmlParserErrors error code.
970  */
971 int
xmlCreateCharEncodingHandler(const char * name,int output,xmlCharEncConvImpl impl,void * implCtxt,xmlCharEncodingHandler ** out)972 xmlCreateCharEncodingHandler(const char *name, int output,
973                              xmlCharEncConvImpl impl, void *implCtxt,
974                              xmlCharEncodingHandler **out) {
975     const xmlCharEncodingHandler *handler;
976     const char *norig, *nalias;
977     xmlCharEncoding enc;
978 
979     if (out == NULL)
980         return(XML_ERR_ARGUMENT);
981     *out = NULL;
982 
983     if (name == NULL)
984         return(XML_ERR_ARGUMENT);
985 
986     norig = name;
987     nalias = xmlGetEncodingAlias(name);
988     if (nalias != NULL)
989 	name = nalias;
990 
991     enc = xmlParseCharEncodingInternal(name);
992 
993     /* Return NULL handler for UTF-8 */
994     if (enc == XML_CHAR_ENCODING_UTF8)
995         return(XML_ERR_OK);
996 
997     if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
998         handler = &defaultHandlers[enc];
999         if ((output ? handler->output : handler->input) != NULL) {
1000             *out = (xmlCharEncodingHandler *) handler;
1001             return(XML_ERR_OK);
1002         }
1003     }
1004 
1005     return(xmlFindExtraHandler(norig, name, output, impl, implCtxt, out));
1006 }
1007 
1008 /**
1009  * xmlOpenCharEncodingHandler:
1010  * @name:  a string describing the char encoding.
1011  * @output:  boolean, use handler for output
1012  * @out:  pointer to result
1013  *
1014  * Find or create a handler matching the encoding. The following
1015  * converters are looked up in order:
1016  *
1017  * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
1018  * - User-registered global handler (deprecated)
1019  * - iconv if enabled
1020  * - ICU if enabled
1021  *
1022  * The handler must be closed with xmlCharEncCloseFunc.
1023  *
1024  * If the encoding is UTF-8, a NULL handler and no error code will
1025  * be returned.
1026  *
1027  * Available since 2.13.0.
1028  *
1029  * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
1030  * xmlParserErrors error code.
1031  */
1032 int
xmlOpenCharEncodingHandler(const char * name,int output,xmlCharEncodingHandler ** out)1033 xmlOpenCharEncodingHandler(const char *name, int output,
1034                            xmlCharEncodingHandler **out) {
1035     return(xmlCreateCharEncodingHandler(name, output, NULL, NULL, out));
1036 }
1037 
1038 /**
1039  * xmlFindCharEncodingHandler:
1040  * @name:  a string describing the char encoding.
1041  *
1042  * DEPRECATED: Use xmlOpenCharEncodingHandler which has better error
1043  * reporting.
1044  *
1045  * If the encoding is UTF-8, this will return a no-op handler that
1046  * shouldn't be used.
1047  *
1048  * Returns the handler or NULL if no handler was found or an error
1049  * occurred.
1050  */
1051 xmlCharEncodingHandlerPtr
xmlFindCharEncodingHandler(const char * name)1052 xmlFindCharEncodingHandler(const char *name) {
1053     xmlCharEncodingHandler *ret;
1054 
1055     /*
1056      * This handler shouldn't be used, but we must return a non-NULL
1057      * handler.
1058      */
1059     if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) ||
1060         (xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0))
1061         return((xmlCharEncodingHandlerPtr)
1062                 &defaultHandlers[XML_CHAR_ENCODING_UTF8]);
1063 
1064     xmlOpenCharEncodingHandler(name, 0, &ret);
1065     return(ret);
1066 }
1067 
1068 /************************************************************************
1069  *									*
1070  *		ICONV based generic conversion functions		*
1071  *									*
1072  ************************************************************************/
1073 
1074 #ifdef LIBXML_ICONV_ENABLED
/*
 * Conversion context for the iconv backend. One context is allocated
 * per direction and passed to xmlIconvConvert as its last argument.
 */
typedef struct {
    iconv_t cd; /* iconv descriptor, (iconv_t) -1 while not opened */
} xmlIconvCtxt;
1078 
1079 /**
1080  * xmlIconvConvert:
1081  * @vctxt:  conversion context
1082  * @out:  a pointer to an array of bytes to store the result
1083  * @outlen:  the length of @out
1084  * @in:  a pointer to an array of input bytes
1085  * @inlen:  the length of @in
1086  *
1087  * Returns an XML_ENC_ERR code.
1088  *
1089  * The value of @inlen after return is the number of octets consumed
1090  *     as the return value is positive, else unpredictable.
1091  * The value of @outlen after return is the number of octets produced.
1092  */
1093 static int
xmlIconvConvert(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)1094 xmlIconvConvert(unsigned char *out, int *outlen,
1095                 const unsigned char *in, int *inlen, void *vctxt) {
1096     xmlIconvCtxt *ctxt = vctxt;
1097     size_t icv_inlen, icv_outlen;
1098     const char *icv_in = (const char *) in;
1099     char *icv_out = (char *) out;
1100     size_t ret;
1101 
1102     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1103         if (outlen != NULL) *outlen = 0;
1104         return(XML_ENC_ERR_INTERNAL);
1105     }
1106     icv_inlen = *inlen;
1107     icv_outlen = *outlen;
1108     /*
1109      * Some versions take const, other versions take non-const input.
1110      */
1111     ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
1112     *inlen -= icv_inlen;
1113     *outlen -= icv_outlen;
1114     if (ret == (size_t) -1) {
1115         if (errno == EILSEQ)
1116             return(XML_ENC_ERR_INPUT);
1117         if (errno == E2BIG)
1118             return(XML_ENC_ERR_SPACE);
1119         /*
1120          * EINVAL means a truncated multi-byte sequence at the end
1121          * of the input buffer. We treat this as success.
1122          */
1123         if (errno == EINVAL)
1124             return(XML_ENC_ERR_SUCCESS);
1125         return(XML_ENC_ERR_INTERNAL);
1126     }
1127     return(XML_ENC_ERR_SUCCESS);
1128 }
1129 
1130 static void
xmlIconvFree(void * vctxt)1131 xmlIconvFree(void *vctxt) {
1132     xmlIconvCtxt *ctxt = vctxt;
1133 
1134     if (ctxt->cd != (iconv_t) -1)
1135         iconv_close(ctxt->cd);
1136 
1137     xmlFree(ctxt);
1138 }
1139 
1140 static int
xmlCharEncIconv(void * vctxt,const char * name,xmlCharEncConverter * conv)1141 xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv) {
1142     xmlCharEncodingHandler *handler = vctxt;
1143     xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL;
1144     iconv_t icv_in;
1145     iconv_t icv_out;
1146     int ret;
1147 
1148     inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1149     if (inputCtxt == NULL) {
1150         ret = XML_ERR_NO_MEMORY;
1151         goto error;
1152     }
1153     inputCtxt->cd = (iconv_t) -1;
1154 
1155     icv_in = iconv_open("UTF-8", name);
1156     if (icv_in == (iconv_t) -1) {
1157         if (errno == EINVAL)
1158             ret = XML_ERR_UNSUPPORTED_ENCODING;
1159         else if (errno == ENOMEM)
1160             ret = XML_ERR_NO_MEMORY;
1161         else
1162             ret = XML_ERR_SYSTEM;
1163         goto error;
1164     }
1165     inputCtxt->cd = icv_in;
1166 
1167     outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1168     if (outputCtxt == NULL) {
1169         ret = XML_ERR_NO_MEMORY;
1170         goto error;
1171     }
1172     outputCtxt->cd = (iconv_t) -1;
1173 
1174     icv_out = iconv_open(name, "UTF-8");
1175     if (icv_out == (iconv_t) -1) {
1176         if (errno == EINVAL)
1177             ret = XML_ERR_UNSUPPORTED_ENCODING;
1178         else if (errno == ENOMEM)
1179             ret = XML_ERR_NO_MEMORY;
1180         else
1181             ret = XML_ERR_SYSTEM;
1182         goto error;
1183     }
1184     outputCtxt->cd = icv_out;
1185 
1186     conv->input = xmlIconvConvert;
1187     conv->output = xmlIconvConvert;
1188     conv->ctxtDtor = xmlIconvFree;
1189     conv->inputCtxt = inputCtxt;
1190     conv->outputCtxt = outputCtxt;
1191 
1192     /* Backward compatibility */
1193     if (handler != NULL) {
1194         handler->iconv_in = icv_in;
1195         handler->iconv_out = icv_out;
1196     }
1197 
1198     return(XML_ERR_OK);
1199 
1200 error:
1201     if (inputCtxt != NULL)
1202         xmlIconvFree(inputCtxt);
1203     if (outputCtxt != NULL)
1204         xmlIconvFree(outputCtxt);
1205     return(ret);
1206 }
1207 #endif /* LIBXML_ICONV_ENABLED */
1208 
1209 /************************************************************************
1210  *									*
1211  *		ICU based generic conversion functions		*
1212  *									*
1213  ************************************************************************/
1214 
1215 #ifdef LIBXML_ICU_ENABLED
/*
 * Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE.
 * The value counts UChar elements of pivot_buf below.
 */
#define ICU_PIVOT_BUF_SIZE 1024

/*
 * Conversion context for the ICU backend. One context is allocated per
 * direction by openIcuConverter and passed to xmlUconvConvert as its
 * last argument.
 */
typedef struct _uconv_t xmlUconvCtxt;
struct _uconv_t {
  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
  /* pivot positions handed to ucnv_convertEx, kept across calls */
  UChar      *pivot_source;
  UChar      *pivot_target;
  /* non-zero: uconv is the source and utf8 the target; zero: reversed */
  int        isInput;
  /* pivot storage passed to ucnv_convertEx */
  UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
};
1228 
1229 /**
1230  * xmlUconvConvert:
1231  * @vctxt:  converison context
1232  * @out:  a pointer to an array of bytes to store the result
1233  * @outlen:  the length of @out
1234  * @in:  a pointer to an array of input bytes
1235  * @inlen:  the length of @in
1236  *
1237  * Returns an XML_ENC_ERR code.
1238  *
1239  * The value of @inlen after return is the number of octets consumed
1240  *     as the return value is positive, else unpredictable.
1241  * The value of @outlen after return is the number of octets produced.
1242  */
1243 static int
xmlUconvConvert(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)1244 xmlUconvConvert(unsigned char *out, int *outlen,
1245                 const unsigned char *in, int *inlen, void *vctxt) {
1246     xmlUconvCtxt *cd = vctxt;
1247     const char *ucv_in = (const char *) in;
1248     char *ucv_out = (char *) out;
1249     UConverter *target, *source;
1250     UErrorCode err = U_ZERO_ERROR;
1251     int ret;
1252 
1253     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1254         if (outlen != NULL)
1255             *outlen = 0;
1256         return(XML_ENC_ERR_INTERNAL);
1257     }
1258 
1259     /*
1260      * Note that the ICU API is stateful. It can always consume a certain
1261      * amount of input even if the output buffer would overflow. The
1262      * remaining input must be processed by calling ucnv_convertEx with a
1263      * possibly empty input buffer.
1264      *
1265      * ucnv_convertEx is always called with reset and flush set to 0,
1266      * so we don't mess up the state. This should never generate
1267      * U_TRUNCATED_CHAR_FOUND errors.
1268      */
1269     if (cd->isInput) {
1270         source = cd->uconv;
1271         target = cd->utf8;
1272     } else {
1273         source = cd->utf8;
1274         target = cd->uconv;
1275     }
1276 
1277     ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
1278                    &ucv_in, ucv_in + *inlen, cd->pivot_buf,
1279                    &cd->pivot_source, &cd->pivot_target,
1280                    cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
1281 
1282     *inlen = ucv_in - (const char*) in;
1283     *outlen = ucv_out - (char *) out;
1284 
1285     if (U_SUCCESS(err)) {
1286         ret = XML_ENC_ERR_SUCCESS;
1287     } else {
1288         switch (err) {
1289             case U_TRUNCATED_CHAR_FOUND:
1290                 /* Shouldn't happen without flush */
1291                 ret = XML_ENC_ERR_SUCCESS;
1292                 break;
1293 
1294             case U_BUFFER_OVERFLOW_ERROR:
1295                 ret = XML_ENC_ERR_SPACE;
1296                 break;
1297 
1298             case U_INVALID_CHAR_FOUND:
1299             case U_ILLEGAL_CHAR_FOUND:
1300             case U_ILLEGAL_ESCAPE_SEQUENCE:
1301             case U_UNSUPPORTED_ESCAPE_SEQUENCE:
1302                 ret = XML_ENC_ERR_INPUT;
1303                 break;
1304 
1305             case U_MEMORY_ALLOCATION_ERROR:
1306                 ret = XML_ENC_ERR_MEMORY;
1307                 break;
1308 
1309             default:
1310                 ret = XML_ENC_ERR_INTERNAL;
1311                 break;
1312         }
1313     }
1314 
1315     return(ret);
1316 }
1317 
1318 static int
openIcuConverter(const char * name,int isInput,xmlUconvCtxt ** out)1319 openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out)
1320 {
1321     UErrorCode status;
1322     xmlUconvCtxt *conv;
1323 
1324     *out = NULL;
1325 
1326     conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt));
1327     if (conv == NULL)
1328         return(XML_ERR_NO_MEMORY);
1329 
1330     conv->isInput = isInput;
1331     conv->pivot_source = conv->pivot_buf;
1332     conv->pivot_target = conv->pivot_buf;
1333 
1334     status = U_ZERO_ERROR;
1335     conv->uconv = ucnv_open(name, &status);
1336     if (U_FAILURE(status))
1337         goto error;
1338 
1339     status = U_ZERO_ERROR;
1340     if (isInput) {
1341         ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
1342                                                 NULL, NULL, NULL, &status);
1343     }
1344     else {
1345         ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
1346                                                 NULL, NULL, NULL, &status);
1347     }
1348     if (U_FAILURE(status))
1349         goto error;
1350 
1351     status = U_ZERO_ERROR;
1352     conv->utf8 = ucnv_open("UTF-8", &status);
1353     if (U_FAILURE(status))
1354         goto error;
1355 
1356     *out = conv;
1357     return(0);
1358 
1359 error:
1360     if (conv->uconv)
1361         ucnv_close(conv->uconv);
1362     xmlFree(conv);
1363 
1364     if (status == U_FILE_ACCESS_ERROR)
1365         return(XML_ERR_UNSUPPORTED_ENCODING);
1366     if (status == U_MEMORY_ALLOCATION_ERROR)
1367         return(XML_ERR_NO_MEMORY);
1368     return(XML_ERR_SYSTEM);
1369 }
1370 
1371 static void
closeIcuConverter(xmlUconvCtxt * conv)1372 closeIcuConverter(xmlUconvCtxt *conv)
1373 {
1374     if (conv == NULL)
1375         return;
1376     ucnv_close(conv->uconv);
1377     ucnv_close(conv->utf8);
1378     xmlFree(conv);
1379 }
1380 
1381 static void
xmlUconvFree(void * vctxt)1382 xmlUconvFree(void *vctxt) {
1383     closeIcuConverter(vctxt);
1384 }
1385 
1386 static int
xmlCharEncUconv(void * vctxt ATTRIBUTE_UNUSED,const char * name,xmlCharEncConverter * conv)1387 xmlCharEncUconv(void *vctxt ATTRIBUTE_UNUSED, const char *name,
1388                 xmlCharEncConverter *conv) {
1389     xmlUconvCtxt *ucv_in = NULL;
1390     xmlUconvCtxt *ucv_out = NULL;
1391     int ret;
1392 
1393     ret = openIcuConverter(name, 1, &ucv_in);
1394     if (ret != 0)
1395         goto error;
1396     ret = openIcuConverter(name, 0, &ucv_out);
1397     if (ret != 0)
1398         goto error;
1399 
1400     conv->input = xmlUconvConvert;
1401     conv->output = xmlUconvConvert;
1402     conv->ctxtDtor = xmlUconvFree;
1403     conv->inputCtxt = ucv_in;
1404     conv->outputCtxt = ucv_out;
1405 
1406     return(XML_ERR_OK);
1407 
1408 error:
1409     if (ucv_in != NULL)
1410         closeIcuConverter(ucv_in);
1411     if (ucv_out != NULL)
1412         closeIcuConverter(ucv_out);
1413     return(ret);
1414 }
1415 #endif /* LIBXML_ICU_ENABLED */
1416 
1417 /************************************************************************
1418  *									*
1419  *		The real API used by libxml for on-the-fly conversion	*
1420  *									*
1421  ************************************************************************/
1422 
1423 /**
1424  * xmlEncConvertError:
1425  * @code:  XML_ENC_ERR code
1426  *
1427  * Convert XML_ENC_ERR to libxml2 error codes.
1428  */
1429 static int
xmlEncConvertError(int code)1430 xmlEncConvertError(int code) {
1431     int ret;
1432 
1433     switch (code) {
1434         case XML_ENC_ERR_SUCCESS:
1435             ret = XML_ERR_OK;
1436             break;
1437         case XML_ENC_ERR_INPUT:
1438             ret = XML_ERR_INVALID_ENCODING;
1439             break;
1440         case XML_ENC_ERR_MEMORY:
1441             ret = XML_ERR_NO_MEMORY;
1442             break;
1443         default:
1444             ret = XML_ERR_INTERNAL_ERROR;
1445             break;
1446     }
1447 
1448     return(ret);
1449 }
1450 
1451 /**
1452  * xmlEncInputChunk:
1453  * @handler:  encoding handler
1454  * @out:  a pointer to an array of bytes to store the result
1455  * @outlen:  the length of @out
1456  * @in:  a pointer to an array of input bytes
1457  * @inlen:  the length of @in
1458  *
1459  * The value of @inlen after return is the number of octets consumed
1460  *     as the return value is 0, else unpredictable.
1461  * The value of @outlen after return is the number of octets produced.
1462  *
1463  * Returns an XML_ENC_ERR code.
1464  */
1465 int
xmlEncInputChunk(xmlCharEncodingHandler * handler,unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1466 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1467                  int *outlen, const unsigned char *in, int *inlen) {
1468     int ret;
1469 
1470     if (handler->input != NULL) {
1471         xmlCharEncConvFunc conv =
1472             (xmlCharEncConvFunc) (void (*)(void)) handler->input;
1473 
1474         ret = conv(out, outlen, in, inlen, handler->inputCtxt);
1475         if (ret > 0)
1476             ret = XML_ENC_ERR_SUCCESS;
1477     }
1478     else {
1479         *outlen = 0;
1480         *inlen = 0;
1481         ret = XML_ENC_ERR_INTERNAL;
1482     }
1483 
1484     return(ret);
1485 }
1486 
1487 /**
1488  * xmlEncOutputChunk:
1489  * @handler:  encoding handler
1490  * @out:  a pointer to an array of bytes to store the result
1491  * @outlen:  the length of @out
1492  * @in:  a pointer to an array of input bytes
1493  * @inlen:  the length of @in
1494  *
1495  * Returns an XML_ENC_ERR code.
1496  *
1497  * The value of @inlen after return is the number of octets consumed
1498  *     as the return value is 0, else unpredictable.
1499  * The value of @outlen after return is the number of octets produced.
1500  */
1501 static int
xmlEncOutputChunk(xmlCharEncodingHandler * handler,unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1502 xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1503                   int *outlen, const unsigned char *in, int *inlen) {
1504     int ret;
1505 
1506     if (handler->output != NULL) {
1507         xmlCharEncConvFunc conv =
1508             (xmlCharEncConvFunc) (void (*)(void)) handler->output;
1509 
1510         ret = conv(out, outlen, in, inlen, handler->outputCtxt);
1511         if (ret > 0)
1512             ret = XML_ENC_ERR_SUCCESS;
1513     }
1514     else {
1515         *outlen = 0;
1516         *inlen = 0;
1517         ret = XML_ENC_ERR_INTERNAL;
1518     }
1519 
1520     return(ret);
1521 }
1522 
1523 /**
1524  * xmlCharEncFirstLine:
1525  * @handler:   char encoding transformation data structure
1526  * @out:  an xmlBuffer for the output.
1527  * @in:  an xmlBuffer for the input
1528  *
1529  * DEPERECATED: Don't use.
1530  *
1531  * Returns the number of bytes written or an XML_ENC_ERR code.
1532  */
1533 int
xmlCharEncFirstLine(xmlCharEncodingHandler * handler,xmlBufferPtr out,xmlBufferPtr in)1534 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1535                     xmlBufferPtr in) {
1536     return(xmlCharEncInFunc(handler, out, in));
1537 }
1538 
1539 /**
1540  * xmlCharEncInput:
1541  * @input: a parser input buffer
1542  * @sizeOut:  pointer to output size
1543  *
1544  * @sizeOut should be set to the maximum output size (or SIZE_MAX).
1545  * After return, it is set to the number of bytes written.
1546  *
1547  * Generic front-end for the encoding handler on parser input
1548  *
1549  * Returns an XML_ENC_ERR code.
1550  */
1551 int
xmlCharEncInput(xmlParserInputBufferPtr input,size_t * sizeOut)1552 xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
1553 {
1554     xmlBufPtr out, in;
1555     const xmlChar *dataIn;
1556     size_t availIn;
1557     size_t maxOut;
1558     size_t totalIn, totalOut;
1559     int ret;
1560 
1561     out = input->buffer;
1562     in = input->raw;
1563 
1564     maxOut = *sizeOut;
1565     totalOut = 0;
1566 
1567     *sizeOut = 0;
1568 
1569     availIn = xmlBufUse(in);
1570     if (availIn == 0)
1571         return(0);
1572     dataIn = xmlBufContent(in);
1573     totalIn = 0;
1574 
1575     while (1) {
1576         size_t availOut;
1577         int completeOut, completeIn;
1578         int c_out, c_in;
1579 
1580         availOut = xmlBufAvail(out);
1581         if (availOut > INT_MAX / 2)
1582             availOut = INT_MAX / 2;
1583 
1584         if (availOut < maxOut) {
1585             c_out = availOut;
1586             completeOut = 0;
1587         } else {
1588             c_out = maxOut;
1589             completeOut = 1;
1590         }
1591 
1592         if (availIn > INT_MAX / 2) {
1593             c_in = INT_MAX / 2;
1594             completeIn = 0;
1595         } else {
1596             c_in = availIn;
1597             completeIn = 1;
1598         }
1599 
1600         ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
1601                                dataIn, &c_in);
1602 
1603         totalIn += c_in;
1604         dataIn += c_in;
1605         availIn -= c_in;
1606 
1607         totalOut += c_out;
1608         maxOut -= c_out;
1609         xmlBufAddLen(out, c_out);
1610 
1611         if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) {
1612             input->error = xmlEncConvertError(ret);
1613             return(ret);
1614         }
1615 
1616         if ((completeOut) && (completeIn))
1617             break;
1618         if ((completeOut) && (ret == XML_ENC_ERR_SPACE))
1619             break;
1620         if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS))
1621             break;
1622 
1623         if (ret == XML_ENC_ERR_SPACE) {
1624             if (xmlBufGrow(out, 4096) < 0) {
1625                 input->error = XML_ERR_NO_MEMORY;
1626                 return(XML_ENC_ERR_MEMORY);
1627             }
1628         }
1629     }
1630 
1631     xmlBufShrink(in, totalIn);
1632 
1633     if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn)
1634         input->rawconsumed = ULONG_MAX;
1635     else
1636         input->rawconsumed += totalIn;
1637 
1638     *sizeOut = totalOut;
1639     return(XML_ERR_OK);
1640 }
1641 
1642 /**
1643  * xmlCharEncInFunc:
1644  * @handler:	char encoding transformation data structure
1645  * @out:  an xmlBuffer for the output.
1646  * @in:  an xmlBuffer for the input
1647  *
1648  * Generic front-end for the encoding handler input function
1649  *
1650  * Returns the number of bytes written or an XML_ENC_ERR code.
1651  */
1652 int
xmlCharEncInFunc(xmlCharEncodingHandler * handler,xmlBufferPtr out,xmlBufferPtr in)1653 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
1654                  xmlBufferPtr in)
1655 {
1656     int ret;
1657     int written;
1658     int toconv;
1659 
1660     if (handler == NULL)
1661         return(XML_ENC_ERR_INTERNAL);
1662     if (out == NULL)
1663         return(XML_ENC_ERR_INTERNAL);
1664     if (in == NULL)
1665         return(XML_ENC_ERR_INTERNAL);
1666 
1667     toconv = in->use;
1668     if (toconv == 0)
1669         return (0);
1670     written = out->size - out->use -1; /* count '\0' */
1671     if (toconv * 2 >= written) {
1672         xmlBufferGrow(out, out->size + toconv * 2);
1673         written = out->size - out->use - 1;
1674     }
1675     ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
1676                            in->content, &toconv);
1677     xmlBufferShrink(in, toconv);
1678     out->use += written;
1679     out->content[out->use] = 0;
1680 
1681     return (written? written : ret);
1682 }
1683 
1684 #ifdef LIBXML_OUTPUT_ENABLED
1685 /**
1686  * xmlCharEncOutput:
1687  * @output: a parser output buffer
1688  * @init: is this an initialization call without data
1689  *
1690  * Generic front-end for the encoding handler on parser output
1691  * a first call with @init == 1 has to be made first to initiate the
1692  * output in case of non-stateless encoding needing to initiate their
1693  * state or the output (like the BOM in UTF16).
1694  * In case of UTF8 sequence conversion errors for the given encoder,
1695  * the content will be automatically remapped to a CharRef sequence.
1696  *
1697  * Returns the number of bytes written or an XML_ENC_ERR code.
1698  */
1699 int
xmlCharEncOutput(xmlOutputBufferPtr output,int init)1700 xmlCharEncOutput(xmlOutputBufferPtr output, int init)
1701 {
1702     int ret;
1703     size_t written;
1704     int writtentot = 0;
1705     size_t toconv;
1706     int c_in;
1707     int c_out;
1708     xmlBufPtr in;
1709     xmlBufPtr out;
1710 
1711     if ((output == NULL) || (output->encoder == NULL) ||
1712         (output->buffer == NULL) || (output->conv == NULL))
1713         return(XML_ENC_ERR_INTERNAL);
1714     out = output->conv;
1715     in = output->buffer;
1716 
1717 retry:
1718 
1719     written = xmlBufAvail(out);
1720 
1721     /*
1722      * First specific handling of the initialization call
1723      */
1724     if (init) {
1725         c_in = 0;
1726         c_out = written;
1727         /* TODO: Check return value. */
1728         xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1729                           NULL, &c_in);
1730         xmlBufAddLen(out, c_out);
1731         return(c_out);
1732     }
1733 
1734     /*
1735      * Conversion itself.
1736      */
1737     toconv = xmlBufUse(in);
1738     if (toconv > 64 * 1024)
1739         toconv = 64 * 1024;
1740     if (toconv * 4 >= written) {
1741         if (xmlBufGrow(out, toconv * 4) < 0) {
1742             ret = XML_ENC_ERR_MEMORY;
1743             goto error;
1744         }
1745         written = xmlBufAvail(out);
1746     }
1747     if (written > 256 * 1024)
1748         written = 256 * 1024;
1749 
1750     c_in = toconv;
1751     c_out = written;
1752     ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1753                             xmlBufContent(in), &c_in);
1754     xmlBufShrink(in, c_in);
1755     xmlBufAddLen(out, c_out);
1756     writtentot += c_out;
1757 
1758     if (ret == XML_ENC_ERR_SPACE)
1759         goto retry;
1760 
1761     /*
1762      * Attempt to handle error cases
1763      */
1764     if (ret == XML_ENC_ERR_INPUT) {
1765         xmlChar charref[20];
1766         int len = xmlBufUse(in);
1767         xmlChar *content = xmlBufContent(in);
1768         int cur, charrefLen;
1769 
1770         cur = xmlGetUTF8Char(content, &len);
1771         if (cur <= 0)
1772             goto error;
1773 
1774         /*
1775          * Removes the UTF8 sequence, and replace it by a charref
1776          * and continue the transcoding phase, hoping the error
1777          * did not mangle the encoder state.
1778          */
1779         charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
1780         xmlBufGrow(out, charrefLen * 4);
1781         c_out = xmlBufAvail(out);
1782         c_in = charrefLen;
1783         ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1784                                 charref, &c_in);
1785         if ((ret < 0) || (c_in != charrefLen)) {
1786             ret = XML_ENC_ERR_INTERNAL;
1787             goto error;
1788         }
1789 
1790         xmlBufShrink(in, len);
1791         xmlBufAddLen(out, c_out);
1792         writtentot += c_out;
1793         goto retry;
1794     }
1795 
1796 error:
1797     if (((writtentot <= 0) && (ret != 0)) ||
1798         (ret == XML_ENC_ERR_MEMORY)) {
1799         if (output->error == 0)
1800             output->error = xmlEncConvertError(ret);
1801         return(ret);
1802     }
1803 
1804     return(writtentot);
1805 }
1806 #endif
1807 
1808 /**
1809  * xmlCharEncOutFunc:
1810  * @handler:	char encoding transformation data structure
1811  * @out:  an xmlBuffer for the output.
1812  * @in:  an xmlBuffer for the input
1813  *
1814  * Generic front-end for the encoding handler output function
1815  * a first call with @in == NULL has to be made firs to initiate the
1816  * output in case of non-stateless encoding needing to initiate their
1817  * state or the output (like the BOM in UTF16).
1818  * In case of UTF8 sequence conversion errors for the given encoder,
1819  * the content will be automatically remapped to a CharRef sequence.
1820  *
1821  * Returns the number of bytes written or an XML_ENC_ERR code.
1822  */
1823 int
xmlCharEncOutFunc(xmlCharEncodingHandler * handler,xmlBufferPtr out,xmlBufferPtr in)1824 xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1825                   xmlBufferPtr in) {
1826     int ret;
1827     int written;
1828     int writtentot = 0;
1829     int toconv;
1830 
1831     if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
1832     if (out == NULL) return(XML_ENC_ERR_INTERNAL);
1833 
1834 retry:
1835 
1836     written = out->size - out->use;
1837 
1838     if (written > 0)
1839 	written--; /* Gennady: count '/0' */
1840 
1841     /*
1842      * First specific handling of in = NULL, i.e. the initialization call
1843      */
1844     if (in == NULL) {
1845         toconv = 0;
1846         /* TODO: Check return value. */
1847         xmlEncOutputChunk(handler, &out->content[out->use], &written,
1848                           NULL, &toconv);
1849         out->use += written;
1850         out->content[out->use] = 0;
1851         return(0);
1852     }
1853 
1854     /*
1855      * Conversion itself.
1856      */
1857     toconv = in->use;
1858     if (toconv * 4 >= written) {
1859         xmlBufferGrow(out, toconv * 4);
1860 	written = out->size - out->use - 1;
1861     }
1862     ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
1863                             in->content, &toconv);
1864     xmlBufferShrink(in, toconv);
1865     out->use += written;
1866     writtentot += written;
1867     out->content[out->use] = 0;
1868 
1869     if (ret == XML_ENC_ERR_SPACE)
1870         goto retry;
1871 
1872     /*
1873      * Attempt to handle error cases
1874      */
1875     if (ret == XML_ENC_ERR_INPUT) {
1876         xmlChar charref[20];
1877         int len = in->use;
1878         const xmlChar *utf = (const xmlChar *) in->content;
1879         int cur, charrefLen;
1880 
1881         cur = xmlGetUTF8Char(utf, &len);
1882         if (cur <= 0)
1883             return(ret);
1884 
1885         /*
1886          * Removes the UTF8 sequence, and replace it by a charref
1887          * and continue the transcoding phase, hoping the error
1888          * did not mangle the encoder state.
1889          */
1890         charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
1891         xmlBufferShrink(in, len);
1892         xmlBufferGrow(out, charrefLen * 4);
1893         written = out->size - out->use - 1;
1894         toconv = charrefLen;
1895         ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
1896                                 charref, &toconv);
1897         if ((ret < 0) || (toconv != charrefLen))
1898             return(XML_ENC_ERR_INTERNAL);
1899 
1900         out->use += written;
1901         writtentot += written;
1902         out->content[out->use] = 0;
1903         goto retry;
1904     }
1905     return(writtentot ? writtentot : ret);
1906 }
1907 
1908 /**
1909  * xmlCharEncCloseFunc:
1910  * @handler:	char encoding transformation data structure
1911  *
1912  * Releases an xmlCharEncodingHandler. Must be called after
1913  * a handler is no longer in use.
1914  *
1915  * Returns 0.
1916  */
1917 int
xmlCharEncCloseFunc(xmlCharEncodingHandler * handler)1918 xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1919     if (handler == NULL)
1920         return(0);
1921 
1922     if (handler->flags & XML_HANDLER_STATIC)
1923         return(0);
1924 
1925     xmlFree(handler->name);
1926     if (handler->ctxtDtor != NULL) {
1927         handler->ctxtDtor(handler->inputCtxt);
1928         handler->ctxtDtor(handler->outputCtxt);
1929     }
1930     xmlFree(handler);
1931     return(0);
1932 }
1933 
1934 /**
1935  * xmlByteConsumed:
1936  * @ctxt: an XML parser context
1937  *
1938  * DEPRECATED: Don't use.
1939  *
1940  * This function provides the current index of the parser relative
1941  * to the start of the current entity. This function is computed in
1942  * bytes from the beginning starting at zero and finishing at the
1943  * size in byte of the file if parsing a file. The function is
1944  * of constant cost if the input is UTF-8 but can be costly if run
1945  * on non-UTF-8 input.
1946  *
1947  * Returns the index in bytes from the beginning of the entity or -1
1948  *         in case the index could not be computed.
1949  */
1950 long
xmlByteConsumed(xmlParserCtxtPtr ctxt)1951 xmlByteConsumed(xmlParserCtxtPtr ctxt) {
1952     xmlParserInputPtr in;
1953 
1954     if (ctxt == NULL)
1955         return(-1);
1956     in = ctxt->input;
1957     if (in == NULL)
1958         return(-1);
1959 
1960     if ((in->buf != NULL) && (in->buf->encoder != NULL)) {
1961         int unused = 0;
1962 	xmlCharEncodingHandler * handler = in->buf->encoder;
1963 
1964         /*
1965 	 * Encoding conversion, compute the number of unused original
1966 	 * bytes from the input not consumed and subtract that from
1967 	 * the raw consumed value, this is not a cheap operation
1968 	 */
1969         if (in->end - in->cur > 0) {
1970 	    unsigned char *convbuf;
1971 	    const unsigned char *cur = (const unsigned char *)in->cur;
1972 	    int toconv, ret;
1973 
1974             convbuf = xmlMalloc(32000);
1975             if (convbuf == NULL)
1976                 return(-1);
1977 
1978             toconv = in->end - cur;
1979             unused = 32000;
1980             ret = xmlEncOutputChunk(handler, convbuf, &unused, cur, &toconv);
1981 
1982             xmlFree(convbuf);
1983 
1984             if (ret != XML_ENC_ERR_SUCCESS)
1985                 return(-1);
1986 	}
1987 
1988 	if (in->buf->rawconsumed < (unsigned long) unused)
1989 	    return(-1);
1990 	return(in->buf->rawconsumed - unused);
1991     }
1992 
1993     return(in->consumed + (in->cur - in->base));
1994 }
1995 
1996 /************************************************************************
1997  *									*
1998  *		Conversions To/From UTF8 encoding			*
1999  *									*
2000  ************************************************************************/
2001 
2002 static int
asciiToAscii(unsigned char * out,int * poutlen,const unsigned char * in,int * pinlen,void * vctxt ATTRIBUTE_UNUSED)2003 asciiToAscii(unsigned char* out, int *poutlen,
2004              const unsigned char* in, int *pinlen,
2005              void *vctxt ATTRIBUTE_UNUSED) {
2006     const unsigned char *inend;
2007     const unsigned char *instart = in;
2008     int inlen, outlen, ret;
2009 
2010     if (in == NULL) {
2011         *pinlen = 0;
2012         *poutlen = 0;
2013         return(XML_ENC_ERR_SUCCESS);
2014     }
2015 
2016     inlen = *pinlen;
2017     outlen = *poutlen;
2018 
2019     if (outlen < inlen) {
2020         inlen = outlen;
2021         ret = XML_ENC_ERR_SPACE;
2022     } else {
2023         ret = inlen;
2024     }
2025 
2026     inend = in + inlen;
2027     *poutlen = inlen;
2028     *pinlen = inlen;
2029 
2030     while (in < inend) {
2031 	unsigned c = *in;
2032 
2033         if (c >= 0x80) {
2034 	    *poutlen = in - instart;
2035 	    *pinlen = in - instart;
2036 	    return(XML_ENC_ERR_INPUT);
2037 	}
2038 
2039         in++;
2040 	*out++ = c;
2041     }
2042 
2043     return(ret);
2044 }
2045 
2046 static int
latin1ToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2047 latin1ToUTF8(unsigned char* out, int *outlen,
2048              const unsigned char* in, int *inlen,
2049              void *vctxt ATTRIBUTE_UNUSED) {
2050     unsigned char* outstart = out;
2051     const unsigned char* instart = in;
2052     unsigned char* outend;
2053     const unsigned char* inend;
2054     int ret = XML_ENC_ERR_SPACE;
2055 
2056     if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
2057 	return(XML_ENC_ERR_INTERNAL);
2058 
2059     outend = out + *outlen;
2060     inend = in + *inlen;
2061 
2062     while (in < inend) {
2063         unsigned c = *in;
2064 
2065 	if (c < 0x80) {
2066             if (out >= outend)
2067                 goto done;
2068             *out++ = c;
2069 	} else {
2070             if (outend - out < 2)
2071                 goto done;
2072 	    *out++ = (c >> 6) | 0xC0;
2073             *out++ = (c & 0x3F) | 0x80;
2074         }
2075 
2076         in++;
2077     }
2078 
2079     ret = out - outstart;
2080 
2081 done:
2082     *outlen = out - outstart;
2083     *inlen = in - instart;
2084     return(ret);
2085 }
2086 
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Convert a block of ISO Latin 1 chars from @in into a block of UTF-8
 * chars stored in @out. This is the public entry point; the work is
 * done by latin1ToUTF8() called without a context.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 *
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    int result;

    result = latin1ToUTF8(out, outlen, in, inlen, NULL);

    return(result);
}
2108 
2109 static int
UTF8ToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2110 UTF8ToUTF8(unsigned char* out, int *outlen,
2111            const unsigned char* in, int *inlen,
2112            void *vctxt ATTRIBUTE_UNUSED) {
2113     int len;
2114     int ret;
2115 
2116     if (in == NULL) {
2117         *inlen = 0;
2118         *outlen = 0;
2119         return(XML_ENC_ERR_SUCCESS);
2120     }
2121 
2122     if (*outlen < *inlen) {
2123 	len = *outlen;
2124         ret = XML_ENC_ERR_SPACE;
2125     } else {
2126 	len = *inlen;
2127         ret = len;
2128     }
2129 
2130     memcpy(out, in, len);
2131 
2132     *outlen = len;
2133     *inlen = len;
2134     return(ret);
2135 }
2136 
2137 
2138 #ifdef LIBXML_OUTPUT_ENABLED
2139 static int
UTF8ToLatin1(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2140 UTF8ToLatin1(unsigned char* out, int *outlen,
2141              const unsigned char* in, int *inlen,
2142              void *vctxt ATTRIBUTE_UNUSED) {
2143     const unsigned char* outend;
2144     const unsigned char* outstart = out;
2145     const unsigned char* instart = in;
2146     const unsigned char* inend;
2147     unsigned c;
2148     int ret = XML_ENC_ERR_SPACE;
2149 
2150     if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2151         return(XML_ENC_ERR_INTERNAL);
2152 
2153     if (in == NULL) {
2154         *inlen = 0;
2155         *outlen = 0;
2156         return(XML_ENC_ERR_SUCCESS);
2157     }
2158 
2159     inend = in + *inlen;
2160     outend = out + *outlen;
2161     while (in < inend) {
2162         if (out >= outend)
2163             goto done;
2164 
2165 	c = *in;
2166 
2167         if (c < 0x80) {
2168             *out++ = c;
2169         } else if ((c >= 0xC2) && (c <= 0xC3)) {
2170             if (inend - in < 2)
2171                 break;
2172             in++;
2173             *out++ = (unsigned char) ((c << 6) | (*in & 0x3F));
2174         } else {
2175             ret = XML_ENC_ERR_INPUT;
2176             goto done;
2177 	}
2178 
2179         in++;
2180     }
2181 
2182     ret = out - outstart;
2183 
2184 done:
2185     *outlen = out - outstart;
2186     *inlen = in - instart;
2187     return(ret);
2188 }
2189 
2190 /**
2191  * UTF8Toisolat1:
2192  * @out:  a pointer to an array of bytes to store the result
2193  * @outlen:  the length of @out
2194  * @in:  a pointer to an array of UTF-8 chars
2195  * @inlen:  the length of @in
2196  *
2197  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
2198  * block of chars out.
2199  *
2200  * Returns the number of bytes written or an XML_ENC_ERR code.
2201  *
2202  * The value of @inlen after return is the number of octets consumed
2203  *     if the return value is positive, else unpredictable.
2204  * The value of @outlen after return is the number of octets produced.
2205  */
2206 int
UTF8Toisolat1(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2207 UTF8Toisolat1(unsigned char* out, int *outlen,
2208               const unsigned char* in, int *inlen) {
2209     if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
2210         return(XML_ENC_ERR_INTERNAL);
2211 
2212     return(UTF8ToLatin1(out, outlen, in, inlen, NULL));
2213 }
2214 #endif /* LIBXML_OUTPUT_ENABLED */
2215 
2216 static int
UTF16LEToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2217 UTF16LEToUTF8(unsigned char *out, int *outlen,
2218               const unsigned char *in, int *inlen,
2219               void *vctxt ATTRIBUTE_UNUSED) {
2220     const unsigned char *instart = in;
2221     const unsigned char *inend = in + (*inlen & ~1);
2222     unsigned char *outstart = out;
2223     unsigned char *outend = out + *outlen;
2224     unsigned c, d;
2225     int ret = XML_ENC_ERR_SPACE;
2226 
2227     while (in < inend) {
2228         c = in[0] | (in[1] << 8);
2229 
2230         if (c < 0x80) {
2231             if (out >= outend)
2232                 goto done;
2233             out[0] = c;
2234             in += 2;
2235             out += 1;
2236         } else if (c < 0x800) {
2237             if (outend - out < 2)
2238                 goto done;
2239             out[0] = (c >> 6)   | 0xC0;
2240             out[1] = (c & 0x3F) | 0x80;
2241             in += 2;
2242             out += 2;
2243         } else if ((c & 0xF800) != 0xD800) {
2244             if (outend - out < 3)
2245                 goto done;
2246             out[0] =  (c >> 12)         | 0xE0;
2247             out[1] = ((c >>  6) & 0x3F) | 0x80;
2248             out[2] =  (c        & 0x3F) | 0x80;
2249             in += 2;
2250             out += 3;
2251         } else {
2252             /* Surrogate pair */
2253             if ((c & 0xFC00) != 0xD800) {
2254                 ret = XML_ENC_ERR_INPUT;
2255                 goto done;
2256             }
2257 	    if (inend - in < 4)
2258 		break;
2259             d = in[2] | (in[3] << 8);
2260             if ((d & 0xFC00) != 0xDC00) {
2261                 ret = XML_ENC_ERR_INPUT;
2262                 goto done;
2263             }
2264 	    if (outend - out < 4)
2265 		goto done;
2266             c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2267             out[0] =  (c >> 18)         | 0xF0;
2268             out[1] = ((c >> 12) & 0x3F) | 0x80;
2269             out[2] = ((c >>  6) & 0x3F) | 0x80;
2270             out[3] =  (c        & 0x3F) | 0x80;
2271             in += 4;
2272             out += 4;
2273         }
2274     }
2275 
2276     ret = out - outstart;
2277 
2278 done:
2279     *outlen = out - outstart;
2280     *inlen = in - instart;
2281     return(ret);
2282 }
2283 
2284 #ifdef LIBXML_OUTPUT_ENABLED
2285 static int
UTF8ToUTF16LE(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2286 UTF8ToUTF16LE(unsigned char *out, int *outlen,
2287               const unsigned char *in, int *inlen,
2288               void *vctxt ATTRIBUTE_UNUSED) {
2289     const unsigned char *instart = in;
2290     const unsigned char *inend;
2291     unsigned char *outstart = out;
2292     unsigned char *outend;
2293     unsigned c, d;
2294     int ret = XML_ENC_ERR_SPACE;
2295 
2296     /* UTF16LE encoding has no BOM */
2297     if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2298         return(XML_ENC_ERR_INTERNAL);
2299     if (in == NULL) {
2300 	*outlen = 0;
2301 	*inlen = 0;
2302 	return(0);
2303     }
2304     inend = in + *inlen;
2305     outend = out + (*outlen & ~1);
2306     while (in < inend) {
2307         c = in[0];
2308 
2309         if (c < 0x80) {
2310             if (out >= outend)
2311                 goto done;
2312             out[0] = c;
2313             out[1] = 0;
2314             in += 1;
2315             out += 2;
2316         } else {
2317             int i, len;
2318             unsigned min;
2319 
2320             if (c < 0xE0) {
2321                 if (c < 0xC2) {
2322                     ret = XML_ENC_ERR_INPUT;
2323                     goto done;
2324                 }
2325                 c &= 0x1F;
2326                 len = 2;
2327                 min = 0x80;
2328             } else if (c < 0xF0) {
2329                 c &= 0x0F;
2330                 len = 3;
2331                 min = 0x800;
2332             } else {
2333                 c &= 0x0F;
2334                 len = 4;
2335                 min = 0x10000;
2336             }
2337 
2338             if (inend - in < len)
2339                 break;
2340 
2341             for (i = 1; i < len; i++) {
2342                 if ((in[i] & 0xC0) != 0x80) {
2343                     ret = XML_ENC_ERR_INPUT;
2344                     goto done;
2345                 }
2346                 c = (c << 6) | (in[i] & 0x3F);
2347             }
2348 
2349             if ((c < min) ||
2350                 ((c >= 0xD800) && (c <= 0xDFFF)) ||
2351                 (c > 0x10FFFF)) {
2352                 ret = XML_ENC_ERR_INPUT;
2353                 goto done;
2354             }
2355 
2356             if (c < 0x10000) {
2357                 if (out >= outend)
2358                     goto done;
2359                 out[0] = c & 0xFF;
2360                 out[1] = c >> 8;
2361                 out += 2;
2362             } else {
2363                 if (outend - out < 4)
2364                     goto done;
2365                 c -= 0x10000;
2366                 d = (c & 0x03FF) | 0xDC00;
2367                 c = (c >> 10)    | 0xD800;
2368                 out[0] = c & 0xFF;
2369                 out[1] = c >> 8;
2370                 out[2] = d & 0xFF;
2371                 out[3] = d >> 8;
2372                 out += 4;
2373             }
2374 
2375             in += len;
2376         }
2377     }
2378 
2379     ret = out - outstart;
2380 
2381 done:
2382     *outlen = out - outstart;
2383     *inlen = in - instart;
2384     return(ret);
2385 }
2386 
2387 static int
UTF8ToUTF16(unsigned char * outb,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2388 UTF8ToUTF16(unsigned char* outb, int *outlen,
2389             const unsigned char* in, int *inlen,
2390             void *vctxt ATTRIBUTE_UNUSED) {
2391     if (in == NULL) {
2392 	/*
2393 	 * initialization, add the Byte Order Mark for UTF-16LE
2394 	 */
2395         if (*outlen >= 2) {
2396 	    outb[0] = 0xFF;
2397 	    outb[1] = 0xFE;
2398 	    *outlen = 2;
2399 	    *inlen = 0;
2400 	    return(2);
2401 	}
2402 	*outlen = 0;
2403 	*inlen = 0;
2404 	return(0);
2405     }
2406     return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));
2407 }
2408 #endif /* LIBXML_OUTPUT_ENABLED */
2409 
2410 static int
UTF16BEToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2411 UTF16BEToUTF8(unsigned char *out, int *outlen,
2412               const unsigned char *in, int *inlen,
2413               void *vctxt ATTRIBUTE_UNUSED) {
2414     const unsigned char *instart = in;
2415     const unsigned char *inend = in + (*inlen & ~1);
2416     unsigned char *outstart = out;
2417     unsigned char *outend = out + *outlen;
2418     unsigned c, d;
2419     int ret = XML_ENC_ERR_SPACE;
2420 
2421     while (in < inend) {
2422         c = (in[0] << 8) | in[1];
2423 
2424         if (c < 0x80) {
2425             if (out >= outend)
2426                 goto done;
2427             out[0] = c;
2428             in += 2;
2429             out += 1;
2430         } else if (c < 0x800) {
2431             if (outend - out < 2)
2432                 goto done;
2433             out[0] = (c >> 6)   | 0xC0;
2434             out[1] = (c & 0x3F) | 0x80;
2435             in += 2;
2436             out += 2;
2437         } else if ((c & 0xF800) != 0xD800) {
2438             if (outend - out < 3)
2439                 goto done;
2440             out[0] =  (c >> 12)         | 0xE0;
2441             out[1] = ((c >>  6) & 0x3F) | 0x80;
2442             out[2] =  (c        & 0x3F) | 0x80;
2443             in += 2;
2444             out += 3;
2445         } else {
2446             /* Surrogate pair */
2447             if ((c & 0xFC00) != 0xD800) {
2448                 ret = XML_ENC_ERR_INPUT;
2449                 goto done;
2450             }
2451 	    if (inend - in < 4)
2452 		break;
2453             d = (in[2] << 8) | in[3];
2454             if ((d & 0xFC00) != 0xDC00) {
2455                 ret = XML_ENC_ERR_INPUT;
2456                 goto done;
2457             }
2458 	    if (outend - out < 4)
2459 		goto done;
2460             c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2461             out[0] =  (c >> 18)         | 0xF0;
2462             out[1] = ((c >> 12) & 0x3F) | 0x80;
2463             out[2] = ((c >>  6) & 0x3F) | 0x80;
2464             out[3] =  (c        & 0x3F) | 0x80;
2465             in += 4;
2466             out += 4;
2467         }
2468     }
2469 
2470     ret = out - outstart;
2471 
2472 done:
2473     *outlen = out - outstart;
2474     *inlen = in - instart;
2475     return(ret);
2476 }
2477 
2478 #ifdef LIBXML_OUTPUT_ENABLED
2479 static int
UTF8ToUTF16BE(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2480 UTF8ToUTF16BE(unsigned char *out, int *outlen,
2481               const unsigned char *in, int *inlen,
2482               void *vctxt ATTRIBUTE_UNUSED) {
2483     const unsigned char *instart = in;
2484     const unsigned char *inend;
2485     unsigned char *outstart = out;
2486     unsigned char *outend;
2487     unsigned c, d;
2488     int ret = XML_ENC_ERR_SPACE;
2489 
2490     /* UTF-16BE has no BOM */
2491     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2492     if (in == NULL) {
2493 	*outlen = 0;
2494 	*inlen = 0;
2495 	return(0);
2496     }
2497     inend = in + *inlen;
2498     outend = out + (*outlen & ~1);
2499     while (in < inend) {
2500         c = in[0];
2501 
2502         if (c < 0x80) {
2503             if (out >= outend)
2504                 goto done;
2505             out[0] = 0;
2506             out[1] = c;
2507             in += 1;
2508             out += 2;
2509         } else {
2510             int i, len;
2511             unsigned min;
2512 
2513             if (c < 0xE0) {
2514                 if (c < 0xC2) {
2515                     ret = XML_ENC_ERR_INPUT;
2516                     goto done;
2517                 }
2518                 c &= 0x1F;
2519                 len = 2;
2520                 min = 0x80;
2521             } else if (c < 0xF0) {
2522                 c &= 0x0F;
2523                 len = 3;
2524                 min = 0x800;
2525             } else {
2526                 c &= 0x0F;
2527                 len = 4;
2528                 min = 0x10000;
2529             }
2530 
2531             if (inend - in < len)
2532                 break;
2533 
2534             for (i = 1; i < len; i++) {
2535                 if ((in[i] & 0xC0) != 0x80) {
2536                     ret = XML_ENC_ERR_INPUT;
2537                     goto done;
2538                 }
2539                 c = (c << 6) | (in[i] & 0x3F);
2540             }
2541 
2542             if ((c < min) ||
2543                 ((c >= 0xD800) && (c <= 0xDFFF)) ||
2544                 (c > 0x10FFFF)) {
2545                 ret = XML_ENC_ERR_INPUT;
2546                 goto done;
2547             }
2548 
2549             if (c < 0x10000) {
2550                 if (out >= outend)
2551                     goto done;
2552                 out[0] = c >> 8;
2553                 out[1] = c & 0xFF;
2554                 out += 2;
2555             } else {
2556                 if (outend - out < 4)
2557                     goto done;
2558                 c -= 0x10000;
2559                 d = (c & 0x03FF) | 0xDC00;
2560                 c = (c >> 10)    | 0xD800;
2561                 out[0] = c >> 8;
2562                 out[1] = c & 0xFF;
2563                 out[2] = d >> 8;
2564                 out[3] = d & 0xFF;
2565                 out += 4;
2566             }
2567 
2568             in += len;
2569         }
2570     }
2571 
2572     ret = out - outstart;
2573 
2574 done:
2575     *outlen = out - outstart;
2576     *inlen = in - instart;
2577     return(ret);
2578 }
2579 #endif /* LIBXML_OUTPUT_ENABLED */
2580 
2581 #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
2582 static int
UTF8ToHtmlWrapper(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt ATTRIBUTE_UNUSED)2583 UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
2584                   const unsigned char *in, int *inlen,
2585                   void *vctxt ATTRIBUTE_UNUSED) {
2586     return(UTF8ToHtml(out, outlen, in, inlen));
2587 }
2588 #endif
2589 
2590 #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
2591     defined(LIBXML_ISO8859X_ENABLED)
2592 
2593 static int
UTF8ToISO8859x(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)2594 UTF8ToISO8859x(unsigned char *out, int *outlen,
2595                const unsigned char *in, int *inlen, void *vctxt) {
2596     const unsigned char *xlattable = vctxt;
2597     const unsigned char *instart = in;
2598     const unsigned char *inend;
2599     unsigned char *outstart = out;
2600     unsigned char *outend;
2601     int ret = XML_ENC_ERR_SPACE;
2602 
2603     if (in == NULL) {
2604         /*
2605         * initialization nothing to do
2606         */
2607         *outlen = 0;
2608         *inlen = 0;
2609         return(XML_ENC_ERR_SUCCESS);
2610     }
2611 
2612     inend = in + *inlen;
2613     outend = out + *outlen;
2614     while (in < inend) {
2615         unsigned d = *in;
2616 
2617         if  (d < 0x80)  {
2618             if (out >= outend)
2619                 goto done;
2620             in += 1;
2621         } else if (d < 0xE0) {
2622             unsigned c;
2623 
2624             if (inend - in < 2)
2625                 break;
2626             c = in[1] & 0x3F;
2627             d = d & 0x1F;
2628             d = xlattable [48 + c + xlattable [d] * 64];
2629             if (d == 0) {
2630                 /* not in character set */
2631                 ret = XML_ENC_ERR_INPUT;
2632                 goto done;
2633             }
2634             if (out >= outend)
2635                 goto done;
2636             in += 2;
2637         } else if (d < 0xF0) {
2638             unsigned c1;
2639             unsigned c2;
2640 
2641             if (inend - in < 3)
2642                 break;
2643             c1 = in[1] & 0x3F;
2644             c2 = in[2] & 0x3F;
2645 	    d = d & 0x0F;
2646 	    d = xlattable [48 + c2 + xlattable [48 + c1 +
2647 			xlattable [32 + d] * 64] * 64];
2648             if (d == 0) {
2649                 /* not in character set */
2650                 ret = XML_ENC_ERR_INPUT;
2651                 goto done;
2652             }
2653             if (out >= outend)
2654                 goto done;
2655             in += 3;
2656         } else {
2657             /* cannot transcode >= U+010000 */
2658                 ret = XML_ENC_ERR_INPUT;
2659                 goto done;
2660         }
2661 
2662         *out++ = d;
2663     }
2664 
2665     ret = out - outstart;
2666 
2667 done:
2668     *outlen = out - outstart;
2669     *inlen = in - instart;
2670     return(ret);
2671 }
2672 
2673 static int
ISO8859xToUTF8(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)2674 ISO8859xToUTF8(unsigned char* out, int *outlen,
2675                const unsigned char* in, int *inlen, void *vctxt) {
2676     unsigned short const *unicodetable = vctxt;
2677     const unsigned char* instart = in;
2678     const unsigned char* inend;
2679     unsigned char* outstart = out;
2680     unsigned char* outend;
2681     int ret = XML_ENC_ERR_SPACE;
2682 
2683     outend = out + *outlen;
2684     inend = in + *inlen;
2685 
2686     while (in < inend) {
2687         unsigned c = *in;
2688 
2689         if (c < 0x80) {
2690             if (out >= outend)
2691                 goto done;
2692             *out++ = c;
2693         } else {
2694             c = unicodetable[c - 0x80];
2695             if (c == 0) {
2696                 /* undefined code point */
2697                 ret = XML_ENC_ERR_INPUT;
2698                 goto done;
2699             }
2700             if (c < 0x800) {
2701                 if (outend - out < 2)
2702                     goto done;
2703                 *out++ = ((c >>  6) & 0x1F) | 0xC0;
2704                 *out++ = (c & 0x3F) | 0x80;
2705             } else {
2706                 if (outend - out < 3)
2707                     goto done;
2708                 *out++ = ((c >>  12) & 0x0F) | 0xE0;
2709                 *out++ = ((c >>  6) & 0x3F) | 0x80;
2710                 *out++ = (c & 0x3F) | 0x80;
2711             }
2712         }
2713 
2714         in += 1;
2715     }
2716 
2717     ret = out - outstart;
2718 
2719 done:
2720     *outlen = out - outstart;
2721     *inlen = in - instart;
2722     return(ret);
2723 }
2724 
2725 #endif
2726 
2727