xref: /aosp_15_r20/external/flac/src/share/utf8/iconvert.c (revision 600f14f40d737144c998e2ec7a483122d3776fbc)
1 /*
2  * Copyright (C) 2001 Edmund Grimley Evans <[email protected]>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License along
15  * with this program; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17  */
18 
19 #ifdef HAVE_CONFIG_H
20 #  include <config.h>
21 #endif
22 
23 #if !defined _WIN32 && defined HAVE_ICONV
24 
25 #include <assert.h>
26 #include <errno.h>
27 #include <iconv.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "iconvert.h"
33 #include "share/alloc.h"
34 #include "share/safe_str.h"
35 
/*
 * Return non-zero iff NAME is exactly "UTF-8", ignoring the case of the
 * letters.
 *
 * Plain character comparisons are used on purpose:
 *  - strcasecmp() is locale-dependent;
 *  - strchr("Uu", c) also matches c == '\0' (the terminator is part of
 *    the searched string), which would let the scan run past the end of
 *    a name shorter than five characters.
 * The && chain short-circuits at the first mismatch, so no byte after
 * the terminating '\0' is ever read.
 */
static int is_utf8_name(const char *name)
{
  return (name[0] == 'U' || name[0] == 'u') &&
         (name[1] == 'T' || name[1] == 't') &&
         (name[2] == 'F' || name[2] == 'f') &&
         name[3] == '-' &&
         name[4] == '8' &&
         name[5] == '\0';
}

/*
 * Advance *P past exactly one UTF-8 character: its first byte plus any
 * continuation bytes (10xxxxxx) that follow. *LEFT must be non-zero on
 * entry and is decremented by the number of bytes skipped.
 */
static void skip_utf8_char(char **p, size_t *left)
{
  ++*p, --*left;
  while (*left && ((unsigned char)**p & 0xC0) == 0x80)
    ++*p, --*left;
}

/*
 * Convert data from one encoding to another.
 *
 *  fromcode, tocode : iconv names of the source and target encodings
 *  from, fromlen    : input bytes and their count
 *  to               : if non-NULL, receives a heap-allocated,
 *                     NUL-terminated buffer holding the converted data;
 *                     ownership passes to the caller
 *  tolen            : if non-NULL, receives the length of the converted
 *                     data, not counting the terminating NUL
 *
 * *to and *tolen are not meaningful when a negative value is returned.
 *
 * Return:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : data was converted inexactly
 *   2 : data was invalid (but still converted)
 *
 * Invalid input bytes are replaced by '#'; characters the target
 * encoding cannot represent are replaced by '?'.
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 */

int iconvert(const char *fromcode, const char *tocode,
             const char *from, size_t fromlen,
             char **to, size_t *tolen)
{
  int ret = 0;
  iconv_t cd1, cd2;
  char *ib;
  char *ob;
  char *utfbuf = NULL, *outbuf, *newbuf;
  size_t utflen, outlen, ibl, obl, obp, k;
  char tbuf[2048];

  cd1 = iconv_open("UTF-8", fromcode);
  if (cd1 == (iconv_t)(-1))
    return -1;

  /* cd2 stays (iconv_t)(-1) when the target is UTF-8 itself. */
  cd2 = (iconv_t)(-1);
  if (!is_utf8_name(tocode)) {
    char *tocode1;
    int rc;
    /*
     * Try using this non-standard feature of glibc and libiconv.
     * This is deliberately not a config option as people often
     * change their iconv library without rebuilding applications.
     */

    rc = asprintf(&tocode1, "%s//TRANSLIT", tocode);
    if (rc < 0 || !tocode1)
      goto fail;

    cd2 = iconv_open(tocode1, "UTF-8");
    free(tocode1);

    /*
     * Fall back to a plain conversion. The source must be "UTF-8", not
     * fromcode: cd2 is only ever fed the intermediate UTF-8 buffer.
     */
    if (cd2 == (iconv_t)(-1))
      cd2 = iconv_open(tocode, "UTF-8");

    if (cd2 == (iconv_t)(-1)) {
      iconv_close(cd1);
      return -1;
    }
  }

  utflen = 1; /*fromlen * 2 + 1; XXX */
  utfbuf = malloc(utflen);
  if (!utfbuf)
    goto fail;

  /* Convert to UTF-8 */
  ib = (char *)from; /* iconv() wants char **; the input is not written */
  ibl = fromlen;
  ob = utfbuf;
  obl = utflen;
  for (;;) {
    k = iconv(cd1, &ib, &ibl, &ob, &obl);
    assert((!k && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
           (k == (size_t)(-1) &&
            (errno == EILSEQ || errno == EINVAL) && ibl));
    if (!ibl)
      break;
    if (obl < 6) {
      /* Enlarge the buffer */
      if (utflen * 2 < utflen) /* overflow check */
        goto fail;
      utflen *= 2;
      obp = ob - utfbuf; /* save position */
      newbuf = realloc(utfbuf, utflen);
      if (!newbuf)
        goto fail;
      ob = newbuf + obp;
      obl = utflen - obp;
      utfbuf = newbuf;
    }
    else {
      /* Invalid input: drop one byte, emit '#', reset the shift state */
      ib++, ibl--;
      *ob++ = '#', obl--;
      ret = 2;
      iconv(cd1, NULL, NULL, NULL, NULL);
    }
  }

  if (cd2 == (iconv_t)(-1)) {
    /* The target encoding was UTF-8 */
    if (tolen)
      *tolen = ob - utfbuf;
    if (!to) {
      free(utfbuf);
      iconv_close(cd1);
      return ret;
    }
    /* Resize to the data length plus room for the terminator */
    newbuf = safe_realloc_nofree_add_2op_(utfbuf, (ob - utfbuf), /*+*/1);
    if (!newbuf)
      goto fail;
    ob = (ob - utfbuf) + newbuf;
    *ob = '\0';
    *to = newbuf;
    iconv_close(cd1);
    return ret;
  }

  /*
   * Truncate the buffer to be tidy. Empty data is left alone: a
   * zero-size realloc() must be avoided, and empty input is valid data
   * rather than an allocation failure.
   */
  utflen = ob - utfbuf;
  if (utflen != 0) {
    newbuf = realloc(utfbuf, utflen);
    if (!newbuf)
      goto fail;
    utfbuf = newbuf;
  }

  /* Convert from UTF-8 to discover how long the output is */
  outlen = 0;
  ib = utfbuf;
  ibl = utflen;
  while (ibl) {
    ob = tbuf;
    obl = sizeof(tbuf);
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      char *tb = "?";
      size_t tbl = 1;

      outlen += ob - tbuf;
      ob = tbuf;
      obl = sizeof(tbuf);
      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
             (k == (size_t)(-1) && errno == EILSEQ && tbl));
      skip_utf8_char(&ib, &ibl);
    }
    outlen += ob - tbuf;
  }
  /* Account for whatever is needed to return to the initial shift state */
  ob = tbuf;
  obl = sizeof(tbuf);
  k = iconv(cd2, NULL, NULL, &ob, &obl);
  assert(!k);
  outlen += ob - tbuf;

  /* Convert from UTF-8 for real */
  outbuf = safe_malloc_add_2op_(outlen, /*+*/1);
  if (!outbuf)
    goto fail;
  ib = utfbuf;
  ibl = utflen;
  ob = outbuf;
  obl = outlen;
  while (ibl) {
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    /* Any non-zero result (error or irreversible conversions) is inexact */
    if (k && !ret)
      ret = 1;
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character; must mirror the measuring pass above */
      char *tb = "?";
      size_t tbl = 1;

      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
             (k == (size_t)(-1) && errno == EILSEQ && tbl));
      skip_utf8_char(&ib, &ibl);
    }
  }
  k = iconv(cd2, NULL, NULL, &ob, &obl);
  assert(!k);
  assert(!obl); /* the measured length must have been used up exactly */
  *ob = '\0';

  free(utfbuf);
  iconv_close(cd1);
  iconv_close(cd2);
  if (tolen)
    *tolen = outlen;
  if (!to) {
    free(outbuf);
    return ret;
  }
  *to = outbuf;
  return ret;

 fail:
  free(utfbuf); /* free(NULL) is a no-op */
  iconv_close(cd1);
  if (cd2 != (iconv_t)(-1))
    iconv_close(cd2);
  return -2;
}
256 
257 #endif /* HAVE_ICONV */
258