1 /*
2 * Copyright (C) 2001 Edmund Grimley Evans <[email protected]>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22
23 #if !defined _WIN32 && defined HAVE_ICONV
24
25 #include <assert.h>
26 #include <errno.h>
27 #include <iconv.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "iconvert.h"
33 #include "share/alloc.h"
34 #include "share/safe_str.h"
35
36 /*
37 * Convert data from one encoding to another. Return:
38 *
39 * -2 : memory allocation failed
40 * -1 : unknown encoding
41 * 0 : data was converted exactly
42 * 1 : data was converted inexactly
43 * 2 : data was invalid (but still converted)
44 *
45 * We convert in two steps, via UTF-8, as this is the only
46 * reliable way of distinguishing between invalid input
47 * and valid input which iconv refuses to transliterate.
48 * We convert from UTF-8 twice, because we have no way of
49 * knowing whether the conversion was exact if iconv returns
50 * E2BIG (due to a bug in the specification of iconv).
51 * An alternative approach is to assume that the output of
52 * iconv is never more than 4 times as long as the input,
53 * but I prefer to avoid that assumption if possible.
54 */
55
/*
 * Parameters:
 *
 *  fromcode, tocode : encoding names, handed to iconv_open()
 *  from, fromlen    : input bytes and their count
 *  to               : if non-NULL, receives a newly allocated,
 *                     NUL-terminated result; the caller owns it
 *  tolen            : if non-NULL, receives the result length in
 *                     bytes, not counting the terminating NUL
 *
 * *to is written only when the return value is >= 0.
 */
int iconvert(const char *fromcode, const char *tocode,
	     const char *from, size_t fromlen,
	     char **to, size_t *tolen)
{
	int ret = 0;
	iconv_t cd1, cd2;
	char *ib;
	char *ob;
	char *utfbuf = NULL, *outbuf, *newbuf;
	size_t utflen, outlen, ibl, obl, obp, k;
	char tbuf[2048];

	cd1 = iconv_open("UTF-8", fromcode);
	if (cd1 == (iconv_t)(-1))
		return -1;

	cd2 = (iconv_t)(-1);
	/*
	 * Don't use strcasecmp() as it's locale-dependent.
	 * The && chain stops at the first mismatch (an early NUL included),
	 * so a short tocode is never indexed past its terminator.
	 */
	if (!((tocode[0] == 'U' || tocode[0] == 'u') &&
	      (tocode[1] == 'T' || tocode[1] == 't') &&
	      (tocode[2] == 'F' || tocode[2] == 'f') &&
	      tocode[3] == '-' &&
	      tocode[4] == '8' &&
	      tocode[5] == '\0')) {
		char *tocode1;
		int rc;
		/*
		 * Try using this non-standard feature of glibc and libiconv.
		 * This is deliberately not a config option as people often
		 * change their iconv library without rebuilding applications.
		 */

		rc = asprintf(&tocode1, "%s//TRANSLIT", tocode);
		if (rc < 0 || !tocode1)
			goto fail;

		cd2 = iconv_open(tocode1, "UTF-8");
		free(tocode1);

		/*
		 * Fall back to plain conversion.  The source must be UTF-8,
		 * not fromcode: cd2 is only ever fed the intermediate UTF-8
		 * buffer produced by cd1.
		 */
		if (cd2 == (iconv_t)(-1))
			cd2 = iconv_open(tocode, "UTF-8");

		if (cd2 == (iconv_t)(-1)) {
			iconv_close(cd1);
			return -1;
		}
	}

	utflen = 1; /*fromlen * 2 + 1; XXX */
	utfbuf = malloc(utflen);
	if (!utfbuf)
		goto fail;

	/* Convert to UTF-8 */
	ib = (char *)from; /* iconv() takes a non-const input pointer */
	ibl = fromlen;
	ob = utfbuf;
	obl = utflen;
	for (;;) {
		k = iconv(cd1, &ib, &ibl, &ob, &obl);
		assert((!k && !ibl) ||
		       (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
		       (k == (size_t)(-1) &&
			(errno == EILSEQ || errno == EINVAL) && ibl));
		if (!ibl)
			break;
		/*
		 * Fewer than 6 free bytes is treated as "output buffer full"
		 * (presumably the longest sequence iconv may emit here);
		 * otherwise the stop was caused by bad input.
		 */
		if (obl < 6) {
			/* Enlarge the buffer */
			if (utflen * 2 < utflen) /* overflow check */
				goto fail;
			utflen *= 2;
			obp = ob - utfbuf; /* save position */
			newbuf = realloc(utfbuf, utflen);
			if (!newbuf)
				goto fail;
			ob = newbuf + obp;
			obl = utflen - obp;
			utfbuf = newbuf;
		}
		else {
			/* Invalid input: drop one byte, emit a placeholder */
			ib++, ibl--;
			*ob++ = '#', obl--;
			ret = 2;
			iconv(cd1, NULL, NULL, NULL, NULL); /* reset state */
		}
	}

	if (cd2 == (iconv_t)(-1)) {
		/* The target encoding was UTF-8 */
		if (tolen)
			*tolen = ob - utfbuf;
		if (!to) {
			free(utfbuf);
			iconv_close(cd1);
			return ret;
		}
		/* Resize to exactly the data plus a terminating NUL */
		newbuf = safe_realloc_nofree_add_2op_(utfbuf, (ob - utfbuf), /*+*/1);
		if (!newbuf)
			goto fail;
		ob = (ob - utfbuf) + newbuf;
		*ob = '\0';
		*to = newbuf;
		iconv_close(cd1);
		return ret;
	}

	/*
	 * Truncate the buffer to be tidy.  Empty input is valid and simply
	 * converts to an empty string, so skip the shrink in that case
	 * rather than calling realloc() with a size of 0.
	 */
	utflen = ob - utfbuf;
	if (utflen > 0) {
		newbuf = realloc(utfbuf, utflen);
		if (!newbuf)
			goto fail;
		utfbuf = newbuf;
	}

	/* Convert from UTF-8 to discover how long the output is */
	outlen = 0;
	ib = utfbuf;
	ibl = utflen;
	while (ibl) {
		ob = tbuf;
		obl = sizeof(tbuf);
		k = iconv(cd2, &ib, &ibl, &ob, &obl);
		assert((k != (size_t)(-1) && !ibl) ||
		       (k == (size_t)(-1) && errno == E2BIG && ibl) ||
		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
			/* Replace one character */
			char *tb = "?";
			size_t tbl = 1;

			/* Count what was converted before the failure */
			outlen += ob - tbuf;
			ob = tbuf;
			obl = sizeof(tbuf);
			k = iconv(cd2, &tb, &tbl, &ob, &obl);
			assert((!k && !tbl) ||
			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
			/*
			 * Skip the lead byte and then only continuation bytes
			 * (10xxxxxx), so exactly one character is dropped and
			 * the next multi-byte character is left intact.
			 */
			for (++ib, --ibl;
			     ibl && ((unsigned char)*ib & 0xC0) == 0x80;
			     ib++, ibl--)
				;
		}
		outlen += ob - tbuf;
	}
	/* Account for any final shift sequence of the target encoding */
	ob = tbuf;
	obl = sizeof(tbuf);
	k = iconv(cd2, NULL, NULL, &ob, &obl);
	assert(!k);
	outlen += ob - tbuf;

	/* Convert from UTF-8 for real */
	outbuf = safe_malloc_add_2op_(outlen, /*+*/1);
	if (!outbuf)
		goto fail;
	ib = utfbuf;
	ibl = utflen;
	ob = outbuf;
	obl = outlen;
	while (ibl) {
		k = iconv(cd2, &ib, &ibl, &ob, &obl);
		assert((k != (size_t)(-1) && !ibl) ||
		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
		/* Any error or non-reversible conversion makes it inexact */
		if (k && !ret)
			ret = 1;
		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
			/* Replace one character */
			char *tb = "?";
			size_t tbl = 1;

			k = iconv(cd2, &tb, &tbl, &ob, &obl);
			assert((!k && !tbl) ||
			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
			/* Must match the skip rule of the measuring pass */
			for (++ib, --ibl;
			     ibl && ((unsigned char)*ib & 0xC0) == 0x80;
			     ib++, ibl--)
				;
		}
	}
	k = iconv(cd2, NULL, NULL, &ob, &obl);
	assert(!k);
	assert(!obl); /* both passes must agree on the output length */
	*ob = '\0';

	free(utfbuf);
	iconv_close(cd1);
	iconv_close(cd2);
	if (tolen)
		*tolen = outlen;
	if (!to) {
		free(outbuf);
		return ret;
	}
	*to = outbuf;
	return ret;

 fail:
	free(utfbuf); /* free(NULL) is a no-op */
	iconv_close(cd1);
	if (cd2 != (iconv_t)(-1))
		iconv_close(cd2);
	return -2;
}
256
257 #endif /* HAVE_ICONV */
258