1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u7.c
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
15 *
16 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22
23 #include "cmemory.h"
24 #include "unicode/ucnv.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27 #include "uassert.h"
28
29 /* UTF-7 -------------------------------------------------------------------- */
30
31 /*
32 * UTF-7 is a stateful encoding of Unicode.
33 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34 * It was intended for use in Internet email systems, using in its bytewise
35 * encoding only a subset of 7-bit US-ASCII.
36 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37 * occasionally used.
38 *
 * When converting Unicode to UTF-7, the RFC allows some US-ASCII
 * characters to be encoded either directly or in base64. In particular,
 * the characters in set O as defined in the RFC (see below) may be encoded
 * directly but are not allowed in, e.g., email headers.
43 * By default, the ICU UTF-7 converter encodes set O directly.
44 * By choosing the option "version=1", set O will be escaped instead.
45 * For example:
46 * utf7Converter=ucnv_open("UTF-7,version=1");
47 *
48 * For details about email headers see RFC 2047.
49 */
50
51 /*
52 * Tests for US-ASCII characters belonging to character classes
53 * defined in UTF-7.
54 *
55 * Set D (directly encoded characters) consists of the following
56 * characters: the upper and lower case letters A through Z
57 * and a through z, the 10 digits 0-9, and the following nine special
58 * characters (note that "+" and "=" are omitted):
59 * '(),-./:?
60 *
61 * Set O (optional direct characters) consists of the following
62 * characters (note that "\" and "~" are omitted):
63 * !"#$%&*;<=>@[]^_`{|}
64 *
65 * According to the rules in RFC 2152, the byte values for the following
66 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67 * - all C0 control codes except for CR LF TAB
68 * - BACKSLASH
69 * - TILDE
70 * - DEL
71 * - all codes beyond US-ASCII, i.e. all >127
72 */
/*
 * inSetD(c): nonzero if c is a member of UTF-7 Set D (letters, digits and
 * the nine specials '(),-./:? ).
 *
 * Each range test subtracts the first code of the range and compares the
 * difference, truncated to uint8_t, with the range length; values below the
 * range wrap around to large unsigned numbers and fail the comparison, so a
 * single compare checks both bounds.
 * Because of the truncation only the low 8 bits of the difference are
 * examined, and the argument is evaluated several times: pass a
 * side-effect-free value in the range 0..255.
 */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )
80
/*
 * inSetO(c): nonzero if c is a member of UTF-7 Set O (the optional direct
 * characters !"#$%&*;<=>@[]^_`{|} ).
 *
 * Uses the same "subtract range start, compare as uint8_t" test as the
 * Set D macro, so only the low 8 bits of each difference are examined and
 * the argument is evaluated several times: pass a side-effect-free value
 * in the range 0..255.
 */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )
88
/* CR, LF or TAB: the only C0 controls that are legal in UTF-7 */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
/* as isCRLFTAB, plus SPACE */
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code points with a special role in UTF-7 */
#define PLUS  43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/*
 * legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB
 *
 * (uint8_t)((c)-32)<94 accepts exactly 32..125, which excludes TILDE (126),
 * DEL (127), the C0 controls and everything above US-ASCII; BACKSLASH is
 * removed explicitly. The argument is evaluated several times.
 */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
99
100 /* encode directly sets D and O and CR LF SP TAB */
101 static const UBool encodeDirectlyMaximum[128]={
102 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
111
112 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
114 };
115
116 /* encode directly set D and CR LF SP TAB but not set O */
117 static const UBool encodeDirectlyRestricted[128]={
118 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121
122 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
124
125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
127
128 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
130 };
131
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit:
 * 0..25 -> A-Z, 26..51 -> a-z, 52..61 -> 0-9, 62 -> '+', 63 -> '/'.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
145
/*
 * Classifies a US-ASCII byte (0..127) for the base64 decoder:
 *   0..63  the 6-bit value of a base64 digit
 *   -1     a legal UTF-7 byte that is not a base64 digit
 *   -2     the minus sign, which terminates a base64 sequence
 *   -3     a byte that is illegal in UTF-7
 * The table has only 128 entries; callers must range-check the byte before
 * indexing.
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165
166 /*
167 * converter status values:
168 *
169 * toUnicodeStatus:
170 * 24 inDirectMode (boolean)
171 * 23..16 base64Counter (-1..7)
172 * 15..0 bits (up to 14 bits incoming base64)
173 *
174 * fromUnicodeStatus:
175 * 31..28 version (0: set O direct 1: set O escaped)
176 * 24 inDirectMode (boolean)
177 * 23..16 base64Counter (0..2)
178 * 7..0 bits (6 bits outgoing base64)
179 *
180 */
181
182 U_CDECL_BEGIN
183 static void U_CALLCONV
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)184 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
185 if(choice<=UCNV_RESET_TO_UNICODE) {
186 /* reset toUnicode */
187 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=true */
188 cnv->toULength=0;
189 }
190 if(choice!=UCNV_RESET_TO_UNICODE) {
191 /* reset fromUnicode */
192 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
193 }
194 }
195
196 static void U_CALLCONV
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)197 _UTF7Open(UConverter *cnv,
198 UConverterLoadArgs *pArgs,
199 UErrorCode *pErrorCode) {
200 (void)pArgs;
201 if(UCNV_GET_VERSION(cnv)<=1) {
202 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
203 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
204 _UTF7Reset(cnv, UCNV_RESET_BOTH);
205 } else {
206 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207 }
208 }
209
210 static void U_CALLCONV
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)211 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
212 UErrorCode *pErrorCode) {
213 UConverter *cnv;
214 const uint8_t *source, *sourceLimit;
215 char16_t *target;
216 const char16_t *targetLimit;
217 int32_t *offsets;
218
219 uint8_t *bytes;
220 uint8_t byteIndex;
221
222 int32_t length, targetCapacity;
223
224 /* UTF-7 state */
225 uint16_t bits;
226 int8_t base64Counter;
227 UBool inDirectMode;
228
229 int8_t base64Value;
230
231 int32_t sourceIndex, nextSourceIndex;
232
233 uint8_t b;
234 /* set up the local pointers */
235 cnv=pArgs->converter;
236
237 source=(const uint8_t *)pArgs->source;
238 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
239 target=pArgs->target;
240 targetLimit=pArgs->targetLimit;
241 offsets=pArgs->offsets;
242 /* get the state machine state */
243 {
244 uint32_t status=cnv->toUnicodeStatus;
245 inDirectMode=(UBool)((status>>24)&1);
246 base64Counter=(int8_t)(status>>16);
247 bits=(uint16_t)status;
248 }
249 bytes=cnv->toUBytes;
250 byteIndex=cnv->toULength;
251
252 /* sourceIndex=-1 if the current character began in the previous buffer */
253 sourceIndex=byteIndex==0 ? 0 : -1;
254 nextSourceIndex=0;
255
256 if(inDirectMode) {
257 directMode:
258 /*
259 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
260 * with their US-ASCII byte values.
261 * Backslash and Tilde and most control characters are not allowed in UTF-7.
262 * A plus sign starts Unicode (or "escape") Mode.
263 *
264 * In Direct Mode, only the sourceIndex is used.
265 */
266 byteIndex=0;
267 length=(int32_t)(sourceLimit-source);
268 targetCapacity=(int32_t)(targetLimit-target);
269 if(length>targetCapacity) {
270 length=targetCapacity;
271 }
272 while(length>0) {
273 b=*source++;
274 if(!isLegalUTF7(b)) {
275 /* illegal */
276 bytes[0]=b;
277 byteIndex=1;
278 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
279 break;
280 } else if(b!=PLUS) {
281 /* write directly encoded character */
282 *target++=b;
283 if(offsets!=nullptr) {
284 *offsets++=sourceIndex++;
285 }
286 } else /* PLUS */ {
287 /* switch to Unicode mode */
288 nextSourceIndex=++sourceIndex;
289 inDirectMode=false;
290 byteIndex=0;
291 bits=0;
292 base64Counter=-1;
293 goto unicodeMode;
294 }
295 --length;
296 }
297 if(source<sourceLimit && target>=targetLimit) {
298 /* target is full */
299 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
300 }
301 } else {
302 unicodeMode:
303 /*
304 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
305 * The base64 sequence ends with any character that is not in the base64 alphabet.
306 * A terminating minus sign is consumed.
307 *
308 * In Unicode Mode, the sourceIndex has the index to the start of the current
309 * base64 bytes, while nextSourceIndex is precisely parallel to source,
310 * keeping the index to the following byte.
311 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
312 */
313 while(source<sourceLimit) {
314 if(target<targetLimit) {
315 bytes[byteIndex++]=b=*source++;
316 ++nextSourceIndex;
317 base64Value = -3; /* initialize as illegal */
318 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
319 /* either
320 * base64Value==-1 for any legal character except base64 and minus sign, or
321 * base64Value==-3 for illegal characters:
322 * 1. In either case, leave Unicode mode.
323 * 2.1. If we ended with an incomplete char16_t or none after the +, then
324 * generate an error for the preceding erroneous sequence and deal with
325 * the current (possibly illegal) character next time through.
326 * 2.2. Else the current char comes after a complete char16_t, which was already
327 * pushed to the output buf, so:
328 * 2.2.1. If the current char is legal, just save it for processing next time.
329 * It may be for example, a plus which we need to deal with in direct mode.
330 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
331 */
332 inDirectMode=true;
333 if(base64Counter==-1) {
334 /* illegal: + immediately followed by something other than base64 or minus sign */
335 /* include the plus sign in the reported sequence, but not the subsequent char */
336 --source;
337 bytes[0]=PLUS;
338 byteIndex=1;
339 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
340 break;
341 } else if(bits!=0) {
342 /* bits are illegally left over, a char16_t is incomplete */
343 /* don't include current char (legal or illegal) in error seq */
344 --source;
345 --byteIndex;
346 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347 break;
348 } else {
349 /* previous char16_t was complete */
350 if(base64Value==-3) {
351 /* current character is illegal, deal with it here */
352 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
353 break;
354 } else {
355 /* un-read the current character in case it is a plus sign */
356 --source;
357 sourceIndex=nextSourceIndex-1;
358 goto directMode;
359 }
360 }
361 } else if(base64Value>=0) {
362 /* collect base64 bytes into UChars */
363 switch(base64Counter) {
364 case -1: /* -1 is immediately after the + */
365 case 0:
366 bits=base64Value;
367 base64Counter=1;
368 break;
369 case 1:
370 case 3:
371 case 4:
372 case 6:
373 bits=(uint16_t)((bits<<6)|base64Value);
374 ++base64Counter;
375 break;
376 case 2:
377 *target++=(char16_t)((bits<<4)|(base64Value>>2));
378 if(offsets!=nullptr) {
379 *offsets++=sourceIndex;
380 sourceIndex=nextSourceIndex-1;
381 }
382 bytes[0]=b; /* keep this byte in case an error occurs */
383 byteIndex=1;
384 bits=(uint16_t)(base64Value&3);
385 base64Counter=3;
386 break;
387 case 5:
388 *target++=(char16_t)((bits<<2)|(base64Value>>4));
389 if(offsets!=nullptr) {
390 *offsets++=sourceIndex;
391 sourceIndex=nextSourceIndex-1;
392 }
393 bytes[0]=b; /* keep this byte in case an error occurs */
394 byteIndex=1;
395 bits=(uint16_t)(base64Value&15);
396 base64Counter=6;
397 break;
398 case 7:
399 *target++=(char16_t)((bits<<6)|base64Value);
400 if(offsets!=nullptr) {
401 *offsets++=sourceIndex;
402 sourceIndex=nextSourceIndex;
403 }
404 byteIndex=0;
405 bits=0;
406 base64Counter=0;
407 break;
408 default:
409 /* will never occur */
410 break;
411 }
412 } else /*base64Value==-2*/ {
413 /* minus sign terminates the base64 sequence */
414 inDirectMode=true;
415 if(base64Counter==-1) {
416 /* +- i.e. a minus immediately following a plus */
417 *target++=PLUS;
418 if(offsets!=nullptr) {
419 *offsets++=sourceIndex-1;
420 }
421 } else {
422 /* absorb the minus and leave the Unicode Mode */
423 if(bits!=0) {
424 /* bits are illegally left over, a char16_t is incomplete */
425 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
426 break;
427 }
428 }
429 sourceIndex=nextSourceIndex;
430 goto directMode;
431 }
432 } else {
433 /* target is full */
434 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
435 break;
436 }
437 }
438 }
439
440 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
441 /*
442 * if we are in Unicode mode, then the byteIndex might not be 0,
443 * but that is ok if bits==0
444 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
445 * (not true for IMAP-mailbox-name where we must end in direct mode)
446 */
447 byteIndex=0;
448 }
449
450 /* set the converter state back into UConverter */
451 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
452 cnv->toULength=byteIndex;
453
454 /* write back the updated pointers */
455 pArgs->source=(const char *)source;
456 pArgs->target=target;
457 pArgs->offsets=offsets;
458 }
459
460 static void U_CALLCONV
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)461 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
462 UErrorCode *pErrorCode) {
463 UConverter *cnv;
464 const char16_t *source, *sourceLimit;
465 uint8_t *target, *targetLimit;
466 int32_t *offsets;
467
468 int32_t length, targetCapacity, sourceIndex;
469 char16_t c;
470
471 /* UTF-7 state */
472 const UBool *encodeDirectly;
473 uint8_t bits;
474 int8_t base64Counter;
475 UBool inDirectMode;
476
477 /* set up the local pointers */
478 cnv=pArgs->converter;
479
480 /* set up the local pointers */
481 source=pArgs->source;
482 sourceLimit=pArgs->sourceLimit;
483 target=(uint8_t *)pArgs->target;
484 targetLimit=(uint8_t *)pArgs->targetLimit;
485 offsets=pArgs->offsets;
486
487 /* get the state machine state */
488 {
489 uint32_t status=cnv->fromUnicodeStatus;
490 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
491 inDirectMode=(UBool)((status>>24)&1);
492 base64Counter=(int8_t)(status>>16);
493 bits=(uint8_t)status;
494 U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
495 }
496
497 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
498 sourceIndex=0;
499
500 if(inDirectMode) {
501 directMode:
502 length=(int32_t)(sourceLimit-source);
503 targetCapacity=(int32_t)(targetLimit-target);
504 if(length>targetCapacity) {
505 length=targetCapacity;
506 }
507 while(length>0) {
508 c=*source++;
509 /* currently always encode CR LF SP TAB directly */
510 if(c<=127 && encodeDirectly[c]) {
511 /* encode directly */
512 *target++=(uint8_t)c;
513 if(offsets!=nullptr) {
514 *offsets++=sourceIndex++;
515 }
516 } else if(c==PLUS) {
517 /* output +- for + */
518 *target++=PLUS;
519 if(target<targetLimit) {
520 *target++=MINUS;
521 if(offsets!=nullptr) {
522 *offsets++=sourceIndex;
523 *offsets++=sourceIndex++;
524 }
525 /* realign length and targetCapacity */
526 goto directMode;
527 } else {
528 if(offsets!=nullptr) {
529 *offsets++=sourceIndex++;
530 }
531 cnv->charErrorBuffer[0]=MINUS;
532 cnv->charErrorBufferLength=1;
533 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
534 break;
535 }
536 } else {
537 /* un-read this character and switch to Unicode Mode */
538 --source;
539 *target++=PLUS;
540 if(offsets!=nullptr) {
541 *offsets++=sourceIndex;
542 }
543 inDirectMode=false;
544 base64Counter=0;
545 goto unicodeMode;
546 }
547 --length;
548 }
549 if(source<sourceLimit && target>=targetLimit) {
550 /* target is full */
551 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
552 }
553 } else {
554 unicodeMode:
555 while(source<sourceLimit) {
556 if(target<targetLimit) {
557 c=*source++;
558 if(c<=127 && encodeDirectly[c]) {
559 /* encode directly */
560 inDirectMode=true;
561
562 /* trick: back out this character to make this easier */
563 --source;
564
565 /* terminate the base64 sequence */
566 if(base64Counter!=0) {
567 /* write remaining bits for the previous character */
568 *target++=toBase64[bits];
569 if(offsets!=nullptr) {
570 *offsets++=sourceIndex-1;
571 }
572 }
573 if(fromBase64[c]!=-1) {
574 /* need to terminate with a minus */
575 if(target<targetLimit) {
576 *target++=MINUS;
577 if(offsets!=nullptr) {
578 *offsets++=sourceIndex-1;
579 }
580 } else {
581 cnv->charErrorBuffer[0]=MINUS;
582 cnv->charErrorBufferLength=1;
583 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
584 break;
585 }
586 }
587 goto directMode;
588 } else {
589 /*
590 * base64 this character:
591 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
592 * and the bits of this character, each implicitly in UTF-16BE.
593 *
594 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
595 * character to the next. The actual 2 or 4 bits are shifted to the left edge
596 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
597 */
598 switch(base64Counter) {
599 case 0:
600 *target++=toBase64[c>>10];
601 if(target<targetLimit) {
602 *target++=toBase64[(c>>4)&0x3f];
603 if(offsets!=nullptr) {
604 *offsets++=sourceIndex;
605 *offsets++=sourceIndex++;
606 }
607 } else {
608 if(offsets!=nullptr) {
609 *offsets++=sourceIndex++;
610 }
611 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
612 cnv->charErrorBufferLength=1;
613 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
614 }
615 bits=(uint8_t)((c&15)<<2);
616 base64Counter=1;
617 break;
618 case 1:
619 *target++=toBase64[bits|(c>>14)];
620 if(target<targetLimit) {
621 *target++=toBase64[(c>>8)&0x3f];
622 if(target<targetLimit) {
623 *target++=toBase64[(c>>2)&0x3f];
624 if(offsets!=nullptr) {
625 *offsets++=sourceIndex;
626 *offsets++=sourceIndex;
627 *offsets++=sourceIndex++;
628 }
629 } else {
630 if(offsets!=nullptr) {
631 *offsets++=sourceIndex;
632 *offsets++=sourceIndex++;
633 }
634 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
635 cnv->charErrorBufferLength=1;
636 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
637 }
638 } else {
639 if(offsets!=nullptr) {
640 *offsets++=sourceIndex++;
641 }
642 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
643 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
644 cnv->charErrorBufferLength=2;
645 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
646 }
647 bits=(uint8_t)((c&3)<<4);
648 base64Counter=2;
649 break;
650 case 2:
651 *target++=toBase64[bits|(c>>12)];
652 if(target<targetLimit) {
653 *target++=toBase64[(c>>6)&0x3f];
654 if(target<targetLimit) {
655 *target++=toBase64[c&0x3f];
656 if(offsets!=nullptr) {
657 *offsets++=sourceIndex;
658 *offsets++=sourceIndex;
659 *offsets++=sourceIndex++;
660 }
661 } else {
662 if(offsets!=nullptr) {
663 *offsets++=sourceIndex;
664 *offsets++=sourceIndex++;
665 }
666 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
667 cnv->charErrorBufferLength=1;
668 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
669 }
670 } else {
671 if(offsets!=nullptr) {
672 *offsets++=sourceIndex++;
673 }
674 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
675 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
676 cnv->charErrorBufferLength=2;
677 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
678 }
679 bits=0;
680 base64Counter=0;
681 break;
682 default:
683 /* will never occur */
684 break;
685 }
686 }
687 } else {
688 /* target is full */
689 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
690 break;
691 }
692 }
693 }
694
695 if(pArgs->flush && source>=sourceLimit) {
696 /* flush remaining bits to the target */
697 if(!inDirectMode) {
698 if (base64Counter!=0) {
699 if(target<targetLimit) {
700 *target++=toBase64[bits];
701 if(offsets!=nullptr) {
702 *offsets++=sourceIndex-1;
703 }
704 } else {
705 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
706 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
707 }
708 }
709 /* Add final MINUS to terminate unicodeMode */
710 if(target<targetLimit) {
711 *target++=MINUS;
712 if(offsets!=nullptr) {
713 *offsets++=sourceIndex-1;
714 }
715 } else {
716 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
717 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
718 }
719 }
720 /* reset the state for the next conversion */
721 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
722 } else {
723 /* set the converter state back into UConverter */
724 cnv->fromUnicodeStatus=
725 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
726 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
727 }
728
729 /* write back the updated pointers */
730 pArgs->source=source;
731 pArgs->target=(char *)target;
732 pArgs->offsets=offsets;
733 }
734
735 static const char * U_CALLCONV
_UTF7GetName(const UConverter * cnv)736 _UTF7GetName(const UConverter *cnv) {
737 switch(cnv->fromUnicodeStatus>>28) {
738 case 1:
739 return "UTF-7,version=1";
740 default:
741 return "UTF-7";
742 }
743 }
744 U_CDECL_END
745
746 static const UConverterImpl _UTF7Impl={
747 UCNV_UTF7,
748
749 nullptr,
750 nullptr,
751
752 _UTF7Open,
753 nullptr,
754 _UTF7Reset,
755
756 _UTF7ToUnicodeWithOffsets,
757 _UTF7ToUnicodeWithOffsets,
758 _UTF7FromUnicodeWithOffsets,
759 _UTF7FromUnicodeWithOffsets,
760 nullptr,
761
762 nullptr,
763 _UTF7GetName,
764 nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
765 nullptr,
766 ucnv_getCompleteUnicodeSet,
767
768 nullptr,
769 nullptr
770 };
771
772 static const UConverterStaticData _UTF7StaticData={
773 sizeof(UConverterStaticData),
774 "UTF-7",
775 0, /* TODO CCSID for UTF-7 */
776 UCNV_IBM, UCNV_UTF7,
777 1, 4,
778 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
779 false, false,
780 0,
781 0,
782 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
783 };
784
785 const UConverterSharedData _UTF7Data=
786 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
787
788 /* IMAP mailbox name encoding ----------------------------------------------- */
789
790 /*
791 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
792 * http://www.ietf.org/rfc/rfc2060.txt
793 *
794 * 5.1.3. Mailbox International Naming Convention
795 *
796 * By convention, international mailbox names are specified using a
797 * modified version of the UTF-7 encoding described in [UTF-7]. The
798 * purpose of these modifications is to correct the following problems
799 * with UTF-7:
800 *
801 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
802 * the common use of "+" in mailbox names, in particular USENET
803 * newsgroup names.
804 *
805 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
806 * conflicts with the use of "/" as a popular hierarchy delimiter.
807 *
808 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
809 * the use of "\" as a popular hierarchy delimiter.
810 *
811 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
812 * the use of "~" in some servers as a home directory indicator.
813 *
814 * 5) UTF-7 permits multiple alternate forms to represent the same
815 * string; in particular, printable US-ASCII characters can be
816 * represented in encoded form.
817 *
818 * In modified UTF-7, printable US-ASCII characters except for "&"
819 * represent themselves; that is, characters with octet values 0x20-0x25
820 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
821 * octet sequence "&-".
822 *
823 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
824 * Unicode 16-bit octets) are represented in modified BASE64, with a
825 * further modification from [UTF-7] that "," is used instead of "/".
826 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
827 * character which can represent itself.
828 *
 * "&" is used to shift to modified BASE64 and "-" to shift back to
 * US-ASCII. All names start in US-ASCII, and MUST end in US-ASCII
 * (that is, a name that ends with a Unicode 16-bit octet MUST end
 * with a "-").
833 *
834 * For example, here is a mailbox name which mixes English, Japanese,
835 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
836 */
837
838 /*
839 * Tests for US-ASCII characters belonging to character classes
840 * defined in UTF-7.
841 *
842 * Set D (directly encoded characters) consists of the following
843 * characters: the upper and lower case letters A through Z
844 * and a through z, the 10 digits 0-9, and the following nine special
845 * characters (note that "+" and "=" are omitted):
846 * '(),-./:?
847 *
848 * Set O (optional direct characters) consists of the following
849 * characters (note that "\" and "~" are omitted):
850 * !"#$%&*;<=>@[]^_`{|}
851 *
852 * According to the rules in RFC 2152, the byte values for the following
853 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
854 * - all C0 control codes except for CR LF TAB
855 * - BACKSLASH
856 * - TILDE
857 * - DEL
858 * - all codes beyond US-ASCII, i.e. all >127
859 */
860
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/*
 * legal byte values: all US-ASCII graphic characters 0x20..0x7e
 * (the argument is evaluated twice)
 */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * The argument is parenthesized in the comparison so that expression
 * arguments (e.g. a conditional expression) are compared as a whole.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/*
 * Modified base64 for IMAP mailbox names: identical to the UTF-7 alphabet
 * except that value 63 is written as ',' instead of '/'.
 * FROM_BASE64_IMAP therefore maps ',' to 63 and '/' to -1 (not a base64
 * digit); every other byte is classified by fromBase64[], so the argument
 * must be a valid index into that 128-entry table.
 */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
874
875 /*
876 * converter status values:
877 *
878 * toUnicodeStatus:
879 * 24 inDirectMode (boolean)
880 * 23..16 base64Counter (-1..7)
881 * 15..0 bits (up to 14 bits incoming base64)
882 *
883 * fromUnicodeStatus:
884 * 24 inDirectMode (boolean)
885 * 23..16 base64Counter (0..2)
886 * 7..0 bits (6 bits outgoing base64)
887 *
888 * ignore bits 31..25
889 */
890
891 U_CDECL_BEGIN
892 static void U_CALLCONV
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)893 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
894 UErrorCode *pErrorCode) {
895 UConverter *cnv;
896 const uint8_t *source, *sourceLimit;
897 char16_t *target;
898 const char16_t *targetLimit;
899 int32_t *offsets;
900
901 uint8_t *bytes;
902 uint8_t byteIndex;
903
904 int32_t length, targetCapacity;
905
906 /* UTF-7 state */
907 uint16_t bits;
908 int8_t base64Counter;
909 UBool inDirectMode;
910
911 int8_t base64Value;
912
913 int32_t sourceIndex, nextSourceIndex;
914
915 char16_t c;
916 uint8_t b;
917
918 /* set up the local pointers */
919 cnv=pArgs->converter;
920
921 source=(const uint8_t *)pArgs->source;
922 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
923 target=pArgs->target;
924 targetLimit=pArgs->targetLimit;
925 offsets=pArgs->offsets;
926 /* get the state machine state */
927 {
928 uint32_t status=cnv->toUnicodeStatus;
929 inDirectMode=(UBool)((status>>24)&1);
930 base64Counter=(int8_t)(status>>16);
931 bits=(uint16_t)status;
932 }
933 bytes=cnv->toUBytes;
934 byteIndex=cnv->toULength;
935
936 /* sourceIndex=-1 if the current character began in the previous buffer */
937 sourceIndex=byteIndex==0 ? 0 : -1;
938 nextSourceIndex=0;
939
940 if(inDirectMode) {
941 directMode:
942 /*
943 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
944 * with their US-ASCII byte values.
945 * An ampersand starts Unicode (or "escape") Mode.
946 *
947 * In Direct Mode, only the sourceIndex is used.
948 */
949 byteIndex=0;
950 length=(int32_t)(sourceLimit-source);
951 targetCapacity=(int32_t)(targetLimit-target);
952 if(length>targetCapacity) {
953 length=targetCapacity;
954 }
955 while(length>0) {
956 b=*source++;
957 if(!isLegalIMAP(b)) {
958 /* illegal */
959 bytes[0]=b;
960 byteIndex=1;
961 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
962 break;
963 } else if(b!=AMPERSAND) {
964 /* write directly encoded character */
965 *target++=b;
966 if(offsets!=nullptr) {
967 *offsets++=sourceIndex++;
968 }
969 } else /* AMPERSAND */ {
970 /* switch to Unicode mode */
971 nextSourceIndex=++sourceIndex;
972 inDirectMode=false;
973 byteIndex=0;
974 bits=0;
975 base64Counter=-1;
976 goto unicodeMode;
977 }
978 --length;
979 }
980 if(source<sourceLimit && target>=targetLimit) {
981 /* target is full */
982 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
983 }
984 } else {
985 unicodeMode:
986 /*
987 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
988 * The base64 sequence ends with any character that is not in the base64 alphabet.
989 * A terminating minus sign is consumed.
990 * US-ASCII must not be base64-ed.
991 *
992 * In Unicode Mode, the sourceIndex has the index to the start of the current
993 * base64 bytes, while nextSourceIndex is precisely parallel to source,
994 * keeping the index to the following byte.
995 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
996 */
997 while(source<sourceLimit) {
998 if(target<targetLimit) {
999 bytes[byteIndex++]=b=*source++;
1000 ++nextSourceIndex;
1001 if(b>0x7e) {
1002 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1003 inDirectMode=true;
1004 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1005 break;
1006 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1007 /* collect base64 bytes into UChars */
1008 switch(base64Counter) {
1009 case -1: /* -1 is immediately after the & */
1010 case 0:
1011 bits=base64Value;
1012 base64Counter=1;
1013 break;
1014 case 1:
1015 case 3:
1016 case 4:
1017 case 6:
1018 bits=(uint16_t)((bits<<6)|base64Value);
1019 ++base64Counter;
1020 break;
1021 case 2:
1022 c=(char16_t)((bits<<4)|(base64Value>>2));
1023 if(isLegalIMAP(c)) {
1024 /* illegal */
1025 inDirectMode=true;
1026 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1027 goto endloop;
1028 }
1029 *target++=c;
1030 if(offsets!=nullptr) {
1031 *offsets++=sourceIndex;
1032 sourceIndex=nextSourceIndex-1;
1033 }
1034 bytes[0]=b; /* keep this byte in case an error occurs */
1035 byteIndex=1;
1036 bits=(uint16_t)(base64Value&3);
1037 base64Counter=3;
1038 break;
1039 case 5:
1040 c=(char16_t)((bits<<2)|(base64Value>>4));
1041 if(isLegalIMAP(c)) {
1042 /* illegal */
1043 inDirectMode=true;
1044 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1045 goto endloop;
1046 }
1047 *target++=c;
1048 if(offsets!=nullptr) {
1049 *offsets++=sourceIndex;
1050 sourceIndex=nextSourceIndex-1;
1051 }
1052 bytes[0]=b; /* keep this byte in case an error occurs */
1053 byteIndex=1;
1054 bits=(uint16_t)(base64Value&15);
1055 base64Counter=6;
1056 break;
1057 case 7:
1058 c=(char16_t)((bits<<6)|base64Value);
1059 if(isLegalIMAP(c)) {
1060 /* illegal */
1061 inDirectMode=true;
1062 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1063 goto endloop;
1064 }
1065 *target++=c;
1066 if(offsets!=nullptr) {
1067 *offsets++=sourceIndex;
1068 sourceIndex=nextSourceIndex;
1069 }
1070 byteIndex=0;
1071 bits=0;
1072 base64Counter=0;
1073 break;
1074 default:
1075 /* will never occur */
1076 break;
1077 }
1078 } else if(base64Value==-2) {
1079 /* minus sign terminates the base64 sequence */
1080 inDirectMode=true;
1081 if(base64Counter==-1) {
1082 /* &- i.e. a minus immediately following an ampersand */
1083 *target++=AMPERSAND;
1084 if(offsets!=nullptr) {
1085 *offsets++=sourceIndex-1;
1086 }
1087 } else {
1088 /* absorb the minus and leave the Unicode Mode */
1089 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1090 /* bits are illegally left over, a char16_t is incomplete */
1091 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1092 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1093 break;
1094 }
1095 }
1096 sourceIndex=nextSourceIndex;
1097 goto directMode;
1098 } else {
1099 if(base64Counter==-1) {
1100 /* illegal: & immediately followed by something other than base64 or minus sign */
1101 /* include the ampersand in the reported sequence */
1102 --sourceIndex;
1103 bytes[0]=AMPERSAND;
1104 bytes[1]=b;
1105 byteIndex=2;
1106 }
1107 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1108 /* base64Value==-3 for illegal characters */
1109 /* illegal */
1110 inDirectMode=true;
1111 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1112 break;
1113 }
1114 } else {
1115 /* target is full */
1116 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1117 break;
1118 }
1119 }
1120 }
1121 endloop:
1122
1123 /*
1124 * the end of the input stream and detection of truncated input
1125 * are handled by the framework, but here we must check if we are in Unicode
1126 * mode and byteIndex==0 because we must end in direct mode
1127 *
1128 * conditions:
1129 * successful
1130 * in Unicode mode and byteIndex==0
1131 * end of input and no truncated input
1132 */
1133 if( U_SUCCESS(*pErrorCode) &&
1134 !inDirectMode && byteIndex==0 &&
1135 pArgs->flush && source>=sourceLimit
1136 ) {
1137 if(base64Counter==-1) {
1138 /* & at the very end of the input */
1139 /* make the ampersand the reported sequence */
1140 bytes[0]=AMPERSAND;
1141 byteIndex=1;
1142 }
1143 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1144
1145 inDirectMode=true; /* avoid looping */
1146 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1147 }
1148
1149 /* set the converter state back into UConverter */
1150 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1151 cnv->toULength=byteIndex;
1152
1153 /* write back the updated pointers */
1154 pArgs->source=(const char *)source;
1155 pArgs->target=target;
1156 pArgs->offsets=offsets;
1157 }
1158
1159 static void U_CALLCONV
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1160 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1161 UErrorCode *pErrorCode) {
1162 UConverter *cnv;
1163 const char16_t *source, *sourceLimit;
1164 uint8_t *target, *targetLimit;
1165 int32_t *offsets;
1166
1167 int32_t length, targetCapacity, sourceIndex;
1168 char16_t c;
1169 uint8_t b;
1170
1171 /* UTF-7 state */
1172 uint8_t bits;
1173 int8_t base64Counter;
1174 UBool inDirectMode;
1175
1176 /* set up the local pointers */
1177 cnv=pArgs->converter;
1178
1179 /* set up the local pointers */
1180 source=pArgs->source;
1181 sourceLimit=pArgs->sourceLimit;
1182 target=(uint8_t *)pArgs->target;
1183 targetLimit=(uint8_t *)pArgs->targetLimit;
1184 offsets=pArgs->offsets;
1185
1186 /* get the state machine state */
1187 {
1188 uint32_t status=cnv->fromUnicodeStatus;
1189 inDirectMode=(UBool)((status>>24)&1);
1190 base64Counter=(int8_t)(status>>16);
1191 bits=(uint8_t)status;
1192 }
1193
1194 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1195 sourceIndex=0;
1196
1197 if(inDirectMode) {
1198 directMode:
1199 length=(int32_t)(sourceLimit-source);
1200 targetCapacity=(int32_t)(targetLimit-target);
1201 if(length>targetCapacity) {
1202 length=targetCapacity;
1203 }
1204 while(length>0) {
1205 c=*source++;
1206 /* encode 0x20..0x7e except '&' directly */
1207 if(inSetDIMAP(c)) {
1208 /* encode directly */
1209 *target++=(uint8_t)c;
1210 if(offsets!=nullptr) {
1211 *offsets++=sourceIndex++;
1212 }
1213 } else if(c==AMPERSAND) {
1214 /* output &- for & */
1215 *target++=AMPERSAND;
1216 if(target<targetLimit) {
1217 *target++=MINUS;
1218 if(offsets!=nullptr) {
1219 *offsets++=sourceIndex;
1220 *offsets++=sourceIndex++;
1221 }
1222 /* realign length and targetCapacity */
1223 goto directMode;
1224 } else {
1225 if(offsets!=nullptr) {
1226 *offsets++=sourceIndex++;
1227 }
1228 cnv->charErrorBuffer[0]=MINUS;
1229 cnv->charErrorBufferLength=1;
1230 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1231 break;
1232 }
1233 } else {
1234 /* un-read this character and switch to Unicode Mode */
1235 --source;
1236 *target++=AMPERSAND;
1237 if(offsets!=nullptr) {
1238 *offsets++=sourceIndex;
1239 }
1240 inDirectMode=false;
1241 base64Counter=0;
1242 goto unicodeMode;
1243 }
1244 --length;
1245 }
1246 if(source<sourceLimit && target>=targetLimit) {
1247 /* target is full */
1248 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249 }
1250 } else {
1251 unicodeMode:
1252 while(source<sourceLimit) {
1253 if(target<targetLimit) {
1254 c=*source++;
1255 if(isLegalIMAP(c)) {
1256 /* encode directly */
1257 inDirectMode=true;
1258
1259 /* trick: back out this character to make this easier */
1260 --source;
1261
1262 /* terminate the base64 sequence */
1263 if(base64Counter!=0) {
1264 /* write remaining bits for the previous character */
1265 *target++=TO_BASE64_IMAP(bits);
1266 if(offsets!=nullptr) {
1267 *offsets++=sourceIndex-1;
1268 }
1269 }
1270 /* need to terminate with a minus */
1271 if(target<targetLimit) {
1272 *target++=MINUS;
1273 if(offsets!=nullptr) {
1274 *offsets++=sourceIndex-1;
1275 }
1276 } else {
1277 cnv->charErrorBuffer[0]=MINUS;
1278 cnv->charErrorBufferLength=1;
1279 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1280 break;
1281 }
1282 goto directMode;
1283 } else {
1284 /*
1285 * base64 this character:
1286 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1287 * and the bits of this character, each implicitly in UTF-16BE.
1288 *
1289 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1290 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1291 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1292 */
1293 switch(base64Counter) {
1294 case 0:
1295 b=(uint8_t)(c>>10);
1296 *target++=TO_BASE64_IMAP(b);
1297 if(target<targetLimit) {
1298 b=(uint8_t)((c>>4)&0x3f);
1299 *target++=TO_BASE64_IMAP(b);
1300 if(offsets!=nullptr) {
1301 *offsets++=sourceIndex;
1302 *offsets++=sourceIndex++;
1303 }
1304 } else {
1305 if(offsets!=nullptr) {
1306 *offsets++=sourceIndex++;
1307 }
1308 b=(uint8_t)((c>>4)&0x3f);
1309 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1310 cnv->charErrorBufferLength=1;
1311 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1312 }
1313 bits=(uint8_t)((c&15)<<2);
1314 base64Counter=1;
1315 break;
1316 case 1:
1317 b=(uint8_t)(bits|(c>>14));
1318 *target++=TO_BASE64_IMAP(b);
1319 if(target<targetLimit) {
1320 b=(uint8_t)((c>>8)&0x3f);
1321 *target++=TO_BASE64_IMAP(b);
1322 if(target<targetLimit) {
1323 b=(uint8_t)((c>>2)&0x3f);
1324 *target++=TO_BASE64_IMAP(b);
1325 if(offsets!=nullptr) {
1326 *offsets++=sourceIndex;
1327 *offsets++=sourceIndex;
1328 *offsets++=sourceIndex++;
1329 }
1330 } else {
1331 if(offsets!=nullptr) {
1332 *offsets++=sourceIndex;
1333 *offsets++=sourceIndex++;
1334 }
1335 b=(uint8_t)((c>>2)&0x3f);
1336 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1337 cnv->charErrorBufferLength=1;
1338 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1339 }
1340 } else {
1341 if(offsets!=nullptr) {
1342 *offsets++=sourceIndex++;
1343 }
1344 b=(uint8_t)((c>>8)&0x3f);
1345 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1346 b=(uint8_t)((c>>2)&0x3f);
1347 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1348 cnv->charErrorBufferLength=2;
1349 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1350 }
1351 bits=(uint8_t)((c&3)<<4);
1352 base64Counter=2;
1353 break;
1354 case 2:
1355 b=(uint8_t)(bits|(c>>12));
1356 *target++=TO_BASE64_IMAP(b);
1357 if(target<targetLimit) {
1358 b=(uint8_t)((c>>6)&0x3f);
1359 *target++=TO_BASE64_IMAP(b);
1360 if(target<targetLimit) {
1361 b=(uint8_t)(c&0x3f);
1362 *target++=TO_BASE64_IMAP(b);
1363 if(offsets!=nullptr) {
1364 *offsets++=sourceIndex;
1365 *offsets++=sourceIndex;
1366 *offsets++=sourceIndex++;
1367 }
1368 } else {
1369 if(offsets!=nullptr) {
1370 *offsets++=sourceIndex;
1371 *offsets++=sourceIndex++;
1372 }
1373 b=(uint8_t)(c&0x3f);
1374 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1375 cnv->charErrorBufferLength=1;
1376 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1377 }
1378 } else {
1379 if(offsets!=nullptr) {
1380 *offsets++=sourceIndex++;
1381 }
1382 b=(uint8_t)((c>>6)&0x3f);
1383 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1384 b=(uint8_t)(c&0x3f);
1385 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1386 cnv->charErrorBufferLength=2;
1387 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1388 }
1389 bits=0;
1390 base64Counter=0;
1391 break;
1392 default:
1393 /* will never occur */
1394 break;
1395 }
1396 }
1397 } else {
1398 /* target is full */
1399 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1400 break;
1401 }
1402 }
1403 }
1404
1405 if(pArgs->flush && source>=sourceLimit) {
1406 /* flush remaining bits to the target */
1407 if(!inDirectMode) {
1408 if(base64Counter!=0) {
1409 if(target<targetLimit) {
1410 *target++=TO_BASE64_IMAP(bits);
1411 if(offsets!=nullptr) {
1412 *offsets++=sourceIndex-1;
1413 }
1414 } else {
1415 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1416 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1417 }
1418 }
1419 /* need to terminate with a minus */
1420 if(target<targetLimit) {
1421 *target++=MINUS;
1422 if(offsets!=nullptr) {
1423 *offsets++=sourceIndex-1;
1424 }
1425 } else {
1426 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1427 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1428 }
1429 }
1430 /* reset the state for the next conversion */
1431 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
1432 } else {
1433 /* set the converter state back into UConverter */
1434 cnv->fromUnicodeStatus=
1435 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1436 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1437 }
1438
1439 /* write back the updated pointers */
1440 pArgs->source=source;
1441 pArgs->target=(char *)target;
1442 pArgs->offsets=offsets;
1443 }
1444 U_CDECL_END
1445
1446 static const UConverterImpl _IMAPImpl={
1447 UCNV_IMAP_MAILBOX,
1448
1449 nullptr,
1450 nullptr,
1451
1452 _UTF7Open,
1453 nullptr,
1454 _UTF7Reset,
1455
1456 _IMAPToUnicodeWithOffsets,
1457 _IMAPToUnicodeWithOffsets,
1458 _IMAPFromUnicodeWithOffsets,
1459 _IMAPFromUnicodeWithOffsets,
1460 nullptr,
1461
1462 nullptr,
1463 nullptr,
1464 nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1465 nullptr,
1466 ucnv_getCompleteUnicodeSet,
1467 nullptr,
1468 nullptr
1469 };
1470
1471 static const UConverterStaticData _IMAPStaticData={
1472 sizeof(UConverterStaticData),
1473 "IMAP-mailbox-name",
1474 0, /* TODO CCSID for IMAP-mailbox-name */
1475 UCNV_IBM, UCNV_IMAP_MAILBOX,
1476 1, 4,
1477 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1478 false, false,
1479 0,
1480 0,
1481 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1482 };
1483
1484 const UConverterSharedData _IMAPData=
1485 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1486
1487 #endif
1488