xref: /aosp_15_r20/external/icu/icu4c/source/common/ucnv_u7.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  ucnv_u7.c
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002jul01
14 *   created by: Markus W. Scherer
15 *
16 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22 
23 #include "cmemory.h"
24 #include "unicode/ucnv.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27 #include "uassert.h"
28 
29 /* UTF-7 -------------------------------------------------------------------- */
30 
31 /*
32  * UTF-7 is a stateful encoding of Unicode.
33  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34  * It was intended for use in Internet email systems, using in its bytewise
35  * encoding only a subset of 7-bit US-ASCII.
36  * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37  * occasionally used.
38  *
39  * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
40  * to be encoded either directly or in base64. In particular, the characters
41  * in set O as defined in the RFC (see below) may be encoded directly but are
42  * not allowed in, e.g., email headers.
43  * By default, the ICU UTF-7 converter encodes set O directly.
44  * By choosing the option "version=1", set O will be escaped instead.
45  * For example:
46  *     utf7Converter=ucnv_open("UTF-7,version=1");
47  *
48  * For details about email headers see RFC 2047.
49  */
50 
51 /*
52  * Tests for US-ASCII characters belonging to character classes
53  * defined in UTF-7.
54  *
55  * Set D (directly encoded characters) consists of the following
56  * characters: the upper and lower case letters A through Z
57  * and a through z, the 10 digits 0-9, and the following nine special
58  * characters (note that "+" and "=" are omitted):
59  *     '(),-./:?
60  *
61  * Set O (optional direct characters) consists of the following
62  * characters (note that "\" and "~" are omitted):
63  *     !"#$%&*;<=>@[]^_`{|}
64  *
65  * According to the rules in RFC 2152, the byte values for the following
66  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67  * - all C0 control codes except for CR LF TAB
68  * - BACKSLASH
69  * - TILDE
70  * - DEL
71  * - all codes beyond US-ASCII, i.e. all >127
72  */
/*
 * Byte classification helpers for UTF-7.
 *
 * All code points are spelled as US-ASCII numbers, not character literals.
 * The range test subtracts the first code point and truncates to uint8_t, so
 * these macros are meant for byte values (0..255); each one may evaluate its
 * argument more than once.
 */

/* true if byte c lies in [first, first+count) */
#define utf7InRange(c, first, count) ((uint8_t)((c)-(first))<(count))

/* set D: letters, digits and '(),-./:? */
#define inSetD(c) \
    (utf7InRange(c, 97, 26) ||  /* a-z */ \
     utf7InRange(c, 65, 26) ||  /* A-Z */ \
     utf7InRange(c, 48, 10) ||  /* 0-9 */ \
     utf7InRange(c, 39, 3) ||   /* '() */ \
     utf7InRange(c, 44, 4) ||   /* ,-./ */ \
     (c)==58 ||                 /* : */ \
     (c)==63                    /* ? */ \
    )

/* set O: !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    (utf7InRange(c, 33, 6) ||   /* !"#$%& */ \
     utf7InRange(c, 59, 4) ||   /* ;<=> */ \
     utf7InRange(c, 93, 4) ||   /* ]^_` */ \
     utf7InRange(c, 123, 3) ||  /* {|} */ \
     (c)==42 ||                 /* * */ \
     (c)==64 ||                 /* @ */ \
     (c)==91                    /* [ */ \
    )

/* TAB, LF, CR */
#define isCRLFTAB(c) ((c)==9 || (c)==10 || (c)==13)
/* SPACE, or one of TAB, LF, CR */
#define isCRLFSPTAB(c) ((c)==32 || isCRLFTAB(c))

#define PLUS  43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/*
 * Legal byte values: code points 32..125 (space up to, but not including,
 * tilde) except for backslash, plus CR LF TAB.
 */
#define isLegalUTF7(c) ((utf7InRange(c, 32, 94) && (c)!=BACKSLASH) || isCRLFTAB(c))
99 
100 /* encode directly sets D and O and CR LF SP TAB */
101 static const UBool encodeDirectlyMaximum[128]={
102  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
103     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
104     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 
106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
107     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108 
109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
111 
112     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
114 };
115 
116 /* encode directly set D and CR LF SP TAB but not set O */
117 static const UBool encodeDirectlyRestricted[128]={
118  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
119     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
120     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 
122     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
123     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
124 
125     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
127 
128     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
130 };
131 
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 character:
 * 0..25 -> A-Z, 26..51 -> a-z, 52..61 -> 0-9, 62 -> '+', 63 -> '/'.
 */
static const uint8_t
toBase64[64]={
    /*  0.. 7: A-H */  65,  66,  67,  68,  69,  70,  71,  72,
    /*  8..15: I-P */  73,  74,  75,  76,  77,  78,  79,  80,
    /* 16..23: Q-X */  81,  82,  83,  84,  85,  86,  87,  88,
    /* 24..31: Y-f */  89,  90,  97,  98,  99, 100, 101, 102,
    /* 32..39: g-n */ 103, 104, 105, 106, 107, 108, 109, 110,
    /* 40..47: o-v */ 111, 112, 113, 114, 115, 116, 117, 118,
    /* 48..55: w-3 */ 119, 120, 121, 122,  48,  49,  50,  51,
    /* 56..63: 4-/ */  52,  53,  54,  55,  56,  57,  43,  47
};
145 
/*
 * Classifies a US-ASCII byte (0..127) for the base64 decoder:
 *   0..63  the 6-bit value of a base64 character
 *   -1     legal in UTF-7 but not a base64 character (includes CR LF TAB)
 *   -2     the minus sign
 *   -3     not legal in UTF-7 at all
 */
static const int8_t
fromBase64[128]={
    /* 0x00: C0 controls; TAB LF CR are legal */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    /* 0x10: C0 controls */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* 0x20: punctuation; '+' is 62, '-' is the special -2, '/' is 63 */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* 0x30: digits 0-9, then :;<=>? */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* 0x40: '@', A-O */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x50: P-Z, then '[', illegal backslash, ']^_' */
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* 0x60: '`', a-o */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    /* 0x70: p-z, then '{|}', illegal tilde and DEL */
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165 
166 /*
167  * converter status values:
168  *
169  * toUnicodeStatus:
170  *     24 inDirectMode (boolean)
171  * 23..16 base64Counter (-1..7)
172  * 15..0  bits (up to 14 bits incoming base64)
173  *
174  * fromUnicodeStatus:
175  * 31..28 version (0: set O direct  1: set O escaped)
176  *     24 inDirectMode (boolean)
177  * 23..16 base64Counter (0..2)
178  *  7..0  bits (6 bits outgoing base64)
179  *
180  */
181 
182 U_CDECL_BEGIN
183 static void U_CALLCONV
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)184 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
185     if(choice<=UCNV_RESET_TO_UNICODE) {
186         /* reset toUnicode */
187         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=true */
188         cnv->toULength=0;
189     }
190     if(choice!=UCNV_RESET_TO_UNICODE) {
191         /* reset fromUnicode */
192         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
193     }
194 }
195 
196 static void U_CALLCONV
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)197 _UTF7Open(UConverter *cnv,
198           UConverterLoadArgs *pArgs,
199           UErrorCode *pErrorCode) {
200     (void)pArgs;
201     if(UCNV_GET_VERSION(cnv)<=1) {
202         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
203         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
204         _UTF7Reset(cnv, UCNV_RESET_BOTH);
205     } else {
206         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207     }
208 }
209 
/*
 * Converts UTF-7 bytes from pArgs->source into UTF-16 code units at
 * pArgs->target, and writes source offsets when pArgs->offsets is not null.
 *
 * The decoder state (inDirectMode, base64Counter, bits) is unpacked from
 * cnv->toUnicodeStatus on entry and packed back on exit; bytes of a sequence
 * that is still pending or in error are kept in cnv->toUBytes/toULength.
 *
 * Sets *pErrorCode to
 * - U_ILLEGAL_CHAR_FOUND for an illegal byte or an ill-formed base64 sequence
 * - U_BUFFER_OVERFLOW_ERROR when the target is full while source bytes remain
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
 */
210 static void U_CALLCONV
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)211 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
212                           UErrorCode *pErrorCode) {
213     UConverter *cnv;
214     const uint8_t *source, *sourceLimit;
215     char16_t *target;
216     const char16_t *targetLimit;
217     int32_t *offsets;
218 
219     uint8_t *bytes;
220     uint8_t byteIndex;
221 
222     int32_t length, targetCapacity;
223 
224     /* UTF-7 state */
225     uint16_t bits;
226     int8_t base64Counter;
227     UBool inDirectMode;
228 
229     int8_t base64Value;
230 
231     int32_t sourceIndex, nextSourceIndex;
232 
233     uint8_t b;
234     /* set up the local pointers */
235     cnv=pArgs->converter;
236 
237     source=(const uint8_t *)pArgs->source;
238     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
239     target=pArgs->target;
240     targetLimit=pArgs->targetLimit;
241     offsets=pArgs->offsets;
242     /* get the state machine state */
243     {
244         uint32_t status=cnv->toUnicodeStatus;
245         inDirectMode=(UBool)((status>>24)&1);
246         base64Counter=(int8_t)(status>>16);
247         bits=(uint16_t)status;
248     }
249     bytes=cnv->toUBytes;
250     byteIndex=cnv->toULength;
251 
252     /* sourceIndex=-1 if the current character began in the previous buffer */
253     sourceIndex=byteIndex==0 ? 0 : -1;
254     nextSourceIndex=0;
255 
256     if(inDirectMode) {
257 directMode:
258         /*
259          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
260          * with their US-ASCII byte values.
261          * Backslash and Tilde and most control characters are not allowed in UTF-7.
262          * A plus sign starts Unicode (or "escape") Mode.
263          *
264          * In Direct Mode, only the sourceIndex is used.
265          */
266         byteIndex=0;
        /*
         * Each source byte yields at most one char16_t here, so capping the
         * loop count at the target capacity removes the need for a target
         * check per byte.
         */
267         length=(int32_t)(sourceLimit-source);
268         targetCapacity=(int32_t)(targetLimit-target);
269         if(length>targetCapacity) {
270             length=targetCapacity;
271         }
272         while(length>0) {
273             b=*source++;
274             if(!isLegalUTF7(b)) {
275                 /* illegal */
276                 bytes[0]=b;
277                 byteIndex=1;
278                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
279                 break;
280             } else if(b!=PLUS) {
281                 /* write directly encoded character */
282                 *target++=b;
283                 if(offsets!=nullptr) {
284                     *offsets++=sourceIndex++;
285                 }
286             } else /* PLUS */ {
287                 /* switch to Unicode mode */
288                 nextSourceIndex=++sourceIndex;
289                 inDirectMode=false;
290                 byteIndex=0;
291                 bits=0;
292                 base64Counter=-1;
293                 goto unicodeMode;
294             }
295             --length;
296         }
297         if(source<sourceLimit && target>=targetLimit) {
298             /* target is full */
299             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
300         }
301     } else {
302 unicodeMode:
303         /*
304          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
305          * The base64 sequence ends with any character that is not in the base64 alphabet.
306          * A terminating minus sign is consumed.
307          *
308          * In Unicode Mode, the sourceIndex has the index to the start of the current
309          * base64 bytes, while nextSourceIndex is precisely parallel to source,
310          * keeping the index to the following byte.
311          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
312          */
313         while(source<sourceLimit) {
314             if(target<targetLimit) {
315                 bytes[byteIndex++]=b=*source++;
316                 ++nextSourceIndex;
317                 base64Value = -3; /* initialize as illegal */
                /*
                 * The b>=126 test comes first so that fromBase64[], which has
                 * 128 entries, is never indexed with a byte above 127;
                 * base64Value then keeps its initial -3 (illegal).
                 */
318                 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
319                     /* either
320                      * base64Value==-1 for any legal character except base64 and minus sign, or
321                      * base64Value==-3 for illegal characters:
322                      * 1. In either case, leave Unicode mode.
323                      * 2.1. If we ended with an incomplete char16_t or none after the +, then
324                      *      generate an error for the preceding erroneous sequence and deal with
325                      *      the current (possibly illegal) character next time through.
326                      * 2.2. Else the current char comes after a complete char16_t, which was already
327                      *      pushed to the output buf, so:
328                      * 2.2.1. If the current char is legal, just save it for processing next time.
329                      *        It may be for example, a plus which we need to deal with in direct mode.
330                      * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
331                      */
332                     inDirectMode=true;
333                     if(base64Counter==-1) {
334                         /* illegal: + immediately followed by something other than base64 or minus sign */
335                         /* include the plus sign in the reported sequence, but not the subsequent char */
336                         --source;
337                         bytes[0]=PLUS;
338                         byteIndex=1;
339                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
340                         break;
341                     } else if(bits!=0) {
342                         /* bits are illegally left over, a char16_t is incomplete */
343                         /* don't include current char (legal or illegal) in error seq */
344                         --source;
345                         --byteIndex;
346                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347                         break;
348                     } else {
349                         /* previous char16_t was complete */
350                         if(base64Value==-3) {
351                             /* current character is illegal, deal with it here */
352                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
353                             break;
354                         } else {
355                             /* un-read the current character in case it is a plus sign */
356                             --source;
357                             sourceIndex=nextSourceIndex-1;
358                             goto directMode;
359                         }
360                     }
361                 } else if(base64Value>=0) {
362                     /* collect base64 bytes into UChars */
                    /*
                     * base64Counter is the position within a group of 8 base64
                     * characters (8*6=48 bits, i.e. three char16_t).
                     * Positions 2, 5 and 7 complete a char16_t; positions 2
                     * and 5 leave 2 and 4 bits over for the next one, and
                     * position 7 leaves none, which restarts the cycle at 0.
                     */
363                     switch(base64Counter) {
364                     case -1: /* -1 is immediately after the + */
365                     case 0:
366                         bits=base64Value;
367                         base64Counter=1;
368                         break;
369                     case 1:
370                     case 3:
371                     case 4:
372                     case 6:
373                         bits=(uint16_t)((bits<<6)|base64Value);
374                         ++base64Counter;
375                         break;
376                     case 2:
377                         *target++=(char16_t)((bits<<4)|(base64Value>>2));
378                         if(offsets!=nullptr) {
379                             *offsets++=sourceIndex;
380                             sourceIndex=nextSourceIndex-1;
381                         }
382                         bytes[0]=b; /* keep this byte in case an error occurs */
383                         byteIndex=1;
384                         bits=(uint16_t)(base64Value&3);
385                         base64Counter=3;
386                         break;
387                     case 5:
388                         *target++=(char16_t)((bits<<2)|(base64Value>>4));
389                         if(offsets!=nullptr) {
390                             *offsets++=sourceIndex;
391                             sourceIndex=nextSourceIndex-1;
392                         }
393                         bytes[0]=b; /* keep this byte in case an error occurs */
394                         byteIndex=1;
395                         bits=(uint16_t)(base64Value&15);
396                         base64Counter=6;
397                         break;
398                     case 7:
399                         *target++=(char16_t)((bits<<6)|base64Value);
400                         if(offsets!=nullptr) {
401                             *offsets++=sourceIndex;
402                             sourceIndex=nextSourceIndex;
403                         }
404                         byteIndex=0;
405                         bits=0;
406                         base64Counter=0;
407                         break;
408                     default:
409                         /* will never occur */
410                         break;
411                     }
412                 } else /*base64Value==-2*/ {
413                     /* minus sign terminates the base64 sequence */
414                     inDirectMode=true;
415                     if(base64Counter==-1) {
416                         /* +- i.e. a minus immediately following a plus */
417                         *target++=PLUS;
418                         if(offsets!=nullptr) {
419                             *offsets++=sourceIndex-1;
420                         }
421                     } else {
422                         /* absorb the minus and leave the Unicode Mode */
423                         if(bits!=0) {
424                             /* bits are illegally left over, a char16_t is incomplete */
425                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
426                             break;
427                         }
428                     }
429                     sourceIndex=nextSourceIndex;
430                     goto directMode;
431                 }
432             } else {
433                 /* target is full */
434                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
435                 break;
436             }
437         }
438     }
439 
440     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
441         /*
442          * if we are in Unicode mode, then the byteIndex might not be 0,
443          * but that is ok if bits==0
444          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
445          * (not true for IMAP-mailbox-name where we must end in direct mode)
446          */
447         byteIndex=0;
448     }
449 
450     /* set the converter state back into UConverter */
    /* layout: bit 24 inDirectMode, bits 23..16 base64Counter, bits 15..0 bits */
451     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
452     cnv->toULength=byteIndex;
453 
454     /* write back the updated pointers */
455     pArgs->source=(const char *)source;
456     pArgs->target=target;
457     pArgs->offsets=offsets;
458 }
459 
460 static void U_CALLCONV
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)461 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
462                             UErrorCode *pErrorCode) {
463     UConverter *cnv;
464     const char16_t *source, *sourceLimit;
465     uint8_t *target, *targetLimit;
466     int32_t *offsets;
467 
468     int32_t length, targetCapacity, sourceIndex;
469     char16_t c;
470 
471     /* UTF-7 state */
472     const UBool *encodeDirectly;
473     uint8_t bits;
474     int8_t base64Counter;
475     UBool inDirectMode;
476 
477     /* set up the local pointers */
478     cnv=pArgs->converter;
479 
480     /* set up the local pointers */
481     source=pArgs->source;
482     sourceLimit=pArgs->sourceLimit;
483     target=(uint8_t *)pArgs->target;
484     targetLimit=(uint8_t *)pArgs->targetLimit;
485     offsets=pArgs->offsets;
486 
487     /* get the state machine state */
488     {
489         uint32_t status=cnv->fromUnicodeStatus;
490         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
491         inDirectMode=(UBool)((status>>24)&1);
492         base64Counter=(int8_t)(status>>16);
493         bits=(uint8_t)status;
494         U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
495     }
496 
497     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
498     sourceIndex=0;
499 
500     if(inDirectMode) {
501 directMode:
502         length=(int32_t)(sourceLimit-source);
503         targetCapacity=(int32_t)(targetLimit-target);
504         if(length>targetCapacity) {
505             length=targetCapacity;
506         }
507         while(length>0) {
508             c=*source++;
509             /* currently always encode CR LF SP TAB directly */
510             if(c<=127 && encodeDirectly[c]) {
511                 /* encode directly */
512                 *target++=(uint8_t)c;
513                 if(offsets!=nullptr) {
514                     *offsets++=sourceIndex++;
515                 }
516             } else if(c==PLUS) {
517                 /* output +- for + */
518                 *target++=PLUS;
519                 if(target<targetLimit) {
520                     *target++=MINUS;
521                     if(offsets!=nullptr) {
522                         *offsets++=sourceIndex;
523                         *offsets++=sourceIndex++;
524                     }
525                     /* realign length and targetCapacity */
526                     goto directMode;
527                 } else {
528                     if(offsets!=nullptr) {
529                         *offsets++=sourceIndex++;
530                     }
531                     cnv->charErrorBuffer[0]=MINUS;
532                     cnv->charErrorBufferLength=1;
533                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
534                     break;
535                 }
536             } else {
537                 /* un-read this character and switch to Unicode Mode */
538                 --source;
539                 *target++=PLUS;
540                 if(offsets!=nullptr) {
541                     *offsets++=sourceIndex;
542                 }
543                 inDirectMode=false;
544                 base64Counter=0;
545                 goto unicodeMode;
546             }
547             --length;
548         }
549         if(source<sourceLimit && target>=targetLimit) {
550             /* target is full */
551             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
552         }
553     } else {
554 unicodeMode:
555         while(source<sourceLimit) {
556             if(target<targetLimit) {
557                 c=*source++;
558                 if(c<=127 && encodeDirectly[c]) {
559                     /* encode directly */
560                     inDirectMode=true;
561 
562                     /* trick: back out this character to make this easier */
563                     --source;
564 
565                     /* terminate the base64 sequence */
566                     if(base64Counter!=0) {
567                         /* write remaining bits for the previous character */
568                         *target++=toBase64[bits];
569                         if(offsets!=nullptr) {
570                             *offsets++=sourceIndex-1;
571                         }
572                     }
573                     if(fromBase64[c]!=-1) {
574                         /* need to terminate with a minus */
575                         if(target<targetLimit) {
576                             *target++=MINUS;
577                             if(offsets!=nullptr) {
578                                 *offsets++=sourceIndex-1;
579                             }
580                         } else {
581                             cnv->charErrorBuffer[0]=MINUS;
582                             cnv->charErrorBufferLength=1;
583                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
584                             break;
585                         }
586                     }
587                     goto directMode;
588                 } else {
589                     /*
590                      * base64 this character:
591                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
592                      * and the bits of this character, each implicitly in UTF-16BE.
593                      *
594                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
595                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
596                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
597                      */
598                     switch(base64Counter) {
599                     case 0:
600                         *target++=toBase64[c>>10];
601                         if(target<targetLimit) {
602                             *target++=toBase64[(c>>4)&0x3f];
603                             if(offsets!=nullptr) {
604                                 *offsets++=sourceIndex;
605                                 *offsets++=sourceIndex++;
606                             }
607                         } else {
608                             if(offsets!=nullptr) {
609                                 *offsets++=sourceIndex++;
610                             }
611                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
612                             cnv->charErrorBufferLength=1;
613                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
614                         }
615                         bits=(uint8_t)((c&15)<<2);
616                         base64Counter=1;
617                         break;
618                     case 1:
619                         *target++=toBase64[bits|(c>>14)];
620                         if(target<targetLimit) {
621                             *target++=toBase64[(c>>8)&0x3f];
622                             if(target<targetLimit) {
623                                 *target++=toBase64[(c>>2)&0x3f];
624                                 if(offsets!=nullptr) {
625                                     *offsets++=sourceIndex;
626                                     *offsets++=sourceIndex;
627                                     *offsets++=sourceIndex++;
628                                 }
629                             } else {
630                                 if(offsets!=nullptr) {
631                                     *offsets++=sourceIndex;
632                                     *offsets++=sourceIndex++;
633                                 }
634                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
635                                 cnv->charErrorBufferLength=1;
636                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
637                             }
638                         } else {
639                             if(offsets!=nullptr) {
640                                 *offsets++=sourceIndex++;
641                             }
642                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
643                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
644                             cnv->charErrorBufferLength=2;
645                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
646                         }
647                         bits=(uint8_t)((c&3)<<4);
648                         base64Counter=2;
649                         break;
650                     case 2:
651                         *target++=toBase64[bits|(c>>12)];
652                         if(target<targetLimit) {
653                             *target++=toBase64[(c>>6)&0x3f];
654                             if(target<targetLimit) {
655                                 *target++=toBase64[c&0x3f];
656                                 if(offsets!=nullptr) {
657                                     *offsets++=sourceIndex;
658                                     *offsets++=sourceIndex;
659                                     *offsets++=sourceIndex++;
660                                 }
661                             } else {
662                                 if(offsets!=nullptr) {
663                                     *offsets++=sourceIndex;
664                                     *offsets++=sourceIndex++;
665                                 }
666                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
667                                 cnv->charErrorBufferLength=1;
668                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
669                             }
670                         } else {
671                             if(offsets!=nullptr) {
672                                 *offsets++=sourceIndex++;
673                             }
674                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
675                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
676                             cnv->charErrorBufferLength=2;
677                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
678                         }
679                         bits=0;
680                         base64Counter=0;
681                         break;
682                     default:
683                         /* will never occur */
684                         break;
685                     }
686                 }
687             } else {
688                 /* target is full */
689                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
690                 break;
691             }
692         }
693     }
694 
695     if(pArgs->flush && source>=sourceLimit) {
696         /* flush remaining bits to the target */
697         if(!inDirectMode) {
698             if (base64Counter!=0) {
699                 if(target<targetLimit) {
700                     *target++=toBase64[bits];
701                     if(offsets!=nullptr) {
702                         *offsets++=sourceIndex-1;
703                     }
704                 } else {
705                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
706                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
707                 }
708             }
709             /* Add final MINUS to terminate unicodeMode */
710             if(target<targetLimit) {
711                 *target++=MINUS;
712                 if(offsets!=nullptr) {
713                     *offsets++=sourceIndex-1;
714                 }
715             } else {
716                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
717                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
718             }
719         }
720         /* reset the state for the next conversion */
721         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
722     } else {
723         /* set the converter state back into UConverter */
724         cnv->fromUnicodeStatus=
725             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
726             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
727     }
728 
729     /* write back the updated pointers */
730     pArgs->source=source;
731     pArgs->target=(char *)target;
732     pArgs->offsets=offsets;
733 }
734 
735 static const char * U_CALLCONV
_UTF7GetName(const UConverter * cnv)736 _UTF7GetName(const UConverter *cnv) {
737     switch(cnv->fromUnicodeStatus>>28) {
738     case 1:
739         return "UTF-7,version=1";
740     default:
741         return "UTF-7";
742     }
743 }
744 U_CDECL_END
745 
/*
 * Function table for the UTF-7 converter. It plugs in the open, reset,
 * to-Unicode, from-Unicode and get-name functions defined in this file plus
 * ucnv_getCompleteUnicodeSet; each nullptr entry is a slot left unset.
 * The same function is used for both entries of the to-Unicode pair and of
 * the from-Unicode pair.
 * NOTE(review): the meaning of each positional slot comes from the
 * UConverterImpl declaration, which is not visible here -- confirm there
 * before inserting or reordering entries.
 */
746 static const UConverterImpl _UTF7Impl={
747     UCNV_UTF7,
748 
749     nullptr,
750     nullptr,
751 
752     _UTF7Open,
753     nullptr,
754     _UTF7Reset,
755 
756     _UTF7ToUnicodeWithOffsets,
757     _UTF7ToUnicodeWithOffsets,
758     _UTF7FromUnicodeWithOffsets,
759     _UTF7FromUnicodeWithOffsets,
760     nullptr,
761 
762     nullptr,
763     _UTF7GetName,
764     nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
765     nullptr,
766     ucnv_getCompleteUnicodeSet,
767 
768     nullptr,
769     nullptr
770 };
771 
772 static const UConverterStaticData _UTF7StaticData={
773     sizeof(UConverterStaticData),
774     "UTF-7",
775     0, /* TODO CCSID for UTF-7 */
776     UCNV_IBM, UCNV_UTF7,
777     1, 4,
778     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
779     false, false,
780     0,
781     0,
782     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
783 };
784 
785 const UConverterSharedData _UTF7Data=
786         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
787 
788 /* IMAP mailbox name encoding ----------------------------------------------- */
789 
790 /*
791  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
792  * http://www.ietf.org/rfc/rfc2060.txt
793  *
794  * 5.1.3.  Mailbox International Naming Convention
795  *
796  * By convention, international mailbox names are specified using a
797  * modified version of the UTF-7 encoding described in [UTF-7].  The
798  * purpose of these modifications is to correct the following problems
799  * with UTF-7:
800  *
801  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
802  *       the common use of "+" in mailbox names, in particular USENET
803  *       newsgroup names.
804  *
805  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
806  *       conflicts with the use of "/" as a popular hierarchy delimiter.
807  *
808  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
809  *       the use of "\" as a popular hierarchy delimiter.
810  *
811  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
812  *       the use of "~" in some servers as a home directory indicator.
813  *
814  *    5) UTF-7 permits multiple alternate forms to represent the same
815  *       string; in particular, printable US-ASCII characters can be
816  *       represented in encoded form.
817  *
818  * In modified UTF-7, printable US-ASCII characters except for "&"
819  * represent themselves; that is, characters with octet values 0x20-0x25
820  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
821  * octet sequence "&-".
822  *
823  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
824  * Unicode 16-bit octets) are represented in modified BASE64, with a
825  * further modification from [UTF-7] that "," is used instead of "/".
826  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
827  * character which can represent itself.
828  *
829  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 * is, a name that ends with a Unicode 16-bit octet MUST end with a
 * "-").
833  *
834  * For example, here is a mailbox name which mixes English, Japanese,
835  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
836  */
837 
838 /*
839  * Tests for US-ASCII characters belonging to character classes
840  * defined in UTF-7.
841  *
842  * Set D (directly encoded characters) consists of the following
843  * characters: the upper and lower case letters A through Z
844  * and a through z, the 10 digits 0-9, and the following nine special
845  * characters (note that "+" and "=" are omitted):
846  *     '(),-./:?
847  *
848  * Set O (optional direct characters) consists of the following
849  * characters (note that "\" and "~" are omitted):
850  *     !"#$%&*;<=>@[]^_`{|}
851  *
852  * According to the rules in RFC 2152, the byte values for the following
853  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
854  * - all C0 control codes except for CR LF TAB
855  * - BACKSLASH
856  * - TILDE
857  * - DEL
858  * - all codes beyond US-ASCII, i.e. all >127
859  */
860 
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized so that expression arguments parse correctly)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* modified base64 output: the value 63 is written as ',' instead of the toBase64[] entry */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* modified base64 input: ',' decodes to 63, '/' yields -1, all other bytes use fromBase64[] */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
874 
875 /*
876  * converter status values:
877  *
878  * toUnicodeStatus:
879  *     24 inDirectMode (boolean)
880  * 23..16 base64Counter (-1..7)
881  * 15..0  bits (up to 14 bits incoming base64)
882  *
883  * fromUnicodeStatus:
884  *     24 inDirectMode (boolean)
885  * 23..16 base64Counter (0..2)
886  *  7..0  bits (6 bits outgoing base64)
887  *
888  * ignore bits 31..25
889  */
890 
891 U_CDECL_BEGIN
892 static void U_CALLCONV
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)893 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
894                           UErrorCode *pErrorCode) {
895     UConverter *cnv;
896     const uint8_t *source, *sourceLimit;
897     char16_t *target;
898     const char16_t *targetLimit;
899     int32_t *offsets;
900 
901     uint8_t *bytes;
902     uint8_t byteIndex;
903 
904     int32_t length, targetCapacity;
905 
906     /* UTF-7 state */
907     uint16_t bits;
908     int8_t base64Counter;
909     UBool inDirectMode;
910 
911     int8_t base64Value;
912 
913     int32_t sourceIndex, nextSourceIndex;
914 
915     char16_t c;
916     uint8_t b;
917 
918     /* set up the local pointers */
919     cnv=pArgs->converter;
920 
921     source=(const uint8_t *)pArgs->source;
922     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
923     target=pArgs->target;
924     targetLimit=pArgs->targetLimit;
925     offsets=pArgs->offsets;
926     /* get the state machine state */
927     {
928         uint32_t status=cnv->toUnicodeStatus;
929         inDirectMode=(UBool)((status>>24)&1);
930         base64Counter=(int8_t)(status>>16);
931         bits=(uint16_t)status;
932     }
933     bytes=cnv->toUBytes;
934     byteIndex=cnv->toULength;
935 
936     /* sourceIndex=-1 if the current character began in the previous buffer */
937     sourceIndex=byteIndex==0 ? 0 : -1;
938     nextSourceIndex=0;
939 
940     if(inDirectMode) {
941 directMode:
942         /*
943          * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
944          * with their US-ASCII byte values.
945          * An ampersand starts Unicode (or "escape") Mode.
946          *
947          * In Direct Mode, only the sourceIndex is used.
948          */
949         byteIndex=0;
950         length=(int32_t)(sourceLimit-source);
951         targetCapacity=(int32_t)(targetLimit-target);
952         if(length>targetCapacity) {
953             length=targetCapacity;
954         }
955         while(length>0) {
956             b=*source++;
957             if(!isLegalIMAP(b)) {
958                 /* illegal */
959                 bytes[0]=b;
960                 byteIndex=1;
961                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
962                 break;
963             } else if(b!=AMPERSAND) {
964                 /* write directly encoded character */
965                 *target++=b;
966                 if(offsets!=nullptr) {
967                     *offsets++=sourceIndex++;
968                 }
969             } else /* AMPERSAND */ {
970                 /* switch to Unicode mode */
971                 nextSourceIndex=++sourceIndex;
972                 inDirectMode=false;
973                 byteIndex=0;
974                 bits=0;
975                 base64Counter=-1;
976                 goto unicodeMode;
977             }
978             --length;
979         }
980         if(source<sourceLimit && target>=targetLimit) {
981             /* target is full */
982             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
983         }
984     } else {
985 unicodeMode:
986         /*
987          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
988          * The base64 sequence ends with any character that is not in the base64 alphabet.
989          * A terminating minus sign is consumed.
990          * US-ASCII must not be base64-ed.
991          *
992          * In Unicode Mode, the sourceIndex has the index to the start of the current
993          * base64 bytes, while nextSourceIndex is precisely parallel to source,
994          * keeping the index to the following byte.
995          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
996          */
997         while(source<sourceLimit) {
998             if(target<targetLimit) {
999                 bytes[byteIndex++]=b=*source++;
1000                 ++nextSourceIndex;
1001                 if(b>0x7e) {
1002                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1003                     inDirectMode=true;
1004                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1005                     break;
1006                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1007                     /* collect base64 bytes into UChars */
1008                     switch(base64Counter) {
1009                     case -1: /* -1 is immediately after the & */
1010                     case 0:
1011                         bits=base64Value;
1012                         base64Counter=1;
1013                         break;
1014                     case 1:
1015                     case 3:
1016                     case 4:
1017                     case 6:
1018                         bits=(uint16_t)((bits<<6)|base64Value);
1019                         ++base64Counter;
1020                         break;
1021                     case 2:
1022                         c=(char16_t)((bits<<4)|(base64Value>>2));
1023                         if(isLegalIMAP(c)) {
1024                             /* illegal */
1025                             inDirectMode=true;
1026                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1027                             goto endloop;
1028                         }
1029                         *target++=c;
1030                         if(offsets!=nullptr) {
1031                             *offsets++=sourceIndex;
1032                             sourceIndex=nextSourceIndex-1;
1033                         }
1034                         bytes[0]=b; /* keep this byte in case an error occurs */
1035                         byteIndex=1;
1036                         bits=(uint16_t)(base64Value&3);
1037                         base64Counter=3;
1038                         break;
1039                     case 5:
1040                         c=(char16_t)((bits<<2)|(base64Value>>4));
1041                         if(isLegalIMAP(c)) {
1042                             /* illegal */
1043                             inDirectMode=true;
1044                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1045                             goto endloop;
1046                         }
1047                         *target++=c;
1048                         if(offsets!=nullptr) {
1049                             *offsets++=sourceIndex;
1050                             sourceIndex=nextSourceIndex-1;
1051                         }
1052                         bytes[0]=b; /* keep this byte in case an error occurs */
1053                         byteIndex=1;
1054                         bits=(uint16_t)(base64Value&15);
1055                         base64Counter=6;
1056                         break;
1057                     case 7:
1058                         c=(char16_t)((bits<<6)|base64Value);
1059                         if(isLegalIMAP(c)) {
1060                             /* illegal */
1061                             inDirectMode=true;
1062                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1063                             goto endloop;
1064                         }
1065                         *target++=c;
1066                         if(offsets!=nullptr) {
1067                             *offsets++=sourceIndex;
1068                             sourceIndex=nextSourceIndex;
1069                         }
1070                         byteIndex=0;
1071                         bits=0;
1072                         base64Counter=0;
1073                         break;
1074                     default:
1075                         /* will never occur */
1076                         break;
1077                     }
1078                 } else if(base64Value==-2) {
1079                     /* minus sign terminates the base64 sequence */
1080                     inDirectMode=true;
1081                     if(base64Counter==-1) {
1082                         /* &- i.e. a minus immediately following an ampersand */
1083                         *target++=AMPERSAND;
1084                         if(offsets!=nullptr) {
1085                             *offsets++=sourceIndex-1;
1086                         }
1087                     } else {
1088                         /* absorb the minus and leave the Unicode Mode */
1089                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1090                             /* bits are illegally left over, a char16_t is incomplete */
1091                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1092                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1093                             break;
1094                         }
1095                     }
1096                     sourceIndex=nextSourceIndex;
1097                     goto directMode;
1098                 } else {
1099                     if(base64Counter==-1) {
1100                         /* illegal: & immediately followed by something other than base64 or minus sign */
1101                         /* include the ampersand in the reported sequence */
1102                         --sourceIndex;
1103                         bytes[0]=AMPERSAND;
1104                         bytes[1]=b;
1105                         byteIndex=2;
1106                     }
1107                     /* base64Value==-1 for characters that are illegal only in Unicode mode */
1108                     /* base64Value==-3 for illegal characters */
1109                     /* illegal */
1110                     inDirectMode=true;
1111                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1112                     break;
1113                 }
1114             } else {
1115                 /* target is full */
1116                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1117                 break;
1118             }
1119         }
1120     }
1121 endloop:
1122 
1123     /*
1124      * the end of the input stream and detection of truncated input
1125      * are handled by the framework, but here we must check if we are in Unicode
1126      * mode and byteIndex==0 because we must end in direct mode
1127      *
1128      * conditions:
1129      *   successful
1130      *   in Unicode mode and byteIndex==0
1131      *   end of input and no truncated input
1132      */
1133     if( U_SUCCESS(*pErrorCode) &&
1134         !inDirectMode && byteIndex==0 &&
1135         pArgs->flush && source>=sourceLimit
1136     ) {
1137         if(base64Counter==-1) {
1138             /* & at the very end of the input */
1139             /* make the ampersand the reported sequence */
1140             bytes[0]=AMPERSAND;
1141             byteIndex=1;
1142         }
1143         /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1144 
1145         inDirectMode=true; /* avoid looping */
1146         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1147     }
1148 
1149     /* set the converter state back into UConverter */
1150     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1151     cnv->toULength=byteIndex;
1152 
1153     /* write back the updated pointers */
1154     pArgs->source=(const char *)source;
1155     pArgs->target=target;
1156     pArgs->offsets=offsets;
1157 }
1158 
1159 static void U_CALLCONV
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1160 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1161                             UErrorCode *pErrorCode) {
1162     UConverter *cnv;
1163     const char16_t *source, *sourceLimit;
1164     uint8_t *target, *targetLimit;
1165     int32_t *offsets;
1166 
1167     int32_t length, targetCapacity, sourceIndex;
1168     char16_t c;
1169     uint8_t b;
1170 
1171     /* UTF-7 state */
1172     uint8_t bits;
1173     int8_t base64Counter;
1174     UBool inDirectMode;
1175 
1176     /* set up the local pointers */
1177     cnv=pArgs->converter;
1178 
1179     /* set up the local pointers */
1180     source=pArgs->source;
1181     sourceLimit=pArgs->sourceLimit;
1182     target=(uint8_t *)pArgs->target;
1183     targetLimit=(uint8_t *)pArgs->targetLimit;
1184     offsets=pArgs->offsets;
1185 
1186     /* get the state machine state */
1187     {
1188         uint32_t status=cnv->fromUnicodeStatus;
1189         inDirectMode=(UBool)((status>>24)&1);
1190         base64Counter=(int8_t)(status>>16);
1191         bits=(uint8_t)status;
1192     }
1193 
1194     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1195     sourceIndex=0;
1196 
1197     if(inDirectMode) {
1198 directMode:
1199         length=(int32_t)(sourceLimit-source);
1200         targetCapacity=(int32_t)(targetLimit-target);
1201         if(length>targetCapacity) {
1202             length=targetCapacity;
1203         }
1204         while(length>0) {
1205             c=*source++;
1206             /* encode 0x20..0x7e except '&' directly */
1207             if(inSetDIMAP(c)) {
1208                 /* encode directly */
1209                 *target++=(uint8_t)c;
1210                 if(offsets!=nullptr) {
1211                     *offsets++=sourceIndex++;
1212                 }
1213             } else if(c==AMPERSAND) {
1214                 /* output &- for & */
1215                 *target++=AMPERSAND;
1216                 if(target<targetLimit) {
1217                     *target++=MINUS;
1218                     if(offsets!=nullptr) {
1219                         *offsets++=sourceIndex;
1220                         *offsets++=sourceIndex++;
1221                     }
1222                     /* realign length and targetCapacity */
1223                     goto directMode;
1224                 } else {
1225                     if(offsets!=nullptr) {
1226                         *offsets++=sourceIndex++;
1227                     }
1228                     cnv->charErrorBuffer[0]=MINUS;
1229                     cnv->charErrorBufferLength=1;
1230                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1231                     break;
1232                 }
1233             } else {
1234                 /* un-read this character and switch to Unicode Mode */
1235                 --source;
1236                 *target++=AMPERSAND;
1237                 if(offsets!=nullptr) {
1238                     *offsets++=sourceIndex;
1239                 }
1240                 inDirectMode=false;
1241                 base64Counter=0;
1242                 goto unicodeMode;
1243             }
1244             --length;
1245         }
1246         if(source<sourceLimit && target>=targetLimit) {
1247             /* target is full */
1248             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249         }
1250     } else {
1251 unicodeMode:
1252         while(source<sourceLimit) {
1253             if(target<targetLimit) {
1254                 c=*source++;
1255                 if(isLegalIMAP(c)) {
1256                     /* encode directly */
1257                     inDirectMode=true;
1258 
1259                     /* trick: back out this character to make this easier */
1260                     --source;
1261 
1262                     /* terminate the base64 sequence */
1263                     if(base64Counter!=0) {
1264                         /* write remaining bits for the previous character */
1265                         *target++=TO_BASE64_IMAP(bits);
1266                         if(offsets!=nullptr) {
1267                             *offsets++=sourceIndex-1;
1268                         }
1269                     }
1270                     /* need to terminate with a minus */
1271                     if(target<targetLimit) {
1272                         *target++=MINUS;
1273                         if(offsets!=nullptr) {
1274                             *offsets++=sourceIndex-1;
1275                         }
1276                     } else {
1277                         cnv->charErrorBuffer[0]=MINUS;
1278                         cnv->charErrorBufferLength=1;
1279                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1280                         break;
1281                     }
1282                     goto directMode;
1283                 } else {
1284                     /*
1285                      * base64 this character:
1286                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1287                      * and the bits of this character, each implicitly in UTF-16BE.
1288                      *
1289                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1290                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
1291                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1292                      */
1293                     switch(base64Counter) {
1294                     case 0:
1295                         b=(uint8_t)(c>>10);
1296                         *target++=TO_BASE64_IMAP(b);
1297                         if(target<targetLimit) {
1298                             b=(uint8_t)((c>>4)&0x3f);
1299                             *target++=TO_BASE64_IMAP(b);
1300                             if(offsets!=nullptr) {
1301                                 *offsets++=sourceIndex;
1302                                 *offsets++=sourceIndex++;
1303                             }
1304                         } else {
1305                             if(offsets!=nullptr) {
1306                                 *offsets++=sourceIndex++;
1307                             }
1308                             b=(uint8_t)((c>>4)&0x3f);
1309                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1310                             cnv->charErrorBufferLength=1;
1311                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1312                         }
1313                         bits=(uint8_t)((c&15)<<2);
1314                         base64Counter=1;
1315                         break;
1316                     case 1:
1317                         b=(uint8_t)(bits|(c>>14));
1318                         *target++=TO_BASE64_IMAP(b);
1319                         if(target<targetLimit) {
1320                             b=(uint8_t)((c>>8)&0x3f);
1321                             *target++=TO_BASE64_IMAP(b);
1322                             if(target<targetLimit) {
1323                                 b=(uint8_t)((c>>2)&0x3f);
1324                                 *target++=TO_BASE64_IMAP(b);
1325                                 if(offsets!=nullptr) {
1326                                     *offsets++=sourceIndex;
1327                                     *offsets++=sourceIndex;
1328                                     *offsets++=sourceIndex++;
1329                                 }
1330                             } else {
1331                                 if(offsets!=nullptr) {
1332                                     *offsets++=sourceIndex;
1333                                     *offsets++=sourceIndex++;
1334                                 }
1335                                 b=(uint8_t)((c>>2)&0x3f);
1336                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1337                                 cnv->charErrorBufferLength=1;
1338                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1339                             }
1340                         } else {
1341                             if(offsets!=nullptr) {
1342                                 *offsets++=sourceIndex++;
1343                             }
1344                             b=(uint8_t)((c>>8)&0x3f);
1345                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1346                             b=(uint8_t)((c>>2)&0x3f);
1347                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1348                             cnv->charErrorBufferLength=2;
1349                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1350                         }
1351                         bits=(uint8_t)((c&3)<<4);
1352                         base64Counter=2;
1353                         break;
1354                     case 2:
1355                         b=(uint8_t)(bits|(c>>12));
1356                         *target++=TO_BASE64_IMAP(b);
1357                         if(target<targetLimit) {
1358                             b=(uint8_t)((c>>6)&0x3f);
1359                             *target++=TO_BASE64_IMAP(b);
1360                             if(target<targetLimit) {
1361                                 b=(uint8_t)(c&0x3f);
1362                                 *target++=TO_BASE64_IMAP(b);
1363                                 if(offsets!=nullptr) {
1364                                     *offsets++=sourceIndex;
1365                                     *offsets++=sourceIndex;
1366                                     *offsets++=sourceIndex++;
1367                                 }
1368                             } else {
1369                                 if(offsets!=nullptr) {
1370                                     *offsets++=sourceIndex;
1371                                     *offsets++=sourceIndex++;
1372                                 }
1373                                 b=(uint8_t)(c&0x3f);
1374                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1375                                 cnv->charErrorBufferLength=1;
1376                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1377                             }
1378                         } else {
1379                             if(offsets!=nullptr) {
1380                                 *offsets++=sourceIndex++;
1381                             }
1382                             b=(uint8_t)((c>>6)&0x3f);
1383                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1384                             b=(uint8_t)(c&0x3f);
1385                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1386                             cnv->charErrorBufferLength=2;
1387                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1388                         }
1389                         bits=0;
1390                         base64Counter=0;
1391                         break;
1392                     default:
1393                         /* will never occur */
1394                         break;
1395                     }
1396                 }
1397             } else {
1398                 /* target is full */
1399                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1400                 break;
1401             }
1402         }
1403     }
1404 
1405     if(pArgs->flush && source>=sourceLimit) {
1406         /* flush remaining bits to the target */
1407         if(!inDirectMode) {
1408             if(base64Counter!=0) {
1409                 if(target<targetLimit) {
1410                     *target++=TO_BASE64_IMAP(bits);
1411                     if(offsets!=nullptr) {
1412                         *offsets++=sourceIndex-1;
1413                     }
1414                 } else {
1415                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1416                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1417                 }
1418             }
1419             /* need to terminate with a minus */
1420             if(target<targetLimit) {
1421                 *target++=MINUS;
1422                 if(offsets!=nullptr) {
1423                     *offsets++=sourceIndex-1;
1424                 }
1425             } else {
1426                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1427                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1428             }
1429         }
1430         /* reset the state for the next conversion */
1431         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
1432     } else {
1433         /* set the converter state back into UConverter */
1434         cnv->fromUnicodeStatus=
1435             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1436             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1437     }
1438 
1439     /* write back the updated pointers */
1440     pArgs->source=source;
1441     pArgs->target=(char *)target;
1442     pArgs->offsets=offsets;
1443 }
1444 U_CDECL_END
1445 
1446 static const UConverterImpl _IMAPImpl={
1447     UCNV_IMAP_MAILBOX,
1448 
1449     nullptr,
1450     nullptr,
1451 
1452     _UTF7Open,
1453     nullptr,
1454     _UTF7Reset,
1455 
1456     _IMAPToUnicodeWithOffsets,
1457     _IMAPToUnicodeWithOffsets,
1458     _IMAPFromUnicodeWithOffsets,
1459     _IMAPFromUnicodeWithOffsets,
1460     nullptr,
1461 
1462     nullptr,
1463     nullptr,
1464     nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1465     nullptr,
1466     ucnv_getCompleteUnicodeSet,
1467     nullptr,
1468     nullptr
1469 };
1470 
1471 static const UConverterStaticData _IMAPStaticData={
1472     sizeof(UConverterStaticData),
1473     "IMAP-mailbox-name",
1474     0, /* TODO CCSID for IMAP-mailbox-name */
1475     UCNV_IBM, UCNV_IMAP_MAILBOX,
1476     1, 4,
1477     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1478     false, false,
1479     0,
1480     0,
1481     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1482 };
1483 
1484 const UConverterSharedData _IMAPData=
1485         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1486 
1487 #endif
1488