xref: /aosp_15_r20/external/unicode/ConvertUTF.c (revision c14be686ac162d87fd361a4e7a5439b56849c4f4)
1*c14be686SAndroid Build Coastguard Worker /*
2*c14be686SAndroid Build Coastguard Worker  * Copyright 2001-2004 Unicode, Inc.
3*c14be686SAndroid Build Coastguard Worker  *
4*c14be686SAndroid Build Coastguard Worker  * Disclaimer
5*c14be686SAndroid Build Coastguard Worker  *
6*c14be686SAndroid Build Coastguard Worker  * This source code is provided as is by Unicode, Inc. No claims are
7*c14be686SAndroid Build Coastguard Worker  * made as to fitness for any particular purpose. No warranties of any
8*c14be686SAndroid Build Coastguard Worker  * kind are expressed or implied. The recipient agrees to determine
9*c14be686SAndroid Build Coastguard Worker  * applicability of information provided. If this file has been
10*c14be686SAndroid Build Coastguard Worker  * purchased on magnetic or optical media from Unicode, Inc., the
11*c14be686SAndroid Build Coastguard Worker  * sole remedy for any claim will be exchange of defective media
12*c14be686SAndroid Build Coastguard Worker  * within 90 days of receipt.
13*c14be686SAndroid Build Coastguard Worker  *
14*c14be686SAndroid Build Coastguard Worker  * Limitations on Rights to Redistribute This Code
15*c14be686SAndroid Build Coastguard Worker  *
16*c14be686SAndroid Build Coastguard Worker  * Unicode, Inc. hereby grants the right to freely use the information
17*c14be686SAndroid Build Coastguard Worker  * supplied in this file in the creation of products supporting the
18*c14be686SAndroid Build Coastguard Worker  * Unicode Standard, and to make copies of this file in any form
19*c14be686SAndroid Build Coastguard Worker  * for internal or external distribution as long as this notice
20*c14be686SAndroid Build Coastguard Worker  * remains attached.
21*c14be686SAndroid Build Coastguard Worker  */
22*c14be686SAndroid Build Coastguard Worker 
23*c14be686SAndroid Build Coastguard Worker /* ---------------------------------------------------------------------
24*c14be686SAndroid Build Coastguard Worker 
25*c14be686SAndroid Build Coastguard Worker     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26*c14be686SAndroid Build Coastguard Worker     Author: Mark E. Davis, 1994.
27*c14be686SAndroid Build Coastguard Worker     Rev History: Rick McGowan, fixes & updates May 2001.
28*c14be686SAndroid Build Coastguard Worker     Sept 2001: fixed const & error conditions per
29*c14be686SAndroid Build Coastguard Worker 	mods suggested by S. Parent & A. Lillich.
30*c14be686SAndroid Build Coastguard Worker     June 2002: Tim Dodd added detection and handling of incomplete
31*c14be686SAndroid Build Coastguard Worker 	source sequences, enhanced error detection, added casts
32*c14be686SAndroid Build Coastguard Worker 	to eliminate compiler warnings.
33*c14be686SAndroid Build Coastguard Worker     July 2003: slight mods to back out aggressive FFFE detection.
34*c14be686SAndroid Build Coastguard Worker     Jan 2004: updated switches in from-UTF8 conversions.
35*c14be686SAndroid Build Coastguard Worker     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36*c14be686SAndroid Build Coastguard Worker 
37*c14be686SAndroid Build Coastguard Worker     See the header file "ConvertUTF.h" for complete documentation.
38*c14be686SAndroid Build Coastguard Worker 
39*c14be686SAndroid Build Coastguard Worker ------------------------------------------------------------------------ */
40*c14be686SAndroid Build Coastguard Worker 
41*c14be686SAndroid Build Coastguard Worker 
42*c14be686SAndroid Build Coastguard Worker #include "ConvertUTF.h"
43*c14be686SAndroid Build Coastguard Worker #ifdef CVTUTF_DEBUG
44*c14be686SAndroid Build Coastguard Worker #include <stdio.h>
45*c14be686SAndroid Build Coastguard Worker #endif
46*c14be686SAndroid Build Coastguard Worker 
47*c14be686SAndroid Build Coastguard Worker static const int halfShift  = 10; /* used for shifting by 10 bits */
48*c14be686SAndroid Build Coastguard Worker 
49*c14be686SAndroid Build Coastguard Worker static const UTF32 halfBase = 0x0010000UL;
50*c14be686SAndroid Build Coastguard Worker static const UTF32 halfMask = 0x3FFUL;
51*c14be686SAndroid Build Coastguard Worker 
52*c14be686SAndroid Build Coastguard Worker #define UNI_SUR_HIGH_START  (UTF32)0xD800
53*c14be686SAndroid Build Coastguard Worker #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
54*c14be686SAndroid Build Coastguard Worker #define UNI_SUR_LOW_START   (UTF32)0xDC00
55*c14be686SAndroid Build Coastguard Worker #define UNI_SUR_LOW_END     (UTF32)0xDFFF
56*c14be686SAndroid Build Coastguard Worker #define false	   0
57*c14be686SAndroid Build Coastguard Worker #define true	    1
58*c14be686SAndroid Build Coastguard Worker 
59*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
60*c14be686SAndroid Build Coastguard Worker 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)61*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF32toUTF16 (
62*c14be686SAndroid Build Coastguard Worker 	const UTF32** sourceStart, const UTF32* sourceEnd,
63*c14be686SAndroid Build Coastguard Worker 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
64*c14be686SAndroid Build Coastguard Worker     ConversionResult result = conversionOK;
65*c14be686SAndroid Build Coastguard Worker     const UTF32* source = *sourceStart;
66*c14be686SAndroid Build Coastguard Worker     UTF16* target = *targetStart;
67*c14be686SAndroid Build Coastguard Worker     while (source < sourceEnd) {
68*c14be686SAndroid Build Coastguard Worker 	UTF32 ch;
69*c14be686SAndroid Build Coastguard Worker 	if (target >= targetEnd) {
70*c14be686SAndroid Build Coastguard Worker 	    result = targetExhausted; break;
71*c14be686SAndroid Build Coastguard Worker 	}
72*c14be686SAndroid Build Coastguard Worker 	ch = *source++;
73*c14be686SAndroid Build Coastguard Worker 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
74*c14be686SAndroid Build Coastguard Worker 	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75*c14be686SAndroid Build Coastguard Worker 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
76*c14be686SAndroid Build Coastguard Worker 		if (flags == strictConversion) {
77*c14be686SAndroid Build Coastguard Worker 		    --source; /* return to the illegal value itself */
78*c14be686SAndroid Build Coastguard Worker 		    result = sourceIllegal;
79*c14be686SAndroid Build Coastguard Worker 		    break;
80*c14be686SAndroid Build Coastguard Worker 		} else {
81*c14be686SAndroid Build Coastguard Worker 		    *target++ = UNI_REPLACEMENT_CHAR;
82*c14be686SAndroid Build Coastguard Worker 		}
83*c14be686SAndroid Build Coastguard Worker 	    } else {
84*c14be686SAndroid Build Coastguard Worker 		*target++ = (UTF16)ch; /* normal case */
85*c14be686SAndroid Build Coastguard Worker 	    }
86*c14be686SAndroid Build Coastguard Worker 	} else if (ch > UNI_MAX_LEGAL_UTF32) {
87*c14be686SAndroid Build Coastguard Worker 	    if (flags == strictConversion) {
88*c14be686SAndroid Build Coastguard Worker 		result = sourceIllegal;
89*c14be686SAndroid Build Coastguard Worker 	    } else {
90*c14be686SAndroid Build Coastguard Worker 		*target++ = UNI_REPLACEMENT_CHAR;
91*c14be686SAndroid Build Coastguard Worker 	    }
92*c14be686SAndroid Build Coastguard Worker 	} else {
93*c14be686SAndroid Build Coastguard Worker 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
94*c14be686SAndroid Build Coastguard Worker 	    if (target + 1 >= targetEnd) {
95*c14be686SAndroid Build Coastguard Worker 		--source; /* Back up source pointer! */
96*c14be686SAndroid Build Coastguard Worker 		result = targetExhausted; break;
97*c14be686SAndroid Build Coastguard Worker 	    }
98*c14be686SAndroid Build Coastguard Worker 	    ch -= halfBase;
99*c14be686SAndroid Build Coastguard Worker 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
100*c14be686SAndroid Build Coastguard Worker 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
101*c14be686SAndroid Build Coastguard Worker 	}
102*c14be686SAndroid Build Coastguard Worker     }
103*c14be686SAndroid Build Coastguard Worker     *sourceStart = source;
104*c14be686SAndroid Build Coastguard Worker     *targetStart = target;
105*c14be686SAndroid Build Coastguard Worker     return result;
106*c14be686SAndroid Build Coastguard Worker }
107*c14be686SAndroid Build Coastguard Worker 
108*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
109*c14be686SAndroid Build Coastguard Worker 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)110*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF16toUTF32 (
111*c14be686SAndroid Build Coastguard Worker 	const UTF16** sourceStart, const UTF16* sourceEnd,
112*c14be686SAndroid Build Coastguard Worker 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
113*c14be686SAndroid Build Coastguard Worker     ConversionResult result = conversionOK;
114*c14be686SAndroid Build Coastguard Worker     const UTF16* source = *sourceStart;
115*c14be686SAndroid Build Coastguard Worker     UTF32* target = *targetStart;
116*c14be686SAndroid Build Coastguard Worker     UTF32 ch, ch2;
117*c14be686SAndroid Build Coastguard Worker     while (source < sourceEnd) {
118*c14be686SAndroid Build Coastguard Worker 	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
119*c14be686SAndroid Build Coastguard Worker 	ch = *source++;
120*c14be686SAndroid Build Coastguard Worker 	/* If we have a surrogate pair, convert to UTF32 first. */
121*c14be686SAndroid Build Coastguard Worker 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
122*c14be686SAndroid Build Coastguard Worker 	    /* If the 16 bits following the high surrogate are in the source buffer... */
123*c14be686SAndroid Build Coastguard Worker 	    if (source < sourceEnd) {
124*c14be686SAndroid Build Coastguard Worker 		ch2 = *source;
125*c14be686SAndroid Build Coastguard Worker 		/* If it's a low surrogate, convert to UTF32. */
126*c14be686SAndroid Build Coastguard Worker 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
127*c14be686SAndroid Build Coastguard Worker 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
128*c14be686SAndroid Build Coastguard Worker 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
129*c14be686SAndroid Build Coastguard Worker 		    ++source;
130*c14be686SAndroid Build Coastguard Worker 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
131*c14be686SAndroid Build Coastguard Worker 		    --source; /* return to the illegal value itself */
132*c14be686SAndroid Build Coastguard Worker 		    result = sourceIllegal;
133*c14be686SAndroid Build Coastguard Worker 		    break;
134*c14be686SAndroid Build Coastguard Worker 		}
135*c14be686SAndroid Build Coastguard Worker 	    } else { /* We don't have the 16 bits following the high surrogate. */
136*c14be686SAndroid Build Coastguard Worker 		--source; /* return to the high surrogate */
137*c14be686SAndroid Build Coastguard Worker 		result = sourceExhausted;
138*c14be686SAndroid Build Coastguard Worker 		break;
139*c14be686SAndroid Build Coastguard Worker 	    }
140*c14be686SAndroid Build Coastguard Worker 	} else if (flags == strictConversion) {
141*c14be686SAndroid Build Coastguard Worker 	    /* UTF-16 surrogate values are illegal in UTF-32 */
142*c14be686SAndroid Build Coastguard Worker 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
143*c14be686SAndroid Build Coastguard Worker 		--source; /* return to the illegal value itself */
144*c14be686SAndroid Build Coastguard Worker 		result = sourceIllegal;
145*c14be686SAndroid Build Coastguard Worker 		break;
146*c14be686SAndroid Build Coastguard Worker 	    }
147*c14be686SAndroid Build Coastguard Worker 	}
148*c14be686SAndroid Build Coastguard Worker 	if (target >= targetEnd) {
149*c14be686SAndroid Build Coastguard Worker 	    source = oldSource; /* Back up source pointer! */
150*c14be686SAndroid Build Coastguard Worker 	    result = targetExhausted; break;
151*c14be686SAndroid Build Coastguard Worker 	}
152*c14be686SAndroid Build Coastguard Worker 	*target++ = ch;
153*c14be686SAndroid Build Coastguard Worker     }
154*c14be686SAndroid Build Coastguard Worker     *sourceStart = source;
155*c14be686SAndroid Build Coastguard Worker     *targetStart = target;
156*c14be686SAndroid Build Coastguard Worker #ifdef CVTUTF_DEBUG
157*c14be686SAndroid Build Coastguard Worker if (result == sourceIllegal) {
158*c14be686SAndroid Build Coastguard Worker     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
159*c14be686SAndroid Build Coastguard Worker     fflush(stderr);
160*c14be686SAndroid Build Coastguard Worker }
161*c14be686SAndroid Build Coastguard Worker #endif
162*c14be686SAndroid Build Coastguard Worker     return result;
163*c14be686SAndroid Build Coastguard Worker }
164*c14be686SAndroid Build Coastguard Worker 
165*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
166*c14be686SAndroid Build Coastguard Worker 
/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the entry is
 * the number of trailing bytes expected to follow that byte.
 * Note that *legal* UTF-8 never uses 4 or 5 trailing bytes. Those entries
 * are kept for anyone who wants to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 - 0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 - 0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 - 0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 - 0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 - 0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 - 0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 - 0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xFF */ 3,3,3,3,3,3,3,3, 4,4,4,4, 5,5,5,5
};
184*c14be686SAndroid Build Coastguard Worker 
185*c14be686SAndroid Build Coastguard Worker /*
186*c14be686SAndroid Build Coastguard Worker  * Magic values subtracted from a buffer value during UTF8 conversion.
187*c14be686SAndroid Build Coastguard Worker  * This table contains as many values as there might be trailing bytes
188*c14be686SAndroid Build Coastguard Worker  * in a UTF-8 sequence.
189*c14be686SAndroid Build Coastguard Worker  */
190*c14be686SAndroid Build Coastguard Worker static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
191*c14be686SAndroid Build Coastguard Worker 		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
192*c14be686SAndroid Build Coastguard Worker 
193*c14be686SAndroid Build Coastguard Worker /*
194*c14be686SAndroid Build Coastguard Worker  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
195*c14be686SAndroid Build Coastguard Worker  * into the first byte, depending on how many bytes follow.  There are
196*c14be686SAndroid Build Coastguard Worker  * as many entries in this table as there are UTF-8 sequence types.
197*c14be686SAndroid Build Coastguard Worker  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
198*c14be686SAndroid Build Coastguard Worker  * for *legal* UTF-8 will be 4 or fewer bytes total.
199*c14be686SAndroid Build Coastguard Worker  */
200*c14be686SAndroid Build Coastguard Worker static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
201*c14be686SAndroid Build Coastguard Worker 
202*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
203*c14be686SAndroid Build Coastguard Worker 
204*c14be686SAndroid Build Coastguard Worker /* The interface converts a whole buffer to avoid function-call overhead.
205*c14be686SAndroid Build Coastguard Worker  * Constants have been gathered. Loops & conditionals have been removed as
206*c14be686SAndroid Build Coastguard Worker  * much as possible for efficiency, in favor of drop-through switches.
207*c14be686SAndroid Build Coastguard Worker  * (See "Note A" at the bottom of the file for equivalent code.)
208*c14be686SAndroid Build Coastguard Worker  * If your compiler supports it, the "isLegalUTF8" call can be turned
209*c14be686SAndroid Build Coastguard Worker  * into an inline function.
210*c14be686SAndroid Build Coastguard Worker  */
211*c14be686SAndroid Build Coastguard Worker 
212*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
213*c14be686SAndroid Build Coastguard Worker 
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)214*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF16toUTF8 (
215*c14be686SAndroid Build Coastguard Worker 	const UTF16** sourceStart, const UTF16* sourceEnd,
216*c14be686SAndroid Build Coastguard Worker 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
217*c14be686SAndroid Build Coastguard Worker     ConversionResult result = conversionOK;
218*c14be686SAndroid Build Coastguard Worker     const UTF16* source = *sourceStart;
219*c14be686SAndroid Build Coastguard Worker     UTF8* target = *targetStart;
220*c14be686SAndroid Build Coastguard Worker     while (source < sourceEnd) {
221*c14be686SAndroid Build Coastguard Worker 	UTF32 ch;
222*c14be686SAndroid Build Coastguard Worker 	unsigned short bytesToWrite = 0;
223*c14be686SAndroid Build Coastguard Worker 	const UTF32 byteMask = 0xBF;
224*c14be686SAndroid Build Coastguard Worker 	const UTF32 byteMark = 0x80;
225*c14be686SAndroid Build Coastguard Worker 	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
226*c14be686SAndroid Build Coastguard Worker 	ch = *source++;
227*c14be686SAndroid Build Coastguard Worker 	/* If we have a surrogate pair, convert to UTF32 first. */
228*c14be686SAndroid Build Coastguard Worker 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
229*c14be686SAndroid Build Coastguard Worker 	    /* If the 16 bits following the high surrogate are in the source buffer... */
230*c14be686SAndroid Build Coastguard Worker 	    if (source < sourceEnd) {
231*c14be686SAndroid Build Coastguard Worker 		UTF32 ch2 = *source;
232*c14be686SAndroid Build Coastguard Worker 		/* If it's a low surrogate, convert to UTF32. */
233*c14be686SAndroid Build Coastguard Worker 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
234*c14be686SAndroid Build Coastguard Worker 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
235*c14be686SAndroid Build Coastguard Worker 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
236*c14be686SAndroid Build Coastguard Worker 		    ++source;
237*c14be686SAndroid Build Coastguard Worker 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
238*c14be686SAndroid Build Coastguard Worker 		    --source; /* return to the illegal value itself */
239*c14be686SAndroid Build Coastguard Worker 		    result = sourceIllegal;
240*c14be686SAndroid Build Coastguard Worker 		    break;
241*c14be686SAndroid Build Coastguard Worker 		}
242*c14be686SAndroid Build Coastguard Worker 	    } else { /* We don't have the 16 bits following the high surrogate. */
243*c14be686SAndroid Build Coastguard Worker 		--source; /* return to the high surrogate */
244*c14be686SAndroid Build Coastguard Worker 		result = sourceExhausted;
245*c14be686SAndroid Build Coastguard Worker 		break;
246*c14be686SAndroid Build Coastguard Worker 	    }
247*c14be686SAndroid Build Coastguard Worker 	} else if (flags == strictConversion) {
248*c14be686SAndroid Build Coastguard Worker 	    /* UTF-16 surrogate values are illegal in UTF-32 */
249*c14be686SAndroid Build Coastguard Worker 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
250*c14be686SAndroid Build Coastguard Worker 		--source; /* return to the illegal value itself */
251*c14be686SAndroid Build Coastguard Worker 		result = sourceIllegal;
252*c14be686SAndroid Build Coastguard Worker 		break;
253*c14be686SAndroid Build Coastguard Worker 	    }
254*c14be686SAndroid Build Coastguard Worker 	}
255*c14be686SAndroid Build Coastguard Worker 
256*c14be686SAndroid Build Coastguard Worker 	// TPN: substitute all control characters except for NULL, TAB, LF or CR
257*c14be686SAndroid Build Coastguard Worker 	if (ch && (ch != (UTF32)0x09)  && (ch != (UTF32)0x0a)  && (ch != (UTF32)0x0d)  && (ch < (UTF32)0x20) )  {
258*c14be686SAndroid Build Coastguard Worker 		ch = (UTF32)0x3f;
259*c14be686SAndroid Build Coastguard Worker 	}
260*c14be686SAndroid Build Coastguard Worker 	// TPN: filter out byte order marks and invalid character 0xFFFF
261*c14be686SAndroid Build Coastguard Worker 	if((ch == (UTF32)0xFEFF) || (ch == (UTF32)0xFFFE)|| (ch == (UTF32)0xFFFF)) {
262*c14be686SAndroid Build Coastguard Worker 		continue;
263*c14be686SAndroid Build Coastguard Worker 	}
264*c14be686SAndroid Build Coastguard Worker 
265*c14be686SAndroid Build Coastguard Worker 	/* Figure out how many bytes the result will require */
266*c14be686SAndroid Build Coastguard Worker 	if (ch < (UTF32)0x80) {	    bytesToWrite = 1;
267*c14be686SAndroid Build Coastguard Worker 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
268*c14be686SAndroid Build Coastguard Worker 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
269*c14be686SAndroid Build Coastguard Worker 	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
270*c14be686SAndroid Build Coastguard Worker 	} else {			    bytesToWrite = 3;
271*c14be686SAndroid Build Coastguard Worker 					    ch = UNI_REPLACEMENT_CHAR;
272*c14be686SAndroid Build Coastguard Worker 	}
273*c14be686SAndroid Build Coastguard Worker 
274*c14be686SAndroid Build Coastguard Worker 	target += bytesToWrite;
275*c14be686SAndroid Build Coastguard Worker 	if (target > targetEnd) {
276*c14be686SAndroid Build Coastguard Worker 	    source = oldSource; /* Back up source pointer! */
277*c14be686SAndroid Build Coastguard Worker 	    target -= bytesToWrite; result = targetExhausted; break;
278*c14be686SAndroid Build Coastguard Worker 	}
279*c14be686SAndroid Build Coastguard Worker 	switch (bytesToWrite) { /* note: everything falls through. */
280*c14be686SAndroid Build Coastguard Worker 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
281*c14be686SAndroid Build Coastguard Worker 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
282*c14be686SAndroid Build Coastguard Worker 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
283*c14be686SAndroid Build Coastguard Worker 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
284*c14be686SAndroid Build Coastguard Worker 	}
285*c14be686SAndroid Build Coastguard Worker 	target += bytesToWrite;
286*c14be686SAndroid Build Coastguard Worker     }
287*c14be686SAndroid Build Coastguard Worker     *sourceStart = source;
288*c14be686SAndroid Build Coastguard Worker     *targetStart = target;
289*c14be686SAndroid Build Coastguard Worker     return result;
290*c14be686SAndroid Build Coastguard Worker }
291*c14be686SAndroid Build Coastguard Worker 
292*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
293*c14be686SAndroid Build Coastguard Worker 
294*c14be686SAndroid Build Coastguard Worker /*
295*c14be686SAndroid Build Coastguard Worker  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
296*c14be686SAndroid Build Coastguard Worker  * This must be called with the length pre-determined by the first byte.
297*c14be686SAndroid Build Coastguard Worker  * If not calling this from ConvertUTF8to*, then the length can be set by:
298*c14be686SAndroid Build Coastguard Worker  *  length = trailingBytesForUTF8[*source]+1;
299*c14be686SAndroid Build Coastguard Worker  * and the sequence is illegal right away if there aren't that many bytes
300*c14be686SAndroid Build Coastguard Worker  * available.
301*c14be686SAndroid Build Coastguard Worker  * If presented with a length > 4, this returns false.  The Unicode
302*c14be686SAndroid Build Coastguard Worker  * definition of UTF-8 goes up to 4-byte sequences.
303*c14be686SAndroid Build Coastguard Worker  */
304*c14be686SAndroid Build Coastguard Worker 
isLegalUTF8(const UTF8 * source,int length)305*c14be686SAndroid Build Coastguard Worker inline Boolean isLegalUTF8(const UTF8 *source, int length) {
306*c14be686SAndroid Build Coastguard Worker     UTF8 a;
307*c14be686SAndroid Build Coastguard Worker     const UTF8 *srcptr = source+length;
308*c14be686SAndroid Build Coastguard Worker     switch (length) {
309*c14be686SAndroid Build Coastguard Worker     default: return false;
310*c14be686SAndroid Build Coastguard Worker 	/* Everything else falls through when "true"... */
311*c14be686SAndroid Build Coastguard Worker     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
312*c14be686SAndroid Build Coastguard Worker     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
313*c14be686SAndroid Build Coastguard Worker     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
314*c14be686SAndroid Build Coastguard Worker 
315*c14be686SAndroid Build Coastguard Worker 	switch (*source) {
316*c14be686SAndroid Build Coastguard Worker 	    /* no fall-through in this inner switch */
317*c14be686SAndroid Build Coastguard Worker 	    case 0xE0: if (a < 0xA0) return false; break;
318*c14be686SAndroid Build Coastguard Worker 	    case 0xED: if (a > 0x9F) return false; break;
319*c14be686SAndroid Build Coastguard Worker 	    case 0xF0: if (a < 0x90) return false; break;
320*c14be686SAndroid Build Coastguard Worker 	    case 0xF4: if (a > 0x8F) return false; break;
321*c14be686SAndroid Build Coastguard Worker 	    default:   if (a < 0x80) return false;
322*c14be686SAndroid Build Coastguard Worker 	}
323*c14be686SAndroid Build Coastguard Worker 
324*c14be686SAndroid Build Coastguard Worker     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
325*c14be686SAndroid Build Coastguard Worker     }
326*c14be686SAndroid Build Coastguard Worker     if (*source > 0xF4) return false;
327*c14be686SAndroid Build Coastguard Worker     return true;
328*c14be686SAndroid Build Coastguard Worker }
329*c14be686SAndroid Build Coastguard Worker 
330*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
331*c14be686SAndroid Build Coastguard Worker 
332*c14be686SAndroid Build Coastguard Worker /*
333*c14be686SAndroid Build Coastguard Worker  * Exported function to return whether a UTF-8 sequence is legal or not.
334*c14be686SAndroid Build Coastguard Worker  * This is not used here; it's just exported.
335*c14be686SAndroid Build Coastguard Worker  */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)336*c14be686SAndroid Build Coastguard Worker Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
337*c14be686SAndroid Build Coastguard Worker     int length = trailingBytesForUTF8[*source]+1;
338*c14be686SAndroid Build Coastguard Worker     if (source+length > sourceEnd) {
339*c14be686SAndroid Build Coastguard Worker 	return false;
340*c14be686SAndroid Build Coastguard Worker     }
341*c14be686SAndroid Build Coastguard Worker     return isLegalUTF8(source, length);
342*c14be686SAndroid Build Coastguard Worker }
343*c14be686SAndroid Build Coastguard Worker 
344*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
345*c14be686SAndroid Build Coastguard Worker 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)346*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF8toUTF16 (
347*c14be686SAndroid Build Coastguard Worker 	const UTF8** sourceStart, const UTF8* sourceEnd,
348*c14be686SAndroid Build Coastguard Worker 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
349*c14be686SAndroid Build Coastguard Worker     ConversionResult result = conversionOK;
350*c14be686SAndroid Build Coastguard Worker     const UTF8* source = *sourceStart;
351*c14be686SAndroid Build Coastguard Worker     UTF16* target = *targetStart;
352*c14be686SAndroid Build Coastguard Worker     while (source < sourceEnd) {
353*c14be686SAndroid Build Coastguard Worker 	UTF32 ch = 0;
354*c14be686SAndroid Build Coastguard Worker 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
355*c14be686SAndroid Build Coastguard Worker 	if (source + extraBytesToRead >= sourceEnd) {
356*c14be686SAndroid Build Coastguard Worker 	    result = sourceExhausted; break;
357*c14be686SAndroid Build Coastguard Worker 	}
358*c14be686SAndroid Build Coastguard Worker 	/* Do this check whether lenient or strict */
359*c14be686SAndroid Build Coastguard Worker 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
360*c14be686SAndroid Build Coastguard Worker 	    result = sourceIllegal;
361*c14be686SAndroid Build Coastguard Worker 	    break;
362*c14be686SAndroid Build Coastguard Worker 	}
363*c14be686SAndroid Build Coastguard Worker 	/*
364*c14be686SAndroid Build Coastguard Worker 	 * The cases all fall through. See "Note A" below.
365*c14be686SAndroid Build Coastguard Worker 	 */
366*c14be686SAndroid Build Coastguard Worker 	switch (extraBytesToRead) {
367*c14be686SAndroid Build Coastguard Worker 	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
368*c14be686SAndroid Build Coastguard Worker 	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
369*c14be686SAndroid Build Coastguard Worker 	    case 3: ch += *source++; ch <<= 6;
370*c14be686SAndroid Build Coastguard Worker 	    case 2: ch += *source++; ch <<= 6;
371*c14be686SAndroid Build Coastguard Worker 	    case 1: ch += *source++; ch <<= 6;
372*c14be686SAndroid Build Coastguard Worker 	    case 0: ch += *source++;
373*c14be686SAndroid Build Coastguard Worker 	}
374*c14be686SAndroid Build Coastguard Worker 	ch -= offsetsFromUTF8[extraBytesToRead];
375*c14be686SAndroid Build Coastguard Worker 
376*c14be686SAndroid Build Coastguard Worker 	if (target >= targetEnd) {
377*c14be686SAndroid Build Coastguard Worker 	    source -= (extraBytesToRead+1); /* Back up source pointer! */
378*c14be686SAndroid Build Coastguard Worker 	    result = targetExhausted; break;
379*c14be686SAndroid Build Coastguard Worker 	}
380*c14be686SAndroid Build Coastguard Worker 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
381*c14be686SAndroid Build Coastguard Worker 	    /* UTF-16 surrogate values are illegal in UTF-32 */
382*c14be686SAndroid Build Coastguard Worker 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
383*c14be686SAndroid Build Coastguard Worker 		if (flags == strictConversion) {
384*c14be686SAndroid Build Coastguard Worker 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
385*c14be686SAndroid Build Coastguard Worker 		    result = sourceIllegal;
386*c14be686SAndroid Build Coastguard Worker 		    break;
387*c14be686SAndroid Build Coastguard Worker 		} else {
388*c14be686SAndroid Build Coastguard Worker 		    *target++ = UNI_REPLACEMENT_CHAR;
389*c14be686SAndroid Build Coastguard Worker 		}
390*c14be686SAndroid Build Coastguard Worker 	    } else {
391*c14be686SAndroid Build Coastguard Worker 		*target++ = (UTF16)ch; /* normal case */
392*c14be686SAndroid Build Coastguard Worker 	    }
393*c14be686SAndroid Build Coastguard Worker 	} else if (ch > UNI_MAX_UTF16) {
394*c14be686SAndroid Build Coastguard Worker 	    if (flags == strictConversion) {
395*c14be686SAndroid Build Coastguard Worker 		result = sourceIllegal;
396*c14be686SAndroid Build Coastguard Worker 		source -= (extraBytesToRead+1); /* return to the start */
397*c14be686SAndroid Build Coastguard Worker 		break; /* Bail out; shouldn't continue */
398*c14be686SAndroid Build Coastguard Worker 	    } else {
399*c14be686SAndroid Build Coastguard Worker 		*target++ = UNI_REPLACEMENT_CHAR;
400*c14be686SAndroid Build Coastguard Worker 	    }
401*c14be686SAndroid Build Coastguard Worker 	} else {
402*c14be686SAndroid Build Coastguard Worker 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
403*c14be686SAndroid Build Coastguard Worker 	    if (target + 1 >= targetEnd) {
404*c14be686SAndroid Build Coastguard Worker 		source -= (extraBytesToRead+1); /* Back up source pointer! */
405*c14be686SAndroid Build Coastguard Worker 		result = targetExhausted; break;
406*c14be686SAndroid Build Coastguard Worker 	    }
407*c14be686SAndroid Build Coastguard Worker 	    ch -= halfBase;
408*c14be686SAndroid Build Coastguard Worker 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
409*c14be686SAndroid Build Coastguard Worker 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
410*c14be686SAndroid Build Coastguard Worker 	}
411*c14be686SAndroid Build Coastguard Worker     }
412*c14be686SAndroid Build Coastguard Worker     *sourceStart = source;
413*c14be686SAndroid Build Coastguard Worker     *targetStart = target;
414*c14be686SAndroid Build Coastguard Worker     return result;
415*c14be686SAndroid Build Coastguard Worker }
416*c14be686SAndroid Build Coastguard Worker 
417*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
418*c14be686SAndroid Build Coastguard Worker 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)419*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF32toUTF8 (
420*c14be686SAndroid Build Coastguard Worker 	const UTF32** sourceStart, const UTF32* sourceEnd,
421*c14be686SAndroid Build Coastguard Worker 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
422*c14be686SAndroid Build Coastguard Worker     ConversionResult result = conversionOK;
423*c14be686SAndroid Build Coastguard Worker     const UTF32* source = *sourceStart;
424*c14be686SAndroid Build Coastguard Worker     UTF8* target = *targetStart;
425*c14be686SAndroid Build Coastguard Worker     while (source < sourceEnd) {
426*c14be686SAndroid Build Coastguard Worker 	UTF32 ch;
427*c14be686SAndroid Build Coastguard Worker 	unsigned short bytesToWrite = 0;
428*c14be686SAndroid Build Coastguard Worker 	const UTF32 byteMask = 0xBF;
429*c14be686SAndroid Build Coastguard Worker 	const UTF32 byteMark = 0x80;
430*c14be686SAndroid Build Coastguard Worker 	ch = *source++;
431*c14be686SAndroid Build Coastguard Worker 	if (flags == strictConversion ) {
432*c14be686SAndroid Build Coastguard Worker 	    /* UTF-16 surrogate values are illegal in UTF-32 */
433*c14be686SAndroid Build Coastguard Worker 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
434*c14be686SAndroid Build Coastguard Worker 		--source; /* return to the illegal value itself */
435*c14be686SAndroid Build Coastguard Worker 		result = sourceIllegal;
436*c14be686SAndroid Build Coastguard Worker 		break;
437*c14be686SAndroid Build Coastguard Worker 	    }
438*c14be686SAndroid Build Coastguard Worker 	}
439*c14be686SAndroid Build Coastguard Worker 	/*
440*c14be686SAndroid Build Coastguard Worker 	 * Figure out how many bytes the result will require. Turn any
441*c14be686SAndroid Build Coastguard Worker 	 * illegally large UTF32 things (> Plane 17) into replacement chars.
442*c14be686SAndroid Build Coastguard Worker 	 */
443*c14be686SAndroid Build Coastguard Worker 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
444*c14be686SAndroid Build Coastguard Worker 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
445*c14be686SAndroid Build Coastguard Worker 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
446*c14be686SAndroid Build Coastguard Worker 	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
447*c14be686SAndroid Build Coastguard Worker 	} else {			    bytesToWrite = 3;
448*c14be686SAndroid Build Coastguard Worker 					    ch = UNI_REPLACEMENT_CHAR;
449*c14be686SAndroid Build Coastguard Worker 					    result = sourceIllegal;
450*c14be686SAndroid Build Coastguard Worker 	}
451*c14be686SAndroid Build Coastguard Worker 
452*c14be686SAndroid Build Coastguard Worker 	target += bytesToWrite;
453*c14be686SAndroid Build Coastguard Worker 	if (target > targetEnd) {
454*c14be686SAndroid Build Coastguard Worker 	    --source; /* Back up source pointer! */
455*c14be686SAndroid Build Coastguard Worker 	    target -= bytesToWrite; result = targetExhausted; break;
456*c14be686SAndroid Build Coastguard Worker 	}
457*c14be686SAndroid Build Coastguard Worker 	switch (bytesToWrite) { /* note: everything falls through. */
458*c14be686SAndroid Build Coastguard Worker 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
459*c14be686SAndroid Build Coastguard Worker 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
460*c14be686SAndroid Build Coastguard Worker 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
461*c14be686SAndroid Build Coastguard Worker 	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
462*c14be686SAndroid Build Coastguard Worker 	}
463*c14be686SAndroid Build Coastguard Worker 	target += bytesToWrite;
464*c14be686SAndroid Build Coastguard Worker     }
465*c14be686SAndroid Build Coastguard Worker     *sourceStart = source;
466*c14be686SAndroid Build Coastguard Worker     *targetStart = target;
467*c14be686SAndroid Build Coastguard Worker     return result;
468*c14be686SAndroid Build Coastguard Worker }
469*c14be686SAndroid Build Coastguard Worker 
470*c14be686SAndroid Build Coastguard Worker /* --------------------------------------------------------------------- */
471*c14be686SAndroid Build Coastguard Worker 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)472*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF8toUTF32 (
473*c14be686SAndroid Build Coastguard Worker 	const UTF8** sourceStart, const UTF8* sourceEnd,
474*c14be686SAndroid Build Coastguard Worker 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
475*c14be686SAndroid Build Coastguard Worker     ConversionResult result = conversionOK;
476*c14be686SAndroid Build Coastguard Worker     const UTF8* source = *sourceStart;
477*c14be686SAndroid Build Coastguard Worker     UTF32* target = *targetStart;
478*c14be686SAndroid Build Coastguard Worker     while (source < sourceEnd) {
479*c14be686SAndroid Build Coastguard Worker 	UTF32 ch = 0;
480*c14be686SAndroid Build Coastguard Worker 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
481*c14be686SAndroid Build Coastguard Worker 	if (source + extraBytesToRead >= sourceEnd) {
482*c14be686SAndroid Build Coastguard Worker 	    result = sourceExhausted; break;
483*c14be686SAndroid Build Coastguard Worker 	}
484*c14be686SAndroid Build Coastguard Worker 	/* Do this check whether lenient or strict */
485*c14be686SAndroid Build Coastguard Worker 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
486*c14be686SAndroid Build Coastguard Worker 	    result = sourceIllegal;
487*c14be686SAndroid Build Coastguard Worker 	    break;
488*c14be686SAndroid Build Coastguard Worker 	}
489*c14be686SAndroid Build Coastguard Worker 	/*
490*c14be686SAndroid Build Coastguard Worker 	 * The cases all fall through. See "Note A" below.
491*c14be686SAndroid Build Coastguard Worker 	 */
492*c14be686SAndroid Build Coastguard Worker 	switch (extraBytesToRead) {
493*c14be686SAndroid Build Coastguard Worker 	    case 5: ch += *source++; ch <<= 6;
494*c14be686SAndroid Build Coastguard Worker 	    case 4: ch += *source++; ch <<= 6;
495*c14be686SAndroid Build Coastguard Worker 	    case 3: ch += *source++; ch <<= 6;
496*c14be686SAndroid Build Coastguard Worker 	    case 2: ch += *source++; ch <<= 6;
497*c14be686SAndroid Build Coastguard Worker 	    case 1: ch += *source++; ch <<= 6;
498*c14be686SAndroid Build Coastguard Worker 	    case 0: ch += *source++;
499*c14be686SAndroid Build Coastguard Worker 	}
500*c14be686SAndroid Build Coastguard Worker 	ch -= offsetsFromUTF8[extraBytesToRead];
501*c14be686SAndroid Build Coastguard Worker 
502*c14be686SAndroid Build Coastguard Worker 	if (target >= targetEnd) {
503*c14be686SAndroid Build Coastguard Worker 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
504*c14be686SAndroid Build Coastguard Worker 	    result = targetExhausted; break;
505*c14be686SAndroid Build Coastguard Worker 	}
506*c14be686SAndroid Build Coastguard Worker 	if (ch <= UNI_MAX_LEGAL_UTF32) {
507*c14be686SAndroid Build Coastguard Worker 	    /*
508*c14be686SAndroid Build Coastguard Worker 	     * UTF-16 surrogate values are illegal in UTF-32, and anything
509*c14be686SAndroid Build Coastguard Worker 	     * over Plane 17 (> 0x10FFFF) is illegal.
510*c14be686SAndroid Build Coastguard Worker 	     */
511*c14be686SAndroid Build Coastguard Worker 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
512*c14be686SAndroid Build Coastguard Worker 		if (flags == strictConversion) {
513*c14be686SAndroid Build Coastguard Worker 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
514*c14be686SAndroid Build Coastguard Worker 		    result = sourceIllegal;
515*c14be686SAndroid Build Coastguard Worker 		    break;
516*c14be686SAndroid Build Coastguard Worker 		} else {
517*c14be686SAndroid Build Coastguard Worker 		    *target++ = UNI_REPLACEMENT_CHAR;
518*c14be686SAndroid Build Coastguard Worker 		}
519*c14be686SAndroid Build Coastguard Worker 	    } else {
520*c14be686SAndroid Build Coastguard Worker 		*target++ = ch;
521*c14be686SAndroid Build Coastguard Worker 	    }
522*c14be686SAndroid Build Coastguard Worker 	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
523*c14be686SAndroid Build Coastguard Worker 	    result = sourceIllegal;
524*c14be686SAndroid Build Coastguard Worker 	    *target++ = UNI_REPLACEMENT_CHAR;
525*c14be686SAndroid Build Coastguard Worker 	}
526*c14be686SAndroid Build Coastguard Worker     }
527*c14be686SAndroid Build Coastguard Worker     *sourceStart = source;
528*c14be686SAndroid Build Coastguard Worker     *targetStart = target;
529*c14be686SAndroid Build Coastguard Worker     return result;
530*c14be686SAndroid Build Coastguard Worker }
531*c14be686SAndroid Build Coastguard Worker 
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in the UTF-8 reading code save a
    temporary variable and some decrements and conditionals.  The
    switches are equivalent to the following loop:
	{
	    int tmpBytesToRead = extraBytesToRead+1;
	    do {
		ch += *source++;
		--tmpBytesToRead;
		if (tmpBytesToRead) ch <<= 6;
	    } while (tmpBytesToRead > 0);
	}
    In the UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */
550