vrshoot
diff libs/assimp/ConvertUTF/ConvertUTF.c @ 0:b2f14e535253
initial commit
author | John Tsiombikas <nuclear@member.fsf.org> |
---|---|
date | Sat, 01 Feb 2014 19:58:19 +0200 |
parents | |
children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/libs/assimp/ConvertUTF/ConvertUTF.c Sat Feb 01 19:58:19 2014 +0200 1.3 @@ -0,0 +1,539 @@ 1.4 +/* 1.5 + * Copyright 2001-2004 Unicode, Inc. 1.6 + * 1.7 + * Disclaimer 1.8 + * 1.9 + * This source code is provided as is by Unicode, Inc. No claims are 1.10 + * made as to fitness for any particular purpose. No warranties of any 1.11 + * kind are expressed or implied. The recipient agrees to determine 1.12 + * applicability of information provided. If this file has been 1.13 + * purchased on magnetic or optical media from Unicode, Inc., the 1.14 + * sole remedy for any claim will be exchange of defective media 1.15 + * within 90 days of receipt. 1.16 + * 1.17 + * Limitations on Rights to Redistribute This Code 1.18 + * 1.19 + * Unicode, Inc. hereby grants the right to freely use the information 1.20 + * supplied in this file in the creation of products supporting the 1.21 + * Unicode Standard, and to make copies of this file in any form 1.22 + * for internal or external distribution as long as this notice 1.23 + * remains attached. 1.24 + */ 1.25 + 1.26 +/* --------------------------------------------------------------------- 1.27 + 1.28 + Conversions between UTF32, UTF-16, and UTF-8. Source code file. 1.29 + Author: Mark E. Davis, 1994. 1.30 + Rev History: Rick McGowan, fixes & updates May 2001. 1.31 + Sept 2001: fixed const & error conditions per 1.32 + mods suggested by S. Parent & A. Lillich. 1.33 + June 2002: Tim Dodd added detection and handling of incomplete 1.34 + source sequences, enhanced error detection, added casts 1.35 + to eliminate compiler warnings. 1.36 + July 2003: slight mods to back out aggressive FFFE detection. 1.37 + Jan 2004: updated switches in from-UTF8 conversions. 1.38 + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 1.39 + 1.40 + See the header file "ConvertUTF.h" for complete documentation. 1.41 + 1.42 +------------------------------------------------------------------------ */ 1.43 + 1.44 + 1.45 +#include "ConvertUTF.h" 1.46 +#ifdef CVTUTF_DEBUG 1.47 +#include <stdio.h> 1.48 +#endif 1.49 + 1.50 +static const int halfShift = 10; /* used for shifting by 10 bits */ 1.51 + 1.52 +static const UTF32 halfBase = 0x0010000UL; 1.53 +static const UTF32 halfMask = 0x3FFUL; 1.54 + 1.55 +#define UNI_SUR_HIGH_START (UTF32)0xD800 1.56 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF 1.57 +#define UNI_SUR_LOW_START (UTF32)0xDC00 1.58 +#define UNI_SUR_LOW_END (UTF32)0xDFFF 1.59 +#define false 0 1.60 +#define true 1 1.61 + 1.62 +/* --------------------------------------------------------------------- */ 1.63 + 1.64 +ConversionResult ConvertUTF32toUTF16 ( 1.65 + const UTF32** sourceStart, const UTF32* sourceEnd, 1.66 + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 1.67 + ConversionResult result = conversionOK; 1.68 + const UTF32* source = *sourceStart; 1.69 + UTF16* target = *targetStart; 1.70 + while (source < sourceEnd) { 1.71 + UTF32 ch; 1.72 + if (target >= targetEnd) { 1.73 + result = targetExhausted; break; 1.74 + } 1.75 + ch = *source++; 1.76 + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 1.77 + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 1.78 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.79 + if (flags == strictConversion) { 1.80 + --source; /* return to the illegal value itself */ 1.81 + result = sourceIllegal; 1.82 + break; 1.83 + } else { 1.84 + *target++ = UNI_REPLACEMENT_CHAR; 1.85 + } 1.86 + } else { 1.87 + *target++ = (UTF16)ch; /* normal case */ 1.88 + } 1.89 + } else if (ch > UNI_MAX_LEGAL_UTF32) { 1.90 + if (flags == strictConversion) { 1.91 + result = sourceIllegal; 1.92 + } else { 1.93 + *target++ = UNI_REPLACEMENT_CHAR; 1.94 + } 1.95 + } else { 1.96 + /* target is a character in range 0xFFFF - 0x10FFFF. */ 1.97 + if (target + 1 >= targetEnd) { 1.98 + --source; /* Back up source pointer! */ 1.99 + result = targetExhausted; break; 1.100 + } 1.101 + ch -= halfBase; 1.102 + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 1.103 + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 1.104 + } 1.105 + } 1.106 + *sourceStart = source; 1.107 + *targetStart = target; 1.108 + return result; 1.109 +} 1.110 + 1.111 +/* --------------------------------------------------------------------- */ 1.112 + 1.113 +ConversionResult ConvertUTF16toUTF32 ( 1.114 + const UTF16** sourceStart, const UTF16* sourceEnd, 1.115 + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 1.116 + ConversionResult result = conversionOK; 1.117 + const UTF16* source = *sourceStart; 1.118 + UTF32* target = *targetStart; 1.119 + UTF32 ch, ch2; 1.120 + while (source < sourceEnd) { 1.121 + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 1.122 + ch = *source++; 1.123 + /* If we have a surrogate pair, convert to UTF32 first. */ 1.124 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 1.125 + /* If the 16 bits following the high surrogate are in the source buffer... */ 1.126 + if (source < sourceEnd) { 1.127 + ch2 = *source; 1.128 + /* If it's a low surrogate, convert to UTF32. */ 1.129 + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 1.130 + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 1.131 + + (ch2 - UNI_SUR_LOW_START) + halfBase; 1.132 + ++source; 1.133 + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 1.134 + --source; /* return to the illegal value itself */ 1.135 + result = sourceIllegal; 1.136 + break; 1.137 + } 1.138 + } else { /* We don't have the 16 bits following the high surrogate. */ 1.139 + --source; /* return to the high surrogate */ 1.140 + result = sourceExhausted; 1.141 + break; 1.142 + } 1.143 + } else if (flags == strictConversion) { 1.144 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.145 + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 1.146 + --source; /* return to the illegal value itself */ 1.147 + result = sourceIllegal; 1.148 + break; 1.149 + } 1.150 + } 1.151 + if (target >= targetEnd) { 1.152 + source = oldSource; /* Back up source pointer! */ 1.153 + result = targetExhausted; break; 1.154 + } 1.155 + *target++ = ch; 1.156 + } 1.157 + *sourceStart = source; 1.158 + *targetStart = target; 1.159 +#ifdef CVTUTF_DEBUG 1.160 +if (result == sourceIllegal) { 1.161 + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 1.162 + fflush(stderr); 1.163 +} 1.164 +#endif 1.165 + return result; 1.166 +} 1.167 + 1.168 +/* --------------------------------------------------------------------- */ 1.169 + 1.170 +/* 1.171 + * Index into the table below with the first byte of a UTF-8 sequence to 1.172 + * get the number of trailing bytes that are supposed to follow it. 1.173 + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 1.174 + * left as-is for anyone who may want to do such conversion, which was 1.175 + * allowed in earlier algorithms. 1.176 + */ 1.177 +static const char trailingBytesForUTF8[256] = { 1.178 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.179 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.180 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.181 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.182 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.183 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.184 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1.185 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 1.186 +}; 1.187 + 1.188 +/* 1.189 + * Magic values subtracted from a buffer value during UTF8 conversion. 1.190 + * This table contains as many values as there might be trailing bytes 1.191 + * in a UTF-8 sequence. 1.192 + */ 1.193 +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 1.194 + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 1.195 + 1.196 +/* 1.197 + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 1.198 + * into the first byte, depending on how many bytes follow. There are 1.199 + * as many entries in this table as there are UTF-8 sequence types. 1.200 + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 1.201 + * for *legal* UTF-8 will be 4 or fewer bytes total. 1.202 + */ 1.203 +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 1.204 + 1.205 +/* --------------------------------------------------------------------- */ 1.206 + 1.207 +/* The interface converts a whole buffer to avoid function-call overhead. 1.208 + * Constants have been gathered. Loops & conditionals have been removed as 1.209 + * much as possible for efficiency, in favor of drop-through switches. 1.210 + * (See "Note A" at the bottom of the file for equivalent code.) 1.211 + * If your compiler supports it, the "isLegalUTF8" call can be turned 1.212 + * into an inline function. 1.213 + */ 1.214 + 1.215 +/* --------------------------------------------------------------------- */ 1.216 + 1.217 +ConversionResult ConvertUTF16toUTF8 ( 1.218 + const UTF16** sourceStart, const UTF16* sourceEnd, 1.219 + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 1.220 + ConversionResult result = conversionOK; 1.221 + const UTF16* source = *sourceStart; 1.222 + UTF8* target = *targetStart; 1.223 + while (source < sourceEnd) { 1.224 + UTF32 ch; 1.225 + unsigned short bytesToWrite = 0; 1.226 + const UTF32 byteMask = 0xBF; 1.227 + const UTF32 byteMark = 0x80; 1.228 + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 1.229 + ch = *source++; 1.230 + /* If we have a surrogate pair, convert to UTF32 first. */ 1.231 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 1.232 + /* If the 16 bits following the high surrogate are in the source buffer... */ 1.233 + if (source < sourceEnd) { 1.234 + UTF32 ch2 = *source; 1.235 + /* If it's a low surrogate, convert to UTF32. */ 1.236 + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 1.237 + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 1.238 + + (ch2 - UNI_SUR_LOW_START) + halfBase; 1.239 + ++source; 1.240 + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 1.241 + --source; /* return to the illegal value itself */ 1.242 + result = sourceIllegal; 1.243 + break; 1.244 + } 1.245 + } else { /* We don't have the 16 bits following the high surrogate. */ 1.246 + --source; /* return to the high surrogate */ 1.247 + result = sourceExhausted; 1.248 + break; 1.249 + } 1.250 + } else if (flags == strictConversion) { 1.251 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.252 + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 1.253 + --source; /* return to the illegal value itself */ 1.254 + result = sourceIllegal; 1.255 + break; 1.256 + } 1.257 + } 1.258 + /* Figure out how many bytes the result will require */ 1.259 + if (ch < (UTF32)0x80) { bytesToWrite = 1; 1.260 + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 1.261 + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 1.262 + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 1.263 + } else { bytesToWrite = 3; 1.264 + ch = UNI_REPLACEMENT_CHAR; 1.265 + } 1.266 + 1.267 + target += bytesToWrite; 1.268 + if (target > targetEnd) { 1.269 + source = oldSource; /* Back up source pointer! */ 1.270 + target -= bytesToWrite; result = targetExhausted; break; 1.271 + } 1.272 + switch (bytesToWrite) { /* note: everything falls through. */ 1.273 + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.274 + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.275 + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.276 + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 1.277 + } 1.278 + target += bytesToWrite; 1.279 + } 1.280 + *sourceStart = source; 1.281 + *targetStart = target; 1.282 + return result; 1.283 +} 1.284 + 1.285 +/* --------------------------------------------------------------------- */ 1.286 + 1.287 +/* 1.288 + * Utility routine to tell whether a sequence of bytes is legal UTF-8. 1.289 + * This must be called with the length pre-determined by the first byte. 1.290 + * If not calling this from ConvertUTF8to*, then the length can be set by: 1.291 + * length = trailingBytesForUTF8[*source]+1; 1.292 + * and the sequence is illegal right away if there aren't that many bytes 1.293 + * available. 1.294 + * If presented with a length > 4, this returns false. The Unicode 1.295 + * definition of UTF-8 goes up to 4-byte sequences. 1.296 + */ 1.297 + 1.298 +static Boolean isLegalUTF8(const UTF8 *source, int length) { 1.299 + UTF8 a; 1.300 + const UTF8 *srcptr = source+length; 1.301 + switch (length) { 1.302 + default: return false; 1.303 + /* Everything else falls through when "true"... */ 1.304 + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 1.305 + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 1.306 + case 2: if ((a = (*--srcptr)) > 0xBF) return false; 1.307 + 1.308 + switch (*source) { 1.309 + /* no fall-through in this inner switch */ 1.310 + case 0xE0: if (a < 0xA0) return false; break; 1.311 + case 0xED: if (a > 0x9F) return false; break; 1.312 + case 0xF0: if (a < 0x90) return false; break; 1.313 + case 0xF4: if (a > 0x8F) return false; break; 1.314 + default: if (a < 0x80) return false; 1.315 + } 1.316 + 1.317 + case 1: if (*source >= 0x80 && *source < 0xC2) return false; 1.318 + } 1.319 + if (*source > 0xF4) return false; 1.320 + return true; 1.321 +} 1.322 + 1.323 +/* --------------------------------------------------------------------- */ 1.324 + 1.325 +/* 1.326 + * Exported function to return whether a UTF-8 sequence is legal or not. 1.327 + * This is not used here; it's just exported. 1.328 + */ 1.329 +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 1.330 + int length = trailingBytesForUTF8[*source]+1; 1.331 + if (source+length > sourceEnd) { 1.332 + return false; 1.333 + } 1.334 + return isLegalUTF8(source, length); 1.335 +} 1.336 + 1.337 +/* --------------------------------------------------------------------- */ 1.338 + 1.339 +ConversionResult ConvertUTF8toUTF16 ( 1.340 + const UTF8** sourceStart, const UTF8* sourceEnd, 1.341 + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 1.342 + ConversionResult result = conversionOK; 1.343 + const UTF8* source = *sourceStart; 1.344 + UTF16* target = *targetStart; 1.345 + while (source < sourceEnd) { 1.346 + UTF32 ch = 0; 1.347 + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 1.348 + if (source + extraBytesToRead >= sourceEnd) { 1.349 + result = sourceExhausted; break; 1.350 + } 1.351 + /* Do this check whether lenient or strict */ 1.352 + if (! isLegalUTF8(source, extraBytesToRead+1)) { 1.353 + result = sourceIllegal; 1.354 + break; 1.355 + } 1.356 + /* 1.357 + * The cases all fall through. See "Note A" below. 1.358 + */ 1.359 + switch (extraBytesToRead) { 1.360 + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 1.361 + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 1.362 + case 3: ch += *source++; ch <<= 6; 1.363 + case 2: ch += *source++; ch <<= 6; 1.364 + case 1: ch += *source++; ch <<= 6; 1.365 + case 0: ch += *source++; 1.366 + } 1.367 + ch -= offsetsFromUTF8[extraBytesToRead]; 1.368 + 1.369 + if (target >= targetEnd) { 1.370 + source -= (extraBytesToRead+1); /* Back up source pointer! */ 1.371 + result = targetExhausted; break; 1.372 + } 1.373 + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 1.374 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.375 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.376 + if (flags == strictConversion) { 1.377 + source -= (extraBytesToRead+1); /* return to the illegal value itself */ 1.378 + result = sourceIllegal; 1.379 + break; 1.380 + } else { 1.381 + *target++ = UNI_REPLACEMENT_CHAR; 1.382 + } 1.383 + } else { 1.384 + *target++ = (UTF16)ch; /* normal case */ 1.385 + } 1.386 + } else if (ch > UNI_MAX_UTF16) { 1.387 + if (flags == strictConversion) { 1.388 + result = sourceIllegal; 1.389 + source -= (extraBytesToRead+1); /* return to the start */ 1.390 + break; /* Bail out; shouldn't continue */ 1.391 + } else { 1.392 + *target++ = UNI_REPLACEMENT_CHAR; 1.393 + } 1.394 + } else { 1.395 + /* target is a character in range 0xFFFF - 0x10FFFF. */ 1.396 + if (target + 1 >= targetEnd) { 1.397 + source -= (extraBytesToRead+1); /* Back up source pointer! */ 1.398 + result = targetExhausted; break; 1.399 + } 1.400 + ch -= halfBase; 1.401 + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 1.402 + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 1.403 + } 1.404 + } 1.405 + *sourceStart = source; 1.406 + *targetStart = target; 1.407 + return result; 1.408 +} 1.409 + 1.410 +/* --------------------------------------------------------------------- */ 1.411 + 1.412 +ConversionResult ConvertUTF32toUTF8 ( 1.413 + const UTF32** sourceStart, const UTF32* sourceEnd, 1.414 + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 1.415 + ConversionResult result = conversionOK; 1.416 + const UTF32* source = *sourceStart; 1.417 + UTF8* target = *targetStart; 1.418 + while (source < sourceEnd) { 1.419 + UTF32 ch; 1.420 + unsigned short bytesToWrite = 0; 1.421 + const UTF32 byteMask = 0xBF; 1.422 + const UTF32 byteMark = 0x80; 1.423 + ch = *source++; 1.424 + if (flags == strictConversion ) { 1.425 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.426 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.427 + --source; /* return to the illegal value itself */ 1.428 + result = sourceIllegal; 1.429 + break; 1.430 + } 1.431 + } 1.432 + /* 1.433 + * Figure out how many bytes the result will require. Turn any 1.434 + * illegally large UTF32 things (> Plane 17) into replacement chars. 1.435 + */ 1.436 + if (ch < (UTF32)0x80) { bytesToWrite = 1; 1.437 + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 1.438 + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 1.439 + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 1.440 + } else { bytesToWrite = 3; 1.441 + ch = UNI_REPLACEMENT_CHAR; 1.442 + result = sourceIllegal; 1.443 + } 1.444 + 1.445 + target += bytesToWrite; 1.446 + if (target > targetEnd) { 1.447 + --source; /* Back up source pointer! */ 1.448 + target -= bytesToWrite; result = targetExhausted; break; 1.449 + } 1.450 + switch (bytesToWrite) { /* note: everything falls through. */ 1.451 + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.452 + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.453 + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.454 + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 1.455 + } 1.456 + target += bytesToWrite; 1.457 + } 1.458 + *sourceStart = source; 1.459 + *targetStart = target; 1.460 + return result; 1.461 +} 1.462 + 1.463 +/* --------------------------------------------------------------------- */ 1.464 + 1.465 +ConversionResult ConvertUTF8toUTF32 ( 1.466 + const UTF8** sourceStart, const UTF8* sourceEnd, 1.467 + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 1.468 + ConversionResult result = conversionOK; 1.469 + const UTF8* source = *sourceStart; 1.470 + UTF32* target = *targetStart; 1.471 + while (source < sourceEnd) { 1.472 + UTF32 ch = 0; 1.473 + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 1.474 + if (source + extraBytesToRead >= sourceEnd) { 1.475 + result = sourceExhausted; break; 1.476 + } 1.477 + /* Do this check whether lenient or strict */ 1.478 + if (! isLegalUTF8(source, extraBytesToRead+1)) { 1.479 + result = sourceIllegal; 1.480 + break; 1.481 + } 1.482 + /* 1.483 + * The cases all fall through. See "Note A" below. 1.484 + */ 1.485 + switch (extraBytesToRead) { 1.486 + case 5: ch += *source++; ch <<= 6; 1.487 + case 4: ch += *source++; ch <<= 6; 1.488 + case 3: ch += *source++; ch <<= 6; 1.489 + case 2: ch += *source++; ch <<= 6; 1.490 + case 1: ch += *source++; ch <<= 6; 1.491 + case 0: ch += *source++; 1.492 + } 1.493 + ch -= offsetsFromUTF8[extraBytesToRead]; 1.494 + 1.495 + if (target >= targetEnd) { 1.496 + source -= (extraBytesToRead+1); /* Back up the source pointer! */ 1.497 + result = targetExhausted; break; 1.498 + } 1.499 + if (ch <= UNI_MAX_LEGAL_UTF32) { 1.500 + /* 1.501 + * UTF-16 surrogate values are illegal in UTF-32, and anything 1.502 + * over Plane 17 (> 0x10FFFF) is illegal. 1.503 + */ 1.504 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.505 + if (flags == strictConversion) { 1.506 + source -= (extraBytesToRead+1); /* return to the illegal value itself */ 1.507 + result = sourceIllegal; 1.508 + break; 1.509 + } else { 1.510 + *target++ = UNI_REPLACEMENT_CHAR; 1.511 + } 1.512 + } else { 1.513 + *target++ = ch; 1.514 + } 1.515 + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 1.516 + result = sourceIllegal; 1.517 + *target++ = UNI_REPLACEMENT_CHAR; 1.518 + } 1.519 + } 1.520 + *sourceStart = source; 1.521 + *targetStart = target; 1.522 + return result; 1.523 +} 1.524 + 1.525 +/* --------------------------------------------------------------------- 1.526 + 1.527 + Note A. 1.528 + The fall-through switches in UTF-8 reading code save a 1.529 + temp variable, some decrements & conditionals. The switches 1.530 + are equivalent to the following loop: 1.531 + { 1.532 + int tmpBytesToRead = extraBytesToRead+1; 1.533 + do { 1.534 + ch += *source++; 1.535 + --tmpBytesToRead; 1.536 + if (tmpBytesToRead) ch <<= 6; 1.537 + } while (tmpBytesToRead > 0); 1.538 + } 1.539 + In UTF-8 writing code, the switches on "bytesToWrite" are 1.540 + similarly unrolled loops. 1.541 + 1.542 + --------------------------------------------------------------------- */