ovr_sdk
diff LibOVR/Src/Kernel/OVR_UTF8Util.cpp @ 0:1b39a1b46319
initial 0.4.4
author | John Tsiombikas <nuclear@member.fsf.org> |
---|---|
date | Wed, 14 Jan 2015 06:51:16 +0200 |
parents | |
children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/LibOVR/Src/Kernel/OVR_UTF8Util.cpp Wed Jan 14 06:51:16 2015 +0200 1.3 @@ -0,0 +1,556 @@ 1.4 +/************************************************************************** 1.5 + 1.6 +Filename : OVR_UTF8Util.cpp 1.7 +Content : UTF8 Unicode character encoding/decoding support 1.8 +Created : September 19, 2012 1.9 +Notes : 1.10 +Notes : Much useful info at "UTF-8 and Unicode FAQ" 1.11 + http://www.cl.cam.ac.uk/~mgk25/unicode.html 1.12 + 1.13 +Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved. 1.14 + 1.15 +Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License"); 1.16 +you may not use the Oculus VR Rift SDK except in compliance with the License, 1.17 +which is provided at the time of installation or download, or which 1.18 +otherwise accompanies this software in either electronic or hard copy form. 1.19 + 1.20 +You may obtain a copy of the License at 1.21 + 1.22 +http://www.oculusvr.com/licenses/LICENSE-3.2 1.23 + 1.24 +Unless required by applicable law or agreed to in writing, the Oculus VR SDK 1.25 +distributed under the License is distributed on an "AS IS" BASIS, 1.26 +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.27 +See the License for the specific language governing permissions and 1.28 +limitations under the License. 1.29 + 1.30 +************************************************************************************/ 1.31 + 1.32 +#include "OVR_UTF8Util.h" 1.33 + 1.34 +namespace OVR { namespace UTF8Util { 1.35 + 1.36 +intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen) 1.37 +{ 1.38 + const char* p = buf; 1.39 + intptr_t length = 0; 1.40 + 1.41 + if (buflen != -1) 1.42 + { 1.43 + while (p - buf < buflen) 1.44 + { 1.45 + // We should be able to have ASStrings with 0 in the middle. 1.46 + UTF8Util::DecodeNextChar_Advance0(&p); 1.47 + length++; 1.48 + } 1.49 + } 1.50 + else 1.51 + { 1.52 + while (UTF8Util::DecodeNextChar_Advance0(&p)) 1.53 + length++; 1.54 + } 1.55 + 1.56 + return length; 1.57 +} 1.58 + 1.59 +uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length) 1.60 +{ 1.61 + const char* buf = putf8str; 1.62 + uint32_t c = 0; 1.63 + 1.64 + if (length != -1) 1.65 + { 1.66 + while (buf - putf8str < length) 1.67 + { 1.68 + c = UTF8Util::DecodeNextChar_Advance0(&buf); 1.69 + if (index == 0) 1.70 + return c; 1.71 + index--; 1.72 + } 1.73 + 1.74 + return c; 1.75 + } 1.76 + 1.77 + do 1.78 + { 1.79 + c = UTF8Util::DecodeNextChar_Advance0(&buf); 1.80 + index--; 1.81 + 1.82 + if (c == 0) 1.83 + { 1.84 + // We've hit the end of the string; don't go further. 1.85 + OVR_ASSERT(index == 0); 1.86 + return c; 1.87 + } 1.88 + } while (index >= 0); 1.89 + 1.90 + return c; 1.91 +} 1.92 + 1.93 +intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length) 1.94 +{ 1.95 + const char* buf = putf8str; 1.96 + 1.97 + if (length != -1) 1.98 + { 1.99 + while ((buf - putf8str) < length && index > 0) 1.100 + { 1.101 + UTF8Util::DecodeNextChar_Advance0(&buf); 1.102 + index--; 1.103 + } 1.104 + 1.105 + return buf-putf8str; 1.106 + } 1.107 + 1.108 + while (index > 0) 1.109 + { 1.110 + uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf); 1.111 + index--; 1.112 + 1.113 + if (c == 0) 1.114 + return buf-putf8str; 1.115 + }; 1.116 + 1.117 + return buf-putf8str; 1.118 +} 1.119 + 1.120 +int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character) 1.121 +{ 1.122 + if (ucs_character <= 0x7F) 1.123 + return 1; 1.124 + else if (ucs_character <= 0x7FF) 1.125 + return 2; 1.126 + else if (ucs_character <= 0xFFFF) 1.127 + return 3; 1.128 + else if (ucs_character <= 0x1FFFFF) 1.129 + return 4; 1.130 + else if (ucs_character <= 0x3FFFFFF) 1.131 + return 5; 1.132 + else if (ucs_character <= 0x7FFFFFFF) 1.133 + return 6; 1.134 + else 1.135 + return 0; 1.136 +} 1.137 + 1.138 +uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer) 1.139 +{ 1.140 + uint32_t uc; 1.141 + char c; 1.142 + 1.143 + // Security considerations: 1.144 + // 1.145 + // Changed, this is now only the case for DecodeNextChar: 1.146 + // - If we hit a zero byte, we want to return 0 without stepping 1.147 + // the buffer pointer past the 0. th 1.148 + // 1.149 + // If we hit an "overlong sequence"; i.e. a character encoded 1.150 + // in a longer multibyte string than is necessary, then we 1.151 + // need to discard the character. This is so attackers can't 1.152 + // disguise dangerous characters or character sequences -- 1.153 + // there is only one valid encoding for each character. 1.154 + // 1.155 + // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE, 1.156 + // 0xFFFF } then we ignore them; they are not valid in UTF-8. 1.157 + 1.158 + // This isn't actually an invalid character; it's a valid char that 1.159 + // looks like an inverted question mark. 1.160 +#define INVALID_CHAR 0x0FFFD 1.161 + 1.162 +#define FIRST_BYTE(mask, shift) \ 1.163 + uc = (c & (mask)) << (shift); 1.164 + 1.165 +#define NEXT_BYTE(shift) \ 1.166 + c = **putf8Buffer; \ 1.167 + if (c == 0) return 0; /* end of buffer, do not advance */ \ 1.168 + if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \ 1.169 + (*putf8Buffer)++; \ 1.170 + uc |= (c & 0x3F) << shift; 1.171 + 1.172 + c = **putf8Buffer; 1.173 + (*putf8Buffer)++; 1.174 + if (c == 0) 1.175 + return 0; // End of buffer. 1.176 + 1.177 + if ((c & 0x80) == 0) return (uint32_t) c; // Conventional 7-bit ASCII. 1.178 + 1.179 + // Multi-byte sequences. 1.180 + if ((c & 0xE0) == 0xC0) 1.181 + { 1.182 + // Two-byte sequence. 1.183 + FIRST_BYTE(0x1F, 6); 1.184 + NEXT_BYTE(0); 1.185 + if (uc < 0x80) return INVALID_CHAR; // overlong 1.186 + return uc; 1.187 + } 1.188 + else if ((c & 0xF0) == 0xE0) 1.189 + { 1.190 + // Three-byte sequence. 1.191 + FIRST_BYTE(0x0F, 12); 1.192 + NEXT_BYTE(6); 1.193 + NEXT_BYTE(0); 1.194 + if (uc < 0x800) return INVALID_CHAR; // overlong 1.195 + // Not valid ISO 10646, but Flash requires these to work 1.196 + // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0) 1.197 + // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR; 1.198 + // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646 1.199 + return uc; 1.200 + } 1.201 + else if ((c & 0xF8) == 0xF0) 1.202 + { 1.203 + // Four-byte sequence. 1.204 + FIRST_BYTE(0x07, 18); 1.205 + NEXT_BYTE(12); 1.206 + NEXT_BYTE(6); 1.207 + NEXT_BYTE(0); 1.208 + if (uc < 0x010000) return INVALID_CHAR; // overlong 1.209 + return uc; 1.210 + } 1.211 + else if ((c & 0xFC) == 0xF8) 1.212 + { 1.213 + // Five-byte sequence. 1.214 + FIRST_BYTE(0x03, 24); 1.215 + NEXT_BYTE(18); 1.216 + NEXT_BYTE(12); 1.217 + NEXT_BYTE(6); 1.218 + NEXT_BYTE(0); 1.219 + if (uc < 0x0200000) return INVALID_CHAR; // overlong 1.220 + return uc; 1.221 + } 1.222 + else if ((c & 0xFE) == 0xFC) 1.223 + { 1.224 + // Six-byte sequence. 1.225 + FIRST_BYTE(0x01, 30); 1.226 + NEXT_BYTE(24); 1.227 + NEXT_BYTE(18); 1.228 + NEXT_BYTE(12); 1.229 + NEXT_BYTE(6); 1.230 + NEXT_BYTE(0); 1.231 + if (uc < 0x04000000) return INVALID_CHAR; // overlong 1.232 + return uc; 1.233 + } 1.234 + else 1.235 + { 1.236 + // Invalid. 1.237 + return INVALID_CHAR; 1.238 + } 1.239 +} 1.240 + 1.241 + 1.242 +void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character) 1.243 +{ 1.244 + if (ucs_character <= 0x7F) 1.245 + { 1.246 + // Plain single-byte ASCII. 1.247 + pbuffer[(*pindex)++] = (char) ucs_character; 1.248 + } 1.249 + else if (ucs_character <= 0x7FF) 1.250 + { 1.251 + // Two bytes. 1.252 + pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6); 1.253 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.254 + } 1.255 + else if (ucs_character <= 0xFFFF) 1.256 + { 1.257 + // Three bytes. 1.258 + pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12); 1.259 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.260 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.261 + } 1.262 + else if (ucs_character <= 0x1FFFFF) 1.263 + { 1.264 + // Four bytes. 1.265 + pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18); 1.266 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); 1.267 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.268 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.269 + } 1.270 + else if (ucs_character <= 0x3FFFFFF) 1.271 + { 1.272 + // Five bytes. 1.273 + pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24); 1.274 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); 1.275 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); 1.276 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.277 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.278 + } 1.279 + else if (ucs_character <= 0x7FFFFFFF) 1.280 + { 1.281 + // Six bytes. 1.282 + pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30); 1.283 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F); 1.284 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); 1.285 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); 1.286 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.287 + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.288 + } 1.289 + else 1.290 + { 1.291 + // Invalid char; don't encode anything. 1.292 + } 1.293 +} 1.294 + 1.295 +intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length) 1.296 +{ 1.297 + intptr_t len = 0; 1.298 + if (length != -1) 1.299 + for (int i = 0; i < length; i++) 1.300 + { 1.301 + len += GetEncodeCharSize(pchar[i]); 1.302 + } 1.303 + else 1.304 + for (int i = 0;; i++) 1.305 + { 1.306 + if (pchar[i] == 0) 1.307 + return len; 1.308 + len += GetEncodeCharSize(pchar[i]); 1.309 + } 1.310 + return len; 1.311 +} 1.312 + 1.313 +void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length) 1.314 +{ 1.315 + intptr_t ofs = 0; 1.316 + if (length != -1) 1.317 + { 1.318 + for (int i = 0; i < length; i++) 1.319 + { 1.320 + EncodeChar(pbuff, &ofs, pchar[i]); 1.321 + } 1.322 + } 1.323 + else 1.324 + { 1.325 + for (int i = 0;; i++) 1.326 + { 1.327 + if (pchar[i] == 0) 1.328 + break; 1.329 + EncodeChar(pbuff, &ofs, pchar[i]); 1.330 + } 1.331 + } 1.332 + pbuff[ofs] = 0; 1.333 +} 1.334 + 1.335 +size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen) 1.336 +{ 1.337 + wchar_t *pbegin = pbuff; 1.338 + if (bytesLen == -1) 1.339 + { 1.340 + while (1) 1.341 + { 1.342 + uint32_t ch = DecodeNextChar_Advance0(&putf8str); 1.343 + if (ch == 0) 1.344 + break; 1.345 + else if (ch >= 0xFFFF) 1.346 + ch = 0xFFFD; 1.347 + *pbuff++ = wchar_t(ch); 1.348 + } 1.349 + } 1.350 + else 1.351 + { 1.352 + const char* p = putf8str; 1.353 + while ((p - putf8str) < bytesLen) 1.354 + { 1.355 + uint32_t ch = DecodeNextChar_Advance0(&p); 1.356 + if (ch >= 0xFFFF) 1.357 + ch = 0xFFFD; 1.358 + *pbuff++ = wchar_t(ch); 1.359 + } 1.360 + } 1.361 + 1.362 + *pbuff = 0; 1.363 + return pbuff - pbegin; 1.364 +} 1.365 + 1.366 + 1.367 +#ifdef UTF8_UNIT_TEST 1.368 + 1.369 +// Compile this test case with something like: 1.370 +// 1.371 +// gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test 1.372 +// 1.373 +// or 1.374 +// 1.375 +// cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I.. 1.376 +// 1.377 +// If possible, try running the test program with the first arg 1.378 +// pointing at the file: 1.379 +// 1.380 +// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt 1.381 +// 1.382 +// and examine the results by eye to make sure they are acceptable to 1.383 +// you. 1.384 + 1.385 + 1.386 +#include "base/utility.h" 1.387 +#include <stdio.h> 1.388 + 1.389 + 1.390 +bool check_equal(const char* utf8_in, const uint32_t* ucs_in) 1.391 +{ 1.392 + for (;;) 1.393 + { 1.394 + uint32_t next_ucs = *ucs_in++; 1.395 + uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in); 1.396 + if (next_ucs != next_ucs_from_utf8) 1.397 + { 1.398 + return false; 1.399 + } 1.400 + if (next_ucs == 0) 1.401 + { 1.402 + OVR_ASSERT(next_ucs_from_utf8 == 0); 1.403 + break; 1.404 + } 1.405 + } 1.406 + 1.407 + return true; 1.408 +} 1.409 + 1.410 + 1.411 +void log_ascii(const char* line) 1.412 +{ 1.413 + for (;;) 1.414 + { 1.415 + unsigned char c = (unsigned char) *line++; 1.416 + if (c == 0) 1.417 + { 1.418 + // End of line. 1.419 + return; 1.420 + } 1.421 + else if (c != '\n' 1.422 + && (c < 32 || c > 127)) 1.423 + { 1.424 + // Non-printable as plain ASCII. 1.425 + printf("<0x%02X>", (int) c); 1.426 + } 1.427 + else 1.428 + { 1.429 + printf("%c", c); 1.430 + } 1.431 + } 1.432 +} 1.433 + 1.434 + 1.435 +void log_ucs(const uint32_t* line) 1.436 +{ 1.437 + for (;;) 1.438 + { 1.439 + uint32_t uc = *line++; 1.440 + if (uc == 0) 1.441 + { 1.442 + // End of line. 1.443 + return; 1.444 + } 1.445 + else if (uc != '\n' 1.446 + && (uc < 32 || uc > 127)) 1.447 + { 1.448 + // Non-printable as plain ASCII. 1.449 + printf("<U-%04X>", uc); 1.450 + } 1.451 + else 1.452 + { 1.453 + printf("%c", (char) uc); 1.454 + } 1.455 + } 1.456 +} 1.457 + 1.458 + 1.459 +// Simple canned test. 1.460 +int main(int argc, const char* argv[]) 1.461 +{ 1.462 + { 1.463 + const char* test8 = "Ignacio CastaƱo"; 1.464 + const uint32_t test32[] = 1.465 + { 1.466 + 0x49, 0x67, 0x6E, 0x61, 0x63, 1.467 + 0x69, 0x6F, 0x20, 0x43, 0x61, 1.468 + 0x73, 0x74, 0x61, 0xF1, 0x6F, 1.469 + 0x00 1.470 + }; 1.471 + 1.472 + OVR_ASSERT(check_equal(test8, test32)); 1.473 + } 1.474 + 1.475 + // If user passed an arg, try reading the file as UTF-8 encoded text. 1.476 + if (argc > 1) 1.477 + { 1.478 + const char* filename = argv[1]; 1.479 + FILE* fp = fopen(filename, "rb"); 1.480 + if (fp == NULL) 1.481 + { 1.482 + printf("Can't open file '%s'\n", filename); 1.483 + return 1; 1.484 + } 1.485 + 1.486 + // Read lines from the file, encode/decode them, and highlight discrepancies. 1.487 + const int LINE_SIZE = 200; // max line size 1.488 + char line_buffer_utf8[LINE_SIZE]; 1.489 + char reencoded_utf8[6 * LINE_SIZE]; 1.490 + uint32_t line_buffer_ucs[LINE_SIZE]; 1.491 + 1.492 + int byte_counter = 0; 1.493 + for (;;) 1.494 + { 1.495 + int c = fgetc(fp); 1.496 + if (c == EOF) 1.497 + { 1.498 + // Done. 1.499 + break; 1.500 + } 1.501 + line_buffer_utf8[byte_counter++] = c; 1.502 + if (c == '\n' || byte_counter >= LINE_SIZE - 2) 1.503 + { 1.504 + // End of line. Process the line. 1.505 + line_buffer_utf8[byte_counter++] = 0; // terminate. 1.506 + 1.507 + // Decode into UCS. 1.508 + const char* p = line_buffer_utf8; 1.509 + uint32_t* q = line_buffer_ucs; 1.510 + for (;;) 1.511 + { 1.512 + uint32_t uc = UTF8Util::DecodeNextChar(&p); 1.513 + *q++ = uc; 1.514 + 1.515 + OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE); 1.516 + OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE); 1.517 + 1.518 + if (uc == 0) break; 1.519 + } 1.520 + 1.521 + // Encode back into UTF-8. 1.522 + q = line_buffer_ucs; 1.523 + int index = 0; 1.524 + for (;;) 1.525 + { 1.526 + uint32_t uc = *q++; 1.527 + OVR_ASSERT(index < LINE_SIZE * 6 - 6); 1.528 + int last_index = index; 1.529 + UTF8Util::EncodeChar(reencoded_utf8, &index, uc); 1.530 + OVR_ASSERT(index <= last_index + 6); 1.531 + if (uc == 0) break; 1.532 + } 1.533 + 1.534 + // This can be useful for debugging. 1.535 +#if 0 1.536 + // Show the UCS and the re-encoded UTF-8. 1.537 + log_ucs(line_buffer_ucs); 1.538 + log_ascii(reencoded_utf8); 1.539 +#endif // 0 1.540 + 1.541 + OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs)); 1.542 + OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs)); 1.543 + 1.544 + // Start next line. 1.545 + byte_counter = 0; 1.546 + } 1.547 + } 1.548 + 1.549 + fclose(fp); 1.550 + } 1.551 + 1.552 + return 0; 1.553 +} 1.554 + 1.555 + 1.556 +#endif // UTF8_UNIT_TEST 1.557 + 1.558 +}} // namespace UTF8Util::OVR 1.559 +