oculus1
diff libovr/Src/Kernel/OVR_UTF8Util.cpp @ 1:e2f9e4603129
added LibOVR and started a simple vr wrapper.
author | John Tsiombikas <nuclear@member.fsf.org> |
---|---|
date | Sat, 14 Sep 2013 16:14:59 +0300 |
parents | |
children | b069a5c27388 |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/libovr/Src/Kernel/OVR_UTF8Util.cpp Sat Sep 14 16:14:59 2013 +0300 1.3 @@ -0,0 +1,1 @@ 1.4 +/************************************************************************** 1.5 1.6 Filename : OVR_UTF8Util.cpp 1.7 Content : UTF8 Unicode character encoding/decoding support 1.8 Created : September 19, 2012 1.9 Notes : 1.10 Notes : Much useful info at "UTF-8 and Unicode FAQ" 1.11 http://www.cl.cam.ac.uk/~mgk25/unicode.html 1.12 1.13 Copyright : Copyright 2012 Oculus VR, Inc. All Rights reserved. 1.14 1.15 Use of this software is subject to the terms of the Oculus license 1.16 agreement provided at the time of installation or download, or which 1.17 otherwise accompanies this software in either electronic or hard copy form. 1.18 1.19 ************************************************************************************/ 1.20 1.21 #include "OVR_UTF8Util.h" 1.22 1.23 namespace OVR { namespace UTF8Util { 1.24 1.25 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen) 1.26 { 1.27 const char* p = buf; 1.28 SPInt length = 0; 1.29 1.30 if (buflen != -1) 1.31 { 1.32 while (p - buf < buflen) 1.33 { 1.34 // We should be able to have ASStrings with 0 in the middle. 1.35 UTF8Util::DecodeNextChar_Advance0(&p); 1.36 length++; 1.37 } 1.38 } 1.39 else 1.40 { 1.41 while (UTF8Util::DecodeNextChar_Advance0(&p)) 1.42 length++; 1.43 } 1.44 1.45 return length; 1.46 } 1.47 1.48 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length) 1.49 { 1.50 const char* buf = putf8str; 1.51 UInt32 c = 0; 1.52 1.53 if (length != -1) 1.54 { 1.55 while (buf - putf8str < length) 1.56 { 1.57 c = UTF8Util::DecodeNextChar_Advance0(&buf); 1.58 if (index == 0) 1.59 return c; 1.60 index--; 1.61 } 1.62 1.63 return c; 1.64 } 1.65 1.66 do 1.67 { 1.68 c = UTF8Util::DecodeNextChar_Advance0(&buf); 1.69 index--; 1.70 1.71 if (c == 0) 1.72 { 1.73 // We've hit the end of the string; don't go further. 1.74 OVR_ASSERT(index == 0); 1.75 return c; 1.76 } 1.77 } while (index >= 0); 1.78 1.79 return c; 1.80 } 1.81 1.82 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length) 1.83 { 1.84 const char* buf = putf8str; 1.85 1.86 if (length != -1) 1.87 { 1.88 while ((buf - putf8str) < length && index > 0) 1.89 { 1.90 UTF8Util::DecodeNextChar_Advance0(&buf); 1.91 index--; 1.92 } 1.93 1.94 return buf-putf8str; 1.95 } 1.96 1.97 while (index > 0) 1.98 { 1.99 UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf); 1.100 index--; 1.101 1.102 if (c == 0) 1.103 return buf-putf8str; 1.104 }; 1.105 1.106 return buf-putf8str; 1.107 } 1.108 1.109 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character) 1.110 { 1.111 if (ucs_character <= 0x7F) 1.112 return 1; 1.113 else if (ucs_character <= 0x7FF) 1.114 return 2; 1.115 else if (ucs_character <= 0xFFFF) 1.116 return 3; 1.117 else if (ucs_character <= 0x1FFFFF) 1.118 return 4; 1.119 else if (ucs_character <= 0x3FFFFFF) 1.120 return 5; 1.121 else if (ucs_character <= 0x7FFFFFFF) 1.122 return 6; 1.123 else 1.124 return 0; 1.125 } 1.126 1.127 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer) 1.128 { 1.129 UInt32 uc; 1.130 char c; 1.131 1.132 // Security considerations: 1.133 // 1.134 // Changed, this is now only the case for DecodeNextChar: 1.135 // - If we hit a zero byte, we want to return 0 without stepping 1.136 // the buffer pointer past the 0. th 1.137 // 1.138 // If we hit an "overlong sequence"; i.e. a character encoded 1.139 // in a longer multibyte string than is necessary, then we 1.140 // need to discard the character. This is so attackers can't 1.141 // disguise dangerous characters or character sequences -- 1.142 // there is only one valid encoding for each character. 1.143 // 1.144 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE, 1.145 // 0xFFFF } then we ignore them; they are not valid in UTF-8. 1.146 1.147 // This isn't actually an invalid character; it's a valid char that 1.148 // looks like an inverted question mark. 1.149 #define INVALID_CHAR 0x0FFFD 1.150 1.151 #define FIRST_BYTE(mask, shift) \ 1.152 uc = (c & (mask)) << (shift); 1.153 1.154 #define NEXT_BYTE(shift) \ 1.155 c = **putf8Buffer; \ 1.156 if (c == 0) return 0; /* end of buffer, do not advance */ \ 1.157 if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \ 1.158 (*putf8Buffer)++; \ 1.159 uc |= (c & 0x3F) << shift; 1.160 1.161 c = **putf8Buffer; 1.162 (*putf8Buffer)++; 1.163 if (c == 0) 1.164 return 0; // End of buffer. 1.165 1.166 if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII. 1.167 1.168 // Multi-byte sequences. 1.169 if ((c & 0xE0) == 0xC0) 1.170 { 1.171 // Two-byte sequence. 1.172 FIRST_BYTE(0x1F, 6); 1.173 NEXT_BYTE(0); 1.174 if (uc < 0x80) return INVALID_CHAR; // overlong 1.175 return uc; 1.176 } 1.177 else if ((c & 0xF0) == 0xE0) 1.178 { 1.179 // Three-byte sequence. 1.180 FIRST_BYTE(0x0F, 12); 1.181 NEXT_BYTE(6); 1.182 NEXT_BYTE(0); 1.183 if (uc < 0x800) return INVALID_CHAR; // overlong 1.184 // Not valid ISO 10646, but Flash requires these to work 1.185 // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0) 1.186 // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR; 1.187 // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646 1.188 return uc; 1.189 } 1.190 else if ((c & 0xF8) == 0xF0) 1.191 { 1.192 // Four-byte sequence. 1.193 FIRST_BYTE(0x07, 18); 1.194 NEXT_BYTE(12); 1.195 NEXT_BYTE(6); 1.196 NEXT_BYTE(0); 1.197 if (uc < 0x010000) return INVALID_CHAR; // overlong 1.198 return uc; 1.199 } 1.200 else if ((c & 0xFC) == 0xF8) 1.201 { 1.202 // Five-byte sequence. 1.203 FIRST_BYTE(0x03, 24); 1.204 NEXT_BYTE(18); 1.205 NEXT_BYTE(12); 1.206 NEXT_BYTE(6); 1.207 NEXT_BYTE(0); 1.208 if (uc < 0x0200000) return INVALID_CHAR; // overlong 1.209 return uc; 1.210 } 1.211 else if ((c & 0xFE) == 0xFC) 1.212 { 1.213 // Six-byte sequence. 1.214 FIRST_BYTE(0x01, 30); 1.215 NEXT_BYTE(24); 1.216 NEXT_BYTE(18); 1.217 NEXT_BYTE(12); 1.218 NEXT_BYTE(6); 1.219 NEXT_BYTE(0); 1.220 if (uc < 0x04000000) return INVALID_CHAR; // overlong 1.221 return uc; 1.222 } 1.223 else 1.224 { 1.225 // Invalid. 1.226 return INVALID_CHAR; 1.227 } 1.228 } 1.229 1.230 1.231 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character) 1.232 { 1.233 if (ucs_character <= 0x7F) 1.234 { 1.235 // Plain single-byte ASCII. 1.236 pbuffer[(*pindex)++] = (char) ucs_character; 1.237 } 1.238 else if (ucs_character <= 0x7FF) 1.239 { 1.240 // Two bytes. 1.241 pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6); 1.242 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.243 } 1.244 else if (ucs_character <= 0xFFFF) 1.245 { 1.246 // Three bytes. 1.247 pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12); 1.248 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.249 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.250 } 1.251 else if (ucs_character <= 0x1FFFFF) 1.252 { 1.253 // Four bytes. 1.254 pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18); 1.255 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); 1.256 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.257 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.258 } 1.259 else if (ucs_character <= 0x3FFFFFF) 1.260 { 1.261 // Five bytes. 1.262 pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24); 1.263 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); 1.264 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); 1.265 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.266 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.267 } 1.268 else if (ucs_character <= 0x7FFFFFFF) 1.269 { 1.270 // Six bytes. 1.271 pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30); 1.272 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F); 1.273 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); 1.274 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); 1.275 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); 1.276 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); 1.277 } 1.278 else 1.279 { 1.280 // Invalid char; don't encode anything. 1.281 } 1.282 } 1.283 1.284 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length) 1.285 { 1.286 SPInt len = 0; 1.287 if (length != -1) 1.288 for (int i = 0; i < length; i++) 1.289 { 1.290 len += GetEncodeCharSize(pchar[i]); 1.291 } 1.292 else 1.293 for (int i = 0;; i++) 1.294 { 1.295 if (pchar[i] == 0) 1.296 return len; 1.297 len += GetEncodeCharSize(pchar[i]); 1.298 } 1.299 return len; 1.300 } 1.301 1.302 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length) 1.303 { 1.304 SPInt ofs = 0; 1.305 if (length != -1) 1.306 { 1.307 for (int i = 0; i < length; i++) 1.308 { 1.309 EncodeChar(pbuff, &ofs, pchar[i]); 1.310 } 1.311 } 1.312 else 1.313 { 1.314 for (int i = 0;; i++) 1.315 { 1.316 if (pchar[i] == 0) 1.317 break; 1.318 EncodeChar(pbuff, &ofs, pchar[i]); 1.319 } 1.320 } 1.321 pbuff[ofs] = 0; 1.322 } 1.323 1.324 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen) 1.325 { 1.326 wchar_t *pbegin = pbuff; 1.327 if (bytesLen == -1) 1.328 { 1.329 while (1) 1.330 { 1.331 UInt32 ch = DecodeNextChar_Advance0(&putf8str); 1.332 if (ch == 0) 1.333 break; 1.334 else if (ch >= 0xFFFF) 1.335 ch = 0xFFFD; 1.336 *pbuff++ = wchar_t(ch); 1.337 } 1.338 } 1.339 else 1.340 { 1.341 const char* p = putf8str; 1.342 while ((p - putf8str) < bytesLen) 1.343 { 1.344 UInt32 ch = DecodeNextChar_Advance0(&p); 1.345 if (ch >= 0xFFFF) 1.346 ch = 0xFFFD; 1.347 *pbuff++ = wchar_t(ch); 1.348 } 1.349 } 1.350 1.351 *pbuff = 0; 1.352 return pbuff - pbegin; 1.353 } 1.354 1.355 1.356 #ifdef UTF8_UNIT_TEST 1.357 1.358 // Compile this test case with something like: 1.359 // 1.360 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test 1.361 // 1.362 // or 1.363 // 1.364 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I.. 1.365 // 1.366 // If possible, try running the test program with the first arg 1.367 // pointing at the file: 1.368 // 1.369 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt 1.370 // 1.371 // and examine the results by eye to make sure they are acceptable to 1.372 // you. 1.373 1.374 1.375 #include "base/utility.h" 1.376 #include <stdio.h> 1.377 1.378 1.379 bool check_equal(const char* utf8_in, const UInt32* ucs_in) 1.380 { 1.381 for (;;) 1.382 { 1.383 UInt32 next_ucs = *ucs_in++; 1.384 UInt32 next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in); 1.385 if (next_ucs != next_ucs_from_utf8) 1.386 { 1.387 return false; 1.388 } 1.389 if (next_ucs == 0) 1.390 { 1.391 OVR_ASSERT(next_ucs_from_utf8 == 0); 1.392 break; 1.393 } 1.394 } 1.395 1.396 return true; 1.397 } 1.398 1.399 1.400 void log_ascii(const char* line) 1.401 { 1.402 for (;;) 1.403 { 1.404 unsigned char c = (unsigned char) *line++; 1.405 if (c == 0) 1.406 { 1.407 // End of line. 1.408 return; 1.409 } 1.410 else if (c != '\n' 1.411 && (c < 32 || c > 127)) 1.412 { 1.413 // Non-printable as plain ASCII. 1.414 printf("<0x%02X>", (int) c); 1.415 } 1.416 else 1.417 { 1.418 printf("%c", c); 1.419 } 1.420 } 1.421 } 1.422 1.423 1.424 void log_ucs(const UInt32* line) 1.425 { 1.426 for (;;) 1.427 { 1.428 UInt32 uc = *line++; 1.429 if (uc == 0) 1.430 { 1.431 // End of line. 1.432 return; 1.433 } 1.434 else if (uc != '\n' 1.435 && (uc < 32 || uc > 127)) 1.436 { 1.437 // Non-printable as plain ASCII. 1.438 printf("<U-%04X>", uc); 1.439 } 1.440 else 1.441 { 1.442 printf("%c", (char) uc); 1.443 } 1.444 } 1.445 } 1.446 1.447 1.448 // Simple canned test. 1.449 int main(int argc, const char* argv[]) 1.450 { 1.451 { 1.452 const char* test8 = "Ignacio CastaƱo"; 1.453 const UInt32 test32[] = 1.454 { 1.455 0x49, 0x67, 0x6E, 0x61, 0x63, 1.456 0x69, 0x6F, 0x20, 0x43, 0x61, 1.457 0x73, 0x74, 0x61, 0xF1, 0x6F, 1.458 0x00 1.459 }; 1.460 1.461 OVR_ASSERT(check_equal(test8, test32)); 1.462 } 1.463 1.464 // If user passed an arg, try reading the file as UTF-8 encoded text. 1.465 if (argc > 1) 1.466 { 1.467 const char* filename = argv[1]; 1.468 FILE* fp = fopen(filename, "rb"); 1.469 if (fp == NULL) 1.470 { 1.471 printf("Can't open file '%s'\n", filename); 1.472 return 1; 1.473 } 1.474 1.475 // Read lines from the file, encode/decode them, and highlight discrepancies. 1.476 const int LINE_SIZE = 200; // max line size 1.477 char line_buffer_utf8[LINE_SIZE]; 1.478 char reencoded_utf8[6 * LINE_SIZE]; 1.479 UInt32 line_buffer_ucs[LINE_SIZE]; 1.480 1.481 int byte_counter = 0; 1.482 for (;;) 1.483 { 1.484 int c = fgetc(fp); 1.485 if (c == EOF) 1.486 { 1.487 // Done. 1.488 break; 1.489 } 1.490 line_buffer_utf8[byte_counter++] = c; 1.491 if (c == '\n' || byte_counter >= LINE_SIZE - 2) 1.492 { 1.493 // End of line. Process the line. 1.494 line_buffer_utf8[byte_counter++] = 0; // terminate. 1.495 1.496 // Decode into UCS. 1.497 const char* p = line_buffer_utf8; 1.498 UInt32* q = line_buffer_ucs; 1.499 for (;;) 1.500 { 1.501 UInt32 uc = UTF8Util::DecodeNextChar(&p); 1.502 *q++ = uc; 1.503 1.504 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE); 1.505 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE); 1.506 1.507 if (uc == 0) break; 1.508 } 1.509 1.510 // Encode back into UTF-8. 1.511 q = line_buffer_ucs; 1.512 int index = 0; 1.513 for (;;) 1.514 { 1.515 UInt32 uc = *q++; 1.516 OVR_ASSERT(index < LINE_SIZE * 6 - 6); 1.517 int last_index = index; 1.518 UTF8Util::EncodeChar(reencoded_utf8, &index, uc); 1.519 OVR_ASSERT(index <= last_index + 6); 1.520 if (uc == 0) break; 1.521 } 1.522 1.523 // This can be useful for debugging. 1.524 #if 0 1.525 // Show the UCS and the re-encoded UTF-8. 1.526 log_ucs(line_buffer_ucs); 1.527 log_ascii(reencoded_utf8); 1.528 #endif // 0 1.529 1.530 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs)); 1.531 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs)); 1.532 1.533 // Start next line. 1.534 byte_counter = 0; 1.535 } 1.536 } 1.537 1.538 fclose(fp); 1.539 } 1.540 1.541 return 0; 1.542 } 1.543 1.544 1.545 #endif // UTF8_UNIT_TEST 1.546 1.547 }} // namespace UTF8Util::OVR 1.548 1.549 \ No newline at end of file