ovr_sdk

annotate LibOVR/Src/Kernel/OVR_UTF8Util.cpp @ 0:1b39a1b46319

initial 0.4.4
author John Tsiombikas <nuclear@member.fsf.org>
date Wed, 14 Jan 2015 06:51:16 +0200
parents
children
rev   line source
nuclear@0 1 /**************************************************************************
nuclear@0 2
nuclear@0 3 Filename : OVR_UTF8Util.cpp
nuclear@0 4 Content : UTF8 Unicode character encoding/decoding support
nuclear@0 5 Created : September 19, 2012
nuclear@0 6 Notes :
nuclear@0 7 Notes : Much useful info at "UTF-8 and Unicode FAQ"
nuclear@0 8 http://www.cl.cam.ac.uk/~mgk25/unicode.html
nuclear@0 9
nuclear@0 10 Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved.
nuclear@0 11
nuclear@0 12 Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License");
nuclear@0 13 you may not use the Oculus VR Rift SDK except in compliance with the License,
nuclear@0 14 which is provided at the time of installation or download, or which
nuclear@0 15 otherwise accompanies this software in either electronic or hard copy form.
nuclear@0 16
nuclear@0 17 You may obtain a copy of the License at
nuclear@0 18
nuclear@0 19 http://www.oculusvr.com/licenses/LICENSE-3.2
nuclear@0 20
nuclear@0 21 Unless required by applicable law or agreed to in writing, the Oculus VR SDK
nuclear@0 22 distributed under the License is distributed on an "AS IS" BASIS,
nuclear@0 23 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
nuclear@0 24 See the License for the specific language governing permissions and
nuclear@0 25 limitations under the License.
nuclear@0 26
nuclear@0 27 ************************************************************************************/
nuclear@0 28
nuclear@0 29 #include "OVR_UTF8Util.h"
nuclear@0 30
nuclear@0 31 namespace OVR { namespace UTF8Util {
nuclear@0 32
nuclear@0 33 intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen)
nuclear@0 34 {
nuclear@0 35 const char* p = buf;
nuclear@0 36 intptr_t length = 0;
nuclear@0 37
nuclear@0 38 if (buflen != -1)
nuclear@0 39 {
nuclear@0 40 while (p - buf < buflen)
nuclear@0 41 {
nuclear@0 42 // We should be able to have ASStrings with 0 in the middle.
nuclear@0 43 UTF8Util::DecodeNextChar_Advance0(&p);
nuclear@0 44 length++;
nuclear@0 45 }
nuclear@0 46 }
nuclear@0 47 else
nuclear@0 48 {
nuclear@0 49 while (UTF8Util::DecodeNextChar_Advance0(&p))
nuclear@0 50 length++;
nuclear@0 51 }
nuclear@0 52
nuclear@0 53 return length;
nuclear@0 54 }
nuclear@0 55
nuclear@0 56 uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length)
nuclear@0 57 {
nuclear@0 58 const char* buf = putf8str;
nuclear@0 59 uint32_t c = 0;
nuclear@0 60
nuclear@0 61 if (length != -1)
nuclear@0 62 {
nuclear@0 63 while (buf - putf8str < length)
nuclear@0 64 {
nuclear@0 65 c = UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@0 66 if (index == 0)
nuclear@0 67 return c;
nuclear@0 68 index--;
nuclear@0 69 }
nuclear@0 70
nuclear@0 71 return c;
nuclear@0 72 }
nuclear@0 73
nuclear@0 74 do
nuclear@0 75 {
nuclear@0 76 c = UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@0 77 index--;
nuclear@0 78
nuclear@0 79 if (c == 0)
nuclear@0 80 {
nuclear@0 81 // We've hit the end of the string; don't go further.
nuclear@0 82 OVR_ASSERT(index == 0);
nuclear@0 83 return c;
nuclear@0 84 }
nuclear@0 85 } while (index >= 0);
nuclear@0 86
nuclear@0 87 return c;
nuclear@0 88 }
nuclear@0 89
nuclear@0 90 intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length)
nuclear@0 91 {
nuclear@0 92 const char* buf = putf8str;
nuclear@0 93
nuclear@0 94 if (length != -1)
nuclear@0 95 {
nuclear@0 96 while ((buf - putf8str) < length && index > 0)
nuclear@0 97 {
nuclear@0 98 UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@0 99 index--;
nuclear@0 100 }
nuclear@0 101
nuclear@0 102 return buf-putf8str;
nuclear@0 103 }
nuclear@0 104
nuclear@0 105 while (index > 0)
nuclear@0 106 {
nuclear@0 107 uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@0 108 index--;
nuclear@0 109
nuclear@0 110 if (c == 0)
nuclear@0 111 return buf-putf8str;
nuclear@0 112 };
nuclear@0 113
nuclear@0 114 return buf-putf8str;
nuclear@0 115 }
nuclear@0 116
// Returns how many bytes EncodeChar emits for ucs_character (1..6), or 0 when
// the value is above 0x7FFFFFFF and therefore cannot be encoded.
int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character)
{
    // kMaxForSize[n - 1] is the largest code value that fits in n bytes.
    static const uint32_t kMaxForSize[] =
    {
        0x7F, 0x7FF, 0xFFFF, 0x1FFFFF, 0x3FFFFFF, 0x7FFFFFFF
    };
    const int kNumSizes = static_cast<int>(sizeof(kMaxForSize) / sizeof(kMaxForSize[0]));

    for (int size = 1; size <= kNumSizes; ++size)
    {
        if (ucs_character <= kMaxForSize[size - 1])
            return size;
    }

    return 0;
}
nuclear@0 134
// Decodes one UTF-8 encoded character starting at *putf8Buffer and advances
// *putf8Buffer past the bytes that were consumed.
//
// Return value and pointer movement:
//  - Leading byte is 0: returns 0 and the pointer IS advanced past the 0
//    (hence "Advance0").
//  - 7-bit ASCII: returns the byte, pointer advanced by one.
//  - Multi-byte sequence (2 to 6 bytes): returns the decoded value.
//  - A 0 byte where a continuation byte is expected: returns 0; the pointer
//    is left on that 0 byte.
//  - A byte that is not a valid continuation (10xxxxxx) where one is
//    expected, or an invalid leading byte: returns the replacement
//    character 0xFFFD. The offending continuation byte is not consumed.
//
// Security considerations:
//  - An "overlong sequence", i.e. a character encoded in more bytes than
//    necessary, is discarded and reported as 0xFFFD. This keeps attackers
//    from disguising dangerous characters; there is only one valid encoding
//    for each character.
//  - Surrogates { 0xD800 .. 0xDFFF } and { 0xFFFE, 0xFFFF } are NOT rejected
//    here. They are not valid ISO 10646, but Flash requires them to work
//    (see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)).
uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
{
    // This isn't actually an invalid character; it's a valid char that is
    // used to stand in for undecodable input.
    const uint32_t kInvalidChar = 0x0FFFD;

    const unsigned char lead = static_cast<unsigned char>(**putf8Buffer);
    (*putf8Buffer)++;

    if (lead == 0)
        return 0;       // End of buffer.

    if ((lead & 0x80) == 0)
        return lead;    // Conventional 7-bit ASCII.

    // Classify the leading byte: number of continuation bytes that follow,
    // payload bits carried by the leading byte, and the smallest value that
    // legitimately needs a sequence of this length (anything below it is
    // overlong).
    int      continuationCount;
    uint32_t uc;
    uint32_t minValue;

    if ((lead & 0xE0) == 0xC0)
    {
        // Two-byte sequence.
        continuationCount = 1;  uc = lead & 0x1F;  minValue = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        // Three-byte sequence.
        continuationCount = 2;  uc = lead & 0x0F;  minValue = 0x800;
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        // Four-byte sequence.
        continuationCount = 3;  uc = lead & 0x07;  minValue = 0x010000;
    }
    else if ((lead & 0xFC) == 0xF8)
    {
        // Five-byte sequence.
        continuationCount = 4;  uc = lead & 0x03;  minValue = 0x0200000;
    }
    else if ((lead & 0xFE) == 0xFC)
    {
        // Six-byte sequence.
        continuationCount = 5;  uc = lead & 0x01;  minValue = 0x04000000;
    }
    else
    {
        // Invalid leading byte (stray continuation byte, 0xFE or 0xFF).
        return kInvalidChar;
    }

    for (int i = 0; i < continuationCount; ++i)
    {
        const unsigned char next = static_cast<unsigned char>(**putf8Buffer);
        if (next == 0)
            return 0;               // end of buffer, do not advance
        if ((next & 0xC0) != 0x80)
            return kInvalidChar;    // not a continuation byte, do not advance
        (*putf8Buffer)++;
        uc = (uc << 6) | (next & 0x3F);
    }

    if (uc < minValue)
        return kInvalidChar;        // overlong

    return uc;
}
nuclear@0 237
nuclear@0 238
// Writes the UTF-8 encoding of ucs_character into pbuffer starting at
// pbuffer[*pindex], and advances *pindex by the number of bytes written
// (1 to 6). Values above 0x7FFFFFFF are not encodable: nothing is written
// and *pindex is left untouched. No terminator is appended.
void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character)
{
    if (ucs_character <= 0x7F)
    {
        // Plain single-byte ASCII.
        pbuffer[(*pindex)++] = (char) ucs_character;
        return;
    }

    // Pick the lead-byte marker and the number of continuation bytes.
    uint32_t leadMarker;
    int      continuationCount;

    if (ucs_character <= 0x7FF)             { leadMarker = 0xC0; continuationCount = 1; }
    else if (ucs_character <= 0xFFFF)       { leadMarker = 0xE0; continuationCount = 2; }
    else if (ucs_character <= 0x1FFFFF)     { leadMarker = 0xF0; continuationCount = 3; }
    else if (ucs_character <= 0x3FFFFFF)    { leadMarker = 0xF8; continuationCount = 4; }
    else if (ucs_character <= 0x7FFFFFFF)   { leadMarker = 0xFC; continuationCount = 5; }
    else
    {
        // Invalid char; don't encode anything.
        return;
    }

    // Lead byte: marker bits plus the most significant payload bits.
    pbuffer[(*pindex)++] = (char)(leadMarker | (ucs_character >> (6 * continuationCount)));

    // Continuation bytes: 10xxxxxx, six payload bits each, high bits first.
    for (int shift = 6 * (continuationCount - 1); shift >= 0; shift -= 6)
        pbuffer[(*pindex)++] = (char)(0x80 | ((ucs_character >> shift) & 0x3F));
}
nuclear@0 291
nuclear@0 292 intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length)
nuclear@0 293 {
nuclear@0 294 intptr_t len = 0;
nuclear@0 295 if (length != -1)
nuclear@0 296 for (int i = 0; i < length; i++)
nuclear@0 297 {
nuclear@0 298 len += GetEncodeCharSize(pchar[i]);
nuclear@0 299 }
nuclear@0 300 else
nuclear@0 301 for (int i = 0;; i++)
nuclear@0 302 {
nuclear@0 303 if (pchar[i] == 0)
nuclear@0 304 return len;
nuclear@0 305 len += GetEncodeCharSize(pchar[i]);
nuclear@0 306 }
nuclear@0 307 return len;
nuclear@0 308 }
nuclear@0 309
nuclear@0 310 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length)
nuclear@0 311 {
nuclear@0 312 intptr_t ofs = 0;
nuclear@0 313 if (length != -1)
nuclear@0 314 {
nuclear@0 315 for (int i = 0; i < length; i++)
nuclear@0 316 {
nuclear@0 317 EncodeChar(pbuff, &ofs, pchar[i]);
nuclear@0 318 }
nuclear@0 319 }
nuclear@0 320 else
nuclear@0 321 {
nuclear@0 322 for (int i = 0;; i++)
nuclear@0 323 {
nuclear@0 324 if (pchar[i] == 0)
nuclear@0 325 break;
nuclear@0 326 EncodeChar(pbuff, &ofs, pchar[i]);
nuclear@0 327 }
nuclear@0 328 }
nuclear@0 329 pbuff[ofs] = 0;
nuclear@0 330 }
nuclear@0 331
nuclear@0 332 size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen)
nuclear@0 333 {
nuclear@0 334 wchar_t *pbegin = pbuff;
nuclear@0 335 if (bytesLen == -1)
nuclear@0 336 {
nuclear@0 337 while (1)
nuclear@0 338 {
nuclear@0 339 uint32_t ch = DecodeNextChar_Advance0(&putf8str);
nuclear@0 340 if (ch == 0)
nuclear@0 341 break;
nuclear@0 342 else if (ch >= 0xFFFF)
nuclear@0 343 ch = 0xFFFD;
nuclear@0 344 *pbuff++ = wchar_t(ch);
nuclear@0 345 }
nuclear@0 346 }
nuclear@0 347 else
nuclear@0 348 {
nuclear@0 349 const char* p = putf8str;
nuclear@0 350 while ((p - putf8str) < bytesLen)
nuclear@0 351 {
nuclear@0 352 uint32_t ch = DecodeNextChar_Advance0(&p);
nuclear@0 353 if (ch >= 0xFFFF)
nuclear@0 354 ch = 0xFFFD;
nuclear@0 355 *pbuff++ = wchar_t(ch);
nuclear@0 356 }
nuclear@0 357 }
nuclear@0 358
nuclear@0 359 *pbuff = 0;
nuclear@0 360 return pbuff - pbegin;
nuclear@0 361 }
nuclear@0 362
nuclear@0 363
nuclear@0 364 #ifdef UTF8_UNIT_TEST
nuclear@0 365
nuclear@0 366 // Compile this test case with something like:
nuclear@0 367 //
nuclear@0 368 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
nuclear@0 369 //
nuclear@0 370 // or
nuclear@0 371 //
nuclear@0 372 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
nuclear@0 373 //
nuclear@0 374 // If possible, try running the test program with the first arg
nuclear@0 375 // pointing at the file:
nuclear@0 376 //
nuclear@0 377 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
nuclear@0 378 //
nuclear@0 379 // and examine the results by eye to make sure they are acceptable to
nuclear@0 380 // you.
nuclear@0 381
nuclear@0 382
nuclear@0 383 #include "base/utility.h"
nuclear@0 384 #include <stdio.h>
nuclear@0 385
nuclear@0 386
nuclear@0 387 bool check_equal(const char* utf8_in, const uint32_t* ucs_in)
nuclear@0 388 {
nuclear@0 389 for (;;)
nuclear@0 390 {
nuclear@0 391 uint32_t next_ucs = *ucs_in++;
nuclear@0 392 uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
nuclear@0 393 if (next_ucs != next_ucs_from_utf8)
nuclear@0 394 {
nuclear@0 395 return false;
nuclear@0 396 }
nuclear@0 397 if (next_ucs == 0)
nuclear@0 398 {
nuclear@0 399 OVR_ASSERT(next_ucs_from_utf8 == 0);
nuclear@0 400 break;
nuclear@0 401 }
nuclear@0 402 }
nuclear@0 403
nuclear@0 404 return true;
nuclear@0 405 }
nuclear@0 406
nuclear@0 407
// Test helper: prints the 0-terminated byte string `line` to stdout.
// Newlines and bytes in the range 32..127 are printed verbatim; every other
// byte is printed as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* p = line; *p != 0; ++p)
    {
        const unsigned char byte = (unsigned char) *p;
        const bool verbatim = (byte == '\n') || (byte >= 32 && byte <= 127);

        if (verbatim)
            printf("%c", byte);
        else
            printf("<0x%02X>", (int) byte);    // Non-printable as plain ASCII.
    }
}
nuclear@0 430
nuclear@0 431
// Test helper: prints the 0-terminated code-point array `line` to stdout.
// Newlines and values in the range 32..127 are printed as characters; every
// other value is printed as <U-XXXX>.
void log_ucs(const uint32_t* line)
{
    for (const uint32_t* p = line; *p != 0; ++p)
    {
        const uint32_t code = *p;
        const bool verbatim = (code == '\n') || (code >= 32 && code <= 127);

        if (verbatim)
            printf("%c", (char) code);
        else
            printf("<U-%04X>", code);          // Non-printable as plain ASCII.
    }
}
nuclear@0 454
nuclear@0 455
nuclear@0 456 // Simple canned test.
nuclear@0 457 int main(int argc, const char* argv[])
nuclear@0 458 {
nuclear@0 459 {
nuclear@0 460 const char* test8 = "Ignacio CastaƱo";
nuclear@0 461 const uint32_t test32[] =
nuclear@0 462 {
nuclear@0 463 0x49, 0x67, 0x6E, 0x61, 0x63,
nuclear@0 464 0x69, 0x6F, 0x20, 0x43, 0x61,
nuclear@0 465 0x73, 0x74, 0x61, 0xF1, 0x6F,
nuclear@0 466 0x00
nuclear@0 467 };
nuclear@0 468
nuclear@0 469 OVR_ASSERT(check_equal(test8, test32));
nuclear@0 470 }
nuclear@0 471
nuclear@0 472 // If user passed an arg, try reading the file as UTF-8 encoded text.
nuclear@0 473 if (argc > 1)
nuclear@0 474 {
nuclear@0 475 const char* filename = argv[1];
nuclear@0 476 FILE* fp = fopen(filename, "rb");
nuclear@0 477 if (fp == NULL)
nuclear@0 478 {
nuclear@0 479 printf("Can't open file '%s'\n", filename);
nuclear@0 480 return 1;
nuclear@0 481 }
nuclear@0 482
nuclear@0 483 // Read lines from the file, encode/decode them, and highlight discrepancies.
nuclear@0 484 const int LINE_SIZE = 200; // max line size
nuclear@0 485 char line_buffer_utf8[LINE_SIZE];
nuclear@0 486 char reencoded_utf8[6 * LINE_SIZE];
nuclear@0 487 uint32_t line_buffer_ucs[LINE_SIZE];
nuclear@0 488
nuclear@0 489 int byte_counter = 0;
nuclear@0 490 for (;;)
nuclear@0 491 {
nuclear@0 492 int c = fgetc(fp);
nuclear@0 493 if (c == EOF)
nuclear@0 494 {
nuclear@0 495 // Done.
nuclear@0 496 break;
nuclear@0 497 }
nuclear@0 498 line_buffer_utf8[byte_counter++] = c;
nuclear@0 499 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
nuclear@0 500 {
nuclear@0 501 // End of line. Process the line.
nuclear@0 502 line_buffer_utf8[byte_counter++] = 0; // terminate.
nuclear@0 503
nuclear@0 504 // Decode into UCS.
nuclear@0 505 const char* p = line_buffer_utf8;
nuclear@0 506 uint32_t* q = line_buffer_ucs;
nuclear@0 507 for (;;)
nuclear@0 508 {
nuclear@0 509 uint32_t uc = UTF8Util::DecodeNextChar(&p);
nuclear@0 510 *q++ = uc;
nuclear@0 511
nuclear@0 512 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
nuclear@0 513 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
nuclear@0 514
nuclear@0 515 if (uc == 0) break;
nuclear@0 516 }
nuclear@0 517
nuclear@0 518 // Encode back into UTF-8.
nuclear@0 519 q = line_buffer_ucs;
nuclear@0 520 int index = 0;
nuclear@0 521 for (;;)
nuclear@0 522 {
nuclear@0 523 uint32_t uc = *q++;
nuclear@0 524 OVR_ASSERT(index < LINE_SIZE * 6 - 6);
nuclear@0 525 int last_index = index;
nuclear@0 526 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
nuclear@0 527 OVR_ASSERT(index <= last_index + 6);
nuclear@0 528 if (uc == 0) break;
nuclear@0 529 }
nuclear@0 530
nuclear@0 531 // This can be useful for debugging.
nuclear@0 532 #if 0
nuclear@0 533 // Show the UCS and the re-encoded UTF-8.
nuclear@0 534 log_ucs(line_buffer_ucs);
nuclear@0 535 log_ascii(reencoded_utf8);
nuclear@0 536 #endif // 0
nuclear@0 537
nuclear@0 538 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
nuclear@0 539 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
nuclear@0 540
nuclear@0 541 // Start next line.
nuclear@0 542 byte_counter = 0;
nuclear@0 543 }
nuclear@0 544 }
nuclear@0 545
nuclear@0 546 fclose(fp);
nuclear@0 547 }
nuclear@0 548
nuclear@0 549 return 0;
nuclear@0 550 }
nuclear@0 551
nuclear@0 552
nuclear@0 553 #endif // UTF8_UNIT_TEST
nuclear@0 554
nuclear@0 555 }} // namespace UTF8Util::OVR
nuclear@0 556