oculus1

annotate libovr/Src/Kernel/OVR_UTF8Util.cpp @ 17:cfe4979ab3eb

ops, minor error in the last commit
author John Tsiombikas <nuclear@member.fsf.org>
date Sat, 21 Sep 2013 07:09:48 +0300
parents e2f9e4603129
children
rev   line source
nuclear@3 1 /**************************************************************************
nuclear@3 2
nuclear@3 3 Filename : OVR_UTF8Util.cpp
nuclear@3 4 Content : UTF8 Unicode character encoding/decoding support
nuclear@3 5 Created : September 19, 2012
nuclear@3 6 Notes :
nuclear@3 7 Notes : Much useful info at "UTF-8 and Unicode FAQ"
nuclear@3 8 http://www.cl.cam.ac.uk/~mgk25/unicode.html
nuclear@3 9
nuclear@3 10 Copyright : Copyright 2012 Oculus VR, Inc. All Rights reserved.
nuclear@3 11
nuclear@3 12 Use of this software is subject to the terms of the Oculus license
nuclear@3 13 agreement provided at the time of installation or download, or which
nuclear@3 14 otherwise accompanies this software in either electronic or hard copy form.
nuclear@3 15
nuclear@3 16 ************************************************************************************/
nuclear@3 17
nuclear@3 18 #include "OVR_UTF8Util.h"
nuclear@3 19
nuclear@3 20 namespace OVR { namespace UTF8Util {
nuclear@3 21
nuclear@3 22 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
nuclear@3 23 {
nuclear@3 24 const char* p = buf;
nuclear@3 25 SPInt length = 0;
nuclear@3 26
nuclear@3 27 if (buflen != -1)
nuclear@3 28 {
nuclear@3 29 while (p - buf < buflen)
nuclear@3 30 {
nuclear@3 31 // We should be able to have ASStrings with 0 in the middle.
nuclear@3 32 UTF8Util::DecodeNextChar_Advance0(&p);
nuclear@3 33 length++;
nuclear@3 34 }
nuclear@3 35 }
nuclear@3 36 else
nuclear@3 37 {
nuclear@3 38 while (UTF8Util::DecodeNextChar_Advance0(&p))
nuclear@3 39 length++;
nuclear@3 40 }
nuclear@3 41
nuclear@3 42 return length;
nuclear@3 43 }
nuclear@3 44
nuclear@3 45 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
nuclear@3 46 {
nuclear@3 47 const char* buf = putf8str;
nuclear@3 48 UInt32 c = 0;
nuclear@3 49
nuclear@3 50 if (length != -1)
nuclear@3 51 {
nuclear@3 52 while (buf - putf8str < length)
nuclear@3 53 {
nuclear@3 54 c = UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@3 55 if (index == 0)
nuclear@3 56 return c;
nuclear@3 57 index--;
nuclear@3 58 }
nuclear@3 59
nuclear@3 60 return c;
nuclear@3 61 }
nuclear@3 62
nuclear@3 63 do
nuclear@3 64 {
nuclear@3 65 c = UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@3 66 index--;
nuclear@3 67
nuclear@3 68 if (c == 0)
nuclear@3 69 {
nuclear@3 70 // We've hit the end of the string; don't go further.
nuclear@3 71 OVR_ASSERT(index == 0);
nuclear@3 72 return c;
nuclear@3 73 }
nuclear@3 74 } while (index >= 0);
nuclear@3 75
nuclear@3 76 return c;
nuclear@3 77 }
nuclear@3 78
nuclear@3 79 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
nuclear@3 80 {
nuclear@3 81 const char* buf = putf8str;
nuclear@3 82
nuclear@3 83 if (length != -1)
nuclear@3 84 {
nuclear@3 85 while ((buf - putf8str) < length && index > 0)
nuclear@3 86 {
nuclear@3 87 UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@3 88 index--;
nuclear@3 89 }
nuclear@3 90
nuclear@3 91 return buf-putf8str;
nuclear@3 92 }
nuclear@3 93
nuclear@3 94 while (index > 0)
nuclear@3 95 {
nuclear@3 96 UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);
nuclear@3 97 index--;
nuclear@3 98
nuclear@3 99 if (c == 0)
nuclear@3 100 return buf-putf8str;
nuclear@3 101 };
nuclear@3 102
nuclear@3 103 return buf-putf8str;
nuclear@3 104 }
nuclear@3 105
nuclear@3 106 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
nuclear@3 107 {
nuclear@3 108 if (ucs_character <= 0x7F)
nuclear@3 109 return 1;
nuclear@3 110 else if (ucs_character <= 0x7FF)
nuclear@3 111 return 2;
nuclear@3 112 else if (ucs_character <= 0xFFFF)
nuclear@3 113 return 3;
nuclear@3 114 else if (ucs_character <= 0x1FFFFF)
nuclear@3 115 return 4;
nuclear@3 116 else if (ucs_character <= 0x3FFFFFF)
nuclear@3 117 return 5;
nuclear@3 118 else if (ucs_character <= 0x7FFFFFFF)
nuclear@3 119 return 6;
nuclear@3 120 else
nuclear@3 121 return 0;
nuclear@3 122 }
nuclear@3 123
nuclear@3 124 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
nuclear@3 125 {
nuclear@3 126 UInt32 uc;
nuclear@3 127 char c;
nuclear@3 128
nuclear@3 129 // Security considerations:
nuclear@3 130 //
nuclear@3 131 // Changed, this is now only the case for DecodeNextChar:
nuclear@3 132 // - If we hit a zero byte, we want to return 0 without stepping
nuclear@3 133 // the buffer pointer past the 0. th
nuclear@3 134 //
nuclear@3 135 // If we hit an "overlong sequence"; i.e. a character encoded
nuclear@3 136 // in a longer multibyte string than is necessary, then we
nuclear@3 137 // need to discard the character. This is so attackers can't
nuclear@3 138 // disguise dangerous characters or character sequences --
nuclear@3 139 // there is only one valid encoding for each character.
nuclear@3 140 //
nuclear@3 141 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
nuclear@3 142 // 0xFFFF } then we ignore them; they are not valid in UTF-8.
nuclear@3 143
nuclear@3 144 // This isn't actually an invalid character; it's a valid char that
nuclear@3 145 // looks like an inverted question mark.
nuclear@3 146 #define INVALID_CHAR 0x0FFFD
nuclear@3 147
nuclear@3 148 #define FIRST_BYTE(mask, shift) \
nuclear@3 149 uc = (c & (mask)) << (shift);
nuclear@3 150
nuclear@3 151 #define NEXT_BYTE(shift) \
nuclear@3 152 c = **putf8Buffer; \
nuclear@3 153 if (c == 0) return 0; /* end of buffer, do not advance */ \
nuclear@3 154 if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \
nuclear@3 155 (*putf8Buffer)++; \
nuclear@3 156 uc |= (c & 0x3F) << shift;
nuclear@3 157
nuclear@3 158 c = **putf8Buffer;
nuclear@3 159 (*putf8Buffer)++;
nuclear@3 160 if (c == 0)
nuclear@3 161 return 0; // End of buffer.
nuclear@3 162
nuclear@3 163 if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.
nuclear@3 164
nuclear@3 165 // Multi-byte sequences.
nuclear@3 166 if ((c & 0xE0) == 0xC0)
nuclear@3 167 {
nuclear@3 168 // Two-byte sequence.
nuclear@3 169 FIRST_BYTE(0x1F, 6);
nuclear@3 170 NEXT_BYTE(0);
nuclear@3 171 if (uc < 0x80) return INVALID_CHAR; // overlong
nuclear@3 172 return uc;
nuclear@3 173 }
nuclear@3 174 else if ((c & 0xF0) == 0xE0)
nuclear@3 175 {
nuclear@3 176 // Three-byte sequence.
nuclear@3 177 FIRST_BYTE(0x0F, 12);
nuclear@3 178 NEXT_BYTE(6);
nuclear@3 179 NEXT_BYTE(0);
nuclear@3 180 if (uc < 0x800) return INVALID_CHAR; // overlong
nuclear@3 181 // Not valid ISO 10646, but Flash requires these to work
nuclear@3 182 // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)
nuclear@3 183 // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;
nuclear@3 184 // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646
nuclear@3 185 return uc;
nuclear@3 186 }
nuclear@3 187 else if ((c & 0xF8) == 0xF0)
nuclear@3 188 {
nuclear@3 189 // Four-byte sequence.
nuclear@3 190 FIRST_BYTE(0x07, 18);
nuclear@3 191 NEXT_BYTE(12);
nuclear@3 192 NEXT_BYTE(6);
nuclear@3 193 NEXT_BYTE(0);
nuclear@3 194 if (uc < 0x010000) return INVALID_CHAR; // overlong
nuclear@3 195 return uc;
nuclear@3 196 }
nuclear@3 197 else if ((c & 0xFC) == 0xF8)
nuclear@3 198 {
nuclear@3 199 // Five-byte sequence.
nuclear@3 200 FIRST_BYTE(0x03, 24);
nuclear@3 201 NEXT_BYTE(18);
nuclear@3 202 NEXT_BYTE(12);
nuclear@3 203 NEXT_BYTE(6);
nuclear@3 204 NEXT_BYTE(0);
nuclear@3 205 if (uc < 0x0200000) return INVALID_CHAR; // overlong
nuclear@3 206 return uc;
nuclear@3 207 }
nuclear@3 208 else if ((c & 0xFE) == 0xFC)
nuclear@3 209 {
nuclear@3 210 // Six-byte sequence.
nuclear@3 211 FIRST_BYTE(0x01, 30);
nuclear@3 212 NEXT_BYTE(24);
nuclear@3 213 NEXT_BYTE(18);
nuclear@3 214 NEXT_BYTE(12);
nuclear@3 215 NEXT_BYTE(6);
nuclear@3 216 NEXT_BYTE(0);
nuclear@3 217 if (uc < 0x04000000) return INVALID_CHAR; // overlong
nuclear@3 218 return uc;
nuclear@3 219 }
nuclear@3 220 else
nuclear@3 221 {
nuclear@3 222 // Invalid.
nuclear@3 223 return INVALID_CHAR;
nuclear@3 224 }
nuclear@3 225 }
nuclear@3 226
nuclear@3 227
nuclear@3 228 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
nuclear@3 229 {
nuclear@3 230 if (ucs_character <= 0x7F)
nuclear@3 231 {
nuclear@3 232 // Plain single-byte ASCII.
nuclear@3 233 pbuffer[(*pindex)++] = (char) ucs_character;
nuclear@3 234 }
nuclear@3 235 else if (ucs_character <= 0x7FF)
nuclear@3 236 {
nuclear@3 237 // Two bytes.
nuclear@3 238 pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
nuclear@3 239 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
nuclear@3 240 }
nuclear@3 241 else if (ucs_character <= 0xFFFF)
nuclear@3 242 {
nuclear@3 243 // Three bytes.
nuclear@3 244 pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
nuclear@3 245 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
nuclear@3 246 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
nuclear@3 247 }
nuclear@3 248 else if (ucs_character <= 0x1FFFFF)
nuclear@3 249 {
nuclear@3 250 // Four bytes.
nuclear@3 251 pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
nuclear@3 252 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
nuclear@3 253 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
nuclear@3 254 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
nuclear@3 255 }
nuclear@3 256 else if (ucs_character <= 0x3FFFFFF)
nuclear@3 257 {
nuclear@3 258 // Five bytes.
nuclear@3 259 pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
nuclear@3 260 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
nuclear@3 261 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
nuclear@3 262 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
nuclear@3 263 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
nuclear@3 264 }
nuclear@3 265 else if (ucs_character <= 0x7FFFFFFF)
nuclear@3 266 {
nuclear@3 267 // Six bytes.
nuclear@3 268 pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
nuclear@3 269 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
nuclear@3 270 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
nuclear@3 271 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
nuclear@3 272 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
nuclear@3 273 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
nuclear@3 274 }
nuclear@3 275 else
nuclear@3 276 {
nuclear@3 277 // Invalid char; don't encode anything.
nuclear@3 278 }
nuclear@3 279 }
nuclear@3 280
nuclear@3 281 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
nuclear@3 282 {
nuclear@3 283 SPInt len = 0;
nuclear@3 284 if (length != -1)
nuclear@3 285 for (int i = 0; i < length; i++)
nuclear@3 286 {
nuclear@3 287 len += GetEncodeCharSize(pchar[i]);
nuclear@3 288 }
nuclear@3 289 else
nuclear@3 290 for (int i = 0;; i++)
nuclear@3 291 {
nuclear@3 292 if (pchar[i] == 0)
nuclear@3 293 return len;
nuclear@3 294 len += GetEncodeCharSize(pchar[i]);
nuclear@3 295 }
nuclear@3 296 return len;
nuclear@3 297 }
nuclear@3 298
nuclear@3 299 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
nuclear@3 300 {
nuclear@3 301 SPInt ofs = 0;
nuclear@3 302 if (length != -1)
nuclear@3 303 {
nuclear@3 304 for (int i = 0; i < length; i++)
nuclear@3 305 {
nuclear@3 306 EncodeChar(pbuff, &ofs, pchar[i]);
nuclear@3 307 }
nuclear@3 308 }
nuclear@3 309 else
nuclear@3 310 {
nuclear@3 311 for (int i = 0;; i++)
nuclear@3 312 {
nuclear@3 313 if (pchar[i] == 0)
nuclear@3 314 break;
nuclear@3 315 EncodeChar(pbuff, &ofs, pchar[i]);
nuclear@3 316 }
nuclear@3 317 }
nuclear@3 318 pbuff[ofs] = 0;
nuclear@3 319 }
nuclear@3 320
nuclear@3 321 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
nuclear@3 322 {
nuclear@3 323 wchar_t *pbegin = pbuff;
nuclear@3 324 if (bytesLen == -1)
nuclear@3 325 {
nuclear@3 326 while (1)
nuclear@3 327 {
nuclear@3 328 UInt32 ch = DecodeNextChar_Advance0(&putf8str);
nuclear@3 329 if (ch == 0)
nuclear@3 330 break;
nuclear@3 331 else if (ch >= 0xFFFF)
nuclear@3 332 ch = 0xFFFD;
nuclear@3 333 *pbuff++ = wchar_t(ch);
nuclear@3 334 }
nuclear@3 335 }
nuclear@3 336 else
nuclear@3 337 {
nuclear@3 338 const char* p = putf8str;
nuclear@3 339 while ((p - putf8str) < bytesLen)
nuclear@3 340 {
nuclear@3 341 UInt32 ch = DecodeNextChar_Advance0(&p);
nuclear@3 342 if (ch >= 0xFFFF)
nuclear@3 343 ch = 0xFFFD;
nuclear@3 344 *pbuff++ = wchar_t(ch);
nuclear@3 345 }
nuclear@3 346 }
nuclear@3 347
nuclear@3 348 *pbuff = 0;
nuclear@3 349 return pbuff - pbegin;
nuclear@3 350 }
nuclear@3 351
nuclear@3 352
nuclear@3 353 #ifdef UTF8_UNIT_TEST
nuclear@3 354
nuclear@3 355 // Compile this test case with something like:
nuclear@3 356 //
nuclear@3 357 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
nuclear@3 358 //
nuclear@3 359 // or
nuclear@3 360 //
nuclear@3 361 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
nuclear@3 362 //
nuclear@3 363 // If possible, try running the test program with the first arg
nuclear@3 364 // pointing at the file:
nuclear@3 365 //
nuclear@3 366 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
nuclear@3 367 //
nuclear@3 368 // and examine the results by eye to make sure they are acceptable to
nuclear@3 369 // you.
nuclear@3 370
nuclear@3 371
nuclear@3 372 #include "base/utility.h"
nuclear@3 373 #include <stdio.h>
nuclear@3 374
nuclear@3 375
nuclear@3 376 bool check_equal(const char* utf8_in, const UInt32* ucs_in)
nuclear@3 377 {
nuclear@3 378 for (;;)
nuclear@3 379 {
nuclear@3 380 UInt32 next_ucs = *ucs_in++;
nuclear@3 381 UInt32 next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
nuclear@3 382 if (next_ucs != next_ucs_from_utf8)
nuclear@3 383 {
nuclear@3 384 return false;
nuclear@3 385 }
nuclear@3 386 if (next_ucs == 0)
nuclear@3 387 {
nuclear@3 388 OVR_ASSERT(next_ucs_from_utf8 == 0);
nuclear@3 389 break;
nuclear@3 390 }
nuclear@3 391 }
nuclear@3 392
nuclear@3 393 return true;
nuclear@3 394 }
nuclear@3 395
nuclear@3 396
// Test helper: prints a zero-terminated byte string to stdout. Newlines and
// bytes in the range 32..127 are printed as-is; every other byte is shown
// as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* cursor = line; *cursor != 0; ++cursor)
    {
        const unsigned char byte = (unsigned char) *cursor;
        const bool passThrough = (byte == '\n') || (byte >= 32 && byte <= 127);

        if (passThrough)
            printf("%c", byte);
        else
            printf("<0x%02X>", (int) byte);   // Non-printable as plain ASCII.
    }
}
nuclear@3 419
nuclear@3 420
nuclear@3 421 void log_ucs(const UInt32* line)
nuclear@3 422 {
nuclear@3 423 for (;;)
nuclear@3 424 {
nuclear@3 425 UInt32 uc = *line++;
nuclear@3 426 if (uc == 0)
nuclear@3 427 {
nuclear@3 428 // End of line.
nuclear@3 429 return;
nuclear@3 430 }
nuclear@3 431 else if (uc != '\n'
nuclear@3 432 && (uc < 32 || uc > 127))
nuclear@3 433 {
nuclear@3 434 // Non-printable as plain ASCII.
nuclear@3 435 printf("<U-%04X>", uc);
nuclear@3 436 }
nuclear@3 437 else
nuclear@3 438 {
nuclear@3 439 printf("%c", (char) uc);
nuclear@3 440 }
nuclear@3 441 }
nuclear@3 442 }
nuclear@3 443
nuclear@3 444
nuclear@3 445 // Simple canned test.
nuclear@3 446 int main(int argc, const char* argv[])
nuclear@3 447 {
nuclear@3 448 {
nuclear@3 449 const char* test8 = "Ignacio CastaƱo";
nuclear@3 450 const UInt32 test32[] =
nuclear@3 451 {
nuclear@3 452 0x49, 0x67, 0x6E, 0x61, 0x63,
nuclear@3 453 0x69, 0x6F, 0x20, 0x43, 0x61,
nuclear@3 454 0x73, 0x74, 0x61, 0xF1, 0x6F,
nuclear@3 455 0x00
nuclear@3 456 };
nuclear@3 457
nuclear@3 458 OVR_ASSERT(check_equal(test8, test32));
nuclear@3 459 }
nuclear@3 460
nuclear@3 461 // If user passed an arg, try reading the file as UTF-8 encoded text.
nuclear@3 462 if (argc > 1)
nuclear@3 463 {
nuclear@3 464 const char* filename = argv[1];
nuclear@3 465 FILE* fp = fopen(filename, "rb");
nuclear@3 466 if (fp == NULL)
nuclear@3 467 {
nuclear@3 468 printf("Can't open file '%s'\n", filename);
nuclear@3 469 return 1;
nuclear@3 470 }
nuclear@3 471
nuclear@3 472 // Read lines from the file, encode/decode them, and highlight discrepancies.
nuclear@3 473 const int LINE_SIZE = 200; // max line size
nuclear@3 474 char line_buffer_utf8[LINE_SIZE];
nuclear@3 475 char reencoded_utf8[6 * LINE_SIZE];
nuclear@3 476 UInt32 line_buffer_ucs[LINE_SIZE];
nuclear@3 477
nuclear@3 478 int byte_counter = 0;
nuclear@3 479 for (;;)
nuclear@3 480 {
nuclear@3 481 int c = fgetc(fp);
nuclear@3 482 if (c == EOF)
nuclear@3 483 {
nuclear@3 484 // Done.
nuclear@3 485 break;
nuclear@3 486 }
nuclear@3 487 line_buffer_utf8[byte_counter++] = c;
nuclear@3 488 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
nuclear@3 489 {
nuclear@3 490 // End of line. Process the line.
nuclear@3 491 line_buffer_utf8[byte_counter++] = 0; // terminate.
nuclear@3 492
nuclear@3 493 // Decode into UCS.
nuclear@3 494 const char* p = line_buffer_utf8;
nuclear@3 495 UInt32* q = line_buffer_ucs;
nuclear@3 496 for (;;)
nuclear@3 497 {
nuclear@3 498 UInt32 uc = UTF8Util::DecodeNextChar(&p);
nuclear@3 499 *q++ = uc;
nuclear@3 500
nuclear@3 501 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
nuclear@3 502 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
nuclear@3 503
nuclear@3 504 if (uc == 0) break;
nuclear@3 505 }
nuclear@3 506
nuclear@3 507 // Encode back into UTF-8.
nuclear@3 508 q = line_buffer_ucs;
nuclear@3 509 int index = 0;
nuclear@3 510 for (;;)
nuclear@3 511 {
nuclear@3 512 UInt32 uc = *q++;
nuclear@3 513 OVR_ASSERT(index < LINE_SIZE * 6 - 6);
nuclear@3 514 int last_index = index;
nuclear@3 515 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
nuclear@3 516 OVR_ASSERT(index <= last_index + 6);
nuclear@3 517 if (uc == 0) break;
nuclear@3 518 }
nuclear@3 519
nuclear@3 520 // This can be useful for debugging.
nuclear@3 521 #if 0
nuclear@3 522 // Show the UCS and the re-encoded UTF-8.
nuclear@3 523 log_ucs(line_buffer_ucs);
nuclear@3 524 log_ascii(reencoded_utf8);
nuclear@3 525 #endif // 0
nuclear@3 526
nuclear@3 527 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
nuclear@3 528 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
nuclear@3 529
nuclear@3 530 // Start next line.
nuclear@3 531 byte_counter = 0;
nuclear@3 532 }
nuclear@3 533 }
nuclear@3 534
nuclear@3 535 fclose(fp);
nuclear@3 536 }
nuclear@3 537
nuclear@3 538 return 0;
nuclear@3 539 }
nuclear@3 540
nuclear@3 541
nuclear@3 542 #endif // UTF8_UNIT_TEST
nuclear@3 543
nuclear@3 544 }} // namespace UTF8Util::OVR
nuclear@3 545