oculus1

view libovr/Src/Kernel/OVR_UTF8Util.cpp @ 17:cfe4979ab3eb

ops, minor error in the last commit
author John Tsiombikas <nuclear@member.fsf.org>
date Sat, 21 Sep 2013 07:09:48 +0300
parents e2f9e4603129
children
line source
1 /**************************************************************************
3 Filename : OVR_UTF8Util.cpp
4 Content : UTF8 Unicode character encoding/decoding support
5 Created : September 19, 2012
6 Notes :
7 Notes : Much useful info at "UTF-8 and Unicode FAQ"
8 http://www.cl.cam.ac.uk/~mgk25/unicode.html
10 Copyright : Copyright 2012 Oculus VR, Inc. All Rights reserved.
12 Use of this software is subject to the terms of the Oculus license
13 agreement provided at the time of installation or download, or which
14 otherwise accompanies this software in either electronic or hard copy form.
16 ************************************************************************************/
18 #include "OVR_UTF8Util.h"
20 namespace OVR { namespace UTF8Util {
22 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
23 {
24 const char* p = buf;
25 SPInt length = 0;
27 if (buflen != -1)
28 {
29 while (p - buf < buflen)
30 {
31 // We should be able to have ASStrings with 0 in the middle.
32 UTF8Util::DecodeNextChar_Advance0(&p);
33 length++;
34 }
35 }
36 else
37 {
38 while (UTF8Util::DecodeNextChar_Advance0(&p))
39 length++;
40 }
42 return length;
43 }
45 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
46 {
47 const char* buf = putf8str;
48 UInt32 c = 0;
50 if (length != -1)
51 {
52 while (buf - putf8str < length)
53 {
54 c = UTF8Util::DecodeNextChar_Advance0(&buf);
55 if (index == 0)
56 return c;
57 index--;
58 }
60 return c;
61 }
63 do
64 {
65 c = UTF8Util::DecodeNextChar_Advance0(&buf);
66 index--;
68 if (c == 0)
69 {
70 // We've hit the end of the string; don't go further.
71 OVR_ASSERT(index == 0);
72 return c;
73 }
74 } while (index >= 0);
76 return c;
77 }
79 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
80 {
81 const char* buf = putf8str;
83 if (length != -1)
84 {
85 while ((buf - putf8str) < length && index > 0)
86 {
87 UTF8Util::DecodeNextChar_Advance0(&buf);
88 index--;
89 }
91 return buf-putf8str;
92 }
94 while (index > 0)
95 {
96 UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);
97 index--;
99 if (c == 0)
100 return buf-putf8str;
101 };
103 return buf-putf8str;
104 }
106 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
107 {
108 if (ucs_character <= 0x7F)
109 return 1;
110 else if (ucs_character <= 0x7FF)
111 return 2;
112 else if (ucs_character <= 0xFFFF)
113 return 3;
114 else if (ucs_character <= 0x1FFFFF)
115 return 4;
116 else if (ucs_character <= 0x3FFFFFF)
117 return 5;
118 else if (ucs_character <= 0x7FFFFFFF)
119 return 6;
120 else
121 return 0;
122 }
124 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
125 {
126 UInt32 uc;
127 char c;
129 // Security considerations:
130 //
131 // Changed, this is now only the case for DecodeNextChar:
132 // - If we hit a zero byte, we want to return 0 without stepping
133 // the buffer pointer past the 0. th
134 //
135 // If we hit an "overlong sequence"; i.e. a character encoded
136 // in a longer multibyte string than is necessary, then we
137 // need to discard the character. This is so attackers can't
138 // disguise dangerous characters or character sequences --
139 // there is only one valid encoding for each character.
140 //
141 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
142 // 0xFFFF } then we ignore them; they are not valid in UTF-8.
144 // This isn't actually an invalid character; it's a valid char that
145 // looks like an inverted question mark.
146 #define INVALID_CHAR 0x0FFFD
148 #define FIRST_BYTE(mask, shift) \
149 uc = (c & (mask)) << (shift);
151 #define NEXT_BYTE(shift) \
152 c = **putf8Buffer; \
153 if (c == 0) return 0; /* end of buffer, do not advance */ \
154 if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \
155 (*putf8Buffer)++; \
156 uc |= (c & 0x3F) << shift;
158 c = **putf8Buffer;
159 (*putf8Buffer)++;
160 if (c == 0)
161 return 0; // End of buffer.
163 if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.
165 // Multi-byte sequences.
166 if ((c & 0xE0) == 0xC0)
167 {
168 // Two-byte sequence.
169 FIRST_BYTE(0x1F, 6);
170 NEXT_BYTE(0);
171 if (uc < 0x80) return INVALID_CHAR; // overlong
172 return uc;
173 }
174 else if ((c & 0xF0) == 0xE0)
175 {
176 // Three-byte sequence.
177 FIRST_BYTE(0x0F, 12);
178 NEXT_BYTE(6);
179 NEXT_BYTE(0);
180 if (uc < 0x800) return INVALID_CHAR; // overlong
181 // Not valid ISO 10646, but Flash requires these to work
182 // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)
183 // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;
184 // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646
185 return uc;
186 }
187 else if ((c & 0xF8) == 0xF0)
188 {
189 // Four-byte sequence.
190 FIRST_BYTE(0x07, 18);
191 NEXT_BYTE(12);
192 NEXT_BYTE(6);
193 NEXT_BYTE(0);
194 if (uc < 0x010000) return INVALID_CHAR; // overlong
195 return uc;
196 }
197 else if ((c & 0xFC) == 0xF8)
198 {
199 // Five-byte sequence.
200 FIRST_BYTE(0x03, 24);
201 NEXT_BYTE(18);
202 NEXT_BYTE(12);
203 NEXT_BYTE(6);
204 NEXT_BYTE(0);
205 if (uc < 0x0200000) return INVALID_CHAR; // overlong
206 return uc;
207 }
208 else if ((c & 0xFE) == 0xFC)
209 {
210 // Six-byte sequence.
211 FIRST_BYTE(0x01, 30);
212 NEXT_BYTE(24);
213 NEXT_BYTE(18);
214 NEXT_BYTE(12);
215 NEXT_BYTE(6);
216 NEXT_BYTE(0);
217 if (uc < 0x04000000) return INVALID_CHAR; // overlong
218 return uc;
219 }
220 else
221 {
222 // Invalid.
223 return INVALID_CHAR;
224 }
225 }
228 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
229 {
230 if (ucs_character <= 0x7F)
231 {
232 // Plain single-byte ASCII.
233 pbuffer[(*pindex)++] = (char) ucs_character;
234 }
235 else if (ucs_character <= 0x7FF)
236 {
237 // Two bytes.
238 pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
239 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
240 }
241 else if (ucs_character <= 0xFFFF)
242 {
243 // Three bytes.
244 pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
245 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
246 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
247 }
248 else if (ucs_character <= 0x1FFFFF)
249 {
250 // Four bytes.
251 pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
252 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
253 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
254 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
255 }
256 else if (ucs_character <= 0x3FFFFFF)
257 {
258 // Five bytes.
259 pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
260 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
261 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
262 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
263 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
264 }
265 else if (ucs_character <= 0x7FFFFFFF)
266 {
267 // Six bytes.
268 pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
269 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
270 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
271 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
272 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
273 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
274 }
275 else
276 {
277 // Invalid char; don't encode anything.
278 }
279 }
281 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
282 {
283 SPInt len = 0;
284 if (length != -1)
285 for (int i = 0; i < length; i++)
286 {
287 len += GetEncodeCharSize(pchar[i]);
288 }
289 else
290 for (int i = 0;; i++)
291 {
292 if (pchar[i] == 0)
293 return len;
294 len += GetEncodeCharSize(pchar[i]);
295 }
296 return len;
297 }
299 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
300 {
301 SPInt ofs = 0;
302 if (length != -1)
303 {
304 for (int i = 0; i < length; i++)
305 {
306 EncodeChar(pbuff, &ofs, pchar[i]);
307 }
308 }
309 else
310 {
311 for (int i = 0;; i++)
312 {
313 if (pchar[i] == 0)
314 break;
315 EncodeChar(pbuff, &ofs, pchar[i]);
316 }
317 }
318 pbuff[ofs] = 0;
319 }
321 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
322 {
323 wchar_t *pbegin = pbuff;
324 if (bytesLen == -1)
325 {
326 while (1)
327 {
328 UInt32 ch = DecodeNextChar_Advance0(&putf8str);
329 if (ch == 0)
330 break;
331 else if (ch >= 0xFFFF)
332 ch = 0xFFFD;
333 *pbuff++ = wchar_t(ch);
334 }
335 }
336 else
337 {
338 const char* p = putf8str;
339 while ((p - putf8str) < bytesLen)
340 {
341 UInt32 ch = DecodeNextChar_Advance0(&p);
342 if (ch >= 0xFFFF)
343 ch = 0xFFFD;
344 *pbuff++ = wchar_t(ch);
345 }
346 }
348 *pbuff = 0;
349 return pbuff - pbegin;
350 }
353 #ifdef UTF8_UNIT_TEST
355 // Compile this test case with something like:
356 //
357 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
358 //
359 // or
360 //
361 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
362 //
363 // If possible, try running the test program with the first arg
364 // pointing at the file:
365 //
366 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
367 //
368 // and examine the results by eye to make sure they are acceptable to
369 // you.
372 #include "base/utility.h"
373 #include <stdio.h>
376 bool check_equal(const char* utf8_in, const UInt32* ucs_in)
377 {
378 for (;;)
379 {
380 UInt32 next_ucs = *ucs_in++;
381 UInt32 next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
382 if (next_ucs != next_ucs_from_utf8)
383 {
384 return false;
385 }
386 if (next_ucs == 0)
387 {
388 OVR_ASSERT(next_ucs_from_utf8 == 0);
389 break;
390 }
391 }
393 return true;
394 }
// Test helper: prints a zero-terminated byte string to stdout. Newlines and
// bytes in the range 32..127 are printed as-is; every other byte is printed
// as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* p = line; *p != 0; ++p)
    {
        const unsigned char ch = (unsigned char) *p;
        const bool printable = (ch == '\n') || (ch >= 32 && ch <= 127);

        if (printable)
            printf("%c", ch);
        else
            printf("<0x%02X>", (int) ch);  // Non-printable as plain ASCII.
    }
}
421 void log_ucs(const UInt32* line)
422 {
423 for (;;)
424 {
425 UInt32 uc = *line++;
426 if (uc == 0)
427 {
428 // End of line.
429 return;
430 }
431 else if (uc != '\n'
432 && (uc < 32 || uc > 127))
433 {
434 // Non-printable as plain ASCII.
435 printf("<U-%04X>", uc);
436 }
437 else
438 {
439 printf("%c", (char) uc);
440 }
441 }
442 }
445 // Simple canned test.
446 int main(int argc, const char* argv[])
447 {
448 {
449 const char* test8 = "Ignacio CastaƱo";
450 const UInt32 test32[] =
451 {
452 0x49, 0x67, 0x6E, 0x61, 0x63,
453 0x69, 0x6F, 0x20, 0x43, 0x61,
454 0x73, 0x74, 0x61, 0xF1, 0x6F,
455 0x00
456 };
458 OVR_ASSERT(check_equal(test8, test32));
459 }
461 // If user passed an arg, try reading the file as UTF-8 encoded text.
462 if (argc > 1)
463 {
464 const char* filename = argv[1];
465 FILE* fp = fopen(filename, "rb");
466 if (fp == NULL)
467 {
468 printf("Can't open file '%s'\n", filename);
469 return 1;
470 }
472 // Read lines from the file, encode/decode them, and highlight discrepancies.
473 const int LINE_SIZE = 200; // max line size
474 char line_buffer_utf8[LINE_SIZE];
475 char reencoded_utf8[6 * LINE_SIZE];
476 UInt32 line_buffer_ucs[LINE_SIZE];
478 int byte_counter = 0;
479 for (;;)
480 {
481 int c = fgetc(fp);
482 if (c == EOF)
483 {
484 // Done.
485 break;
486 }
487 line_buffer_utf8[byte_counter++] = c;
488 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
489 {
490 // End of line. Process the line.
491 line_buffer_utf8[byte_counter++] = 0; // terminate.
493 // Decode into UCS.
494 const char* p = line_buffer_utf8;
495 UInt32* q = line_buffer_ucs;
496 for (;;)
497 {
498 UInt32 uc = UTF8Util::DecodeNextChar(&p);
499 *q++ = uc;
501 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
502 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
504 if (uc == 0) break;
505 }
507 // Encode back into UTF-8.
508 q = line_buffer_ucs;
509 int index = 0;
510 for (;;)
511 {
512 UInt32 uc = *q++;
513 OVR_ASSERT(index < LINE_SIZE * 6 - 6);
514 int last_index = index;
515 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
516 OVR_ASSERT(index <= last_index + 6);
517 if (uc == 0) break;
518 }
520 // This can be useful for debugging.
521 #if 0
522 // Show the UCS and the re-encoded UTF-8.
523 log_ucs(line_buffer_ucs);
524 log_ascii(reencoded_utf8);
525 #endif // 0
527 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
528 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
530 // Start next line.
531 byte_counter = 0;
532 }
533 }
535 fclose(fp);
536 }
538 return 0;
539 }
542 #endif // UTF8_UNIT_TEST
544 }} // namespace UTF8Util::OVR