ovr_sdk

view LibOVR/Src/Kernel/OVR_UTF8Util.cpp @ 0:1b39a1b46319

initial 0.4.4
author John Tsiombikas <nuclear@member.fsf.org>
date Wed, 14 Jan 2015 06:51:16 +0200
parents
children
line source
1 /**************************************************************************
3 Filename : OVR_UTF8Util.cpp
4 Content : UTF8 Unicode character encoding/decoding support
5 Created : September 19, 2012
6 Notes :
7 Notes : Much useful info at "UTF-8 and Unicode FAQ"
8 http://www.cl.cam.ac.uk/~mgk25/unicode.html
10 Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved.
12 Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License");
13 you may not use the Oculus VR Rift SDK except in compliance with the License,
14 which is provided at the time of installation or download, or which
15 otherwise accompanies this software in either electronic or hard copy form.
17 You may obtain a copy of the License at
19 http://www.oculusvr.com/licenses/LICENSE-3.2
21 Unless required by applicable law or agreed to in writing, the Oculus VR SDK
22 distributed under the License is distributed on an "AS IS" BASIS,
23 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 See the License for the specific language governing permissions and
25 limitations under the License.
27 ************************************************************************************/
29 #include "OVR_UTF8Util.h"
31 namespace OVR { namespace UTF8Util {
33 intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen)
34 {
35 const char* p = buf;
36 intptr_t length = 0;
38 if (buflen != -1)
39 {
40 while (p - buf < buflen)
41 {
42 // We should be able to have ASStrings with 0 in the middle.
43 UTF8Util::DecodeNextChar_Advance0(&p);
44 length++;
45 }
46 }
47 else
48 {
49 while (UTF8Util::DecodeNextChar_Advance0(&p))
50 length++;
51 }
53 return length;
54 }
56 uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length)
57 {
58 const char* buf = putf8str;
59 uint32_t c = 0;
61 if (length != -1)
62 {
63 while (buf - putf8str < length)
64 {
65 c = UTF8Util::DecodeNextChar_Advance0(&buf);
66 if (index == 0)
67 return c;
68 index--;
69 }
71 return c;
72 }
74 do
75 {
76 c = UTF8Util::DecodeNextChar_Advance0(&buf);
77 index--;
79 if (c == 0)
80 {
81 // We've hit the end of the string; don't go further.
82 OVR_ASSERT(index == 0);
83 return c;
84 }
85 } while (index >= 0);
87 return c;
88 }
90 intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length)
91 {
92 const char* buf = putf8str;
94 if (length != -1)
95 {
96 while ((buf - putf8str) < length && index > 0)
97 {
98 UTF8Util::DecodeNextChar_Advance0(&buf);
99 index--;
100 }
102 return buf-putf8str;
103 }
105 while (index > 0)
106 {
107 uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf);
108 index--;
110 if (c == 0)
111 return buf-putf8str;
112 };
114 return buf-putf8str;
115 }
// Returns how many bytes are needed to encode ucs_character as UTF-8 (1 to 6), or 0
// when the value is above 0x7FFFFFFF and therefore cannot be encoded.
int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character)
{
    // maxForSize[n - 1] is the largest value that still fits in n encoded bytes.
    static const uint32_t maxForSize[] =
    {
        0x7F, 0x7FF, 0xFFFF, 0x1FFFFF, 0x3FFFFFF, 0x7FFFFFFF
    };
    const int sizeCount = (int)(sizeof(maxForSize) / sizeof(maxForSize[0]));

    for (int size = 1; size <= sizeCount; ++size)
    {
        if (ucs_character <= maxForSize[size - 1])
            return size;
    }
    return 0;
}
// Decodes one UTF-8 character starting at *putf8Buffer and advances *putf8Buffer past
// the bytes that were consumed. Returns the decoded code point.
//
// Pointer advancement:
//  - The leading byte is always consumed, including a leading 0 byte (which yields 0).
//    Per the original notes, not stepping past a 0 is only the behaviour of
//    DecodeNextChar, not of this function.
//  - A 0 byte found where a continuation byte is expected ends decoding with a return
//    value of 0 and is NOT consumed.
//  - A byte that is not a valid continuation byte yields U+FFFD and is NOT consumed, so
//    the next call starts decoding at that byte.
//
// Security considerations:
//  - An "overlong sequence", i.e. a character encoded in more bytes than necessary, is
//    discarded and reported as U+FFFD. This is so attackers can't disguise dangerous
//    characters or character sequences -- there is only one valid encoding for each
//    character.
//  - Surrogates { 0xD800 .. 0xDFFF } and { 0xFFFE, 0xFFFF } are not valid ISO 10646, but
//    they are deliberately passed through because Flash requires them to work (see AS3
//    test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)).
//  - 5- and 6-byte sequences are accepted, so values up to 0x7FFFFFFF can be returned.
uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
{
    // U+FFFD is returned for malformed input. It is not itself an invalid character;
    // it is the valid replacement character.
    static const uint32_t InvalidChar = 0x0FFFD;

    // Work on unsigned bytes so the bit tests do not depend on whether plain char
    // is signed on this platform.
    const unsigned char lead = (unsigned char) **putf8Buffer;
    (*putf8Buffer)++;

    if (lead == 0)
        return 0;                   // End of buffer.
    if ((lead & 0x80) == 0)
        return (uint32_t) lead;     // Conventional 7-bit ASCII.

    // Classify the lead byte: payload bits it carries, number of continuation bytes
    // that must follow, and the smallest value that legitimately needs this length.
    uint32_t uc;
    uint32_t minValue;
    int      trailCount;

    if ((lead & 0xE0) == 0xC0)
    {
        uc = lead & 0x1F;  trailCount = 1;  minValue = 0x80;        // Two-byte sequence.
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        uc = lead & 0x0F;  trailCount = 2;  minValue = 0x800;       // Three-byte sequence.
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        uc = lead & 0x07;  trailCount = 3;  minValue = 0x010000;    // Four-byte sequence.
    }
    else if ((lead & 0xFC) == 0xF8)
    {
        uc = lead & 0x03;  trailCount = 4;  minValue = 0x0200000;   // Five-byte sequence.
    }
    else if ((lead & 0xFE) == 0xFC)
    {
        uc = lead & 0x01;  trailCount = 5;  minValue = 0x04000000;  // Six-byte sequence.
    }
    else
    {
        // Stray continuation byte, or 0xFE / 0xFF: never a valid lead byte.
        return InvalidChar;
    }

    for (int i = 0; i < trailCount; i++)
    {
        const unsigned char next = (unsigned char) **putf8Buffer;
        if (next == 0)
            return 0;               // End of buffer, do not advance.
        if ((next & 0xC0) != 0x80)
            return InvalidChar;     // Not a continuation byte; leave it for the next call.
        (*putf8Buffer)++;
        uc = (uc << 6) | (uint32_t)(next & 0x3F);
    }

    if (uc < minValue)
        return InvalidChar;         // Overlong encoding.
    return uc;
}
// Appends the UTF-8 encoding of ucs_character to pbuffer, starting at byte offset
// *pindex, and advances *pindex by the number of bytes written (1 to 6).
// Values above 0x7FFFFFFF are not encodable: nothing is written and *pindex is left
// untouched. No bounds checking is performed on pbuffer.
void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character)
{
    if (ucs_character <= 0x7F)
    {
        // Plain single-byte ASCII.
        pbuffer[(*pindex)++] = (char) ucs_character;
        return;
    }

    // Pick the lead-byte marker and the number of continuation bytes for this range.
    uint32_t leadMarker;
    int      trailCount;

    if (ucs_character <= 0x7FF)
    {
        leadMarker = 0xC0;  trailCount = 1;     // Two bytes.
    }
    else if (ucs_character <= 0xFFFF)
    {
        leadMarker = 0xE0;  trailCount = 2;     // Three bytes.
    }
    else if (ucs_character <= 0x1FFFFF)
    {
        leadMarker = 0xF0;  trailCount = 3;     // Four bytes.
    }
    else if (ucs_character <= 0x3FFFFFF)
    {
        leadMarker = 0xF8;  trailCount = 4;     // Five bytes.
    }
    else if (ucs_character <= 0x7FFFFFFF)
    {
        leadMarker = 0xFC;  trailCount = 5;     // Six bytes.
    }
    else
    {
        // Invalid char; don't encode anything.
        return;
    }

    // Lead byte carries the topmost payload bits; every continuation byte carries
    // the next six bits, most significant group first.
    int shift = 6 * trailCount;
    pbuffer[(*pindex)++] = (char)(leadMarker | (ucs_character >> shift));
    while (shift > 0)
    {
        shift -= 6;
        pbuffer[(*pindex)++] = (char)(0x80 | ((ucs_character >> shift) & 0x3F));
    }
}
292 intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length)
293 {
294 intptr_t len = 0;
295 if (length != -1)
296 for (int i = 0; i < length; i++)
297 {
298 len += GetEncodeCharSize(pchar[i]);
299 }
300 else
301 for (int i = 0;; i++)
302 {
303 if (pchar[i] == 0)
304 return len;
305 len += GetEncodeCharSize(pchar[i]);
306 }
307 return len;
308 }
310 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length)
311 {
312 intptr_t ofs = 0;
313 if (length != -1)
314 {
315 for (int i = 0; i < length; i++)
316 {
317 EncodeChar(pbuff, &ofs, pchar[i]);
318 }
319 }
320 else
321 {
322 for (int i = 0;; i++)
323 {
324 if (pchar[i] == 0)
325 break;
326 EncodeChar(pbuff, &ofs, pchar[i]);
327 }
328 }
329 pbuff[ofs] = 0;
330 }
332 size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen)
333 {
334 wchar_t *pbegin = pbuff;
335 if (bytesLen == -1)
336 {
337 while (1)
338 {
339 uint32_t ch = DecodeNextChar_Advance0(&putf8str);
340 if (ch == 0)
341 break;
342 else if (ch >= 0xFFFF)
343 ch = 0xFFFD;
344 *pbuff++ = wchar_t(ch);
345 }
346 }
347 else
348 {
349 const char* p = putf8str;
350 while ((p - putf8str) < bytesLen)
351 {
352 uint32_t ch = DecodeNextChar_Advance0(&p);
353 if (ch >= 0xFFFF)
354 ch = 0xFFFD;
355 *pbuff++ = wchar_t(ch);
356 }
357 }
359 *pbuff = 0;
360 return pbuff - pbegin;
361 }
364 #ifdef UTF8_UNIT_TEST
366 // Compile this test case with something like:
367 //
368 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
369 //
370 // or
371 //
372 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
373 //
374 // If possible, try running the test program with the first arg
375 // pointing at the file:
376 //
377 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
378 //
379 // and examine the results by eye to make sure they are acceptable to
380 // you.
383 #include "base/utility.h"
384 #include <stdio.h>
387 bool check_equal(const char* utf8_in, const uint32_t* ucs_in)
388 {
389 for (;;)
390 {
391 uint32_t next_ucs = *ucs_in++;
392 uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
393 if (next_ucs != next_ucs_from_utf8)
394 {
395 return false;
396 }
397 if (next_ucs == 0)
398 {
399 OVR_ASSERT(next_ucs_from_utf8 == 0);
400 break;
401 }
402 }
404 return true;
405 }
// Test helper: prints a null-terminated byte string to stdout. Newlines and bytes in
// the range 32..127 are printed as-is; every other byte is shown as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* cursor = line; *cursor != 0; ++cursor)
    {
        const unsigned char ch = (unsigned char) *cursor;
        const bool printable = (ch == '\n') || (ch >= 32 && ch <= 127);

        if (printable)
        {
            printf("%c", ch);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<0x%02X>", (int) ch);
        }
    }
}
// Test helper: prints a null-terminated sequence of code points to stdout. Newlines
// and values in the range 32..127 are printed as characters; everything else is shown
// as <U-XXXX>.
void log_ucs(const uint32_t* line)
{
    for (const uint32_t* cursor = line; *cursor != 0; ++cursor)
    {
        const uint32_t code = *cursor;
        const bool printable = (code == '\n') || (code >= 32 && code <= 127);

        if (printable)
        {
            printf("%c", (char) code);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<U-%04X>", code);
        }
    }
}
456 // Simple canned test.
457 int main(int argc, const char* argv[])
458 {
459 {
460 const char* test8 = "Ignacio CastaƱo";
461 const uint32_t test32[] =
462 {
463 0x49, 0x67, 0x6E, 0x61, 0x63,
464 0x69, 0x6F, 0x20, 0x43, 0x61,
465 0x73, 0x74, 0x61, 0xF1, 0x6F,
466 0x00
467 };
469 OVR_ASSERT(check_equal(test8, test32));
470 }
472 // If user passed an arg, try reading the file as UTF-8 encoded text.
473 if (argc > 1)
474 {
475 const char* filename = argv[1];
476 FILE* fp = fopen(filename, "rb");
477 if (fp == NULL)
478 {
479 printf("Can't open file '%s'\n", filename);
480 return 1;
481 }
483 // Read lines from the file, encode/decode them, and highlight discrepancies.
484 const int LINE_SIZE = 200; // max line size
485 char line_buffer_utf8[LINE_SIZE];
486 char reencoded_utf8[6 * LINE_SIZE];
487 uint32_t line_buffer_ucs[LINE_SIZE];
489 int byte_counter = 0;
490 for (;;)
491 {
492 int c = fgetc(fp);
493 if (c == EOF)
494 {
495 // Done.
496 break;
497 }
498 line_buffer_utf8[byte_counter++] = c;
499 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
500 {
501 // End of line. Process the line.
502 line_buffer_utf8[byte_counter++] = 0; // terminate.
504 // Decode into UCS.
505 const char* p = line_buffer_utf8;
506 uint32_t* q = line_buffer_ucs;
507 for (;;)
508 {
509 uint32_t uc = UTF8Util::DecodeNextChar(&p);
510 *q++ = uc;
512 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
513 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
515 if (uc == 0) break;
516 }
518 // Encode back into UTF-8.
519 q = line_buffer_ucs;
520 int index = 0;
521 for (;;)
522 {
523 uint32_t uc = *q++;
524 OVR_ASSERT(index < LINE_SIZE * 6 - 6);
525 int last_index = index;
526 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
527 OVR_ASSERT(index <= last_index + 6);
528 if (uc == 0) break;
529 }
531 // This can be useful for debugging.
532 #if 0
533 // Show the UCS and the re-encoded UTF-8.
534 log_ucs(line_buffer_ucs);
535 log_ascii(reencoded_utf8);
536 #endif // 0
538 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
539 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
541 // Start next line.
542 byte_counter = 0;
543 }
544 }
546 fclose(fp);
547 }
549 return 0;
550 }
553 #endif // UTF8_UNIT_TEST
555 }} // namespace UTF8Util::OVR