rev |
line source |
nuclear@3
|
1 /**************************************************************************
|
nuclear@3
|
2
|
nuclear@3
|
3 Filename : OVR_UTF8Util.cpp
|
nuclear@3
|
4 Content : UTF8 Unicode character encoding/decoding support
|
nuclear@3
|
5 Created : September 19, 2012
|
nuclear@3
|
6 Notes :
|
nuclear@3
|
7 Notes : Much useful info at "UTF-8 and Unicode FAQ"
|
nuclear@3
|
8 http://www.cl.cam.ac.uk/~mgk25/unicode.html
|
nuclear@3
|
9
|
nuclear@3
|
10 Copyright : Copyright 2012 Oculus VR, Inc. All Rights reserved.
|
nuclear@3
|
11
|
nuclear@3
|
12 Use of this software is subject to the terms of the Oculus license
|
nuclear@3
|
13 agreement provided at the time of installation or download, or which
|
nuclear@3
|
14 otherwise accompanies this software in either electronic or hard copy form.
|
nuclear@3
|
15
|
nuclear@3
|
16 ************************************************************************************/
|
nuclear@3
|
17
|
nuclear@3
|
18 #include "OVR_UTF8Util.h"
|
nuclear@3
|
19
|
nuclear@3
|
20 namespace OVR { namespace UTF8Util {
|
nuclear@3
|
21
|
nuclear@3
|
22 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
|
nuclear@3
|
23 {
|
nuclear@3
|
24 const char* p = buf;
|
nuclear@3
|
25 SPInt length = 0;
|
nuclear@3
|
26
|
nuclear@3
|
27 if (buflen != -1)
|
nuclear@3
|
28 {
|
nuclear@3
|
29 while (p - buf < buflen)
|
nuclear@3
|
30 {
|
nuclear@3
|
31 // We should be able to have ASStrings with 0 in the middle.
|
nuclear@3
|
32 UTF8Util::DecodeNextChar_Advance0(&p);
|
nuclear@3
|
33 length++;
|
nuclear@3
|
34 }
|
nuclear@3
|
35 }
|
nuclear@3
|
36 else
|
nuclear@3
|
37 {
|
nuclear@3
|
38 while (UTF8Util::DecodeNextChar_Advance0(&p))
|
nuclear@3
|
39 length++;
|
nuclear@3
|
40 }
|
nuclear@3
|
41
|
nuclear@3
|
42 return length;
|
nuclear@3
|
43 }
|
nuclear@3
|
44
|
nuclear@3
|
45 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
|
nuclear@3
|
46 {
|
nuclear@3
|
47 const char* buf = putf8str;
|
nuclear@3
|
48 UInt32 c = 0;
|
nuclear@3
|
49
|
nuclear@3
|
50 if (length != -1)
|
nuclear@3
|
51 {
|
nuclear@3
|
52 while (buf - putf8str < length)
|
nuclear@3
|
53 {
|
nuclear@3
|
54 c = UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@3
|
55 if (index == 0)
|
nuclear@3
|
56 return c;
|
nuclear@3
|
57 index--;
|
nuclear@3
|
58 }
|
nuclear@3
|
59
|
nuclear@3
|
60 return c;
|
nuclear@3
|
61 }
|
nuclear@3
|
62
|
nuclear@3
|
63 do
|
nuclear@3
|
64 {
|
nuclear@3
|
65 c = UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@3
|
66 index--;
|
nuclear@3
|
67
|
nuclear@3
|
68 if (c == 0)
|
nuclear@3
|
69 {
|
nuclear@3
|
70 // We've hit the end of the string; don't go further.
|
nuclear@3
|
71 OVR_ASSERT(index == 0);
|
nuclear@3
|
72 return c;
|
nuclear@3
|
73 }
|
nuclear@3
|
74 } while (index >= 0);
|
nuclear@3
|
75
|
nuclear@3
|
76 return c;
|
nuclear@3
|
77 }
|
nuclear@3
|
78
|
nuclear@3
|
79 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
|
nuclear@3
|
80 {
|
nuclear@3
|
81 const char* buf = putf8str;
|
nuclear@3
|
82
|
nuclear@3
|
83 if (length != -1)
|
nuclear@3
|
84 {
|
nuclear@3
|
85 while ((buf - putf8str) < length && index > 0)
|
nuclear@3
|
86 {
|
nuclear@3
|
87 UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@3
|
88 index--;
|
nuclear@3
|
89 }
|
nuclear@3
|
90
|
nuclear@3
|
91 return buf-putf8str;
|
nuclear@3
|
92 }
|
nuclear@3
|
93
|
nuclear@3
|
94 while (index > 0)
|
nuclear@3
|
95 {
|
nuclear@3
|
96 UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@3
|
97 index--;
|
nuclear@3
|
98
|
nuclear@3
|
99 if (c == 0)
|
nuclear@3
|
100 return buf-putf8str;
|
nuclear@3
|
101 };
|
nuclear@3
|
102
|
nuclear@3
|
103 return buf-putf8str;
|
nuclear@3
|
104 }
|
nuclear@3
|
105
|
nuclear@3
|
106 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
|
nuclear@3
|
107 {
|
nuclear@3
|
108 if (ucs_character <= 0x7F)
|
nuclear@3
|
109 return 1;
|
nuclear@3
|
110 else if (ucs_character <= 0x7FF)
|
nuclear@3
|
111 return 2;
|
nuclear@3
|
112 else if (ucs_character <= 0xFFFF)
|
nuclear@3
|
113 return 3;
|
nuclear@3
|
114 else if (ucs_character <= 0x1FFFFF)
|
nuclear@3
|
115 return 4;
|
nuclear@3
|
116 else if (ucs_character <= 0x3FFFFFF)
|
nuclear@3
|
117 return 5;
|
nuclear@3
|
118 else if (ucs_character <= 0x7FFFFFFF)
|
nuclear@3
|
119 return 6;
|
nuclear@3
|
120 else
|
nuclear@3
|
121 return 0;
|
nuclear@3
|
122 }
|
nuclear@3
|
123
|
nuclear@3
|
124 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
|
nuclear@3
|
125 {
|
nuclear@3
|
126 UInt32 uc;
|
nuclear@3
|
127 char c;
|
nuclear@3
|
128
|
nuclear@3
|
129 // Security considerations:
|
nuclear@3
|
130 //
|
nuclear@3
|
131 // Changed, this is now only the case for DecodeNextChar:
|
nuclear@3
|
132 // - If we hit a zero byte, we want to return 0 without stepping
|
nuclear@3
|
133 // the buffer pointer past the 0. th
|
nuclear@3
|
134 //
|
nuclear@3
|
135 // If we hit an "overlong sequence"; i.e. a character encoded
|
nuclear@3
|
136 // in a longer multibyte string than is necessary, then we
|
nuclear@3
|
137 // need to discard the character. This is so attackers can't
|
nuclear@3
|
138 // disguise dangerous characters or character sequences --
|
nuclear@3
|
139 // there is only one valid encoding for each character.
|
nuclear@3
|
140 //
|
nuclear@3
|
141 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
|
nuclear@3
|
142 // 0xFFFF } then we ignore them; they are not valid in UTF-8.
|
nuclear@3
|
143
|
nuclear@3
|
144 // This isn't actually an invalid character; it's a valid char that
|
nuclear@3
|
145 // looks like an inverted question mark.
|
nuclear@3
|
146 #define INVALID_CHAR 0x0FFFD
|
nuclear@3
|
147
|
nuclear@3
|
148 #define FIRST_BYTE(mask, shift) \
|
nuclear@3
|
149 uc = (c & (mask)) << (shift);
|
nuclear@3
|
150
|
nuclear@3
|
151 #define NEXT_BYTE(shift) \
|
nuclear@3
|
152 c = **putf8Buffer; \
|
nuclear@3
|
153 if (c == 0) return 0; /* end of buffer, do not advance */ \
|
nuclear@3
|
154 if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \
|
nuclear@3
|
155 (*putf8Buffer)++; \
|
nuclear@3
|
156 uc |= (c & 0x3F) << shift;
|
nuclear@3
|
157
|
nuclear@3
|
158 c = **putf8Buffer;
|
nuclear@3
|
159 (*putf8Buffer)++;
|
nuclear@3
|
160 if (c == 0)
|
nuclear@3
|
161 return 0; // End of buffer.
|
nuclear@3
|
162
|
nuclear@3
|
163 if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.
|
nuclear@3
|
164
|
nuclear@3
|
165 // Multi-byte sequences.
|
nuclear@3
|
166 if ((c & 0xE0) == 0xC0)
|
nuclear@3
|
167 {
|
nuclear@3
|
168 // Two-byte sequence.
|
nuclear@3
|
169 FIRST_BYTE(0x1F, 6);
|
nuclear@3
|
170 NEXT_BYTE(0);
|
nuclear@3
|
171 if (uc < 0x80) return INVALID_CHAR; // overlong
|
nuclear@3
|
172 return uc;
|
nuclear@3
|
173 }
|
nuclear@3
|
174 else if ((c & 0xF0) == 0xE0)
|
nuclear@3
|
175 {
|
nuclear@3
|
176 // Three-byte sequence.
|
nuclear@3
|
177 FIRST_BYTE(0x0F, 12);
|
nuclear@3
|
178 NEXT_BYTE(6);
|
nuclear@3
|
179 NEXT_BYTE(0);
|
nuclear@3
|
180 if (uc < 0x800) return INVALID_CHAR; // overlong
|
nuclear@3
|
181 // Not valid ISO 10646, but Flash requires these to work
|
nuclear@3
|
182 // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)
|
nuclear@3
|
183 // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;
|
nuclear@3
|
184 // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646
|
nuclear@3
|
185 return uc;
|
nuclear@3
|
186 }
|
nuclear@3
|
187 else if ((c & 0xF8) == 0xF0)
|
nuclear@3
|
188 {
|
nuclear@3
|
189 // Four-byte sequence.
|
nuclear@3
|
190 FIRST_BYTE(0x07, 18);
|
nuclear@3
|
191 NEXT_BYTE(12);
|
nuclear@3
|
192 NEXT_BYTE(6);
|
nuclear@3
|
193 NEXT_BYTE(0);
|
nuclear@3
|
194 if (uc < 0x010000) return INVALID_CHAR; // overlong
|
nuclear@3
|
195 return uc;
|
nuclear@3
|
196 }
|
nuclear@3
|
197 else if ((c & 0xFC) == 0xF8)
|
nuclear@3
|
198 {
|
nuclear@3
|
199 // Five-byte sequence.
|
nuclear@3
|
200 FIRST_BYTE(0x03, 24);
|
nuclear@3
|
201 NEXT_BYTE(18);
|
nuclear@3
|
202 NEXT_BYTE(12);
|
nuclear@3
|
203 NEXT_BYTE(6);
|
nuclear@3
|
204 NEXT_BYTE(0);
|
nuclear@3
|
205 if (uc < 0x0200000) return INVALID_CHAR; // overlong
|
nuclear@3
|
206 return uc;
|
nuclear@3
|
207 }
|
nuclear@3
|
208 else if ((c & 0xFE) == 0xFC)
|
nuclear@3
|
209 {
|
nuclear@3
|
210 // Six-byte sequence.
|
nuclear@3
|
211 FIRST_BYTE(0x01, 30);
|
nuclear@3
|
212 NEXT_BYTE(24);
|
nuclear@3
|
213 NEXT_BYTE(18);
|
nuclear@3
|
214 NEXT_BYTE(12);
|
nuclear@3
|
215 NEXT_BYTE(6);
|
nuclear@3
|
216 NEXT_BYTE(0);
|
nuclear@3
|
217 if (uc < 0x04000000) return INVALID_CHAR; // overlong
|
nuclear@3
|
218 return uc;
|
nuclear@3
|
219 }
|
nuclear@3
|
220 else
|
nuclear@3
|
221 {
|
nuclear@3
|
222 // Invalid.
|
nuclear@3
|
223 return INVALID_CHAR;
|
nuclear@3
|
224 }
|
nuclear@3
|
225 }
|
nuclear@3
|
226
|
nuclear@3
|
227
|
nuclear@3
|
228 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
|
nuclear@3
|
229 {
|
nuclear@3
|
230 if (ucs_character <= 0x7F)
|
nuclear@3
|
231 {
|
nuclear@3
|
232 // Plain single-byte ASCII.
|
nuclear@3
|
233 pbuffer[(*pindex)++] = (char) ucs_character;
|
nuclear@3
|
234 }
|
nuclear@3
|
235 else if (ucs_character <= 0x7FF)
|
nuclear@3
|
236 {
|
nuclear@3
|
237 // Two bytes.
|
nuclear@3
|
238 pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
|
nuclear@3
|
239 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
|
nuclear@3
|
240 }
|
nuclear@3
|
241 else if (ucs_character <= 0xFFFF)
|
nuclear@3
|
242 {
|
nuclear@3
|
243 // Three bytes.
|
nuclear@3
|
244 pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
|
nuclear@3
|
245 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
|
nuclear@3
|
246 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
|
nuclear@3
|
247 }
|
nuclear@3
|
248 else if (ucs_character <= 0x1FFFFF)
|
nuclear@3
|
249 {
|
nuclear@3
|
250 // Four bytes.
|
nuclear@3
|
251 pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
|
nuclear@3
|
252 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
|
nuclear@3
|
253 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
|
nuclear@3
|
254 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
|
nuclear@3
|
255 }
|
nuclear@3
|
256 else if (ucs_character <= 0x3FFFFFF)
|
nuclear@3
|
257 {
|
nuclear@3
|
258 // Five bytes.
|
nuclear@3
|
259 pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
|
nuclear@3
|
260 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
|
nuclear@3
|
261 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
|
nuclear@3
|
262 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
|
nuclear@3
|
263 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
|
nuclear@3
|
264 }
|
nuclear@3
|
265 else if (ucs_character <= 0x7FFFFFFF)
|
nuclear@3
|
266 {
|
nuclear@3
|
267 // Six bytes.
|
nuclear@3
|
268 pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
|
nuclear@3
|
269 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
|
nuclear@3
|
270 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
|
nuclear@3
|
271 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
|
nuclear@3
|
272 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
|
nuclear@3
|
273 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
|
nuclear@3
|
274 }
|
nuclear@3
|
275 else
|
nuclear@3
|
276 {
|
nuclear@3
|
277 // Invalid char; don't encode anything.
|
nuclear@3
|
278 }
|
nuclear@3
|
279 }
|
nuclear@3
|
280
|
nuclear@3
|
281 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
|
nuclear@3
|
282 {
|
nuclear@3
|
283 SPInt len = 0;
|
nuclear@3
|
284 if (length != -1)
|
nuclear@3
|
285 for (int i = 0; i < length; i++)
|
nuclear@3
|
286 {
|
nuclear@3
|
287 len += GetEncodeCharSize(pchar[i]);
|
nuclear@3
|
288 }
|
nuclear@3
|
289 else
|
nuclear@3
|
290 for (int i = 0;; i++)
|
nuclear@3
|
291 {
|
nuclear@3
|
292 if (pchar[i] == 0)
|
nuclear@3
|
293 return len;
|
nuclear@3
|
294 len += GetEncodeCharSize(pchar[i]);
|
nuclear@3
|
295 }
|
nuclear@3
|
296 return len;
|
nuclear@3
|
297 }
|
nuclear@3
|
298
|
nuclear@3
|
299 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
|
nuclear@3
|
300 {
|
nuclear@3
|
301 SPInt ofs = 0;
|
nuclear@3
|
302 if (length != -1)
|
nuclear@3
|
303 {
|
nuclear@3
|
304 for (int i = 0; i < length; i++)
|
nuclear@3
|
305 {
|
nuclear@3
|
306 EncodeChar(pbuff, &ofs, pchar[i]);
|
nuclear@3
|
307 }
|
nuclear@3
|
308 }
|
nuclear@3
|
309 else
|
nuclear@3
|
310 {
|
nuclear@3
|
311 for (int i = 0;; i++)
|
nuclear@3
|
312 {
|
nuclear@3
|
313 if (pchar[i] == 0)
|
nuclear@3
|
314 break;
|
nuclear@3
|
315 EncodeChar(pbuff, &ofs, pchar[i]);
|
nuclear@3
|
316 }
|
nuclear@3
|
317 }
|
nuclear@3
|
318 pbuff[ofs] = 0;
|
nuclear@3
|
319 }
|
nuclear@3
|
320
|
nuclear@3
|
321 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
|
nuclear@3
|
322 {
|
nuclear@3
|
323 wchar_t *pbegin = pbuff;
|
nuclear@3
|
324 if (bytesLen == -1)
|
nuclear@3
|
325 {
|
nuclear@3
|
326 while (1)
|
nuclear@3
|
327 {
|
nuclear@3
|
328 UInt32 ch = DecodeNextChar_Advance0(&putf8str);
|
nuclear@3
|
329 if (ch == 0)
|
nuclear@3
|
330 break;
|
nuclear@3
|
331 else if (ch >= 0xFFFF)
|
nuclear@3
|
332 ch = 0xFFFD;
|
nuclear@3
|
333 *pbuff++ = wchar_t(ch);
|
nuclear@3
|
334 }
|
nuclear@3
|
335 }
|
nuclear@3
|
336 else
|
nuclear@3
|
337 {
|
nuclear@3
|
338 const char* p = putf8str;
|
nuclear@3
|
339 while ((p - putf8str) < bytesLen)
|
nuclear@3
|
340 {
|
nuclear@3
|
341 UInt32 ch = DecodeNextChar_Advance0(&p);
|
nuclear@3
|
342 if (ch >= 0xFFFF)
|
nuclear@3
|
343 ch = 0xFFFD;
|
nuclear@3
|
344 *pbuff++ = wchar_t(ch);
|
nuclear@3
|
345 }
|
nuclear@3
|
346 }
|
nuclear@3
|
347
|
nuclear@3
|
348 *pbuff = 0;
|
nuclear@3
|
349 return pbuff - pbegin;
|
nuclear@3
|
350 }
|
nuclear@3
|
351
|
nuclear@3
|
352
|
nuclear@3
|
353 #ifdef UTF8_UNIT_TEST
|
nuclear@3
|
354
|
nuclear@3
|
355 // Compile this test case with something like:
|
nuclear@3
|
356 //
|
nuclear@3
|
357 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
|
nuclear@3
|
358 //
|
nuclear@3
|
359 // or
|
nuclear@3
|
360 //
|
nuclear@3
|
361 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
|
nuclear@3
|
362 //
|
nuclear@3
|
363 // If possible, try running the test program with the first arg
|
nuclear@3
|
364 // pointing at the file:
|
nuclear@3
|
365 //
|
nuclear@3
|
366 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
nuclear@3
|
367 //
|
nuclear@3
|
368 // and examine the results by eye to make sure they are acceptable to
|
nuclear@3
|
369 // you.
|
nuclear@3
|
370
|
nuclear@3
|
371
|
nuclear@3
|
372 #include "base/utility.h"
|
nuclear@3
|
373 #include <stdio.h>
|
nuclear@3
|
374
|
nuclear@3
|
375
|
nuclear@3
|
376 bool check_equal(const char* utf8_in, const UInt32* ucs_in)
|
nuclear@3
|
377 {
|
nuclear@3
|
378 for (;;)
|
nuclear@3
|
379 {
|
nuclear@3
|
380 UInt32 next_ucs = *ucs_in++;
|
nuclear@3
|
381 UInt32 next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
|
nuclear@3
|
382 if (next_ucs != next_ucs_from_utf8)
|
nuclear@3
|
383 {
|
nuclear@3
|
384 return false;
|
nuclear@3
|
385 }
|
nuclear@3
|
386 if (next_ucs == 0)
|
nuclear@3
|
387 {
|
nuclear@3
|
388 OVR_ASSERT(next_ucs_from_utf8 == 0);
|
nuclear@3
|
389 break;
|
nuclear@3
|
390 }
|
nuclear@3
|
391 }
|
nuclear@3
|
392
|
nuclear@3
|
393 return true;
|
nuclear@3
|
394 }
|
nuclear@3
|
395
|
nuclear@3
|
396
|
nuclear@3
|
// Test helper: prints a zero-terminated byte string to stdout. Newlines and
// bytes in the range 32..127 are printed as they are; every other byte is
// shown as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* p = line; *p != 0; ++p)
    {
        const unsigned char c = (unsigned char) *p;
        const bool passThrough = (c == '\n') || (c >= 32 && c <= 127);

        if (passThrough)
            printf("%c", c);
        else
            printf("<0x%02X>", (int) c);    // Non-printable as plain ASCII.
    }
}
|
nuclear@3
|
419
|
nuclear@3
|
420
|
nuclear@3
|
421 void log_ucs(const UInt32* line)
|
nuclear@3
|
422 {
|
nuclear@3
|
423 for (;;)
|
nuclear@3
|
424 {
|
nuclear@3
|
425 UInt32 uc = *line++;
|
nuclear@3
|
426 if (uc == 0)
|
nuclear@3
|
427 {
|
nuclear@3
|
428 // End of line.
|
nuclear@3
|
429 return;
|
nuclear@3
|
430 }
|
nuclear@3
|
431 else if (uc != '\n'
|
nuclear@3
|
432 && (uc < 32 || uc > 127))
|
nuclear@3
|
433 {
|
nuclear@3
|
434 // Non-printable as plain ASCII.
|
nuclear@3
|
435 printf("<U-%04X>", uc);
|
nuclear@3
|
436 }
|
nuclear@3
|
437 else
|
nuclear@3
|
438 {
|
nuclear@3
|
439 printf("%c", (char) uc);
|
nuclear@3
|
440 }
|
nuclear@3
|
441 }
|
nuclear@3
|
442 }
|
nuclear@3
|
443
|
nuclear@3
|
444
|
nuclear@3
|
445 // Simple canned test.
|
nuclear@3
|
446 int main(int argc, const char* argv[])
|
nuclear@3
|
447 {
|
nuclear@3
|
448 {
|
nuclear@3
|
449 const char* test8 = "Ignacio CastaƱo";
|
nuclear@3
|
450 const UInt32 test32[] =
|
nuclear@3
|
451 {
|
nuclear@3
|
452 0x49, 0x67, 0x6E, 0x61, 0x63,
|
nuclear@3
|
453 0x69, 0x6F, 0x20, 0x43, 0x61,
|
nuclear@3
|
454 0x73, 0x74, 0x61, 0xF1, 0x6F,
|
nuclear@3
|
455 0x00
|
nuclear@3
|
456 };
|
nuclear@3
|
457
|
nuclear@3
|
458 OVR_ASSERT(check_equal(test8, test32));
|
nuclear@3
|
459 }
|
nuclear@3
|
460
|
nuclear@3
|
461 // If user passed an arg, try reading the file as UTF-8 encoded text.
|
nuclear@3
|
462 if (argc > 1)
|
nuclear@3
|
463 {
|
nuclear@3
|
464 const char* filename = argv[1];
|
nuclear@3
|
465 FILE* fp = fopen(filename, "rb");
|
nuclear@3
|
466 if (fp == NULL)
|
nuclear@3
|
467 {
|
nuclear@3
|
468 printf("Can't open file '%s'\n", filename);
|
nuclear@3
|
469 return 1;
|
nuclear@3
|
470 }
|
nuclear@3
|
471
|
nuclear@3
|
472 // Read lines from the file, encode/decode them, and highlight discrepancies.
|
nuclear@3
|
473 const int LINE_SIZE = 200; // max line size
|
nuclear@3
|
474 char line_buffer_utf8[LINE_SIZE];
|
nuclear@3
|
475 char reencoded_utf8[6 * LINE_SIZE];
|
nuclear@3
|
476 UInt32 line_buffer_ucs[LINE_SIZE];
|
nuclear@3
|
477
|
nuclear@3
|
478 int byte_counter = 0;
|
nuclear@3
|
479 for (;;)
|
nuclear@3
|
480 {
|
nuclear@3
|
481 int c = fgetc(fp);
|
nuclear@3
|
482 if (c == EOF)
|
nuclear@3
|
483 {
|
nuclear@3
|
484 // Done.
|
nuclear@3
|
485 break;
|
nuclear@3
|
486 }
|
nuclear@3
|
487 line_buffer_utf8[byte_counter++] = c;
|
nuclear@3
|
488 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
|
nuclear@3
|
489 {
|
nuclear@3
|
490 // End of line. Process the line.
|
nuclear@3
|
491 line_buffer_utf8[byte_counter++] = 0; // terminate.
|
nuclear@3
|
492
|
nuclear@3
|
493 // Decode into UCS.
|
nuclear@3
|
494 const char* p = line_buffer_utf8;
|
nuclear@3
|
495 UInt32* q = line_buffer_ucs;
|
nuclear@3
|
496 for (;;)
|
nuclear@3
|
497 {
|
nuclear@3
|
498 UInt32 uc = UTF8Util::DecodeNextChar(&p);
|
nuclear@3
|
499 *q++ = uc;
|
nuclear@3
|
500
|
nuclear@3
|
501 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
|
nuclear@3
|
502 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
|
nuclear@3
|
503
|
nuclear@3
|
504 if (uc == 0) break;
|
nuclear@3
|
505 }
|
nuclear@3
|
506
|
nuclear@3
|
507 // Encode back into UTF-8.
|
nuclear@3
|
508 q = line_buffer_ucs;
|
nuclear@3
|
509 int index = 0;
|
nuclear@3
|
510 for (;;)
|
nuclear@3
|
511 {
|
nuclear@3
|
512 UInt32 uc = *q++;
|
nuclear@3
|
513 OVR_ASSERT(index < LINE_SIZE * 6 - 6);
|
nuclear@3
|
514 int last_index = index;
|
nuclear@3
|
515 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
|
nuclear@3
|
516 OVR_ASSERT(index <= last_index + 6);
|
nuclear@3
|
517 if (uc == 0) break;
|
nuclear@3
|
518 }
|
nuclear@3
|
519
|
nuclear@3
|
520 // This can be useful for debugging.
|
nuclear@3
|
521 #if 0
|
nuclear@3
|
522 // Show the UCS and the re-encoded UTF-8.
|
nuclear@3
|
523 log_ucs(line_buffer_ucs);
|
nuclear@3
|
524 log_ascii(reencoded_utf8);
|
nuclear@3
|
525 #endif // 0
|
nuclear@3
|
526
|
nuclear@3
|
527 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
|
nuclear@3
|
528 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
|
nuclear@3
|
529
|
nuclear@3
|
530 // Start next line.
|
nuclear@3
|
531 byte_counter = 0;
|
nuclear@3
|
532 }
|
nuclear@3
|
533 }
|
nuclear@3
|
534
|
nuclear@3
|
535 fclose(fp);
|
nuclear@3
|
536 }
|
nuclear@3
|
537
|
nuclear@3
|
538 return 0;
|
nuclear@3
|
539 }
|
nuclear@3
|
540
|
nuclear@3
|
541
|
nuclear@3
|
542 #endif // UTF8_UNIT_TEST
|
nuclear@3
|
543
|
nuclear@3
|
544 }} // namespace UTF8Util::OVR
|
nuclear@3
|
545
|