rev |
line source |
nuclear@0
|
1 /**************************************************************************
|
nuclear@0
|
2
|
nuclear@0
|
3 Filename : OVR_UTF8Util.cpp
|
nuclear@0
|
4 Content : UTF8 Unicode character encoding/decoding support
|
nuclear@0
|
5 Created : September 19, 2012
|
nuclear@0
|
6 Notes :
|
nuclear@0
|
7 Notes : Much useful info at "UTF-8 and Unicode FAQ"
|
nuclear@0
|
8 http://www.cl.cam.ac.uk/~mgk25/unicode.html
|
nuclear@0
|
9
|
nuclear@0
|
10 Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved.
|
nuclear@0
|
11
|
nuclear@0
|
12 Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License");
|
nuclear@0
|
13 you may not use the Oculus VR Rift SDK except in compliance with the License,
|
nuclear@0
|
14 which is provided at the time of installation or download, or which
|
nuclear@0
|
15 otherwise accompanies this software in either electronic or hard copy form.
|
nuclear@0
|
16
|
nuclear@0
|
17 You may obtain a copy of the License at
|
nuclear@0
|
18
|
nuclear@0
|
19 http://www.oculusvr.com/licenses/LICENSE-3.2
|
nuclear@0
|
20
|
nuclear@0
|
21 Unless required by applicable law or agreed to in writing, the Oculus VR SDK
|
nuclear@0
|
22 distributed under the License is distributed on an "AS IS" BASIS,
|
nuclear@0
|
23 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
nuclear@0
|
24 See the License for the specific language governing permissions and
|
nuclear@0
|
25 limitations under the License.
|
nuclear@0
|
26
|
nuclear@0
|
27 ************************************************************************************/
|
nuclear@0
|
28
|
nuclear@0
|
29 #include "OVR_UTF8Util.h"
|
nuclear@0
|
30
|
nuclear@0
|
31 namespace OVR { namespace UTF8Util {
|
nuclear@0
|
32
|
nuclear@0
|
33 intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen)
|
nuclear@0
|
34 {
|
nuclear@0
|
35 const char* p = buf;
|
nuclear@0
|
36 intptr_t length = 0;
|
nuclear@0
|
37
|
nuclear@0
|
38 if (buflen != -1)
|
nuclear@0
|
39 {
|
nuclear@0
|
40 while (p - buf < buflen)
|
nuclear@0
|
41 {
|
nuclear@0
|
42 // We should be able to have ASStrings with 0 in the middle.
|
nuclear@0
|
43 UTF8Util::DecodeNextChar_Advance0(&p);
|
nuclear@0
|
44 length++;
|
nuclear@0
|
45 }
|
nuclear@0
|
46 }
|
nuclear@0
|
47 else
|
nuclear@0
|
48 {
|
nuclear@0
|
49 while (UTF8Util::DecodeNextChar_Advance0(&p))
|
nuclear@0
|
50 length++;
|
nuclear@0
|
51 }
|
nuclear@0
|
52
|
nuclear@0
|
53 return length;
|
nuclear@0
|
54 }
|
nuclear@0
|
55
|
nuclear@0
|
56 uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length)
|
nuclear@0
|
57 {
|
nuclear@0
|
58 const char* buf = putf8str;
|
nuclear@0
|
59 uint32_t c = 0;
|
nuclear@0
|
60
|
nuclear@0
|
61 if (length != -1)
|
nuclear@0
|
62 {
|
nuclear@0
|
63 while (buf - putf8str < length)
|
nuclear@0
|
64 {
|
nuclear@0
|
65 c = UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@0
|
66 if (index == 0)
|
nuclear@0
|
67 return c;
|
nuclear@0
|
68 index--;
|
nuclear@0
|
69 }
|
nuclear@0
|
70
|
nuclear@0
|
71 return c;
|
nuclear@0
|
72 }
|
nuclear@0
|
73
|
nuclear@0
|
74 do
|
nuclear@0
|
75 {
|
nuclear@0
|
76 c = UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@0
|
77 index--;
|
nuclear@0
|
78
|
nuclear@0
|
79 if (c == 0)
|
nuclear@0
|
80 {
|
nuclear@0
|
81 // We've hit the end of the string; don't go further.
|
nuclear@0
|
82 OVR_ASSERT(index == 0);
|
nuclear@0
|
83 return c;
|
nuclear@0
|
84 }
|
nuclear@0
|
85 } while (index >= 0);
|
nuclear@0
|
86
|
nuclear@0
|
87 return c;
|
nuclear@0
|
88 }
|
nuclear@0
|
89
|
nuclear@0
|
90 intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length)
|
nuclear@0
|
91 {
|
nuclear@0
|
92 const char* buf = putf8str;
|
nuclear@0
|
93
|
nuclear@0
|
94 if (length != -1)
|
nuclear@0
|
95 {
|
nuclear@0
|
96 while ((buf - putf8str) < length && index > 0)
|
nuclear@0
|
97 {
|
nuclear@0
|
98 UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@0
|
99 index--;
|
nuclear@0
|
100 }
|
nuclear@0
|
101
|
nuclear@0
|
102 return buf-putf8str;
|
nuclear@0
|
103 }
|
nuclear@0
|
104
|
nuclear@0
|
105 while (index > 0)
|
nuclear@0
|
106 {
|
nuclear@0
|
107 uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf);
|
nuclear@0
|
108 index--;
|
nuclear@0
|
109
|
nuclear@0
|
110 if (c == 0)
|
nuclear@0
|
111 return buf-putf8str;
|
nuclear@0
|
112 };
|
nuclear@0
|
113
|
nuclear@0
|
114 return buf-putf8str;
|
nuclear@0
|
115 }
|
nuclear@0
|
116
|
nuclear@0
|
// Returns how many bytes are needed to encode ucs_character as UTF-8 (1 to 6),
// or 0 if the value is above 0x7FFFFFFF and therefore cannot be encoded.
int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character)
{
    // upperBound[n - 1] is the largest value that fits into an n-byte sequence.
    static const uint32_t upperBound[] =
    {
        0x7F, 0x7FF, 0xFFFF, 0x1FFFFF, 0x3FFFFFF, 0x7FFFFFFF
    };
    const int maxSize = static_cast<int>(sizeof(upperBound) / sizeof(upperBound[0]));

    for (int size = 1; size <= maxSize; ++size)
    {
        if (ucs_character <= upperBound[size - 1])
            return size;
    }

    return 0;
}
|
nuclear@0
|
134
|
nuclear@0
|
// Decodes one UTF-8 encoded character starting at *putf8Buffer and moves the
// pointer forward over the bytes that were consumed.
//
// Returns the decoded value, 0 when a zero byte is hit, or U+FFFD (the
// replacement character, itself a valid character) for input that cannot be
// decoded.
//
// Pointer movement:
//  - The first byte is always consumed, even when it is a zero byte; that is
//    what distinguishes this function from DecodeNextChar.
//  - Every valid continuation byte (10xxxxxx) is consumed.
//  - A zero byte found where a continuation byte is expected ends decoding with
//    a return value of 0 and is NOT consumed.
//  - Any other byte found where a continuation byte is expected yields U+FFFD
//    and is NOT consumed, so the next call starts decoding at that byte.
//
// Security considerations:
//  - An "overlong sequence", i.e. a character encoded in a longer multibyte
//    string than is necessary, is discarded (U+FFFD is returned). This is so
//    attackers can't disguise dangerous characters or character sequences --
//    there is only one valid encoding for each character.
//  - Values { 0xD800 .. 0xDFFF } and { 0xFFFE, 0xFFFF } are not valid
//    ISO 10646, but they are deliberately returned unchanged because Flash
//    requires these to work; see AS3 test e15_5_3_2_3 for
//    String.fromCharCode().charCodeAt(0).
//  - Sequences of up to six bytes are accepted.
uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
{
    // Plain constant instead of a function-local macro, so nothing leaks into
    // the rest of the translation unit.
    const uint32_t replacementChar = 0xFFFD;

    const unsigned char lead = static_cast<unsigned char>(**putf8Buffer);
    (*putf8Buffer)++;

    // End of buffer (0) or conventional 7-bit ASCII.
    if (lead < 0x80)
        return lead;

    int      continuationCount;  // Number of 10xxxxxx bytes that must follow.
    uint32_t uc;                 // Payload bits collected so far.
    uint32_t smallestValid;      // Anything below this is an overlong encoding.

    if ((lead & 0xE0) == 0xC0)
    {
        // Two-byte sequence.
        continuationCount = 1;
        uc                = lead & 0x1F;
        smallestValid     = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        // Three-byte sequence.
        continuationCount = 2;
        uc                = lead & 0x0F;
        smallestValid     = 0x800;
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        // Four-byte sequence.
        continuationCount = 3;
        uc                = lead & 0x07;
        smallestValid     = 0x10000;
    }
    else if ((lead & 0xFC) == 0xF8)
    {
        // Five-byte sequence.
        continuationCount = 4;
        uc                = lead & 0x03;
        smallestValid     = 0x200000;
    }
    else if ((lead & 0xFE) == 0xFC)
    {
        // Six-byte sequence.
        continuationCount = 5;
        uc                = lead & 0x01;
        smallestValid     = 0x4000000;
    }
    else
    {
        // Stray continuation byte, 0xFE or 0xFF: invalid.
        return replacementChar;
    }

    for (int i = 0; i < continuationCount; i++)
    {
        const unsigned char next = static_cast<unsigned char>(**putf8Buffer);

        if (next == 0)
            return 0;                   // End of buffer, do not advance.
        if ((next & 0xC0) != 0x80)
            return replacementChar;     // Not a continuation byte, do not advance.

        (*putf8Buffer)++;
        uc = (uc << 6) | (next & 0x3F);
    }

    if (uc < smallestValid)
        return replacementChar;         // Overlong.

    return uc;
}
|
nuclear@0
|
237
|
nuclear@0
|
238
|
nuclear@0
|
// Appends the UTF-8 encoding of ucs_character to pbuffer.
//
//   pbuffer       - destination buffer; the caller provides enough room
//                   (an encoded character takes at most 6 bytes).
//   pindex        - in/out byte offset into pbuffer; advanced by the number of
//                   bytes written.
//   ucs_character - value to encode.
//
// Values above 0x7FFFFFFF cannot be encoded: nothing is written and *pindex is
// left unchanged.
void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character)
{
    if (ucs_character <= 0x7F)
    {
        // Plain single-byte ASCII.
        pbuffer[(*pindex)++] = (char) ucs_character;
        return;
    }

    int      continuationCount;  // Number of 10xxxxxx bytes after the lead byte.
    uint32_t leadMarker;         // High bits that announce the sequence length.

    if (ucs_character <= 0x7FF)
    {
        continuationCount = 1;   // Two bytes.
        leadMarker        = 0xC0;
    }
    else if (ucs_character <= 0xFFFF)
    {
        continuationCount = 2;   // Three bytes.
        leadMarker        = 0xE0;
    }
    else if (ucs_character <= 0x1FFFFF)
    {
        continuationCount = 3;   // Four bytes.
        leadMarker        = 0xF0;
    }
    else if (ucs_character <= 0x3FFFFFF)
    {
        continuationCount = 4;   // Five bytes.
        leadMarker        = 0xF8;
    }
    else if (ucs_character <= 0x7FFFFFFF)
    {
        continuationCount = 5;   // Six bytes.
        leadMarker        = 0xFC;
    }
    else
    {
        // Invalid char; don't encode anything.
        return;
    }

    char* out   = pbuffer + *pindex;
    int   shift = 6 * continuationCount;

    // Lead byte carries the topmost payload bits.
    *out++ = static_cast<char>(leadMarker | (ucs_character >> shift));

    // Each continuation byte carries the next six payload bits.
    while (shift > 0)
    {
        shift -= 6;
        *out++ = static_cast<char>(0x80 | ((ucs_character >> shift) & 0x3F));
    }

    *pindex += continuationCount + 1;
}
|
nuclear@0
|
291
|
nuclear@0
|
292 intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length)
|
nuclear@0
|
293 {
|
nuclear@0
|
294 intptr_t len = 0;
|
nuclear@0
|
295 if (length != -1)
|
nuclear@0
|
296 for (int i = 0; i < length; i++)
|
nuclear@0
|
297 {
|
nuclear@0
|
298 len += GetEncodeCharSize(pchar[i]);
|
nuclear@0
|
299 }
|
nuclear@0
|
300 else
|
nuclear@0
|
301 for (int i = 0;; i++)
|
nuclear@0
|
302 {
|
nuclear@0
|
303 if (pchar[i] == 0)
|
nuclear@0
|
304 return len;
|
nuclear@0
|
305 len += GetEncodeCharSize(pchar[i]);
|
nuclear@0
|
306 }
|
nuclear@0
|
307 return len;
|
nuclear@0
|
308 }
|
nuclear@0
|
309
|
nuclear@0
|
310 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length)
|
nuclear@0
|
311 {
|
nuclear@0
|
312 intptr_t ofs = 0;
|
nuclear@0
|
313 if (length != -1)
|
nuclear@0
|
314 {
|
nuclear@0
|
315 for (int i = 0; i < length; i++)
|
nuclear@0
|
316 {
|
nuclear@0
|
317 EncodeChar(pbuff, &ofs, pchar[i]);
|
nuclear@0
|
318 }
|
nuclear@0
|
319 }
|
nuclear@0
|
320 else
|
nuclear@0
|
321 {
|
nuclear@0
|
322 for (int i = 0;; i++)
|
nuclear@0
|
323 {
|
nuclear@0
|
324 if (pchar[i] == 0)
|
nuclear@0
|
325 break;
|
nuclear@0
|
326 EncodeChar(pbuff, &ofs, pchar[i]);
|
nuclear@0
|
327 }
|
nuclear@0
|
328 }
|
nuclear@0
|
329 pbuff[ofs] = 0;
|
nuclear@0
|
330 }
|
nuclear@0
|
331
|
nuclear@0
|
332 size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen)
|
nuclear@0
|
333 {
|
nuclear@0
|
334 wchar_t *pbegin = pbuff;
|
nuclear@0
|
335 if (bytesLen == -1)
|
nuclear@0
|
336 {
|
nuclear@0
|
337 while (1)
|
nuclear@0
|
338 {
|
nuclear@0
|
339 uint32_t ch = DecodeNextChar_Advance0(&putf8str);
|
nuclear@0
|
340 if (ch == 0)
|
nuclear@0
|
341 break;
|
nuclear@0
|
342 else if (ch >= 0xFFFF)
|
nuclear@0
|
343 ch = 0xFFFD;
|
nuclear@0
|
344 *pbuff++ = wchar_t(ch);
|
nuclear@0
|
345 }
|
nuclear@0
|
346 }
|
nuclear@0
|
347 else
|
nuclear@0
|
348 {
|
nuclear@0
|
349 const char* p = putf8str;
|
nuclear@0
|
350 while ((p - putf8str) < bytesLen)
|
nuclear@0
|
351 {
|
nuclear@0
|
352 uint32_t ch = DecodeNextChar_Advance0(&p);
|
nuclear@0
|
353 if (ch >= 0xFFFF)
|
nuclear@0
|
354 ch = 0xFFFD;
|
nuclear@0
|
355 *pbuff++ = wchar_t(ch);
|
nuclear@0
|
356 }
|
nuclear@0
|
357 }
|
nuclear@0
|
358
|
nuclear@0
|
359 *pbuff = 0;
|
nuclear@0
|
360 return pbuff - pbegin;
|
nuclear@0
|
361 }
|
nuclear@0
|
362
|
nuclear@0
|
363
|
nuclear@0
|
364 #ifdef UTF8_UNIT_TEST
|
nuclear@0
|
365
|
nuclear@0
|
366 // Compile this test case with something like:
|
nuclear@0
|
367 //
|
nuclear@0
|
368 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
|
nuclear@0
|
369 //
|
nuclear@0
|
370 // or
|
nuclear@0
|
371 //
|
nuclear@0
|
372 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
|
nuclear@0
|
373 //
|
nuclear@0
|
374 // If possible, try running the test program with the first arg
|
nuclear@0
|
375 // pointing at the file:
|
nuclear@0
|
376 //
|
nuclear@0
|
377 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
nuclear@0
|
378 //
|
nuclear@0
|
379 // and examine the results by eye to make sure they are acceptable to
|
nuclear@0
|
380 // you.
|
nuclear@0
|
381
|
nuclear@0
|
382
|
nuclear@0
|
383 #include "base/utility.h"
|
nuclear@0
|
384 #include <stdio.h>
|
nuclear@0
|
385
|
nuclear@0
|
386
|
nuclear@0
|
387 bool check_equal(const char* utf8_in, const uint32_t* ucs_in)
|
nuclear@0
|
388 {
|
nuclear@0
|
389 for (;;)
|
nuclear@0
|
390 {
|
nuclear@0
|
391 uint32_t next_ucs = *ucs_in++;
|
nuclear@0
|
392 uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
|
nuclear@0
|
393 if (next_ucs != next_ucs_from_utf8)
|
nuclear@0
|
394 {
|
nuclear@0
|
395 return false;
|
nuclear@0
|
396 }
|
nuclear@0
|
397 if (next_ucs == 0)
|
nuclear@0
|
398 {
|
nuclear@0
|
399 OVR_ASSERT(next_ucs_from_utf8 == 0);
|
nuclear@0
|
400 break;
|
nuclear@0
|
401 }
|
nuclear@0
|
402 }
|
nuclear@0
|
403
|
nuclear@0
|
404 return true;
|
nuclear@0
|
405 }
|
nuclear@0
|
406
|
nuclear@0
|
407
|
nuclear@0
|
// Prints a zero-terminated byte string to stdout. Bytes in the range 32..127 and
// newlines are printed as they are; every other byte is printed as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* cursor = line; *cursor != 0; ++cursor)
    {
        const unsigned char byte = (unsigned char) *cursor;
        const bool printAsIs = (byte == '\n') || (byte >= 32 && byte <= 127);

        if (printAsIs)
        {
            printf("%c", byte);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<0x%02X>", (int) byte);
        }
    }
}
|
nuclear@0
|
430
|
nuclear@0
|
431
|
nuclear@0
|
// Prints a zero-terminated sequence of UCS values to stdout. Values in the range
// 32..127 and newlines are printed as characters; every other value is printed
// as <U-XXXX>.
void log_ucs(const uint32_t* line)
{
    for (const uint32_t* cursor = line; *cursor != 0; ++cursor)
    {
        const uint32_t value = *cursor;
        const bool printAsIs = (value == '\n') || (value >= 32 && value <= 127);

        if (printAsIs)
        {
            printf("%c", (char) value);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<U-%04X>", value);
        }
    }
}
|
nuclear@0
|
454
|
nuclear@0
|
455
|
nuclear@0
|
456 // Simple canned test.
|
nuclear@0
|
457 int main(int argc, const char* argv[])
|
nuclear@0
|
458 {
|
nuclear@0
|
459 {
|
nuclear@0
|
460 const char* test8 = "Ignacio CastaƱo";
|
nuclear@0
|
461 const uint32_t test32[] =
|
nuclear@0
|
462 {
|
nuclear@0
|
463 0x49, 0x67, 0x6E, 0x61, 0x63,
|
nuclear@0
|
464 0x69, 0x6F, 0x20, 0x43, 0x61,
|
nuclear@0
|
465 0x73, 0x74, 0x61, 0xF1, 0x6F,
|
nuclear@0
|
466 0x00
|
nuclear@0
|
467 };
|
nuclear@0
|
468
|
nuclear@0
|
469 OVR_ASSERT(check_equal(test8, test32));
|
nuclear@0
|
470 }
|
nuclear@0
|
471
|
nuclear@0
|
472 // If user passed an arg, try reading the file as UTF-8 encoded text.
|
nuclear@0
|
473 if (argc > 1)
|
nuclear@0
|
474 {
|
nuclear@0
|
475 const char* filename = argv[1];
|
nuclear@0
|
476 FILE* fp = fopen(filename, "rb");
|
nuclear@0
|
477 if (fp == NULL)
|
nuclear@0
|
478 {
|
nuclear@0
|
479 printf("Can't open file '%s'\n", filename);
|
nuclear@0
|
480 return 1;
|
nuclear@0
|
481 }
|
nuclear@0
|
482
|
nuclear@0
|
483 // Read lines from the file, encode/decode them, and highlight discrepancies.
|
nuclear@0
|
484 const int LINE_SIZE = 200; // max line size
|
nuclear@0
|
485 char line_buffer_utf8[LINE_SIZE];
|
nuclear@0
|
486 char reencoded_utf8[6 * LINE_SIZE];
|
nuclear@0
|
487 uint32_t line_buffer_ucs[LINE_SIZE];
|
nuclear@0
|
488
|
nuclear@0
|
489 int byte_counter = 0;
|
nuclear@0
|
490 for (;;)
|
nuclear@0
|
491 {
|
nuclear@0
|
492 int c = fgetc(fp);
|
nuclear@0
|
493 if (c == EOF)
|
nuclear@0
|
494 {
|
nuclear@0
|
495 // Done.
|
nuclear@0
|
496 break;
|
nuclear@0
|
497 }
|
nuclear@0
|
498 line_buffer_utf8[byte_counter++] = c;
|
nuclear@0
|
499 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
|
nuclear@0
|
500 {
|
nuclear@0
|
501 // End of line. Process the line.
|
nuclear@0
|
502 line_buffer_utf8[byte_counter++] = 0; // terminate.
|
nuclear@0
|
503
|
nuclear@0
|
504 // Decode into UCS.
|
nuclear@0
|
505 const char* p = line_buffer_utf8;
|
nuclear@0
|
506 uint32_t* q = line_buffer_ucs;
|
nuclear@0
|
507 for (;;)
|
nuclear@0
|
508 {
|
nuclear@0
|
509 uint32_t uc = UTF8Util::DecodeNextChar(&p);
|
nuclear@0
|
510 *q++ = uc;
|
nuclear@0
|
511
|
nuclear@0
|
512 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
|
nuclear@0
|
513 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
|
nuclear@0
|
514
|
nuclear@0
|
515 if (uc == 0) break;
|
nuclear@0
|
516 }
|
nuclear@0
|
517
|
nuclear@0
|
518 // Encode back into UTF-8.
|
nuclear@0
|
519 q = line_buffer_ucs;
|
nuclear@0
|
520 int index = 0;
|
nuclear@0
|
521 for (;;)
|
nuclear@0
|
522 {
|
nuclear@0
|
523 uint32_t uc = *q++;
|
nuclear@0
|
524 OVR_ASSERT(index < LINE_SIZE * 6 - 6);
|
nuclear@0
|
525 int last_index = index;
|
nuclear@0
|
526 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
|
nuclear@0
|
527 OVR_ASSERT(index <= last_index + 6);
|
nuclear@0
|
528 if (uc == 0) break;
|
nuclear@0
|
529 }
|
nuclear@0
|
530
|
nuclear@0
|
531 // This can be useful for debugging.
|
nuclear@0
|
532 #if 0
|
nuclear@0
|
533 // Show the UCS and the re-encoded UTF-8.
|
nuclear@0
|
534 log_ucs(line_buffer_ucs);
|
nuclear@0
|
535 log_ascii(reencoded_utf8);
|
nuclear@0
|
536 #endif // 0
|
nuclear@0
|
537
|
nuclear@0
|
538 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
|
nuclear@0
|
539 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
|
nuclear@0
|
540
|
nuclear@0
|
541 // Start next line.
|
nuclear@0
|
542 byte_counter = 0;
|
nuclear@0
|
543 }
|
nuclear@0
|
544 }
|
nuclear@0
|
545
|
nuclear@0
|
546 fclose(fp);
|
nuclear@0
|
547 }
|
nuclear@0
|
548
|
nuclear@0
|
549 return 0;
|
nuclear@0
|
550 }
|
nuclear@0
|
551
|
nuclear@0
|
552
|
nuclear@0
|
553 #endif // UTF8_UNIT_TEST
|
nuclear@0
|
554
|
nuclear@0
|
555 }} // namespace UTF8Util::OVR
|
nuclear@0
|
556
|