oculus1: e2f9e4603129 libovr/Src/Kernel/OVR

oculus1

view libovr/Src/Kernel/OVR_UTF8Util.cpp @ 1:e2f9e4603129

added LibOVR and started a simple vr wrapper.

author	John Tsiombikas <nuclear@member.fsf.org>
date	Sat, 14 Sep 2013 16:14:59 +0300
parents
children	b069a5c27388

line source

1 /**************************************************************************

3 Filename : OVR_UTF8Util.cpp

4 Content : UTF8 Unicode character encoding/decoding support

5 Created : September 19, 2012

6 Notes :

7 Notes : Much useful info at "UTF-8 and Unicode FAQ"

8 http://www.cl.cam.ac.uk/~mgk25/unicode.html

12 Use of this software is subject to the terms of the Oculus license

13 agreement provided at the time of installation or download, or which

14 otherwise accompanies this software in either electronic or hard copy form.

16 ************************************************************************************/

18 #include "OVR_UTF8Util.h"

20 namespace OVR { namespace UTF8Util {

22 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)

23 {

24 const char* p = buf;

25 SPInt length = 0;

27 if (buflen != -1)

28 {

29 while (p - buf < buflen)

30 {

31 // We should be able to have ASStrings with 0 in the middle.

32 UTF8Util::DecodeNextChar_Advance0(&p);

33 length++;

34 }

35 }

36 else

37 {

38 while (UTF8Util::DecodeNextChar_Advance0(&p))

39 length++;

40 }

42 return length;

43 }

45 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)

46 {

47 const char* buf = putf8str;

48 UInt32 c = 0;

50 if (length != -1)

51 {

52 while (buf - putf8str < length)

53 {

54 c = UTF8Util::DecodeNextChar_Advance0(&buf);

55 if (index == 0)

56 return c;

57 index--;

58 }

60 return c;

61 }

63 do

64 {

65 c = UTF8Util::DecodeNextChar_Advance0(&buf);

66 index--;

68 if (c == 0)

69 {

70 // We've hit the end of the string; don't go further.

71 OVR_ASSERT(index == 0);

72 return c;

73 }

74 } while (index >= 0);

76 return c;

77 }

79 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)

80 {

81 const char* buf = putf8str;

83 if (length != -1)

84 {

85 while ((buf - putf8str) < length && index > 0)

86 {

87 UTF8Util::DecodeNextChar_Advance0(&buf);

88 index--;

89 }

91 return buf-putf8str;

92 }

94 while (index > 0)

95 {

96 UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);

97 index--;

99 if (c == 0)

100 return buf-putf8str;

101 };

102

103 return buf-putf8str;

104 }

105

106 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)

107 {

108 if (ucs_character <= 0x7F)

109 return 1;

110 else if (ucs_character <= 0x7FF)

111 return 2;

112 else if (ucs_character <= 0xFFFF)

113 return 3;

114 else if (ucs_character <= 0x1FFFFF)

115 return 4;

116 else if (ucs_character <= 0x3FFFFFF)

117 return 5;

118 else if (ucs_character <= 0x7FFFFFFF)

119 return 6;

120 else

121 return 0;

122 }

123

124 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)

125 {

126 UInt32 uc;

127 char c;

128

129 // Security considerations:

130 //

131 // Changed, this is now only the case for DecodeNextChar:

132 // - If we hit a zero byte, we want to return 0 without stepping

133 // the buffer pointer past the 0. th

134 //

135 // If we hit an "overlong sequence"; i.e. a character encoded

136 // in a longer multibyte string than is necessary, then we

137 // need to discard the character. This is so attackers can't

138 // disguise dangerous characters or character sequences --

139 // there is only one valid encoding for each character.

140 //

141 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,

142 // 0xFFFF } then we ignore them; they are not valid in UTF-8.

143

144 // This isn't actually an invalid character; it's a valid char that

145 // looks like an inverted question mark.

146 #define INVALID_CHAR 0x0FFFD

147

148 #define FIRST_BYTE(mask, shift) \

149 uc = (c & (mask)) << (shift);

150

151 #define NEXT_BYTE(shift) \

152 c = **putf8Buffer; \

153 if (c == 0) return 0; /* end of buffer, do not advance */ \

154 if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \

155 (*putf8Buffer)++; \

156 uc |= (c & 0x3F) << shift;

157

158 c = **putf8Buffer;

159 (*putf8Buffer)++;

160 if (c == 0)

161 return 0; // End of buffer.

162

163 if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.

164

165 // Multi-byte sequences.

166 if ((c & 0xE0) == 0xC0)

167 {

168 // Two-byte sequence.

169 FIRST_BYTE(0x1F, 6);

170 NEXT_BYTE(0);

171 if (uc < 0x80) return INVALID_CHAR; // overlong

172 return uc;

173 }

174 else if ((c & 0xF0) == 0xE0)

175 {

176 // Three-byte sequence.

177 FIRST_BYTE(0x0F, 12);

178 NEXT_BYTE(6);

179 NEXT_BYTE(0);

180 if (uc < 0x800) return INVALID_CHAR; // overlong

181 // Not valid ISO 10646, but Flash requires these to work

182 // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)

183 // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;

184 // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646

185 return uc;

186 }

187 else if ((c & 0xF8) == 0xF0)

188 {

189 // Four-byte sequence.

190 FIRST_BYTE(0x07, 18);

191 NEXT_BYTE(12);

192 NEXT_BYTE(6);

193 NEXT_BYTE(0);

194 if (uc < 0x010000) return INVALID_CHAR; // overlong

195 return uc;

196 }

197 else if ((c & 0xFC) == 0xF8)

198 {

199 // Five-byte sequence.

200 FIRST_BYTE(0x03, 24);

201 NEXT_BYTE(18);

202 NEXT_BYTE(12);

203 NEXT_BYTE(6);

204 NEXT_BYTE(0);

205 if (uc < 0x0200000) return INVALID_CHAR; // overlong

206 return uc;

207 }

208 else if ((c & 0xFE) == 0xFC)

209 {

210 // Six-byte sequence.

211 FIRST_BYTE(0x01, 30);

212 NEXT_BYTE(24);

213 NEXT_BYTE(18);

214 NEXT_BYTE(12);

215 NEXT_BYTE(6);

216 NEXT_BYTE(0);

217 if (uc < 0x04000000) return INVALID_CHAR; // overlong

218 return uc;

219 }

220 else

221 {

222 // Invalid.

223 return INVALID_CHAR;

224 }

225 }

226

227

228 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)

229 {

230 if (ucs_character <= 0x7F)

231 {

232 // Plain single-byte ASCII.

233 pbuffer[(*pindex)++] = (char) ucs_character;

234 }

235 else if (ucs_character <= 0x7FF)

236 {

237 // Two bytes.

238 pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);

239 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);

240 }

241 else if (ucs_character <= 0xFFFF)

242 {

243 // Three bytes.

244 pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);

245 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);

246 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);

247 }

248 else if (ucs_character <= 0x1FFFFF)

249 {

250 // Four bytes.

251 pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);

252 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);

253 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);

254 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);

255 }

256 else if (ucs_character <= 0x3FFFFFF)

257 {

258 // Five bytes.

259 pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);

260 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);

261 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);

262 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);

263 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);

264 }

265 else if (ucs_character <= 0x7FFFFFFF)

266 {

267 // Six bytes.

268 pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);

269 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);

270 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);

271 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);

272 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);

273 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);

274 }

275 else

276 {

277 // Invalid char; don't encode anything.

278 }

279 }

280

281 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)

282 {

283 SPInt len = 0;

284 if (length != -1)

285 for (int i = 0; i < length; i++)

286 {

287 len += GetEncodeCharSize(pchar[i]);

288 }

289 else

290 for (int i = 0;; i++)

291 {

292 if (pchar[i] == 0)

293 return len;

294 len += GetEncodeCharSize(pchar[i]);

295 }

296 return len;

297 }

298

299 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)

300 {

301 SPInt ofs = 0;

302 if (length != -1)

303 {

304 for (int i = 0; i < length; i++)

305 {

306 EncodeChar(pbuff, &ofs, pchar[i]);

307 }

308 }

309 else

310 {

311 for (int i = 0;; i++)

312 {

313 if (pchar[i] == 0)

314 break;

315 EncodeChar(pbuff, &ofs, pchar[i]);

316 }

317 }

318 pbuff[ofs] = 0;

319 }

320

321 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)

322 {

323 wchar_t *pbegin = pbuff;

324 if (bytesLen == -1)

325 {

326 while (1)

327 {

328 UInt32 ch = DecodeNextChar_Advance0(&putf8str);

329 if (ch == 0)

330 break;

331 else if (ch >= 0xFFFF)

332 ch = 0xFFFD;

333 *pbuff++ = wchar_t(ch);

334 }

335 }

336 else

337 {

338 const char* p = putf8str;

339 while ((p - putf8str) < bytesLen)

340 {

341 UInt32 ch = DecodeNextChar_Advance0(&p);

342 if (ch >= 0xFFFF)

343 ch = 0xFFFD;

344 *pbuff++ = wchar_t(ch);

345 }

346 }

347

348 *pbuff = 0;

349 return pbuff - pbegin;

350 }

351

352

353 #ifdef UTF8_UNIT_TEST

354

355 // Compile this test case with something like:

356 //

357 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test

358 //

359 // or

360 //

361 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..

362 //

363 // If possible, try running the test program with the first arg

364 // pointing at the file:

365 //

366 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

367 //

368 // and examine the results by eye to make sure they are acceptable to

369 // you.

370

371

372 #include "base/utility.h"

373 #include <stdio.h>

374

375

376 bool check_equal(const char* utf8_in, const UInt32* ucs_in)

377 {

378 for (;;)

379 {

380 UInt32 next_ucs = *ucs_in++;

381 UInt32 next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);

382 if (next_ucs != next_ucs_from_utf8)

383 {

384 return false;

385 }

386 if (next_ucs == 0)

387 {

388 OVR_ASSERT(next_ucs_from_utf8 == 0);

389 break;

390 }

391 }

392

393 return true;

394 }

395

396

// Test helper: prints a zero-terminated byte string to stdout. Newlines and
// bytes in the range 32..127 are printed as-is; every other byte is shown
// as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* p = line; *p != 0; ++p)
    {
        const unsigned char ch = (unsigned char) *p;
        const bool printAsIs = (ch == '\n') || (ch >= 32 && ch <= 127);

        if (printAsIs)
        {
            printf("%c", ch);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<0x%02X>", (int) ch);
        }
    }
}

419

420

421 void log_ucs(const UInt32* line)

422 {

423 for (;;)

424 {

425 UInt32 uc = *line++;

426 if (uc == 0)

427 {

428 // End of line.

429 return;

430 }

431 else if (uc != '\n'

432 && (uc < 32 || uc > 127))

433 {

434 // Non-printable as plain ASCII.

435 printf("<U-%04X>", uc);

436 }

437 else

438 {

439 printf("%c", (char) uc);

440 }

441 }

442 }

443

444

445 // Simple canned test.

446 int main(int argc, const char* argv[])

447 {

448 {

449 const char* test8 = "Ignacio Castaño";

450 const UInt32 test32[] =

451 {

452 0x49, 0x67, 0x6E, 0x61, 0x63,

453 0x69, 0x6F, 0x20, 0x43, 0x61,

454 0x73, 0x74, 0x61, 0xF1, 0x6F,

455 0x00

456 };

457

458 OVR_ASSERT(check_equal(test8, test32));

459 }

460

461 // If user passed an arg, try reading the file as UTF-8 encoded text.

462 if (argc > 1)

463 {

464 const char* filename = argv[1];

465 FILE* fp = fopen(filename, "rb");

466 if (fp == NULL)

467 {

468 printf("Can't open file '%s'\n", filename);

469 return 1;

470 }

471

472 // Read lines from the file, encode/decode them, and highlight discrepancies.

473 const int LINE_SIZE = 200; // max line size

474 char line_buffer_utf8[LINE_SIZE];

475 char reencoded_utf8[6 * LINE_SIZE];

476 UInt32 line_buffer_ucs[LINE_SIZE];

477

478 int byte_counter = 0;

479 for (;;)

480 {

481 int c = fgetc(fp);

482 if (c == EOF)

483 {

484 // Done.

485 break;

486 }

487 line_buffer_utf8[byte_counter++] = c;

488 if (c == '\n' || byte_counter >= LINE_SIZE - 2)

489 {

490 // End of line. Process the line.

491 line_buffer_utf8[byte_counter++] = 0; // terminate.

492

493 // Decode into UCS.

494 const char* p = line_buffer_utf8;

495 UInt32* q = line_buffer_ucs;

496 for (;;)

497 {

498 UInt32 uc = UTF8Util::DecodeNextChar(&p);

499 *q++ = uc;

500

501 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);

502 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);

503

504 if (uc == 0) break;

505 }

506

507 // Encode back into UTF-8.

508 q = line_buffer_ucs;

509 int index = 0;

510 for (;;)

511 {

512 UInt32 uc = *q++;

513 OVR_ASSERT(index < LINE_SIZE * 6 - 6);

514 int last_index = index;

515 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);

516 OVR_ASSERT(index <= last_index + 6);

517 if (uc == 0) break;

518 }

519

520 // This can be useful for debugging.

521 #if 0

522 // Show the UCS and the re-encoded UTF-8.

523 log_ucs(line_buffer_ucs);

524 log_ascii(reencoded_utf8);

525 #endif // 0

526

527 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));

528 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));

529

530 // Start next line.

531 byte_counter = 0;

532 }

533 }

534

535 fclose(fp);

536 }

537

538 return 0;

539 }

540

541

542 #endif // UTF8_UNIT_TEST

543

544 }} // namespace UTF8Util::OVR

545