ovr_sdk: 1b39a1b46319 LibOVR/Src/Kernel/OVR

ovr_sdk

view LibOVR/Src/Kernel/OVR_UTF8Util.cpp @ 0:1b39a1b46319

initial 0.4.4

author	John Tsiombikas <nuclear@member.fsf.org>
date	Wed, 14 Jan 2015 06:51:16 +0200
parents
children

line source

1 /**************************************************************************

3 Filename : OVR_UTF8Util.cpp

4 Content : UTF8 Unicode character encoding/decoding support

5 Created : September 19, 2012

6 Notes :

7 Notes : Much useful info at "UTF-8 and Unicode FAQ"

8 http://www.cl.cam.ac.uk/~mgk25/unicode.html

12 Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License");

13 you may not use the Oculus VR Rift SDK except in compliance with the License,

14 which is provided at the time of installation or download, or which

15 otherwise accompanies this software in either electronic or hard copy form.

17 You may obtain a copy of the License at

19 http://www.oculusvr.com/licenses/LICENSE-3.2

21 Unless required by applicable law or agreed to in writing, the Oculus VR SDK

22 distributed under the License is distributed on an "AS IS" BASIS,

23 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

24 See the License for the specific language governing permissions and

25 limitations under the License.

27 ************************************************************************************/

29 #include "OVR_UTF8Util.h"

31 namespace OVR { namespace UTF8Util {

33 intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen)

34 {

35 const char* p = buf;

36 intptr_t length = 0;

38 if (buflen != -1)

39 {

40 while (p - buf < buflen)

41 {

42 // We should be able to have ASStrings with 0 in the middle.

43 UTF8Util::DecodeNextChar_Advance0(&p);

44 length++;

45 }

46 }

47 else

48 {

49 while (UTF8Util::DecodeNextChar_Advance0(&p))

50 length++;

51 }

53 return length;

54 }

56 uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length)

57 {

58 const char* buf = putf8str;

59 uint32_t c = 0;

61 if (length != -1)

62 {

63 while (buf - putf8str < length)

64 {

65 c = UTF8Util::DecodeNextChar_Advance0(&buf);

66 if (index == 0)

67 return c;

68 index--;

69 }

71 return c;

72 }

74 do

75 {

76 c = UTF8Util::DecodeNextChar_Advance0(&buf);

77 index--;

79 if (c == 0)

80 {

81 // We've hit the end of the string; don't go further.

82 OVR_ASSERT(index == 0);

83 return c;

84 }

85 } while (index >= 0);

87 return c;

88 }

90 intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length)

91 {

92 const char* buf = putf8str;

94 if (length != -1)

95 {

96 while ((buf - putf8str) < length && index > 0)

97 {

98 UTF8Util::DecodeNextChar_Advance0(&buf);

99 index--;

100 }

101

102 return buf-putf8str;

103 }

104

105 while (index > 0)

106 {

107 uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf);

108 index--;

109

110 if (c == 0)

111 return buf-putf8str;

112 };

113

114 return buf-putf8str;

115 }

116

// Returns the number of bytes EncodeChar emits for 'ucs_character':
// 1..6 for values up to 0x7FFFFFFF, and 0 for anything larger
// (such values are not encoded at all).
int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character)
{
    // Largest value representable by an N-byte sequence, for N = 1..6.
    static const uint32_t maxValueForSize[] =
    {
        0x7F, 0x7FF, 0xFFFF, 0x1FFFFF, 0x3FFFFFF, 0x7FFFFFFF
    };
    const int sizeCount = (int)(sizeof(maxValueForSize) / sizeof(maxValueForSize[0]));

    for (int size = 1; size <= sizeCount; ++size)
    {
        if (ucs_character <= maxValueForSize[size - 1])
            return size;
    }

    return 0;
}

134

// Decodes one UTF-8 encoded character starting at *putf8Buffer and advances
// *putf8Buffer past the bytes that were consumed.
//
// Return value and pointer position:
//  - Lead byte is zero: returns 0 and the pointer IS stepped past the zero
//    byte (this is the "Advance0" behavior).
//  - Plain 7-bit ASCII: returns the byte value.
//  - Well-formed multi-byte sequence of 2 to 6 bytes: returns the decoded
//    value.
//  - Zero byte where a continuation byte was expected: returns 0 and leaves
//    the pointer ON that zero byte.
//  - Non-continuation byte where a continuation byte was expected: returns
//    U+FFFD and leaves the pointer on the offending byte.
//  - Invalid lead byte: returns U+FFFD (the lead byte is consumed).
//  - "Overlong" sequence (a value encoded in more bytes than necessary):
//    returns U+FFFD. Attackers must not be able to disguise characters by
//    using a second encoding of the same value.
//
// Values 0xD800..0xDFFF and 0xFFFE/0xFFFF are returned unchanged. They are
// not valid ISO 10646 characters, but the original implementation disabled
// those checks because Flash requires them to work (see AS3 test
// e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)).
uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
{
    // Returned for malformed input. This isn't actually an invalid character;
    // it is U+FFFD, the Unicode replacement character.
    const uint32_t invalidChar = 0x0FFFD;

    const unsigned char lead = static_cast<unsigned char>(**putf8Buffer);
    (*putf8Buffer)++;

    if (lead == 0)
        return 0;       // End of buffer.

    if ((lead & 0x80) == 0)
        return lead;    // Conventional 7-bit ASCII.

    // Classify the lead byte: how many continuation bytes follow, which
    // payload bits the lead byte carries, and the smallest value that
    // legitimately needs a sequence of this length.
    int      continuationCount;
    uint32_t uc;
    uint32_t minimumValue;

    if ((lead & 0xE0) == 0xC0)
    {
        // Two-byte sequence.
        continuationCount = 1;
        uc                = lead & 0x1F;
        minimumValue      = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        // Three-byte sequence.
        continuationCount = 2;
        uc                = lead & 0x0F;
        minimumValue      = 0x800;
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        // Four-byte sequence.
        continuationCount = 3;
        uc                = lead & 0x07;
        minimumValue      = 0x010000;
    }
    else if ((lead & 0xFC) == 0xF8)
    {
        // Five-byte sequence.
        continuationCount = 4;
        uc                = lead & 0x03;
        minimumValue      = 0x0200000;
    }
    else if ((lead & 0xFE) == 0xFC)
    {
        // Six-byte sequence.
        continuationCount = 5;
        uc                = lead & 0x01;
        minimumValue      = 0x04000000;
    }
    else
    {
        // Stray continuation byte, 0xFE or 0xFF.
        return invalidChar;
    }

    for (int i = 0; i < continuationCount; ++i)
    {
        const unsigned char next = static_cast<unsigned char>(**putf8Buffer);

        if (next == 0)
            return 0;               // End of buffer, do not advance.
        if ((next & 0xC0) != 0x80)
            return invalidChar;     // Not a continuation byte, do not advance.

        (*putf8Buffer)++;
        uc = (uc << 6) | (next & 0x3F);
    }

    if (uc < minimumValue)
        return invalidChar;         // Overlong encoding.

    return uc;
}

237

238

// Appends the UTF-8 encoding of 'ucs_character' to 'pbuffer'.
//
//   pbuffer        - destination byte buffer; the caller must provide room
//                    (up to 6 bytes are written, no bounds are checked here).
//   pindex         - in/out write position within pbuffer; incremented once
//                    per byte written.
//   ucs_character  - value to encode. Values above 0x7FFFFFFF are not
//                    encoded: nothing is written and *pindex is unchanged.
void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character)
{
    if (ucs_character <= 0x7F)
    {
        // Plain single-byte ASCII.
        pbuffer[(*pindex)++] = (char) ucs_character;
        return;
    }

    // Pick the lead-byte marker and the number of 6-bit continuation groups.
    uint32_t leadMarker;
    int      continuationCount;

    if (ucs_character <= 0x7FF)
    {
        leadMarker = 0xC0; continuationCount = 1;   // Two bytes.
    }
    else if (ucs_character <= 0xFFFF)
    {
        leadMarker = 0xE0; continuationCount = 2;   // Three bytes.
    }
    else if (ucs_character <= 0x1FFFFF)
    {
        leadMarker = 0xF0; continuationCount = 3;   // Four bytes.
    }
    else if (ucs_character <= 0x3FFFFFF)
    {
        leadMarker = 0xF8; continuationCount = 4;   // Five bytes.
    }
    else if (ucs_character <= 0x7FFFFFFF)
    {
        leadMarker = 0xFC; continuationCount = 5;   // Six bytes.
    }
    else
    {
        // Invalid char; don't encode anything.
        return;
    }

    // Lead byte carries the bits above all continuation groups.
    int shift = 6 * continuationCount;
    pbuffer[(*pindex)++] = (char)(leadMarker | (ucs_character >> shift));

    // Continuation bytes, most significant group first.
    while (shift > 0)
    {
        shift -= 6;
        pbuffer[(*pindex)++] = (char)(0x80 | ((ucs_character >> shift) & 0x3F));
    }
}

291

292 intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length)

293 {

294 intptr_t len = 0;

295 if (length != -1)

296 for (int i = 0; i < length; i++)

297 {

298 len += GetEncodeCharSize(pchar[i]);

299 }

300 else

301 for (int i = 0;; i++)

302 {

303 if (pchar[i] == 0)

304 return len;

305 len += GetEncodeCharSize(pchar[i]);

306 }

307 return len;

308 }

309

310 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length)

311 {

312 intptr_t ofs = 0;

313 if (length != -1)

314 {

315 for (int i = 0; i < length; i++)

316 {

317 EncodeChar(pbuff, &ofs, pchar[i]);

318 }

319 }

320 else

321 {

322 for (int i = 0;; i++)

323 {

324 if (pchar[i] == 0)

325 break;

326 EncodeChar(pbuff, &ofs, pchar[i]);

327 }

328 }

329 pbuff[ofs] = 0;

330 }

331

332 size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen)

333 {

334 wchar_t *pbegin = pbuff;

335 if (bytesLen == -1)

336 {

337 while (1)

338 {

339 uint32_t ch = DecodeNextChar_Advance0(&putf8str);

340 if (ch == 0)

341 break;

342 else if (ch >= 0xFFFF)

343 ch = 0xFFFD;

344 *pbuff++ = wchar_t(ch);

345 }

346 }

347 else

348 {

349 const char* p = putf8str;

350 while ((p - putf8str) < bytesLen)

351 {

352 uint32_t ch = DecodeNextChar_Advance0(&p);

353 if (ch >= 0xFFFF)

354 ch = 0xFFFD;

355 *pbuff++ = wchar_t(ch);

356 }

357 }

358

359 *pbuff = 0;

360 return pbuff - pbegin;

361 }

362

363

364 #ifdef UTF8_UNIT_TEST

365

366 // Compile this test case with something like:

367 //

368 // gcc OVR_UTF8Util.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test

369 //

370 // or

371 //

372 // cl OVR_UTF8Util.cpp -Zi -Od -DUTF8_UNIT_TEST -I..

373 //

374 // If possible, try running the test program with the first arg

375 // pointing at the file:

376 //

377 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

378 //

379 // and examine the results by eye to make sure they are acceptable to

380 // you.

381

382

383 #include "base/utility.h"

384 #include <stdio.h>

385

386

387 bool check_equal(const char* utf8_in, const uint32_t* ucs_in)

388 {

389 for (;;)

390 {

391 uint32_t next_ucs = *ucs_in++;

392 uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);

393 if (next_ucs != next_ucs_from_utf8)

394 {

395 return false;

396 }

397 if (next_ucs == 0)

398 {

399 OVR_ASSERT(next_ucs_from_utf8 == 0);

400 break;

401 }

402 }

403

404 return true;

405 }

406

407

// Unit-test helper: prints a zero-terminated byte string to stdout.
// Newlines and bytes in the range 32..127 are printed as-is; every other
// byte is printed as "<0xNN>".
void log_ascii(const char* line)
{
    for (const char* p = line; *p != 0; ++p)
    {
        const unsigned char byte = (unsigned char) *p;
        const bool printAsIs = (byte == '\n') || (byte >= 32 && byte <= 127);

        if (printAsIs)
        {
            printf("%c", byte);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<0x%02X>", (int) byte);
        }
    }
}

430

431

// Unit-test helper: prints a zero-terminated array of character values to
// stdout. Newlines and values in the range 32..127 are printed as
// characters; every other value is printed as "<U-XXXX>".
void log_ucs(const uint32_t* line)
{
    for (const uint32_t* p = line; *p != 0; ++p)
    {
        const uint32_t value = *p;
        const bool printAsIs = (value == '\n') || (value >= 32 && value <= 127);

        if (printAsIs)
        {
            printf("%c", (char) value);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<U-%04X>", value);
        }
    }
}

454

455

456 // Simple canned test.

457 int main(int argc, const char* argv[])

458 {

459 {

460 const char* test8 = "Ignacio Castaño";

461 const uint32_t test32[] =

462 {

463 0x49, 0x67, 0x6E, 0x61, 0x63,

464 0x69, 0x6F, 0x20, 0x43, 0x61,

465 0x73, 0x74, 0x61, 0xF1, 0x6F,

466 0x00

467 };

468

469 OVR_ASSERT(check_equal(test8, test32));

470 }

471

472 // If user passed an arg, try reading the file as UTF-8 encoded text.

473 if (argc > 1)

474 {

475 const char* filename = argv[1];

476 FILE* fp = fopen(filename, "rb");

477 if (fp == NULL)

478 {

479 printf("Can't open file '%s'\n", filename);

480 return 1;

481 }

482

483 // Read lines from the file, encode/decode them, and highlight discrepancies.

484 const int LINE_SIZE = 200; // max line size

485 char line_buffer_utf8[LINE_SIZE];

486 char reencoded_utf8[6 * LINE_SIZE];

487 uint32_t line_buffer_ucs[LINE_SIZE];

488

489 int byte_counter = 0;

490 for (;;)

491 {

492 int c = fgetc(fp);

493 if (c == EOF)

494 {

495 // Done.

496 break;

497 }

498 line_buffer_utf8[byte_counter++] = c;

499 if (c == '\n' || byte_counter >= LINE_SIZE - 2)

500 {

501 // End of line. Process the line.

502 line_buffer_utf8[byte_counter++] = 0; // terminate.

503

504 // Decode into UCS.

505 const char* p = line_buffer_utf8;

506 uint32_t* q = line_buffer_ucs;

507 for (;;)

508 {

509 uint32_t uc = UTF8Util::DecodeNextChar(&p);

510 *q++ = uc;

511

512 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);

513 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);

514

515 if (uc == 0) break;

516 }

517

518 // Encode back into UTF-8.

519 q = line_buffer_ucs;

520 int index = 0;

521 for (;;)

522 {

523 uint32_t uc = *q++;

524 OVR_ASSERT(index < LINE_SIZE * 6 - 6);

525 int last_index = index;

526 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);

527 OVR_ASSERT(index <= last_index + 6);

528 if (uc == 0) break;

529 }

530

531 // This can be useful for debugging.

532 #if 0

533 // Show the UCS and the re-encoded UTF-8.

534 log_ucs(line_buffer_ucs);

535 log_ascii(reencoded_utf8);

536 #endif // 0

537

538 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));

539 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));

540

541 // Start next line.

542 byte_counter = 0;

543 }

544 }

545

546 fclose(fp);

547 }

548

549 return 0;

550 }

551

552

553 #endif // UTF8_UNIT_TEST

554

555 }} // namespace UTF8Util::OVR

556