nuclear@0: /************************************************************************** nuclear@0: nuclear@0: Filename : OVR_UTF8Util.cpp nuclear@0: Content : UTF8 Unicode character encoding/decoding support nuclear@0: Created : September 19, 2012 nuclear@0: Notes : nuclear@0: Notes : Much useful info at "UTF-8 and Unicode FAQ" nuclear@0: http://www.cl.cam.ac.uk/~mgk25/unicode.html nuclear@0: nuclear@0: Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved. nuclear@0: nuclear@0: Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License"); nuclear@0: you may not use the Oculus VR Rift SDK except in compliance with the License, nuclear@0: which is provided at the time of installation or download, or which nuclear@0: otherwise accompanies this software in either electronic or hard copy form. nuclear@0: nuclear@0: You may obtain a copy of the License at nuclear@0: nuclear@0: http://www.oculusvr.com/licenses/LICENSE-3.2 nuclear@0: nuclear@0: Unless required by applicable law or agreed to in writing, the Oculus VR SDK nuclear@0: distributed under the License is distributed on an "AS IS" BASIS, nuclear@0: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. nuclear@0: See the License for the specific language governing permissions and nuclear@0: limitations under the License. nuclear@0: nuclear@0: ************************************************************************************/ nuclear@0: nuclear@0: #include "OVR_UTF8Util.h" nuclear@0: nuclear@0: namespace OVR { namespace UTF8Util { nuclear@0: nuclear@0: intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen) nuclear@0: { nuclear@0: const char* p = buf; nuclear@0: intptr_t length = 0; nuclear@0: nuclear@0: if (buflen != -1) nuclear@0: { nuclear@0: while (p - buf < buflen) nuclear@0: { nuclear@0: // We should be able to have ASStrings with 0 in the middle. nuclear@0: UTF8Util::DecodeNextChar_Advance0(&p); nuclear@0: length++; nuclear@0: } nuclear@0: } nuclear@0: else nuclear@0: { nuclear@0: while (UTF8Util::DecodeNextChar_Advance0(&p)) nuclear@0: length++; nuclear@0: } nuclear@0: nuclear@0: return length; nuclear@0: } nuclear@0: nuclear@0: uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length) nuclear@0: { nuclear@0: const char* buf = putf8str; nuclear@0: uint32_t c = 0; nuclear@0: nuclear@0: if (length != -1) nuclear@0: { nuclear@0: while (buf - putf8str < length) nuclear@0: { nuclear@0: c = UTF8Util::DecodeNextChar_Advance0(&buf); nuclear@0: if (index == 0) nuclear@0: return c; nuclear@0: index--; nuclear@0: } nuclear@0: nuclear@0: return c; nuclear@0: } nuclear@0: nuclear@0: do nuclear@0: { nuclear@0: c = UTF8Util::DecodeNextChar_Advance0(&buf); nuclear@0: index--; nuclear@0: nuclear@0: if (c == 0) nuclear@0: { nuclear@0: // We've hit the end of the string; don't go further. nuclear@0: OVR_ASSERT(index == 0); nuclear@0: return c; nuclear@0: } nuclear@0: } while (index >= 0); nuclear@0: nuclear@0: return c; nuclear@0: } nuclear@0: nuclear@0: intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length) nuclear@0: { nuclear@0: const char* buf = putf8str; nuclear@0: nuclear@0: if (length != -1) nuclear@0: { nuclear@0: while ((buf - putf8str) < length && index > 0) nuclear@0: { nuclear@0: UTF8Util::DecodeNextChar_Advance0(&buf); nuclear@0: index--; nuclear@0: } nuclear@0: nuclear@0: return buf-putf8str; nuclear@0: } nuclear@0: nuclear@0: while (index > 0) nuclear@0: { nuclear@0: uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf); nuclear@0: index--; nuclear@0: nuclear@0: if (c == 0) nuclear@0: return buf-putf8str; nuclear@0: }; nuclear@0: nuclear@0: return buf-putf8str; nuclear@0: } nuclear@0: nuclear@0: int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character) nuclear@0: { nuclear@0: if (ucs_character <= 0x7F) nuclear@0: return 1; nuclear@0: else if (ucs_character <= 0x7FF) nuclear@0: return 2; nuclear@0: else if (ucs_character <= 0xFFFF) nuclear@0: return 3; nuclear@0: else if (ucs_character <= 0x1FFFFF) nuclear@0: return 4; nuclear@0: else if (ucs_character <= 0x3FFFFFF) nuclear@0: return 5; nuclear@0: else if (ucs_character <= 0x7FFFFFFF) nuclear@0: return 6; nuclear@0: else nuclear@0: return 0; nuclear@0: } nuclear@0: nuclear@0: uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer) nuclear@0: { nuclear@0: uint32_t uc; nuclear@0: char c; nuclear@0: nuclear@0: // Security considerations: nuclear@0: // nuclear@0: // Changed, this is now only the case for DecodeNextChar: nuclear@0: // - If we hit a zero byte, we want to return 0 without stepping nuclear@0: // the buffer pointer past the 0. th nuclear@0: // nuclear@0: // If we hit an "overlong sequence"; i.e. a character encoded nuclear@0: // in a longer multibyte string than is necessary, then we nuclear@0: // need to discard the character. This is so attackers can't nuclear@0: // disguise dangerous characters or character sequences -- nuclear@0: // there is only one valid encoding for each character. nuclear@0: // nuclear@0: // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE, nuclear@0: // 0xFFFF } then we ignore them; they are not valid in UTF-8. nuclear@0: nuclear@0: // This isn't actually an invalid character; it's a valid char that nuclear@0: // looks like an inverted question mark. nuclear@0: #define INVALID_CHAR 0x0FFFD nuclear@0: nuclear@0: #define FIRST_BYTE(mask, shift) \ nuclear@0: uc = (c & (mask)) << (shift); nuclear@0: nuclear@0: #define NEXT_BYTE(shift) \ nuclear@0: c = **putf8Buffer; \ nuclear@0: if (c == 0) return 0; /* end of buffer, do not advance */ \ nuclear@0: if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \ nuclear@0: (*putf8Buffer)++; \ nuclear@0: uc |= (c & 0x3F) << shift; nuclear@0: nuclear@0: c = **putf8Buffer; nuclear@0: (*putf8Buffer)++; nuclear@0: if (c == 0) nuclear@0: return 0; // End of buffer. nuclear@0: nuclear@0: if ((c & 0x80) == 0) return (uint32_t) c; // Conventional 7-bit ASCII. nuclear@0: nuclear@0: // Multi-byte sequences. nuclear@0: if ((c & 0xE0) == 0xC0) nuclear@0: { nuclear@0: // Two-byte sequence. nuclear@0: FIRST_BYTE(0x1F, 6); nuclear@0: NEXT_BYTE(0); nuclear@0: if (uc < 0x80) return INVALID_CHAR; // overlong nuclear@0: return uc; nuclear@0: } nuclear@0: else if ((c & 0xF0) == 0xE0) nuclear@0: { nuclear@0: // Three-byte sequence. nuclear@0: FIRST_BYTE(0x0F, 12); nuclear@0: NEXT_BYTE(6); nuclear@0: NEXT_BYTE(0); nuclear@0: if (uc < 0x800) return INVALID_CHAR; // overlong nuclear@0: // Not valid ISO 10646, but Flash requires these to work nuclear@0: // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0) nuclear@0: // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR; nuclear@0: // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646 nuclear@0: return uc; nuclear@0: } nuclear@0: else if ((c & 0xF8) == 0xF0) nuclear@0: { nuclear@0: // Four-byte sequence. nuclear@0: FIRST_BYTE(0x07, 18); nuclear@0: NEXT_BYTE(12); nuclear@0: NEXT_BYTE(6); nuclear@0: NEXT_BYTE(0); nuclear@0: if (uc < 0x010000) return INVALID_CHAR; // overlong nuclear@0: return uc; nuclear@0: } nuclear@0: else if ((c & 0xFC) == 0xF8) nuclear@0: { nuclear@0: // Five-byte sequence. nuclear@0: FIRST_BYTE(0x03, 24); nuclear@0: NEXT_BYTE(18); nuclear@0: NEXT_BYTE(12); nuclear@0: NEXT_BYTE(6); nuclear@0: NEXT_BYTE(0); nuclear@0: if (uc < 0x0200000) return INVALID_CHAR; // overlong nuclear@0: return uc; nuclear@0: } nuclear@0: else if ((c & 0xFE) == 0xFC) nuclear@0: { nuclear@0: // Six-byte sequence. nuclear@0: FIRST_BYTE(0x01, 30); nuclear@0: NEXT_BYTE(24); nuclear@0: NEXT_BYTE(18); nuclear@0: NEXT_BYTE(12); nuclear@0: NEXT_BYTE(6); nuclear@0: NEXT_BYTE(0); nuclear@0: if (uc < 0x04000000) return INVALID_CHAR; // overlong nuclear@0: return uc; nuclear@0: } nuclear@0: else nuclear@0: { nuclear@0: // Invalid. nuclear@0: return INVALID_CHAR; nuclear@0: } nuclear@0: } nuclear@0: nuclear@0: nuclear@0: void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character) nuclear@0: { nuclear@0: if (ucs_character <= 0x7F) nuclear@0: { nuclear@0: // Plain single-byte ASCII. nuclear@0: pbuffer[(*pindex)++] = (char) ucs_character; nuclear@0: } nuclear@0: else if (ucs_character <= 0x7FF) nuclear@0: { nuclear@0: // Two bytes. nuclear@0: pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); nuclear@0: } nuclear@0: else if (ucs_character <= 0xFFFF) nuclear@0: { nuclear@0: // Three bytes. nuclear@0: pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); nuclear@0: } nuclear@0: else if (ucs_character <= 0x1FFFFF) nuclear@0: { nuclear@0: // Four bytes. nuclear@0: pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); nuclear@0: } nuclear@0: else if (ucs_character <= 0x3FFFFFF) nuclear@0: { nuclear@0: // Five bytes. nuclear@0: pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); nuclear@0: } nuclear@0: else if (ucs_character <= 0x7FFFFFFF) nuclear@0: { nuclear@0: // Six bytes. nuclear@0: pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); nuclear@0: pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); nuclear@0: } nuclear@0: else nuclear@0: { nuclear@0: // Invalid char; don't encode anything. nuclear@0: } nuclear@0: } nuclear@0: nuclear@0: intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length) nuclear@0: { nuclear@0: intptr_t len = 0; nuclear@0: if (length != -1) nuclear@0: for (int i = 0; i < length; i++) nuclear@0: { nuclear@0: len += GetEncodeCharSize(pchar[i]); nuclear@0: } nuclear@0: else nuclear@0: for (int i = 0;; i++) nuclear@0: { nuclear@0: if (pchar[i] == 0) nuclear@0: return len; nuclear@0: len += GetEncodeCharSize(pchar[i]); nuclear@0: } nuclear@0: return len; nuclear@0: } nuclear@0: nuclear@0: void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length) nuclear@0: { nuclear@0: intptr_t ofs = 0; nuclear@0: if (length != -1) nuclear@0: { nuclear@0: for (int i = 0; i < length; i++) nuclear@0: { nuclear@0: EncodeChar(pbuff, &ofs, pchar[i]); nuclear@0: } nuclear@0: } nuclear@0: else nuclear@0: { nuclear@0: for (int i = 0;; i++) nuclear@0: { nuclear@0: if (pchar[i] == 0) nuclear@0: break; nuclear@0: EncodeChar(pbuff, &ofs, pchar[i]); nuclear@0: } nuclear@0: } nuclear@0: pbuff[ofs] = 0; nuclear@0: } nuclear@0: nuclear@0: size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen) nuclear@0: { nuclear@0: wchar_t *pbegin = pbuff; nuclear@0: if (bytesLen == -1) nuclear@0: { nuclear@0: while (1) nuclear@0: { nuclear@0: uint32_t ch = DecodeNextChar_Advance0(&putf8str); nuclear@0: if (ch == 0) nuclear@0: break; nuclear@0: else if (ch >= 0xFFFF) nuclear@0: ch = 0xFFFD; nuclear@0: *pbuff++ = wchar_t(ch); nuclear@0: } nuclear@0: } nuclear@0: else nuclear@0: { nuclear@0: const char* p = putf8str; nuclear@0: while ((p - putf8str) < bytesLen) nuclear@0: { nuclear@0: uint32_t ch = DecodeNextChar_Advance0(&p); nuclear@0: if (ch >= 0xFFFF) nuclear@0: ch = 0xFFFD; nuclear@0: *pbuff++ = wchar_t(ch); nuclear@0: } nuclear@0: } nuclear@0: nuclear@0: *pbuff = 0; nuclear@0: return pbuff - pbegin; nuclear@0: } nuclear@0: nuclear@0: nuclear@0: #ifdef UTF8_UNIT_TEST nuclear@0: nuclear@0: // Compile this test case with something like: nuclear@0: // nuclear@0: // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test nuclear@0: // nuclear@0: // or nuclear@0: // nuclear@0: // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I.. nuclear@0: // nuclear@0: // If possible, try running the test program with the first arg nuclear@0: // pointing at the file: nuclear@0: // nuclear@0: // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt nuclear@0: // nuclear@0: // and examine the results by eye to make sure they are acceptable to nuclear@0: // you. nuclear@0: nuclear@0: nuclear@0: #include "base/utility.h" nuclear@0: #include nuclear@0: nuclear@0: nuclear@0: bool check_equal(const char* utf8_in, const uint32_t* ucs_in) nuclear@0: { nuclear@0: for (;;) nuclear@0: { nuclear@0: uint32_t next_ucs = *ucs_in++; nuclear@0: uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in); nuclear@0: if (next_ucs != next_ucs_from_utf8) nuclear@0: { nuclear@0: return false; nuclear@0: } nuclear@0: if (next_ucs == 0) nuclear@0: { nuclear@0: OVR_ASSERT(next_ucs_from_utf8 == 0); nuclear@0: break; nuclear@0: } nuclear@0: } nuclear@0: nuclear@0: return true; nuclear@0: } nuclear@0: nuclear@0: nuclear@0: void log_ascii(const char* line) nuclear@0: { nuclear@0: for (;;) nuclear@0: { nuclear@0: unsigned char c = (unsigned char) *line++; nuclear@0: if (c == 0) nuclear@0: { nuclear@0: // End of line. nuclear@0: return; nuclear@0: } nuclear@0: else if (c != '\n' nuclear@0: && (c < 32 || c > 127)) nuclear@0: { nuclear@0: // Non-printable as plain ASCII. nuclear@0: printf("<0x%02X>", (int) c); nuclear@0: } nuclear@0: else nuclear@0: { nuclear@0: printf("%c", c); nuclear@0: } nuclear@0: } nuclear@0: } nuclear@0: nuclear@0: nuclear@0: void log_ucs(const uint32_t* line) nuclear@0: { nuclear@0: for (;;) nuclear@0: { nuclear@0: uint32_t uc = *line++; nuclear@0: if (uc == 0) nuclear@0: { nuclear@0: // End of line. nuclear@0: return; nuclear@0: } nuclear@0: else if (uc != '\n' nuclear@0: && (uc < 32 || uc > 127)) nuclear@0: { nuclear@0: // Non-printable as plain ASCII. nuclear@0: printf("", uc); nuclear@0: } nuclear@0: else nuclear@0: { nuclear@0: printf("%c", (char) uc); nuclear@0: } nuclear@0: } nuclear@0: } nuclear@0: nuclear@0: nuclear@0: // Simple canned test. nuclear@0: int main(int argc, const char* argv[]) nuclear@0: { nuclear@0: { nuclear@0: const char* test8 = "Ignacio CastaƱo"; nuclear@0: const uint32_t test32[] = nuclear@0: { nuclear@0: 0x49, 0x67, 0x6E, 0x61, 0x63, nuclear@0: 0x69, 0x6F, 0x20, 0x43, 0x61, nuclear@0: 0x73, 0x74, 0x61, 0xF1, 0x6F, nuclear@0: 0x00 nuclear@0: }; nuclear@0: nuclear@0: OVR_ASSERT(check_equal(test8, test32)); nuclear@0: } nuclear@0: nuclear@0: // If user passed an arg, try reading the file as UTF-8 encoded text. nuclear@0: if (argc > 1) nuclear@0: { nuclear@0: const char* filename = argv[1]; nuclear@0: FILE* fp = fopen(filename, "rb"); nuclear@0: if (fp == NULL) nuclear@0: { nuclear@0: printf("Can't open file '%s'\n", filename); nuclear@0: return 1; nuclear@0: } nuclear@0: nuclear@0: // Read lines from the file, encode/decode them, and highlight discrepancies. nuclear@0: const int LINE_SIZE = 200; // max line size nuclear@0: char line_buffer_utf8[LINE_SIZE]; nuclear@0: char reencoded_utf8[6 * LINE_SIZE]; nuclear@0: uint32_t line_buffer_ucs[LINE_SIZE]; nuclear@0: nuclear@0: int byte_counter = 0; nuclear@0: for (;;) nuclear@0: { nuclear@0: int c = fgetc(fp); nuclear@0: if (c == EOF) nuclear@0: { nuclear@0: // Done. nuclear@0: break; nuclear@0: } nuclear@0: line_buffer_utf8[byte_counter++] = c; nuclear@0: if (c == '\n' || byte_counter >= LINE_SIZE - 2) nuclear@0: { nuclear@0: // End of line. Process the line. nuclear@0: line_buffer_utf8[byte_counter++] = 0; // terminate. nuclear@0: nuclear@0: // Decode into UCS. nuclear@0: const char* p = line_buffer_utf8; nuclear@0: uint32_t* q = line_buffer_ucs; nuclear@0: for (;;) nuclear@0: { nuclear@0: uint32_t uc = UTF8Util::DecodeNextChar(&p); nuclear@0: *q++ = uc; nuclear@0: nuclear@0: OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE); nuclear@0: OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE); nuclear@0: nuclear@0: if (uc == 0) break; nuclear@0: } nuclear@0: nuclear@0: // Encode back into UTF-8. nuclear@0: q = line_buffer_ucs; nuclear@0: int index = 0; nuclear@0: for (;;) nuclear@0: { nuclear@0: uint32_t uc = *q++; nuclear@0: OVR_ASSERT(index < LINE_SIZE * 6 - 6); nuclear@0: int last_index = index; nuclear@0: UTF8Util::EncodeChar(reencoded_utf8, &index, uc); nuclear@0: OVR_ASSERT(index <= last_index + 6); nuclear@0: if (uc == 0) break; nuclear@0: } nuclear@0: nuclear@0: // This can be useful for debugging. nuclear@0: #if 0 nuclear@0: // Show the UCS and the re-encoded UTF-8. nuclear@0: log_ucs(line_buffer_ucs); nuclear@0: log_ascii(reencoded_utf8); nuclear@0: #endif // 0 nuclear@0: nuclear@0: OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs)); nuclear@0: OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs)); nuclear@0: nuclear@0: // Start next line. nuclear@0: byte_counter = 0; nuclear@0: } nuclear@0: } nuclear@0: nuclear@0: fclose(fp); nuclear@0: } nuclear@0: nuclear@0: return 0; nuclear@0: } nuclear@0: nuclear@0: nuclear@0: #endif // UTF8_UNIT_TEST nuclear@0: nuclear@0: }} // namespace UTF8Util::OVR nuclear@0: