oculus1

diff libovr/Src/Kernel/OVR_UTF8Util.cpp @ 3:b069a5c27388
added a couple more stuff, fixed all the LibOVR line endings
author: John Tsiombikas <nuclear@member.fsf.org>
date: Sun, 15 Sep 2013 04:10:05 +0300
parents: e2f9e4603129
     1.1 --- a/libovr/Src/Kernel/OVR_UTF8Util.cpp	Sat Sep 14 17:51:03 2013 +0300
     1.2 +++ b/libovr/Src/Kernel/OVR_UTF8Util.cpp	Sun Sep 15 04:10:05 2013 +0300
     1.3 @@ -1,1 +1,545 @@
     1.4 -/**************************************************************************
     1.5 
     1.6 Filename    :   OVR_UTF8Util.cpp
     1.7 Content     :   UTF8 Unicode character encoding/decoding support
     1.8 Created     :   September 19, 2012
     1.9 Notes       : 
    1.10 Notes       :   Much useful info at "UTF-8 and Unicode FAQ"
    1.11                 http://www.cl.cam.ac.uk/~mgk25/unicode.html
    1.12 
    1.13 Copyright   :   Copyright 2012 Oculus VR, Inc. All Rights reserved.
    1.14 
    1.15 Use of this software is subject to the terms of the Oculus license
    1.16 agreement provided at the time of installation or download, or which
    1.17 otherwise accompanies this software in either electronic or hard copy form.
    1.18 
    1.19 ************************************************************************************/
    1.20 
    1.21 #include "OVR_UTF8Util.h"
    1.22 
    1.23 namespace OVR { namespace UTF8Util {
    1.24 
    1.25 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
    1.26 {
    1.27     const char* p = buf;
    1.28     SPInt length = 0;
    1.29 
    1.30     if (buflen != -1)
    1.31     {
    1.32         while (p - buf < buflen)
    1.33         {
    1.34             // We should be able to have ASStrings with 0 in the middle.
    1.35             UTF8Util::DecodeNextChar_Advance0(&p);
    1.36             length++;
    1.37         }
    1.38     }
    1.39     else
    1.40     {
    1.41         while (UTF8Util::DecodeNextChar_Advance0(&p))
    1.42             length++;
    1.43     }
    1.44     
    1.45     return length;
    1.46 }
    1.47 
    1.48 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
    1.49 {
    1.50     const char* buf = putf8str;
    1.51     UInt32  c = 0;
    1.52 
    1.53     if (length != -1)
    1.54     {
    1.55         while (buf - putf8str < length)
    1.56         {           
    1.57             c = UTF8Util::DecodeNextChar_Advance0(&buf);
    1.58             if (index == 0)
    1.59                 return c;
    1.60             index--;
    1.61         }
    1.62 
    1.63         return c;
    1.64     }
    1.65 
    1.66     do 
    1.67     {
    1.68         c = UTF8Util::DecodeNextChar_Advance0(&buf);
    1.69         index--;
    1.70 
    1.71         if (c == 0)
    1.72         {
    1.73             // We've hit the end of the string; don't go further.
    1.74             OVR_ASSERT(index == 0);
    1.75             return c;
    1.76         }
    1.77     } while (index >= 0);
    1.78 
    1.79     return c;
    1.80 }
    1.81 
    1.82 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
    1.83 {
    1.84     const char* buf = putf8str;
    1.85 
    1.86     if (length != -1)
    1.87     {
    1.88         while ((buf - putf8str) < length && index > 0)
    1.89         {
    1.90             UTF8Util::DecodeNextChar_Advance0(&buf);
    1.91             index--;
    1.92         }
    1.93 
    1.94         return buf-putf8str;
    1.95     }
    1.96 
    1.97     while (index > 0) 
    1.98     {
    1.99         UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);
   1.100         index--;
   1.101 
   1.102         if (c == 0)
   1.103             return buf-putf8str;
   1.104     };
   1.105 
   1.106     return buf-putf8str;
   1.107 }
   1.108 
   1.109 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
   1.110 {
   1.111     if (ucs_character <= 0x7F)
   1.112         return 1;
   1.113     else if (ucs_character <= 0x7FF)
   1.114         return 2;
   1.115     else if (ucs_character <= 0xFFFF)
   1.116         return 3;
   1.117     else if (ucs_character <= 0x1FFFFF)
   1.118         return 4;
   1.119     else if (ucs_character <= 0x3FFFFFF)
   1.120         return 5;
   1.121     else if (ucs_character <= 0x7FFFFFFF)
   1.122         return 6;
   1.123     else
   1.124         return 0;
   1.125 }
   1.126 
   1.127 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
   1.128 {
   1.129     UInt32  uc;
   1.130     char    c;
   1.131     
   1.132     // Security considerations:
   1.133     //
   1.134     // Changed, this is now only the case for DecodeNextChar:
   1.135     //  - If we hit a zero byte, we want to return 0 without stepping
   1.136     //    the buffer pointer past the 0. th
   1.137     //
   1.138     // If we hit an "overlong sequence"; i.e. a character encoded
   1.139     // in a longer multibyte string than is necessary, then we
   1.140     // need to discard the character.  This is so attackers can't
   1.141     // disguise dangerous characters or character sequences --
   1.142     // there is only one valid encoding for each character.
   1.143     //
   1.144     // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
   1.145     // 0xFFFF } then we ignore them; they are not valid in UTF-8.
   1.146     
   1.147     // This isn't actually an invalid character; it's a valid char that
   1.148     // looks like an inverted question mark.
   1.149 #define INVALID_CHAR 0x0FFFD
   1.150     
   1.151 #define FIRST_BYTE(mask, shift)     \
   1.152     uc = (c & (mask)) << (shift);
   1.153     
   1.154 #define NEXT_BYTE(shift) \
   1.155     c = **putf8Buffer;   \
   1.156     if (c == 0) return 0; /* end of buffer, do not advance */   \
   1.157     if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */  \
   1.158     (*putf8Buffer)++;    \
   1.159     uc |= (c & 0x3F) << shift;
   1.160     
   1.161     c = **putf8Buffer;
   1.162     (*putf8Buffer)++;
   1.163     if (c == 0)
   1.164         return 0;   // End of buffer.
   1.165     
   1.166     if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.
   1.167     
   1.168     // Multi-byte sequences.
   1.169     if ((c & 0xE0) == 0xC0)
   1.170     {
   1.171         // Two-byte sequence.
   1.172         FIRST_BYTE(0x1F, 6);
   1.173         NEXT_BYTE(0);
   1.174         if (uc < 0x80) return INVALID_CHAR;  // overlong
   1.175         return uc;
   1.176     }
   1.177     else if ((c & 0xF0) == 0xE0)
   1.178     {
   1.179         // Three-byte sequence.
   1.180         FIRST_BYTE(0x0F, 12);
   1.181         NEXT_BYTE(6);
   1.182         NEXT_BYTE(0);
   1.183         if (uc < 0x800) return INVALID_CHAR; // overlong
   1.184         // Not valid ISO 10646, but Flash requires these to work
   1.185         // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)
   1.186         // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;
   1.187         // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646
   1.188         return uc;
   1.189     }
   1.190     else if ((c & 0xF8) == 0xF0)
   1.191     {
   1.192         // Four-byte sequence.
   1.193         FIRST_BYTE(0x07, 18);
   1.194         NEXT_BYTE(12);
   1.195         NEXT_BYTE(6);
   1.196         NEXT_BYTE(0);
   1.197         if (uc < 0x010000) return INVALID_CHAR;  // overlong
   1.198         return uc;
   1.199     }
   1.200     else if ((c & 0xFC) == 0xF8)
   1.201     {
   1.202         // Five-byte sequence.
   1.203         FIRST_BYTE(0x03, 24);
   1.204         NEXT_BYTE(18);
   1.205         NEXT_BYTE(12);
   1.206         NEXT_BYTE(6);
   1.207         NEXT_BYTE(0);
   1.208         if (uc < 0x0200000) return INVALID_CHAR; // overlong
   1.209         return uc;
   1.210     }
   1.211     else if ((c & 0xFE) == 0xFC)
   1.212     {
   1.213         // Six-byte sequence.
   1.214         FIRST_BYTE(0x01, 30);
   1.215         NEXT_BYTE(24);
   1.216         NEXT_BYTE(18);
   1.217         NEXT_BYTE(12);
   1.218         NEXT_BYTE(6);
   1.219         NEXT_BYTE(0);
   1.220         if (uc < 0x04000000) return INVALID_CHAR;    // overlong
   1.221         return uc;
   1.222     }
   1.223     else
   1.224     {
   1.225         // Invalid.
   1.226         return INVALID_CHAR;
   1.227     }
   1.228 }
   1.229 
   1.230 
   1.231 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
   1.232 {
   1.233     if (ucs_character <= 0x7F)
   1.234     {
   1.235         // Plain single-byte ASCII.
   1.236         pbuffer[(*pindex)++] = (char) ucs_character;
   1.237     }
   1.238     else if (ucs_character <= 0x7FF)
   1.239     {
   1.240         // Two bytes.
   1.241         pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
   1.242         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.243     }
   1.244     else if (ucs_character <= 0xFFFF)
   1.245     {
   1.246         // Three bytes.
   1.247         pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
   1.248         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.249         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.250     }
   1.251     else if (ucs_character <= 0x1FFFFF)
   1.252     {
   1.253         // Four bytes.
   1.254         pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
   1.255         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
   1.256         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.257         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.258     }
   1.259     else if (ucs_character <= 0x3FFFFFF)
   1.260     {
   1.261         // Five bytes.
   1.262         pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
   1.263         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
   1.264         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
   1.265         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.266         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.267     }
   1.268     else if (ucs_character <= 0x7FFFFFFF)
   1.269     {
   1.270         // Six bytes.
   1.271         pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
   1.272         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
   1.273         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
   1.274         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
   1.275         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.276         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.277     }
   1.278     else
   1.279     {
   1.280         // Invalid char; don't encode anything.
   1.281     }
   1.282 }
   1.283 
   1.284 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
   1.285 {
   1.286     SPInt len = 0;
   1.287     if (length != -1)
   1.288         for (int i = 0; i < length; i++)
   1.289         {
   1.290             len += GetEncodeCharSize(pchar[i]);
   1.291         }
   1.292     else
   1.293         for (int i = 0;; i++)
   1.294         {
   1.295             if (pchar[i] == 0)
   1.296                 return len;
   1.297             len += GetEncodeCharSize(pchar[i]);
   1.298         }
   1.299     return len;
   1.300 }
   1.301 
   1.302 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
   1.303 {
   1.304     SPInt ofs = 0;
   1.305     if (length != -1)
   1.306     {
   1.307         for (int i = 0; i < length; i++)
   1.308         {            
   1.309             EncodeChar(pbuff, &ofs, pchar[i]);
   1.310         }
   1.311     }
   1.312     else
   1.313     {
   1.314         for (int i = 0;; i++)
   1.315         {
   1.316             if (pchar[i] == 0)
   1.317                 break;
   1.318             EncodeChar(pbuff, &ofs, pchar[i]);
   1.319         }
   1.320     }
   1.321     pbuff[ofs] = 0;
   1.322 }
   1.323 
   1.324 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
   1.325 {
   1.326     wchar_t *pbegin = pbuff;
   1.327     if (bytesLen == -1)
   1.328     {
   1.329         while (1)
   1.330         {
   1.331             UInt32 ch = DecodeNextChar_Advance0(&putf8str);
   1.332             if (ch == 0)
   1.333                 break;
   1.334             else if (ch >= 0xFFFF)
   1.335                 ch = 0xFFFD;
   1.336             *pbuff++ = wchar_t(ch);
   1.337         }
   1.338     }
   1.339     else
   1.340     {
   1.341         const char* p = putf8str;
   1.342         while ((p - putf8str) < bytesLen)
   1.343         {
   1.344             UInt32 ch = DecodeNextChar_Advance0(&p);
   1.345             if (ch >= 0xFFFF)
   1.346                 ch = 0xFFFD;
   1.347             *pbuff++ = wchar_t(ch);
   1.348         }
   1.349     }
   1.350 
   1.351     *pbuff = 0;
   1.352     return pbuff - pbegin;
   1.353 }
   1.354 
   1.355 
   1.356 #ifdef UTF8_UNIT_TEST
   1.357 
   1.358 // Compile this test case with something like:
   1.359 //
   1.360 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
   1.361 //
   1.362 //    or
   1.363 //
   1.364 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
   1.365 //
   1.366 // If possible, try running the test program with the first arg
   1.367 // pointing at the file:
   1.368 //
   1.369 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
   1.370 // 
   1.371 // and examine the results by eye to make sure they are acceptable to
   1.372 // you.
   1.373 
   1.374 
   1.375 #include "base/utility.h"
   1.376 #include <stdio.h>
   1.377 
   1.378 
   1.379 bool    check_equal(const char* utf8_in, const UInt32* ucs_in)
   1.380 {
   1.381     for (;;)
   1.382     {
   1.383         UInt32  next_ucs = *ucs_in++;
   1.384         UInt32  next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
   1.385         if (next_ucs != next_ucs_from_utf8)
   1.386         {
   1.387             return false;
   1.388         }
   1.389         if (next_ucs == 0)
   1.390         {
   1.391             OVR_ASSERT(next_ucs_from_utf8 == 0);
   1.392             break;
   1.393         }
   1.394     }
   1.395     
   1.396     return true;
   1.397 }
   1.398 
   1.399 
   1.400 void    log_ascii(const char* line)
   1.401 {
   1.402     for (;;)
   1.403     {
   1.404         unsigned char   c = (unsigned char) *line++;
   1.405         if (c == 0)
   1.406         {
   1.407             // End of line.
   1.408             return;
   1.409         }
   1.410         else if (c != '\n'
   1.411             && (c < 32 || c > 127))
   1.412         {
   1.413             // Non-printable as plain ASCII.
   1.414             printf("<0x%02X>", (int) c);
   1.415         }
   1.416         else
   1.417         {
   1.418             printf("%c", c);
   1.419         }
   1.420     }
   1.421 }
   1.422 
   1.423 
   1.424 void    log_ucs(const UInt32* line)
   1.425 {
   1.426     for (;;)
   1.427     {
   1.428         UInt32  uc = *line++;
   1.429         if (uc == 0)
   1.430         {
   1.431             // End of line.
   1.432             return;
   1.433         }
   1.434         else if (uc != '\n'
   1.435             && (uc < 32 || uc > 127))
   1.436         {
   1.437             // Non-printable as plain ASCII.
   1.438             printf("<U-%04X>", uc);
   1.439         }
   1.440         else
   1.441         {
   1.442             printf("%c", (char) uc);
   1.443         }
   1.444     }
   1.445 }
   1.446 
   1.447 
   1.448 // Simple canned test.
   1.449 int main(int argc, const char* argv[])
   1.450 {
   1.451     {
   1.452         const char* test8 = "Ignacio Castaño";
   1.453         const UInt32    test32[] =
   1.454         {
   1.455             0x49, 0x67, 0x6E, 0x61, 0x63,
   1.456                 0x69, 0x6F, 0x20, 0x43, 0x61,
   1.457                 0x73, 0x74, 0x61, 0xF1, 0x6F,
   1.458                 0x00
   1.459         };
   1.460         
   1.461         OVR_ASSERT(check_equal(test8, test32));
   1.462     }
   1.463         
   1.464         // If user passed an arg, try reading the file as UTF-8 encoded text.
   1.465         if (argc > 1)
   1.466         {
   1.467             const char* filename = argv[1];
   1.468             FILE*   fp = fopen(filename, "rb");
   1.469             if (fp == NULL)
   1.470             {
   1.471                 printf("Can't open file '%s'\n", filename);
   1.472                 return 1;
   1.473             }
   1.474             
   1.475             // Read lines from the file, encode/decode them, and highlight discrepancies.
   1.476             const int LINE_SIZE = 200;  // max line size
   1.477             char    line_buffer_utf8[LINE_SIZE];
   1.478             char    reencoded_utf8[6 * LINE_SIZE];
   1.479             UInt32  line_buffer_ucs[LINE_SIZE];
   1.480             
   1.481             int byte_counter = 0;
   1.482             for (;;)
   1.483             {
   1.484                 int c = fgetc(fp);
   1.485                 if (c == EOF)
   1.486                 {
   1.487                     // Done.
   1.488                     break;
   1.489                 }
   1.490                 line_buffer_utf8[byte_counter++] = c;
   1.491                 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
   1.492                 {
   1.493                     // End of line.  Process the line.
   1.494                     line_buffer_utf8[byte_counter++] = 0;   // terminate.
   1.495                     
   1.496                     // Decode into UCS.
   1.497                     const char* p = line_buffer_utf8;
   1.498                     UInt32* q = line_buffer_ucs;
   1.499                     for (;;)
   1.500                     {
   1.501                         UInt32  uc = UTF8Util::DecodeNextChar(&p);
   1.502                         *q++ = uc;
   1.503                         
   1.504                         OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
   1.505                         OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
   1.506                         
   1.507                         if (uc == 0) break;
   1.508                     }
   1.509                     
   1.510                     // Encode back into UTF-8.
   1.511                     q = line_buffer_ucs;
   1.512                     int index = 0;
   1.513                     for (;;)
   1.514                     {
   1.515                         UInt32  uc = *q++;
   1.516                         OVR_ASSERT(index < LINE_SIZE * 6 - 6);
   1.517                         int last_index = index;
   1.518                         UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
   1.519                         OVR_ASSERT(index <= last_index + 6);
   1.520                         if (uc == 0) break;
   1.521                     }
   1.522                     
   1.523                     // This can be useful for debugging.
   1.524 #if 0
   1.525                     // Show the UCS and the re-encoded UTF-8.
   1.526                     log_ucs(line_buffer_ucs);
   1.527                     log_ascii(reencoded_utf8);
   1.528 #endif // 0
   1.529                     
   1.530                     OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
   1.531                     OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
   1.532                     
   1.533                     // Start next line.
   1.534                     byte_counter = 0;
   1.535                 }
   1.536             }
   1.537             
   1.538             fclose(fp);
   1.539         }
   1.540         
   1.541         return 0;
   1.542 }
   1.543 
   1.544 
   1.545 #endif // UTF8_UNIT_TEST
   1.546 
   1.547 }} // namespace UTF8Util::OVR
   1.548 
   1.549 \ No newline at end of file
   1.550 +/**************************************************************************
   1.551 +
   1.552 +Filename    :   OVR_UTF8Util.cpp
   1.553 +Content     :   UTF8 Unicode character encoding/decoding support
   1.554 +Created     :   September 19, 2012
   1.555 +Notes       : 
   1.556 +Notes       :   Much useful info at "UTF-8 and Unicode FAQ"
   1.557 +                http://www.cl.cam.ac.uk/~mgk25/unicode.html
   1.558 +
   1.559 +Copyright   :   Copyright 2012 Oculus VR, Inc. All Rights reserved.
   1.560 +
   1.561 +Use of this software is subject to the terms of the Oculus license
   1.562 +agreement provided at the time of installation or download, or which
   1.563 +otherwise accompanies this software in either electronic or hard copy form.
   1.564 +
   1.565 +************************************************************************************/
   1.566 +
   1.567 +#include "OVR_UTF8Util.h"
   1.568 +
   1.569 +namespace OVR { namespace UTF8Util {
   1.570 +
   1.571 +SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
   1.572 +{
   1.573 +    const char* p = buf;
   1.574 +    SPInt length = 0;
   1.575 +
   1.576 +    if (buflen != -1)
   1.577 +    {
   1.578 +        while (p - buf < buflen)
   1.579 +        {
   1.580 +            // We should be able to have ASStrings with 0 in the middle.
   1.581 +            UTF8Util::DecodeNextChar_Advance0(&p);
   1.582 +            length++;
   1.583 +        }
   1.584 +    }
   1.585 +    else
   1.586 +    {
   1.587 +        while (UTF8Util::DecodeNextChar_Advance0(&p))
   1.588 +            length++;
   1.589 +    }
   1.590 +    
   1.591 +    return length;
   1.592 +}
   1.593 +
   1.594 +UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
   1.595 +{
   1.596 +    const char* buf = putf8str;
   1.597 +    UInt32  c = 0;
   1.598 +
   1.599 +    if (length != -1)
   1.600 +    {
   1.601 +        while (buf - putf8str < length)
   1.602 +        {           
   1.603 +            c = UTF8Util::DecodeNextChar_Advance0(&buf);
   1.604 +            if (index == 0)
   1.605 +                return c;
   1.606 +            index--;
   1.607 +        }
   1.608 +
   1.609 +        return c;
   1.610 +    }
   1.611 +
   1.612 +    do 
   1.613 +    {
   1.614 +        c = UTF8Util::DecodeNextChar_Advance0(&buf);
   1.615 +        index--;
   1.616 +
   1.617 +        if (c == 0)
   1.618 +        {
   1.619 +            // We've hit the end of the string; don't go further.
   1.620 +            OVR_ASSERT(index == 0);
   1.621 +            return c;
   1.622 +        }
   1.623 +    } while (index >= 0);
   1.624 +
   1.625 +    return c;
   1.626 +}
   1.627 +
   1.628 +SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
   1.629 +{
   1.630 +    const char* buf = putf8str;
   1.631 +
   1.632 +    if (length != -1)
   1.633 +    {
   1.634 +        while ((buf - putf8str) < length && index > 0)
   1.635 +        {
   1.636 +            UTF8Util::DecodeNextChar_Advance0(&buf);
   1.637 +            index--;
   1.638 +        }
   1.639 +
   1.640 +        return buf-putf8str;
   1.641 +    }
   1.642 +
   1.643 +    while (index > 0) 
   1.644 +    {
   1.645 +        UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);
   1.646 +        index--;
   1.647 +
   1.648 +        if (c == 0)
   1.649 +            return buf-putf8str;
   1.650 +    };
   1.651 +
   1.652 +    return buf-putf8str;
   1.653 +}
   1.654 +
   1.655 +int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
   1.656 +{
   1.657 +    if (ucs_character <= 0x7F)
   1.658 +        return 1;
   1.659 +    else if (ucs_character <= 0x7FF)
   1.660 +        return 2;
   1.661 +    else if (ucs_character <= 0xFFFF)
   1.662 +        return 3;
   1.663 +    else if (ucs_character <= 0x1FFFFF)
   1.664 +        return 4;
   1.665 +    else if (ucs_character <= 0x3FFFFFF)
   1.666 +        return 5;
   1.667 +    else if (ucs_character <= 0x7FFFFFFF)
   1.668 +        return 6;
   1.669 +    else
   1.670 +        return 0;
   1.671 +}
   1.672 +
   1.673 +UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
   1.674 +{
   1.675 +    UInt32  uc;
   1.676 +    char    c;
   1.677 +    
   1.678 +    // Security considerations:
   1.679 +    //
   1.680 +    // Changed, this is now only the case for DecodeNextChar:
   1.681 +    //  - If we hit a zero byte, we want to return 0 without stepping
   1.682 +    //    the buffer pointer past the 0. th
   1.683 +    //
   1.684 +    // If we hit an "overlong sequence"; i.e. a character encoded
   1.685 +    // in a longer multibyte string than is necessary, then we
   1.686 +    // need to discard the character.  This is so attackers can't
   1.687 +    // disguise dangerous characters or character sequences --
   1.688 +    // there is only one valid encoding for each character.
   1.689 +    //
   1.690 +    // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
   1.691 +    // 0xFFFF } then we ignore them; they are not valid in UTF-8.
   1.692 +    
   1.693 +    // This isn't actually an invalid character; it's a valid char that
   1.694 +    // looks like an inverted question mark.
   1.695 +#define INVALID_CHAR 0x0FFFD
   1.696 +    
   1.697 +#define FIRST_BYTE(mask, shift)     \
   1.698 +    uc = (c & (mask)) << (shift);
   1.699 +    
   1.700 +#define NEXT_BYTE(shift) \
   1.701 +    c = **putf8Buffer;   \
   1.702 +    if (c == 0) return 0; /* end of buffer, do not advance */   \
   1.703 +    if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */  \
   1.704 +    (*putf8Buffer)++;    \
   1.705 +    uc |= (c & 0x3F) << shift;
   1.706 +    
   1.707 +    c = **putf8Buffer;
   1.708 +    (*putf8Buffer)++;
   1.709 +    if (c == 0)
   1.710 +        return 0;   // End of buffer.
   1.711 +    
   1.712 +    if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.
   1.713 +    
   1.714 +    // Multi-byte sequences.
   1.715 +    if ((c & 0xE0) == 0xC0)
   1.716 +    {
   1.717 +        // Two-byte sequence.
   1.718 +        FIRST_BYTE(0x1F, 6);
   1.719 +        NEXT_BYTE(0);
   1.720 +        if (uc < 0x80) return INVALID_CHAR;  // overlong
   1.721 +        return uc;
   1.722 +    }
   1.723 +    else if ((c & 0xF0) == 0xE0)
   1.724 +    {
   1.725 +        // Three-byte sequence.
   1.726 +        FIRST_BYTE(0x0F, 12);
   1.727 +        NEXT_BYTE(6);
   1.728 +        NEXT_BYTE(0);
   1.729 +        if (uc < 0x800) return INVALID_CHAR; // overlong
   1.730 +        // Not valid ISO 10646, but Flash requires these to work
   1.731 +        // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)
   1.732 +        // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;
   1.733 +        // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646
   1.734 +        return uc;
   1.735 +    }
   1.736 +    else if ((c & 0xF8) == 0xF0)
   1.737 +    {
   1.738 +        // Four-byte sequence.
   1.739 +        FIRST_BYTE(0x07, 18);
   1.740 +        NEXT_BYTE(12);
   1.741 +        NEXT_BYTE(6);
   1.742 +        NEXT_BYTE(0);
   1.743 +        if (uc < 0x010000) return INVALID_CHAR;  // overlong
   1.744 +        return uc;
   1.745 +    }
   1.746 +    else if ((c & 0xFC) == 0xF8)
   1.747 +    {
   1.748 +        // Five-byte sequence.
   1.749 +        FIRST_BYTE(0x03, 24);
   1.750 +        NEXT_BYTE(18);
   1.751 +        NEXT_BYTE(12);
   1.752 +        NEXT_BYTE(6);
   1.753 +        NEXT_BYTE(0);
   1.754 +        if (uc < 0x0200000) return INVALID_CHAR; // overlong
   1.755 +        return uc;
   1.756 +    }
   1.757 +    else if ((c & 0xFE) == 0xFC)
   1.758 +    {
   1.759 +        // Six-byte sequence.
   1.760 +        FIRST_BYTE(0x01, 30);
   1.761 +        NEXT_BYTE(24);
   1.762 +        NEXT_BYTE(18);
   1.763 +        NEXT_BYTE(12);
   1.764 +        NEXT_BYTE(6);
   1.765 +        NEXT_BYTE(0);
   1.766 +        if (uc < 0x04000000) return INVALID_CHAR;    // overlong
   1.767 +        return uc;
   1.768 +    }
   1.769 +    else
   1.770 +    {
   1.771 +        // Invalid.
   1.772 +        return INVALID_CHAR;
   1.773 +    }
   1.774 +}
   1.775 +
   1.776 +
   1.777 +void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
   1.778 +{
   1.779 +    if (ucs_character <= 0x7F)
   1.780 +    {
   1.781 +        // Plain single-byte ASCII.
   1.782 +        pbuffer[(*pindex)++] = (char) ucs_character;
   1.783 +    }
   1.784 +    else if (ucs_character <= 0x7FF)
   1.785 +    {
   1.786 +        // Two bytes.
   1.787 +        pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
   1.788 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.789 +    }
   1.790 +    else if (ucs_character <= 0xFFFF)
   1.791 +    {
   1.792 +        // Three bytes.
   1.793 +        pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
   1.794 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.795 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.796 +    }
   1.797 +    else if (ucs_character <= 0x1FFFFF)
   1.798 +    {
   1.799 +        // Four bytes.
   1.800 +        pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
   1.801 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
   1.802 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.803 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.804 +    }
   1.805 +    else if (ucs_character <= 0x3FFFFFF)
   1.806 +    {
   1.807 +        // Five bytes.
   1.808 +        pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
   1.809 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
   1.810 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
   1.811 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.812 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.813 +    }
   1.814 +    else if (ucs_character <= 0x7FFFFFFF)
   1.815 +    {
   1.816 +        // Six bytes.
   1.817 +        pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
   1.818 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
   1.819 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
   1.820 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
   1.821 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
   1.822 +        pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
   1.823 +    }
   1.824 +    else
   1.825 +    {
   1.826 +        // Invalid char; don't encode anything.
   1.827 +    }
   1.828 +}
   1.829 +
   1.830 +SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
   1.831 +{
   1.832 +    SPInt len = 0;
   1.833 +    if (length != -1)
   1.834 +        for (int i = 0; i < length; i++)
   1.835 +        {
   1.836 +            len += GetEncodeCharSize(pchar[i]);
   1.837 +        }
   1.838 +    else
   1.839 +        for (int i = 0;; i++)
   1.840 +        {
   1.841 +            if (pchar[i] == 0)
   1.842 +                return len;
   1.843 +            len += GetEncodeCharSize(pchar[i]);
   1.844 +        }
   1.845 +    return len;
   1.846 +}
   1.847 +
   1.848 +void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
   1.849 +{
   1.850 +    SPInt ofs = 0;
   1.851 +    if (length != -1)
   1.852 +    {
   1.853 +        for (int i = 0; i < length; i++)
   1.854 +        {            
   1.855 +            EncodeChar(pbuff, &ofs, pchar[i]);
   1.856 +        }
   1.857 +    }
   1.858 +    else
   1.859 +    {
   1.860 +        for (int i = 0;; i++)
   1.861 +        {
   1.862 +            if (pchar[i] == 0)
   1.863 +                break;
   1.864 +            EncodeChar(pbuff, &ofs, pchar[i]);
   1.865 +        }
   1.866 +    }
   1.867 +    pbuff[ofs] = 0;
   1.868 +}
   1.869 +
   1.870 +UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
   1.871 +{
   1.872 +    wchar_t *pbegin = pbuff;
   1.873 +    if (bytesLen == -1)
   1.874 +    {
   1.875 +        while (1)
   1.876 +        {
   1.877 +            UInt32 ch = DecodeNextChar_Advance0(&putf8str);
   1.878 +            if (ch == 0)
   1.879 +                break;
   1.880 +            else if (ch >= 0xFFFF)
   1.881 +                ch = 0xFFFD;
   1.882 +            *pbuff++ = wchar_t(ch);
   1.883 +        }
   1.884 +    }
   1.885 +    else
   1.886 +    {
   1.887 +        const char* p = putf8str;
   1.888 +        while ((p - putf8str) < bytesLen)
   1.889 +        {
   1.890 +            UInt32 ch = DecodeNextChar_Advance0(&p);
   1.891 +            if (ch >= 0xFFFF)
   1.892 +                ch = 0xFFFD;
   1.893 +            *pbuff++ = wchar_t(ch);
   1.894 +        }
   1.895 +    }
   1.896 +
   1.897 +    *pbuff = 0;
   1.898 +    return pbuff - pbegin;
   1.899 +}
   1.900 +
   1.901 +
   1.902 +#ifdef UTF8_UNIT_TEST
   1.903 +
   1.904 +// Compile this test case with something like:
   1.905 +//
   1.906 +// gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
   1.907 +//
   1.908 +//    or
   1.909 +//
   1.910 +// cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
   1.911 +//
   1.912 +// If possible, try running the test program with the first arg
   1.913 +// pointing at the file:
   1.914 +//
   1.915 +// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
   1.916 +// 
   1.917 +// and examine the results by eye to make sure they are acceptable to
   1.918 +// you.
   1.919 +
   1.920 +
   1.921 +#include "base/utility.h"
   1.922 +#include <stdio.h>
   1.923 +
   1.924 +
   1.925 +bool    check_equal(const char* utf8_in, const UInt32* ucs_in)
   1.926 +{
   1.927 +    for (;;)
   1.928 +    {
   1.929 +        UInt32  next_ucs = *ucs_in++;
   1.930 +        UInt32  next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
   1.931 +        if (next_ucs != next_ucs_from_utf8)
   1.932 +        {
   1.933 +            return false;
   1.934 +        }
   1.935 +        if (next_ucs == 0)
   1.936 +        {
   1.937 +            OVR_ASSERT(next_ucs_from_utf8 == 0);
   1.938 +            break;
   1.939 +        }
   1.940 +    }
   1.941 +    
   1.942 +    return true;
   1.943 +}
   1.944 +
   1.945 +
   // Prints a null-terminated byte string to stdout. Newlines and bytes in the
   // range 32..127 are printed as-is; every other byte is printed as <0xNN>.
   void    log_ascii(const char* line)
   {
       for (const char* cursor = line; *cursor != 0; ++cursor)
       {
           const unsigned char byte = (unsigned char) *cursor;
           const bool plain = (byte == '\n') || (byte >= 32 && byte <= 127);
           if (plain)
           {
               printf("%c", byte);
           }
           else
           {
               // Non-printable as plain ASCII.
               printf("<0x%02X>", (int) byte);
           }
       }
   }
   1.968 +
   1.969 +
   1.970 +void    log_ucs(const UInt32* line)
   1.971 +{
   1.972 +    for (;;)
   1.973 +    {
   1.974 +        UInt32  uc = *line++;
   1.975 +        if (uc == 0)
   1.976 +        {
   1.977 +            // End of line.
   1.978 +            return;
   1.979 +        }
   1.980 +        else if (uc != '\n'
   1.981 +            && (uc < 32 || uc > 127))
   1.982 +        {
   1.983 +            // Non-printable as plain ASCII.
   1.984 +            printf("<U-%04X>", uc);
   1.985 +        }
   1.986 +        else
   1.987 +        {
   1.988 +            printf("%c", (char) uc);
   1.989 +        }
   1.990 +    }
   1.991 +}
   1.992 +
   1.993 +
   1.994 +// Simple canned test.
   1.995 +int main(int argc, const char* argv[])
   1.996 +{
   1.997 +    {
   1.998 +        const char* test8 = "Ignacio Castaño";
   1.999 +        const UInt32    test32[] =
  1.1000 +        {
  1.1001 +            0x49, 0x67, 0x6E, 0x61, 0x63,
  1.1002 +                0x69, 0x6F, 0x20, 0x43, 0x61,
  1.1003 +                0x73, 0x74, 0x61, 0xF1, 0x6F,
  1.1004 +                0x00
  1.1005 +        };
  1.1006 +        
  1.1007 +        OVR_ASSERT(check_equal(test8, test32));
  1.1008 +    }
  1.1009 +        
  1.1010 +        // If user passed an arg, try reading the file as UTF-8 encoded text.
  1.1011 +        if (argc > 1)
  1.1012 +        {
  1.1013 +            const char* filename = argv[1];
  1.1014 +            FILE*   fp = fopen(filename, "rb");
  1.1015 +            if (fp == NULL)
  1.1016 +            {
  1.1017 +                printf("Can't open file '%s'\n", filename);
  1.1018 +                return 1;
  1.1019 +            }
  1.1020 +            
  1.1021 +            // Read lines from the file, encode/decode them, and highlight discrepancies.
  1.1022 +            const int LINE_SIZE = 200;  // max line size
  1.1023 +            char    line_buffer_utf8[LINE_SIZE];
  1.1024 +            char    reencoded_utf8[6 * LINE_SIZE];
  1.1025 +            UInt32  line_buffer_ucs[LINE_SIZE];
  1.1026 +            
  1.1027 +            int byte_counter = 0;
  1.1028 +            for (;;)
  1.1029 +            {
  1.1030 +                int c = fgetc(fp);
  1.1031 +                if (c == EOF)
  1.1032 +                {
  1.1033 +                    // Done.
  1.1034 +                    break;
  1.1035 +                }
  1.1036 +                line_buffer_utf8[byte_counter++] = c;
  1.1037 +                if (c == '\n' || byte_counter >= LINE_SIZE - 2)
  1.1038 +                {
  1.1039 +                    // End of line.  Process the line.
  1.1040 +                    line_buffer_utf8[byte_counter++] = 0;   // terminate.
  1.1041 +                    
  1.1042 +                    // Decode into UCS.
  1.1043 +                    const char* p = line_buffer_utf8;
  1.1044 +                    UInt32* q = line_buffer_ucs;
  1.1045 +                    for (;;)
  1.1046 +                    {
  1.1047 +                        UInt32  uc = UTF8Util::DecodeNextChar(&p);
  1.1048 +                        *q++ = uc;
  1.1049 +                        
  1.1050 +                        OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
  1.1051 +                        OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
  1.1052 +                        
  1.1053 +                        if (uc == 0) break;
  1.1054 +                    }
  1.1055 +                    
  1.1056 +                    // Encode back into UTF-8.
  1.1057 +                    q = line_buffer_ucs;
  1.1058 +                    int index = 0;
  1.1059 +                    for (;;)
  1.1060 +                    {
  1.1061 +                        UInt32  uc = *q++;
  1.1062 +                        OVR_ASSERT(index < LINE_SIZE * 6 - 6);
  1.1063 +                        int last_index = index;
  1.1064 +                        UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
  1.1065 +                        OVR_ASSERT(index <= last_index + 6);
  1.1066 +                        if (uc == 0) break;
  1.1067 +                    }
  1.1068 +                    
  1.1069 +                    // This can be useful for debugging.
  1.1070 +#if 0
  1.1071 +                    // Show the UCS and the re-encoded UTF-8.
  1.1072 +                    log_ucs(line_buffer_ucs);
  1.1073 +                    log_ascii(reencoded_utf8);
  1.1074 +#endif // 0
  1.1075 +                    
  1.1076 +                    OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
  1.1077 +                    OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
  1.1078 +                    
  1.1079 +                    // Start next line.
  1.1080 +                    byte_counter = 0;
  1.1081 +                }
  1.1082 +            }
  1.1083 +            
  1.1084 +            fclose(fp);
  1.1085 +        }
  1.1086 +        
  1.1087 +        return 0;
  1.1088 +}
  1.1089 +
  1.1090 +
  1.1091 +#endif // UTF8_UNIT_TEST
  1.1092 +
  1.1093 +}} // namespace UTF8Util::OVR
  1.1094 +
author	John Tsiombikas <nuclear@member.fsf.org>
date	Sun, 15 Sep 2013 04:10:05 +0300
parents	e2f9e4603129
children