libdrawtext

annotate src/utf8.c @ 3:fe0c54e574ae

fixed a bug in utf-8 decoding
author John Tsiombikas <nuclear@mutantstargoat.com>
date Thu, 15 Sep 2011 23:32:39 +0300
parents bfe431dd1d80
children 17fed026b24b
rev   line source
nuclear@0 1 #include "drawtext.h"
nuclear@0 2
/* True if x is the first byte of a UTF-8 sequence: either a plain 7-bit
 * byte (top bit clear) or a lead byte (top two bits set). Bytes with the
 * top bits 10 are continuation bytes and yield false.
 */
#define U8_IS_FIRST(x)	(((((x) >> 7) & 1) == 0) || ((((x) >> 6) & 3) == 3))

/* valid bits of the first byte, indexed by sequence length in bytes */
static const char first_mask[] = {
	0,
	0x7f,	/* single byte, 7 bits valid */
	0x1f,	/* two-bytes, 5 bits valid */
	0xf,	/* three-bytes, 4 bits valid */
	0x7		/* four-bytes, 3 bits valid */
};
/* number of valid bits of the first byte, same indexing as first_mask */
static const char first_shift[] = { 0, 7, 5, 4, 3 };

/* continuation bytes: marker bits, valid-bit mask and valid-bit count */
#define CONT_PREFIX	0x80
#define CONT_MASK	0x3f
#define CONT_SHIFT	6

/* last charcodes for 1, 2, 3 or 4-byte utf8 chars
 * (7, 11, 16 and 21 bits respectively)
 */
static const int utf8_lastcode[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff };

#define prefix_mask(x)	(~first_mask[x])
/* Marker bits of the first byte of an x-byte sequence (0, 0xc0, 0xe0, 0xf0
 * for x = 1..4). The complemented mask is negative, so it is trimmed to 7
 * bits before shifting: left-shifting a negative value is undefined.
 */
#define prefix(x)		((prefix_mask(x) & 0x7f) << 1)
nuclear@0 23
nuclear@0 24
/* Returns a pointer to the character following the one at str.
 *
 * The step is the length reported by dtx_utf8_nbytes, cut short if the
 * terminating NUL shows up inside the sequence, so that a truncated
 * character at the end of the string never moves the pointer beyond the
 * terminator. The byte at str itself is always consumed.
 */
char *dtx_utf8_next_char(char *str)
{
	int i, nbytes = dtx_utf8_nbytes(str);

	for(i=1; i<nbytes && str[i]; i++);
	return str + i;
}
nuclear@0 29
nuclear@0 30 int dtx_utf8_char_code(const char *str)
nuclear@0 31 {
nuclear@0 32 int i, nbytes, shift, code = 0;
nuclear@3 33 int mask;
nuclear@0 34
nuclear@0 35 if(!U8_IS_FIRST(*str)) {
nuclear@0 36 return -1;
nuclear@0 37 }
nuclear@0 38
nuclear@0 39 nbytes = dtx_utf8_nbytes(str);
nuclear@0 40 mask = first_mask[nbytes];
nuclear@0 41 shift = 0;
nuclear@0 42
nuclear@0 43 for(i=0; i<nbytes; i++) {
nuclear@0 44 if(!*str) {
nuclear@0 45 break;
nuclear@0 46 }
nuclear@0 47
nuclear@0 48 code = (code << shift) | (*str++ & mask);
nuclear@0 49 mask = 0x3f;
nuclear@3 50 shift = 6;
nuclear@0 51 }
nuclear@0 52
nuclear@3 53 printf("code: %x\n", code);
nuclear@0 54 return code;
nuclear@0 55 }
nuclear@0 56
/* Returns the length in bytes of the sequence starting at str.
 *
 * When str points at the first byte of a character the result is 1 for a
 * plain 7-bit byte (the terminating NUL included), otherwise the number of
 * leading 1 bits of that byte, capped at 4.
 * When str points at a continuation byte, the result is the number of
 * continuation bytes up to the next first byte.
 */
int dtx_utf8_nbytes(const char *str)
{
	/* examine the bytes as unsigned, so that testing the top bits does not
	 * depend on the signedness of plain char
	 */
	const unsigned char *s = (const unsigned char*)str;
	int n;

	if((s[0] & 0xc0) == 0x80) {
		/* in the middle of a character: skip the continuation bytes.
		 * a NUL is not a continuation byte, so this stops at the terminator
		 */
		for(n=1; (s[n] & 0xc0) == 0x80; n++);
		return n;
	}

	if(!(s[0] & 0x80)) {
		return 1;
	}

	/* lead byte: the top two bits are set, count any further leading 1s */
	for(n=2; n<4 && (s[0] & (0x80 >> n)); n++);
	return n;
}
nuclear@0 80
/* Returns the number of characters in the NUL-terminated UTF-8 string str.
 *
 * Each step covers the length reported by dtx_utf8_nbytes, but never goes
 * beyond the terminator: a sequence cut short by the end of the string
 * counts as one character and ends the scan.
 */
int dtx_utf8_char_count(const char *str)
{
	int n = 0;

	while(*str) {
		int i, nbytes = dtx_utf8_nbytes(str);

		/* the first byte is known to be non-zero, check only the rest */
		for(i=1; i<nbytes && str[i]; i++);
		str += i;
		n++;
	}
	return n;
}
nuclear@0 91
/* Encodes the character code `code` as a UTF-8 sequence.
 *
 * If buf is not NULL the encoded bytes are stored in it; no terminator is
 * appended, and the caller must provide room for up to 4 bytes. With a NULL
 * buf only the length is computed.
 *
 * Returns the number of bytes of the encoding (1 to 4), or 0 if the code is
 * negative or does not fit in the 21 bits a 4-byte sequence can hold, in
 * which case nothing is written.
 */
size_t dtx_utf8_from_char_code(int code, char *buf)
{
	/* last code representable with 1, 2, 3 and 4 bytes */
	static const int last_code[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff };
	/* marker bits of the first byte, indexed by sequence length */
	static const unsigned char lead_prefix[] = { 0, 0, 0xc0, 0xe0, 0xf0 };
	size_t nbytes = 0;
	int i;

	if(code < 0) {
		return 0;
	}

	for(i=0; i<4; i++) {
		if(code <= last_code[i]) {
			nbytes = i + 1;
			break;
		}
	}

	if(nbytes && buf) {
		/* continuation bytes are filled back to front, each one taking the
		 * low 6 bits of what is left of the code
		 */
		for(i=(int)nbytes - 1; i>0; i--) {
			buf[i] = (char)(0x80 | (code & 0x3f));
			code >>= 6;
		}
		/* the remaining bits fit next to the marker of the first byte */
		buf[0] = (char)(lead_prefix[nbytes] | code);
	}
	return nbytes;
}
nuclear@0 125
/* Converts the NUL-terminated wide string str to UTF-8, encoding one
 * character at a time with dtx_utf8_from_char_code.
 *
 * buf receives the encoded bytes; it may be NULL, in which case nothing is
 * stored and only the length is computed. The terminator of str is not
 * encoded, so no NUL is appended to buf by this function.
 *
 * Returns the sum of the lengths reported for the individual characters.
 */
size_t dtx_utf8_from_string(const wchar_t *str, char *buf)
{
	size_t total = 0;
	char *out = buf;
	const wchar_t *wc;

	for(wc=str; *wc; wc++) {
		int len = dtx_utf8_from_char_code(*wc, out);

		total += len;
		if(out) {
			out += len;	/* move on to where the next character goes */
		}
	}
	return total;
}