rev |
line source |
nuclear@0
|
1 /*
|
nuclear@0
|
2 * Copyright 2001-2004 Unicode, Inc.
|
nuclear@0
|
3 *
|
nuclear@0
|
4 * Disclaimer
|
nuclear@0
|
5 *
|
nuclear@0
|
6 * This source code is provided as is by Unicode, Inc. No claims are
|
nuclear@0
|
7 * made as to fitness for any particular purpose. No warranties of any
|
nuclear@0
|
8 * kind are expressed or implied. The recipient agrees to determine
|
nuclear@0
|
9 * applicability of information provided. If this file has been
|
nuclear@0
|
10 * purchased on magnetic or optical media from Unicode, Inc., the
|
nuclear@0
|
11 * sole remedy for any claim will be exchange of defective media
|
nuclear@0
|
12 * within 90 days of receipt.
|
nuclear@0
|
13 *
|
nuclear@0
|
14 * Limitations on Rights to Redistribute This Code
|
nuclear@0
|
15 *
|
nuclear@0
|
16 * Unicode, Inc. hereby grants the right to freely use the information
|
nuclear@0
|
17 * supplied in this file in the creation of products supporting the
|
nuclear@0
|
18 * Unicode Standard, and to make copies of this file in any form
|
nuclear@0
|
19 * for internal or external distribution as long as this notice
|
nuclear@0
|
20 * remains attached.
|
nuclear@0
|
21 */
|
nuclear@0
|
22 #ifndef CONVERTUTF_H
|
nuclear@0
|
23 #define CONVERTUTF_H
|
nuclear@0
|
24 /* ---------------------------------------------------------------------
|
nuclear@0
|
25
|
nuclear@0
|
26 Conversions between UTF32, UTF-16, and UTF-8. Header file.
|
nuclear@0
|
27
|
nuclear@0
|
28 Several functions are included here, forming a complete set of
|
nuclear@0
|
29 conversions between the three formats. UTF-7 is not included
|
nuclear@0
|
30 here, but is handled in a separate source file.
|
nuclear@0
|
31
|
nuclear@0
|
32 Each of these routines takes pointers to input buffers and output
|
nuclear@0
|
33 buffers. The input buffers are const.
|
nuclear@0
|
34
|
nuclear@0
|
35 Each routine converts the text between *sourceStart and sourceEnd,
|
nuclear@0
|
36 putting the result into the buffer between *targetStart and
|
nuclear@0
|
37 targetEnd. Note: the end pointers are *after* the last item: e.g.
|
nuclear@0
|
38 *(sourceEnd - 1) is the last item.
|
nuclear@0
|
39
|
nuclear@0
|
40 The return result indicates whether the conversion was successful,
|
nuclear@0
|
41 and if not, whether the problem was in the source or target buffers.
|
nuclear@0
|
42 (Only the first encountered problem is indicated.)
|
nuclear@0
|
43
|
nuclear@0
|
44 After the conversion, *sourceStart and *targetStart are both
|
nuclear@0
|
45 updated to point to the end of the last text successfully converted in
|
nuclear@0
|
46 the respective buffers.
|
nuclear@0
|
47
|
nuclear@0
|
48 Input parameters:
|
nuclear@0
|
49 sourceStart - pointer to a pointer to the source buffer.
|
nuclear@0
|
50 The contents of this are modified on return so that
|
nuclear@0
|
51 it points at the next thing to be converted.
|
nuclear@0
|
52 targetStart - similarly, pointer to pointer to the target buffer.
|
nuclear@0
|
53 sourceEnd, targetEnd - respectively pointers to the ends of the
|
nuclear@0
|
54 two buffers, for overflow checking only.
|
nuclear@0
|
55
|
nuclear@0
|
56 These conversion functions take a ConversionFlags argument. When this
|
nuclear@0
|
57 flag is set to strict, both irregular sequences and isolated surrogates
|
nuclear@0
|
58 will cause an error. When the flag is set to lenient, both irregular
|
nuclear@0
|
59 sequences and isolated surrogates are converted.
|
nuclear@0
|
60
|
nuclear@0
|
61 Whether the flag is strict or lenient, all illegal sequences will cause
|
nuclear@0
|
62 an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
|
nuclear@0
|
63 or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
|
nuclear@0
|
64 must check for illegal sequences.
|
nuclear@0
|
65
|
nuclear@0
|
66 When the flag is set to lenient, characters over 0x10FFFF are converted
|
nuclear@0
|
67 to the replacement character; otherwise (when the flag is set to strict)
|
nuclear@0
|
68 they constitute an error.
|
nuclear@0
|
69
|
nuclear@0
|
70 Output parameters:
|
nuclear@0
|
71 The value "sourceIllegal" is returned from some routines if the input
|
nuclear@0
|
72 sequence is malformed. When "sourceIllegal" is returned, the source
|
nuclear@0
|
73 value will point to the illegal value that caused the problem. E.g.,
|
nuclear@0
|
74 in UTF-8 when a sequence is malformed, it points to the start of the
|
nuclear@0
|
75 malformed sequence.
|
nuclear@0
|
76
|
nuclear@0
|
77 Author: Mark E. Davis, 1994.
|
nuclear@0
|
78 Rev History: Rick McGowan, fixes & updates May 2001.
|
nuclear@0
|
79 Fixes & updates, Sept 2001.
|
nuclear@0
|
80
|
nuclear@0
|
81 ------------------------------------------------------------------------ */
|
nuclear@0
|
82
|
nuclear@0
|
83 /* ---------------------------------------------------------------------
|
nuclear@0
|
84 The following 4 definitions are compiler-specific.
|
nuclear@0
|
85 The C standard does not guarantee that wchar_t has at least
|
nuclear@0
|
86 16 bits, so wchar_t is no less portable than unsigned short!
|
nuclear@0
|
87 All should be unsigned values to avoid sign extension during
|
nuclear@0
|
88 bit mask & shift operations.
|
nuclear@0
|
89 ------------------------------------------------------------------------ */
|
nuclear@0
|
90
|
nuclear@0
|
/*
 * Code-unit types used throughout the conversion API.
 *
 * UTF32 is declared as unsigned int rather than unsigned long: on LP64
 * platforms unsigned long is 64 bits wide, so an array of it does not have
 * the layout of an array of 32-bit code units. All of these types are
 * unsigned to avoid sign extension during bit mask & shift operations.
 */
typedef unsigned int   UTF32;   /* at least 32 bits (checked below) */
typedef unsigned short UTF16;   /* at least 16 bits */
typedef unsigned char  UTF8;    /* typically 8 bits */
typedef unsigned char  Boolean; /* 0 or 1 */

/*
 * Compile-time guard: the array size becomes -1 (a hard error) on any
 * platform where UTF32 cannot hold a full 32-bit value.
 */
typedef char ConvertUTF_UTF32_needs_32_bits[((UTF32)-1 >= 0xFFFFFFFFUL) ? 1 : -1];
|
nuclear@0
|
95
|
nuclear@0
|
/*
 * Some fundamental constants.
 *
 * Every expansion is wrapped in its own parentheses so that it behaves as a
 * single primary expression; a bare cast such as (UTF32)0xFFFF fails to
 * parse after an operand-taking operator like sizeof.
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD) /* substituted for unconvertible input */
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF) /* last value that fits in 16 bits */
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF) /* upper limit for UTF-16 */
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF) /* largest 31-bit value */
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF) /* values above this are illegal in UTF-32 */
|
nuclear@0
|
102
|
nuclear@0
|
/*
 * Status code returned by each Convert* routine. Only the first problem
 * encountered during a conversion is reported.
 */
typedef enum {
    conversionOK    = 0, /* the whole input was converted */
    sourceExhausted = 1, /* input ended in the middle of a character */
    targetExhausted = 2, /* output buffer too small for the conversion */
    sourceIllegal   = 3  /* input contains an illegal/malformed sequence */
} ConversionResult;
|
nuclear@0
|
109
|
nuclear@0
|
/*
 * Strictness selector passed to each Convert* routine: strict rejects
 * irregular sequences and isolated surrogates, lenient converts them.
 */
typedef enum {
    strictConversion  = 0,
    lenientConversion = 1
} ConversionFlags;
|
nuclear@0
|
114
|
nuclear@0
|
115 /* This is for C++ and does no harm in C */
|
nuclear@0
|
116 #ifdef __cplusplus
|
nuclear@0
|
117 extern "C" {
|
nuclear@0
|
118 #endif
|
nuclear@0
|
119
|
nuclear@0
|
120 ConversionResult ConvertUTF8toUTF16 (
|
nuclear@0
|
121 const UTF8** sourceStart, const UTF8* sourceEnd,
|
nuclear@0
|
122 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
nuclear@0
|
123
|
nuclear@0
|
124 ConversionResult ConvertUTF16toUTF8 (
|
nuclear@0
|
125 const UTF16** sourceStart, const UTF16* sourceEnd,
|
nuclear@0
|
126 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
nuclear@0
|
127
|
nuclear@0
|
128 ConversionResult ConvertUTF8toUTF32 (
|
nuclear@0
|
129 const UTF8** sourceStart, const UTF8* sourceEnd,
|
nuclear@0
|
130 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
nuclear@0
|
131
|
nuclear@0
|
132 ConversionResult ConvertUTF32toUTF8 (
|
nuclear@0
|
133 const UTF32** sourceStart, const UTF32* sourceEnd,
|
nuclear@0
|
134 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
nuclear@0
|
135
|
nuclear@0
|
136 ConversionResult ConvertUTF16toUTF32 (
|
nuclear@0
|
137 const UTF16** sourceStart, const UTF16* sourceEnd,
|
nuclear@0
|
138 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
nuclear@0
|
139
|
nuclear@0
|
140 ConversionResult ConvertUTF32toUTF16 (
|
nuclear@0
|
141 const UTF32** sourceStart, const UTF32* sourceEnd,
|
nuclear@0
|
142 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
nuclear@0
|
143
|
nuclear@0
|
144 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
|
nuclear@0
|
145
|
nuclear@0
|
146 #ifdef __cplusplus
|
nuclear@0
|
147 }
|
nuclear@0
|
148 #endif
|
nuclear@0
|
149
|
nuclear@0
|
150 /* --------------------------------------------------------------------- */
|
nuclear@0
|
151 #endif /* CONVERTUTF_H */
|