forked from chcg/PythonScript
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUtfConversion.h
More file actions
188 lines (168 loc) · 6.27 KB
/
UtfConversion.h
File metadata and controls
188 lines (168 loc) · 6.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#pragma once
/**
* Functions for UTF-8, UTF-16 and UTF-32.
* Similar to UniConversion.cxx in Scintilla, but with UTF-32, and with a special handling of invalid UTF-8.
* Invalid UTF-8 sequences are translated byte per byte to "UTF-32" in the range invalid_utf8_base + 0 to + 0xFF. When translating from UTF-32 to UTF-8, they are mapped back to single bytes. Note that UTF-16 cannot handle these codes, so any conversion to UTF-16 will lose these invalid UTF-8 sequences.
* @author Francois-R.Boyer@PolyMtl.ca
* @date March, 2013
* @copyright GPL; see license.txt in Notepad++ source for complete license text.
* @file
*/
#include <string>
#include "ConstString.h"
typedef unsigned char UCHAR;
#include "Utf8.h"
// Base element of UTF-8/16/32:
typedef char U8;
typedef wchar_t U16;
typedef int U32; // UTF-32 must be signed, otherwise its max value will overflow in (i > (int)(std::numeric_limits<charT>::max)()) comparison in basic_regex_parser.hpp.
// Strings in UTF-8/16/32:
typedef std::basic_string<U8> u8string;
typedef std::basic_string<U16> u16string;
typedef std::basic_string<U32> u32string;
/// Complete UTF-16 character, with possible surrogate pair.
class U16_char {
public:
U16_char() : _length(0) { _char[0] = _char[1] = 0; }
void writeTo(U16*& utf16Out) const {
for (unsigned int i = 0; i < _length; ++i)
*utf16Out++ = _char[i];
}
operator const U16*() const { return _char; }
operator U16*() { return _char; }
unsigned int length() const { return _length; }
unsigned int length(unsigned int newLength) { return _length = newLength; }
private:
unsigned int _length;
U16 _char[2];
};
namespace UtfConversion {
static const U32 INVALID = 0x7FFFFFFF;
static const U32 maximum_unicode_point = 0x10FFFF;
static const U32 maximum_utf8_length = 4;
static const U32 invalid_utf8_base = 0x40000000;
static const U16 surrogate_mask = 0xFC00;
static const U16 lead_surrogate_base = 0xD800;
static const U16 tail_surrogate_base = 0xDC00;
extern const unsigned char _firstByteMaskForLength[7];
extern const U32 _minimumCodePointForLength[7];
/// Test for UTF-16 surrogates.
inline bool isSurrogate(U32 b) { return (b & (surrogate_mask ^ lead_surrogate_base ^ tail_surrogate_base)) == lead_surrogate_base; }
inline bool isLeadSurrogate(U32 b) { return (b & surrogate_mask) == lead_surrogate_base; }
inline bool isTailSurrogate(U32 b) { return (b & surrogate_mask) == tail_surrogate_base; }
inline bool isInvalidUtf8(U32 b) {
return b >= invalid_utf8_base && b <= invalid_utf8_base+0xFF;
}
inline bool isValidCodepoint(U32 b) {
if (b < 0
|| b > UtfConversion::maximum_unicode_point
|| isSurrogate(b)
|| (b & 0xFFFF) >= 0xFFFE)
return false;
return true;
}
/// Permits direct access to string data as an array pointer.
/// Array should not be considered zero terminated; pointer to the array of an empty string should not be dereferenced.
template <class CharT>
inline CharT* stringData(std::basic_string<CharT>& str) {
if (str.empty())
return 0;
return &*str.begin();
}
/// Converts single UTF-32 character to UTF-16.
inline U16_char toUtf16(U32 utf32)
{
U16_char utf16Out;
if (utf32 < 0x10000u) {
utf16Out[0] = U16(utf32);
utf16Out.length(1);
}
else if (utf32 <= maximum_unicode_point) {
utf16Out[0] = U16(utf32 >> 10) + lead_surrogate_base;
utf16Out[1] = U16(utf32 & 0x3F) + tail_surrogate_base;
utf16Out.length(2);
}
else utf16Out.length(0);
return utf16Out;
}
/// Converts single UTF-16 character to UTF-32.
inline U32 toUtf32(U16_char utf16)
{
if (utf16.length() == 1)
return utf16[0];
if (utf16.length() == 2)
return ((utf16[0]&0x3F)<<10) | (utf16[1]&0x3F);
return INVALID;
}
/// Converts UTF-8 text to UTF-32.
u32string toUtf32(const ConstString<U8> utf8);
/// Converts UTF-32 text to UTF-8.
u8string toUtf8(const ConstString<U32> utf32source);
/// Converts UTF-32 text to UTF-16.
u16string toUtf16(const ConstString<U32> utf32source);
/// Template function that calls translation according to destination type.
template <class destination_string_type, class source_string_chartype>
destination_string_type toStringType(const ConstString<source_string_chartype>& source);
template <class destination_string_type, class source_string_chartype>
inline destination_string_type toStringType(const std::basic_string<source_string_chartype>& source) {
return toStringType<destination_string_type>(ConstString<source_string_chartype>(source));
}
template <> inline u8string toStringType(const ConstString<U8>& source) {
return source.toString();
}
template <> inline u8string toStringType(const ConstString<U32>& source) {
return toUtf8(source);
}
template <> inline u32string toStringType(const ConstString<U8>& source) {
return toUtf32(source);
}
/// Converts UTF-8 text to UTF-32 characters to be read one by one.
class Utf8Decoder {
public:
Utf8Decoder(const U8* bytes, unsigned int bytesLeft, unsigned int byteIndex = 0) :
_bytes(bytes), _bytesLeft(bytesLeft), _byteIndex(byteIndex) {
if (!isEnd())
decode();
}
U32 decodedChar() const { ///< Returns the character codepoint; an invalid UTF-32 code if UTF-8 is invalid.
return _decodedChar;
}
U32 decodedLength() const { ///< Returns how many bytes were read to decode that character.
return _decodedLength;
}
void advanceToNext() {
if (!isEnd()) {
_byteIndex += _decodedLength;
_bytesLeft -= _decodedLength;
decode();
}
}
bool isEnd() const {
return _bytesLeft == 0;
}
unsigned int lengthInCharacters(); ///< Returns the number of characters (number of UTF-32 codes that can be read, including invalid byte sequences) from current position to the end of the buffer.
private:
void decode() {
UCHAR current_byte = getByte(0);
if (Utf8::isSingleByte(current_byte)) {
_decodedChar = current_byte;
_decodedLength = 1;
return;
}
decodeMultiByte();
}
void invalidUtf8(UCHAR utf8Byte) {
_decodedChar = invalid_utf8_base + utf8Byte;
_decodedLength = 1;
}
UCHAR getByte(unsigned int i) const {
return UCHAR(_bytes[_byteIndex + i]);
}
void decodeMultiByte();
const U8* _bytes;
unsigned int _bytesLeft;
unsigned int _byteIndex;
U32 _decodedChar;
unsigned int _decodedLength;
};
}