-
Notifications
You must be signed in to change notification settings - Fork 4
/
Utf8_16.h
182 lines (156 loc) · 4.42 KB
/
Utf8_16.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
// Utf8_16.h
// Copyright (C) 2002 Scott Kirkwood
//
// Permission to use, copy, modify, distribute and sell this code
// and its documentation for any purpose is hereby granted without fee,
// provided that the above copyright notice appear in all copies or
// any derived copies. Scott Kirkwood makes no representations
// about the suitability of this software for any purpose.
// It is provided "as is" without express or implied warranty.
//
// Notes: Used the UTF information I found at:
// http://www.cl.cam.ac.uk/~mgk25/unicode.html
////////////////////////////////////////////////////////////////////////////////
//
// Modified 2006 by Jens Lorenz
//
// - Clean up the sources
// - Removing UCS-Bug in Utf8_Iter
// - Add convert function in Utf8_16_Write
////////////////////////////////////////////////////////////////////////////////
#ifndef UTF8_16_H
#pragma once
#ifdef _MSC_VER
#pragma warning(disable: 4514) // unreferenced inline function has been removed
#endif
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
// Character type taken by the file-name parameters in this header:
// wide when the build defines _UNICODE, narrow otherwise.
#if defined(_UNICODE)
using TCHAR = wchar_t;
#else
using TCHAR = char;
#endif

// Fixed-name aliases for an unsigned byte and a plain int.
using UCHAR = unsigned char;
using INT   = int;
// Encoding modes distinguished by the converters in this header.
// uniEnd is not a mode: it is the sentinel that follows the last one, and it
// is used as the first dimension of Utf8_16::k_Boms.
enum UniMode {
    uni8Bit       = 0,
    uniUTF8       = 1,
    uni16BE       = 2,
    uni16LE       = 3,
    uniCookie     = 4,
    uni7Bit       = 5,
    uni16BE_NoBOM = 6,
    uni16LE_NoBOM = 7,
    uniEnd        // implicit value 8: number of modes above
};
// Flag constant with value 0x0002.
// NOTE(review): name and value appear to mirror the Win32 IsTextUnicode()
// flag of the same name, presumably so the header builds without
// <windows.h> — confirm against the code that uses it.
#define IS_TEXT_UNICODE_STATISTICS 0x0002
// Maps the name generic_fopen onto fopen by plain text substitution.
// NOTE(review): inside Utf8_16_Write, which declares its own fopen member
// taking TCHAR strings, this name would resolve to that member rather than
// the C library function — confirm that is the intent at each use site.
#define generic_fopen fopen
// Shared base of the iterator and reader/writer classes in this header.
// It carries no data members; it only provides the code-unit type names
// and the declaration of the BOM table.
class Utf8_16 {
public:
    using utf16 = unsigned short;  // 16 bits
    using utf8  = UCHAR;           // 8 bits
    using ubyte = UCHAR;

    // One row per UniMode value, three bytes per row. Only declared here;
    // the contents are supplied by a definition outside this header.
    static const utf8 k_Boms[uniEnd][3];
};
// Reads UTF-16 and outputs UTF-8.
//
// As far as this declaration shows, the intended use is: set() a byte buffer,
// then loop while the iterator converts to true, reading the current output
// byte with get() and advancing with operator++.
class Utf16_Iter : public Utf8_16 {
public:
    // Position inside the output sequence currently being produced.
    // NOTE(review): the names suggest "byte 2 of a 2-byte sequence", "byte 2
    // of a 3-byte sequence" and "byte 3 of a 3-byte sequence" — confirm
    // against the implementation of operator++.
    enum eState {
        eStart,
        e2Bytes2,
        e3Bytes2,
        e3Bytes3
    };

    Utf16_Iter();
    void reset();

    // Attaches the iterator to nLen bytes at pBuf, interpreted per eEncoding.
    // The pointer is stored as-is (members are raw pointers), so the buffer
    // must outlive the iteration.
    void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding);

    // Current output byte (m_nCur).
    utf8 get() const { return m_nCur; }

    void operator++();

    // Current state of the output state machine. const: it only reads state.
    eState getState() const { return m_eState; }

    // True while the read cursor has not gone past m_pEnd. The comparison is
    // deliberately kept inclusive (<=) exactly as in the original; it is const
    // so that const iterators can be tested too. It is intentionally left
    // non-explicit to stay source-compatible with existing callers.
    operator bool() const { return m_pRead <= m_pEnd; }

protected:
    void toStart(); // Put to start state, swap bytes if necessary

    UniMode m_eEncoding;   // encoding passed to set()
    eState m_eState;       // see eState
    utf8 m_nCur;           // byte returned by get()
    utf16 m_nCur16;        // 16-bit working value
    const ubyte* m_pBuf;   // buffer pointer
    const ubyte* m_pRead;  // read cursor, compared against m_pEnd
    const ubyte* m_pEnd;   // end bound used by operator bool
};
// Reads UTF-8 and outputs UTF-16.
//
// get() is only meaningful when canGet() is true, i.e. when the internal
// state is eStart; _DEBUG builds assert this. The assert macro comes from
// <cassert>, which this header must include for debug builds to compile.
class Utf8_Iter : public Utf8_16 {
public:
    Utf8_Iter();
    void reset();

    // Attaches the iterator to nLen bytes at pBuf; eEncoding is stored for the
    // conversion. The pointer is kept as a raw pointer, so the buffer must
    // outlive the iteration.
    void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding);

    // Current 16-bit output value. Precondition: canGet().
    utf16 get() const {
#ifdef _DEBUG
        assert(m_eState == eStart);
#endif
        return m_nCur;
    }

    // True when the state machine is at eStart, the only state in which
    // get() may be called.
    bool canGet() const { return m_eState == eStart; }

    void operator++();

    // True while the read cursor has not gone past m_pEnd. The inclusive
    // comparison (<=) is kept exactly as in the original; it is const so that
    // const iterators can be tested, matching get() and canGet(). It is left
    // non-explicit to stay source-compatible with existing callers.
    operator bool() const { return m_pRead <= m_pEnd; }

protected:
    void swap();
    void toStart(); // Put to start state, swap bytes if necessary

    // Position inside the multi-byte input sequence being consumed.
    // NOTE(review): names suggest which byte of a 2- or 3-byte sequence is
    // expected next — confirm against the implementation of operator++.
    enum eState {
        eStart,
        e2Bytes_Byte2,
        e3Bytes_Byte2,
        e3Bytes_Byte3
    };

protected:
    UniMode m_eEncoding;   // encoding passed to set()
    eState m_eState;       // see eState
    utf16 m_nCur;          // value returned by get()
    const ubyte* m_pBuf;   // buffer pointer
    const ubyte* m_pRead;  // read cursor, compared against m_pEnd
    const ubyte* m_pEnd;   // end bound used by operator bool
};
// Reads UTF16 and outputs UTF8
enum u78 {  // result type of Utf8_16_Read::utf8_7bits_8bits()
    utf8NoBOM  = 0,
    ascii7bits = 1,
    ascii8bits = 2
};
// Converter for incoming buffers: convert() processes a caller buffer, after
// which the result is available through getNewBuf()/getNewSize() and the
// encoding through getEncoding().
// NOTE(review): the class declares a destructor and holds raw buffer pointers
// but no copy operations — confirm instances are never copied.
class Utf8_16_Read : public Utf8_16 {
public:
    Utf8_16_Read();
    ~Utf8_16_Read();

    // Processes len bytes at buf; returns a size_t (presumably the size of the
    // converted data — confirm against the implementation).
    size_t convert(char* buf, size_t len);

    // Read-only view of the result buffer, as chars.
    const char* getNewBuf() const { return reinterpret_cast<const char*>(m_pNewBuf); }
    // Size recorded for the result buffer.
    size_t getNewSize() const { return m_nNewBufSize; }
    // Encoding currently recorded in m_eEncoding.
    UniMode getEncoding() const { return m_eEncoding; }

    size_t calcCurPos(size_t pos);

    // Stateless variant: inspects bufLen bytes at buf and returns a UniMode.
    static UniMode determineEncoding(const unsigned char *buf, int bufLen);

protected:
    // Member variant working on the object's own state (takes no arguments).
    void determineEncoding();
    u78 utf8_7bits_8bits();

private:
    UniMode m_eEncoding;
    ubyte* m_pBuf;
    ubyte* m_pNewBuf;
    // size of the new buffer
    size_t m_nNewBufSize;
    // size of the previously allocated buffer (if != 0)
    size_t m_nAllocatedBufSize;
    size_t m_nSkip;
    bool m_bFirstRead;
    size_t m_nLen;
    Utf16_Iter m_Iter16;
};
// Read in a UTF-8 buffer and write out to UTF-16 or UTF-8.
//
// Two ways to use it are declared: the fopen()/fwrite()/fclose() members,
// which work on the FILE* kept in m_pFile, and convert(), whose result is
// fetched with getNewBuf(). The output encoding is chosen with setEncoding().
// NOTE(review): the class declares a destructor and holds a raw FILE* and a
// raw buffer pointer but no copy operations — confirm instances are never
// copied.
class Utf8_16_Write : public Utf8_16 {
public:
    Utf8_16_Write();
    ~Utf8_16_Write();

    void setEncoding(UniMode eType);

    // File-style interface; the names deliberately shadow the C library
    // functions inside this class.
    FILE* fopen(const TCHAR *_name, const TCHAR *_type);
    size_t fwrite(const void* p, size_t _size);
    void fclose();

    // In-memory interface: processes _size bytes at p and returns a size_t.
    size_t convert(char* p, size_t _size);
    // Result buffer, viewed as chars.
    char* getNewBuf() { return reinterpret_cast<char*>(m_pNewBuf); }

    size_t calcCurPos(size_t pos);

protected:
    UniMode m_eEncoding;
    FILE* m_pFile;
    ubyte* m_pNewBuf;
    size_t m_nBufSize;
    bool m_bFirstWrite;
};
#endif// UTF8_16_H