Skip to content

Commit

Permalink
UTFstring: store data in std::wstring instead of doing manual string management
Browse files Browse the repository at this point in the history
  • Loading branch information
mbunkus committed Dec 22, 2023
1 parent 684612c commit 58ccbaf
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 63 deletions.
12 changes: 5 additions & 7 deletions ebml/EbmlUnicodeString.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,17 @@ class EBML_DLL_API UTFstring {
UTFstring & operator=(wchar_t);

/// Return length of string
std::size_t length() const {return _Length;}
std::size_t length() const {return WString.size();}

explicit operator const wchar_t*() const;
const wchar_t* c_str() const {return _Data;}
explicit operator const wchar_t*() const {return WString.c_str();};
const wchar_t* c_str() const {return WString.c_str();}

const std::string & GetUTF8() const {return UTF8string;}
void SetUTF8(const std::string &);

private:
std::size_t _Length{0}; ///< length of the UCS string excluding the \0
wchar_t* _Data{nullptr}; ///< internal UCS representation
private:
std::wstring WString; ///< internal UCS representation
std::string UTF8string;
static bool wcscmp_internal(const wchar_t *str1, const wchar_t *str2);
void UpdateFromUTF8();
void UpdateFromUCS2();
};
Expand Down
76 changes: 20 additions & 56 deletions src/EbmlUnicodeString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ UTFstring::UTFstring(std::wstring const &_aBuf)

UTFstring::~UTFstring()
{
delete [] _Data;
}

UTFstring::UTFstring(const UTFstring & _aBuf)
Expand All @@ -44,49 +43,34 @@ UTFstring & UTFstring::operator=(const UTFstring & _aBuf)
return *this;
}

UTFstring::operator const wchar_t*() const {return _Data;}


UTFstring & UTFstring::operator=(const wchar_t * _aBuf)
{
delete [] _Data;
if (_aBuf == nullptr) {
_Data = new wchar_t[1];
_Data[0] = 0;
UpdateFromUCS2();
return *this;
}
if (_aBuf != nullptr)
WString = _aBuf;
else
WString.clear();

std::size_t aLen;
for (aLen=0; _aBuf[aLen] != 0; aLen++);
_Length = aLen;
_Data = new wchar_t[_Length+1];
for (aLen=0; _aBuf[aLen] != 0; aLen++) {
_Data[aLen] = _aBuf[aLen];
}
_Data[aLen] = 0;
UpdateFromUCS2();
return *this;
}

UTFstring & UTFstring::operator=(wchar_t _aChar)
{
delete [] _Data;
_Data = new wchar_t[2];
_Length = 1;
_Data[0] = _aChar;
_Data[1] = 0;
WString = _aChar;
UpdateFromUCS2();
return *this;
}

bool UTFstring::operator==(const UTFstring& _aStr) const
{
if ((_Data == nullptr) && (_aStr._Data == nullptr))
return true;
if ((_Data == nullptr) || (_aStr._Data == nullptr))
// Only compare up to the first 0 char in both strings.
auto LengthThis = std::distance(WString.begin(), std::find(WString.begin(), WString.end(), L'\0'));
auto LengthOther = std::distance(_aStr.WString.begin(), std::find(_aStr.WString.begin(), _aStr.WString.end(), L'\0'));

if (LengthThis != LengthOther)
return false;
return wcscmp_internal(_Data, _aStr._Data);

return std::memcmp(WString.c_str(), _aStr.WString.c_str(), LengthThis * sizeof(wchar_t)) == 0;
}

void UTFstring::SetUTF8(const std::string & _aStr)
Expand All @@ -103,62 +87,42 @@ void UTFstring::UpdateFromUTF8()
// Only convert up to the first \0 character if present.
auto Current = std::find(UTF8string.begin(), UTF8string.end(), '\0');

std::wstring Temp;
WString.clear();
try {
// Even though the function names hint at UCS2, the internal
// representation must actually be compatible with the C++
// library's implementation. Implementations with sizeof(wchar_t)
// == 4 are using UCS4.
if (sizeof(wchar_t) == 2)
::utf8::utf8to16(UTF8string.begin(), Current, std::back_inserter(Temp));
::utf8::utf8to16(UTF8string.begin(), Current, std::back_inserter(WString));
else
::utf8::utf8to32(UTF8string.begin(), Current, std::back_inserter(Temp));
::utf8::utf8to32(UTF8string.begin(), Current, std::back_inserter(WString));
} catch (::utf8::invalid_code_point &) {
} catch (::utf8::invalid_utf8 &) {
}

delete [] _Data;
_Length = Temp.length();
_Data = new wchar_t[_Length + 1];

std::memcpy(_Data, Temp.c_str(), sizeof(wchar_t) * (_Length + 1));
}

void UTFstring::UpdateFromUCS2()
{
UTF8string.clear();

if (!_Data)
return;

// Only convert up to the first \0 character if present.
std::size_t Current = 0;
while ((Current < _Length) && _Data[Current])
++Current;
auto Current = std::find(WString.begin(), WString.end(), L'\0');

UTF8string.clear();

try {
// Even though the function is called UCS2, the internal
// representation must actually be compatible with the C++
// library's implementation. Implementations with sizeof(wchar_t)
// == 4 are using UCS4.
if (sizeof(wchar_t) == 2)
::utf8::utf16to8(_Data, _Data + Current, std::back_inserter(UTF8string));
::utf8::utf16to8(WString.begin(), Current, std::back_inserter(UTF8string));
else
::utf8::utf32to8(_Data, _Data + Current, std::back_inserter(UTF8string));
::utf8::utf32to8(WString.begin(), Current, std::back_inserter(UTF8string));
} catch (::utf8::invalid_code_point &) {
} catch (::utf8::invalid_utf16 &) {
}
}

bool UTFstring::wcscmp_internal(const wchar_t *str1, const wchar_t *str2)
{
std::size_t Index=0;
while (str1[Index] == str2[Index] && str1[Index] != 0) {
Index++;
}
return (str1[Index] == str2[Index]);
}

// ===================== EbmlUnicodeString class ===================

EbmlUnicodeString::EbmlUnicodeString()
Expand Down

0 comments on commit 58ccbaf

Please sign in to comment.