Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework UTFString API to only keep UTF-8 #184

Merged
merged 6 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions ebml/EbmlUnicodeString.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,42 +19,44 @@ namespace libebml {

/*!
\class UTFstring
A class storing strings in a wchar_t (ie, in UCS-2 or UCS-4)
\note inspired by wstring which is not available everywhere
A class storing strings in a char that can receive wchar_t (ie, in UCS-2 or UCS-4)
*/
class EBML_DLL_API UTFstring {
public:
using value_type = wchar_t;
using value_type = char;

UTFstring() = default;
UTFstring(const wchar_t *); // should be NULL terminated
UTFstring(const UTFstring &);
UTFstring(const char *); // should be NULL terminated
UTFstring(const UTFstring &) = default;
UTFstring(std::wstring const &);

virtual ~UTFstring() = default;
bool operator==(const UTFstring&) const;
inline bool operator==(const wchar_t *cmp) const
{
return *this == UTFstring(std::wstring{cmp});
}
inline bool operator!=(const UTFstring &cmp) const
{
return !(*this == cmp);
}
UTFstring & operator=(const UTFstring &);
inline bool operator!=(const wchar_t *cmp) const
{
return !(*this == cmp);
}
UTFstring & operator=(const UTFstring &) = default;
UTFstring & operator=(const wchar_t *);
UTFstring & operator=(wchar_t);

/// Return length of string
std::size_t length() const {return WString.size();}

explicit operator const wchar_t*() const {return WString.c_str();};
const wchar_t* c_str() const {return WString.c_str();}
/// Return length of string in bytes not counting the trailing nul character
std::size_t length() const {return UTF8string.length();}

const std::string & GetUTF8() const {return UTF8string;}
void SetUTF8(const std::string &);

private:
std::wstring WString; ///< internal UCS representation
std::string UTF8string;
void UpdateFromUTF8();
void UpdateFromUCS2();
void UpdateFromUCS2(const std::wstring &);
};


Expand Down
70 changes: 19 additions & 51 deletions src/EbmlUnicodeString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,100 +19,68 @@ namespace libebml {

namespace {

std::size_t lengthToFirstNulll(std::wstring const &s)
std::size_t lengthToFirstNulll(std::string const &s)
{
auto PosNull = s.find(L'\0');
return PosNull != std::wstring::npos ? PosNull : s.size();
auto PosNull = s.find('\0');
return PosNull != std::string::npos ? PosNull : s.size();
}

}

// ===================== UTFstring class ===================

UTFstring::UTFstring(const wchar_t * _aBuf)
UTFstring::UTFstring(const char * _aBuf)
{
*this = _aBuf;
if (_aBuf != nullptr)
UTF8string = _aBuf;
}

/*!
\brief Construct from a wide string.

The wide string's NUL-terminated buffer is handed to the
\c const \c wchar_t* assignment operator, which performs the actual
conversion. Only a pointer is passed on, so the length of \c _aBuf is
not forwarded.
*/
UTFstring::UTFstring(std::wstring const &_aBuf)
{
  const wchar_t *Raw = _aBuf.c_str();
  *this = Raw;
}

/*!
\brief Copy constructor.

Copies by assigning from the wide-character buffer returned by
\c _aBuf.c_str(), i.e. through the \c const \c wchar_t* assignment
operator rather than by member-wise copy.
*/
UTFstring::UTFstring(const UTFstring & _aBuf)
{
  const wchar_t *Source = _aBuf.c_str();
  *this = Source;
}

/*!
\brief Copy assignment.

Replaces the content of this string by assigning from the
wide-character buffer returned by \c _aBuf.c_str().
\return a reference to this object
*/
UTFstring & UTFstring::operator=(const UTFstring & _aBuf)
{
  const wchar_t *Source = _aBuf.c_str();
  return *this = Source;
}

UTFstring & UTFstring::operator=(const wchar_t * _aBuf)
{
if (_aBuf != nullptr)
WString = _aBuf;
{
UpdateFromUCS2(std::wstring{_aBuf});
}
else
WString.clear();
{
UTF8string.clear();
}

UpdateFromUCS2();
return *this;
}

UTFstring & UTFstring::operator=(wchar_t _aChar)
{
WString = _aChar;
UpdateFromUCS2();
UpdateFromUCS2(std::wstring{_aChar});
return *this;
}

bool UTFstring::operator==(const UTFstring& _aStr) const
{
// Only compare up to the first 0 char in both strings.
auto LengthThis = lengthToFirstNulll(WString);
auto LengthOther = lengthToFirstNulll(_aStr.WString);
auto LengthThis = lengthToFirstNulll(UTF8string);
auto LengthOther = lengthToFirstNulll(_aStr.UTF8string);

if (LengthThis != LengthOther)
return false;

return std::memcmp(WString.c_str(), _aStr.WString.c_str(), LengthThis * sizeof(wchar_t)) == 0;
return std::memcmp(UTF8string.c_str(), _aStr.UTF8string.c_str(), LengthThis) == 0;
}

void UTFstring::SetUTF8(const std::string & _aStr)
{
UTF8string = _aStr;
UpdateFromUTF8();
}

/*!
\brief Rebuild the wide-character string (WString) from UTF8string.

Only the bytes before the first '\0' in UTF8string are converted.
The conversion is best-effort: WString is cleared first and, if the
UTF-8 input turns out to be malformed or truncated, decoding stops and
WString keeps whatever had been appended before the error. No exception
from the UTF-8 decoder is propagated to the caller.
\see RFC 2279
*/
void UTFstring::UpdateFromUTF8()
{
  // Only convert up to the first \0 character if present.
  auto Current = std::find(UTF8string.begin(), UTF8string.end(), '\0');

  WString.clear();
  try {
    // Even though the function names hint at UCS2, the internal
    // representation must actually be compatible with the C++
    // library's implementation. Implementations with sizeof(wchar_t)
    // == 4 are using UCS4.
    if (sizeof(wchar_t) == 2)
      ::utf8::utf8to16(UTF8string.begin(), Current, std::back_inserter(WString));
    else
      ::utf8::utf8to32(UTF8string.begin(), Current, std::back_inserter(WString));
  } catch (::utf8::invalid_code_point &) {
    // Deliberately ignored: keep what was decoded so far.
  } catch (::utf8::invalid_utf8 &) {
    // Deliberately ignored: keep what was decoded so far.
  } catch (::utf8::not_enough_room &) {
    // Thrown when the input ends in the middle of a multi-byte sequence
    // (e.g. a lone lead byte at the end). Previously uncaught, so such
    // input escaped this function as an exception instead of being
    // treated like the other malformed-input cases.
  }
}

void UTFstring::UpdateFromUCS2()
void UTFstring::UpdateFromUCS2(const std::wstring & WString)
{
// Only convert up to the first \0 character if present.
auto Current = std::find(WString.begin(), WString.end(), L'\0');
auto Current = std::find(WString.cbegin(), WString.cend(), L'\0');

UTF8string.clear();

Expand Down Expand Up @@ -232,7 +200,7 @@ filepos_t EbmlUnicodeString::ReadData(IOCallback & input, ScopeMode ReadFully)
return GetSize();

if (GetSize() == 0) {
Value = static_cast<UTFstring::value_type>(0);
Value = UTFstring{};

} else {
std::string Buffer(static_cast<std::string::size_type>(GetSize()), static_cast<char>(0));
Expand Down
23 changes: 10 additions & 13 deletions test/test_utfstring.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using namespace libebml;

constexpr char emoji_8[] = "\xF0\x9F\x98\x80";
constexpr char emoji_u8[] = u8"\xF0\x9F\x98\x80";
constexpr wchar_t emoji_w[] = L"\U0001f600";

int main(void)
Expand All @@ -15,23 +16,25 @@ int main(void)
if (ascii != L"latin1")
return 1;

if (wcscmp(ascii.c_str(), L"latin1") != 0)
return 1;

if (ascii.GetUTF8() != "latin1")
return 1;

UTFstring u8;
u8.SetUTF8( emoji_u8 );

UTFstring u8construct{emoji_u8};

UTFstring utf8;
utf8.SetUTF8( emoji_8 );

if (utf8.length() != (4 / sizeof(wchar_t)))
if (utf8.length() != 4)
return 1;

if (utf8 != emoji_w)
return 1;

// UTFstring invalid;
// FIXME don't crash invalid.SetUTF8( "\x1\xF6\x00" );
UTFstring invalid;
invalid.SetUTF8( "\x1\xF6\x00" );

UTFstring empty{0};
if (empty.length() != 0)
Expand All @@ -45,21 +48,15 @@ int main(void)
return 1;

UTFstring copy = utf8;
if (copy.length() != (4 / sizeof(wchar_t)))
if (copy.length() != 4)
return 1;

if (copy != emoji_w)
return 1;

if (copy.c_str() == utf8.c_str())
return 1;

UTFstring copy2(utf8);
if (copy2 != emoji_w)
return 1;

if (copy2.c_str() == utf8.c_str())
return 1;

return 0;
}