Matroska-Org · robUx4 · Dec 27, 2023 · Dec 26, 2023 · Dec 26, 2023 · Dec 26, 2023
diff --git a/ebml/EbmlUnicodeString.h b/ebml/EbmlUnicodeString.h
@@ -19,42 +19,44 @@ namespace libebml {
 
 /*!
   \class UTFstring
-  A class storing strings in a wchar_t (ie, in UCS-2 or UCS-4)
-  \note inspired by wstring which is not available everywhere
+  A class storing strings as UTF-8 in a std::string; it can also be set from wchar_t strings (i.e., UCS-2 or UCS-4)
 */
 class EBML_DLL_API UTFstring {
 public:
-  using value_type = wchar_t;
+  using value_type = char;
 
   UTFstring() = default;
-  UTFstring(const wchar_t *); // should be NULL terminated
-  UTFstring(const UTFstring &);
+  UTFstring(const char *); // should be NULL terminated
+  UTFstring(const UTFstring &) = default;
   UTFstring(std::wstring const &);
 
   virtual ~UTFstring() = default;
   bool operator==(const UTFstring&) const;
+  inline bool operator==(const wchar_t *cmp) const // a null cmp compares like an empty string, matching operator=(const wchar_t *)
+  {
+    return *this == (cmp != nullptr ? UTFstring(std::wstring{cmp}) : UTFstring()); // never build a std::wstring from a null pointer
+  }
   inline bool operator!=(const UTFstring &cmp) const
   {
     return !(*this == cmp);
   }
-  UTFstring & operator=(const UTFstring &);
+  inline bool operator!=(const wchar_t *cmp) const // exact negation of the wchar_t equality test
+  {
+    return !operator==(cmp);
+  }
+  UTFstring & operator=(const UTFstring &) = default;
   UTFstring & operator=(const wchar_t *);
   UTFstring & operator=(wchar_t);
 
-  /// Return length of string
-  std::size_t length() const {return WString.size();}
-
-  explicit operator const wchar_t*() const {return WString.c_str();};
-  const wchar_t* c_str() const {return WString.c_str();}
+  /// Return length of string in bytes not counting the trailing nul character
+  std::size_t length() const {return UTF8string.length();}
 
   const std::string & GetUTF8() const {return UTF8string;}
   void SetUTF8(const std::string &);
 
 private:
-  std::wstring WString; ///< internal UCS representation
   std::string UTF8string;
-  void UpdateFromUTF8();
-  void UpdateFromUCS2();
+  void UpdateFromUCS2(const std::wstring &);
 };
 
 

diff --git a/src/EbmlUnicodeString.cpp b/src/EbmlUnicodeString.cpp
@@ -19,100 +19,68 @@ namespace libebml {
 
 namespace {
 
-std::size_t lengthToFirstNulll(std::wstring const &s)
+std::size_t lengthToFirstNulll(std::string const &s)
 {
-  auto PosNull = s.find(L'\0');
-  return PosNull != std::wstring::npos ? PosNull : s.size();
+  auto PosNull = s.find('\0');
+  return PosNull != std::string::npos ? PosNull : s.size();
 }
 
 }
 
 // ===================== UTFstring class ===================
 
-UTFstring::UTFstring(const wchar_t * _aBuf)
+UTFstring::UTFstring(const char * _aBuf)
 {
-  *this = _aBuf;
+  if (_aBuf != nullptr)
+    UTF8string = _aBuf;
 }
 
 UTFstring::UTFstring(std::wstring const &_aBuf)
 {
   *this = _aBuf.c_str();
 }
 
-UTFstring::UTFstring(const UTFstring & _aBuf)
-{
-  *this = _aBuf.c_str();
-}
-
-UTFstring & UTFstring::operator=(const UTFstring & _aBuf)
-{
-  *this = _aBuf.c_str();
-  return *this;
-}
-
 UTFstring & UTFstring::operator=(const wchar_t * _aBuf)
 {
   if (_aBuf != nullptr)
-    WString = _aBuf;
+  {
+    UpdateFromUCS2(std::wstring{_aBuf});
+  }
   else
-    WString.clear();
+  {
+    UTF8string.clear();
+  }
 
-  UpdateFromUCS2();
   return *this;
 }
 
 UTFstring & UTFstring::operator=(wchar_t _aChar)
 {
-  WString = _aChar;
-  UpdateFromUCS2();
+  UpdateFromUCS2(std::wstring{_aChar});
   return *this;
 }
 
 bool UTFstring::operator==(const UTFstring& _aStr) const
 {
   // Only compare up to the first 0 char in both strings.
-  auto LengthThis  = lengthToFirstNulll(WString);
-  auto LengthOther = lengthToFirstNulll(_aStr.WString);
+  auto LengthThis  = lengthToFirstNulll(UTF8string);
+  auto LengthOther = lengthToFirstNulll(_aStr.UTF8string);
 
   if (LengthThis != LengthOther)
     return false;
 
-  return std::memcmp(WString.c_str(), _aStr.WString.c_str(), LengthThis * sizeof(wchar_t)) == 0;
+  return std::memcmp(UTF8string.c_str(), _aStr.UTF8string.c_str(), LengthThis) == 0;
 }
 
 void UTFstring::SetUTF8(const std::string & _aStr)
 {
   UTF8string = _aStr;
-  UpdateFromUTF8();
-}
-
-/*!
-  \see RFC 2279
-*/
-void UTFstring::UpdateFromUTF8()
-{
-  // Only convert up to the first \0 character if present.
-  auto Current = std::find(UTF8string.begin(), UTF8string.end(), '\0');
-
-  WString.clear();
-  try {
-    // Even though the function names hint at UCS2, the internal
-    // representation must actually be compatible with the C++
-    // library's implementation. Implementations with sizeof(wchar_t)
-    // == 4 are using UCS4.
-    if (sizeof(wchar_t) == 2)
-      ::utf8::utf8to16(UTF8string.begin(), Current, std::back_inserter(WString));
-    else
-      ::utf8::utf8to32(UTF8string.begin(), Current, std::back_inserter(WString));
-  } catch (::utf8::invalid_code_point &) {
-  } catch (::utf8::invalid_utf8 &) {
-  }
 }
 
-void UTFstring::UpdateFromUCS2()
+void UTFstring::UpdateFromUCS2(const std::wstring & WString)
 {
   // Only convert up to the first \0 character if present.
-  auto Current = std::find(WString.begin(), WString.end(), L'\0');
+  auto Current = std::find(WString.cbegin(), WString.cend(), L'\0');
 
   UTF8string.clear();
 
@@ -232,7 +200,7 @@ filepos_t EbmlUnicodeString::ReadData(IOCallback & input, ScopeMode ReadFully)
     return GetSize();
 
   if (GetSize() == 0) {
-    Value = static_cast<UTFstring::value_type>(0);
+    Value = UTFstring{};
 
   } else {
     std::string Buffer(static_cast<std::string::size_type>(GetSize()), static_cast<char>(0));

diff --git a/test/test_utfstring.cxx b/test/test_utfstring.cxx
@@ -6,6 +6,7 @@
 using namespace libebml;
 
 constexpr char emoji_8[] = "\xF0\x9F\x98\x80";
+constexpr char emoji_u8[] = u8"\xF0\x9F\x98\x80";
 constexpr wchar_t emoji_w[] = L"\U0001f600";
 
 int main(void)
@@ -15,23 +16,25 @@ int main(void)
     if (ascii != L"latin1")
         return 1;
 
-    if (wcscmp(ascii.c_str(), L"latin1") != 0)
-        return 1;
-
     if (ascii.GetUTF8() != "latin1")
         return 1;
 
+    UTFstring u8;
+    u8.SetUTF8( emoji_u8 );
+
+    UTFstring u8construct{emoji_u8};
+
     UTFstring utf8;
     utf8.SetUTF8( emoji_8 );
 
-    if (utf8.length() != (4 / sizeof(wchar_t)))
+    if (utf8.length() != 4)
         return 1;
 
     if (utf8 != emoji_w)
         return 1;
 
-    // UTFstring invalid;
-    // FIXME don't crash invalid.SetUTF8( "\x1\xF6\x00" );
+    UTFstring invalid;
+    invalid.SetUTF8( "\x1\xF6\x00" );
 
     UTFstring empty{0};
     if (empty.length() != 0)
@@ -45,21 +48,15 @@ int main(void)
         return 1;
 
     UTFstring copy = utf8;
-    if (copy.length() != (4 / sizeof(wchar_t)))
+    if (copy.length() != 4)
         return 1;
 
     if (copy != emoji_w)
         return 1;
 
-    if (copy.c_str() == utf8.c_str())
-        return 1;
-
     UTFstring copy2(utf8);
     if (copy2 != emoji_w)
         return 1;
 
-    if (copy2.c_str() == utf8.c_str())
-        return 1;
-
     return 0;
 }