diff --git a/doc/version.texi b/doc/version.texi index c9e6b68..2c0159e 100644 --- a/doc/version.texi +++ b/doc/version.texi @@ -1,4 +1,4 @@ -@set UPDATED 25 January 2024 +@set UPDATED 27 January 2024 @set UPDATED-MONTH January 2024 @set EDITION 0.6.23 @set VERSION 0.6.23 diff --git a/scribbu/charsets.cc b/scribbu/charsets.cc index 0d5a37f..6dc44d8 100644 --- a/scribbu/charsets.cc +++ b/scribbu/charsets.cc @@ -771,7 +771,7 @@ namespace scribbu { srcenc, dstenc, rsp); } - /// Convert encodings from C strings to buffers of unsigned char + /// Convert encodings from C strings to buffers of unsigned char template <> std::vector convert_encoding(char const* const& text, @@ -800,6 +800,22 @@ namespace scribbu { const unsigned char *pbom = nullptr; if (add_bom) { switch (dstenc) { + case encoding::UCS_2: + cbbom = 2; + if constexpr (std::endian::native == std::endian::big) { + pbom = UTF16BE; + } else { + pbom = UTF16LE; + } + break; + case encoding::UCS_4: + cbbom = 4; + if constexpr (std::endian::native == std::endian::big) { + pbom = UTF32BE; + } else { + pbom = UTF32LE; + } + break; case encoding::UCS_2BE: case encoding::UTF_16BE: cbbom = 2; @@ -840,7 +856,7 @@ namespace scribbu { // http://stackoverflow.com/questions/13297458/simple-utf8-utf16-string-conversion-with-iconv std::size_t cbout = cbbom + (ntext << 2); vector out(cbout); - std::size_t outbytesleft = cbout; + std::size_t outbytesleft = cbout - cbbom; char *outbuf = reinterpret_cast(&(out[cbbom])); size_t status = iconv(dsc, &inbuf, &inbytesleft, &outbuf, &outbytesleft); while (~0 == status && E2BIG == errno) { diff --git a/scribbu/csv-pprinter.cc b/scribbu/csv-pprinter.cc index 96da91f..01b59ac 100644 --- a/scribbu/csv-pprinter.cc +++ b/scribbu/csv-pprinter.cc @@ -350,7 +350,7 @@ scribbu::csv_pprinter::pprint_COM(const COM &f, std::ostream &os) std::tie(dst, rsp) = encoding_from_stream(os); char lang[3]; - f.lang(lang); + std::tie(lang[0], lang[1],lang[2]) = f.lang(); return os << lang[0] << lang[1] << 
lang[2] << sep_ << escape(f.description(dst, rsp, v2enc_)) << sep_ << @@ -500,7 +500,7 @@ scribbu::csv_pprinter::pprint_COMM(const COMM &f, std::ostream &os) std::tie(dst, rsp) = encoding_from_stream(os); char lang[3]; - f.lang(lang); + std::tie(lang[0], lang[1],lang[2]) = f.lang(); return os << lang[0] << lang[1] << lang[2] << sep_ << escape(f.description(dst, rsp, v2enc_)) << sep_ << @@ -664,7 +664,7 @@ scribbu::csv_pprinter::pprint_COMM_2_4(const COMM_2_4 &f, std::ostream &os) std::tie(dst, rsp) = encoding_from_stream(os); char lang[3]; - f.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = f.lang(); return os << lang[0] << lang[1] << lang[2] << sep_ << escape(f.description(dst, rsp, v2enc_)) << sep_ << diff --git a/scribbu/framesv2.cc b/scribbu/framesv2.cc index ac0001b..332f29f 100644 --- a/scribbu/framesv2.cc +++ b/scribbu/framesv2.cc @@ -314,6 +314,22 @@ scribbu::unknown_text_frame::what() const noexcept(true) return pwhat_->c_str(); } +/////////////////////////////////////////////////////////////////////////////// +// class bad_unicode_value // +/////////////////////////////////////////////////////////////////////////////// + +/*virtual*/ const char * +scribbu::bad_unicode_value::what() const noexcept(true) +{ + // lazily format + if ( ! 
pwhat_ ) { + std::stringstream stm; + stm << "Unknown value for the Unicode byte: " << b_; + pwhat_.reset(new std::string(stm.str())); + } + return pwhat_->c_str(); +} + /////////////////////////////////////////////////////////////////////////////// // class frame_id3 // /////////////////////////////////////////////////////////////////////////////// @@ -920,13 +936,13 @@ scribbu::comments::comments(id3v2_version ver, cbnil_ = 1; break; case use_unicode::yes: - unicode_ = 4; + unicode_ = 3; dst = encoding::UTF_8; add_bom = false; cbnil_ = 1; break; case use_unicode::with_bom: - unicode_ = 4; + unicode_ = 3; dst = encoding::UTF_8; add_bom = true; cbnil_ = 1; @@ -941,6 +957,18 @@ scribbu::comments::comments(id3v2_version ver, } +void +scribbu::comments::description(const std::string &text, encoding src) +{ + description_ = convert_encoding(text, src, dst_enc_, add_bom()); +} + +void +scribbu::comments::text(const std::string &text, encoding src) +{ + text_ = convert_encoding(text, src, dst_enc_, add_bom()); +} + std::size_t scribbu::comments::size() const { @@ -1029,6 +1057,15 @@ scribbu::comments::count_syncs(bool false_only) const return cb; } +bool +scribbu::comments::add_bom() const +{ + return (id3v2_version::v2 == ver_ && encoding::UCS_2 == dst_enc_) || + (id3v2_version::v3 == ver_ && encoding::UCS_2 == dst_enc_) || + (id3v2_version::v4 == ver_ && encoding::UTF_16 == dst_enc_); + +} + /////////////////////////////////////////////////////////////////////////////// // class play_count // @@ -1521,6 +1558,3 @@ scribbu::tag_cloud::parse_to_map(const std::string &text, } } - - - diff --git a/scribbu/framesv2.hh b/scribbu/framesv2.hh index 4ecfb07..22a1ffc 100644 --- a/scribbu/framesv2.hh +++ b/scribbu/framesv2.hh @@ -244,6 +244,19 @@ namespace scribbu { mutable std::shared_ptr pwhat_; }; + /// Thrown on a bad "unicode" byte + class bad_unicode_value : public error + { + public: + bad_unicode_value(unsigned char b) : b_(b) + { } + virtual const char * what() const 
noexcept(true); + unsigned char bad_value() const + { return b_; } + private: + unsigned char b_; + mutable std::shared_ptr pwhat_; + }; /// ID3v2.2 identifier-- a simple UDT representing a three-character, /// ASCII-encoded frame ID for use in hashed collections class frame_id3 @@ -655,6 +668,13 @@ namespace scribbu { * structure, and is meant to be combined into version-specific id3v2_frame * sub-classes through multiple inheritence. * + * It is regrettable that this class is implemented in terms of its on-disk + * format (the comment text & descriptions, for instance, are represented as + * vectors of unsigned char in the frame's target encoding). It would be + * preferable to pick a general representation (UTF-8 strings, to continue + * the example) and only worry about the on-disk frame representation at + * I/O time. + * * */ @@ -668,11 +688,41 @@ namespace scribbu { comments(id3v2_version ver, forward_input_iterator p0, forward_input_iterator p1): + ver_(ver), cbnil_(1) { if (p0 != p1) { unicode_ = *p0++; + if (id3v2_version::v2 == ver_ || id3v2_version::v3 == ver) { + switch (unicode_) { + case 0: + dst_enc_ = encoding::ISO_8859_1; + break; + case 1: + dst_enc_ = encoding::UCS_2; + break; + default: + throw bad_unicode_value(unicode_); + } + } else { + switch (unicode_) { + case 0: + dst_enc_ = encoding::ISO_8859_1; + break; + case 1: + dst_enc_ = encoding::UTF_16; + break; + case 2: + dst_enc_ = encoding::UTF_16BE; + break; + case 3: + dst_enc_ = encoding::UTF_8; + break; + default: + throw bad_unicode_value(unicode_); + } + } if (id3v2_version::v2 == ver || id3v2_version::v3 == ver) { cbnil_ = unicode_ ? 
2 : 1; @@ -715,12 +765,10 @@ namespace scribbu { return unicode_; } - template - forward_output_iterator lang(forward_output_iterator p) const { - *p++ = lang_[0]; - *p++ = lang_[1]; - *p++ = lang_[2]; - return p; + typedef std::tuple lang_type; + + lang_type lang() const { + return std::make_tuple(lang_[0], lang_[1], lang_[2]); } template @@ -733,6 +781,11 @@ namespace scribbu { return std::copy(text_.begin(), text_.end(), p); } + /// Set the description + void description(const std::string &text, encoding src); + /// Set the comment text + void text(const std::string &text, encoding src); + /// Return the size, in bytes, of this structure, prior to /// desynchronisation, compression, and/or encryption exclusive of the /// header @@ -742,9 +795,14 @@ namespace scribbu { std::size_t write(std::ostream &os) const; private: + /// Derive from our version & destination encoding whether we need to + /// add a byte-order mark + bool add_bom() const; std::size_t count_syncs(bool false_only) const; private: + id3v2_version ver_; + encoding dst_enc_; unsigned char cbnil_; unsigned char unicode_; unsigned char lang_[3]; diff --git a/scribbu/pprinter.cc b/scribbu/pprinter.cc index ffef0ab..c59deef 100644 --- a/scribbu/pprinter.cc +++ b/scribbu/pprinter.cc @@ -79,8 +79,8 @@ std::string comment_lang(const scribbu::comments &frame) { using namespace std; - char blang[4]{ 0, 0, 0, 0 }; - frame.lang(blang); + char blang[4]{0, 0, 0, 0}; + std::tie(blang[0], blang[1], blang[2]) = frame.lang(); stringstream stm(blang); optional strict_lang; diff --git a/scribbu/scheme-serde.cc b/scribbu/scheme-serde.cc index 20de697..2c2f1b0 100644 --- a/scribbu/scheme-serde.cc +++ b/scribbu/scheme-serde.cc @@ -428,7 +428,7 @@ namespace { SCM x = init_frame("", sym_comment_frame); char lang[4] = { 0, 0, 0, 0 }; - f.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = f.lang(); if (0 != (lang[0] & 0x80)) lang[0] = 0; if (0 != (lang[1] & 0x80)) lang[1] = 0; if (0 != (lang[2] & 0x80)) lang[2] = 0; @@ 
-454,7 +454,7 @@ namespace { f.readonly()); char lang[4] = { 0, 0, 0, 0 }; - f.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = f.lang(); if (0 != (lang[0] & 0x80)) lang[0] = 0; if (0 != (lang[1] & 0x80)) lang[1] = 0; if (0 != (lang[2] & 0x80)) lang[2] = 0; @@ -480,7 +480,7 @@ namespace { f.readonly(), f.unsynchronised()); char lang[4] = { 0, 0, 0, 0 }; - f.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = f.lang(); if (0 != (lang[0] & 0x80)) lang[0] = 0; if (0 != (lang[1] & 0x80)) lang[1] = 0; if (0 != (lang[2] & 0x80)) lang[2] = 0; diff --git a/scribbu/tdf-pprinter.cc b/scribbu/tdf-pprinter.cc index 13afc92..c684b6c 100644 --- a/scribbu/tdf-pprinter.cc +++ b/scribbu/tdf-pprinter.cc @@ -305,7 +305,7 @@ scribbu::tdf_pprinter::pprint_COM(const COM &f, std::ostream &os) std::tie(dst, rsp) = encoding_from_stream(os); char lang[3]; - f.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = f.lang(); return os << lang[0] << lang[1] << lang[2] << sep_ << f.description(dst, rsp, v2enc_) << sep_ << @@ -454,7 +454,7 @@ scribbu::tdf_pprinter::pprint_COMM(const COMM &f, std::ostream &os) std::tie(dst, rsp) = encoding_from_stream(os); char lang[3]; - f.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = f.lang(); return os << lang[0] << lang[1] << lang[2] << sep_ << f.description(dst, rsp, v2enc_) << sep_ << @@ -620,7 +620,7 @@ scribbu::tdf_pprinter::pprint_COMM_2_4(const COMM_2_4 &f, std::ostream &os) std::tie(dst, rsp) = encoding_from_stream(os); char lang[3]; - f.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = f.lang(); return os << lang[0] << lang[1] << lang[2] << sep_ << f.description(dst, rsp, v2enc_) << sep_ << diff --git a/test/framesv2.cc b/test/framesv2.cc index 0d24257..0cbe9ab 100644 --- a/test/framesv2.cc +++ b/test/framesv2.cc @@ -484,7 +484,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) comments C1(id3v2_version::v3, B1.begin(), B1.end()); BOOST_CHECK(0 == C1.unicode()); - C1.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C1.lang(); BOOST_CHECK('e' == 
lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C1.descriptionb(back_inserter(outbuf)); @@ -508,7 +508,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) comments C2(id3v2_version::v3, B2.begin(), B2.end()); BOOST_CHECK(1 == C2.unicode()); - C2.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C2.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C2.descriptionb(back_inserter(outbuf)); @@ -531,7 +531,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) comments C3(id3v2_version::v3, B3.begin(), B3.end()); BOOST_CHECK(0 == C3.unicode()); - C3.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C3.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C3.descriptionb(back_inserter(outbuf)); @@ -553,7 +553,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) comments C4(id3v2_version::v3, B4.begin(), B4.end()); BOOST_CHECK(0 == C4.unicode()); - C4.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C4.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C4.descriptionb(back_inserter(outbuf)); @@ -574,7 +574,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) comments C5(id3v2_version::v3, B5.begin(), B5.end()); BOOST_CHECK(0 == C5.unicode()); - C5.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C5.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C5.descriptionb(back_inserter(outbuf)); @@ -614,7 +614,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) comments C7(id3v2_version::v3, B7.begin(), B7.end()); BOOST_CHECK(1 == C7.unicode()); - C7.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C7.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C7.descriptionb(back_inserter(outbuf)); @@ -636,7 +636,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) 
comments C8(id3v2_version::v3, B8.begin(), B8.end()); BOOST_CHECK(1 == C8.unicode()); - C8.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C8.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C8.descriptionb(back_inserter(outbuf)); @@ -657,7 +657,7 @@ BOOST_AUTO_TEST_CASE( test_comments ) comments C9(id3v2_version::v3, B9.begin(), B9.end()); BOOST_CHECK(1 == C9.unicode()); - C9.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C9.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); outbuf.erase(outbuf.begin(), outbuf.end()); C9.descriptionb(back_inserter(outbuf)); @@ -687,6 +687,54 @@ BOOST_AUTO_TEST_CASE( test_comments ) } +BOOST_AUTO_TEST_CASE(test_comments_mut) +{ + using namespace std; + using namespace scribbu; + + vector B1 = { + 0x00, // ISO-8859-1 + 0x65, 0x6e, 0x67, // eng + 0x64, 0x73, 0x63, 0x00, // "dsc" + 0x76, 0x61, 0x6c, // "val" + }; + + comments C1(id3v2_version::v3, B1.begin(), B1.end()); + C1.description("sp1ff@pobox.com", encoding::ASCII); + C1.text("Hello, world!", encoding::UTF_8); + + const char DSC[] = "sp1ff@pobox.com"; + const size_t NDSC = strlen(DSC); + + const char TEXT[] = "Hello, world!"; + const size_t NTEXT = strlen(TEXT); + + vector outbuf; + C1.descriptionb(back_inserter(outbuf)); + BOOST_CHECK_EQUAL_COLLECTIONS(DSC, DSC + NDSC, outbuf.begin(), outbuf.end()); + outbuf.erase(outbuf.begin(), outbuf.end()); + C1.textb(back_inserter(outbuf)); + BOOST_CHECK_EQUAL_COLLECTIONS(TEXT, TEXT + NTEXT, outbuf.begin(), outbuf.end()); + + vector B2 = + { 0x01, // UCS-2 + 0x65, 0x6e, 0x67, // eng + 0xfe, 0xff, 0x00, 0x64, 0x00, 0x73, 0x00, 0x63, 0x00, 0x00, // "dsc" + 0xfe, 0xff, 0x00, 0x76, 0x00, 0x61, 0x00, 0x6c, // "val" + }; + + comments C2(id3v2_version::v3, B2.begin(), B2.end()); + C2.description("sp1ff", encoding::ASCII); + + const unsigned char UDSC[] = + { 0xff, 0xfe, 0x73, 0x00, 0x70, 0x00, 0x31, 0x00, 0x66, 0x00, 0x66, 0x00 }; + const size_t 
NUDSC = sizeof(UDSC); + + outbuf.erase(outbuf.begin(), outbuf.end()); + C2.descriptionb(back_inserter(outbuf)); + BOOST_CHECK_EQUAL_COLLECTIONS(UDSC, UDSC + NUDSC, outbuf.begin(), outbuf.end()); +} + BOOST_AUTO_TEST_CASE( test_play_count ) { using namespace std; diff --git a/test/framesv22.cc b/test/framesv22.cc index ae1b7f8..e7736d8 100644 --- a/test/framesv22.cc +++ b/test/framesv22.cc @@ -67,7 +67,7 @@ BOOST_AUTO_TEST_CASE( test_com ) BOOST_CHECK( 0 == C.unicode() ); unsigned char buf[3]; - BOOST_CHECK( buf + 3 == C.lang(buf) ); + std::tie(buf[0], buf[1], buf[2]) = C.lang(); BOOST_CHECK( 'e' == buf[0] && 'n' == buf[1] && 'g' == buf[2] ); BOOST_CHECK( buf + 3 == C.descriptionb(buf) ); diff --git a/test/id3v22.cc b/test/id3v22.cc index 4e06d02..c5bd700 100644 --- a/test/id3v22.cc +++ b/test/id3v22.cc @@ -233,7 +233,7 @@ BOOST_AUTO_TEST_CASE( test_id3v2_2_tag ) char lang[3]; vector dsc, text; BOOST_CHECK(0 == C0.unicode()); - C0.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C0.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); C0.descriptionb(back_inserter(dsc)); BOOST_CHECK(dsc == DSC0); @@ -262,7 +262,7 @@ BOOST_AUTO_TEST_CASE( test_id3v2_2_tag ) const COM &C1 = C[1]; BOOST_CHECK(0 == C1.unicode()); - C1.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C1.lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); C1.descriptionb(back_inserter(dsc)); BOOST_CHECK(dsc == DSC1); diff --git a/test/id3v23.cc b/test/id3v23.cc index 9b8da6e..263c287 100644 --- a/test/id3v23.cc +++ b/test/id3v23.cc @@ -695,7 +695,7 @@ BOOST_AUTO_TEST_CASE( test_id3v2_3_files ) BOOST_CHECK(1 == C.unicode()); - C.lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C.lang(); BOOST_CHECK(0 == lang[0] && 0 == lang[1] && 0 == lang[2]); vector GOLD0 = vector{0xff, 0xfe}; @@ -919,7 +919,7 @@ BOOST_AUTO_TEST_CASE( test_funny_files ) BOOST_CHECK(4 == C.size()); BOOST_CHECK(0 == C[0].unicode()); - C[0].lang(lang); + std::tie(lang[0], lang[1], lang[2]) = 
C[0].lang(); BOOST_CHECK('e' == lang[0] && 'n' == lang[1] && 'g' == lang[2]); C[0].descriptionb(back_inserter(dsc)); BOOST_CHECK(0 == dsc.size()); @@ -930,7 +930,7 @@ BOOST_AUTO_TEST_CASE( test_funny_files ) txt.resize(0); BOOST_CHECK(0 == C[1].unicode()); - C[1].lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C[1].lang(); BOOST_CHECK(0 == lang[0] && 0 == lang[1] && 0 == lang[2]); C[1].descriptionb(back_inserter(dsc)); BOOST_CHECK(dsc == GOLD1); @@ -942,7 +942,7 @@ BOOST_AUTO_TEST_CASE( test_funny_files ) txt.resize(0); BOOST_CHECK(0 == C[2].unicode()); - C[2].lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C[2].lang(); BOOST_CHECK(0 == lang[0] && 0 == lang[1] && 0 == lang[2]); C[2].descriptionb(back_inserter(dsc)); BOOST_CHECK(dsc == GOLD2); @@ -954,7 +954,7 @@ BOOST_AUTO_TEST_CASE( test_funny_files ) txt.resize(0); BOOST_CHECK(0 == C[3].unicode()); - C[3].lang(lang); + std::tie(lang[0], lang[1], lang[2]) = C[3].lang(); BOOST_CHECK(0 == lang[0] && 0 == lang[1] && 0 == lang[2]); C[3].descriptionb(back_inserter(dsc)); BOOST_CHECK(dsc == GOLD3);