From aea4df3b0c616a4fac63e60d3800fb97ea2ef244 Mon Sep 17 00:00:00 2001 From: Markus Mirz Date: Tue, 26 Mar 2024 21:19:36 +0100 Subject: [PATCH] add required arabica code Signed-off-by: Markus Mirz --- CMakeLists.txt | 2 +- arabica/.gitignore | 103 + arabica/CMakeLists.txt | 152 + arabica/LICENSE | 32 + arabica/include/Arabica/StringAdaptor.hpp | 252 ++ arabica/include/Arabica/getparam.hpp | 54 + arabica/include/Arabica/mbstate.hpp | 23 + arabica/include/Arabica/stringadaptortag.hpp | 13 + arabica/include/SAX/ArabicaConfig.hpp.in | 12 + arabica/include/SAX/AttributeList.hpp | 193 ++ arabica/include/SAX/Attributes.hpp | 272 ++ arabica/include/SAX/ContentHandler.hpp | 341 ++ arabica/include/SAX/DTDHandler.hpp | 127 + arabica/include/SAX/DocumentHandler.hpp | 76 + arabica/include/SAX/EntityResolver.hpp | 117 + arabica/include/SAX/ErrorHandler.hpp | 123 + arabica/include/SAX/HandlerBase.hpp | 291 ++ arabica/include/SAX/IStreamHandle.hpp | 452 +++ arabica/include/SAX/InputSource.hpp | 337 ++ arabica/include/SAX/Locator.hpp | 131 + arabica/include/SAX/Parser.hpp | 166 + arabica/include/SAX/ParserConfig.hpp | 91 + arabica/include/SAX/SAXException.hpp | 68 + .../include/SAX/SAXNotRecognizedException.hpp | 54 + .../include/SAX/SAXNotSupportedException.hpp | 56 + arabica/include/SAX/SAXParseException.hpp | 170 + arabica/include/SAX/XMLFilter.hpp | 71 + arabica/include/SAX/XMLReader.hpp | 450 +++ arabica/include/SAX/ext/Attributes2.hpp | 79 + arabica/include/SAX/ext/DeclHandler.hpp | 135 + arabica/include/SAX/ext/LexicalHandler.hpp | 202 ++ arabica/include/SAX/ext/Locator2.hpp | 75 + .../include/SAX/helpers/AttributeDefaults.hpp | 34 + .../include/SAX/helpers/AttributeListImpl.hpp | 296 ++ .../include/SAX/helpers/AttributeTypes.hpp | 48 + .../include/SAX/helpers/AttributesImpl.hpp | 587 ++++ .../include/SAX/helpers/DefaultHandler.hpp | 603 ++++ arabica/include/SAX/helpers/FeatureNames.hpp | 95 + .../SAX/helpers/InputSourceResolver.hpp | 63 + 
.../include/SAX/helpers/NamespaceSupport.hpp | 390 +++ arabica/include/SAX/helpers/PropertyNames.hpp | 53 + arabica/include/SAX/helpers/XMLFilterImpl.hpp | 608 ++++ arabica/include/SAX/saxfwd.hpp | 89 + arabica/include/SAX/wrappers/saxlibxml2.hpp | 955 ++++++ arabica/include/Taggle/Taggle.hpp | 13 + arabica/include/Taggle/impl/Element.hpp | 363 ++ arabica/include/Taggle/impl/ElementType.hpp | 333 ++ arabica/include/Taggle/impl/Parser.hpp | 1389 ++++++++ arabica/include/Taggle/impl/ScanHandler.hpp | 108 + arabica/include/Taggle/impl/Scanner.hpp | 47 + arabica/include/Taggle/impl/Schema.hpp | 44 + arabica/include/Taggle/impl/SchemaImpl.hpp | 182 + .../include/Taggle/impl/html/HTMLModels.hpp | 49 + .../include/Taggle/impl/html/HTMLScanner.hpp | 707 ++++ .../include/Taggle/impl/html/HTMLSchema.hpp | 2955 +++++++++++++++++ arabica/include/XML/QName.hpp | 194 ++ arabica/include/XML/XMLCharacterClasses.hpp | 28 + arabica/include/XML/strings.hpp | 77 + .../convert/impl/codecvt_specialisations.hpp | 131 + arabica/include/convert/impl/ucs2_utf16.hpp | 27 + arabica/include/convert/impl/ucs2_utf8.hpp | 23 + arabica/include/convert/utf8ucs2codecvt.hpp | 70 + arabica/include/io/convertstream.hpp | 282 ++ arabica/include/io/socket_stream.hpp | 436 +++ arabica/include/io/uri.hpp | 84 + arabica/include/text/UnicodeCharacters.hpp | 292 ++ arabica/include/text/normalize_whitespace.hpp | 50 + .../src/SAX/helpers/InputSourceResolver.cpp | 143 + arabica/src/SAX/wrappers/saxlibxml2.cpp | 249 ++ arabica/src/XML/XMLCharacterClasses.cpp | 288 ++ arabica/src/arabica.cpp | 9 + arabica/src/convert/impl/ucs2_utf16.cpp | 57 + arabica/src/convert/impl/ucs2_utf8.cpp | 103 + arabica/src/convert/utf8ucs2codecvt.cpp | 85 + arabica/src/io/uri.cpp | 197 ++ arabica/src/taggle/Schema.cpp | 14 + 76 files changed, 17569 insertions(+), 1 deletion(-) create mode 100644 arabica/.gitignore create mode 100644 arabica/CMakeLists.txt create mode 100644 arabica/LICENSE create mode 100644 
arabica/include/Arabica/StringAdaptor.hpp create mode 100644 arabica/include/Arabica/getparam.hpp create mode 100644 arabica/include/Arabica/mbstate.hpp create mode 100644 arabica/include/Arabica/stringadaptortag.hpp create mode 100644 arabica/include/SAX/ArabicaConfig.hpp.in create mode 100644 arabica/include/SAX/AttributeList.hpp create mode 100644 arabica/include/SAX/Attributes.hpp create mode 100644 arabica/include/SAX/ContentHandler.hpp create mode 100644 arabica/include/SAX/DTDHandler.hpp create mode 100644 arabica/include/SAX/DocumentHandler.hpp create mode 100644 arabica/include/SAX/EntityResolver.hpp create mode 100644 arabica/include/SAX/ErrorHandler.hpp create mode 100644 arabica/include/SAX/HandlerBase.hpp create mode 100644 arabica/include/SAX/IStreamHandle.hpp create mode 100644 arabica/include/SAX/InputSource.hpp create mode 100644 arabica/include/SAX/Locator.hpp create mode 100644 arabica/include/SAX/Parser.hpp create mode 100644 arabica/include/SAX/ParserConfig.hpp create mode 100644 arabica/include/SAX/SAXException.hpp create mode 100644 arabica/include/SAX/SAXNotRecognizedException.hpp create mode 100644 arabica/include/SAX/SAXNotSupportedException.hpp create mode 100644 arabica/include/SAX/SAXParseException.hpp create mode 100644 arabica/include/SAX/XMLFilter.hpp create mode 100644 arabica/include/SAX/XMLReader.hpp create mode 100644 arabica/include/SAX/ext/Attributes2.hpp create mode 100644 arabica/include/SAX/ext/DeclHandler.hpp create mode 100644 arabica/include/SAX/ext/LexicalHandler.hpp create mode 100644 arabica/include/SAX/ext/Locator2.hpp create mode 100644 arabica/include/SAX/helpers/AttributeDefaults.hpp create mode 100644 arabica/include/SAX/helpers/AttributeListImpl.hpp create mode 100644 arabica/include/SAX/helpers/AttributeTypes.hpp create mode 100644 arabica/include/SAX/helpers/AttributesImpl.hpp create mode 100644 arabica/include/SAX/helpers/DefaultHandler.hpp create mode 100644 arabica/include/SAX/helpers/FeatureNames.hpp create 
mode 100644 arabica/include/SAX/helpers/InputSourceResolver.hpp create mode 100644 arabica/include/SAX/helpers/NamespaceSupport.hpp create mode 100644 arabica/include/SAX/helpers/PropertyNames.hpp create mode 100644 arabica/include/SAX/helpers/XMLFilterImpl.hpp create mode 100644 arabica/include/SAX/saxfwd.hpp create mode 100644 arabica/include/SAX/wrappers/saxlibxml2.hpp create mode 100644 arabica/include/Taggle/Taggle.hpp create mode 100755 arabica/include/Taggle/impl/Element.hpp create mode 100755 arabica/include/Taggle/impl/ElementType.hpp create mode 100644 arabica/include/Taggle/impl/Parser.hpp create mode 100755 arabica/include/Taggle/impl/ScanHandler.hpp create mode 100755 arabica/include/Taggle/impl/Scanner.hpp create mode 100644 arabica/include/Taggle/impl/Schema.hpp create mode 100644 arabica/include/Taggle/impl/SchemaImpl.hpp create mode 100644 arabica/include/Taggle/impl/html/HTMLModels.hpp create mode 100644 arabica/include/Taggle/impl/html/HTMLScanner.hpp create mode 100644 arabica/include/Taggle/impl/html/HTMLSchema.hpp create mode 100644 arabica/include/XML/QName.hpp create mode 100644 arabica/include/XML/XMLCharacterClasses.hpp create mode 100755 arabica/include/XML/strings.hpp create mode 100644 arabica/include/convert/impl/codecvt_specialisations.hpp create mode 100644 arabica/include/convert/impl/ucs2_utf16.hpp create mode 100644 arabica/include/convert/impl/ucs2_utf8.hpp create mode 100644 arabica/include/convert/utf8ucs2codecvt.hpp create mode 100644 arabica/include/io/convertstream.hpp create mode 100644 arabica/include/io/socket_stream.hpp create mode 100644 arabica/include/io/uri.hpp create mode 100644 arabica/include/text/UnicodeCharacters.hpp create mode 100644 arabica/include/text/normalize_whitespace.hpp create mode 100644 arabica/src/SAX/helpers/InputSourceResolver.cpp create mode 100644 arabica/src/SAX/wrappers/saxlibxml2.cpp create mode 100644 arabica/src/XML/XMLCharacterClasses.cpp create mode 100644 arabica/src/arabica.cpp create 
mode 100644 arabica/src/convert/impl/ucs2_utf16.cpp create mode 100644 arabica/src/convert/impl/ucs2_utf8.cpp create mode 100644 arabica/src/convert/utf8ucs2codecvt.cpp create mode 100644 arabica/src/io/uri.cpp create mode 100644 arabica/src/taggle/Schema.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d5baf008..29e6a7cd8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ if(CIMPP_BUILD_DOC) add_subdirectory(doc) endif() -add_subdirectory(thirdparty) +add_subdirectory(arabica) if(NOT USE_CIM_VERSION) set(USE_CIM_VERSION "CGMES_2.4.15_27JAN2020" CACHE STRING "Define CIM Version") diff --git a/arabica/.gitignore b/arabica/.gitignore new file mode 100644 index 000000000..3c415b35c --- /dev/null +++ b/arabica/.gitignore @@ -0,0 +1,103 @@ +*.o +*~ +*.tar.gz +*.tar.bz2 +*.zip +*.ilk +*.pdb +*.exp +*.lib +*.bsc +*.exe +*.lo +*.la +*.ncb +*.suo +*.lnk +*.dll +*.log +*.trs +*.sdf +*.opensdf +*.DS_Store* + +lib +doc/html + +.libs +.deps +.dirstamp +arabica.pc + +include/SAX/ArabicaConfig.hpp +include/SAX/stamp-h1 +tests/XSLT/test_path.hpp + +*/Debug* +*/Release* + +# IDE files +vs9/mangle.sln +vs9/*.user +vs10/mangle.sln +*.vcxproj.filters +*.vcxproj.user + +.idea + +# Examples +examples/DOM/dom2pyx +examples/DOM/domwriter +examples/SAX/pyx +examples/SAX/simple_handler +examples/SAX/writer +examples/SAX/xmlbase +examples/Taggle/taggle +examples/Utils/transcode +examples/XPath/xgrep +examples/XSLT/mangle +fractal*.html +tests/SAX/filter_test* +tests/DOM/dom_test* +tests/XPath/xpath_test* +tests/Taggle/taggle_test +tests/Utils/utils_test* +tests/XSLT/xslt_test* +bin/mandelbrot-*.html + +# Autoconf +m4/lt~obsolete.m4 +m4/ltversion.m4 +m4/ltsugar.m4 +m4/ltoptions.m4 +m4/libtool.m4 +test-driver +libtool +ltmain.sh +config.log +config.status +config.cache +configure +compile +autom4te.cache +missing +Makefile +Makefile.in +aclocal.m4 + +# CMake +CMakeCache.txt +CMakeFiles +CMakeFiles/* +cmake_install.cmake + +build/* +cmake-build-debug/ + +# Misc 
+*.stackdump + +spec +gmon.out +ipch +test_path.hpp diff --git a/arabica/CMakeLists.txt b/arabica/CMakeLists.txt new file mode 100644 index 000000000..ed4eff2fb --- /dev/null +++ b/arabica/CMakeLists.txt @@ -0,0 +1,152 @@ +cmake_minimum_required(VERSION 3.5) + +project(arabica) + +# Enable C++11 support +set(CMAKE_CXX_STANDARD 11) + +set(LIB_NAME arabica) + +set(ARABICA_MAJOR_VERSION 2016) +set(ARABICA_MINOR_VERSION January) + +# +# Build as shared library +# +option(BUILD_SHARED_LIBS "Build as shared libaries" OFF) + +# +# Set the used xml backend +# options: USE_MSXML, USE_EXPAT, USE_LIBXML2, USE_XERCES +set(ARABICA_XML_BACKEND USE_LIBXML2) + +configure_file(${CMAKE_CURRENT_LIST_DIR}/include/SAX/ArabicaConfig.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/SAX/ArabicaConfig.hpp) + +# +# find libxml2: LIBXML2_INCLUDE_DIR LIBXML2_LIBRARIES +if(ARABICA_XML_BACKEND STREQUAL USE_LIBXML2) + find_package(LibXml2) + set(ADDITIONAL_INC ${LIBXML2_INCLUDE_DIR}) + set(ADDITIONAL_LIB ${LIBXML2_LIBRARIES}) +endif() + +# +# platform check +set(BUILD_X64 TRUE) + +set(GENERATED_HEADER_FILES + ${CMAKE_CURRENT_BINARY_DIR}/include/SAX/ArabicaConfig.hpp +) +source_group("Generated Header Files" FILES ${GENERATED_HEADER_FILES}) + +set(PUBLIC_HEADER_FILES + include/SAX/AttributeList.hpp + include/SAX/Attributes.hpp + include/SAX/ContentHandler.hpp + include/SAX/DocumentHandler.hpp + include/SAX/DTDHandler.hpp + include/SAX/EntityResolver.hpp + include/SAX/ErrorHandler.hpp + include/SAX/HandlerBase.hpp + include/SAX/InputSource.hpp + include/SAX/IStreamHandle.hpp + include/SAX/Locator.hpp + include/SAX/Parser.hpp + include/SAX/ParserConfig.hpp + include/SAX/SAXException.hpp + include/SAX/saxfwd.hpp + include/SAX/SAXNotRecognizedException.hpp + include/SAX/SAXNotSupportedException.hpp + include/SAX/SAXParseException.hpp + include/SAX/XMLFilter.hpp + include/SAX/XMLReader.hpp + include/SAX/ext/Attributes2.hpp + include/SAX/ext/DeclHandler.hpp + include/SAX/ext/LexicalHandler.hpp + 
include/SAX/ext/Locator2.hpp + include/SAX/helpers/AttributeDefaults.hpp + include/SAX/helpers/AttributeListImpl.hpp + include/SAX/helpers/AttributesImpl.hpp + include/SAX/helpers/AttributeTypes.hpp + include/SAX/helpers/DefaultHandler.hpp + include/SAX/helpers/FeatureNames.hpp + include/SAX/helpers/InputSourceResolver.hpp + include/SAX/helpers/NamespaceSupport.hpp + include/SAX/helpers/PropertyNames.hpp + include/SAX/helpers/XMLFilterImpl.hpp + include/SAX/wrappers/saxlibxml2.hpp + include/Arabica/getparam.hpp + include/Arabica/StringAdaptor.hpp + include/XML/QName.hpp + include/XML/strings.hpp + include/XML/XMLCharacterClasses.hpp + include/io/convertstream.hpp + include/io/socket_stream.hpp + include/io/uri.hpp + include/convert/impl/codecvt_specialisations.hpp + include/convert/impl/ucs2_utf16.hpp + include/convert/impl/ucs2_utf8.hpp + include/convert/utf8ucs2codecvt.hpp + include/text/normalize_whitespace.hpp + include/text/UnicodeCharacters.hpp + include/Taggle/impl/Element.hpp + include/Taggle/impl/ElementType.hpp + include/Taggle/impl/html/HTMLModels.hpp + include/Taggle/impl/html/HTMLScanner.hpp + include/Taggle/impl/html/HTMLSchema.hpp + include/Taggle/impl/Parser.hpp + include/Taggle/impl/ScanHandler.hpp + include/Taggle/impl/Scanner.hpp + include/Taggle/impl/Schema.hpp + include/Taggle/impl/SchemaImpl.hpp + include/Taggle/Taggle.hpp + ) +source_group("Header Files" FILES ${PUBLIC_HEADER_FILES}) + +set(SOURCE_FILES + src/arabica.cpp + src/XML/XMLCharacterClasses.cpp + src/SAX/helpers/InputSourceResolver.cpp + src/io/uri.cpp + src/convert/impl/ucs2_utf16.cpp + src/convert/impl/ucs2_utf8.cpp + src/convert/utf8ucs2codecvt.cpp + src/taggle/Schema.cpp +) +source_group("Source Files" FILES ${SOURCE_FILES}) + + +add_library (${LIB_NAME} + ${GENERATED_HEADER_FILES} + ${PUBLIC_HEADER_FILES} + ${SOURCE_FILES} +) +target_compile_definitions(${LIB_NAME} PUBLIC "$<$:ARABICA_DEBUG>") +target_compile_definitions(${LIB_NAME} PUBLIC ARABICA_NOT_USE_PRAGMA_LINKER_OPTIONS) 
+target_include_directories(${LIB_NAME} PUBLIC $) +target_include_directories(${LIB_NAME} PUBLIC ${ADDITIONAL_INC} ${Boost_INCLUDE_DIRS}) +target_include_directories(${LIB_NAME} PUBLIC $) +# link backend +target_link_libraries(${LIB_NAME} ${ADDITIONAL_LIB}) + +set_target_properties(${LIB_NAME} PROPERTIES FOLDER "3rdparty/arabica") + +# +# Install library +# +install(TARGETS ${LIB_NAME} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}/static +) + +install(DIRECTORY include/ + DESTINATION include/${PROJECT_NAME} + FILES_MATCHING + PATTERN *.hpp + PATTERN *.h +) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/SAX/ArabicaConfig.hpp + DESTINATION include/${PROJECT_NAME}/SAX +) diff --git a/arabica/LICENSE b/arabica/LICENSE new file mode 100644 index 000000000..2cef36588 --- /dev/null +++ b/arabica/LICENSE @@ -0,0 +1,32 @@ +Copyright 2001-2017 Jez UK Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the following +conditions are met: + + Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + Neither the name of Jez UK Ltd nor the names of + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +OF SUCH DAMAGE. diff --git a/arabica/include/Arabica/StringAdaptor.hpp b/arabica/include/Arabica/StringAdaptor.hpp new file mode 100644 index 000000000..5983f813e --- /dev/null +++ b/arabica/include/Arabica/StringAdaptor.hpp @@ -0,0 +1,252 @@ +#ifndef ARABICA_UTILS_STRING_ADAPTOR_H +#define ARABICA_UTILS_STRING_ADAPTOR_H + +#include +#include +#include +#include +#include + +namespace Arabica +{ + +template class default_string_adaptor; + +template +class default_string_adaptor_base +{ +public: + typedef stringT string_type; + typedef typename string_type::const_iterator const_iterator; + typedef typename string_type::iterator mutable_iterator; + typedef typename string_type::const_reverse_iterator const_reverse_iterator; + typedef typename string_type::iterator iterator; + typedef typename string_type::value_type value_type; + typedef typename string_type::size_type size_type; + + virtual ~default_string_adaptor_base() {} + + static size_type npos() + { + return static_cast(-1); + } + + //todo: is this safe? + template + static inline string_type construct(InputIterator from, InputIterator to) + { + return string_type(from, to); + } + + static inline string_type construct(const_iterator from, const_iterator to) + { + return string_type(from, to); + } + + static inline string_type construct(const std::basic_string& str) + { + return construct(str.begin(), str.end()); + } // construct + + static string_type construct(const value_type* str) + { + return str ? 
string_type(str) : string_type(); + } + + static const string_type& empty_string() { static string_type es; return es; } + + //todo: fix for utf8 + static size_type length(const string_type& str) { return str.length(); } + + // all these functions should operate as std::string member functions do + static bool empty(const string_type& str) { return str.empty(); } + static size_type find(const string_type& str, value_type what) { return str.find(what); } + static size_type find(const string_type& str, const string_type& what) { return str.find(what); } + static size_type find(const string_type& str, value_type what, size_type from) { return str.find(what, from); } + static size_type find(const string_type& str, const string_type& what, size_type from) { return str.find(what, from); } + static string_type substr(const string_type& str, const size_type& offset) { return str.substr(offset); } + static string_type substr(const string_type& str, const size_type& offset, const size_type& count) { return str.substr(offset, count); } + static void append(string_type& str, const string_type& a) { str.append(a); } + static void append(string_type& str, const value_type& a) { str += a; } + static string_type concat(const string_type& str, const string_type& a) { return str + a; } + static string_type concat(const string_type& str, const value_type& a) { return str + a; } + static void insert(string_type& str, size_type offset, const string_type& a) { str.insert(offset, a); } + static void replace(string_type& str, size_type offset, size_type count, const string_type& a) { str.replace(offset, count, a); } + + static const_iterator begin(const string_type& str) { return str.begin(); } + static const_iterator end(const string_type& str) { return str.end(); } + + static iterator begin(string_type& str) { return str.begin(); } + static iterator end(string_type& str) { return str.end(); } + + static const_reverse_iterator rbegin(const string_type& str) { return str.rbegin(); } + + 
// only used to constuct error strings - don't have to be highly efficient! + static std::string asStdString(const string_type& str); + +#ifndef ARABICA_NO_WCHAR_T + static string_type construct_from_utf16(const wchar_t* str); + static string_type construct_from_utf16(const wchar_t* str, int length); + static std::wstring asStdWString(const string_type& str); + + typedef Arabica::io::basic_iconvertstream, + char, std::char_traits > widener_t; + typedef Arabica::io::basic_oconvertstream, + char, std::char_traits > narrower_t; + + + static const std::locale& utf8ucs2_locale() + { + static const std::locale loc = std::locale(std::locale(), new Arabica::convert::utf8ucs2codecvt); + return loc; + } +#endif //ARABICA_NO_WCHAR_T + +}; // class default_string_adaptor_base + + +// specialize for std::string and std::wstring +template<> +class default_string_adaptor : + public string_adaptor_tag, + public default_string_adaptor_base +{ +public: + + static char convert_from_utf8(char c) { return c; } + + static std::string construct_from_utf8(const char* str) + { + return str ? std::string(str) : std::string(); + } // construct_from_utf8 + + static std::string construct_from_utf8(const char* str, int length) + { + return std::string(str, length); + } // construct_from_utf8 + + static const std::string& asStdString(const std::string& str) + { + return str; + } // asStdString + +#ifndef ARABICA_NO_WCHAR_T + static std::string construct_from_utf16(const wchar_t* str) + { + narrower_t n; + n.imbue(utf8ucs2_locale()); + n.str(str ? 
str : L""); + //n << str; + return n.str(); + } + + static std::string construct_from_utf16(const wchar_t* str, int length) + { + narrower_t n; + n.imbue(utf8ucs2_locale()); + n.str(std::wstring(str, length)); + //for(int i = 0; i < length; ++i) + // n << str[i]; + return n.str(); + } // construct_from_utf16 + + static std::wstring asStdWString(const std::string& str) + { + widener_t w; + w.imbue(utf8ucs2_locale()); + w.str(str); + return w.str(); + } // toStdWString +#endif //ARABICA_NO_WCHAR_T + +}; // class default_string_adaptor + + +#ifndef ARABICA_NO_WCHAR_T + +template<> +class default_string_adaptor : + public string_adaptor_tag, + public default_string_adaptor_base +{ +public: + + static wchar_t makeValueT(char c) + { + return static_cast(c); + } // makeValueT + + template + static inline string_type construct(InputIterator from, InputIterator to) + { + return string_type(from, to); + } + + static inline string_type construct(const_iterator from, const_iterator to) + { + return string_type(from, to); + } + + static inline string_type construct(const std::basic_string& str) + { + return construct(str.begin(), str.end()); + } // construct + + static string_type construct(const value_type* str) + { + return str ? string_type(str) : string_type(); + } + + static std::wstring construct(const std::string& str) + { + return construct_from_utf8(str.c_str()); + } + + static std::wstring construct_from_utf8(const char* str) + { + widener_t w; + w.imbue(utf8ucs2_locale()); + w.str(str ? str : ""); + return w.str(); + } + + static std::wstring construct_from_utf8(const char* str, int length) + { + widener_t w; + w.imbue(utf8ucs2_locale()); + w.str(std::string(str, length)); + return w.str(); + } + + static std::wstring construct_from_utf16(const wchar_t* str) + { + return str ? 
std::wstring(str) : std::wstring(); + } + + static std::wstring construct_from_utf16(const wchar_t* str, int length) + { + return std::wstring(str, length); + } + + static std::string asStdString(const std::wstring& str) + { + narrower_t n; + n.imbue(utf8ucs2_locale()); + n.str(str); + return n.str(); + } // toStdString + + static const std::wstring& asStdWString(const std::wstring& str) + { + return str; + } // toStdWString + +}; // class default_string_adaptor + + +#endif // ARABICA_NO_WCHAR_T + +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/Arabica/getparam.hpp b/arabica/include/Arabica/getparam.hpp new file mode 100644 index 000000000..3d54a645a --- /dev/null +++ b/arabica/include/Arabica/getparam.hpp @@ -0,0 +1,54 @@ +#ifndef ARABICA_UTILS_GET_PARAM_HPP +#define ARABICA_UTILS_GET_PARAM_HPP + +#ifdef ARABICA_HAVE_BOOST +#include +#include +#endif + +namespace Arabica +{ + +struct nil_t { }; + +#ifdef ARABICA_HAVE_BOOST +template +struct get_param +{ + typedef typename boost::mpl::if_< + boost::is_base_and_derived + , T0 + , typename boost::mpl::if_< + boost::is_base_and_derived + , T1 + , DefaultT + >::type + >::type type; +}; // get_param +#else +template +struct chosen_type { typedef T0 type; }; + +template +struct chosen_type { typedef DefaultT type; }; + +template +struct get_param +{ + typedef typename chosen_type::type type; +}; +#endif + +template +struct get_string_adaptor +{ + typedef typename get_param, + T0, + T1>::type type; +}; + +} // namespace Arabica + +#endif + diff --git a/arabica/include/Arabica/mbstate.hpp b/arabica/include/Arabica/mbstate.hpp new file mode 100644 index 000000000..6edf72688 --- /dev/null +++ b/arabica/include/Arabica/mbstate.hpp @@ -0,0 +1,23 @@ +#ifndef ARABICA_MBSTATE_T +#define ARABICA_MBSTATE_T + +#ifdef ARABICA_NO_STD_MBSTATE_T + +#ifndef ARABICA_NO_MBSTATE_T +#include +namespace std { + using ::mbstate_t; +} +#else +namespace std { + typedef struct + { + int dummy; /* So that {0} is a 
valid constant initializer. */ + } mbstate_t; +} +#endif + + +#endif + +#endif diff --git a/arabica/include/Arabica/stringadaptortag.hpp b/arabica/include/Arabica/stringadaptortag.hpp new file mode 100644 index 000000000..a9706d6a8 --- /dev/null +++ b/arabica/include/Arabica/stringadaptortag.hpp @@ -0,0 +1,13 @@ +#ifndef ARABICA_STRING_ADAPTOR_TAG_HPP +#define ARABICA_STRING_ADAPTOR_TAG_HPP + +namespace Arabica +{ + +struct string_adaptor_tag { + virtual ~string_adaptor_tag() {} +}; + +} // namespace Arabica + +#endif diff --git a/arabica/include/SAX/ArabicaConfig.hpp.in b/arabica/include/SAX/ArabicaConfig.hpp.in new file mode 100644 index 000000000..1934dc60f --- /dev/null +++ b/arabica/include/SAX/ArabicaConfig.hpp.in @@ -0,0 +1,12 @@ +#ifndef ARABICA_ARABICA_CONFIG_H +#define ARABICA_ARABICA_CONFIG_H + +#cmakedefine ARABICA_NO_WCHAR_T +#cmakedefine ARABICA_VS6_WORKAROUND +#cmakedefine ARABICA_NO_CODECVT_SPECIALISATIONS +#cmakedefine ARABICA_USE_WINSOCK +#cmakedefine ARABICA_WINDOWS +#cmakedefine ARABICA_HAVE_BOOST +#define ARABICA_@ARABICA_XML_BACKEND@ + +#endif // ARABICA_ARABICA_CONFIG_H diff --git a/arabica/include/SAX/AttributeList.hpp b/arabica/include/SAX/AttributeList.hpp new file mode 100644 index 000000000..99fc82fa8 --- /dev/null +++ b/arabica/include/SAX/AttributeList.hpp @@ -0,0 +1,193 @@ +#ifndef ARABICA_ATTRIBUTES_LIST_H +#define ARABICA_ATTRIBUTES_LIST_H + +// SAX Attribute List Interface. +// $Id$ + +#include + +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Interface for an element's attribute specifications. + * + *

This is the original SAX1 interface for reporting an element's + * attributes. Unlike the new {@link Attributes Attributes} + * interface, it does not support Namespace-related information.

+ * + *

When an attribute list is supplied as part of a + * {@link DocumentHandler#startElement startElement} + * event, the list will return valid results only during the + * scope of the event; once the event handler returns control + * to the parser, the attribute list is invalid. To save a + * persistent copy of the attribute list, use the SAX1 + * {@link AttributeListImpl AttributeListImpl} + * helper class.

+ * + *

An attribute list includes only attributes that have been + * specified or defaulted: #IMPLIED attributes will not be included.

+ * + *

There are two ways for the SAX application to obtain information + * from the AttributeList. First, it can iterate through the entire + * list:

+ * + *
+ * void startElement(const stringT& name, const AttributeListT& atts) 
+ * {
+ *   for(int i = 0; i < atts.getLength(); ++i) 
+ *   {
+ *     stringT name = atts.getName(i);
+ *     stringT type = atts.getType(i);
+ *     stringT value = atts.getValue(i);
+ *     [...]
+ *   }
+ * }
+ * 
+ * + *

(Note that the result of getLength() will be zero if there + * are no attributes.) + * + *

As an alternative, the application can request the value or + * type of specific attributes:

+ * + *
+ * void startElement(const string& name, const AttributeListT& atts) 
+ * {
+ *   stringT identifier = atts.getValue("id");
+ *   stringT label = atts.getValue("label");
+ *   [...]
+ * }
+ * 
+ * + * @deprecated This interface has been replaced by the SAX2 + * {@link Attributes Attributes} + * interface, which includes Namespace support. + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see DocumentHandler#startElement startElement + * @see AttributeListImpl + */ +template +class AttributeList +{ +public: + typedef string_type stringT; + + virtual ~AttributeList() { } + + // + // Iteration methods. + // + /** + * Return the number of attributes in this list. + * + *

The SAX parser may provide attributes in any + * arbitrary order, regardless of the order in which they were + * declared or specified. The number of attributes may be + * zero.

+ * + * @return The number of attributes in the list. + */ + virtual int getLength() const = 0; + /** + * Return the name of an attribute in this list (by position). + * + *

The names must be unique: the SAX parser shall not include the + * same attribute twice. Attributes without values (those declared + * #IMPLIED without a value specified in the start tag) will be + * omitted from the list.

+ * + *

If the attribute name has a namespace prefix, the prefix + * will still be attached.

+ * + * @param i The index of the attribute in the list (starting at 0). + * @return The name of the indexed attribute, or an empty string + * if the index is out of range. + * @see #getLength + */ + virtual const stringT& getName(int i) const = 0; + /** + * Return the type of an attribute in the list (by position). + * + *

The attribute type is one of the strings "CDATA", "ID", + * "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", "ENTITIES", + * or "NOTATION" (always in upper case).

+ * + *

If the parser has not read a declaration for the attribute, + * or if the parser does not report attribute types, then it must + * return the value "CDATA" as stated in the XML 1.0 Recommendation + * (clause 3.3.3, "Attribute-Value Normalization").

+ * + *

For an enumerated attribute that is not a notation, the + * parser will report the type as "NMTOKEN".

+ * + * @param i The index of the attribute in the list (starting at 0). + * @return The attribute type as a string, or + * an empty string if the index is out of range. + * @see #getLength + * @see #getType(const stringT&) + */ + virtual const stringT& getType(int i) const = 0; + /** + * Return the value of an attribute in the list (by position). + * + *

If the attribute value is a list of tokens (IDREFS, + * ENTITIES, or NMTOKENS), the tokens will be concatenated + * into a single string separated by whitespace.

+ * + * @param i The index of the attribute in the list (starting at 0). + * @return The attribute value as a string, or + * an empty string if the index is out of range. + * @see #getLength + * @see #getValue(stringT) + */ + virtual const stringT& getValue(int i) const = 0; + + // + // Lookup methods. + // + /** + * Return the type of an attribute in the list (by name). + * + *

The return value is the same as the return value for + * getType(int).

+ * + *

If the attribute name has a namespace prefix in the document, + * the application must include the prefix here.

+ * + * @param name The name of the attribute. + * @return The attribute type as a string, or an empty string if no + * such attribute exists. + * @see #getType(int) + */ + virtual const stringT& getType(const stringT& name) const = 0; + /** + * Return the value of an attribute in the list (by name). + * + *

The return value is the same as the return value for + * getValue(int).

+ * + *

If the attribute name has a namespace prefix in the document, + * the application must include the prefix here.

+ * + * @param name The name of the attribute in the list. + * @return The attribute value as a string, or an empty string if + * no such attribute exists. + * @see #getValue(int) + */ + virtual const stringT& getValue(const stringT& name) const = 0; + +}; // class AttributeList + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/Attributes.hpp b/arabica/include/SAX/Attributes.hpp new file mode 100644 index 000000000..da38c5771 --- /dev/null +++ b/arabica/include/SAX/Attributes.hpp @@ -0,0 +1,272 @@ +#ifndef ARABICA_ATTRIBUTES_H +#define ARABICA_ATTRIBUTES_H + +// Attributes.h +// $Id$ + +#include + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Interface for a list of XML attributes. + * + *

This interface allows access to a list of attributes in + * three different ways:

+ * + *
    + *
  1. by attribute index;
  2. + *
  3. by Namespace-qualified name; or
  4. + *
  5. by qualified (prefixed) name.
  6. + *
+ * + *

The list will not contain attributes that were declared + * #IMPLIED but not specified in the start tag. It will also not + * contain attributes used as Namespace declarations (xmlns*) unless + * the http://xml.org/sax/features/namespace-prefixes + * feature is set to true (it is false by + * default).

+ * + *

If the namespace-prefixes feature (see above) is false, + * access by qualified name may not be available; if the + * http://xml.org/sax/features/namespaces + * feature is false, access by Namespace-qualified names + * may not be available.

+ * + *

This interface replaces the now-deprecated SAX1 {@link + * AttributeList AttributeList} interface, which does not + * contain Namespace support. In addition to Namespace support, it + * adds the getIndex methods (below).

+ * + *

The order of attributes in the list is unspecified, and will + * vary from implementation to implementation.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see AttributesImpl + */ +template > +class AttributeType +{ +public: + static const string_type CDATA; + static const string_type ID; + static const string_type IDREF; + static const string_type IDREFS; + static const string_type NMTOKEN; + static const string_type NMTOKENS; + static const string_type ENTITY; + static const string_type ENTITIES; + static const string_type NOTATION; +}; // AttributeType + +template > +class Attributes +{ +public: + + typedef string_type stringT; + + virtual ~Attributes() { } + + // + // indexed access + // + /** + * Return the number of attributes in the list. + * + *

Once you know the number of attributes, you can iterate + * through the list.

+ * + * @return The number of attributes in the list. + * @see #getURI(unsigned int) + * @see #getLocalName(unsigned int) + * @see #getQName(unsigned int) + * @see #getType(unsigned int) + * @see #getValue(unsigned int) + */ + virtual int getLength() const = 0; + + /** + * Look up an attribute's Namespace URI by index. + * + * @param index The attribute index (zero-based). + * @return The Namespace URI, or the empty string if none + * is available, or if the index is out of range. + * @see #getLength + */ + virtual stringT getURI(unsigned int index) const = 0; + + /** + * Look up an attribute's local name by index. + * + * @param index The attribute index (zero-based). + * @return The local name, or the empty string if Namespace + * processing is not being performed, or + * if the index is out of range. + * @see #getLength + */ + virtual stringT getLocalName(unsigned int index) const = 0; + + /** + * Look up an attribute's XML 1.0 qualified name by index. + * + * @param index The attribute index (zero-based). + * @return The XML 1.0 qualified name, or the empty string + * if none is available, or if the index + * is out of range. + * @see #getLength + */ + virtual stringT getQName(unsigned int index) const = 0; + + /** + * Look up an attribute's type by index. + * + *

The attribute type is one of the strings "CDATA", "ID", + * "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", "ENTITIES", + * or "NOTATION" (always in upper case).

+ * + *

If the parser has not read a declaration for the attribute, + * or if the parser does not report attribute types, then it must + * return the value "CDATA" as stated in the XML 1.0 Recommendation + * (clause 3.3.3, "Attribute-Value Normalization").

+ * + *

For an enumerated attribute that is not a notation, the + * parser will report the type as "NMTOKEN".

+ * + * @param index The attribute index (zero-based). + * @return The attribute's type as a string, or an empty string if the + * index is out of range. + * @see #getLength + */ + virtual stringT getType(unsigned int index) const = 0; + + /** + * Look up an attribute's value by index. + * + *

If the attribute value is a list of tokens (IDREFS, + * ENTITIES, or NMTOKENS), the tokens will be concatenated + * into a single string with each token separated by a + * single space.

+ * + * @param index The attribute index (zero-based). + * @return The attribute's value as a string, or an empty string if the + * index is out of range. + * @see #getLength + */ + virtual stringT getValue(unsigned int index) const = 0; + + // + // name based query + // + /** + * Look up the index of an attribute by Namespace name. + * + * @param uri The Namespace URI, or the empty string if + * the name has no Namespace URI. + * @param localName The attribute's local name. + * @return The index of the attribute, or -1 if it does not + * appear in the list. + */ + virtual int getIndex(const stringT& uri, const stringT& localName) const = 0; + + /** + * Look up the index of an attribute by XML 1.0 qualified name. + * + * @param qName The qualified (prefixed) name. + * @return The index of the attribute, or -1 if it does not + * appear in the list. + */ + virtual int getIndex(const stringT& qName) const = 0; + + /** + * Look up an attribute's type by Namespace name. + * + *

See {@link #getType(unsigned int) getType(unsigned int)} for a description + * of the possible types.

+ * + * @param uri The Namespace URI, or the empty String if the + * name has no Namespace URI. + * @param localName The local name of the attribute. + * @return The attribute type as a string, or an empty string if the + * attribute is not in the list or if Namespace + * processing is not being performed. + */ + virtual stringT getType(const stringT& uri, const stringT& localName) const = 0; + + /** + * Look up an attribute's type by XML 1.0 qualified name. + * + *

See {@link #getType(unsigned int) getType(unsigned int)} for a description + * of the possible types.

+ * + * @param qName The XML 1.0 qualified name. + * @return The attribute type as a string, or an empty string if the + * attribute is not in the list or if qualified names + * are not available. + */ + virtual stringT getType(const stringT& qName) const = 0; + + /** + * Look up an attribute's value by Namespace name. + * + *

See {@link #getValue(unsigned int) getValue(unsigned int)} for a + * description of the possible values.

+ * + * @param uri The Namespace URI, or the empty String if the + * name has no Namespace URI. + * @param localName The local name of the attribute. + * @return The attribute value as a string, or an empty string if the + * attribute is not in the list. + */ + virtual stringT getValue(const stringT& uri, const stringT& localName) const = 0; + + /** + * Look up an attribute's value by XML 1.0 qualified name. + * + *

See {@link #getValue(unsigned int) getValue(unsigned int)} for a + * description of the possible values.

+ * + * @param qName The XML 1.0 qualified name. + * @return The attribute value as a string, or an empty string if the + * attribute is not in the list or if qualified names + * are not available. + */ + virtual stringT getValue(const stringT& qName) const = 0; + +private: + static AttributeType types_; +}; // class Attributes + +template +const string_type AttributeType::CDATA = string_adaptor::construct_from_utf8("CDATA"); +template +const string_type AttributeType::ID = string_adaptor::construct_from_utf8("ID"); +template +const string_type AttributeType::IDREF = string_adaptor::construct_from_utf8("IDREF"); +template +const string_type AttributeType::IDREFS = string_adaptor::construct_from_utf8("IDREFS"); +template +const string_type AttributeType::NMTOKEN = string_adaptor::construct_from_utf8("NMTOKEN"); +template +const string_type AttributeType::NMTOKENS = string_adaptor::construct_from_utf8("NMTOKENS"); +template +const string_type AttributeType::ENTITY = string_adaptor::construct_from_utf8("ENTITY"); +template +const string_type AttributeType::ENTITIES = string_adaptor::construct_from_utf8("ENTITIES"); +template +const string_type AttributeType::NOTATION = string_adaptor::construct_from_utf8("NOTATION"); + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/ContentHandler.hpp b/arabica/include/SAX/ContentHandler.hpp new file mode 100644 index 000000000..f48e5af27 --- /dev/null +++ b/arabica/include/SAX/ContentHandler.hpp @@ -0,0 +1,341 @@ +#ifndef ARABICA_CONTENTHANDLER_H +#define ARABICA_CONTENTHANDLER_H + +// ContentHandler.h +// $Id$ + +#include + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +template class Locator; + +/** + * Receive notification of the logical content of a document. + * + *

This is the main interface that most SAX applications + * implement: if the application needs to be informed of basic parsing + * events, it implements this interface and registers an instance with + * the SAX parser using the {@link XMLReader#setContentHandler + * setContentHandler} method. The parser uses the instance to report + * basic document-related events like the start and end of elements + * and character data.

+ * + *

The order of events in this interface is very important, and + * mirrors the order of information in the document itself. For + * example, all of an element's content (character data, processing + * instructions, and/or subelements) will appear, in order, between + * the startElement event and the corresponding endElement event.

+ * + *

This interface is similar to the now-deprecated SAX 1.0 + * DocumentHandler interface, but it adds support for Namespaces + * and for reporting skipped entities (in non-validating XML + * processors).

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see XMLReader + * @see DTDHandler + * @see ErrorHandler + */ +template > +class ContentHandler +{ +public: + typedef Locator LocatorT; + typedef Attributes AttributesT; + + virtual ~ContentHandler() { } + + /** + * Receive an object for locating the origin of SAX document events. + * + *

SAX parsers are strongly encouraged (though not absolutely + * required) to supply a locator: if it does so, it must supply + * the locator to the application by invoking this method before + * invoking any of the other methods in the ContentHandler + * interface.

+ * + *

The locator allows the application to determine the end + * position of any document-related event, even if the parser is + * not reporting an error. Typically, the application will + * use this information for reporting its own errors (such as + * character content that does not match an application's + * business rules). The information returned by the locator + * is probably not sufficient for use with a search engine.

+ * + *

Note that the locator will return correct information only + * during the invocation of the events in this interface. The + * application should not attempt to use it at any other time.

+ * + * @param locator An object that can return the location of + * any SAX document event. + * @see Locator + */ + virtual void setDocumentLocator(const LocatorT& locator) = 0; + + /** + * Receive notification of the beginning of a document. + * + *

The SAX parser will invoke this method only once, before any + * other methods in this interface or in {@link DTDHandler + * DTDHandler} (except for {@link #setDocumentLocator + * setDocumentLocator}).

+ * + * @exception SAXException Any SAX exception. + * @see #endDocument + */ + virtual void startDocument() = 0; + /** + * Receive notification of the end of a document. + * + *

The SAX parser will invoke this method only once, and it will + * be the last method invoked during the parse. The parser shall + * not invoke this method until it has either abandoned parsing + * (because of an unrecoverable error) or reached the end of + * input.

+ * + * @exception SAXException Any SAX exception. + * @see #startDocument + */ + virtual void endDocument() = 0; + + /** + * Begin the scope of a prefix-URI Namespace mapping. + * + *

The information from this event is not necessary for + * normal Namespace processing: the SAX XML reader will + * automatically replace prefixes for element and attribute + * names when the http://xml.org/sax/features/namespaces + * feature is true (the default).

+ * + *

There are cases, however, when applications need to + * use prefixes in character data or in attribute values, + * where they cannot safely be expanded automatically; the + * start/endPrefixMapping event supplies the information + * to the application to expand prefixes in those contexts + * itself, if necessary.

+ * + *

Note that start/endPrefixMapping events are not + * guaranteed to be properly nested relative to each-other: + * all startPrefixMapping events will occur before the + * corresponding {@link #startElement startElement} event, + * and all {@link #endPrefixMapping endPrefixMapping} + * events will occur after the corresponding {@link #endElement + * endElement} event, but their order is not otherwise + * guaranteed.

+ * + *

There should never be start/endPrefixMapping events for the + * "xml" prefix, since it is predeclared and immutable.

+ * + * @param prefix The Namespace prefix being declared. + * @param uri The Namespace URI the prefix is mapped to. + * @exception SAXException The client may throw + * an exception during processing. + * @see #endPrefixMapping + * @see #startElement + */ + virtual void startPrefixMapping(const string_type& prefix, const string_type& uri) = 0; + /** + * End the scope of a prefix-URI mapping. + * + *

See {@link #startPrefixMapping startPrefixMapping} for + * details. This event will always occur after the corresponding + * {@link #endElement endElement} event, but the order of + * {@link #endPrefixMapping endPrefixMapping} events is not otherwise + * guaranteed.

+ * + * @param prefix The prefix that was being mapped. + * @exception SAXException The client may throw + * an exception during processing. + * @see #startPrefixMapping + * @see #endElement + */ + virtual void endPrefixMapping(const string_type& prefix) = 0; + + /** + * Receive notification of the beginning of an element. + * + *

The Parser will invoke this method at the beginning of every + * element in the XML document; there will be a corresponding + * {@link #endElement endElement} event for every startElement event + * (even when the element is empty). All of the element's content will be + * reported, in order, before the corresponding endElement + * event.

+ * + *

This event allows up to three name components for each + * element:

+ * + *
    + *
  1. the Namespace URI;
  2. + *
  3. the local name; and
  4. + *
  5. the qualified (prefixed) name.
  6. + *
+ * + *

Any or all of these may be provided, depending on the + * values of the http://xml.org/sax/features/namespaces + * and the http://xml.org/sax/features/namespace-prefixes + * properties:

+ * + *
    + *
  • the Namespace URI and local name are required when + * the namespaces property is true (the default), and are + * optional when the namespaces property is false (if one is + * specified, both must be);
  • + *
  • the qualified name is required when the namespace-prefixes property + * is true, and is optional when the namespace-prefixes property + * is false (the default).
  • + *
+ * + *

Note that the attribute list provided will contain only + * attributes with explicit values (specified or defaulted): + * #IMPLIED attributes will be omitted. The attribute list + * will contain attributes used for Namespace declarations + * (xmlns* attributes) only if the + * http://xml.org/sax/features/namespace-prefixes + * property is true (it is false by default, and support for a + * true value is optional).

+ * + * @param namespaceURI The Namespace URI, or the empty string if the + * element has no Namespace URI or if Namespace + * processing is not being performed. + * @param localName The local name (without prefix), or the + * empty string if Namespace processing is not being + * performed. + * @param qName The qualified name (with prefix), or the + * empty string if qualified names are not available. + * @param atts The attributes attached to the element. If + * there are no attributes, it shall be an empty + * Attributes object. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see #endElement + * @see Attributes + */ + virtual void startElement(const string_type& namespaceURI, const string_type& localName, + const string_type& qName, const AttributesT& atts) = 0; + /** + * Receive notification of the end of an element. + * + *

The SAX parser will invoke this method at the end of every + * element in the XML document; there will be a corresponding + * {@link #startElement startElement} event for every endElement + * event (even when the element is empty).

+ * + *

For information on the names, see startElement.

+ * + * @param namespaceURI The Namespace URI, or the empty string if the + * element has no Namespace URI or if Namespace + * processing is not being performed. + * @param localName The local name (without prefix), or the + * empty string if Namespace processing is not being + * performed. + * @param qName The qualified XML 1.0 name (with prefix), or the + * empty string if qualified names are not available. + * @exception SAXException Any SAX exception. + */ + virtual void endElement(const string_type& namespaceURI, const string_type& localName, + const string_type& qName) = 0; + + /** + * Receive notification of character data. + * + *

The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity so that the Locator + * provides useful information.

+ * + *

The application must not attempt to read from the array + * outside of the specified range.

+ * + *

Note that some parsers will report whitespace in element + * content using the {@link #ignorableWhitespace ignorableWhitespace} + * method rather than this one (validating parsers must + * do so).

+ * + * @param ch The characters from the XML document. + * @exception SAXException Any SAX exception. + * @see #ignorableWhitespace + * @see Locator + */ + virtual void characters(const string_type& ch) = 0; + /** + * Receive notification of ignorable whitespace in element content. + * + *

Validating Parsers must use this method to report each chunk + * of whitespace in element content (see the W3C XML 1.0 recommendation, + * section 2.10): non-validating parsers may also use this method + * if they are capable of parsing and using content models.

+ * + *

SAX parsers may return all contiguous whitespace in a single + * chunk, or they may split it into several chunks; however, all of + * the characters in any single event must come from the same + * external entity, so that the Locator provides useful + * information.

+ * + *

The application must not attempt to read from the array + * outside of the specified range.

+ * + * @param ch The characters from the XML document. + * @exception SAXException Any SAX exception. + * @see #characters + */ + virtual void ignorableWhitespace(const string_type& ch) = 0; + + /** + * Receive notification of a processing instruction. + * + *

The Parser will invoke this method once for each processing + * instruction found: note that processing instructions may occur + * before or after the main document element.

+ * + *

A SAX parser must never report an XML declaration (XML 1.0, + * section 2.8) or a text declaration (XML 1.0, section 4.3.1) + * using this method.

+ * + * @param target The processing instruction target. + * @param data The processing instruction data, or an empty string if + * none was supplied. The data does not include any + * whitespace separating it from the target. + * @exception SAXException Any SAX exception. + */ + virtual void processingInstruction(const string_type& target, const string_type& data) = 0; + + /** + * Receive notification of a skipped entity. + * + *

The Parser will invoke this method once for each entity + * skipped. Non-validating processors may skip entities if they + * have not seen the declarations (because, for example, the + * entity was declared in an external DTD subset). All processors + * may skip external entities, depending on the values of the + * http://xml.org/sax/features/external-general-entities + * and the + * http://xml.org/sax/features/external-parameter-entities + * properties.

+ * + * @param name The name of the skipped entity. If it is a + * parameter entity, the name will begin with '%', and if + * it is the external DTD subset, it will be the string + * "[dtd]". + * @exception SAXException Any SAX exception. + */ + virtual void skippedEntity(const string_type& name) = 0; +}; // class ContentHandler + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/DTDHandler.hpp b/arabica/include/SAX/DTDHandler.hpp new file mode 100644 index 000000000..51f53d44c --- /dev/null +++ b/arabica/include/SAX/DTDHandler.hpp @@ -0,0 +1,127 @@ +#ifndef ARABICA_DTDHANDLER_H +#define ARABICA_DTDHANDLER_H + +// DTDHandler.h +// $Id$ + +#include + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Receive notification of basic DTD-related events. + * + *

If a SAX application needs information about notations and + * unparsed entities, then the application implements this + * interface and registers an instance with the SAX parser using + * the parser's setDTDHandler method. The parser uses the + * instance to report notation and unparsed entity declarations to + * the application.

+ * + *

Note that this interface includes only those DTD events that + * the XML recommendation requires processors to report: + * notation and unparsed entity declarations.

+ * + *

The SAX parser may report these events in any order, regardless + * of the order in which the notations and unparsed entities were + * declared; however, all DTD events must be reported after the + * document handler's startDocument event, and before the first + * startElement event. + * (If the {@link LexicalHandler LexicalHandler} is + * used, these events must also be reported before the endDTD event.) + *

+ * + *

It is up to the application to store the information for + * future use (perhaps in a hash table or object tree). + * If the application encounters attributes of type "NOTATION", + * "ENTITY", or "ENTITIES", it can use the information that it + * obtained through this interface to find the entity and/or + * notation corresponding with the attribute value.

+ * + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version $Id$ + * @see XMLReader#setDTDHandler + */ + +template > +class DTDHandler +{ +public: + virtual ~DTDHandler() { } + + + /** + * Receive notification of a notation declaration event. + * + *

It is up to the application to record the notation for later + * reference, if necessary; + * notations may appear as attribute values and in unparsed entity + * declarations, and are sometimes used with processing instruction + * target names. + * When a system identifier is present, applications are responsible + * for knowing if it is used as a URL, and absolutizing it against + * the appropriate URI when appropriate. + * That base URI is available from {@link Locator#getSystemId} during + * this callback, assuming the parser provides a Locator.

+ * + *

At least one of publicId and systemId must be non-empty.

+ * + *

There is no guarantee that the notation declaration will be + * reported before any unparsed entities that use it.

+ * + * @param name The notation name. + * @param publicId The notation's public identifier, or an empty string if + * none was given. + * @param systemId The notation's system identifier, or an empty string if + * none was given. + * @exception SAXException Any SAX exception. + * @see #unparsedEntityDecl + * @see Attributes + */ + virtual void notationDecl(const string_type& name, + const string_type& publicId, + const string_type& systemId) = 0; + + + /** + * Receive notification of an unparsed entity declaration event. + * + *

Note that the notation name corresponds to a notation + * reported by the {@link #notationDecl notationDecl} event. + * It is up to the application to record the entity for later + * reference, if necessary; + * unparsed entities may appear as attribute values. + *

+ * + *

If the system identifier is a URL, the parser must resolve it + * fully before passing it to the application.

+ * + * @exception SAXException Any SAX exception. + * @param name The unparsed entity's name. + * @param publicId The entity's public identifier, or an empty string if none + * was given. + * @param systemId The entity's system identifier. + * @param notationName The name of the associated notation. + * @see #notationDecl + * @see Attributes + */ + virtual void unparsedEntityDecl(const string_type& name, + const string_type& publicId, + const string_type& systemId, + const string_type& notationName) = 0; + +}; // class DTDHandler + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/DocumentHandler.hpp b/arabica/include/SAX/DocumentHandler.hpp new file mode 100644 index 000000000..eddd39651 --- /dev/null +++ b/arabica/include/SAX/DocumentHandler.hpp @@ -0,0 +1,76 @@ +#ifndef ARABICA_DOCUMENTHANDLER_H +#define ARABICA_DOCUMENTHANDLER_H +// SAX document handler. +// $Id$ + +#include + +namespace Arabica +{ +namespace SAX +{ + +template class Locator; +template class AttributeList; + +/** + * Receive notification of general document events. + * + * This is the main interface that most SAX applications + * implement: if the application needs to be informed of basic parsing + * events, it implements this interface and registers an instance with + * the SAX parser using the setDocumentHandler method. The parser + * uses the instance to report basic document-related events like + * the start and end of elements and character data. + * + * The order of events in this interface is very important, and + * mirrors the order of information in the document itself. For + * example, all of an element's content (character data, processing + * instructions, and/or subelements) will appear, in order, between + * the startElement event and the corresponding endElement event. 
+ * + * Application writers who do not want to implement the entire + * interface can derive a class from HandlerBase, which implements + * the default functionality; parser writers can instantiate + * HandlerBase to obtain a default handler. The application can find + * the location of any document event using the Locator interface + * supplied by the Parser through the setDocumentLocator method. + * + * @author Jez Higgins, jez@jezuk.co.uk + * @version 0.1 + * @see Parser#setDocumentHandler + * @see Locator + * @see HandlerBase + */ +template +class DocumentHandler +{ +public: + typedef string_type stringT; + typedef Locator LocatorT; + typedef AttributeList AttributeListT; + + virtual ~DocumentHandler() { } + + virtual void setDocumentLocator(const LocatorT& locator) = 0; + + virtual void startDocument() = 0; + virtual void endDocument() = 0; + + virtual void startElement(const stringT& name, + const AttributeListT& atts) = 0; + virtual void endElement(const stringT& name) = 0; + + virtual void characters(const stringT& ch) = 0; + + virtual void ignorableWhitespace(const stringT& ch) = 0; + + virtual void processingInstruction(const stringT& target, + const stringT& data) = 0; +}; // class DocumentHandler + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/EntityResolver.hpp b/arabica/include/SAX/EntityResolver.hpp new file mode 100644 index 000000000..c9e1744ff --- /dev/null +++ b/arabica/include/SAX/EntityResolver.hpp @@ -0,0 +1,117 @@ +#ifndef ARABICA_ENTITYRESOLVER_H +#define ARABICA_ENTITYRESOLVER_H + +// EntityResolver.h +// $Id$ + +#include + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Basic interface for resolving entities. + * + *

If a SAX application needs to implement customized handling + * for external entities, it must implement this interface and + * register an instance with the SAX driver using the + * {@link XMLReader#setEntityResolver setEntityResolver} + * method.

+ * + *

The XML reader will then allow the application to intercept any + * external entities (including the external DTD subset and external + * parameter entities, if any) before including them.

+ * + *

Many SAX applications will not need to implement this interface, + * but it will be especially useful for applications that build + * XML documents from databases or other specialised input sources, + * or for applications that use URI types other than URLs.

+ * + *

The following resolver would provide the application + * with a special character stream for the entity with the system + * identifier "http://www.myhost.com/today":

+ * + *
+ * #include <EntityResolver>
+ * #include <InputSource>
+ *
+ * public class MyResolver implements SAX::EntityResolver 
+ * {
+ *   public InputSource resolveEntity (const std::string& publicId, const std::string& systemId)
+ *   {
+ *     if(systemId == "http://www.myhost.com/today") 
+ *     {
+ *       // return a special input source
+ *       MyStream* reader = new MyStream();
+ *       return SAX::InputSource(reader);
+ *     } else {
+ *       // request default behaviour
+ *       return SAX::InputSource();
+ *     }
+ *   }
+ * }
+ * 
+ * + *

The application can also use this interface to redirect system + * identifiers to local URIs or to look up replacements in a catalog + * (possibly by using the public identifier).

+ * + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see Parser#setEntityResolver + * @see InputSource + */ +template > +class EntityResolver +{ +public: + typedef InputSource InputSourceT; + + virtual ~EntityResolver() { }; + + /** + * Allow the application to resolve external entities. + * + *

The Parser will call this method before opening any external + * entity except the top-level document entity (including the + * external DTD subset, external entities referenced within the + * DTD, and external entities referenced within the document + * element): the application may request that the parser resolve + * the entity itself, that it use an alternative URI, or that it + * use an entirely different input source.

+ * + *

Application writers can use this method to redirect external + * system identifiers to secure and/or local URIs, to look up + * public identifiers in a catalogue, or to read an entity from a + * database or other input source (including, for example, a dialog + * box).

+ * + *

If the system identifier is a URL, the SAX parser must + * resolve it fully before reporting it to the application.

+ * + * @param publicId The public identifier of the external entity + * being referenced, or an empty string if none was supplied. + * @param systemId The system identifier of the external entity + * being referenced. + * @return An InputSource object describing the new input source, + * or a default-constructed InputSource to request that + * the parser open a regular URI connection to the system identifier. + * @exception SAXException Any SAX exception. + * @see InputSource + */ + virtual InputSourceT resolveEntity(const string_type& publicId, const string_type& systemId) = 0; +}; // class EntityResolver + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/ErrorHandler.hpp b/arabica/include/SAX/ErrorHandler.hpp new file mode 100644 index 000000000..bc280e068 --- /dev/null +++ b/arabica/include/SAX/ErrorHandler.hpp @@ -0,0 +1,123 @@ +#ifndef ARABICA_ERRORHANDLER_H +#define ARABICA_ERRORHANDLER_H + +// ErrorHandler.h +// $Id$ + +#include + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Basic interface for SAX error handlers. + * + *

If a SAX application needs to implement customized error + * handling, it must implement this interface and then register an + * instance with the XML reader using the + * {@link XMLReader#setErrorHandler setErrorHandler} + * method. The parser will then report all errors and warnings + * through this interface.

+ * + *

WARNING: If an application does not + * register an ErrorHandler, XML parsing errors will go unreported + * and bizarre behaviour may result.

+ * + *

For XML processing errors, a SAX driver must use this interface + * instead of throwing an exception: it is up to the application + * to decide whether to throw an exception for different types of + * errors and warnings. Note, however, that there is no requirement that + * the parser continue to provide useful information after a call to + * {@link #fatalError fatalError} (in other words, a SAX driver class + * could catch an exception and report a fatalError).

+ * + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see Parser#setErrorHandler + * @see SAXParseException + */ +template > +class ErrorHandler +{ +public: + typedef SAXParseException SAXParseExceptionT; + + virtual ~ErrorHandler() { }; + + /** + * Receive notification of a warning. + * + *

SAX parsers will use this method to report conditions that + * are not errors or fatal errors as defined by the XML 1.0 + * recommendation. The default behaviour is to take no action.

+ * + *

The SAX parser must continue to provide normal parsing events + * after invoking this method: it should still be possible for the + * application to process the document through to the end.

+ * + *

Filters may use this method to report other, non-XML warnings + * as well.

+ * + * @param exception The warning information encapsulated in a + * SAX parse exception. + * @see SAXParseException + */ + virtual void warning(const SAXParseExceptionT& exception) = 0; + /** + * Receive notification of a recoverable error. + * + *

This corresponds to the definition of "error" in section 1.2 + * of the W3C XML 1.0 Recommendation. For example, a validating + * parser would use this callback to report the violation of a + * validity constraint. The default behaviour is to take no + * action.

+ * + *

The SAX parser must continue to provide normal parsing events + * after invoking this method: it should still be possible for the + * application to process the document through to the end. If the + * application cannot do so, then the parser should report a fatal + * error even if the XML 1.0 recommendation does not require it to + * do so.

+ * + *

Filters may use this method to report other, non-XML errors + * as well.

+ * + * @param exception The error information encapsulated in a + * SAX parse exception. + * @see SAXParseException + */ + virtual void error(const SAXParseExceptionT& exception) = 0; + /** + * Receive notification of a non-recoverable error. + * + *

This corresponds to the definition of "fatal error" in + * section 1.2 of the W3C XML 1.0 Recommendation. For example, a + * parser would use this callback to report the violation of a + * well-formedness constraint.

+ * + *

The application must assume that the document is unusable + * after the parser has invoked this method, and should continue + * (if at all) only for the sake of collecting additional error + * messages: in fact, SAX parsers are free to stop reporting any + * other events once this method has been invoked.

+ * + * @param exception The error information encapsulated in a + * SAX parse exception. + * @see SAXParseException + */ + virtual void fatalError(const SAXParseExceptionT& exception) = 0; +}; // class ErrorHandler + +} // namespace SAX +} // namespace Arabica + +#endif +//end of file diff --git a/arabica/include/SAX/HandlerBase.hpp b/arabica/include/SAX/HandlerBase.hpp new file mode 100644 index 000000000..ed1dc9e7c --- /dev/null +++ b/arabica/include/SAX/HandlerBase.hpp @@ -0,0 +1,291 @@ +#ifndef ARABICA_HANDLERBASE_H +#define ARABICA_HANDLERBASE_H +// SAX default handler base class. +// $Id$ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Default base class for handlers. + * + *

This class implements the default behaviour for four SAX1 + * interfaces: EntityResolver, DTDHandler, DocumentHandler, + * and ErrorHandler. It is now obsolete, but is included in SAX2 to + * support legacy SAX1 applications. SAX2 applications should use + * the {@link DefaultHandler DefaultHandler} + * class instead.

+ * + *

Application writers can extend this class when they need to + * implement only part of an interface; parser writers can + * instantiate this class to provide default handlers when the + * application has not supplied its own.

+ * + *

Note that the use of this class is optional.

+ * + * @deprecated This class works with the deprecated + * {@link DocumentHandler DocumentHandler} + * interface. It has been replaced by the SAX2 + * {@link DefaultHandler DefaultHandler} + * class. + * @since SAX 1.0 + * @author Jez Higgins, jez@jezuk.co.uk + * @version 1.0 + * @see EntityResolver + * @see DTDHandler + * @see DocumentHandler + * @see ErrorHandler + */ +template +class HandlerBase : public EntityResolver, + public DTDHandler, + public DocumentHandler, + public ErrorHandler +{ +public: + typedef string_name stringT; + typedef Locator LocatorT; + typedef AttributeList AttributeListT; + typedef InputSource InputSourceT; + typedef ErrorHandler::SAXParseExceptionT SAXParseExceptionT; + + virtual ~HandlerBase() { } + + ////////////////////////////////////////////////////////////////////// + // Default implementation of the EntityResolver interface. + ////////////////////////////////////////////////////////////////////// + /** + * Resolve an external entity. + * + *

Always return a default-constructed InputSourceT, so that + * the parser will use the system identifier provided in the XML document. + * This method implements the SAX default behaviour: application writers can + * override it in a subclass to do special translations such as catalog + * lookups or URI redirection.

+ * + * @param publicId The public identifier, or an empty string if none is + * available. + * @param systemId The system identifier provided in the XML + * document. + * @return A default-constructed InputSourceT, which requests the + * default behaviour. + * @see EntityResolver#resolveEntity + */ + virtual InputSourceT resolveEntity(const stringT& publicId, + const stringT& systemId) + { + return InputSourceT(); + } // resolveEntity + + ////////////////////////////////////////////////////////////////////// + // Default implementation of DTDHandler interface. + ////////////////////////////////////////////////////////////////////// + /** + * Receive notification of a notation declaration. + * + *

By default, do nothing. Application writers may override this + * method in a subclass if they wish to keep track of the notations + * declared in a document.

+ * + * @param name The notation name. + * @param publicId The notation public identifier, or an empty string if not + * available. + * @param systemId The notation system identifier. + * @see DTDHandler#notationDecl + */ + virtual void notationDecl(const stringT& name, + const stringT& publicId, + const stringT& systemId) { } + + /** + * Receive notification of an unparsed entity declaration. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to keep track of the unparsed entities + * declared in a document.

+ * + * @param name The entity name. + * @param publicId The entity public identifier, or an empty string if not + * available. + * @param systemId The entity system identifier. + * @param notationName The name of the associated notation. + * @see DTDHandler#unparsedEntityDecl + */ + virtual void unparsedEntityDecl(const stringT& name, + const stringT& publicId, + const stringT& systemId, + const stringT& notationName) { } + + ////////////////////////////////////////////////////////////////////// + // Default implementation of DocumentHandler interface. + ////////////////////////////////////////////////////////////////////// + /** + * Receive a Locator object for document events. + * + *

By default, do nothing. Application writers may override this + * method in a subclass if they wish to store the locator for use + * with other document events.

+ * + * @param locator A locator for all SAX document events. + * @see DocumentHandler#setDocumentLocator + * @see Locator + */ + virtual void setDocumentLocator(const LocatorT& locator) { } + + /** + * Receive notification of the beginning of the document. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the beginning + * of a document (such as allocating the root node of a tree or + * creating an output file).

+ * + * @see DocumentHandler#startDocument + */ + virtual void startDocument() { } + /** + * Receive notification of the end of the document. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the end + * of a document (such as finalising a tree or closing an output + * file).

+ * + * @see DocumentHandler#endDocument + */ + virtual void endDocument() { } + + /** + * Receive notification of the start of an element. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the start of + * each element (such as allocating a new tree node or writing + * output to a file).

+ * + * @param name The element type name. + * @param attributes The specified or defaulted attributes. + * @see DocumentHandler#startElement + */ + virtual void startElement(const stringT& name, + const AttributeListT& attributes) { } + /** + * Receive notification of the end of an element. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the end of + * each element (such as finalising a tree node or writing + * output to a file).

+ * + * @param name The element type name. + * @see DocumentHandler#endElement + */ + virtual void endElement(const stringT& name) { } + + /** + * Receive notification of character data inside an element. + * + *

By default, do nothing. Application writers may override this + * method to take specific actions for each chunk of character data + * (such as adding the data to a node or buffer, or printing it to + * a file).

+ * + * @param ch The characters. + * @see DocumentHandler#characters + */ + virtual void characters(const stringT& ch) { } + /** + * Receive notification of ignorable whitespace in element content. + * + *

By default, do nothing. Application writers may override this + * method to take specific actions for each chunk of ignorable + * whitespace (such as adding data to a node or buffer, or printing + * it to a file).

+ * + * @param ch The whitespace characters. + * @see DocumentHandler#ignorableWhitespace + */ + virtual void ignorableWhitespace(const stringT& ch) { } + + /** + * Receive notification of a processing instruction. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions for each + * processing instruction, such as setting status variables or + * invoking other methods.

+ * + * @param target The processing instruction target. + * @param data The processing instruction data, or an empty string if + * none is supplied. + * @see DocumentHandler#processingInstruction + */ + virtual void processingInstruction(const stringT& target, + const stringT& data) { } + + ////////////////////////////////////////////////////////////////////// + // Default implementation of the ErrorHandler interface. + ////////////////////////////////////////////////////////////////////// + /** + * Receive notification of a parser warning. + * + *

The default implementation does nothing. Application writers + * may override this method in a subclass to take specific actions + * for each warning, such as inserting the message in a log file or + * printing it to the console.

+ * + * @param e The warning information encoded as an exception. + * @see ErrorHandler#warning + * @see SAXParseException + */ + virtual void warning(const SAXParseExceptionT& e) { } + /** + * Receive notification of a recoverable parser error. + * + *

The default implementation does nothing. Application writers + * may override this method in a subclass to take specific actions + * for each error, such as inserting the message in a log file or + * printing it to the console.

+ * + * @param e The error information encoded as an exception. + * @see ErrorHandler#error + * @see SAXParseException + */ + virtual void error(const SAXParseExceptionT& e) { } + /** + * Report a fatal XML parsing error. + * + *

The default implementation throws a SAXParseException. + * Application writers may override this method in a subclass if + * they need to take specific actions for each fatal error (such as + * collecting all of the errors into a single report): in any case, + * the application must stop all regular processing when this + * method is invoked, since the document is no longer reliable, and + * the parser may no longer report parsing events.

+ * + * @param e The error information encoded as an exception. + * @see ErrorHandler#fatalError + * @see SAXParseException + */ + virtual void fatalError(const SAXParseExceptionT& e) { throw e; } +}; // class HandlerBase + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/IStreamHandle.hpp b/arabica/include/SAX/IStreamHandle.hpp new file mode 100644 index 000000000..11cdbf8d1 --- /dev/null +++ b/arabica/include/SAX/IStreamHandle.hpp @@ -0,0 +1,452 @@ +/* $Id$ + * + * + */ + +#ifndef ARABICA_ISTREAMHANDLE_H +#define ARABICA_ISTREAMHANDLE_H + +#include +#include +#include +#include + +// If we've been asked to debug, enforce post-conditions on all public API +// methods. All post-conditions which do not relate to direct in-method +// assignment are tested. (Those which relate to a direct assignment of a POD +// within the same method are not tested.) +#ifdef ARABICA_DEBUG +#define ISTREAMHANDLE_POSTCONDITION(x) assert(x) +#else +#define ISTREAMHANDLE_POSTCONDITION(x) do {} while (false) +#endif + +namespace Arabica { +namespace SAX { + +/** Reference-counting pointer to std::istream. + * + * \par Summary: + * This works much like any other reference-counted pointer, + * except that it only optionally owns the pointee. That + * is, it will not necessarily delete the pointee when the + * reference count goes to zero. + * + * \par + * Ownership of a std::istream is passed to an IStreamHandle via a + * std::auto_ptr. Ownership of a + * std::istream is passed between IStreamHandle by assignment. + * + * \see IStreamHandle(std::auto_ptr) for passing + * ownership at construction. + * \see IStreamHandle::operator=(std::auto_ptr) for + * passing ownership after construction. + * \see IStreamHandle(std::istream&) for constructing without + * passing ownership + * \see IStreamHandle::operator=(std::istream&) for assigning + * without passing ownership. 
+ * + * \par Justification: + * + * \par + * This class is needed to fix an ownership problem between the + * InputSource and EntityResolver classes. InputSource is a SAX + * class designed to wrap stream objects. Normally, an + * InputSource is constructed with reference to a std::istream + * object. There are two separate use-cases for InputSource which + * lead to conflicting models of std::istream ownership. + * + * \par + * -# When an InputSource is constructed to pass to a parser + * parse() method, the InputSource is owned by + * the client code. When parse() returns, the + * InputSource is no longer needed and its stream can be + * destroyed by the client code. (This will most likely + * happen when the variable goes out of scope.) + * -# When an EntityResolver responds to a + * resolveEntity() request, it must return an + * InputSource. In this case the InputSource, and more + * importantly, the std::istream, is needed by the parser for + * some time after resolveEntity() has returned. + * + * \par + * In case 1, the std::istream in question may be a local variable, + * and so the InputSource simply cannot own it. + * + * \par + * In case 2, the std::istream in question cannot be a local + * variable because it must be returned from a method. Since it + * must be destroyed at some time known only to the parser, there + * is no alternative but for the InputSource to own it. + * + * \par Solution: + * + * \par + * To solve this issue, I have created the IStreamHandle and + * modified InputSource. IStreamHandle can behave either as a + * std::istream* or as a reference-counting std::istream*. + * The difference is in ownership: + * + * \par + * If an IStreamHandle is constructed with or assigned a + * std::auto_ptr, it takes ownership of + * the pointee. If an IStreamHandle is constructed with or + * assigned a std::istream&, it does not take + * ownership of the pointee.
+ * + * \par + * Upon destruction or re-assignment, an IStreamHandle checks + * whether it is the last reference to its pointee, and + * whether it owns its pointee. If both conditions + * are met, the IStreamHandle deletes its pointee. + * + * @author Philip Walford + * \par Created: + * 21/05/2003 + * + * + * + * + * + * \internal + * \par Pre- and Post-Conditions + * All public API methods of IStreamHandle conform to the following + * post-condition: + * - counter_ points to a valid heap-allocated integer greater + * than 0. + * + * \par + * All public API methods of IStreamHandle that take an + * auto_ptr parameter conform to the following post-conditions. + * For the purposes of these post-conditions, rhs will represent + * the std::auto_ptr parameter. + * - is_ contains the value returned by rhs.release(). + * - rhs.get() == 0 (ownership has been transferred). + * - owner_ = true + * + * \par + * All public API methods of IStreamHandle that take a std::istream& + * parameter conform to the following post-conditions. For the purposes of + * these post-conditions, rhs will represent the std::istream& + * parameter. + * - is_ = &rhs + * - owner_ = false + * + * \par + * The default constructor and release() methods of IStreamHandle conform to + * the following post-conditions: + * - is_ = 0 + * - owner_ = false + * + * \par + * Further post-conditions are documented under the appropriate methods. + */ +class IStreamHandle +{ +public: + /** Create an IStreamHandle which does not point to any std::istream. + * + * \internal + * \post is_ == 0 + * \post owner_ == false + */ + IStreamHandle(); + + /** Create an IStreamHandle which does not own its std::istream. + * + * \par Ownership: + * The IStreamHandle constructed in this manner does not own the + * std::istream to which it points. + * + */ + IStreamHandle(std::istream& is); + + /** Create an IStreamHandle taking ownership of a std::istream.
+ * + * \par Ownership: + * The IStreamHandle constructed in this manner owns the std::istream to + * which it points. This ownership may then be shared with + * other IStreamHandle objects. + * + */ + explicit IStreamHandle(std::auto_ptr is); + explicit IStreamHandle(std::auto_ptr is); + + /** Construct an IStreamHandle sharing a std::istream with rhs. + * + * \par Ownership + * The IStreamHandle constructed in this manner will share ownership of + * the std::istream to which it points if-and-only-if rhs + * shares ownership of the std::istream. + * + * \internal + * \post is_ == rhs.is_ + * \post counter_ == rhs.counter_ + * \post *counter_ == (old value of *(rhs.counter_)) + 1 + * \post owner_ == rhs.owner_ + */ + IStreamHandle(const IStreamHandle& rhs); + + /** Destroy an IStreamHandle, destroying the internal std::istream if + * this owns the istream and is the last reference. + * + * \internal + * \post is_ == 0 + * \post counter_ == 0 + * \post owner_ == false + */ + ~IStreamHandle(); + + /** Assign another IStreamHandle to this. This destroys the + * std::istream held by this if this owns + * the istream and is the last reference. + * + * \par Ownership: + * After this operation, the IStreamHandle this will share + * ownership of the std::istream to which it points if-and-only-if + * rhs shares ownership of the std::istream. + * + * \internal + * \post is_ == rhs.is_ + * \post counter_ == rhs.counter_ + * \post *counter_ == (old value of *(rhs.counter_)) + 1 + * \post owner_ == rhs.owner_ + */ + IStreamHandle& operator=(const IStreamHandle& rhs); + + /** Assign a new std::istream to this. This destroys the + * std::istream held by this if this owns + * the istream and is the last reference. Ownership of the istream is + * transferred to this. + * + * \par Ownership: + * After this operation, the IStreamHandle this owns the + * std::istream to which rhs pointed. This ownership may then be + * shared with other IStreamHandle objects.
+ * + */ + IStreamHandle& operator=(std::auto_ptr rhs); + IStreamHandle& operator=(std::auto_ptr rhs); + + /** Assign a new std::istream to this. This destroys the + * std::istream held by this if this owns + * the istream and is the last reference. + * + * \par Ownership: + * After this operation, the IStreamHandle this does not own + * the std::istream to which rhs points. + * + */ + IStreamHandle& operator=(std::istream& rhs); + + /** Return the std::istream contained by this. */ + std::istream* get() const; + + /** Release the std::istream held by this. This will destroy + * the internal std::istream if this owns the istream + * and is the last reference. + * + * \internal + * \post is_ == 0 + * \post owner_ == false + */ + void release(); + +private: + std::istream* is_; + int* counter_; + bool owner_; + + // Set the internal pointer and ownership, and start a fresh reference count. + // \post is_ == is + // \post owner_ == own + void set(std::istream* is, bool own); + + // Increment the counter. + // \pre counter_ != 0 + // \post *counter_ = (old value of *counter_) + 1 + void addRef() const; + + // Decrement the counter. When zero references remain, free the counter and, + // if we own it, the std::istream. Clears internal pointers either way.
+ // \post is_ == 0 + // \post counter_ == 0 + // \post owner_ = false + void removeRef(); +}; + +inline +IStreamHandle::IStreamHandle() +: is_(0), + counter_(new int(0)), + owner_(false) +{ + addRef(); + ISTREAMHANDLE_POSTCONDITION(*counter_ == 1); +} + +inline +IStreamHandle::IStreamHandle(std::istream& is) +: is_(&is), + counter_(new int(0)), + owner_(false) +{ + addRef(); + ISTREAMHANDLE_POSTCONDITION(*counter_ == 1); +} + +inline +IStreamHandle::IStreamHandle(std::auto_ptr is) +: is_(is.release()), + counter_(new int(0)), + owner_(true) +{ + addRef(); + ISTREAMHANDLE_POSTCONDITION(*counter_ == 1); + ISTREAMHANDLE_POSTCONDITION(is.get() == 0); +} + +inline +IStreamHandle::IStreamHandle(std::auto_ptr is) +: is_(is.release()), + counter_(new int(0)), + owner_(true) +{ + addRef(); + ISTREAMHANDLE_POSTCONDITION(*counter_ == 1); + ISTREAMHANDLE_POSTCONDITION(is.get() == 0); +} + +inline +IStreamHandle::IStreamHandle(const IStreamHandle& rhs) +: is_(rhs.is_), + counter_(rhs.counter_), + owner_(rhs.owner_) +{ + addRef(); + ISTREAMHANDLE_POSTCONDITION(*counter_ >= 2); +} + +inline +IStreamHandle::~IStreamHandle() +{ + removeRef(); + ISTREAMHANDLE_POSTCONDITION(is_ == 0); + ISTREAMHANDLE_POSTCONDITION(counter_ == 0); + ISTREAMHANDLE_POSTCONDITION(owner_ == false); +} + +inline +IStreamHandle& IStreamHandle::operator=(const IStreamHandle& rhs) +{ + // Add to rhs first to avoid self-assignment bug.
+ rhs.addRef(); + removeRef(); + is_ = rhs.is_; + counter_ = rhs.counter_; + owner_ = rhs.owner_; + + ISTREAMHANDLE_POSTCONDITION(*counter_ >= 1); + + return *this; +} + +inline +IStreamHandle& IStreamHandle::operator=(std::auto_ptr rhs) +{ + removeRef(); + set(rhs.release(), true); + + ISTREAMHANDLE_POSTCONDITION(*counter_ == 1); + ISTREAMHANDLE_POSTCONDITION(owner_ == true); + ISTREAMHANDLE_POSTCONDITION(rhs.get() == 0); + + return *this; +} + +inline +IStreamHandle& IStreamHandle::operator=(std::auto_ptr rhs) +{ + removeRef(); + set(rhs.release(), true); + + ISTREAMHANDLE_POSTCONDITION(*counter_ == 1); + ISTREAMHANDLE_POSTCONDITION(owner_ == true); + ISTREAMHANDLE_POSTCONDITION(rhs.get() == 0); + + return *this; +} + +inline +IStreamHandle& IStreamHandle::operator=(std::istream& rhs) +{ + removeRef(); + set(&rhs, false); + + ISTREAMHANDLE_POSTCONDITION(*counter_ == 1); + ISTREAMHANDLE_POSTCONDITION(owner_ == false); + + return *this; +} + +inline +std::istream* IStreamHandle::get() const +{ + return is_; +} + +inline +void IStreamHandle::set(std::istream* is, bool own) +{ + is_ = is; + owner_ = own; + counter_ = new int(0); + addRef(); +} + +inline +void IStreamHandle::release() +{ + removeRef(); + // We're pretending to have a counted reference to 0, this ensures internal + // consistency. + set(0, false); +} + +// Increment the counter. +inline +void IStreamHandle::addRef() const +{ + ++(*counter_); + ISTREAMHANDLE_POSTCONDITION(*counter_ >= 1); +} + +// Decrement the counter. When zero references remain, free the counter and, if +// we own it, the std::istream. Clears internal pointers either way, leaving +// counter_ null; set() allocates a new counter afterwards where one is needed. +inline +void IStreamHandle::removeRef() +{ + // Defensive check: every constructor and set() allocate counter_, so it is + // null only after an earlier removeRef() on this object.
+ if (counter_) + { + if (--(*counter_) == 0) + { + if (owner_) + { + delete is_; + } + delete counter_; + } + } + is_ = 0; + owner_ = false; + counter_ = 0; +} + +} // namespace SAX +} // namespace Arabica + +#endif /* ARABICA_ISTREAMHANDLE_H */ + diff --git a/arabica/include/SAX/InputSource.hpp b/arabica/include/SAX/InputSource.hpp new file mode 100644 index 000000000..1933cd7ec --- /dev/null +++ b/arabica/include/SAX/InputSource.hpp @@ -0,0 +1,337 @@ +#ifndef ARABICA_INPUTSOURCE_H +#define ARABICA_INPUTSOURCE_H + +// InputSource.h +// $Id$ + +#include +#include + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * A single input source for an XML entity. + * + *

This class allows a SAX application to encapsulate information + * about an input source in a single object, which may include + * a public identifier, a system identifier, a byte stream (possibly + * with a specified encoding).

+ * + *

There are two places that the application will deliver this + * input source to the parser: as the argument to the Parser.parse + * method, or as the return value of the EntityResolver.resolveEntity + * method.

+ * + *

The SAX parser will use the InputSource object to determine + * how to read XML input. If there is a byte stream available, + * the parser will read that stream directly; if a byte stream is + * not available, the parser will attempt to open a URI connection + * to the resource identified by the system identifier.

+ * + *

An InputSource object belongs to the application: the SAX parser + * shall never modify it in any way (it may modify a copy if + * necessary).

+ * + * InputSource is still under active consideration, and its + * interface may change. + * + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see Parser#parse + * @see EntityResolver#resolveEntity + */ +template > +class InputSource +{ +public: + /** + * Zero-argument default constructor. + * + * @see #setPublicId + * @see #setSystemId + * @see #setByteStream + * @see #setEncoding + */ + InputSource() : + byteStream_(), + publicId_(), + systemId_(), + encoding_() + { } + /** + * Create a new input source with a system identifier. + * + *

Applications may use setPublicId to include a + * public identifier as well, or setEncoding to specify + * the character encoding, if known.

+ * + *

If the system identifier is a URL, it must be fully resolved.

+ * + * @param systemId The system identifier (URI). + * @see #setPublicId + * @see #setSystemId + * @see #setByteStream + * @see #setEncoding + */ + InputSource(const string_type& systemId) : + byteStream_(), + publicId_(), + systemId_(systemId), + encoding_() + { } + InputSource(const InputSource& rhs) : + byteStream_(rhs.byteStream_), + publicId_(rhs.publicId_), + systemId_(rhs.systemId_), + encoding_(rhs.encoding_) + { } + /** + * Create a new input source with a byte stream. + * + *

Application writers may use setSystemId to provide a base + * for resolving relative URIs, setPublicId to include a + * public identifier, and/or setEncoding to specify the object's + * character encoding.

+ * + * @param byteStream The raw byte stream containing the document. The + * InputSource does not assume ownership of + * this byteStream. + * @see #InputSource(std::auto_ptr) + * @see #setPublicId + * @see #setSystemId + * @see #setEncoding + * @see #setByteStream + */ + InputSource(std::istream& byteStream) : + byteStream_(byteStream), + publicId_(), + systemId_(), + encoding_() + { } + + + /** + * Create a new input source with a byte stream. + * + *

Application writers may use setSystemId to provide a base + * for resolving relative URIs, setPublicId to include a + * public identifier, and/or setEncoding to specify the object's + * character encoding.

+ * + * @param byteStream The raw byte stream containing the document. The + * InputSource assumes ownership of the byteStream + * and will delete it when no-longer required. + * @see InputSource(std::istream&) + * @see #setPublicId + * @see #setSystemId + * @see #setEncoding + * @see #setByteStream + */ + InputSource(std::auto_ptr byteStream) : + byteStream_(byteStream), + publicId_(), + systemId_(), + encoding_() + { } + + InputSource(std::auto_ptr byteStream) : + byteStream_(byteStream), + publicId_(), + systemId_(), + encoding_() + { } + + + virtual ~InputSource() { } + + InputSource& operator=(const InputSource& rhs) + { + byteStream_ = rhs.byteStream_; + publicId_ = rhs.publicId_; + systemId_ = rhs.systemId_; + encoding_ = rhs.encoding_; + + return *this; + } // operator= + + /** + * Set the public identifier for this input source. + * + *

The public identifier is always optional: if the application + * writer includes one, it will be provided as part of the + * location information.

+ * + * @param publicId The public identifier as a string. + * @see #getPublicId + * @see Locator#getPublicId + * @see SAXParseException#getPublicId + */ + void setPublicId(const string_type& publicId) { publicId_ = publicId; } + /** + * Get the public identifier for this input source. + * + * @return The public identifier, or an empty string if none was supplied. + * @see #setPublicId + */ + const string_type& getPublicId() const { return publicId_; } + + /** + * Set the system identifier for this input source. + * + *

The system identifier is optional if there is a byte + * stream but it is still useful to provide one, since the + * application can use it to resolve relative URIs and can + * include it in error messages and warnings (the parser will + * attempt to open a connection to the URI only if there is no + * byte stream specified).

+ * + *

If the application knows the character encoding of the + * object pointed to by the system identifier, it can register + * the encoding using the setEncoding method.

+ * + *

If the system ID is a URL, it must be fully resolved.

+ * + * @param systemId The system identifier as a string. + * @see #setEncoding + * @see #getSystemId + * @see Locator#getSystemId + * @see SAXParseException#getSystemId + */ + void setSystemId(const string_type& systemId) { systemId_ = systemId; } + /** + * Get the system identifier for this input source. + * + *

The getEncoding method will return the character encoding + * of the object pointed to, or an empty string if unknown.

+ * + *

If the system ID is a URL, it will be fully resolved.

+ * + * @return The system identifier. + * @see #setSystemId + * @see #getEncoding + */ + const string_type& getSystemId() const { return systemId_; } + + /** + * Set the byte stream for this input source. + * + *

The SAX parser will use a byte stream in preference + * to opening a URI connection itself.

+ * + *

If the application knows the character encoding of the + * byte stream, it should set it with the setEncoding method.

+ * + * @param byteStream A byte stream containing an XML document or + * other entity. The InputSource does not assume + * ownership of byteStream. + * @see #setByteStream(std::auto_ptr) To transfer ownership of + * an std::istream to an InputSource + * @see #setEncoding + * @see #getByteStream + * @see #getEncoding + */ + void setByteStream(std::istream& byteStream) + { + byteStream_ = byteStream; + } // setByteStream + + /** + * Set the byte stream for this input source. + * + *

The SAX parser will use a byte stream in preference to + * opening a URI connection itself.

+ * + *

If the application knows the character encoding of the + * byte stream, it should set it with the setEncoding method.

+ * + * @param byteStream A byte stream containing an XML document or + * other entity. The InputSource assumes + * ownership of byteStream. + * @see #setByteStream(std::istream&) + * @see #setEncoding + * @see #getByteStream + * @see #getEncoding + */ + void setByteStream(std::auto_ptr byteStream) + { + byteStream_ = byteStream; + } // setByteStream + + /** + * Get the byte stream for this input source as a std::istream*. + * + *

The getEncoding method will return the character + * encoding for this byte stream, or an empty string if unknown.

+ * + * @return The byte stream, or null if none was supplied. No ownership is + * transferred. + * @see #getEncoding + * @see #setByteStream + */ + std::istream* getByteStream() const + { + return byteStream_.get(); + } // getByteStream + + /** + * Get the byte stream for this input source as an IStreamHandle. + * + *

The getEncoding method will return the character + * encoding for this byte stream, or an empty string if unknown.

+ * + * @return The byte stream, or null if none was supplied. Ownership is + * shared between this and the client code. + * @see #getEncoding + * @see #setByteStream + */ + IStreamHandle getByteStreamHandle() const + { + return byteStream_; + } + /** + * Set the character encoding, if known. + * + *

The encoding must be a string acceptable for an + * XML encoding declaration (see section 4.3.3 of the XML 1.0 + * recommendation).

+ * + * @param encoding A string describing the character encoding. + * @see #setSystemId + * @see #setByteStream + * @see #getEncoding + */ + void setEncoding(const string_type& encoding) { encoding_ = encoding; } + /** + * Get the character encoding for a byte stream or URI. + * + * @return The encoding, or an empty string if none was supplied. + * @see #setByteStream + * @see #getSystemId + * @see #getByteStream + */ + const string_type& getEncoding() const { return encoding_; } + + /////////////////////////////////////////////////////////// +private: + IStreamHandle byteStream_; + string_type publicId_; + string_type systemId_; + string_type encoding_; + + bool operator==(const InputSource&); // no implementation +}; // class InputSource + +} // namespace SAX +} // namespace Arabica + +#endif // #define InputSourceH +// end of file diff --git a/arabica/include/SAX/Locator.hpp b/arabica/include/SAX/Locator.hpp new file mode 100644 index 000000000..b2fe4e484 --- /dev/null +++ b/arabica/include/SAX/Locator.hpp @@ -0,0 +1,131 @@ +#ifndef ARABICA_LOCATOR_H +#define ARABICA_LOCATOR_H + +// Locator.h + +#include + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Interface for associating a SAX event with a document location. + * + *

If a SAX parser provides location information to the SAX + * application, it does so by implementing this interface and then + * passing an instance to the application using the content + * handler's {@link ContentHandler#setDocumentLocator + * setDocumentLocator} method. The application can use the + * object to obtain the location of any other content handler event + * in the XML source document.

+ * + *

Note that the results returned by the object will be valid only + * during the scope of each content handler method: the application + * will receive unpredictable results if it attempts to use the + * locator at any other time.

+ * + *

SAX parsers are not required to supply a locator, but they are + * very strongly encouraged to do so. If the parser supplies a + * locator, it must do so before reporting any other document events. + * If no locator has been set by the time the application receives + * the {@link ContentHandler#startDocument startDocument} + * event, the application should assume that a locator is not + * available.

+ * + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see ContentHandler#setDocumentLocator + */ +template > +class Locator +{ +public: + typedef string_type stringT; + + virtual ~Locator() { } + + /** + * Return the public identifier for the current document event. + * + *

The return value is the public identifier of the document + * entity or of the external parsed entity in which the markup + * triggering the event appears.

+ * + * @return A string containing the public identifier, or + * an empty string if none is available. + * @see #getSystemId + */ + virtual stringT getPublicId() const = 0; + /** + * Return the system identifier for the current document event. + * + *

The return value is the system identifier of the document + * entity or of the external parsed entity in which the markup + * triggering the event appears.

+ * + *

If the system identifier is a URL, the parser must resolve it + * fully before passing it to the application.

+ * + * @return A string containing the system identifier, or + * an empty string if none is available. + * @see #getPublicId + */ + virtual stringT getSystemId() const = 0; + /** + * Return the line number where the current document event ends. + * + *

Warning: The return value from the method + * is intended only as an approximation for the sake of error + * reporting; it is not intended to provide sufficient information + * to edit the character content of the original XML document.

+ * + *

The return value is an approximation of the line number + * in the document entity or external parsed entity where the + * markup triggering the event appears.

+ * + *

If possible, the SAX driver should provide the line position + * of the first character after the text associated with the document + * event. The first line in the document is line 1.

+ * + * @return The line number, or -1 if none is available. + * @see #getColumnNumber + */ + virtual size_t getLineNumber() const = 0; + /** + * Return the column number where the current document event ends. + * + *

Warning: The return value from the method + * is intended only as an approximation for the sake of error + * reporting; it is not intended to provide sufficient information + * to edit the character content of the original XML document.

+ * + *

The return value is an approximation of the column number + * in the document entity or external parsed entity where the + * markup triggering the event appears.

+ * + *

If possible, the SAX driver should provide the line position + * of the first character after the text associated with the document + * event.

+ * + *

If possible, the SAX driver should provide the line position + * of the first character after the text associated with the document + * event. The first column in each line is column 1.

+ * + * @return The column number, or -1 if none is available. + * @see #getLineNumber + */ + virtual size_t getColumnNumber() const = 0; +}; // class Locator + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/Parser.hpp b/arabica/include/SAX/Parser.hpp new file mode 100644 index 000000000..d85d47aed --- /dev/null +++ b/arabica/include/SAX/Parser.hpp @@ -0,0 +1,166 @@ +#ifndef ARABICA_PARSER_H +#define ARABICA_PARSER_H + +// Parser.h +// $Id$ + +#pragma message("Parser is a SAX1 class. Consider using XMLReader instead.") + +#include + +#include + +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Basic interface for SAX (Simple API for XML) parsers. + * + *

This was the main event supplier interface for SAX1; it has + * been replaced in SAX2 by {@link XMLReader XMLReader}, + * which includes Namespace support and sophisticated configurability + * and extensibility.

+ * + *

All SAX1 parsers must implement this basic interface: it allows + * applications to register handlers for different types of events + * and to initiate a parse from a URI, or a character stream.

+ * + *

All SAX1 parsers must also implement a zero-argument constructor + * (though other constructors are also allowed).

+ * + *

SAX1 parsers are reusable but not re-entrant: the application + * may reuse a parser object (possibly with a different input source) + * once the first parse has completed successfully, but it may not + * invoke the parse() methods recursively within a parse.

+ * + * @deprecated This interface has been replaced by the SAX2 + * {@link XMLReader XMLReader} + * interface, which includes Namespace support. + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see EntityResolver + * @see DTDHandler + * @see DocumentHandler + * @see ErrorHandler + * @see HandlerBase + * @see InputSource + */ +template +class Parser +{ +public: + typedef string_name stringT; + typedef EntityResolver EntityResolverT; + typedef DTDHandler DTDHandlerT; + typedef DocumentHandler DocumentHandlerT; + typedef InputSource InputSourceT; + + virtual ~Parser() { } + + // virtual void setLocale(Locale locale) throws SAXException = 0; + /** + * Allow an application to register a custom entity resolver. + * + *

If the application does not register an entity resolver, the + * SAX parser will resolve system identifiers and open connections + * to entities itself (this is the default behaviour implemented in + * HandlerBase).

+ * + *

Applications may register a new or different entity resolver + * in the middle of a parse, and the SAX parser must begin using + * the new resolver immediately.

+ * + * @param resolver The object for resolving entities. + * @see EntityResolver + * @see HandlerBase + */ + virtual void setEntityResolver(EntityResolverT& resolver) = 0; + /** + * Allow an application to register a DTD event handler. + * + *

If the application does not register a DTD handler, all DTD + * events reported by the SAX parser will be silently + * ignored (this is the default behaviour implemented by + * HandlerBase).

+ * + *

Applications may register a new or different + * handler in the middle of a parse, and the SAX parser must + * begin using the new handler immediately.

+ * + * @param handler The DTD handler. + * @see DTDHandler + * @see HandlerBase + */ + virtual void setDTDHandler(DTDHandlerT& handler) = 0; + /** + * Allow an application to register a document event handler. + * + *

If the application does not register a document handler, all + * document events reported by the SAX parser will be silently + * ignored (this is the default behaviour implemented by + * HandlerBase).

+ * + *

Applications may register a new or different handler in the + * middle of a parse, and the SAX parser must begin using the new + * handler immediately.

+ * + * @param handler The document handler. + * @see DocumentHandler + * @see HandlerBase + */ + virtual void setDocumentHandler(DocumentHandlerT& handler) = 0; + /** + * Allow an application to register an error event handler. + * + *

If the application does not register an error event handler, + * all error events reported by the SAX parser will be silently + * ignored, except for fatalError, which will throw a SAXException + * (this is the default behaviour implemented by HandlerBase).

+ * + *

Applications may register a new or different handler in the + * middle of a parse, and the SAX parser must begin using the new + * handler immediately.

+ * + * @param handler The error handler. + * @see ErrorHandler + * @see SAXException + * @see HandlerBase + */ + virtual void setErrorHandler(ErrorHandler& handler) = 0; + + /** + * Parse an XML document. + * + *

The application can use this method to instruct the SAX parser + * to begin parsing an XML document from any valid input + * source (a character stream, a byte stream, or a URI).

+ * + *

Applications may not invoke this method while a parse is in + * progress (they should create a new Parser instead for each + * additional XML document). Once a parse is complete, an + * application may reuse the same Parser object, possibly with a + * different input source.

+ * + * @param source The input source for the top-level of the + * XML document. + * @see InputSource + * @see #setEntityResolver + * @see #setDTDHandler + * @see #setDocumentHandler + * @see #setErrorHandler + */ + virtual void parse(InputSourceT& source) = 0; +}; // class Parser + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file + diff --git a/arabica/include/SAX/ParserConfig.hpp b/arabica/include/SAX/ParserConfig.hpp new file mode 100644 index 000000000..ff44c5a6d --- /dev/null +++ b/arabica/include/SAX/ParserConfig.hpp @@ -0,0 +1,91 @@ +#ifndef ARABICA_PARSERCONFIG_H +#define ARABICA_PARSERCONFIG_H + +#ifdef ARABICA_USE_LIBXML2 +#include +#undef DEF_SAX_P +#define DEF_SAX_P libxml2_wrapper +#ifdef _MSC_VER +#pragma message("Including libxml2") +#pragma comment(lib, "libxml2.lib") +#endif +#endif + +#ifdef ARABICA_USE_MSXML +#ifndef _MSC_VER +#error "Can only use MSXML on Windows" +#endif +#pragma message("Including MSXML") +#include +#undef DEF_SAX_P +#define DEF_SAX_P msxml2_wrapper +#endif + +#ifdef ARABICA_USE_XERCES +#include +#undef DEF_SAX_P +#define DEF_SAX_P xerces_wrapper +#ifndef ARABICA_NOT_USE_PRAGMA_LINKER_OPTIONS +#ifdef _MSC_VER +#pragma message("Including Xerces v3") +#ifdef _DEBUG +#pragma comment(lib, "xerces-c_3D.lib") +#else +#pragma comment(lib, "xerces-c_3.lib") +#endif +#endif +#endif +#endif + +#ifdef ARABICA_USE_GARDEN +#ifdef _MSC_VER +#pragma message("Including Garden") +#endif +#include +#undef DEF_SAX_P +#define DEF_SAX_P Garden +#endif + +#ifdef ARABICA_USE_EXPAT +#include +#undef DEF_SAX_P +#define DEF_SAX_P expat_wrapper +#ifdef _MSC_VER +#ifndef ARABICA_NOT_USE_PRAGMA_LINKER_OPTIONS +#pragma message("Including Expat") +#ifndef XML_STATIC +#pragma comment(lib, "libexpat.lib") +#else +#pragma comment(lib, "libexpatMT.lib") +#endif +#endif +#endif +#endif + +#ifdef _MSC_VER +#ifndef ARABICA_NOT_USE_PRAGMA_LINKER_OPTIONS +#pragma comment(lib, "wsock32.lib") +#endif +#endif + + +#ifndef NO_DEFAULT_PARSER +#ifdef 
DEF_SAX_P +namespace Arabica +{ +namespace SAX +{ +template + class XMLReader : public DEF_SAX_P { }; +} // namespace SAX +} // namespace Arabica +#else +#error "No default parser defined." +#endif +#endif + +#undef DEF_P + +#endif + + diff --git a/arabica/include/SAX/SAXException.hpp b/arabica/include/SAX/SAXException.hpp new file mode 100644 index 000000000..3712d476e --- /dev/null +++ b/arabica/include/SAX/SAXException.hpp @@ -0,0 +1,68 @@ +#ifndef ARABICA_SAXEXCEPTION_H +#define ARABICA_SAXEXCEPTION_H + +// SAXException.h +// $Id$ + +#include + +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Encapsulate a general SAX error or warning. + * + *

This class can contain basic error or warning information from + * either the XML parser or the application: a parser writer or + * application writer can subclass it to provide additional + * functionality. SAX handlers may throw this exception or + * any exception subclassed from it.

+ * + *

If the parser or application needs to include information about a + * specific location in an XML document, it should use the + * {@link SAXParseException SAXParseException} subclass.

+ * + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see SAXParseException + */ +class SAXException : public std::runtime_error +{ +public: + SAXException() : std::runtime_error("Unspecified SAX Exception") + { + } // SAXException + + SAXException(const std::string& message) : + std::runtime_error(message) + { + } // SAXException + + SAXException(const SAXException& rhs) : + std::runtime_error(rhs.what()) + { + } // SAXException + + virtual ~SAXException() throw() { } + + SAXException& operator=(const SAXException& rhs) + { + std::runtime_error* re = static_cast(this); + *re = rhs; + return *this; + } // operator= + +private: + bool operator==(const SAXException&); +}; // class SAXException + +} // namespace SAX +} // namespace Arabica + +#endif // SAXExceptionH diff --git a/arabica/include/SAX/SAXNotRecognizedException.hpp b/arabica/include/SAX/SAXNotRecognizedException.hpp new file mode 100644 index 000000000..77e4a0f9a --- /dev/null +++ b/arabica/include/SAX/SAXNotRecognizedException.hpp @@ -0,0 +1,54 @@ +#ifndef ARABICA_SAXNOTRECOGNIZEDEXCEPTION_H +#define ARABICA_SAXNOTRECOGNIZEDEXCEPTION_H + +// SAXNotRecognizedException.h +// $Id$ + + +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Exception class for an unrecognized identifier. + * + *

An XMLReader will throw this exception when it finds an + * unrecognized feature or property identifier; SAX applications and + * extensions may use this class for other, similar purposes.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see SAXNotSupportedException + */ +class SAXNotRecognizedException : public SAXException +{ +public: + SAXNotRecognizedException(const std::string& message) : + SAXException(message) + { + } // SAXNotRecognizedException + + SAXNotRecognizedException(const SAXNotRecognizedException& rhs) : + SAXException(rhs) + { + } // SAXNotRecognizedException + + SAXNotRecognizedException& operator=(const SAXNotRecognizedException& rhs) + { + SAXException::operator =(rhs); + return *this; + } // operator= + + virtual ~SAXNotRecognizedException() throw() { } +}; // class SAXNotRecognizedException + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/SAXNotSupportedException.hpp b/arabica/include/SAX/SAXNotSupportedException.hpp new file mode 100644 index 000000000..8d0cf5bbe --- /dev/null +++ b/arabica/include/SAX/SAXNotSupportedException.hpp @@ -0,0 +1,56 @@ +#ifndef ARARBICA_SAXNOTSUPPORTEDEXCEPTION_H +#define ARARBICA_SAXNOTSUPPORTEDEXCEPTION_H + +// SAXNotSupportedException.h +// $Id$ + + +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Exception class for an unsupported operation. + * + *

An XMLReader will throw this exception when it recognizes a + * feature or property identifier, but cannot perform the requested + * operation (setting a state or value). Other SAX2 applications and + * extensions may use this class for similar purposes.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see SAXNotRecognizedException + */ +class SAXNotSupportedException : public SAXException +{ +public: + SAXNotSupportedException(const std::string& message) : + SAXException(message) + { + } // SAXNotSupportedException + + SAXNotSupportedException(const SAXNotSupportedException& rhs) : + SAXException(rhs) + { + } // SAXNotSupportedException + + SAXNotSupportedException& operator=(const SAXNotSupportedException& rhs) + { + SAXException::operator =(rhs); + return *this; + } // operator= + + virtual ~SAXNotSupportedException() throw() { } + +}; // class SAXNotSupportedException + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/SAXParseException.hpp b/arabica/include/SAX/SAXParseException.hpp new file mode 100644 index 000000000..fd82e5248 --- /dev/null +++ b/arabica/include/SAX/SAXParseException.hpp @@ -0,0 +1,170 @@ +#ifndef ARABICA_SAXPARSEEXCEPTION_H +#define ARABICA_SAXPARSEEXCEPTION_H + +// SAXParseException.h +// $Id$ + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Encapsulate an XML parse error or warning. + * + *

This exception will include information for locating the error + * in the original XML document. Note that although the application + * will receive a SAXParseException as the argument to the handlers + * in the {@link ErrorHandler ErrorHandler} interface, + * the application is not actually required to throw the exception; + * instead, it can simply read the information in it and take a + * different action.

+ * + *

Since this exception is a subclass of {@link SAXException + * SAXException}, it inherits the ability to wrap another exception.

+ * + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see SAXException + * @see Locator + * @see ErrorHandler + */ +template > +class SAXParseException : public SAXException +{ +public: + typedef Locator LocatorT; + + SAXParseException(const std::string& message) : + SAXException(message), + publicId_(), + systemId_(), + lineNumber_(-1), + columnNumber_(-1) + { + setMsg(); + } // SAXParseException + + SAXParseException(const std::string& message, + const LocatorT& locator) : + SAXException(message), + publicId_(locator.getPublicId()), + systemId_(locator.getSystemId()), + lineNumber_(locator.getLineNumber()), + columnNumber_(locator.getColumnNumber()) + { + setMsg(); + } // SAXParseException + + SAXParseException(const std::string& message, + const string_type& publicId, + const string_type& systemId, + size_t lineNumber, + size_t columnNumber) : + SAXException(message), + publicId_(publicId), + systemId_(systemId), + lineNumber_(lineNumber), + columnNumber_(columnNumber) + { + setMsg(); + } // SAXParseException + + SAXParseException(const SAXParseException& rhs) : + SAXException(rhs), + msg_(rhs.msg_), + publicId_(rhs.publicId_), + systemId_(rhs.systemId_), + lineNumber_(rhs.lineNumber_), + columnNumber_(rhs.columnNumber_) + { + } // SAXParseException + + SAXParseException& operator=(const SAXParseException& rhs) + { + SAXException::operator=(rhs); + + msg_ = rhs.msg_; + publicId_ = rhs.publicId_; + systemId_ = rhs.systemId_; + lineNumber_ = rhs.lineNumber_; + columnNumber_ = rhs.columnNumber_; + + return *this; + } // operator= + + virtual ~SAXParseException() throw() { } + + /** + * Get the public identifier of the entity where the exception occurred. + * + * @return A string containing the public identifier, or an empty string + * if none is available. + * @see Locator#getPublicId + */ + const string_type& getPublicId() const { return publicId_; } + /** + * Get the system identifier of the entity where the exception occurred. 
+ * + *

If the system identifier is a URL, it will be resolved + * fully.

+ * + * @return A string containing the system identifier, or an empty string + * if none is available. + * @see Locator#getSystemId + */ + const string_type& getSystemId() const { return systemId_; } + /** + * The line number of the end of the text where the exception occurred. + * + * @return An integer representing the line number, or -1 + * if none is available. + * @see Locator#getLineNumber + */ + size_t getLineNumber() const { return lineNumber_; } + /** + * The column number of the end of the text where the exception occurred. + * + *

The first column in a line is position 1.

+ * + * @return An integer representing the column number, or -1 + * if none is available. + * @see Locator#getColumnNumber + */ + size_t getColumnNumber() const { return columnNumber_; } + + virtual const char* what() const throw() + { + return msg_.c_str(); + } // what + +private: + void setMsg() + { + std::ostringstream str; + str << "Parse exception at " << lineNumber_ << "," << columnNumber_ << std::endl; + str << SAXException::what(); + msg_ = str.str(); + } // setMsg + + std::string msg_; + + string_type publicId_; + string_type systemId_; + size_t lineNumber_; + size_t columnNumber_; + + SAXParseException(); +}; // class SAXParseException + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/XMLFilter.hpp b/arabica/include/SAX/XMLFilter.hpp new file mode 100644 index 000000000..ea5f1273a --- /dev/null +++ b/arabica/include/SAX/XMLFilter.hpp @@ -0,0 +1,71 @@ +#ifndef ARABICA_XML_FILTER_H +#define ARABICA_XML_FILTER_H + +// XMLFilter.h + +#include +#include + +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Interface for an XML filter. + * + *

An XML filter is like an XML reader, except that it obtains its + * events from another XML reader rather than a primary source like + * an XML document or database. Filters can modify a stream of + * events as they pass on to the final application.

+ * + *

The XMLFilterImpl helper class provides a convenient base + * for creating SAX2 filters, by passing on all {@link EntityResolver + * EntityResolver}, {@link DTDHandler DTDHandler}, + * {@link ContentHandler ContentHandler} and {@link ErrorHandler + * ErrorHandler} events automatically.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see XMLFilterImpl + */ +template +class XMLFilter : public XMLReaderInterface +{ +public: + typedef XMLReaderInterface XMLReaderT; + typedef typename XMLReaderT::string_adaptor string_adaptor; + + virtual ~XMLFilter() { } + + /** + * Set the parent reader. + * + *

This method allows the application to link the filter to + * a parent reader (which may be another filter).

+ * + * @param parent The parent reader. + */ + virtual void setParent(XMLReaderT& parent) = 0; + /** + * Get the parent reader. + * + *

This method allows the application to query the parent + * reader (which may be another filter). It is generally a + * bad idea to perform any operations on the parent reader + * directly: they should all pass through this filter.

+ * + * @return The parent filter, or 0 if none has been set. + */ + virtual XMLReaderT* getParent() const = 0; +}; // class XMLFilter + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/XMLReader.hpp b/arabica/include/SAX/XMLReader.hpp new file mode 100644 index 000000000..159d096d4 --- /dev/null +++ b/arabica/include/SAX/XMLReader.hpp @@ -0,0 +1,450 @@ +#ifndef ARABICA_XML_READER_H +#define ARABICA_XML_READER_H + +// XMLReader.h +// $Id$ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Interface for reading an XML document using callbacks. + * + *

XMLReader is the interface that an XML parser's SAX2 driver must + * implement. This interface allows an application to set and + * query features and properties in the parser, to register + * event handlers for document processing, and to initiate + * a document parse.

+ * + *

All SAX interfaces are assumed to be synchronous: the + * {@link #parse parse} methods must not return until parsing + * is complete, and readers must wait for an event-handler callback + * to return before reporting the next event.

+ * + *

This interface replaces the (now deprecated) SAX 1.0 {@link + * Parser Parser} interface. The XMLReader interface + * contains two important enhancements over the old Parser + * interface:

+ * + *
    + *
  1. it adds a standard way to query and set features and + * properties; and
  2. + *
  3. it adds Namespace support, which is required for many + * higher-level XML standards.
  4. + *
+ * + *

There are adapters available to convert a SAX1 Parser to + * a SAX2 XMLReader and vice-versa.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see XMLFilter + * @see helpers.ParserAdapter + * @see helpers.XMLReaderAdapter + */ +class XMLReaderInterface_tag { }; + +template +class XMLReaderInterface : public XMLReaderInterface_tag +{ +public: + typedef string_adaptor_type string_adaptor; + typedef EntityResolver EntityResolverT; + typedef DTDHandler DTDHandlerT; + typedef ContentHandler ContentHandlerT; + typedef InputSource InputSourceT; + typedef ErrorHandler ErrorHandlerT; + typedef DeclHandler DeclHandlerT; + typedef LexicalHandler LexicalHandlerT; + + virtual ~XMLReaderInterface() { } + + ///////////////////////////////////////////////// + // Configuration + /** + * Look up the value of a feature. + * + *

The feature name is any fully-qualified URI. It is + * possible for an XMLReader to recognize a feature name but + * to be unable to return its value; this is especially true + * in the case of an adapter for a SAX1 Parser, which has + * no way of knowing whether the underlying parser is + * performing validation or expanding external entities.

+ * + *

All XMLReaders are required to recognize the + * http://xml.org/sax/features/namespaces and the + * http://xml.org/sax/features/namespace-prefixes feature names.

+ * + *

Some feature values may be available only in specific + * contexts, such as before, during, or after a parse.

+ * + *

Typical usage is something like this:

+ * + *
+   * XMLReader r = new MySAXDriver();
+   *
+   *                         // try to activate validation
+   * try {
+   *   r.setFeature("http://xml.org/sax/features/validation", true);
+   * } catch (SAXException e) {
+   *   System.err.println("Cannot activate validation."); 
+   * }
+   *
+   *                         // register event handlers
+   * r.setContentHandler(new MyContentHandler());
+   * r.setErrorHandler(new MyErrorHandler());
+   *
+   *                         // parse the first document
+   * try {
+   *   r.parse("http://www.foo.com/mydoc.xml");
+   * } catch (IOException e) {
+   *   System.err.println("I/O exception reading XML document");
+   * } catch (SAXException e) {
+   *   System.err.println("XML exception reading document.");
+   * }
+   * 
+ * + *

Implementors are free (and encouraged) to invent their own features, + * using names built on their own URIs.

+ * + * @param name The feature name, which is a fully-qualified URI. + * @return The current state of the feature (true or false). + * @see #setFeature + */ + virtual bool getFeature(const string_type& name) const = 0; + /** + * Set the state of a feature. + * + *

The feature name is any fully-qualified URI. It is + * possible for an XMLReader to recognize a feature name but + * to be unable to set its value; this is especially true + * in the case of an adapter for a SAX1 {@link Parser Parser}, + * which has no way of affecting whether the underlying parser is + * validating, for example.

+ * + *

All XMLReaders are required to support setting + * http://xml.org/sax/features/namespaces to true and + * http://xml.org/sax/features/namespace-prefixes to false.

+ * + *

Some feature values may be immutable or mutable only + * in specific contexts, such as before, during, or after + * a parse.

+ * + * @param name The feature name, which is a fully-qualified URI. + * @param value The requested value of the feature (true or false). + * @exception SAXNotRecognizedException When the + * XMLReader does not recognize the feature name. + * @exception SAXNotSupportedException When the + * XMLReader recognizes the feature name but + * cannot set the requested value. + * @see #getFeature + * @see FeatureNames + * @see http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html#package_description for a list of SAX2 features. + */ + virtual void setFeature(const string_type& name, bool value) = 0; + + ///////////////////////////////////////////////// + // Event Handlers + /** + * Allow an application to register an entity resolver. + * + *

If the application does not register an entity resolver, + * the XMLReader will perform its own default resolution.

+ * + *

Applications may register a new or different resolver in the + * middle of a parse, and the SAX parser must begin using the new + * resolver immediately.

+ * + * @param resolver The entity resolver. + * @see #getEntityResolver + */ + virtual void setEntityResolver(EntityResolverT& resolver) = 0; + /** + * Return the current entity resolver. + * + * @return The current entity resolver, or null if none + * has been registered. + * @see #setEntityResolver + */ + virtual EntityResolverT* getEntityResolver() const = 0; + + /** + * Allow an application to register a DTD event handler. + * + *

If the application does not register a DTD handler, all DTD + * events reported by the SAX parser will be silently ignored.

+ * + *

Applications may register a new or different handler in the + * middle of a parse, and the SAX parser must begin using the new + * handler immediately.

+ * + * @param handler The DTD handler. + * @see #getDTDHandler + */ + virtual void setDTDHandler(DTDHandlerT& handler) = 0; + /** + * Return the current DTD handler. + * + * @return The current DTD handler, or null if none + * has been registered. + * @see #setDTDHandler + */ + virtual DTDHandlerT* getDTDHandler() const = 0; + + /** + * Allow an application to register a content event handler. + * + *

If the application does not register a content handler, all + * content events reported by the SAX parser will be silently + * ignored.

+ * + *

Applications may register a new or different handler in the + * middle of a parse, and the SAX parser must begin using the new + * handler immediately.

+ * + * @param handler The content handler. + * @see #getContentHandler + */ + virtual void setContentHandler(ContentHandlerT& handler) = 0; + /** + * Return the current content handler. + * + * @return The current content handler, or null if none + * has been registered. + * @see #setContentHandler + */ + virtual ContentHandlerT* getContentHandler() const = 0; + + /** + * Allow an application to register an error event handler. + * + *

If the application does not register an error handler, all + * error events reported by the SAX parser will be silently + * ignored; however, normal processing may not continue. It is + * highly recommended that all SAX applications implement an + * error handler to avoid unexpected bugs.

+ * + *

Applications may register a new or different handler in the + * middle of a parse, and the SAX parser must begin using the new + * handler immediately.

+ * + * @param handler The error handler. + * @see #getErrorHandler + */ + virtual void setErrorHandler(ErrorHandlerT& handler) = 0; + /** + * Return the current error handler. + * + * @return The current error handler, or null if none + * has been registered. + * @see #setErrorHandler + */ + virtual ErrorHandlerT* getErrorHandler() const = 0; + + virtual void setDeclHandler(DeclHandlerT& handler) = 0; + /** + * Return the current decl handler. + * + * @return The current decl handler, or 0 if none has been registered + * @see #setDeclHandler + */ + virtual DeclHandlerT* getDeclHandler() const = 0; + + virtual void setLexicalHandler(LexicalHandlerT& handler) = 0; + /** + * Return the current lexical handler. + * + * @return The current lexical handler, or 0 if none has been registered + * @see #setLexicalHandler + */ + virtual LexicalHandlerT* getLexicalHandler() const = 0; + + ////////////////////////////////////////////////// + // Parsing + /** + * Parse an XML document from a system identifier (URI). + * + *

This method is a shortcut for the common case of reading a + * document from a system identifier. It is the exact + * equivalent of the following:

+ * + *
+   * InputSource is(systemId);
+   * parse(is);
+   * 
+ * + *

If the system identifier is a URL, it must be fully resolved + * by the application before it is passed to the parser.

+ * + * @param systemId The system identifier (URI). + * @see #parse(InputSource&) + */ + void parse(const string_type& systemId) + { + InputSourceT is(systemId); + parse(is); + } // parse + /** + * Parse an XML document. + * + *

The application can use this method to instruct the XML + * reader to begin parsing an XML document from any valid input + * source (a character stream, a byte stream, or a URI).

+ * + *

Applications may not invoke this method while a parse is in + * progress (they should create a new XMLReader instead for each + * nested XML document). Once a parse is complete, an + * application may reuse the same XMLReader object, possibly with a + * different input source.

+ * + *

During the parse, the XMLReader will provide information + * about the XML document through the registered event + * handlers.

+ * + *

This method is synchronous: it will not return until parsing + * has ended. If a client application wants to terminate + * parsing early, it should throw an exception.

+ * + * @param input The input source for the top-level of the + * XML document. + * @see InputSource + * @see #parse(const string_type&) + * @see #setEntityResolver + * @see #setDTDHandler + * @see #setContentHandler + * @see #setErrorHandler + */ + virtual void parse(InputSourceT& input) = 0; + + //////////////////////////////////////////////////// + // property implementation +protected: + class PropertyBase + { + public: + virtual ~PropertyBase() { } + }; // PropertyBase + + template + class Property : public PropertyBase + { + public: + Property(propertyTypeT wrappee) : wrappee_(wrappee) { } + + propertyTypeT get() { return wrappee_; } + + private: + propertyTypeT wrappee_; + }; // class Property + +public: + virtual std::auto_ptr doGetProperty(const string_type& name) = 0; + virtual void doSetProperty(const string_type& name, std::auto_ptr value) = 0; + + /** + * Look up the value of a property. + * + *

The property name is any fully-qualified URI. It is + * possible for an XMLReader to recognize a property name but + * to be unable to return its state; this is especially true + * in the case of an adapter for a SAX1 {@link Parser + * Parser}.

+ * + *

XMLReaders are not required to recognize any specific + * property names, though an initial core set is documented for + * SAX2.

+ * + *

Some property values may be available only in specific + * contexts, such as before, during, or after a parse.

+ * + *

Implementors are free (and encouraged) to invent their own properties, + * using names built on their own URIs.

+ * + * @param name The property name, which is a fully-qualified URI. + * @return The current value of the property. + * @exception SAXNotRecognizedException When the + * XMLReader does not recognize the property name. + * @exception SAXNotSupportedException When the + * XMLReader recognizes the property name but + * cannot determine its value at this time. + * @see #setProperty + */ + template + propertyTypeT& getProperty(const string_type& name) const + { + std::auto_ptr pb = doGetProperty(name); + Property* prop = dynamic_cast* >(pb.get()); + + if(!prop) + throw SAX::SAXNotSupportedException("Property " + name + " is wrong type."); + + return prop->get(); + } // getProperty + + /** + * Set the value of a property. + * + *

The property name is any fully-qualified URI. It is + * possible for an XMLReader to recognize a property name but + * to be unable to set its value; this is especially true + * in the case of an adapter for a SAX1 {@link Parser + * Parser}.

+ * + *

XMLReaders are not required to recognize setting + * any specific property names, though a core set is provided with + * SAX2.

+ * + *

Some property values may be immutable or mutable only + * in specific contexts, such as before, during, or after + * a parse.

+ * + *

This method is also the standard mechanism for setting + * extended handlers.

+ * + * @param name The property name, which is a fully-qualified URI. + * @param value The requested value for the property. + * @exception SAXNotRecognizedException When the + * XMLReader does not recognize the property name. + * @exception SAXNotSupportedException When the + * XMLReader recognizes the property name but + * cannot set the requested value. + */ + template + void setProperty(const string_type& name, propertyTypeT& value) + { + Property* prop = new Property(value); + doSetProperty(name, std::auto_ptr(prop)); + } // setProperty +}; // class XMLReaderInterface + +} // namespace SAX +} // namespace Arabica + +/* Included to ensure that #include defines a class called + * XMLReader. + */ +#include + +#endif +// end of file diff --git a/arabica/include/SAX/ext/Attributes2.hpp b/arabica/include/SAX/ext/Attributes2.hpp new file mode 100644 index 000000000..30f96d801 --- /dev/null +++ b/arabica/include/SAX/ext/Attributes2.hpp @@ -0,0 +1,79 @@ +#ifndef ARABICA_ATTRIBUTES2_H +#define ARABICA_ATTRIBUTES2_H + +// Attributes2.h +// $Id$ + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * SAX2 extension to augment the per-attribute information provided though Attributes. + * + * If an implementation supports + * this extension, the attributes provided in + * ContentHandler.startElement() will implement this + * interface, and the http://xml.org/sax/features/use-attributes2 + * feature flag will have the value true. + * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see Attributes + * @see Attributes2Impl + */ +template +class Attributes2 : public Attributes +{ + +public: + typedef string_type stringT; + virtual ~Attributes2() { } + + /** + * Returns true unless the attribute value was provided by DTD defaulting. + * + * @param index The attribute index (zero-based). + * @return true if the value was found in the XML text, + * false if the value was provided by DTD defaulting. 
+ * @throws std::out_of_range exception when the supplied index + * does not identify an attribute + */ + virtual bool isSpecified(unsigned int index) const = 0; + + /** + * Returns true unless the attribute value was provided by DTD defaulting. + * + * @param qName The XML 1.0 qualified name. + * @return true if the value was found in the XML text, + * false if the value was provided by DTD defaulting. + * @throws std::invalid_argument exception when the supplied name + * does not identify an attribute + */ + virtual bool isSpecified(const stringT& qName) const = 0; + + /** + * Returns true unless the attribute value was provided by DTD defaulting. + * + * @param uri The Namespace URI, or the empty string if the name has no + * Namespace URI. + * @param localName The attribute's local name. + * @return true if the value was found in the XML text, + * false if the value was provided by DTD defaulting. + * @throws std::invalid_argument exception when the supplied names + * does not identify an attribute + */ + virtual bool isSpecified(const stringT& uri, const stringT& localName) const = 0; +}; // class Attributes2 + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/ext/DeclHandler.hpp b/arabica/include/SAX/ext/DeclHandler.hpp new file mode 100644 index 000000000..29125a910 --- /dev/null +++ b/arabica/include/SAX/ext/DeclHandler.hpp @@ -0,0 +1,135 @@ +#ifndef ARABICA_DECL_HANDLER_H +#define ARABICA_DECL_HANDLER_H + +// DeclHandler.h +// $Id$ + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * SAX2 extension handler for DTD declaration events. + * + *

This is an optional extension handler for SAX2 to provide + * information about DTD declarations in an XML document. XML + * readers are not required to support this handler, and this + * handler is not included in the core SAX2 distribution.

+ * + *

Note that data-related DTD declarations (unparsed entities and + * notations) are already reported through the {@link + * DTDHandler DTDHandler} interface.

+ * + *

If you are using the declaration handler together with a lexical + * handler, all of the events will occur between the + * {@link LexicalHandler#startDTD startDTD} and the + * {@link LexicalHandler#endDTD endDTD} events.

+ * + *

To set the DeclHandler for an XML reader, use the + * {@link XMLReader#setProperty setProperty} method + * with the propertyId "http://xml.org/sax/properties/declaration-handler". + * If the reader does not support declaration events, it will throw a + * {@link SAXNotRecognizedException SAXNotRecognizedException} + * or a + * {@link SAXNotSupportedException SAXNotSupportedException} + * when you attempt to register the handler.

+ * + * @since 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 1.0 + * @see XMLReader + */ +template > +class DeclHandler +{ +public: + virtual ~DeclHandler() { } + + /** + * Report an element type declaration. + * + *

The content model will consist of the string "EMPTY", the + * string "ANY", or a parenthesised group, optionally followed + * by an occurrence indicator. The model will be normalized so + * that all parameter entities are fully resolved and all whitespace + * is removed, and will include the enclosing parentheses. Other + * normalization (such as removing redundant parentheses or + * simplifying occurrence indicators) is at the discretion of the + * parser.

+ * + * @param name The element type name. + * @param model The content model as a normalized string. + */ + virtual void elementDecl(const string_type& name, const string_type& model) = 0; + /** + * Report an attribute type declaration. + * + *

Only the effective (first) declaration for an attribute will + * be reported. The type will be one of the strings "CDATA", + * "ID", "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", + * "ENTITIES", a parenthesized token group with + * the separator "|" and all whitespace removed, or the word + * "NOTATION" followed by a space followed by a parenthesized + * token group with all whitespace removed.

+ * + *

Any parameter entities in the attribute value will be + * expanded, but general entities will not.

+ * + * @param elementName The name of the associated element. + * @param attributeName The name of the attribute. + * @param type A string representing the attribute type. + * @param valueDefault A string representing the attribute default + * ("#IMPLIED", "#REQUIRED", or "#FIXED") or empty string if + * none of these applies. + * @param value A string representing the attribute's default value, + * or empty string if there is none. + */ + virtual void attributeDecl(const string_type& elementName, + const string_type& attributeName, + const string_type& type, + const string_type& valueDefault, + const string_type& value) = 0; + /** + * Report an internal entity declaration. + * + *

Only the effective (first) declaration for each entity + * will be reported. All parameter entities in the value + * will be expanded, but general entities will not.

+ * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. + * @param value The replacement text of the entity. + * @see #externalEntityDecl + * @see DTDHandler#unparsedEntityDecl + */ + virtual void internalEntityDecl(const string_type& name, const string_type& value) = 0; + /** + * Report a parsed external entity declaration. + * + *

Only the effective (first) declaration for each entity + * will be reported.

+ * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. + * @param publicId The declared public identifier of the entity, or + * an empty string if none was declared. + * @param systemId The declared system identifier of the entity. + * @see #internalEntityDecl + * @see DTDHandler#unparsedEntityDecl + */ + virtual void externalEntityDecl(const string_type& name, + const string_type& publicId, + const string_type& systemId) = 0; +}; // class DeclHandler + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/ext/LexicalHandler.hpp b/arabica/include/SAX/ext/LexicalHandler.hpp new file mode 100644 index 000000000..3e57576ab --- /dev/null +++ b/arabica/include/SAX/ext/LexicalHandler.hpp @@ -0,0 +1,202 @@ +#ifndef ARABICA_LEXICAL_HANDLER_H +#define ARABICA_LEXICAL_HANDLER_H + +// LexicalHandler.h +// $Id$ + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * SAX2 extension handler for lexical events. + * + *

This is an optional extension handler for SAX2 to provide + * lexical information about an XML document, such as comments + * and CDATA section boundaries; XML readers are not required to + * support this handler, and it is not part of the core SAX2 + * distribution.

+ * + *

The events in the lexical handler apply to the entire document, + * not just to the document element, and all lexical handler events + * must appear between the content handler's startDocument and + * endDocument events.

+ * + *

To set the LexicalHandler for an XML reader, use the + * {@link XMLReader#setProperty setProperty} method + * with the propertyId "http://xml.org/sax/properties/lexical-handler". + * If the reader does not support lexical events, it will throw a + * {@link SAXNotRecognizedException SAXNotRecognizedException} + * or a + * {@link SAXNotSupportedException SAXNotSupportedException} + * when you attempt to register the handler.

+ * + * @since 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 1.0 + * @see XMLReader#setProperty + * @see SAXNotRecognizedException + * @see SAXNotSupportedException + */ +template > +class LexicalHandler +{ +public: + virtual ~LexicalHandler() { } + + /** + * Report the start of DTD declarations, if any. + * + *

This method is intended to report the beginning of the + * DOCTYPE declaration; if the document has no DOCTYPE declaration, + * this method will not be invoked.

+ * + *

All declarations reported through + * {@link DTDHandler DTDHandler} or + * {@link DeclHandler DeclHandler} events must appear + * between the startDTD and {@link #endDTD endDTD} events. + * Declarations are assumed to belong to the internal DTD subset + * unless they appear between {@link #startEntity startEntity} + * and {@link #endEntity endEntity} events. Comments and + * processing instructions from the DTD should also be reported + * between the startDTD and endDTD events, in their original + * order of (logical) occurrence; they are not required to + * appear in their correct locations relative to DTDHandler + * or DeclHandler events, however.

+ * + *

Note that the start/endDTD events will appear within + * the start/endDocument events from ContentHandler and + * before the first + * {@link ContentHandler#startElement startElement} + * event.

+ * + * @param name The document type name. + * @param publicId The declared public identifier for the + * external DTD subset, or an empty string if none was declared. + * @param systemId The declared system identifier for the + * external DTD subset, or an empty string if none was declared. + * @see #endDTD + * @see #startEntity + */ + virtual void startDTD(const string_type& name, + const string_type& publicId, + const string_type& systemId) = 0; + /** + * Report the end of DTD declarations. + * + *

This method is intended to report the end of the + * DOCTYPE declaration; if the document has no DOCTYPE declaration, + * this method will not be invoked.

+ * + * @see #startDTD + */ + virtual void endDTD() = 0; + + /** + * Report the beginning of some internal and external XML entities. + * + *

The reporting of parameter entities (including + * the external DTD subset) is optional, and SAX2 drivers that + * support LexicalHandler may not support it; you can use the + * http://xml.org/sax/features/lexical-handler/parameter-entities + * feature to query or control the reporting of parameter entities.

+ * + *

General entities are reported with their regular names, + * parameter entities have '%' prepended to their names, and + * the external DTD subset has the pseudo-entity name "[dtd]".

+ * + *

When a SAX2 driver is providing these events, all other + * events must be properly nested within start/end entity + * events. There is no additional requirement that events from + * {@link DeclHandler DeclHandler} or + * {@link DTDHandler DTDHandler} be properly ordered.

+ * + *

Note that skipped entities will be reported through the + * {@link ContentHandler#skippedEntity skippedEntity} + * event, which is part of the ContentHandler interface.

+ * + *

Because of the streaming event model that SAX uses, some + * entity boundaries cannot be reported under any + * circumstances:

+ * + *
    + *
  • general entities within attribute values
  • + *
  • parameter entities within declarations
  • + *
+ * + *

These will be silently expanded, with no indication of where + * the original entity boundaries were.

+ * + *

Note also that the boundaries of character references (which + * are not really entities anyway) are not reported.

+ * + *

All start/endEntity events must be properly nested. + * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%', and if it is the + * external DTD subset, it will be "[dtd]". + * @see #endEntity + * @see DeclHandler#internalEntityDecl + * @see DeclHandler#externalEntityDecl + */ + virtual void startEntity(const string_type& name) = 0; + /** + * Report the end of an entity. + * + * @param name The name of the entity that is ending. + * @see #startEntity + */ + virtual void endEntity(const string_type& name) = 0; + + /** + * Report the start of a CDATA section. + * + *

The contents of the CDATA section will be reported through + * the regular {@link ContentHandler#characters + * characters} event; this event is intended only to report + * the boundary.

+ * + * @see #endCDATA + */ + virtual void startCDATA() = 0; + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + virtual void endCDATA() = 0; + + /** + * Report an XML comment anywhere in the document. + * + *

This callback will be used for comments inside or outside the + * document element, including comments in the external DTD + * subset (if read). Comments in the DTD must be properly + * nested inside start/endDTD and start/endEntity events (if + * used).

+ * + * @param text A string holding the comment. + */ + virtual void comment(const string_type& text) = 0; + + const string_type dtd_pseudo_entity; + + protected: + LexicalHandler() : + dtd_pseudo_entity(string_adaptor::construct_from_utf8("[dtd]")) + { + } +}; // class LexicalHandler + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/ext/Locator2.hpp b/arabica/include/SAX/ext/Locator2.hpp new file mode 100644 index 000000000..4642d5fe6 --- /dev/null +++ b/arabica/include/SAX/ext/Locator2.hpp @@ -0,0 +1,75 @@ +#ifndef ARABICA_LOCATOR2_H +#define ARABICA_LOCATOR2_H + +// Locator2.h +// $Id$ + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * SAX2 extension to augment the entity information provided though a Locator. + *

+ * If an implementation supports this extension, the Locator provided in + * ContentHandler.setDocumentLocator() will implement this interface, + * and the http://xml.org/sax/features/use-locator2 feature flag will have + * the value true. + *

+ * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + */ +template +class Locator2 : public Locator +{ +public: + typedef string_type stringT; + + virtual ~Locator2() { } + + /** + * Returns the version of XML used for the entity. This will normally + * be the identifier from the current entity's + * declaration, or be defaulted by the parser. + *

+ * At this writing, only one version ("1.0") is defined, but it seems + * likely that a new version will be defined which has slightly different + * rules about which characters are legal in XML names. + *

+ * @return Identifier for the XML version being used to interpret the entity's text. + */ + virtual stringT getXMLVersion() const = 0; + + /** + * Returns the name of the character encoding for the entity. If the encoding was + * declared externally (for example, in a MIME Content-Type header), that will be + * the name returned. Else if there was an XML encoding declaration at + * the start of the document, that encoding name will be returned. Otherwise the + * encoding will have been inferred (normally to be UTF-8, or some UTF-16 variant), and + * that inferred name will be returned. + *

+ * Note that some recent W3C specifications require that text in some encodings + * be normalized, using Unicode Normalization Form C, before processing. Such + * normalization must be performed by applications, and would normally be triggered + * based on the value returned by this method. + *

+ * Encoding names may be those used by the underlying implementation, and + * comparisons should be case-insensitive. + * + * @return Name of the character encoding being used to interpret the entity's text. + */ + virtual stringT getEncoding() const = 0; +}; // class Locator2 + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file + diff --git a/arabica/include/SAX/helpers/AttributeDefaults.hpp b/arabica/include/SAX/helpers/AttributeDefaults.hpp new file mode 100644 index 000000000..abf57859a --- /dev/null +++ b/arabica/include/SAX/helpers/AttributeDefaults.hpp @@ -0,0 +1,34 @@ +#ifndef ARABICA_ATTRIBUTE_DEFAULTS_H +#define ARABICA_ATTRIBUTE_DEFAULTS_H +/* + * $Id$ + */ + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +template > +struct AttributeDefaults +{ + const string_type required; + const string_type implied; + const string_type fixed; + + AttributeDefaults() : + required(string_adaptor::construct_from_utf8("#REQUIRED")), + implied(string_adaptor::construct_from_utf8("#IMPLIED")), + fixed(string_adaptor::construct_from_utf8("#FIXED")) + { + } // AttributeDefaults +}; // struct AttributeDefaults + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/helpers/AttributeListImpl.hpp b/arabica/include/SAX/helpers/AttributeListImpl.hpp new file mode 100644 index 000000000..dbe270e02 --- /dev/null +++ b/arabica/include/SAX/helpers/AttributeListImpl.hpp @@ -0,0 +1,296 @@ +#ifndef ARABICA_ATTRIBUTES_LIST_IMPL_H +#define ARABICA_ATTRIBUTES_LIST_IMPL_H +// SAX default implementation for AttributeList. 
+// $Id$ + +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +template +class Attribute +{ +public: + Attribute(const string_type& name, const string_adaptor& type, const string_adaptor& value) + : name_(name), type_(type), value_(value) + {} + virtual ~Attribute() { } + +public: + string_type name_; + const string_type& type_; + string_type value_; +}; + +static const std::string empty_; +const std::string const types[] = { empty_, "CDATA", "ID", "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", "ENTITIES", "NOTATION" }; // TV + +/** + * Default implementation for AttributeList. + * + *

AttributeListImpl implements the deprecated SAX1 {@link + * AttributeList AttributeList} interface, and has been + * replaced by the new SAX2 {@link AttributesImpl + * AttributesImpl} helper class.

+ * + *

This class provides a convenience implementation of the SAX + * {@link AttributeList AttributeList} interface. This + * implementation is useful both for SAX parser writers, who can use + * it to provide attributes to the application, and for SAX application + * writers, who can use it to create a persistent copy of an element's + * attribute specifications:

+ * + *
+ * private AttributeList myatts;
+ *
+ * void startElement(const string_type& name, const AttributeList& atts)
+ * {
+ *   // create a persistent copy of the attribute list
+ *   // for use outside this method
+ *   AttributeListImpl myatts(atts);
+ *   [...]
+ * }
+ * 
+ * + *

Please note that SAX parsers are not required to use this + * class to provide an implementation of AttributeList; it is + * supplied only as an optional convenience. In particular, + * parser writers are encouraged to invent more efficient + * implementations.

+ * + * @deprecated This class implements a deprecated interface, + * {@link AttributeList AttributeList}; + * that interface has been replaced by + * {@link Attributes Attributes}, + * which is implemented in the + * {@link AttributesImpl + * AttributesImpl} helper class. + * @since SAX 1.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see AttributeList + * @see DocumentHandler#startElement + */ +template +class AttributeListImpl : public AttributeList +{ +public: + AttributeListImpl() : atts_() { } + AttributeListImpl(const AttributeList& atts) + : atts_(atts.getLength()) + { + setAttributeList(atts); + } // AttributeListImpl + + AttributeListImpl& operator=(const AttributeList& atts) + { + setAttributeList(atts); + + return *this; + } // operator= + + virtual ~AttributeListImpl() { clear(); } + + ////////////////////////////////////////////////////////////////////// + // Methods specific to this class. + ////////////////////////////////////////////////////////////////////// + /** + * Set the attribute list, discarding previous contents. + * + *

This method allows an application writer to reuse an + * attribute list easily.

+ * + * @param atts The attribute list to copy. + */ + void setAttributeList(const AttributeList& atts) + { + int count = atts.getLength(); + + clear(); + + for (int i = 0; i < count; ++i) + addAttribute(atts.getName(i), atts.getType(i), atts.getValue(i)); + } // setAttributeList + + /** + * Add an attribute to an attribute list. + * + *

This method is provided for SAX parser writers, to allow them + * to build up an attribute list incrementally before delivering + * it to the application.

+ * + * @param name The attribute name. + * @param type The attribute type ("NMTOKEN" for an enumeration). + * @param value The attribute value. + * @see #removeAttribute + * @see DocumentHandler#startElement + */ + void addAttribute(const string_type& name, const string_adaptor& type, const string_adaptor& value) + { + atts_.push_back(new Attribute(name, type, value)); // TV + } // addAttribute + + /** + * Remove an attribute from the list. + * + *

SAX application writers can use this method to filter an + * attribute out of an AttributeList. Note that invoking this + * method will change the length of the attribute list and + * the indices of the attributes that follow the removed one.

+ * + *

If the requested attribute is not in the list, this is + * a no-op.

+ * + * @param name The attribute name. + * @see #addAttribute + */ + void removeAttribute(const string_type& name) + { + int i = index(name); + + if(i < 0) return; + delete atts_[i]; + atts_.erase(atts_.begin() + i); + } // removeAttribute + + /** + * Clear the attribute list. + * + *

SAX parser writers can use this method to reset the attribute + * list between DocumentHandler.startElement events. Normally, + * it will make sense to reuse the same AttributeListImpl object + * rather than allocating a new one each time.

+ * + * @see DocumentHandler#startElement + */ + void clear() + { + while(!atts_.empty()) + { + delete atts_.back(); + atts_.pop_back(); + } + } // clear + + ////////////////////////////////////////////////////////////////////// + // Implementation of AttributeList + ////////////////////////////////////////////////////////////////////// + /** + * Return the number of attributes in the list. + * + * @return The number of attributes in the list. + * @see AttributeList#getLength + */ + virtual int getLength() const + { + return atts_.size(); + } // getLength + + /** + * Get the name of an attribute (by position). + * + * @param i The position of the attribute in the list. + * @return The attribute name as a string, or an empty string if there + * is no attribute at that position. + * @see AttributeList#getName(int) + */ + virtual const string_type& getName(int i) const + { + if(i < 0 || i >= static_cast<int>(atts_.size())) + return empty_; + return atts_[i]->name_; + } // getName + + /** + * Get the type of an attribute (by position). + * + * @param i The position of the attribute in the list. + * @return The attribute type as a string ("NMTOKEN" for an + * enumeration, and "CDATA" if no declaration was + * read), or an empty string if there is no attribute at + * that position. + * @see AttributeList#getType(int) + */ + virtual const string_type& getType(int i) const + { + if(i < 0 || i >= static_cast<int>(atts_.size())) + return empty_; + return atts_[i]->type_; + } // getType + + /** + * Get the value of an attribute (by position). + * + * @param i The position of the attribute in the list. + * @return The attribute value as a string, or an empty string if + * there is no attribute at that position. + * @see AttributeList#getValue(int) + */ + virtual const string_type& getValue(int i) const + { + if(i < 0 || i >= static_cast<int>(atts_.size())) + return empty_; + return atts_[i]->value_; + } // getValue + + /** + * Get the type of an attribute (by name). + * + * @param name The attribute name. 
+ * @return The attribute type as a string ("NMTOKEN" for an + * enumeration, and "CDATA" if no declaration was + * read). + * @see AttributeList#getType(java.lang.String) + */ + virtual const string_type& getType(const string_adaptor& name) const + { + int i = index(name); + return i < 0 ? empty_ : getType(i); + } // getType + + /** + * Get the value of an attribute (by name). + * + * @param name The attribute name. + * @see AttributeList#getValue(java.lang.String) + */ + virtual const string_type& getValue(const string_adaptor& name) const + { + int i = index(name); + return i < 0 ? empty_ : getValue(i); + } // getValue + +private: + ////////////////////////////////////////////////////////////////////// + // Internal state. + ////////////////////////////////////////////////////////////////////// + std::vector *> atts_; + + int index(const string_type& name) const + { + int i = 0, res = -1; + std::vector*>::const_iterator iter; + for (iter = atts_.begin() ; iter != atts_.end() ; i++, iter++) + if ((*iter)->name_ == name) { + res = i; + break; + } + return res; + } // index + + bool operator==(const AttributeList&) const; // not implemented +}; // class AttributeListImpl + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file + diff --git a/arabica/include/SAX/helpers/AttributeTypes.hpp b/arabica/include/SAX/helpers/AttributeTypes.hpp new file mode 100644 index 000000000..ec153f93a --- /dev/null +++ b/arabica/include/SAX/helpers/AttributeTypes.hpp @@ -0,0 +1,48 @@ +#ifndef ARABICA_ATTRIBUTE_TYPES_H +#define ARABICA_ATTRIBUTE_TYPES_H +/* + * $Id$ + */ + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +template > +struct AttributeTypes +{ + const string_type cdata; + const string_type nmtoken; + const string_type nmtokens; + const string_type enumeration; + const string_type entity; + const string_type entities; + const string_type id; + const string_type idref; + const string_type idrefs; + const string_type notation; + + 
AttributeTypes() : + cdata(string_adaptor::construct_from_utf8("CDATA")), + nmtoken(string_adaptor::construct_from_utf8("NMTOKEN")), + nmtokens(string_adaptor::construct_from_utf8("NMTOKENS")), + enumeration(string_adaptor::construct_from_utf8("ENUMERATION")), + entity(string_adaptor::construct_from_utf8("ENTITY")), + entities(string_adaptor::construct_from_utf8("ENTITIES")), + id(string_adaptor::construct_from_utf8("ID")), + idref(string_adaptor::construct_from_utf8("IDREF")), + idrefs(string_adaptor::construct_from_utf8("IDREFS")), + notation(string_adaptor::construct_from_utf8("NOTATION")) + { + } // AttributeTypes +}; // struct AttributeTypes + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/helpers/AttributesImpl.hpp b/arabica/include/SAX/helpers/AttributesImpl.hpp new file mode 100644 index 000000000..12ae708da --- /dev/null +++ b/arabica/include/SAX/helpers/AttributesImpl.hpp @@ -0,0 +1,587 @@ +#ifndef ARABICA_ATTRIBUTES_IMPL_H +#define ARABICA_ATTRIBUTES_IMPL_H + +// AttributesImpl.h - default implementation of Attributes. +// $Id$ + +#include +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Default implementation of the Attributes interface. + * + *

This class provides a default implementation of the SAX2 + * {@link Attributes Attributes} interface, with the + * addition of manipulators so that the list can be modified or + * reused.

+ * + *

There are two typical uses of this class:

+ * + *
    + *
  1. to take a persistent snapshot of an Attributes object + * in a {@link ContentHandler#startElement startElement} event; or
    + *
  2. to construct or modify an Attributes object in a SAX2 driver or filter.
    + *
+ * + *

This class replaces the now-deprecated SAX1 {@link + * AttributeListImpl AttributeListImpl} + * class.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + */ +template > +class AttributesImpl : public Attributes +{ +public: + typedef Attributes AttributesT; + + struct Attr + { + Attr(const string_type& uri, + const string_type& localName, + const string_type& qName, + const string_type& type, + const string_type& value) : + uri_(uri), localName_(localName), qName_(qName), type_(type), value_(value) + { } + Attr() { } + Attr& operator=(const Attr& rhs) + { + uri_ = rhs.uri_; + localName_ = rhs.localName_; + qName_ = rhs.qName_; + type_ = rhs.type_; + value_ = rhs.value_; + + return *this; + } // operator= + bool operator==(const Attr& rhs) const + { + return (uri_ == rhs.uri_) && + (localName_ == rhs.localName_) && + (qName_ == rhs.qName_) && + (type_ == rhs.type_) && + (value_ == rhs.value_); + } // operator== + + string_type uri_; + string_type localName_; + string_type qName_; + string_type type_; + string_type value_; + }; // Attr + + //////////////////////////////////////////////////////////////////// + // Constructors. + AttributesImpl() { } + AttributesImpl(const AttributesT& rhs) + { + setAttributes(rhs); + } // AttributesImpl + + AttributesImpl& operator=(const AttributesT& rhs) + { + setAttributes(rhs); + } // operator= + + bool operator==(const AttributesImpl& rhs) const + { + return attributes_ == rhs.attributes_; + } // operator== + + //////////////////////////////////////////////////////////////////// + // Implementation of SAX::Attributes. + /** + * Return the number of attributes in the list. + * + * @return The number of attributes in the list. + * @see Attributes#getLength + */ + virtual int getLength() const + { + return static_cast(attributes_.size()); + } // getLength + + /** + * Return an attribute's Namespace URI. + * + * @param index The attribute's index (zero-based). + * @return The Namespace URI, the empty string if none is + * available, or if the index is out of range. 
+ * @see Attributes#getURI + */ + virtual string_type getURI(unsigned int index) const + { + if(index < attributes_.size()) + return attributes_[index].uri_; + return emptyString_; + } // getUri + + /** + * Return an attribute's local name. + * + * @param index The attribute's index (zero-based). + * @return The attribute's local name, the empty string if + * none is available, or if the index if out of range. + * @see Attributes#getLocalName + */ + virtual string_type getLocalName(unsigned int index) const + { + if(index < attributes_.size()) + return attributes_[index].localName_; + + return emptyString_; + } // getLocalName + + /** + * Return an attribute's qualified (prefixed) name. + * + * @param index The attribute's index (zero-based). + * @return The attribute's qualified name, the empty string if + * none is available, or if the index is out of bounds. + * @see Attributes#getQName + */ + virtual string_type getQName(unsigned int index) const + { + if(index < attributes_.size()) + return attributes_[index].qName_; + + return emptyString_; + } // getQName + + /** + * Return an attribute's type by index. + * + * @param index The attribute's index (zero-based). + * @return The attribute's type, "CDATA" if the type is unknown, or an empty + * string if the index is out of bounds. + * @see Attributes#getType(int) + */ + virtual string_type getType(unsigned int index) const + { + if(index < attributes_.size()) + return attributes_[index].type_; + + return emptyString_; + } // getType + + /** + * Return an attribute's value by index. + * + * @param index The attribute's index (zero-based). + * @return The attribute's value or an empty string if the index is out of bounds. + * @see Attributes#getValue(int) + */ + virtual string_type getValue(unsigned int index) const + { + if(index < attributes_.size()) + return attributes_[index].value_; + + return emptyString_; + } // getValue + + /** + * Look up an attribute's index by Namespace name. + * + *

In many cases, it will be more efficient to look up the name once and + * use the index query methods rather than using the name query methods + * repeatedly.

+ * + * @param uri The attribute's Namespace URI, or the empty + * string if none is available. + * @param localName The attribute's local name. + * @return The attribute's index, or -1 if none matches. + * @see Attributes#getIndex(const string_type&,const string_type&) + */ + virtual int getIndex(const string_type& uri, const string_type& localName) const + { + typename AttrList::const_iterator a = std::find_if(attributes_.begin(), attributes_.end(), AttributeNamed(uri, localName)); + if(a != attributes_.end()) + return static_cast(std::distance(attributes_.begin(), a)); + return -1; + } // getIndex + + /** + * Look up an attribute's index by qualified (prefixed) name. + * + * @param qName The qualified name. + * @return The attribute's index, or -1 if none matches. + * @see Attributes#getIndex(const string_type&) + */ + virtual int getIndex(const string_type& qName) const + { + size_t max = attributes_.size(); + for(size_t i = 0; i < max; ++i) + { + if(attributes_[i].qName_ == qName) + return static_cast(i); + } + return -1; + } // getIndex + + /** + * Look up an attribute's type by Namespace-qualified name. + * + * @param uri The Namespace URI, or the empty string for a name + * with no explicit Namespace URI. + * @param localName The local name. + * @return The attribute's type, or an empty string if there is no + * matching attribute. + * @see Attributes#getType(const string_type&,const string_type&) + */ + virtual string_type getType(const string_type& uri, const string_type& localName) const + { + typename AttrList::const_iterator a = std::find_if(attributes_.begin(), attributes_.end(), AttributeNamed(uri, localName)); + if(a != attributes_.end()) + return a->type_; + return emptyString_; + } // getType + + /** + * Look up an attribute's type by qualified (prefixed) name. + * + * @param qName The qualified name. + * @return The attribute's type, or an empty string if there is no + * matching attribute. 
+ * @see Attributes#getType(const string_type&) + */ + virtual string_type getType(const string_type& qName) const + { + size_t max = attributes_.size(); + for(size_t i = 0; i < max; ++i) + { + if(attributes_[i].qName_ == qName) + return attributes_[i].type_; + } // for ... + return emptyString_; + } // getType + + /** + * Look up an attribute's value by Namespace-qualified name. + * + * @param uri The Namespace URI, or the empty string for a name + * with no explicit Namespace URI. + * @param localName The local name. + * @return The attribute's value, or an empty string if there is no + * matching attribute. + * @see Attributes#getValue(const string_type&,const string_type&) + */ + virtual string_type getValue(const string_type& uri, const string_type& localName) const + { + typename AttrList::const_iterator a = std::find_if(attributes_.begin(), attributes_.end(), AttributeNamed(uri, localName)); + if(a != attributes_.end()) + return a->value_; + return emptyString_; + } // getType + + /** + * Look up an attribute's value by qualified (prefixed) name. + * + * @param qName The qualified name. + * @return The attribute's value, or an empty string if there is no + * matching attribute. + * @see Attributes#getValue(const string_type&) + */ + virtual string_type getValue(const string_type& qName) const + { + size_t max = attributes_.size(); + for(size_t i = 0; i < max; ++i) + { + if(attributes_[i].qName_ == qName) + return attributes_[i].value_; + } // for ... + return emptyString_; + } // getValue + + //////////////////////////////////////////////////////////////////// + // Manipulators. + /** + * Clear the attribute list for reuse. + */ + void clear() + { + attributes_.erase(attributes_.begin(), attributes_.end()); + } // clear + + /** + * Copy an entire Attributes object. + * + *

It may be more efficient to reuse an existing object + * rather than constantly allocating new ones.

+ * + * @param atts The attributes to copy. + */ + void setAttributes(const AttributesT& atts) + { + clear(); + + int max = atts.getLength(); + for(int i = 0; i < max; ++i) + attributes_.push_back(Attr(atts.getURI(i), + atts.getLocalName(i), + atts.getQName(i), + atts.getType(i), + atts.getValue(i))); + } // setAttributes + + /** + * Add an attribute to the end of the list. + * + *

For the sake of speed, this method does no checking + * to see if the attribute is already in the list: that is + * the responsibility of the application.

+ * + * @param uri The Namespace URI, or the empty string if + * none is available or Namespace processing is not + * being performed. + * @param localName The local name, or the empty string if + * Namespace processing is not being performed. + * @param qName The qualified (prefixed) name, or the empty string + * if qualified names are not available. + * @param type The attribute type as a string. + * @param value The attribute value. + */ + void addAttribute(const string_type& uri, + const string_type& localName, + const string_type& qName, + const string_type& type, + const string_type& value) + { + attributes_.push_back(Attr(uri, localName, qName, type, value)); + } // addAttribute + + void addAttribute(const Attr& attr) + { + attributes_.push_back(attr); + } // addAttribute + + /** + * Add an attribute to the end of the list. + * + * @param uri The Namespace URI, or the empty string if + * none is available or Namespace processing is not + * being performed. + * @param localName The local name, or the empty string if + * Namespace processing is not being performed. + * @param qName The qualified (prefixed) name, or the empty string + * if qualified names are not available. + * @param type The attribute type as a string. + * @param value The attribute value. + */ + void addOrReplaceAttribute(const string_type& uri, + const string_type& localName, + const string_type& qName, + const string_type& type, + const string_type& value) + { + typename AttrList::iterator a = std::find_if(attributes_.begin(), attributes_.end(), AttributeNamed(uri, localName)); + if(a != attributes_.end()) + { + a->value_ = value; + return; + } // if ... + + attributes_.push_back(Attr(uri, localName, qName, type, value)); + } // addOrReplaceAttribute + + /** + * Set an attribute in the list. + * + *

For the sake of speed, this method does no checking + * for name conflicts or well-formedness: such checks are the + * responsibility of the application.

+ * + * @param index The index of the attribute (zero-based). + * @param uri The Namespace URI, or the empty string if + * none is available or Namespace processing is not + * being performed. + * @param localName The local name, or the empty string if + * Namespace processing is not being performed. + * @param qName The qualified name, or the empty string + * if qualified names are not available. + * @param type The attribute type as a string. + * @param value The attribute value. + * @exception java.lang.ArrayIndexOutOfBoundsException When the + * supplied index does not point to an attribute + * in the list. + */ + void setAttribute(unsigned int index, + const string_type& uri, + const string_type& localName, + const string_type& qName, + const string_type& type, + const string_type& value) + { + if(index < attributes_.size()) + { + Attr& a = attributes_[index]; + a.uri_ = uri; + a.localName_ = localName; + a.qName_ = qName; + a.type_ = type; + a.value_ = value; + } + else + badIndex(index); + } // setAttribute + + /** + * Remove an attribute from the list. + * + * @param index The index of the attribute (zero-based). + * @exception std::out_of_range When the + * supplied index does not point to an attribute + * in the list. + */ + void removeAttribute(unsigned int index) + { + if(index < attributes_.size()) + attributes_.erase(attributes_.begin() + index); + else + badIndex(index); + } // removeAttribute + + /** + * Set the Namespace URI of a specific attribute. + * + * @param index The index of the attribute (zero-based). + * @param uri The attribute's Namespace URI, or the empty + * string for none. + * @exception std::out_of_range When the + * supplied index does not point to an attribute + * in the list. + */ + void setURI(unsigned int index, const string_type& uri) + { + if(index < attributes_.size()) + attributes_[index].uri_ = uri; + else + badIndex(index); + } // setURI + + /** + * Set the local name of a specific attribute. 
+ * + * @param index The index of the attribute (zero-based). + * @param localName The attribute's local name, or the empty + * string for none. + * @exception std::out_of_range When the + * supplied index does not point to an attribute + * in the list. + */ + void setLocalName(unsigned int index, const string_type& localName) + { + if(index < attributes_.size()) + attributes_[index].localName_ = localName; + else + badIndex(index); + } // setLocalName + + /** + * Set the qualified name of a specific attribute. + * + * @param index The index of the attribute (zero-based). + * @param qName The attribute's qualified name, or the empty + * string for none. + * @exception std::out_of_range When the + * supplied index does not point to an attribute + * in the list. + */ + void setQName(unsigned int index, const string_type& qName) + { + if(index >= 0 && index < attributes_.size()) + attributes_[index].qName_ = qName; + else + badIndex(index); + } // setQName + + /** + * Set the type of a specific attribute. + * + * @param index The index of the attribute (zero-based). + * @param type The attribute's type. + * @exception std::out_of_range When the + * supplied index does not point to an attribute + * in the list. + */ + void setType(unsigned int index, const string_type& type) + { + if(index >= 0 && index < attributes_.size()) + attributes_[index].type_ = type; + else + badIndex(index); + } // setType + + /** + * Set the value of a specific attribute. + * + * @param index The index of the attribute (zero-based). + * @param value The attribute's value. + * @exception std::out_of_range When the + * supplied index does not point to an attribute + * in the list. + */ + void setValue(unsigned int index, const string_type& value) + { + if(index < attributes_.size()) + attributes_[index].value_ = value; + else + badIndex(index); + } // setURI + +private: + //////////////////////////////////////////////////////////////////// + // Internal methods. 
+ //////////////////////////////////////////////////////////////////// + void badIndex(unsigned int index) + { + // sort out + std::stringstream msg; + msg << "Attempt to modify attribute at illegal index: " << index; + throw std::out_of_range(msg.str()); + } + + class AttributeNamed + { + public: + AttributeNamed(const string_type& uri, const string_type& localName) : + uri_(uri), localName_(localName) { } + AttributeNamed(const AttributeNamed& rhs) : + uri_(rhs.uri_), localName_(rhs.localName_) { } + ~AttributeNamed() { } + + bool operator()(const Attr& attr) const + { + return (attr.uri_ == uri_) && (attr.localName_ == localName_); + } // operator() + + private: + const string_type& uri_; + const string_type& localName_; + + AttributeNamed& operator=(const AttributeNamed&); + bool operator==(const AttributeNamed&) const; + }; // class AttributeNamed + + typedef typename std::deque AttrList; + AttrList attributes_; + + string_type emptyString_; +}; // class AttributesImpl + +} // namespace SAX +} // namespace Arabica + +#endif + diff --git a/arabica/include/SAX/helpers/DefaultHandler.hpp b/arabica/include/SAX/helpers/DefaultHandler.hpp new file mode 100644 index 000000000..c7ffd3612 --- /dev/null +++ b/arabica/include/SAX/helpers/DefaultHandler.hpp @@ -0,0 +1,603 @@ +#ifndef ARABICA_DEFAULT_HANDLER_H +#define ARABICA_DEFAULT_HANDLER_H + +// DefaultHandler.h +// $Id$ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Default base class for SAX2 event handlers. + * + *

This class is available as a convenience base class for SAX2 + * applications: it provides default implementations for all of the + * callbacks in the four core SAX2 handler classes:

+ * + *
    + *
  • {@link EntityResolver EntityResolver}
    + *
  • {@link DTDHandler DTDHandler}
    + *
  • {@link ContentHandler ContentHandler}
    + *
  • {@link ErrorHandler ErrorHandler}
    + *
+ * + *

Application writers can extend this class when they need to + * implement only part of an interface; parser writers can + * instantiate this class to provide default handlers when the + * application has not supplied its own.

+ * + *

This class replaces the deprecated SAX1 + * {@link HandlerBase HandlerBase} class.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see EntityResolver + * @see DTDHandler + * @see ContentHandler + * @see ErrorHandler + */ +template > +class DefaultHandler : public EntityResolver, + public DTDHandler, + public ContentHandler, + public ErrorHandler, + public LexicalHandler, + public DeclHandler +{ +public: + typedef InputSource InputSourceT; + typedef Locator LocatorT; + typedef Attributes AttributesT; + typedef SAXParseException SAXParseExceptionT; + + DefaultHandler() { } + virtual ~DefaultHandler() { } + + ////////////////////////////////////////////// + // EntityResolver + /** + * Resolve an external entity. + * + *

Always return a default-constructed InputSourceT, so that + * the parser will use the system identifier provided in the XML document. + * This method implements the SAX default behaviour: application writers can + * override it in a subclass to do special translations such as catalog + * lookups or URI redirection.

+ * + * @param publicId The public identifer, or an empty string if none is + * available. + * @param systemId The system identifier provided in the XML + * document. + * @return The new input source, (empty to require the + * default behaviour). + * @exception SAXException Any SAX exception. + * @see EntityResolver#resolveEntity + */ + virtual InputSourceT resolveEntity(const string_type& /* publicId */, const string_type& /* systemId */) + { + return InputSourceT(); + } // resolveEntity + + ////////////////////////////////////////////// + // DTDHandler + /** + * Receive notification of a notation declaration. + * + *

By default, do nothing. Application writers may override this + * method in a subclass if they wish to keep track of the notations + * declared in a document.

+ * + * @param name The notation name. + * @param publicId The notation public identifier, or an empty string if not + * available. + * @param systemId The notation system identifier. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see DTDHandler#notationDecl + */ + virtual void notationDecl(const string_type& /* name */, + const string_type& /* publicId */, + const string_type& /* systemId */) + { + } // notationDecl + + /** + * Receive notification of an unparsed entity declaration. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to keep track of the unparsed entities + * declared in a document.

+ * + * @param name The entity name. + * @param publicId The entity public identifier, or an empty string if not + * available. + * @param systemId The entity system identifier. + * @param notationName The name of the associated notation. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see DTDHandler#unparsedEntityDecl + */ + virtual void unparsedEntityDecl(const string_type& /* name */, + const string_type& /* publicId */, + const string_type& /* systemId */, + const string_type& /* notationName */) + { + } // unparsedEntityDecl + + //////////////////////////////////////////////////// + // ContentHandler + /** + * Receive a Locator object for document events. + * + *

By default, do nothing. Application writers may override this + * method in a subclass if they wish to store the locator for use + * with other document events.

+ * + * @param locator A locator for all SAX document events. + * @see ContentHandler#setDocumentLocator + * @see Locator + */ + virtual void setDocumentLocator(const LocatorT& /* locator */) { } + + /** + * Receive notification of the beginning of the document. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the beginning + * of a document (such as allocating the root node of a tree or + * creating an output file).

+ * + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#startDocument + */ + virtual void startDocument() { } + /** + * Receive notification of the end of the document. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the end + * of a document (such as finalising a tree or closing an output + * file).

+ * + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#endDocument + */ + virtual void endDocument() { } + + /** + * Receive notification of the start of a Namespace mapping. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the start of + * each Namespace prefix scope (such as storing the prefix mapping).

+ * + * @param prefix The Namespace prefix being declared. + * @param uri The Namespace URI mapped to the prefix. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#startPrefixMapping + */ + virtual void startPrefixMapping(const string_type& /* prefix */, const string_type& /* uri */) { } + /** + * Receive notification of the end of a Namespace mapping. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the end of + * each prefix mapping.

+ * + * @param prefix The Namespace prefix being declared. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#endPrefixMapping + */ + virtual void endPrefixMapping(const string_type& /* prefix */) { } + + /** + * Receive notification of the start of an element. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the start of + * each element (such as allocating a new tree node or writing + * output to a file).

+ * + * @param namespaceURI The Namespace URI, or the empty string if the element + * has no Namespace URI or if Namespace processing is not + * being performed. + * @param localName The local name (without prefix), or the empty string if + * Namespace processing is not being performed. + * @param qName The qualified name (with prefix), or the empty string if + * qualified names are not available. + * @param atts The attributes attached to the element. If there are no + * attributes, it shall be an empty Attributes object. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#startElement + */ + virtual void startElement(const string_type& /* namespaceURI */, const string_type& /* localName */, + const string_type& /* qName */, const AttributesT& /* atts */) { } + /** + * Receive notification of the end of an element. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions at the end of + * each element (such as finalising a tree node or writing + * output to a file).

+ * + * @param namespaceURI The Namespace URI, or the empty string if the element + * has no Namespace URI or if Namespace processing is not + * being performed. + * @param localName The local name (without prefix), or the empty string if + * Namespace processing is not being performed. + * @param qName The qualified name (with prefix), or the empty string if + * qualified names are not available. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#endElement + */ + virtual void endElement(const string_type& /* namespaceURI */, const string_type& /* localName */, + const string_type& /* qName */) { } + + /** + * Receive notification of character data inside an element. + * + *

By default, do nothing. Application writers may override this + * method to take specific actions for each chunk of character data + * (such as adding the data to a node or buffer, or printing it to + * a file).

+ * + * @param ch The characters. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#characters + */ + virtual void characters(const string_type& /* ch */) { } + /** + * Receive notification of ignorable whitespace in element content. + * + *

By default, do nothing. Application writers may override this + * method to take specific actions for each chunk of ignorable + * whitespace (such as adding data to a node or buffer, or printing + * it to a file).

+ * + * @param ch The whitespace characters. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#ignorableWhitespace + */ + virtual void ignorableWhitespace(const string_type& /* ch */) { } + + /** + * Receive notification of a processing instruction. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions for each + * processing instruction, such as setting status variables or + * invoking other methods.

+ * + * @param target The processing instruction target. + * @param data The processing instruction data, or an empty string if + * none is supplied. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ContentHandler#processingInstruction + */ + virtual void processingInstruction(const string_type& /* target */, const string_type& /* data */) { } + + /** + * Receive notification of a skipped entity. + * + *

By default, do nothing. Application writers may override this + * method in a subclass to take specific actions for each + * skipped entity, such as setting status variables or + * invoking other methods.

+   *
+   * @param name The name of the skipped entity.
+   * @exception SAXException Any SAX exception, possibly
+   *            wrapping another exception.
+   * @see ContentHandler#skippedEntity
+   */
+  virtual void skippedEntity(const string_type& /* name */) { }
+
+  /////////////////////////////////////////////////////
+  // ErrorHandler
+  /**
+   * Receive notification of a parser warning.
+   *
+   *

The default implementation does nothing. Application writers + * may override this method in a subclass to take specific actions + * for each warning, such as inserting the message in a log file or + * printing it to the console.

+ * + * @param e The warning information encoded as an exception. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ErrorHandler#warning + * @see SAXParseException + */ + virtual void warning(const SAXParseExceptionT& /* e */) { } + /** + * Receive notification of a recoverable parser error. + * + *

The default implementation does nothing. Application writers + * may override this method in a subclass to take specific actions + * for each error, such as inserting the message in a log file or + * printing it to the console.

+   *
+   * @param e The error information encoded as an exception.
+   * @exception SAXException Any SAX exception, possibly
+   *            wrapping another exception.
+   * @see ErrorHandler#error
+   * @see SAXParseException
+   */
+  virtual void error(const SAXParseExceptionT& /* e */) { }
+  /**
+   * Report a fatal XML parsing error.
+   *
+   *

The default implementation throws a SAXParseException. + * Application writers may override this method in a subclass if + * they need to take specific actions for each fatal error (such as + * collecting all of the errors into a single report): in any case, + * the application must stop all regular processing when this + * method is invoked, since the document is no longer reliable, and + * the parser may no longer report parsing events.

+ * + * @param e The error information encoded as an exception. + * @exception SAXException Any SAX exception, possibly + * wrapping another exception. + * @see ErrorHandler#fatalError + * @see SAXParseException + */ + virtual void fatalError(const SAXParseExceptionT& e) + { + throw SAXParseExceptionT(e); + // VS.NET refuses throw e; saying the copy constructor is inaccessible + // GCC likes throw e; + // one of them, I presume, is wrong + } // fatalError + + ////////////////////////////////////////////////////////// + // LexicalHandler + /** + * Report the start of DTD declarations, if any. + * + *

This method is intended to report the beginning of the + * DOCTYPE declaration; if the document has no DOCTYPE declaration, + * this method will not be invoked.

+ * + *

All declarations reported through + * {@link DTDHandler DTDHandler} or + * {@link DeclHandler DeclHandler} events must appear + * between the startDTD and {@link #endDTD endDTD} events. + * Declarations are assumed to belong to the internal DTD subset + * unless they appear between {@link #startEntity startEntity} + * and {@link #endEntity endEntity} events. Comments and + * processing instructions from the DTD should also be reported + * between the startDTD and endDTD events, in their original + * order of (logical) occurrence; they are not required to + * appear in their correct locations relative to DTDHandler + * or DeclHandler events, however.

+ * + *

Note that the start/endDTD events will appear within + * the start/endDocument events from ContentHandler and + * before the first + * {@link ContentHandler#startElement startElement} + * event.

+ * + * @param name The document type name. + * @param publicId The declared public identifier for the + * external DTD subset, or an empty string if none was declared. + * @param systemId The declared system identifier for the + * external DTD subset, or an empty string if none was declared. + * @see #endDTD + * @see #startEntity + */ + virtual void startDTD(const string_type& /*name*/, + const string_type& /*publicId*/, + const string_type& /*systemId*/) { } + + /** + * Report the end of DTD declarations. + * + *

This method is intended to report the end of the + * DOCTYPE declaration; if the document has no DOCTYPE declaration, + * this method will not be invoked.

+ * + * @see #startDTD + */ + virtual void endDTD() { } + + /** + * Report the beginning of some internal and external XML entities. + * + *

The reporting of parameter entities (including + * the external DTD subset) is optional, and SAX2 drivers that + * support LexicalHandler may not support it; you can use the + * http://xml.org/sax/features/lexical-handler/parameter-entities + * feature to query or control the reporting of parameter entities.

+ * + *

General entities are reported with their regular names, + * parameter entities have '%' prepended to their names, and + * the external DTD subset has the pseudo-entity name "[dtd]".

+ * + *

When a SAX2 driver is providing these events, all other + * events must be properly nested within start/end entity + * events. There is no additional requirement that events from + * {@link DeclHandler DeclHandler} or + * {@link DTDHandler DTDHandler} be properly ordered.

+ * + *

Note that skipped entities will be reported through the + * {@link ContentHandler#skippedEntity skippedEntity} + * event, which is part of the ContentHandler interface.

+ * + *

Because of the streaming event model that SAX uses, some + * entity boundaries cannot be reported under any + * circumstances:

+ * + *
    + *
  • general entities within attribute values
  • + *
  • parameter entities within declarations
  • + *
+ * + *

These will be silently expanded, with no indication of where + * the original entity boundaries were.

+ * + *

Note also that the boundaries of character references (which + * are not really entities anyway) are not reported.

+ * + *

All start/endEntity events must be properly nested. + * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%', and if it is the + * external DTD subset, it will be "[dtd]". + * @see #endEntity + * @see DeclHandler#internalEntityDecl + * @see DeclHandler#externalEntityDecl + */ + virtual void startEntity(const string_type& /*name*/) { } + /** + * Report the end of an entity. + * + * @param name The name of the entity that is ending. + * @see #startEntity + */ + virtual void endEntity(const string_type& /*name*/) { } + + /** + * Report the start of a CDATA section. + * + *

The contents of the CDATA section will be reported through + * the regular {@link ContentHandler#characters + * characters} event; this event is intended only to report + * the boundary.

+ * + * @see #endCDATA + */ + virtual void startCDATA() { } + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + virtual void endCDATA() { } + + /** + * Report an XML comment anywhere in the document. + * + *

This callback will be used for comments inside or outside the + * document element, including comments in the external DTD + * subset (if read). Comments in the DTD must be properly + * nested inside start/endDTD and start/endEntity events (if + * used).

+ * + * @param text A string holding the comment. + */ + virtual void comment(const string_type& /*text*/) { } + + //////////////////////////////////////////////////////////// + // DeclHandler + /** + * Report an element type declaration. + * + *

The content model will consist of the string "EMPTY", the + * string "ANY", or a parenthesised group, optionally followed + * by an occurrence indicator. The model will be normalized so + * that all parameter entities are fully resolved and all whitespace + * is removed, and will include the enclosing parentheses. Other + * normalization (such as removing redundant parentheses or + * simplifying occurrence indicators) is at the discretion of the + * parser.

+ * + * @param name The element type name. + * @param model The content model as a normalized string. + */ + virtual void elementDecl(const string_type& /*name*/, const string_type& /*model*/) { } + /** + * Report an attribute type declaration. + * + *

Only the effective (first) declaration for an attribute will + * be reported. The type will be one of the strings "CDATA", + * "ID", "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", + * "ENTITIES", a parenthesized token group with + * the separator "|" and all whitespace removed, or the word + * "NOTATION" followed by a space followed by a parenthesized + * token group with all whitespace removed.

+ * + *

Any parameter entities in the attribute value will be + * expanded, but general entities will not.

+ * + * @param elementName The name of the associated element. + * @param attributeName The name of the attribute. + * @param type A string representing the attribute type. + * @param valueDefault A string representing the attribute default + * ("#IMPLIED", "#REQUIRED", or "#FIXED") or empty string if + * none of these applies. + * @param value A string representing the attribute's default value, + * or empty string if there is none. + */ + virtual void attributeDecl(const string_type& /*elementName*/, + const string_type& /*attributeName*/, + const string_type& /*type*/, + const string_type& /*valueDefault*/, + const string_type& /*value*/) { } + /** + * Report an internal entity declaration. + * + *

Only the effective (first) declaration for each entity + * will be reported. All parameter entities in the value + * will be expanded, but general entities will not.

+ * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. + * @param value The replacement text of the entity. + * @see #externalEntityDecl + * @see DTDHandler#unparsedEntityDecl + */ + virtual void internalEntityDecl(const string_type& /*name*/, const string_type& /*value*/) { } + /** + * Report a parsed external entity declaration. + * + *

Only the effective (first) declaration for each entity + * will be reported.

+ * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. + * @param publicId The declared public identifier of the entity, or + * an empty string if none was declared. + * @param systemId The declared system identifier of the entity. + * @see #internalEntityDecl + * @see DTDHandler#unparsedEntityDecl + */ + virtual void externalEntityDecl(const string_type& /*name*/, + const string_type& /*publicId*/, + const string_type& /*systemId*/) { } +private: + DefaultHandler(const DefaultHandler&); + DefaultHandler& operator=(const DefaultHandler&); + bool operator==(const DefaultHandler&); +}; // class DefaultHandler + +} // namespace SAX +} // namespace Arabica + +#endif diff --git a/arabica/include/SAX/helpers/FeatureNames.hpp b/arabica/include/SAX/helpers/FeatureNames.hpp new file mode 100644 index 000000000..86bb1b8b9 --- /dev/null +++ b/arabica/include/SAX/helpers/FeatureNames.hpp @@ -0,0 +1,95 @@ +#ifndef ARABICA_FEATURE_NAMES_H +#define ARABICA_FEATURE_NAMES_H +/* + * $Id$ + */ + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +template > +struct FeatureNames +{ + /** Reports whether this parser processes external general entities; always + * true if validating. */ + const string_type external_general; + /** Reports whether this parser processes external parameter entities; always + * true if validating. */ + const string_type external_parameter; + /** May be examined only during a parse, after the startDocument() callback has + * been completed; read-only. The value is true if the document specified the + * "standalone" flag in its XML declaration, and otherwise is false. */ + const string_type is_standalone; + /** True indicates that the LexicalHandler will report the beginning and end + * of parameter entities. 
*/ + const string_type lexical_parameter; + /** True indicates namespace URIs and unprefixed local names for element and + * attribute names will be available.*/ + const string_type namespaces; + /** True indicates XML 1.0 names (with prefixes) and attributes (including + * xmlns* attributes) will be available. */ + const string_type namespace_prefixes; + /** A value of "true" indicates that system IDs in declarations will be + * absolutized (relative to their base URIs) before reporting. (That is the + * default behavior for all SAX2 XML parsers.) A value of "false" indicates + * those IDs will not be absolutized; parsers will provide the base URI from + * Locator.getSystemId(). This applies to system IDs passed in + * + * - DTDHandler.notationDecl(), + * - DTDHandler.unparsedEntityDecl(), and + * - DeclHandler.externalEntityDecl(). + * + *It does not apply to EntityResolver.resolveEntity(), which is not used to + report declarations, or to LexicalHandler.startDTD(), which already provides + the non-absolutized URI. */ + const string_type resolve_dtd_uris; + /** Returns true if the Attributes objects passed by this parser in + * ContentHandler.startElement() implement the org.xml.sax.ext.Attributes2 + * interface. That interface exposes additional DTD-related information, such + * as whether the attribute was specified in the source text rather than + * defaulted. */ + const string_type use_attributes2; + /** Returns true if the Locator objects passed by this parser in + * ContentHandler.setDocumentLocator() implement the org.xml.sax.ext.Locator2 + * interface. That interface exposes additional entity information, such as + * the character encoding and XML version used. */ + const string_type use_locator2; + /** Controls whether the parser is reporting all validity errors; if true, + * all external entities will be read. 
*/ + const string_type validation; + /** Controls whether, when the namespace-prefixes feature is set, the parser + * treats namespace declaration attributes as being in the + * http://www.w3.org/2000/xmlns/ namespace. By default, SAX2 conforms to the + * original "Namespaces in XML" Recommendation, which explicitly states that + * such attributes are not in any namespace. Setting this optional flag to + * true makes the SAX2 events conform to a later backwards-incompatible + * revision of that recommendation, placing those attributes in a namespace. + * */ + const string_type xmlns_uris; + + FeatureNames() : + external_general(string_adaptor::construct_from_utf8("http://xml.org/sax/features/external-general-entities")), + external_parameter(string_adaptor::construct_from_utf8("http://xml.org/sax/features/external-parameter-entities")), + is_standalone(string_adaptor::construct_from_utf8("http://xml.org/sax/features/is-standalone")), + lexical_parameter(string_adaptor::construct_from_utf8("http://xml.org/sax/features/lexical-handler/parameter-entities")), + namespaces(string_adaptor::construct_from_utf8("http://xml.org/sax/features/namespaces")), + namespace_prefixes(string_adaptor::construct_from_utf8("http://xml.org/sax/features/namespace-prefixes")), + resolve_dtd_uris(string_adaptor::construct_from_utf8("http://xml.org/sax/features/resolve-dtd-uris")), + use_attributes2(string_adaptor::construct_from_utf8("http://xml.org/sax/features/use-attributes2")), + use_locator2(string_adaptor::construct_from_utf8("http://xml.org/sax/features/use-locator2")), + validation(string_adaptor::construct_from_utf8("http://xml.org/sax/features/validation")), + xmlns_uris(string_adaptor::construct_from_utf8("http://xml.org/sax/features/xmlns-uris")) + { + } // FeatureNames +}; // class FeatureNames + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/helpers/InputSourceResolver.hpp 
b/arabica/include/SAX/helpers/InputSourceResolver.hpp new file mode 100644 index 000000000..5ff7e1185 --- /dev/null +++ b/arabica/include/SAX/helpers/InputSourceResolver.hpp @@ -0,0 +1,63 @@ +#ifndef ARABICA_INPUT_SOURCE_RESOLVER_H +#define ARABICA_INPUT_SOURCE_RESOLVER_H + +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +class InputSourceResolver +{ +public: + template + InputSourceResolver(const SAX::InputSource& inputSource, + const stringAdaptorT& /*SA*/) : + deleteStream_(false), + byteStream_(0) + { + open(stringAdaptorT::asStdString(inputSource.getPublicId()), + stringAdaptorT::asStdString(inputSource.getSystemId()), + inputSource.getByteStream()); + } // InputSourceResolver + ~InputSourceResolver(); + + std::istream* resolve() const { return byteStream_; } + + typedef std::istream* (*URIResolver)(const std::string& url); + static bool registerResolver(const std::string& method, URIResolver resolver); + static bool unRegisterResolver(const std::string& method); + +private: + // no impl + InputSourceResolver(const InputSourceResolver&); + InputSourceResolver& operator=(const InputSourceResolver&); + bool operator==(const InputSourceResolver&); + + // instance variables + bool deleteStream_; + std::istream* byteStream_; + + void open(const std::string& publicId, + const std::string& systemId, + std::istream* byteStream); + + // class variables + static URIResolver findResolver(std::string method); + + typedef std::map resolverMapT; + static resolverMapT& resolverMap() + { + static resolverMapT theMap; + return theMap; + } // resolverMap +}; // class InputSourceResolver + +} // namespace SAX +} // namespace Arabica +#endif + diff --git a/arabica/include/SAX/helpers/NamespaceSupport.hpp b/arabica/include/SAX/helpers/NamespaceSupport.hpp new file mode 100644 index 000000000..4f9998cd0 --- /dev/null +++ b/arabica/include/SAX/helpers/NamespaceSupport.hpp @@ -0,0 +1,390 @@ +#ifndef ARABICA_NAMESPACE_SUPPORT_H +#define 
ARABICA_NAMESPACE_SUPPORT_H + +#include +#include +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +template +struct NamespaceConstants +{ + const string_type xml; + const string_type xmlns; + const string_type xml_uri; + const string_type xmlns_uri; + const string_type xmlns11_uri; + const string_type colon; + + NamespaceConstants() : + xml(string_adaptor::construct_from_utf8("xml")), + xmlns(string_adaptor::construct_from_utf8("xmlns")), + xml_uri(string_adaptor::construct_from_utf8("http://www.w3.org/XML/1998/namespace")), + xmlns_uri(), + xmlns11_uri(string_adaptor::construct_from_utf8("http://www.w3.org/2000/xmlns/")), + colon(string_adaptor::construct_from_utf8(":")) + { + } // NamespaceConstants +}; // struct NamespaceConstants + +/** + * Encapsulate Namespace logic for use by SAX drivers. + * + *

This class encapsulates the logic of Namespace processing: + * it tracks the declarations currently in force for each context + * and automatically processes qualified XML 1.0 names into their + * Namespace parts; it can also be used in reverse for generating + * XML 1.0 from Namespaces.

+ * + *

Namespace support objects are reusable, but the reset method + * must be invoked between each session.

+ * + *

Here is a simple session:

+ * + *
+ * NamespaceSupport support;
+ *
+ * support.pushContext();
+ * support.declarePrefix("", "http://www.w3.org/1999/xhtml");
+ * support.declarePrefix("dc", "http://www.purl.org/dc#");
+ *
+ * XML::QualifiedName parts = support.processName("p", false);
+ * std::cout << "Namespace URI: " << parts.URI << std::endl;
+ * std::cout << "Local name: " << parts.localName << std::endl;
+ * std::cout << "Raw name: " << parts.rawName << std::endl;
+
+ * parts = support.processName("dc:title", false);
+ * std::cout << "Namespace URI: " << parts.URI << std::endl;
+ * std::cout << "Local name: " << parts.localName << std::endl;
+ * std::cout << "Raw name: " << parts.rawName << std::endl;
+
+ * support.popContext();
+ * 
+ * + *

Note that this class is optimized for the use case where most + * elements do not contain Namespace declarations: if the same + * prefix/URI mapping is repeated for each context (for example), this + * class will be somewhat less efficient.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + */ +template +class NamespaceSupport +{ + public: + typedef std::vector stringListT; + + // functions + NamespaceSupport() + { + reset(); + } // NamespaceSupport + + /** + * Reset this Namespace support object for reuse. + * + *

It is necessary to invoke this method before reusing the + * Namespace support object for a new session.

+ */ + void reset() + { + contexts_.clear(); + contexts_.push_back(Context()); + contexts_.back().insert(std::make_pair(nsc_.xml, nsc_.xml_uri)); + } // reset + + //////////////////////////////////////////////////////////////////// + // Context management. + //////////////////////////////////////////////////////////////////// + /** + * Start a new Namespace context. + * + *

Normally, you should push a new context at the beginning + * of each XML element: the new context will automatically inherit + * the declarations of its parent context, but it will also keep + * track of which declarations were made within this context.

+ * + *

The Namespace support object always starts with a base context + * already in force: in this context, only the "xml" prefix is + * declared.

+ * + * @see #popContext + */ + void pushContext() + { + contexts_.push_back(Context()); + } // pushContext + + /** + * Revert to the previous Namespace context. + * + *

Normally, you should pop the context at the end of each + * XML element. After popping the context, all Namespace prefix + * mappings that were previously in force are restored.

+ * + *

You must not attempt to declare additional Namespace + * prefixes after popping a context, unless you push another + * context first.

+ * + * @see #pushContext + */ + void popContext() + { + contexts_.pop_back(); + } // popContext + + //////////////////////////////////////////////////////////////////// + // Operations within a context. + //////////////////////////////////////////////////////////////////// + /** + * Declare a Namespace prefix. + * + *

This method declares a prefix in the current Namespace + * context; the prefix will remain in force until this context + * is popped, unless it is shadowed in a descendant context.

+ * + *

To declare a default Namespace, use the empty string. The + * prefix must not be "xml" or "xmlns".

+ * + *

Note that you must not declare a prefix after + * you've pushed and popped another Namespace.

+ * + *

Note that there is an asymmetry in this library: {@link + * #getPrefix getPrefix} will not return the default "" prefix, + * even if you have declared one; to check for a default prefix, + * you have to look it up explicitly using {@link #getURI getURI}. + * This asymmetry exists to make it easier to look up prefixes + * for attribute names, where the default prefix is not allowed.

+ * + * @param prefix The prefix to declare, or the empty string. + * @param uri The Namespace URI to associate with the prefix. + * @return true if the prefix was legal, false otherwise + * @see #processName + * @see #getURI + * @see #getPrefix + */ + bool declarePrefix(const string_type& prefix, const string_type& uri) + { + if((prefix == nsc_.xml) || (prefix == nsc_.xmlns)) + return false; + + contexts_.back().insert(std::make_pair(prefix, uri)); + return true; + } // declarePrefix + + /** + * Process a raw XML 1.0 name. + * + *

This method processes a raw XML 1.0 name in the current + * context by removing the prefix and looking it up among the + * prefixes currently declared. + * + *

If + * the raw name has a prefix that has not been declared, then + * the return value will be empty.

+ * + *

Note that attribute names are processed differently than + * element names: an unprefixed element name will receive the + * default Namespace (if any), while an unprefixed attribute name + * will not.

+ * + * @param qName The raw XML 1.0 name to be processed. + * @param isAttribute A flag indicating whether this is an + * attribute name (true) or an element name (false). + * @return A Parts holding three strings representing the + * Namespace URI (or empty string), the local name, and the raw XML + * 1.0 name. + * @see #declarePrefix + */ + private: + class URIMapper + { + public: + URIMapper(const NamespaceSupport* ns) : ns_(ns) { } + string_type operator()(const string_type& prefix) const { return ns_->getURI(prefix); } + private: + const NamespaceSupport* const ns_; + }; // class URIMapper + + public: + XML::QualifiedName processName(const string_type& rawName, bool isAttribute) const + { + try + { + return XML::QualifiedName::parseQName(rawName, isAttribute, URIMapper(this)); + } // try + catch(const std::runtime_error& ex) + { + throw SAX::SAXException(ex.what()); + } // catch + } // processName + + /** + * Look up a prefix and get the currently-mapped Namespace URI. + * + *

This method looks up the prefix in the current context. + * Use the empty string ("") for the default Namespace.

+ * + * @param prefix The prefix to look up. + * @return The associated Namespace URI, or empty string if the prefix + * is undeclared in this context. + * @see #getPrefix + * @see #getPrefixes + */ + string_type getURI(const string_type& prefix) const + { + for(typename contextListT::const_reverse_iterator i = contexts_.rbegin(); i != contexts_.rend(); ++i) + { + typename stringMapT::const_iterator u = i->find(prefix); + if(u != i->end()) + return u->second; + } // for ... + + return string_type(); + } // getURI + + /** + * Return one of the prefixes mapped to a Namespace URI. + * + *

If more than one prefix is currently mapped to the same + * URI, this method will make an arbitrary selection; if you + * want all of the prefixes, use the {@link #getPrefixes} + * method instead.

+ * + *

Note: this will never return the empty (default) prefix; + * to check for a default prefix, use the {@link #getURI getURI} + * method with an argument of "".

+ * + * @param uri The Namespace URI. + * @return One of the prefixes currently mapped to the URI supplied, + * or an empty string if none is mapped or if the URI is assigned to + * the default Namespace. + * @see #getPrefixes(const string_type&) + * @see #getURI + */ + string_type getPrefix(const string_type& uri) const + { + for(typename contextListT::const_reverse_iterator i = contexts_.rbegin(); i != contexts_.rend(); ++i) + { + for(typename stringMapT::const_iterator u = i->begin(); u != i->end(); ++u) + if(u->second == uri) + return u->first; + } // for ... + + return string_type(); + } // getPrefix + + /** + * Returns all prefixes currently declared. + * + *

Note: if there is a default prefix, it will not be + * returned in this enumeration; check for the default prefix + * using the {@link #getURI getURI} with an argument of "".

+ * + * @return A list of all prefixes declared in the + * current context except for the empty (default) + * prefix. + * @see #getDeclaredPrefixes + * @see #getURI + */ + stringListT getPrefixes() const + { + stringListT prefixes; + + for(typename contextListT::const_reverse_iterator i = contexts_.rbegin(); i != contexts_.rend(); ++i) + { + for(typename stringMapT::const_iterator u = i->begin(); u != i->end(); ++u) + if(!string_adaptor::empty(u->first)) + prefixes.push_back(u->first); + } // for ... + + return prefixes; + } // getPrefixes + + /** + * Returns a list of all prefixes currently declared for a URI. + * + *

This method returns prefixes mapped to a specific Namespace + * URI. The xml: prefix will be included. If you want only one + * prefix that's mapped to the Namespace URI, and you don't care + * which one you get, use the {@link #getPrefix getPrefix} + * method instead.

+ * + *

Note: the empty (default) prefix is never included + * in this enumeration; to check for the presence of a default + * Namespace, use the {@link #getURI getURI} method with an + * argument of "".

+ * + * @param uri The Namespace URI. + * @return A list of all prefixes declared in the + * current context. + * @see #getPrefix + * @see #getDeclaredPrefixes + * @see #getURI + */ + stringListT getPrefixes(const string_type& uri) const + { + stringListT prefixes; + + for(typename contextListT::const_reverse_iterator i = contexts_.rbegin(); i != contexts_.rend(); ++i) + { + for(typename stringMapT::const_iterator u = i->begin(); u != i->end(); ++u) + if(u->second == uri) + prefixes.push_back(u->first); + } // for ... + + return prefixes; + } // getPrefixes + + /** + * Return an enumeration of all prefixes declared in this context. + * + *

The empty (default) prefix will be included in this + * enumeration; note that this behaviour differs from that of + * {@link #getPrefix} and {@link #getPrefixes}.

+ * + * @return An enumeration of all prefixes declared in this + * context. + * @see #getPrefixes + * @see #getURI + */ + stringListT getDeclaredPrefixes() const + { + stringListT prefixes; + + for(typename stringMapT::const_iterator u = contexts_.back().begin(); u != contexts_.back().end(); ++u) + prefixes.push_back(u->first); + + return prefixes; + } // getDeclaredPrefixes + + private: + typedef typename std::multimap stringMapT; + typedef stringMapT Context; + typedef typename std::vector contextListT; + + // member variables + contextListT contexts_; + + const NamespaceConstants nsc_; + + // no impl + NamespaceSupport(const NamespaceSupport&); + NamespaceSupport& operator=(const NamespaceSupport&); + bool operator==(const NamespaceSupport&) const; +}; // class NamespaceSupport + +} // namespace SAX +} // namespace Arabica + +#endif // NamespaceSupportH diff --git a/arabica/include/SAX/helpers/PropertyNames.hpp b/arabica/include/SAX/helpers/PropertyNames.hpp new file mode 100644 index 000000000..2d1f0d9ae --- /dev/null +++ b/arabica/include/SAX/helpers/PropertyNames.hpp @@ -0,0 +1,53 @@ +#ifndef ARABICA_PROPERTY_NAMES_H +#define ARABICA_PROPERTY_NAMES_H +/* + * $Id$ + */ + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** The core SAX 2 parser properties. */ +template > +struct PropertyNames +{ + /** @name SAX 2 Properties + * @{ */ + /** Register a lexical handler. + * + * The Lexical Handler is used to see some syntax events that are essential + * in some applications: comments, CDATA delimiters, selected general + * entity inclusions, and the start and end of the DTD (and declaration of + * document element name). + * + * The value assigned must implement SAX::LexicalHandler. + */ + const string_type lexicalHandler; + /** Register a Declaration Handler. + * + * Used to see most DTD declarations except those treated as lexical + * ("document element name is ...") or which are mandatory for all SAX + * parsers (DTDHandler). 
+ * + * The value assigned must implement SAX::DeclHandler */ + const string_type declHandler; + /** @} */ + + PropertyNames() : + lexicalHandler(string_adaptor_type::construct_from_utf8("http://xml.org/sax/handlers/LexicalHandler")), + declHandler(string_adaptor_type::construct_from_utf8("http://xml.org/sax/handlers/DeclHandler")) + { + } // PropertyNames +}; // struct PropertyNames + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/helpers/XMLFilterImpl.hpp b/arabica/include/SAX/helpers/XMLFilterImpl.hpp new file mode 100644 index 000000000..98abec276 --- /dev/null +++ b/arabica/include/SAX/helpers/XMLFilterImpl.hpp @@ -0,0 +1,608 @@ +#ifndef ARABICA_XML_FILTER_IMPL_H +#define ARABICA_XML_FILTER_IMPL_H + +// XMLFilterImpl.h + +#include +#include +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * Base class for deriving an XML filter. + * + *

This class is designed to sit between an {@link XMLReader + * XMLReader} and the client application's event handlers. By default, it + * does nothing but pass requests up to the reader and events + * on to the handlers unmodified, but subclasses can override + * specific methods to modify the event stream or the configuration + * requests as they pass through.

+ * + * @since SAX 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 2.0 + * @see XMLFilter + * @see XMLReader + * @see EntityResolver + * @see DTDHandler + * @see ContentHandler + * @see ErrorHandler + */ +template +class XMLFilterImpl : public XMLFilter, + public EntityResolver, + public DTDHandler, + public ContentHandler, + public ErrorHandler, + public DeclHandler, + public LexicalHandler +{ +public: + typedef XMLReaderInterface XMLReaderT; + typedef EntityResolver EntityResolverT; + typedef DTDHandler DTDHandlerT; + typedef ContentHandler ContentHandlerT; + typedef InputSource InputSourceT; + typedef Locator LocatorT; + typedef ErrorHandler ErrorHandlerT; + typedef DeclHandler DeclHandlerT; + typedef LexicalHandler LexicalHandlerT; + typedef typename ErrorHandler::SAXParseExceptionT SAXParseExceptionT; + typedef DefaultHandler DefaultHandlerT; + + + XMLFilterImpl() : + parent_(0) + { + setDefaults(); + } // XMLFilterImpl + XMLFilterImpl(XMLReaderT& parent) : + parent_(&parent) + { + setDefaults(); + } // XMLFilterImpl + + virtual ~XMLFilterImpl() { } + + ///////////////////////////////////////////////// + // XMLFilter implementation + /** + * Set the parent reader. + * + *

This is the {@link XMLReader XMLReader} from which + * this filter will obtain its events and to which it will pass its + * configuration requests. The parent may itself be another filter.

+ * + *

If there is no parent reader set, any attempt to parse + * or to set or get a feature or property will fail.

+ * + * @param parent The parent XML reader. + * @see #getParent + */ + virtual void setParent(XMLReaderT& parent) { parent_ = &parent; } + /** + * Get the parent reader. + * + * @return The parent XML reader, or null if none is set. + * @see #setParent + */ + virtual XMLReaderT* getParent() const { return parent_; } + + ///////////////////////////////////////////////// + // XMLReader implementation + /** + * Set the state of a feature. + * + *

This will always fail if the parent is null.

+ * + * @param name The feature name. + * @param value The requested feature value. + * @exception SAXNotRecognizedException When the + * XMLReader does not recognize the feature name. + * @exception SAXNotSupportedException When the + * XMLReader recognizes the feature name but + * cannot set the requested value. + * @see XMLReader#setFeature + */ + virtual void setFeature(const string_type& name, bool value) + { + if(!parent_) + { + string_type ex = string_adaptor::construct_from_utf8("Feature: "); + string_adaptor::append(ex, name); + throw SAXNotRecognizedException(string_adaptor::asStdString(ex)); + } // if ... + + parent_->setFeature(name, value); + } // setFeature + /** + * Look up the state of a feature. + * + *

This will always fail if the parent is null.

+ * + * @param name The feature name. + * @return The current state of the feature. + * @exception SAXNotRecognizedException When the + * XMLReader does not recognize the feature name. + * @exception SAXNotSupportedException When the + * XMLReader recognizes the feature name but + * cannot determine its state at this time. + * @see XMLReader#getFeature + */ + virtual bool getFeature(const string_type& name) const + { + if(!parent_) + { + string_type ex = string_adaptor::construct_from_utf8("Feature: "); + string_adaptor::append(ex, name); + throw SAXNotRecognizedException(string_adaptor::asStdString(ex)); + } // if ... + + return parent_->getFeature(name); + } // getFeature + + /** + * Set the entity resolver. + * + * @param resolver The new entity resolver. + * @see XMLReader#setEntityResolver + */ + virtual void setEntityResolver(EntityResolverT& resolver) { entityResolver_ = &resolver; } + /** + * Get the current entity resolver. + * + * @return The current entity resolver, or null if none was set. + * @see XMLReader#getEntityResolver + */ + virtual EntityResolverT* getEntityResolver() const { return entityResolver_ ; } + /** + * Set the DTD event handler. + * + * @param handler The new DTD handler. + * @see XMLReader#setDTDHandler + */ + virtual void setDTDHandler(DTDHandlerT& handler) { dtdHandler_ = &handler; } + /** + * Get the current DTD event handler. + * + * @return The current DTD handler, or null if none was set. + * @see XMLReader#getDTDHandler + */ + virtual DTDHandlerT* getDTDHandler() const { return dtdHandler_; } + /** + * Set the content event handler. + * + * @param handler The new content handler. + * @see XMLReader#setContentHandler + */ + virtual void setContentHandler(ContentHandlerT& handler) { contentHandler_ = &handler; } + /** + * Get the content event handler. + * + * @return The current content handler, or null if none was set. 
+ * @see XMLReader#getContentHandler + */ + virtual ContentHandlerT* getContentHandler() const { return contentHandler_; } + /** + * Set the error event handler. + * + * @param handler The new error handler. + * @see XMLReader#setErrorHandler + */ + virtual void setErrorHandler(ErrorHandlerT& handler) { errorHandler_ = &handler; } + /** + * Get the current error event handler. + * + * @return The current error handler, or null if none was set. + * @see XMLReader#getErrorHandler + */ + virtual ErrorHandlerT* getErrorHandler() const { return errorHandler_; } + + virtual void setDeclHandler(DeclHandlerT& handler) { declHandler_ = &handler; } + virtual DeclHandlerT* getDeclHandler() const { return declHandler_; } + virtual void setLexicalHandler(LexicalHandlerT& handler) { lexicalHandler_ = &handler; } + virtual LexicalHandlerT* getLexicalHandler() const { return lexicalHandler_; } + + /** + * Parse a document. + * + * @param input The input source for the document entity. + * @see XMLReader#parse(InputSource) + */ + virtual void parse(InputSourceT& input) + { + setupParse(); + parent_->parse(input); + } // parse + + virtual std::auto_ptr doGetProperty(const string_type& name) + { + if(parent_) + return parent_->doGetProperty(name); + + string_type ex = string_adaptor::construct_from_utf8("Property: "); + string_adaptor::append(ex, name); + throw SAXNotRecognizedException(string_adaptor::asStdString(ex)); + } // doGetProperty + + virtual void doSetProperty(const string_type& name, typename std::auto_ptr value) + { + if(parent_) + { + parent_->doSetProperty(name, value); + return; + } // if(parent_) + + string_type ex = string_adaptor::construct_from_utf8("Property: "); + string_adaptor::append(ex, name); + throw SAXNotRecognizedException(string_adaptor::asStdString(ex)); + } // doSetProperty + +public: + ////////////////////////////////////////////////// + // EntityResolver + /** + * Filter an external entity resolution. 
+ * + * @param publicId The entity's public identifier, or an empty string. + * @param systemId The entity's system identifier. + * @return A new InputSource or a default-constructed + * InputSourceT for the default. + * @see EntityResolver#resolveEntity + */ + virtual InputSourceT resolveEntity(const string_type& publicId, const string_type& systemId) + { + if(entityResolver_) + return entityResolver_->resolveEntity(publicId, systemId); + return InputSourceT(); + } // resolveEntity + + ////////////////////////////////////////////////// + // DTDHandler + /** + * Filter a notation declaration event. + * + * @param name The notation name. + * @param publicId The notation's public identifier, or an empty string. + * @param systemId The notation's system identifier, or an empty string. + * @see DTDHandler#notationDecl + */ + virtual void notationDecl(const string_type& name, + const string_type& publicId, + const string_type& systemId) + { + dtdHandler_->notationDecl(name, publicId, systemId); + } // notationDecl + + /** + * Filter an unparsed entity declaration event. + * + * @param name The entity name. + * @param publicId The entity's public identifier, or an empty string. + * @param systemId The entity's system identifier, or an empty string. + * @param notationName The name of the associated notation. + * @see DTDHandler#unparsedEntityDecl + */ + virtual void unparsedEntityDecl(const string_type& name, + const string_type& publicId, + const string_type& systemId, + const string_type& notationName) + { + dtdHandler_->unparsedEntityDecl(name, publicId, systemId, notationName); + } // unparsedEntityDecl + + ////////////////////////////////////////////////// + // ContentHandler + /** + * Filter a new document locator event. + * + * @param locator The document locator. 
+ * @see ContentHandler#setDocumentLocator + */ + virtual void setDocumentLocator(const LocatorT& locator) + { + contentHandler_->setDocumentLocator(locator); + } // setDocumentLocator + + /** + * Filter a start document event. + * + * @see ContentHandler#startDocument + */ + virtual void startDocument() + { + contentHandler_->startDocument(); + } // startDocument + + /** + * Filter an end document event. + * + * @see ContentHandler#endDocument + */ + virtual void endDocument() + { + contentHandler_->endDocument(); + } // endDocument + + /** + * Filter a start Namespace prefix mapping event. + * + * @param prefix The Namespace prefix. + * @param uri The Namespace URI. + * @see ContentHandler#startPrefixMapping + */ + virtual void startPrefixMapping(const string_type& prefix, const string_type& uri) + { + contentHandler_->startPrefixMapping(prefix, uri); + } // startPrefixMapping + + /** + * Filter an end Namespace prefix mapping event. + * + * @param prefix The Namespace prefix. + * @see ContentHandler#endPrefixMapping + */ + virtual void endPrefixMapping(const string_type& prefix) + { + contentHandler_->endPrefixMapping(prefix); + } // endPrefixMapping + + /** + * Filter a start element event. + * + * @param namespaceURI The element's Namespace URI, or the empty string. + * @param localName The element's local name, or the empty string. + * @param qName The element's qualified (prefixed) name, or the empty + * string. + * @param atts The element's attributes. + * @see ContentHandler#startElement + */ + virtual void startElement(const string_type& namespaceURI, const string_type& localName, + const string_type& qName, const typename ContentHandlerT::AttributesT& atts) + { + contentHandler_->startElement(namespaceURI, localName, qName, atts); + } // startElement + + /** + * Filter an end element event. + * + * @param namespaceURI The element's Namespace URI, or the empty string. + * @param localName The element's local name, or the empty string. 
+ * @param qName The element's qualified (prefixed) name, or the empty + * string. + * @see ContentHandler#endElement + */ + virtual void endElement(const string_type& namespaceURI, const string_type& localName, + const string_type& qName) + { + contentHandler_->endElement(namespaceURI, localName, qName); + } // endElement + + /** + * Filter a character data event. + * + * @param ch The characters. + * @see ContentHandler#characters + */ + virtual void characters(const string_type& ch) + { + contentHandler_->characters(ch); + } // characters + + /** + * Filter an ignorable whitespace event. + * + * @param ch The whitespace + * @see ContentHandler#ignorableWhitespace + */ + virtual void ignorableWhitespace(const string_type& ch) + { + contentHandler_->ignorableWhitespace(ch); + } // ignorableWhitespace + + /** + * Filter a processing instruction event. + * + * @param target The processing instruction target. + * @param data The text following the target. + * @see ContentHandler#processingInstruction + */ + virtual void processingInstruction(const string_type& target, const string_type& data) + { + contentHandler_->processingInstruction(target, data); + } // processingInstruction + + /** + * Filter a skipped entity event. + * + * @param name The name of the skipped entity. + * @see ContentHandler#skippedEntity + */ + virtual void skippedEntity(const string_type& name) + { + contentHandler_->skippedEntity(name); + } // skippedEntity + + ////////////////////////////////////////////////// + // ErrorHandler + /** + * Filter a warning event. + * + * @param exception The warning as an exception. + * @see ErrorHandler#warning + */ + virtual void warning(const SAXParseExceptionT& exception) + { + errorHandler_->warning(exception); + } // warning + + /** + * Filter an error event. + * + * @param exception The error as an exception. 
+ * @see ErrorHandler#error + */ + virtual void error(const SAXParseExceptionT& exception) + { + errorHandler_->error(exception); + } // error + + /** + * Filter a fatal error event. + * + * @param exception The error as an exception. + * @see ErrorHandler#fatalError + */ + virtual void fatalError(const SAXParseExceptionT& exception) + { + errorHandler_->fatalError(exception); + } // fatalError + + //////////////////////////////////////////////////////////// + // DeclHandler + /** + * Filter an element type declaration. + */ + virtual void elementDecl(const string_type& name, const string_type& model) + { + declHandler_->elementDecl(name, model); + } // elementDecl + + /** + * Filter an attribute type declaration. + */ + virtual void attributeDecl(const string_type& elementName, + const string_type& attributeName, + const string_type& type, + const string_type& valueDefault, + const string_type& value) + { + declHandler_->attributeDecl(elementName, attributeName, type, valueDefault, value); + } // attributeDecl + + /** + * Filter an internal entity declaration. + */ + virtual void internalEntityDecl(const string_type& name, const string_type& value) + { + declHandler_->internalEntityDecl(name, value); + } // internalEntityDecl + + /** + * Filter a parsed external entity declaration. + */ + virtual void externalEntityDecl(const string_type& name, + const string_type& publicId, + const string_type& systemId) + { + declHandler_->externalEntityDecl(name, publicId, systemId); + } // externalEntityDecl + + ////////////////////////////////////////////////////////// + // LexicalHandler + /** + * Filter the start of DTD declarations, if any. + */ + virtual void startDTD(const string_type& name, + const string_type& publicId, + const string_type& systemId) + { + lexicalHandler_->startDTD(name, publicId, systemId); + } // startDTD + + /** + * Filter the end of DTD declarations. 
+ */ + virtual void endDTD() + { + lexicalHandler_->endDTD(); + } // endDTD + + /** + * Filter the beginning of some internal and external XML entities. + */ + virtual void startEntity(const string_type& name) + { + lexicalHandler_->startEntity(name); + } // startEntity + + /** + * Filter the end of an entity. + */ + virtual void endEntity(const string_type& name) + { + lexicalHandler_->endEntity(name); + } // endEntity + + /** + * Filter the start of a CDATA section. + */ + virtual void startCDATA() + { + lexicalHandler_->startCDATA(); + } // startCDATA + + /** + * Filter the end of a CDATA section. + */ + virtual void endCDATA() + { + lexicalHandler_->endCDATA(); + } // endCDATA + + /** + * Filter an XML comment anywhere in the document. + */ + virtual void comment(const string_type& text) + { + lexicalHandler_->comment(text); + } // comment + +private: + void setDefaults() + { + setEntityResolver(defaultHandler_); + setDTDHandler(defaultHandler_); + setContentHandler(defaultHandler_); + setErrorHandler(defaultHandler_); + setDeclHandler(defaultHandler_); + setLexicalHandler(defaultHandler_); + } // setDefaults + + void setupParse() // NOTE: dereferences parent_ without a null check + { + parent_->setEntityResolver(*this); + parent_->setDTDHandler(*this); + parent_->setContentHandler(*this); + parent_->setErrorHandler(*this); + parent_->setDeclHandler(*this); + parent_->setLexicalHandler(*this); + } // setupParse + + XMLFilterImpl(const XMLFilterImpl&); // no impl + XMLFilterImpl& operator=(const XMLFilterImpl&); // no impl + bool operator==(const XMLFilterImpl&); // no impl + + XMLReaderT* parent_; + EntityResolverT* entityResolver_; + DTDHandlerT* dtdHandler_; + ContentHandlerT* contentHandler_; + ErrorHandlerT* errorHandler_; + DeclHandlerT* declHandler_; + LexicalHandlerT* lexicalHandler_; + DefaultHandlerT defaultHandler_; +}; // class XMLFilterImpl + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file diff --git a/arabica/include/SAX/saxfwd.hpp b/arabica/include/SAX/saxfwd.hpp new file mode 100644 index 
000000000..983a67aa3 --- /dev/null +++ b/arabica/include/SAX/saxfwd.hpp @@ -0,0 +1,89 @@ +#ifndef ARABICA_SAX_FWD_H +#define ARABICA_SAX_FWD_H + +#include + +/** \mainpage Arabica XML Parser Toolkit + * + * \section intro Introduction + * + *

Arabica has a full SAX2, the Simple API for XML, implementation + * including the optional interfaces and helper classes. Layered on + * SAX is a W3C Document Object Model (DOM) Level 2.0 Core + * implementation, together with an XPath engine.

+ * + *

It delivers UTF-8 encoded std::strings or UCS-2 std::wstrings, + * but can also accommodate custom string types and alternative + * encodings.

+ * + *

It provides uniform SAX2 wrappers for the expat parser, + * Xerces, libxml + * and for the Microsoft XML parser COM component.

+ * + * \section license License + * + * Copyright (c) 2001-2012 Jez UK Ltd
+ * All rights reserved. + *

+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *

    + *
  • Redistributions of source code must retain the above
    + * copyright notice, this list of conditions and the following
    + * disclaimer.
  • + *
  • Redistributions in binary form must reproduce the above
    + * copyright notice, this list of conditions and the following
    + * disclaimer in the documentation and/or other materials
    + * provided with the distribution.
  • + *
  • Neither the name of Jez UK Ltd nor the names of
    + * contributors may be used to endorse or promote products
    + * derived from this software without specific prior written
    + * permission.
  • + *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ * + */ + + +#include +#include + +/** + * + * SAX in C++ - A C++ implementation of the SAX2 interface. + * + */ +namespace Arabica +{ +namespace SAX +{ + +template class AttributeList; +template class DocumentHandler; +template class DTDHandler; +template class EntityResolver; +template class InputSource; +template class Locator; +template class Parser; +template class SAXParseException; +template class ErrorHandler; + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file + diff --git a/arabica/include/SAX/wrappers/saxlibxml2.hpp b/arabica/include/SAX/wrappers/saxlibxml2.hpp new file mode 100644 index 000000000..76072d83b --- /dev/null +++ b/arabica/include/SAX/wrappers/saxlibxml2.hpp @@ -0,0 +1,955 @@ +#ifndef ARABICA_SAX_LIBXML2_H +#define ARABICA_SAX_LIBXML2_H +//////////////////////////////////////////////////////////////// +// A SAX2 Wrapper for libxml2 +//////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +//////////////////////////////////////////////////////////////////////////// +// the callback functions for libxml +namespace libxml2_wrapper_impl_tiddle +{ + +extern "C" +{ + +void lwit_startDocument(void* user_data); +void lwit_endDocument(void* user_data); +void lwit_startElement(void *user_data, const xmlChar* name, const xmlChar** attrs); +void lwit_endElement(void *user_data, const xmlChar* name); +void lwit_characters(void* user_data, const xmlChar* ch, int len); +void lwit_cdata(void* user_data, const xmlChar* ch, int len); +void lwit_ignorableWhitespace(void *user_data, const xmlChar* ch, int len); +void lwit_processingInstruction(void *user_data, const xmlChar* target, const xmlChar* data); +void lwit_comment(void *user_data, const xmlChar* comment); +void lwit_warning(void *user_data, const 
char* fmt, ...); +void lwit_error(void* user_data, const char* fmt, ...); +void lwit_fatalError(void* user_data, const char* fmt, ...); +void lwit_locator(void* user_data, xmlSAXLocatorPtr locator); +void lwit_notationDecl(void* user_data, const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId); +void lwit_unparsedEntityDecl(void* user_data, + const xmlChar *name, const xmlChar *publicId, + const xmlChar *systemId, const xmlChar *notationName); +void lwit_elementDecl(void* user_date, const xmlChar *name, int type, xmlElementContentPtr content); +void lwit_attributeDecl(void *user_data, const xmlChar *elem, const xmlChar *fullname, int type, int def, const xmlChar *defaultValue, xmlEnumerationPtr tree); +void lwit_entityDecl(void* user_data, const xmlChar *name, int type, const xmlChar *publicId, const xmlChar *systemId, xmlChar *content); +void lwit_setFeature(xmlParserCtxtPtr context, const char* name, bool value); +bool lwit_getFeature(xmlParserCtxtPtr context, const char* name); +xmlEntityPtr lwit_getEntity(void* user_data, const xmlChar* name); +xmlParserInputPtr lwit_resolveEntity(void* user_data, const xmlChar* publicId, const xmlChar* systemId); +xmlSAXHandler* lwit_SaxHandler(); + +class libxml2_base +{ + protected: + virtual ~libxml2_base() { } + + private: + virtual void SAXstartDocument() = 0; + virtual void SAXendDocument() = 0; + virtual void SAXlocator(xmlSAXLocatorPtr locator) = 0; + virtual void SAXcharacters(const xmlChar* ch, int len) = 0; + virtual void SAXcdata(const xmlChar* ch, int len) = 0; + virtual void SAXignorableWhitespace(const xmlChar* ch, int len) = 0; + virtual void SAXwarning(const std::string& warning) = 0; + virtual void SAXerror(const std::string& error) = 0; + virtual void SAXfatalError(const std::string& fatal) = 0; + virtual void SAXprocessingInstruction(const xmlChar* target, const xmlChar* data) = 0; + virtual void SAXcomment(const xmlChar* comment) = 0; + virtual void SAXstartElement(const xmlChar* name, 
const xmlChar** attrs) = 0; + virtual void SAXendElement(const xmlChar* name) = 0; + virtual void SAXnotationDecl(const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId) = 0; + virtual void SAXunparsedEntityDecl(const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId, const xmlChar *notationName) = 0; + virtual void SAXelementDecl(const xmlChar* name, int type, xmlElementContentPtr content) = 0; + virtual void SAXattributeDecl(const xmlChar *elem, const xmlChar *fullname, int type, int def, const xmlChar *defaultValue, xmlEnumerationPtr tree) = 0; + virtual void SAXentityDecl(const xmlChar *name, int type, const xmlChar *publicId, const xmlChar *systemId, xmlChar *content) = 0; + virtual xmlParserInputPtr SAXresolveEntity(const xmlChar* publicId, const xmlChar* systemId) = 0; + virtual xmlParserCtxtPtr parserContext() = 0; + + virtual void SAXstartCdataSection() = 0; + virtual void SAXendCdataSection() = 0; + + friend void lwit_startDocument(void* user_data); + friend void lwit_endDocument(void* user_data); + friend void lwit_characters(void *user_data, const xmlChar* ch, int len); + friend void lwit_cdata(void *user_data, const xmlChar* ch, int len); + friend void lwit_ignorableWhitespace(void *user_data, const xmlChar* ch, int len); + friend void lwit_locator(void* user_data, xmlSAXLocatorPtr locator); + friend void lwit_warning(void *user_data, const char* fmt, ...); + friend void lwit_error(void* user_data, const char* fmt, ...); + friend void lwit_fatalError(void* user_data, const char* fmt, ...); + friend void lwit_processingInstruction(void *user_data, const xmlChar* target, const xmlChar* data); + friend void lwit_comment(void *user_data, const xmlChar* comment); + friend void lwit_startElement(void *user_data, const xmlChar* name, const xmlChar** attrs); + friend void lwit_endElement(void* user_data, const xmlChar* name); + friend void lwit_notationDecl(void* user_data, const xmlChar *name, const xmlChar *publicId, const 
xmlChar *systemId); + friend void lwit_unparsedEntityDecl(void* user_data, const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId, const xmlChar *notationName); + friend void lwit_elementDecl(void* user_data, const xmlChar *name, int type, xmlElementContentPtr content); + friend void lwit_attributeDecl(void* user_data, const xmlChar *elem, const xmlChar *fullname, int type, int def, const xmlChar *defaultValue, xmlEnumerationPtr tree); + friend void lwit_entityDecl(void* user_data, const xmlChar *name, int type, const xmlChar *publicId, const xmlChar *systemId, xmlChar *content); + friend xmlEntityPtr lwit_getEntity(void* user_data, const xmlChar* name); + friend xmlParserInputPtr lwit_resolveEntity(void* user_data, const xmlChar* publicId, const xmlChar* systemId); +}; // class libxml2_base + +} // extern "C" + +} // namespace libxml2_wrapper_impl_tiddle + +template +class libxml2_wrapper : + public SAX::XMLReaderInterface::type>, + public SAX::Locator::type>, + protected libxml2_wrapper_impl_tiddle::libxml2_base +{ + public: + typedef SAX::XMLReaderInterface::type> XMLReaderT; + typedef typename XMLReaderT::string_adaptor string_adaptor; + typedef SAX::EntityResolver entityResolverT; + typedef SAX::DTDHandler dtdHandlerT; + typedef SAX::ContentHandler contentHandlerT; + typedef SAX::Attributes attributesT; + typedef SAX::AttributeType attributeTypeT; + typedef SAX::DeclHandler declHandlerT; + typedef SAX::LexicalHandler lexicalHandlerT; + typedef SAX::InputSource inputSourceT; + typedef SAX::Locator locatorT; + typedef SAX::NamespaceSupport namespaceSupportT; + typedef SAX::ErrorHandler errorHandlerT; + typedef SAX::SAXParseException SAXParseExceptionT; + typedef typename XMLReaderT::PropertyBase PropertyBaseT; + typedef typename XMLReaderT::template Property getLexicalHandlerT; + typedef typename XMLReaderT::template Property setLexicalHandlerT; + typedef typename XMLReaderT::template Property getDeclHandlerT; + typedef typename 
XMLReaderT::template Property setDeclHandlerT; + typedef XML::QualifiedName qualifiedNameT; + + libxml2_wrapper(); + ~libxml2_wrapper(); + + //////////////////////////////////////////////// + // configuration + virtual bool getFeature(const string_type& name) const; + virtual void setFeature(const string_type& name, bool value); + + //////////////////////////////////////////////// + // Event Handlers + virtual void setEntityResolver(entityResolverT& resolver) { entityResolver_ = &resolver; } + virtual entityResolverT* getEntityResolver() const { return entityResolver_; } + virtual void setDTDHandler(dtdHandlerT& handler) { dtdHandler_ = &handler; } + virtual dtdHandlerT* getDTDHandler() const { return dtdHandler_; } + virtual void setContentHandler(contentHandlerT& handler) { contentHandler_ = &handler; } + virtual contentHandlerT* getContentHandler() const { return contentHandler_; } + virtual void setErrorHandler(errorHandlerT& handler) { errorHandler_ = &handler; } + virtual errorHandlerT* getErrorHandler() const { return errorHandler_; } + virtual void setDeclHandler(declHandlerT& handler) { declHandler_ = &handler; } + virtual declHandlerT* getDeclHandler() const { return declHandler_; } + virtual void setLexicalHandler(lexicalHandlerT& handler) { lexicalHandler_ = &handler; } + virtual lexicalHandlerT* getLexicalHandler() const { return lexicalHandler_; } + + //////////////////////////////////////////////// + // parsing + virtual void parse(inputSourceT& source); + + protected: + //////////////////////////////////////////////// + // properties + virtual std::auto_ptr doGetProperty(const string_type& name); + virtual void doSetProperty(const string_type& name, std::auto_ptr value); + + public: + virtual string_type getPublicId() const; + virtual string_type getSystemId() const; + virtual size_t getLineNumber() const; + virtual size_t getColumnNumber() const; + + private: + virtual void SAXstartDocument(); + virtual void SAXendDocument(); + virtual void 
SAXlocator(xmlSAXLocatorPtr locator) { locator_ = locator; } + virtual void SAXcharacters(const xmlChar* ch, int len); + virtual void SAXcdata(const xmlChar* ch, int len); + virtual void SAXignorableWhitespace(const xmlChar* ch, int len); + virtual void SAXwarning(const std::string& warning); + virtual void SAXerror(const std::string& error); + virtual void SAXfatalError(const std::string& fatal); + virtual void SAXprocessingInstruction(const xmlChar* target, const xmlChar* data); + virtual void SAXcomment(const xmlChar* comment); + virtual void SAXstartCdataSection(); + virtual void SAXendCdataSection(); + virtual void SAXstartElement(const xmlChar* name, const xmlChar** attrs); + virtual void SAXstartElementNoNS(const xmlChar* name, const xmlChar** attrs); + virtual void SAXendElement(const xmlChar* name); + virtual void SAXendElementNoNS(const xmlChar* name); + virtual void SAXnotationDecl(const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId); + virtual void SAXunparsedEntityDecl(const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId, const xmlChar *notationName); + virtual void SAXelementDecl(const xmlChar* name, int type, xmlElementContentPtr content); + void convertXML_Content(std::ostream& os, int type, xmlElementContentPtr model, bool isChild) const; + virtual void SAXattributeDecl(const xmlChar *elem, const xmlChar *fullname, int type, int def, const xmlChar *defaultValue, xmlEnumerationPtr tree); + string_type stringAttrEnum(xmlEnumerationPtr tree, bool leadingSpace) const; + virtual void SAXentityDecl(const xmlChar *name, int type, const xmlChar *publicId, const xmlChar *systemId, xmlChar *content); + virtual xmlParserInputPtr SAXresolveEntity(const xmlChar* publicId, const xmlChar* systemId); + virtual xmlParserCtxtPtr parserContext() { return context_; } + + + qualifiedNameT processName(const string_type& qName, bool isAttribute); + void reportError(const std::string& message, bool fatal = false); + void 
checkNotParsing(const string_type& type, const string_type& name) const; + + private: + // member variables + entityResolverT* entityResolver_; + dtdHandlerT* dtdHandler_; + contentHandlerT* contentHandler_; + errorHandlerT* errorHandler_; + namespaceSupportT nsSupport_; + declHandlerT* declHandler_; + lexicalHandlerT* lexicalHandler_; + + xmlParserCtxtPtr context_; + xmlSAXLocatorPtr locator_; + + bool parsing_; + + bool namespaces_; + bool prefixes_; + bool isInCData_; + + string_type emptyString_; + const FeatureNames features_; + const PropertyNames properties_; + const NamespaceConstants nsc_; + const AttributeDefaults attrDefaults_; + const AttributeTypes attrTypes_; +}; // class libxml2_wrapper + +template +libxml2_wrapper::libxml2_wrapper() : + entityResolver_(0), + dtdHandler_(0), + contentHandler_(0), + errorHandler_(0), + declHandler_(0), + lexicalHandler_(0), + locator_(0), + parsing_(false), + namespaces_(true), + prefixes_(true), + isInCData_(false) // initialisers listed in member declaration order +{ + context_ = xmlCreatePushParserCtxt(libxml2_wrapper_impl_tiddle::lwit_SaxHandler(), + reinterpret_cast(static_cast(this)), + 0, + 0, + 0); + xmlCtxtUseOptions(context_, XML_PARSE_DTDLOAD | + XML_PARSE_DTDVALID | + XML_PARSE_NOENT | + XML_PARSE_NOBLANKS); +} // libxml2_wrapper + +template +libxml2_wrapper::~libxml2_wrapper() +{ + xmlFreeParserCtxt(context_); +} // ~libxml2_wrapper + +template +bool libxml2_wrapper::getFeature(const string_type& name) const +{ + if(name == features_.namespaces) + return namespaces_; + + if(name == features_.namespace_prefixes) + return prefixes_; + + if(name == features_.validation) + return libxml2_wrapper_impl_tiddle::lwit_getFeature(context_, "validate"); + + if(name == features_.external_general) + return libxml2_wrapper_impl_tiddle::lwit_getFeature(context_, "fetch external entities"); + + if(name == features_.external_parameter) + { + throw SAX::SAXNotSupportedException(std::string("Feature not supported ") + string_adaptor::asStdString(name)); + } + else + { + 
throw SAX::SAXNotRecognizedException(std::string("Feature not recognized ") + string_adaptor::asStdString(name)); + } +} // getFeature + +template +void libxml2_wrapper::setFeature(const string_type& name, bool value) +{ + if(name == features_.namespaces) + { + checkNotParsing(string_adaptor::construct_from_utf8("feature"), name); + namespaces_ = value; + if(!namespaces_ && !prefixes_) + prefixes_ = true; + return; + } + + if(name == features_.namespace_prefixes) + { + checkNotParsing(string_adaptor::construct_from_utf8("feature"), name); + prefixes_ = value; + if(prefixes_ && !namespaces_) + namespaces_ = true; + return; + } + + if(name == features_.validation) + { + libxml2_wrapper_impl_tiddle::lwit_setFeature(context_, "validate", value); + return; + } // if ... + + if(name == features_.external_general) + { + libxml2_wrapper_impl_tiddle::lwit_setFeature(context_, "fetch external entities", value); + return; + } // if ... + + if(name == features_.external_parameter) + { + std::ostringstream os; + os << "Feature not supported " << string_adaptor::asStdString(name); + throw SAX::SAXNotSupportedException(os.str()); + } + else + { + std::ostringstream os; + os << "Feature not recognized " << string_adaptor::asStdString(name); + throw SAX::SAXNotRecognizedException(os.str()); + } +} // setFeature + +template +std::auto_ptr::PropertyBaseT> libxml2_wrapper::doGetProperty(const string_type& name) +{ + if(name == properties_.declHandler) + { + getDeclHandlerT* prop = new getDeclHandlerT(declHandler_); + return std::auto_ptr(prop); + } + if(name == properties_.lexicalHandler) + { + getLexicalHandlerT* prop = new getLexicalHandlerT(lexicalHandler_); + return std::auto_ptr(prop); + } + + throw SAX::SAXNotRecognizedException(std::string("Property not recognized ") + string_adaptor::asStdString(name)); +} // doGetProperty + +template +void libxml2_wrapper::doSetProperty(const string_type& name, std::auto_ptr value) +{ + if(name == properties_.declHandler) + { + 
setDeclHandlerT* prop = dynamic_cast(value.get()); + + if(!prop) + throw std::bad_cast(); + + declHandler_ = &(prop->get()); + } + if(name == properties_.lexicalHandler) + { + setLexicalHandlerT* prop = dynamic_cast(value.get()); + + if(!prop) + throw std::bad_cast(); + + lexicalHandler_ = &(prop->get()); + } + + throw SAX::SAXNotRecognizedException(std::string("Property not recognized ") + string_adaptor::asStdString(name)); +} // doSetProperty + +template +typename XML::QualifiedName::string_adaptor> libxml2_wrapper::processName(const string_type& qName, bool isAttribute) +{ + qualifiedNameT p = nsSupport_.processName(qName, isAttribute); + if(string_adaptor::empty(p.namespaceUri()) && !string_adaptor::empty(p.prefix())) + reportError(std::string("Undeclared prefix ") + string_adaptor::asStdString(qName)); + return p; +} // processName + +template +void libxml2_wrapper::reportError(const std::string& message, bool fatal) +{ + if(!errorHandler_) + return; + + SAXParseExceptionT e(message, *this); + if(fatal) + errorHandler_->fatalError(e); + else + errorHandler_->error(e); +} // reportError + +template +void libxml2_wrapper::checkNotParsing(const string_type& type, const string_type& name) const +{ + if(parsing_) + { + std::ostringstream os; + os << "Can't change " << string_adaptor::asStdString(type) << " " << string_adaptor::asStdString(name) << " while parsing"; + throw SAX::SAXNotSupportedException(os.str()); + } // if(parsing_) +} // checkNotParsing + +template +string_type libxml2_wrapper::getPublicId() const +{ + if(locator_) + return string_adaptor::construct_from_utf8(reinterpret_cast(locator_->getPublicId(context_))); + return string_type(); +} // getPublicId + +template +string_type libxml2_wrapper::getSystemId() const +{ + if(locator_) + return string_adaptor::construct_from_utf8(reinterpret_cast(locator_->getSystemId(context_))); + return string_type(); +} // getSystemId + +template +size_t libxml2_wrapper::getLineNumber() const +{ + if(locator_) + 
return locator_->getLineNumber(context_); + return -1; +} // getLineNumber + +template +size_t libxml2_wrapper::getColumnNumber() const +{ + if(locator_) + return locator_->getColumnNumber(context_); + return -1; +} // getColumnNumber + +template +void libxml2_wrapper::parse(inputSourceT& source) +{ + if(contentHandler_) + contentHandler_->setDocumentLocator(*this); + + InputSourceResolver is(source, string_adaptor()); + if(is.resolve() == 0) + return; + + parsing_ = true; + + while(!is.resolve()->eof()) + { + char buffer[4096]; + is.resolve()->read(buffer, sizeof(buffer)); + xmlParseChunk(context_, buffer, (int)is.resolve()->gcount(), is.resolve()->eof()); + } // while(!in.eof()) + + xmlCtxtResetPush(context_, 0, 0, 0, 0); + + parsing_ = false; +} // parse + +template +void libxml2_wrapper::SAXstartDocument() +{ + if(isInCData_) + SAXendCdataSection(); + + if(contentHandler_) + contentHandler_->startDocument(); +} // SAXstartDocument + +template +void libxml2_wrapper::SAXendDocument() +{ + if(isInCData_) + SAXendCdataSection(); + + if(contentHandler_) + contentHandler_->endDocument(); +} // SAXendDocument + +template +void libxml2_wrapper::SAXcharacters(const xmlChar* ch, int len) +{ + if(isInCData_) + SAXendCdataSection(); + + if(contentHandler_) + contentHandler_->characters(string_adaptor::construct_from_utf8(reinterpret_cast(ch), len)); +} // SAXcharacters + +template +void libxml2_wrapper::SAXcdata(const xmlChar* ch, int len) +{ + if(contentHandler_) + contentHandler_->characters(string_adaptor::construct_from_utf8(reinterpret_cast(ch), len)); +} // SAXcdata + +template +void libxml2_wrapper::SAXignorableWhitespace(const xmlChar* ch, int len) +{ + if(contentHandler_) + contentHandler_->ignorableWhitespace(string_adaptor::construct_from_utf8(reinterpret_cast(ch), len)); +} // SAXignorableWhitespace + +template +void libxml2_wrapper::SAXwarning(const std::string& warning) +{ + if(errorHandler_) + errorHandler_->warning(SAXParseExceptionT(warning, *this)); +} // 
warning + +/* SAXerror: wraps the message in a SAXParseExceptionT and passes it to errorHandler_->error when an error handler is set. */ template +void libxml2_wrapper::SAXerror(const std::string& error) +{ + if(errorHandler_) + errorHandler_->error(SAXParseExceptionT(error, *this)); +} // error + +/* SAXfatalError: wraps the message in a SAXParseExceptionT and passes it to errorHandler_->fatalError when an error handler is set. */ template +void libxml2_wrapper::SAXfatalError(const std::string& fatal) +{ + if(errorHandler_) + errorHandler_->fatalError(SAXParseExceptionT(fatal, *this)); +} // fatal + +/* SAXprocessingInstruction: closes a CDATA section still marked open, then forwards target and data to the content handler. NOTE(review): data is converted unconditionally - confirm it can never be null here. */ template +void libxml2_wrapper::SAXprocessingInstruction(const xmlChar* target, const xmlChar* data) +{ + if(isInCData_) + SAXendCdataSection(); + + if(contentHandler_) + contentHandler_->processingInstruction(string_adaptor::construct_from_utf8(reinterpret_cast(target)), + string_adaptor::construct_from_utf8(reinterpret_cast(data))); +} // SAXprocessingInstruction + +/* SAXcomment: closes a CDATA section still marked open, then forwards the comment text to the lexical handler if one is set. */ template +void libxml2_wrapper::SAXcomment(const xmlChar* comment) +{ + if(isInCData_) + SAXendCdataSection(); + + if(lexicalHandler_) + lexicalHandler_->comment(string_adaptor::construct_from_utf8(reinterpret_cast(comment))); +} // SAXcomment + +/* SAXstartCdataSection: marks a CDATA section as open and notifies the lexical handler; a second call while already open is ignored. */ template +void libxml2_wrapper::SAXstartCdataSection() +{ + if (isInCData_) + return; + + isInCData_ = true; + if(lexicalHandler_) + lexicalHandler_->startCDATA(); +} // startCdataSection + +/* SAXendCdataSection: notifies the lexical handler that the CDATA section ended and clears the open marker; ignored when no section is open. */ template +void libxml2_wrapper::SAXendCdataSection() +{ + if (!isInCData_) + return; + + if(lexicalHandler_) + lexicalHandler_->endCDATA(); + isInCData_ = false; +} // endCdataSection + +/* SAXstartElement: start-tag event. atts is read as a null-terminated array of alternating name and value pointers. Without a content handler nothing is reported; with namespaces_ off the work is delegated to SAXstartElementNoNS. Otherwise a new namespace context is pushed, declarations are registered in a first pass, the remaining attributes are resolved in a second pass, and startElement is reported with the resolved element name. */ template +void libxml2_wrapper::SAXstartElement(const xmlChar* qName, const xmlChar** atts) +{ + + if(isInCData_) + SAXendCdataSection(); + + if(!contentHandler_) + return; + + if(!namespaces_) + { + SAXstartElementNoNS(qName, atts); + return; + } // if(!namespaces) + + // OK we're doing Namespaces + nsSupport_.pushContext(); + SAX::AttributesImpl attributes; + + // take a first pass and copy all the attributes, noting any declarations + if(atts && *atts != 0) + { + /* a1 walks a copy of the pointer so that atts can be walked again in the second pass */ const xmlChar** a1 = atts; + while(*a1 != 0) + { + string_type attQName = string_adaptor::construct_from_utf8(reinterpret_cast(*a1++)); + string_type value = 
string_adaptor::construct_from_utf8(reinterpret_cast(*a1++)); + + // declaration? + /* an attribute whose name begins with nsc_.xmlns is treated as a namespace declaration */ if(string_adaptor::find(attQName, nsc_.xmlns) == 0) + { + string_type prefix; + typename string_adaptor::size_type n = string_adaptor::find(attQName, nsc_.colon); + /* the prefix is whatever follows the first nsc_.colon; it stays empty when there is none */ if(n != string_adaptor::npos()) + prefix = string_adaptor::construct(string_adaptor::begin(attQName) + n + 1, string_adaptor::end(attQName)); + if(!nsSupport_.declarePrefix(prefix, value)) + reportError(std::string("Illegal Namespace prefix ") + string_adaptor::asStdString(prefix)); + /* the mapping is announced even when declarePrefix rejected it */ contentHandler_->startPrefixMapping(prefix, value); + /* declarations are only kept as attributes when prefixes_ is on */ if(prefixes_) + attributes.addAttribute(emptyString_, + emptyString_, + attQName, + attributeTypeT::CDATA, + value); + } + } // while + + /* second pass: every attribute that is not a declaration is resolved against the declarations registered above */ while(*atts != 0) + { + string_type attQName = string_adaptor::construct_from_utf8(reinterpret_cast(*atts++)); + string_type value = string_adaptor::construct_from_utf8(reinterpret_cast(*atts++)); + + // declaration? + if(string_adaptor::find(attQName, nsc_.xmlns) != 0) + { + qualifiedNameT attName = processName(attQName, true); + attributes.addAttribute(attName.namespaceUri(), + attName.localName(), + attName.rawName(), + attributeTypeT::CDATA, + value); + } + } // while ... + } // if ... + + // at last! 
report the event + qualifiedNameT name = processName(string_adaptor::construct_from_utf8(reinterpret_cast(qName)), false); + contentHandler_->startElement(name.namespaceUri(), + name.localName(), + name.rawName(), + attributes); +} // SAXstartElement + +/* SAXstartElementNoNS: start-tag event without namespace processing; every attribute is added with empty namespace URI and local name and its raw name, and the element is reported the same way. It uses contentHandler_ without checking it; SAXstartElement performs that check before delegating here. */ template +void libxml2_wrapper::SAXstartElementNoNS(const xmlChar* qName, const xmlChar** atts) +{ + SAX::AttributesImpl attributes; + + if(isInCData_) + SAXendCdataSection(); + + if(atts && *atts != 0) + { + while(*atts != 0) + { + string_type attQName = string_adaptor::construct_from_utf8(reinterpret_cast(*atts++)); + string_type value = string_adaptor::construct_from_utf8(reinterpret_cast(*atts++)); + + attributes.addAttribute(emptyString_, + emptyString_, + attQName, + attributeTypeT::CDATA, + value); + } // while .. + } // if ... + + contentHandler_->startElement(emptyString_, emptyString_, string_adaptor::construct_from_utf8((reinterpret_cast(qName))), attributes); +} // SAXstartElementNoNS + +/* SAXendElement: end-tag event. Without a content handler nothing is reported; with namespaces_ off the work is delegated to SAXendElementNoNS. Otherwise endElement is reported with the resolved name, endPrefixMapping is sent for every prefix declared in the current namespace context, and that context is popped. */ template +void libxml2_wrapper::SAXendElement(const xmlChar* qName) +{ + if(isInCData_) + SAXendCdataSection(); + + if(!contentHandler_) + return; + + if(!namespaces_) + { + SAXendElementNoNS(qName); + return; + } // if(!namespaces_) + + qualifiedNameT name = processName(string_adaptor::construct_from_utf8(reinterpret_cast(qName)), false); + contentHandler_->endElement(name.namespaceUri(), + name.localName(), + name.rawName()); + /* the prefixes are fetched before popContext so that they belong to the element being closed */ typename NamespaceSupport::stringListT prefixes = nsSupport_.getDeclaredPrefixes(); + for(size_t i = 0, end = prefixes.size(); i < end; ++i) + contentHandler_->endPrefixMapping(prefixes[i]); + nsSupport_.popContext(); +} // SAXendElement + +/* SAXendElementNoNS: end-tag event without namespace processing; reports the raw name with empty namespace URI and local name. */ template +void libxml2_wrapper::SAXendElementNoNS(const xmlChar* qName) +{ + if(isInCData_) + SAXendCdataSection(); + + if(contentHandler_) + contentHandler_->endElement(emptyString_, emptyString_, string_adaptor::construct_from_utf8(reinterpret_cast(qName))); +} // SAXendElementNoNS + +template +void libxml2_wrapper::SAXnotationDecl(const xmlChar *name, 
const xmlChar *publicId, const xmlChar *systemId)
+{
+  if(isInCData_)
+    SAXendCdataSection();
+
+  if(dtdHandler_)
+    dtdHandler_->notationDecl(string_adaptor::construct_from_utf8(reinterpret_cast(name)),
+                              string_adaptor::construct_from_utf8(reinterpret_cast(publicId)),
+                              string_adaptor::construct_from_utf8(reinterpret_cast(systemId)));
+} // SAXnotationDecl
+
+template
+void libxml2_wrapper::SAXunparsedEntityDecl(const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId, const xmlChar *notationName)
+{
+  if(isInCData_)
+    SAXendCdataSection();
+
+  if(dtdHandler_)
+    dtdHandler_->unparsedEntityDecl(string_adaptor::construct_from_utf8(reinterpret_cast(name)),
+                                    string_adaptor::construct_from_utf8(reinterpret_cast(publicId)),
+                                    string_adaptor::construct_from_utf8(reinterpret_cast(systemId)),
+                                    string_adaptor::construct_from_utf8(reinterpret_cast(notationName)));
+} // SAXunparsedEntityDecl
+
+template
+void libxml2_wrapper::SAXelementDecl(const xmlChar* name, int type, xmlElementContentPtr content)
+{
+  if(isInCData_)
+    SAXendCdataSection();
+
+  if(!declHandler_)
+    return;
+
+  std::ostringstream os;
+  convertXML_Content(os, type, content, false);
+  declHandler_->elementDecl(string_adaptor::construct_from_utf8(reinterpret_cast(name)), string_adaptor::construct_from_utf8(os.str().c_str()));
+} // elementDeclaration
+
+// Writes the textual form of an element declaration's content model to os.
+//   os      - stream that receives the text
+//   type    - one of the XML_ELEMENT_TYPE_* values handled in the first switch
+//   model   - content tree to serialise; tolerated as null (see below)
+//   isChild - true on the recursive calls made for c1/c2, which suppresses
+//             the enclosing parentheses
+template
+void libxml2_wrapper::convertXML_Content(std::ostream& os, int type, xmlElementContentPtr model, bool isChild) const
+{
+  char concatenator = ' ';
+
+  switch(type)
+  {
+    case XML_ELEMENT_TYPE_EMPTY:
+      os << "EMPTY";
+      break;
+    case XML_ELEMENT_TYPE_ANY:
+      os << "ANY";
+      return;
+    case XML_ELEMENT_TYPE_MIXED:
+      if(model == 0 || model->c1 == 0)
+        os << "(#PCDATA)";
+      else
+        os << "(#PCDATA";
+      concatenator = '|';
+      break;
+    case XML_ELEMENT_TYPE_ELEMENT:
+      break;
+  } // switch
+
+  // Everything below dereferences model.  The EMPTY case above falls through
+  // to here, so stop when there is no content tree to walk.
+  // NOTE(review): libxml2 is understood to hand the elementDecl callback a
+  // null content pointer for EMPTY declarations - confirm against libxml2.
+  if(model == 0)
+    return;
+
+  switch(model->type)
+  {
+    case XML_ELEMENT_CONTENT_ELEMENT:
+      if(!isChild)
+        os << '(' << model->name << ')';
+      else
+        os << model->name;
+      break;
+    case XML_ELEMENT_CONTENT_SEQ:
+      concatenator = ',';
+      break;
+    case XML_ELEMENT_CONTENT_OR:
+      concatenator = '|';
+      break;
+    case XML_ELEMENT_CONTENT_PCDATA:
+      break;
+  } // switch
+
+  // do children here
+  if(model->c1 != 0)
+  {
+    if(!isChild)
+      os << '(';
+    convertXML_Content(os, XML_ELEMENT_TYPE_ELEMENT, model->c1, true);
+    if(model->c2 != 0)
+    {
+      os << concatenator;
+      convertXML_Content(os, XML_ELEMENT_TYPE_ELEMENT, model->c2, true);
+    } // if ...
+    if(!isChild)
+      os << ')';
+  } // if ...
+
+  // occurrence indicator for this node
+  switch(model->ocur)
+  {
+    case XML_ELEMENT_CONTENT_ONCE:
+      break;
+    case XML_ELEMENT_CONTENT_OPT:
+      os << "?";
+      break;
+    case XML_ELEMENT_CONTENT_MULT:
+      os << "*";
+      break;
+    case XML_ELEMENT_CONTENT_PLUS:
+      os << "+";
+      break;
+  } // switch
+} // convertXML_Content
+
+template
+void libxml2_wrapper::SAXattributeDecl(const xmlChar *elem, const xmlChar *fullname, int type, int def, const xmlChar *defaultValue, xmlEnumerationPtr tree)
+{
+  if(isInCData_)
+    SAXendCdataSection();
+
+  if(!declHandler_)
+    return;
+
+  const string_type* defType = &attrDefaults_.implied;
+  if(def)
+    defType = (defaultValue) ? 
&attrDefaults_.fixed : &attrDefaults_.required; + + string_type typeStr; + switch(type) + { + case XML_ATTRIBUTE_CDATA: + typeStr = attrTypes_.cdata; + break; + case XML_ATTRIBUTE_ID: + typeStr = attrTypes_.id; + break; + case XML_ATTRIBUTE_IDREF : + typeStr = attrTypes_.idref; + break; + case XML_ATTRIBUTE_IDREFS: + typeStr = attrTypes_.idrefs; + break; + case XML_ATTRIBUTE_ENTITY: + typeStr = attrTypes_.entity; + break; + case XML_ATTRIBUTE_ENTITIES: + typeStr = attrTypes_.entities; + break; + case XML_ATTRIBUTE_NMTOKEN: + typeStr = attrTypes_.nmtoken; + break; + case XML_ATTRIBUTE_NMTOKENS: + typeStr = attrTypes_.nmtokens; + break; + case XML_ATTRIBUTE_ENUMERATION: + typeStr = stringAttrEnum(tree, false); + break; + case XML_ATTRIBUTE_NOTATION: + string_adaptor::append(typeStr, attrTypes_.notation); + string_adaptor::append(typeStr, stringAttrEnum(tree, true)); + break; + } // switch(type) + + declHandler_->attributeDecl(string_adaptor::construct_from_utf8(reinterpret_cast(elem)), + string_adaptor::construct_from_utf8(reinterpret_cast(fullname)), + typeStr, + *defType, + string_adaptor::construct_from_utf8(reinterpret_cast(defaultValue))); +} // SAXattributeDecl + +template +string_type libxml2_wrapper::stringAttrEnum(xmlEnumerationPtr tree, bool leadingSpace) const +{ + std::ostringstream os; + if(leadingSpace) + os << " "; + os << "("; + while(tree) + { + os << tree->name; + tree = tree->next; + if(tree) + os << " | "; + } // while + os << ")"; + + return string_adaptor::construct_from_utf8(os.str().c_str()); +} // stringAttrEnum + +template +void libxml2_wrapper::SAXentityDecl(const xmlChar *name, int type, const xmlChar *publicId, const xmlChar *systemId, xmlChar *content) +{ + if(isInCData_) + SAXendCdataSection(); + + if(!declHandler_) + return; + + switch(type) + { + case 1: // internal + declHandler_->internalEntityDecl(string_adaptor::construct_from_utf8(reinterpret_cast(name)), + string_adaptor::construct_from_utf8(reinterpret_cast(content))); + break; 
+ case 2: // external + declHandler_->externalEntityDecl(string_adaptor::construct_from_utf8(reinterpret_cast(name)), + string_adaptor::construct_from_utf8(reinterpret_cast(publicId)), + string_adaptor::construct_from_utf8(reinterpret_cast(systemId))); + break; + } // switch +} // SAXentityDecl + +template +xmlParserInputPtr libxml2_wrapper::SAXresolveEntity(const xmlChar* publicId, const xmlChar* systemId) +{ + if(!entityResolver_) + return xmlLoadExternalEntity(reinterpret_cast(systemId), + reinterpret_cast(publicId), + context_); + return 0; +} // SAXresolveEntity + + +} // namespace SAX +} // namespace Arabica + +#endif diff --git a/arabica/include/Taggle/Taggle.hpp b/arabica/include/Taggle/Taggle.hpp new file mode 100644 index 000000000..070e3a6e3 --- /dev/null +++ b/arabica/include/Taggle/Taggle.hpp @@ -0,0 +1,13 @@ +#ifndef ARABICA_TAGGLE_TAGGLE_HPP +#define ARABICA_TAGGLE_TAGGLE_HPP + +#include "impl/ScanHandler.hpp" +#include "impl/ElementType.hpp" +#include "impl/Element.hpp" +#include "impl/Schema.hpp" +#include "impl/html/HTMLModels.hpp" +#include "impl/html/HTMLScanner.hpp" +#include "impl/html/HTMLSchema.hpp" +#include "impl/Parser.hpp" + +#endif diff --git a/arabica/include/Taggle/impl/Element.hpp b/arabica/include/Taggle/impl/Element.hpp new file mode 100755 index 000000000..7b28ec6b8 --- /dev/null +++ b/arabica/include/Taggle/impl/Element.hpp @@ -0,0 +1,363 @@ +#ifndef ARABICA_SAX_TAGSOUP_ELEMENT_HPP +#define ARABICA_SAX_TAGSOUP_ELEMENT_HPP + +#include +#include +#include "ElementType.hpp" + +namespace Arabica +{ + +namespace SAX +{ + +/** + The internal representation of an actual element (not an element type). + An Element has an element type, attributes, and a successor Element + for use in constructing stacks and queues of Elements. 
+  @see ElementType
+  @see AttributesImpl
+
+  Based on code from John Cowan's super TagSoup package
+*/
+
+class Element_impl;
+
+class Element
+{
+public:
+  static const Element Null;
+
+  Element(ElementType& type, bool defaultAttributes);
+  Element(const Element& rhs);
+  ~Element();
+
+  Element& operator=(const Element& rhs);
+  bool operator==(const Element& rhs) const;
+  bool operator!=(const Element& rhs) const;
+
+  /**
+    Return the element type.
+    @return The element type.
+  */
+  const ElementType& type() const;
+
+  /**
+    Return the attributes as an AttributesImpl object.
+    @return The attributes
+    @see AttributesImpl
+  */
+  const AttributesImpl& atts() const;
+
+  /**
+    return the next element in the element stack or queue
+    @return the next element
+  */
+  Element next() const;
+
+  /**
+    Change the next element in an element stack or queue.
+    @param next The new next element
+  */
+  void setNext(const Element& next);
+
+  /**
+    Return the name of the element's type.
+    Convenience method.
+    @return the element type name
+  */
+  std::string name() const;
+
+
+  /**
+    Return the namespace name of the element's type.
+    Convenience method.
+    @return The element type namespace name
+  */
+  std::string namespaceName() const;
+
+  /**
+    Return the local name of the element's type.
+    Convenience method.
+    @return The element type local name
+  */
+  std::string localName() const;
+
+  /**
+    Return the content model vector of the element's type.
+    Convenience method.
+    @return The content model vector
+  */
+  int model() const;
+
+
+  /**
+    Return the member-of vector of the element's type.
+    Convenience method.
+    @return The member-of vector
+  */
+  int memberOf() const;
+
+  /**
+    Return the flags vector of the element's type.
+    Convenience method.
+    @return The flags vector
+  */
+  int flags() const;
+
+  /**
+    Return the parent element type of the element's type
+    Convenience method
+    @return the parent element type
+  */
+  ElementType& parent() const;
+
+  /**
+    Return true if the type of this element can contain the type of
+    another element.
+    Convenience method.
+    @param other The other element
+  */
+  bool canContain(const Element& other) const;
+
+  /**
+    Set an attribute and its value into this element.
+    @param name The attribute name (Qname)
+    @param type The attribute type
+    @param value The attribute value
+  */
+  void setAttribute(const std::string& name, const std::string& type, const std::string& value);
+
+  /**
+    Make this element anonymous.
+    Remove any id or name attribute present
+    in the element's attributes.
+  */
+  void anonymize();
+
+  /**
+    Clean the attributes of this element.
+    Attributes with null name (the name was ill-formed)
+    or null value (the attribute was present in the element type but
+    not in this actual element) are removed.
+  */
+  void clean();
+
+  /**
+    Force this element to preclosed status, meaning that an end-tag has
+    been seen but the element cannot yet be closed for structural reasons.
+  */
+  void preclose();
+
+  /**
+    Return true if this element has been preclosed.
+  */
+  bool isPreclosed() const;
+
+private:
+  // A handle with no body; only Element_impl creates one (for next_).
+  Element() : impl_(0) { }
+  bool is_null() const { return impl_ == 0; }
+
+  Element_impl* impl_;   // shared body, 0 for a null handle
+
+  friend class Element_impl;
+}; // class Element
+
+/**
+  Shared, reference-counted body behind one or more Element handles.
+  The count starts at 1 and the object deletes itself in remove_ref()
+  once the count reaches zero.
+*/
+class Element_impl
+{
+private:
+  ElementType* type_;     // type of element
+  AttributesImpl atts_;   // attributes of element
+  Element next_;          // successor of element
+  bool preclosed_;        // this element has been preclosed
+  int refCount_;
+
+public:
+  /**
+    Return an Element from a specified ElementType.
+    @param type The element type of the newly constructed element
+    @param defaultAttributes True if default attributes are wanted
+  */
+  Element_impl(ElementType& type, bool defaultAttributes) :
+    type_(&type),
+    atts_(),
+    next_(),
+    preclosed_(false),
+    refCount_(1)
+  {
+    if (defaultAttributes)
+      atts_ = type.atts();
+  } // Element_impl
+
+  const ElementType& type() const { return *type_; }
+  const AttributesImpl& atts() const { return atts_; }
+
+  Element next() const
+  {
+    if(next_.is_null())
+      return Element::Null;
+    return next_;
+  } // next()
+  void setNext(const Element& next) { next_ = next; }
+
+  std::string name() const { return type_->name(); }
+  std::string namespaceName() const { return type_->namespaceName(); }
+  std::string localName() const { return type_->localName(); }
+  int model() const { return type_->model(); }
+  int memberOf() const { return type_->memberOf(); }
+  int flags() const { return type_->flags(); }
+  ElementType& parent() const { return type_->parent(); }
+  bool canContain(const Element_impl* const other) const { return type_->canContain(*(other->type_)); }
+
+  void setAttribute(const std::string& name, const std::string& type, const std::string& value)
+  {
+    type_->setAttribute(atts_, name, type, value);
+  } // setAttribute
+
+  void anonymize()
+  {
+    // walk backwards so removing an entry does not disturb the indices still to visit
+    for (int i = atts_.getLength() - 1; i >= 0; i--)
+    {
+      if((atts_.getType(i) == "ID") || (atts_.getQName(i) == "name"))
+        atts_.removeAttribute(i);
+    } // for ...
+  } // anonymize
+
+  void clean()
+  {
+    // walk backwards so removing an entry does not disturb the indices still to visit
+    for (int i = atts_.getLength() - 1; i >= 0; i--)
+    {
+      const std::string& name = atts_.getLocalName(i);
+      // an empty value or an empty local name marks the attribute for removal
+      if (atts_.getValue(i) == "" || name.empty())
+        atts_.removeAttribute(i);
+    } // for ...
+  } // clean
+
+  void preclose()
+  {
+    preclosed_ = true;
+  } // preclose
+
+  bool isPreclosed() const
+  {
+    return preclosed_;
+  } // isPreclosed
+
+  void add_ref()
+  {
+    ++refCount_;
+  } // add_ref
+
+  void remove_ref()
+  {
+    --refCount_;
+    if(refCount_ == 0)
+      delete this;
+  } // remove_ref
+
+private:
+  Element_impl();
+
+  ~Element_impl()
+  {
+  } // ~Element_impl
+
+
+  Element_impl(const Element_impl& rhs);
+  Element_impl& operator=(const Element_impl& rhs);
+  bool operator==(const Element_impl& rhs) const;
+  bool operator!=(const Element_impl& rhs) const;
+}; // class Element_impl
+
+const Element Element::Null = Element(ElementType::Null, false);
+
+//////////////////////////////////////////////
+Element::Element(const Element& rhs) :
+  impl_(rhs.impl_)
+{
+  // rhs may be a null handle, which the destructor and operator= already
+  // allow for; only count a reference when there is a body to share
+  if(impl_)
+    impl_->add_ref();
+} // Element
+
+Element::Element(ElementType& type, bool defaultAttributes) :
+  impl_(new Element_impl(type, defaultAttributes))
+{
+} // Element
+
+Element::~Element()
+{
+  if(impl_)
+    impl_->remove_ref();
+} // ~Element
+
+Element& Element::operator=(const Element& rhs)
+{
+  if(impl_ == rhs.impl_)
+    return *this;
+
+  // Take the new reference before dropping the old one: releasing the old
+  // body destroys its next_ handle as well, so this stays safe even if rhs
+  // is stored somewhere inside the chain being released.
+  Element_impl* const old = impl_;
+  impl_ = rhs.impl_;
+  if(impl_)
+    impl_->add_ref();
+  if(old)
+    old->remove_ref();
+  return *this;
+} // operator=
+
+bool Element::operator==(const Element& rhs) const
+{
+  return impl_ == rhs.impl_;
+} // operator==
+
+bool Element::operator!=(const Element& rhs) const
+{
+  return !(operator==(rhs));
+} // operator!=
+
+const ElementType& Element::type() const
+{
+  return impl_->type();
+} // type
+
+const AttributesImpl& Element::atts() const
+{
+  return impl_->atts();
+} // atts
+
+Element Element::next() const
+{
+  return impl_->next();
+} // next
+
+void Element::setNext(const Element& next)
+{
+  impl_->setNext(next);
+} // setNext
+
+std::string Element::name() const { return impl_->name(); }
+std::string Element::namespaceName() const { return impl_->namespaceName(); }
+std::string Element::localName() const { return impl_->localName(); }
+
+int Element::model() const { return impl_->model(); }
+int Element::memberOf() const { return impl_->memberOf(); }
+int Element::flags() const { return impl_->flags(); }
+
+ElementType& Element::parent() const { return impl_->parent(); }
+bool Element::canContain(const Element& other) const { return impl_->canContain(other.impl_); }
+
+void Element::setAttribute(const std::string& name, const std::string& type, const std::string& value)
+{
+  impl_->setAttribute(name, type, value);
+} // setAttribute
+
+void Element::anonymize() { impl_->anonymize(); }
+void Element::clean() { impl_->clean(); }
+void Element::preclose() { impl_->preclose(); }
+bool Element::isPreclosed() const { return impl_->isPreclosed(); }
+
+} // namespace SAX
+
+} // namespace Arabica
+
+#endif
diff --git a/arabica/include/Taggle/impl/ElementType.hpp b/arabica/include/Taggle/impl/ElementType.hpp
new file mode 100755
index 000000000..cff6e684f
--- /dev/null
+++ b/arabica/include/Taggle/impl/ElementType.hpp
@@ -0,0 +1,333 @@
+#ifndef ARABICA_SAX_ELEMENT_TYPE_HPP
+#define ARABICA_SAX_ELEMENT_TYPE_HPP
+
+#include
+#include
+#include "Schema.hpp"
+
+namespace Arabica
+{
+
+namespace SAX
+{
+
+
+/**
+This class represents an element type in the schema.
+An element type has a name, a content model vector, a member-of vector,
+a flags vector, default attributes, and a schema to which it belongs. 
+
+Based on code from John Cowan's super TagSoup package
+@see Schema
+*/
+class ElementType
+{
+private:
+  std::string name_;       // element type name (Qname)
+  std::string namespace_;  // element type namespace name
+  std::string localName_;  // element type local name
+  int model_;              // bitmap: what the element contains
+  int memberOf_;           // bitmap: what element is contained in
+  int flags_;              // bitmap: element flags
+  AttributesImpl atts_;    // default attributes
+  ElementType* parent_;    // parent of this element type
+  Schema* schema_;         // schema to which this belongs
+
+public:
+  static ElementType Null;
+
+private:
+  ElementType() :
+    name_(""),
+    namespace_(""),
+    localName_(""),
+    model_(0),
+    memberOf_(0),
+    flags_(0),
+    atts_(),
+    parent_(0),
+    schema_(0)
+  {
+  } // ElementType
+
+  /**
+    Construct an ElementType:
+    but it's better to use Schema.element() instead.
+    The content model, member-of, and flags vectors are specified as ints.
+    @param name The element type name
+    @param model ORed-together bits representing the content models
+       allowed in the content of this element type
+    @param memberOf ORed-together bits representing the content models
+       to which this element type belongs
+    @param flags ORed-together bits representing the flags associated
+       with this element type
+    @param schema The schema with which this element type will be
+       associated
+  */
+  ElementType(const std::string& name, int model, int memberOf, int flags, Schema& schema) :
+    name_(name),
+    namespace_(),
+    localName_(),
+    model_(model),
+    memberOf_(memberOf),
+    flags_(flags),
+    parent_(0),
+    schema_(&schema)
+  {
+    namespace_ = namespaceName(name, false);
+    localName_ = localName(name);
+  } // ElementType
+
+  /**
+    Copy an ElementType, including its default attributes, so that a
+    copy-constructed type holds the same state operator= would give it.
+    @param rhs The element type to copy
+  */
+  ElementType(const ElementType& rhs) :
+    name_(rhs.name_),
+    namespace_(rhs.namespace_),
+    localName_(rhs.localName_),
+    model_(rhs.model_),
+    memberOf_(rhs.memberOf_),
+    flags_(rhs.flags_),
+    atts_(rhs.atts_),
+    parent_(rhs.parent_),
+    schema_(rhs.schema_)
+  {
+  } // ElementType
+
+  friend class SchemaImpl;
+
+public:
+  /**
+    Return a namespace name from a Qname.
+    The attribute flag tells us whether to return an empty namespace
+    name if there is no prefix, or use the schema default instead.
+    Note that the schema default is read through schema_, which is 0
+    for a default-constructed element type.
+    @param name The Qname
+    @param attribute True if name is an attribute name
+    @return The namespace name
+  **/
+  std::string namespaceName(const std::string& name, bool attribute) const
+  {
+    size_t colon = name.find(':');
+    if (colon == std::string::npos)
+      return attribute ? "" : schema_->getURI();
+
+    std::string prefix = name.substr(0, colon);
+    if (prefix == "xml")
+      return "http://www.w3.org/XML/1998/namespace";
+    else
+      return "urn:x-prefix:" + prefix;
+  } // namespaceName
+
+  /**
+    Return a local name from a Qname.
+    @param name The Qname
+    @return The local name
+  **/
+  std::string localName(const std::string& name) const
+  {
+    size_t colon = name.find(':');
+    if (colon == std::string::npos)
+      return name;
+    else
+      return name.substr(colon+1);
+  } // localName
+
+  /**
+    Returns the name of this element type.
+    @return The name of the element type
+  */
+  std::string name() const { return name_; }
+
+  /**
+    Returns the namespace name of this element type.
+    @return The namespace name of the element type
+  */
+  std::string namespaceName() const { return namespace_; }
+
+  /**
+    Returns the local name of this element type.
+    @return The local name of the element type
+  */
+  std::string localName() const { return localName_; }
+
+  /**
+    Returns the content models of this element type.
+    @return The content models of this element type as a vector of bits
+  */
+  int model() const { return model_; }
+
+  /**
+    Returns the content models to which this element type belongs.
+    @return The content models to which this element type belongs as a
+    vector of bits
+  */
+  int memberOf() const { return memberOf_; }
+
+  /**
+    Returns the flags associated with this element type.
+    @return The flags associated with this element type as a vector of bits
+  */
+  int flags() const { return flags_; }
+
+  /**
+    Returns the default attributes associated with this element type.
+    Attributes of type CDATA that don't have default values are
+    typically not included.  Other attributes without default values
+    have an internal value of null.
+    The attributes are returned by const reference; use setAttribute()
+    to change them.
+  */
+  const AttributesImpl& atts() const { return atts_; }
+
+  /**
+    Returns the parent element type of this element type.
+    parent_ is dereferenced unconditionally and is 0 until setParent()
+    has been called (or a type with a parent has been copied in).
+    @return The parent element type
+  */
+  ElementType& parent() const
+  {
+    return *parent_;
+  } // parent
+
+  /**
+    Returns the schema which this element type is associated with.
+    schema_ is dereferenced unconditionally and is 0 for a
+    default-constructed element type.
+    @return The schema
+  */
+  Schema& schema() const
+  {
+    return *schema_;
+  } // schema
+
+
+  /**
+    Returns true if this element type can contain another element type.
+    That is, if any of the models in this element's model vector
+    match any of the models in the other element type's member-of
+    vector.
+    @param other The other element type
+  */
+  bool canContain(const ElementType& other) const
+  {
+    return (model_ & other.memberOf_) != 0;
+  } // canContain
+
+
+  /**
+    Sets an attribute and its value into an AttributesImpl object.
+    Attempts to set a namespace declaration are ignored.
+    @param atts The AttributesImpl object
+    @param name The name (Qname) of the attribute
+    @param type The type of the attribute
+    @param value The value of the attribute
+  */
+  void setAttribute(AttributesImpl& atts,
+                    const std::string& name,
+                    const std::string& type,
+                    const std::string& value)
+  {
+    if (name == "xmlns" || name.find("xmlns:") == 0)
+    {
+      return;
+    }
+
+    std::string namespaceN = namespaceName(name, true);
+    std::string localN = localName(name);
+    std::string actualType = type;
+    std::string actualValue = value;
+
+    int i = atts.getIndex(name);
+    if (i == -1)
+    {
+      // new attribute: an empty type defaults to CDATA
+      if (actualType == "")
+        actualType = "CDATA";
+      if (actualType != "CDATA")
+        actualValue = Arabica::text::normalize_whitespace >(value);
+      atts.addAttribute(namespaceN, localN, name, actualType, actualValue);
+    }
+    else
+    {
+      // existing attribute: an empty type keeps the type already recorded
+      if (actualType == "")
+        actualType = atts.getType(i);
+      if (actualType != ("CDATA"))
+        actualValue = Arabica::text::normalize_whitespace >(value);
+      atts.setAttribute(i, namespaceN, localN, name, actualType, actualValue);
+    }
+  } // setAttribute
+
+  /**
+    Sets an attribute and its value into this element type.
+    @param name The name of the attribute
+    @param type The type of the attribute
+    @param value The value of the attribute
+  */
+  void setAttribute(const std::string& name, const std::string& type, const std::string& value)
+  {
+    setAttribute(atts_, name, type, value);
+  } // setAttribute
+
+  /**
+    Sets the models of this element type.
+    @param model The content models of this element type as a vector of bits
+  */
+  void setModel(int model)
+  {
+    model_ = model;
+  } // setModel
+
+  /**
+    Sets the content models to which this element type belongs.
+    @param memberOf The content models to which this element type belongs as a vector of bits
+  */
+  void setMemberOf(int memberOf)
+  {
+    memberOf_ = memberOf;
+  } // setMemberOf
+
+  /**
+    Sets the flags of this element type.
+    @param flags The flags associated with this element type as a vector of bits
+  */
+  void setFlags(int flags)
+  {
+    flags_ = flags;
+  } // setFlags
+
+  /**
+    Sets the parent element type of this element type.
+    @param parent The parent element type
+  */
+  void setParent(ElementType& parent)
+  {
+    parent_ = &parent;
+  } // setParent
+
+  /**
+    Compares every member except the default attributes.
+  */
+  bool operator==(const ElementType& rhs) const
+  {
+    return (name_ == rhs.name_) &&
+           (namespace_ == rhs.namespace_) &&
+           (localName_ == rhs.localName_) &&
+           (model_ == rhs.model_) &&
+           (memberOf_ == rhs.memberOf_) &&
+           (flags_ == rhs.flags_) &&
+           (parent_ == rhs.parent_) &&
+           (schema_ == rhs.schema_);
+  } // operator ==
+
+  ElementType& operator=(const ElementType& rhs)
+  {
+    name_ = rhs.name_;
+    namespace_ = rhs.namespace_;
+    localName_ = rhs.localName_;
+    model_ = rhs.model_;
+    memberOf_ = rhs.memberOf_;
+    flags_ = rhs.flags_;
+    atts_ = rhs.atts_;
+    parent_ = rhs.parent_;
+    schema_ = rhs.schema_;
+
+    return *this;
+  } // operator=
+}; // class ElementType
+
+ElementType ElementType::Null;
+
+} // namespace SAX
+
+} // namespace Arabica
+
+#endif
diff --git a/arabica/include/Taggle/impl/Parser.hpp b/arabica/include/Taggle/impl/Parser.hpp
new file mode 100644
index 000000000..a6630540d
--- /dev/null
+++ b/arabica/include/Taggle/impl/Parser.hpp
@@ -0,0 +1,1389 @@
+#ifndef ARABICA_SAX_TAGGLE_PARSER_HPP
+#define ARABICA_SAX_TAGGLE_PARSER_HPP
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ScanHandler.hpp"
+
+namespace Arabica
+{
+
+namespace SAX
+{
+
+/**
+The Taggle SAX parser class. 
+ +Based on code from John Cowan's super TagSoup package +**/ +template > +class Taggle : + public XMLReaderInterface, + private DefaultHandler, + private ScanHandler +{ +public: + typedef XMLReaderInterface XMLReaderT; + typedef typename XMLReaderT::string_adaptor string_adaptor; + typedef ContentHandler ContentHandlerT; + typedef LexicalHandler LexicalHandlerT; + typedef DeclHandler DeclHandlerT; + typedef DTDHandler DTDHandlerT; + typedef ErrorHandler ErrorHandlerT; + typedef EntityResolver EntityResolverT; + typedef InputSource InputSourceT; + typedef Locator LocatorT; + + + /** + A value of "true" indicates namespace URIs and unprefixed local + names for element and attribute names will be available. + **/ + static const string_type namespacesFeature; + + /** + A value of "true" indicates that XML qualified names (with prefixes) + and attributes (including xmlns* attributes) will be available. + We don't support this value. + **/ + static const string_type namespacePrefixesFeature; + + /** + Reports whether this parser processes external general entities + (it doesn't). + **/ + static const string_type externalGeneralEntitiesFeature; + + /** + Reports whether this parser processes external parameter entities + (it doesn't). + **/ + static const string_type externalParameterEntitiesFeature; + + /** + May be examined only during a parse, after the startDocument() + callback has been completed; read-only. The value is true if + the document specified standalone="yes" in its XML declaration, + and otherwise is false. (It's always false.) + **/ + static const string_type isStandaloneFeature; + + /** + A value of "true" indicates that the LexicalHandler will report + the beginning and end of parameter entities (it won't). + **/ + static const string_type lexicalHandlerParameterEntitiesFeature; + + /** + A value of "true" indicates that system IDs in declarations will + be absolutized (relative to their base URIs) before reporting. 
+ (This returns true but doesn't actually do anything.) + **/ + static const string_type resolveDTDURIsFeature; + + /** + Has a value of "true" if all XML names (for elements, + prefixes, attributes, entities, notations, and local + names), as well as Namespace URIs, will have been interned + using java.lang.String.intern. This supports fast testing of + equality/inequality against string constants, rather than forcing + slower calls to String.equals(). (We always intern.) + **/ + static const string_type stringInterningFeature; + + /** + Returns "true" if the Attributes objects passed by this + parser in ContentHandler.startElement() implement the + org.xml.sax.ext.Attributes2 interface. (They don't.) + **/ + static const string_type useAttributes2Feature; + + /** + Returns "true" if the Locator objects passed by this parser + in ContentHandler.setDocumentLocator() implement the + org.xml.sax.ext.Locator2 interface. (They don't.) + **/ + static const string_type useLocator2Feature; + + /** + Returns "true" if, when setEntityResolver is given an object + implementing the org.xml.sax.ext.EntityResolver2 interface, + those new methods will be used. (They won't be.) + **/ + static const string_type useEntityResolver2Feature; + + /** + Controls whether the parser is reporting all validity errors + (We don't report any validity errors.) + **/ + static const string_type validationFeature; + + /** + Controls whether the parser reports Unicode normalization + errors as described in section 2.13 and Appendix B of the XML + 1.1 Recommendation. (We don't normalize.) + **/ + static const string_type unicodeNormalizationCheckingFeature; + + /** + Controls whether, when the namespace-prefixes feature is set, + the parser treats namespace declaration attributes as being in + the http://www.w3.org/2000/xmlns/ namespace. (It doesn't.) + **/ + static const string_type xmlnsURIsFeature; + + /** + Returns "true" if the parser supports both XML 1.1 and XML 1.0. + (Always false.) 
+ **/ + static const string_type XML11Feature; + + /** + A value of "true" indicates that the parser will ignore + unknown elements. + **/ + static const string_type ignoreBogonsFeature; + + /** + A value of "true" indicates that the parser will give unknown + elements a content model of EMPTY; a value of "false", a + content model of ANY. + **/ + static const string_type bogonsEmptyFeature; + + /** + A value of "true" indicates that the parser will allow unknown + elements to be the root element. + **/ + static const string_type rootBogonsFeature; + + /** + A value of "true" indicates that the parser will return default + attribute values for missing attributes that have default values. + **/ + static const string_type defaultAttributesFeature; + + /** + A value of "true" indicates that the parser will + translate colons into underscores in names. + **/ + static const string_type translateColonsFeature; + + /** + A value of "true" indicates that the parser will + attempt to restart the restartable elements. + **/ + static const string_type restartElementsFeature; + + /** + A value of "true" indicates that the parser will + transmit whitespace in element-only content via the SAX + ignorableWhitespace callback. Normally this is not done, + because HTML is an SGML application and SGML suppresses + such whitespace. + **/ + static const string_type ignorableWhitespaceFeature; + + /** + A value of "true" indicates that the parser will treat CDATA + elements specially. Normally true, since the input is by + default HTML. + **/ + static const string_type CDATAElementsFeature; + + /** + Used to see some syntax events that are essential in some + applications: comments, CDATA delimiters, selected general + entity inclusions, and the start and end of the DTD (and + declaration of document element name). The Object must implement + org.xml.sax.ext.LexicalHandler. + **/ + static const string_type lexicalHandlerProperty; + + /** + Specifies the Scanner object this Parser uses. 
+ **/ + static const string_type scannerProperty; + + /** + Specifies the Schema object this Parser uses. + **/ + static const string_type schemaProperty; + + /** + Specifies the AutoDetector (for encoding detection) this Parser uses. + **/ + static const string_type autoDetectorProperty; + +private: + // Default values for feature flags + static bool DEFAULT_NAMESPACES; + static bool DEFAULT_IGNORE_BOGONS; + static bool DEFAULT_BOGONS_EMPTY; + static bool DEFAULT_ROOT_BOGONS; + static bool DEFAULT_DEFAULT_ATTRIBUTES; + static bool DEFAULT_TRANSLATE_COLONS; + static bool DEFAULT_RESTART_ELEMENTS; + static bool DEFAULT_IGNORABLE_WHITESPACE; + static bool DEFAULT_CDATA_ELEMENTS; + + static const string_type legal; + + typedef std::map FeatureMapT; + + // XMLReader implementation + ContentHandlerT* contentHandler_; + LexicalHandlerT* lexicalHandler_; + DTDHandlerT* dtdHandler_; + ErrorHandlerT* errorHandler_; + EntityResolverT* entityResolver_; + Schema* schema_; + bool ownSchema_; + Scanner* scanner_; + bool ownScanner_; + FeatureMapT features_; + Element newElement_; + std::string attributeName_; + bool doctypeIsPresent_; + std::string doctypePublicId_; + std::string doctypeSystemId_; + std::string doctypeName_; + std::string piTarget_; + Element stack_; + Element saved_; + Element pcdata_; + int entity_; + + // Feature flags. 
+  bool namespaces;
+  bool ignoreBogons;
+  bool bogonsEmpty;
+  bool rootBogons;
+  bool defaultAttributes;
+  bool translateColons;
+  bool restartElements;
+  bool ignorableWhitespace;
+  bool CDATAElements;
+  bool virginStack;
+
+public:
+  Taggle() :
+    contentHandler_(0),
+    lexicalHandler_(0),
+    dtdHandler_(0),
+    errorHandler_(0),
+    entityResolver_(0),
+    schema_(0),
+    ownSchema_(false),
+    scanner_(0),
+    ownScanner_(false),
+    features_(initialFeatures()),
+    newElement_(Element::Null),
+    attributeName_(),
+    doctypeIsPresent_(false),
+    doctypePublicId_(),
+    doctypeSystemId_(),
+    doctypeName_(),
+    piTarget_(),
+    stack_(Element::Null),
+    saved_(Element::Null),
+    pcdata_(Element::Null),
+    entity_(0),
+    namespaces(DEFAULT_NAMESPACES),
+    ignoreBogons(DEFAULT_IGNORE_BOGONS),
+    bogonsEmpty(DEFAULT_BOGONS_EMPTY),
+    rootBogons(DEFAULT_ROOT_BOGONS),
+    defaultAttributes(DEFAULT_DEFAULT_ATTRIBUTES),
+    translateColons(DEFAULT_TRANSLATE_COLONS),
+    restartElements(DEFAULT_RESTART_ELEMENTS),
+    ignorableWhitespace(DEFAULT_IGNORABLE_WHITESPACE),
+    CDATAElements(DEFAULT_CDATA_ELEMENTS),
+    virginStack(true)
+  {
+    // the parser acts as its own handler for every callback until one is set
+    contentHandler_ = this;
+    lexicalHandler_ = this;
+    dtdHandler_ = this;
+    errorHandler_ = this;
+    entityResolver_ = this;
+  } // Taggle
+
+  ~Taggle()
+  {
+    if(ownSchema_)
+      delete schema_;
+    if(ownScanner_)
+      delete scanner_;
+  } // ~Taggle
+
+private:
+  /**
+    Builds the feature table a new parser starts with.
+    getFeature() and setFeature() throw SAXNotRecognizedException for any
+    name that has no entry in this table, so every declared feature
+    constant needs a line here.  Entries follow the declaration order of
+    the feature constants.
+    @return The initial feature name to value map
+  */
+  static FeatureMapT initialFeatures()
+  {
+    FeatureMapT features;
+    features[namespacesFeature] = DEFAULT_NAMESPACES;
+    features[namespacePrefixesFeature] = false;
+    features[externalGeneralEntitiesFeature] = false;
+    features[externalParameterEntitiesFeature] = false;
+    features[isStandaloneFeature] = false;
+    features[lexicalHandlerParameterEntitiesFeature] = false;
+    features[resolveDTDURIsFeature] = true;
+    features[stringInterningFeature] = true;
+    features[useAttributes2Feature] = false;
+    features[useLocator2Feature] = false;
+    features[useEntityResolver2Feature] = false;
+    features[validationFeature] = false;
+    features[unicodeNormalizationCheckingFeature] = false;
+    features[xmlnsURIsFeature] = false;
+    features[XML11Feature] = false;
+    features[ignoreBogonsFeature] = DEFAULT_IGNORE_BOGONS;
+    features[bogonsEmptyFeature] = DEFAULT_BOGONS_EMPTY;
+    features[rootBogonsFeature] = DEFAULT_ROOT_BOGONS;
+    features[defaultAttributesFeature] = DEFAULT_DEFAULT_ATTRIBUTES;
+    features[translateColonsFeature] = DEFAULT_TRANSLATE_COLONS;
+    features[restartElementsFeature] = DEFAULT_RESTART_ELEMENTS;
+    features[ignorableWhitespaceFeature] = DEFAULT_IGNORABLE_WHITESPACE;
+    features[CDATAElementsFeature] = DEFAULT_CDATA_ELEMENTS;
+    return features;
+  } // initialFeatures
+
+public:
+  ///////////////////////////////////////////////////
+  // XMLReader
+  /**
+    Looks up the current value of a feature.
+    @param name The feature name
+    @return The value stored for that feature
+    Throws SAXNotRecognizedException if the name is not in the feature table.
+  */
+  bool getFeature(const string_type& name) const
+  {
+    typename FeatureMapT::const_iterator b = features_.find(name);
+    if(b == features_.end())
+    {
+      throw SAXNotRecognizedException("Unknown feature " + string_adaptor::asStdString(name));
+    }
+    return b->second;
+  } // getFeature
+
+  /**
+    Stores a new value for a feature and, for the features that have one,
+    updates the matching flag member.
+    @param name The feature name
+    @param value The new value
+    Throws SAXNotRecognizedException if the name is not in the feature table.
+  */
+  void setFeature(const string_type& name, bool value)
+  {
+    typename FeatureMapT::iterator b = features_.find(name);
+    if(b == features_.end())
+    {
+      throw SAXNotRecognizedException("Unknown feature " + string_adaptor::asStdString(name));
+    }
+
+    // write through the iterator just found rather than looking the name up again
+    b->second = value;
+
+    if(name == namespacesFeature)
+      namespaces = value;
+    else if(name == ignoreBogonsFeature)
+      ignoreBogons = value;
+    else if(name == bogonsEmptyFeature)
+      bogonsEmpty = value;
+    else if(name == rootBogonsFeature)
+      rootBogons = value;
+    else if(name == defaultAttributesFeature)
+      defaultAttributes = value;
+    else if(name == translateColonsFeature)
+      translateColons = value;
+    else if(name == restartElementsFeature)
+      restartElements = value;
+    else if(name == ignorableWhitespaceFeature)
+      ignorableWhitespace = value;
+    else if(name == CDATAElementsFeature)
+      CDATAElements = value;
+  } // setFeature
+
+  typedef typename XMLReaderInterface::PropertyBase PropertyBaseT;
+  virtual std::auto_ptr doGetProperty(const 
string_type& /*name*/) + { + return std::auto_ptr(0); + } // doGetProperty + + virtual void doSetProperty(const string_type& /*name*/, std::auto_ptr /*value*/) + { + } // doSetProperty + + /* + Object getProperty (std::string name) + { + if(name.equals(lexicalHandlerProperty)) + { + return lexicalHandler_ == this ? null : lexicalHandler_; + } + else if(name.equals(scannerProperty)) + { + return scanner_; + } + else if(name.equals(schemaProperty)) + { + return schema_; + } + else if(name.equals(autoDetectorProperty)) + { + return theAutoDetector; + } + else + { + throw new SAXNotRecognizedException("Unknown property " + name); + } + } // getProperty + + void setProperty (std::string name, Object value) + { + if(name.equals(lexicalHandlerProperty)) + { + if(value == null) + { + lexicalHandler_ = this; + } + else if(value instanceof LexicalHandler) + { + lexicalHandler_ = (LexicalHandler)value; + } + else + { + throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler"); + } + } + else if(name.equals(scannerProperty)) + { + if(value instanceof Scanner) { + scanner_ = (Scanner)value; + } + else { + throw new SAXNotSupportedException("Your scanner is not a Scanner"); + } + } + else if(name.equals(schemaProperty)) { + if(value instanceof Schema) { + schema_ = (Schema)value; + } + else { + throw new SAXNotSupportedException("Your schema is not a Schema"); + } + } + else if(name.equals(autoDetectorProperty)) { + if(value instanceof AutoDetector) { + theAutoDetector = (AutoDetector)value; + } + else { + throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector"); + } + } + else { + throw new SAXNotRecognizedException("Unknown property " + name); + } + } +*/ + + virtual void setEntityResolver(EntityResolverT& resolver) + { + entityResolver_ = &resolver; + } // setEntityResolver + + virtual EntityResolverT* getEntityResolver() const + { + return (entityResolver_ == this) ? 
0 : entityResolver_; + } // getEntityResolver + + virtual void setDTDHandler(DTDHandlerT& handler) + { + dtdHandler_ = &handler; + } // setDTDHandler + + virtual DTDHandlerT* getDTDHandler() const + { + return (dtdHandler_ == this) ? 0 : dtdHandler_; + } // getDTDHandler + + virtual void setContentHandler(ContentHandlerT& handler) + { + contentHandler_ = &handler; + } // setContentHandler + + virtual ContentHandlerT* getContentHandler() const + { + return (contentHandler_ == this) ? 0 : contentHandler_; + } // getContentHandler + + virtual void setErrorHandler(ErrorHandlerT& handler) + { + errorHandler_ = &handler; + } // setErrorHandler + + virtual ErrorHandlerT* getErrorHandler() const + { + return (errorHandler_ == this) ? 0 : errorHandler_; + } // getErrorHandler + + virtual void setDeclHandler(DeclHandlerT& /*handler*/) + { + } // setDeclHandler + + virtual DeclHandlerT* getDeclHandler() const + { + return 0; + } // getDeclHandler + + virtual void setLexicalHandler(LexicalHandlerT& handler) + { + lexicalHandler_ = &handler; + } // setLexicalHandler + + virtual LexicalHandlerT* getLexicalHandler() const + { + return (lexicalHandler_ == this) ? 
0 : lexicalHandler_; + } // getLexicalHandler + + virtual void parse(InputSourceT& input) + { + setup(); + + InputSourceResolver is(input, string_adaptor()); + if(is.resolve() == 0) + { + reportError("Could not resolve XML document", true); + return; + } // if(is.resolver() == 0) + + contentHandler_->startDocument(); + scanner_->resetDocumentLocator(string_adaptor::asStdString(input.getPublicId()), string_adaptor::asStdString(input.getSystemId())); + + if(dynamic_cast(scanner_) != 0) + contentHandler_->setDocumentLocator(*(dynamic_cast(scanner_))); + + if(schema_->getURI() != "") + contentHandler_->startPrefixMapping(S(schema_->getPrefix()), + S(schema_->getURI())); + scanner_->scan(*is.resolve(), *this); + } // parse + +private: + // Sets up instance variables that haven't been set by setFeature + void setup() + { + if(schema_ && ownSchema_) + { + delete schema_; + schema_ = 0; + } // if ... + if(schema_ == 0) + { + schema_ = new HTMLSchema(); + ownSchema_ = true; + } // if ... + + if(scanner_ && ownScanner_) + { + delete scanner_; + scanner_ = 0; + } // if ... + if(scanner_ == 0) + { + scanner_ = new HTMLScanner(); + ownScanner_ = true; + } // if ... 
+ + stack_ = Element(schema_->getElementType(""), defaultAttributes); + pcdata_ = Element(schema_->getElementType(""), defaultAttributes); + + newElement_ = Element::Null; + attributeName_ = ""; + piTarget_ = ""; + saved_ = Element::Null; + entity_ = 0; + virginStack = true; + doctypeName_ = doctypePublicId_ = doctypeSystemId_ = ""; + } // setup + + /////////////////////////////////////////////////////// + // ScanHandler implementation + virtual void adup(const std::string& /*buff*/) + { + // std::cerr << "adup(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ == Element::Null || attributeName_ == "") + return; + newElement_.setAttribute(attributeName_, "", attributeName_); + attributeName_ = ""; + } // adup + + virtual void aname(const std::string& buff) + { + // std::cerr << "aname(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ == Element::Null) + return; + // Currently we don't rely on Schema to canonicalize + // attribute names. + attributeName_ = lower_case(makeName(buff)); + } // aname + + virtual void aval(const std::string& buff) + { + // std::cerr << "aval(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ == Element::Null || attributeName_ == "") + return; + std::string value = expandEntities(buff); + newElement_.setAttribute(attributeName_, "", value); + attributeName_ = ""; + } // aval + + // Expand entity references in attribute values selectively. + // Currently we expand a reference iff it is properly terminated + // with a semicolon. 
+ std::string expandEntities(std::string src) + { + size_t refStart = std::string::npos; + std::string dst; + for(std::string::const_iterator i = src.begin(), ie = src.end(); i != ie; ++i) + { + char ch = *i; + dst.push_back(ch); + if(ch == '&' && refStart == std::string::npos) + { + // start of a ref excluding & + refStart = dst.length(); + } + else if(refStart == std::string::npos) + { + // not in a ref + } + else if(Arabica::XML::is_letter_or_digit(ch) || ch == '#') + { + // valid entity char + } + else if(ch == ';') + { + // properly terminated ref + int ent = lookupEntity(dst.substr(refStart, dst.size() - refStart - 1)); + if(ent > 0xFFFF) + { + ent -= 0x10000; + dst[refStart - 1] = (char)((ent>>10) + 0xD800); + dst[refStart] = (char)((ent&0x3FF) + 0xDC00); + dst.erase(refStart + 1); + } + else if(ent != 0) + { + dst[refStart - 1] = (char)ent; + dst.erase(refStart); + } + refStart = std::string::npos; + } + else + { + // improperly terminated ref + refStart = std::string::npos; + } // if ... + } // for ... + return std::string(dst, 0, dst.size()); + } // expandEntities + + virtual void entity(const std::string& buff) + { + entity_ = lookupEntity(buff); + } // entity + + // Process numeric character references, + // deferring to the schema for named ones. 
+ int lookupEntity(const std::string& buff) + { + int result = 0; + if(buff.length() < 1) + return result; + + if(buff[0] == '#') + { + const char* b = buff.c_str(); + char* end; + if(buff.length() > 1 && (buff[1] == 'x' || buff[1] == 'X')) + return strtol(b + 2, &end, 16); + return strtol(b + 1, &end, 10); + } + return schema_->getEntity(buff); + } // lookupEntity + + virtual void eof(const std::string& /*buff*/) + { + if(virginStack) + rectify(pcdata_); + while (stack_.next() != Element::Null) + { + pop(); + } + if(schema_->getURI() != "") + contentHandler_->endPrefixMapping(S(schema_->getPrefix())); + contentHandler_->endDocument(); + } // eof + + virtual void etag(const std::string& buff) + { + // std::cerr << "etag(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(etag_cdata(buff)) + return; + etag_basic(buff); + } // etag + + bool etag_cdata(const std::string& buff) + { + std::string currentName = stack_.name(); + // If this is a CDATA element and the tag doesn't match, + // or isn't properly formed (junk after the name), + // restart CDATA mode and process the tag as characters. + if(CDATAElements && (stack_.flags() & Schema::F_CDATA) != 0) + { + bool realTag = (buff.length() == currentName.length()); + if(realTag) + { + std::string buffl = lower_case(buff); + std::string currentl = lower_case(currentName); + for (size_t i = 0; i < buffl.length(); ++i) + { + if(buffl[i] != currentl[i]) + { + realTag = false; + break; + } // if ... + } // for ... + } // if ... + if(!realTag) + { + contentHandler_->characters(S("characters(S(buff)); + contentHandler_->characters(S(">")); + scanner_->startCDATA(); + return true; + } // if ... + } // if ... 
+ return false; + } // etag_cdata + + void etag_basic(const std::string& buff) + { + newElement_ = Element::Null; + std::string name; + if(!buff.empty()) + { + // Canonicalize case of name + name = makeName(buff); + ElementType& type = schema_->getElementType(name); + if(type == ElementType::Null) + return; // mysterious end-tag + name = type.name(); + } + else + { + name = stack_.name(); + } + + Element sp = Element::Null; + bool inNoforce = false; + for (sp = stack_; sp != Element::Null; sp = sp.next()) + { + if(sp.name() == name) + break; + if((sp.flags() & Schema::F_NOFORCE) != 0) + inNoforce = true; + } // for ... + + if(sp == Element::Null) + return; // Ignore unknown etags + if(sp.next() == Element::Null || sp.next().next() == Element::Null) + return; + if(inNoforce) + { // inside an F_NOFORCE element? + sp.preclose(); // preclose the matching element + } + else + { // restartably pop everything above us + while (stack_ != sp) + restartablyPop(); + pop(); + } + // pop any preclosed elements now at the top + while (stack_.isPreclosed()) + pop(); + restart(Element::Null); + } // etag_basic + + // Push restartables on the stack if possible + // e is the next element to be started, if we know what it is + void restart(Element e) + { + while (saved_ != Element::Null && stack_.canContain(saved_) && + (e == Element::Null || saved_.canContain(e))) + { + Element next = saved_.next(); + push(saved_); + saved_ = next; + } // while ... 
+ } // restart + + // Pop the stack irrevocably + void pop() + { + if(stack_ == Element::Null) + return; // empty stack + std::string name = stack_.name(); + std::string localName = stack_.localName(); + std::string namespaceName = stack_.namespaceName(); + std::string prefix = prefixOf(name); + + if(!namespaces) + namespaceName = localName = ""; + contentHandler_->endElement(S(namespaceName), + S(localName), + S(name)); + if(foreign(prefix, namespaceName)) + contentHandler_->endPrefixMapping(S(prefix)); + + const Attributes& atts = stack_.atts(); + for (int i = atts.getLength() - 1; i >= 0; i--) + { + std::string attNamespace = atts.getURI(i); + std::string attPrefix = prefixOf(atts.getQName(i)); + if(foreign(attPrefix, attNamespace)) + contentHandler_->endPrefixMapping(S(attPrefix)); + } // for ... + stack_ = stack_.next(); + } // pop + + // Pop the stack restartably + void restartablyPop() + { + Element popped = stack_; + pop(); + if(restartElements && (popped.flags() & Schema::F_RESTART) != 0) + { + popped.anonymize(); + popped.setNext(saved_); + saved_ = popped; + } // if ... 
+ } // restartablyPop + + // Push element onto stack + void push(Element e) + { + std::string name = e.name(); + std::string localName = e.localName(); + std::string namespaceName = e.namespaceName(); + std::string prefix = prefixOf(name); + + e.clean(); + if(!namespaces) + namespaceName = localName = ""; + if(virginStack && (lower_case(localName) == lower_case(doctypeName_))) + entityResolver_->resolveEntity(S(doctypePublicId_), S(doctypeSystemId_)); + if(foreign(prefix, namespaceName)) + contentHandler_->startPrefixMapping(S(prefix), S(namespaceName)); + + AttributesImpl atts; + int len = e.atts().getLength(); + for (int i = 0; i != len; ++i) + { + std::string attNamespace = e.atts().getURI(i); + std::string attPrefix = prefixOf(e.atts().getQName(i)); + if(foreign(attPrefix, attNamespace)) + contentHandler_->startPrefixMapping(S(attPrefix), S(attNamespace)); + + atts.addAttribute(S(e.atts().getURI(i)), + S(e.atts().getLocalName(i)), + S(e.atts().getQName(i)), + S(e.atts().getType(i)), + S(e.atts().getValue(i))); + } // for ... + contentHandler_->startElement(S(namespaceName), S(localName), S(name), atts); + + e.setNext(stack_); + stack_ = e; + virginStack = false; + if(CDATAElements && (stack_.flags() & Schema::F_CDATA) != 0) + scanner_->startCDATA(); + } // push + + // Get the prefix from a QName + std::string prefixOf(std::string name) + { + size_t i = name.find(':'); + std::string prefix = ""; + if(i != std::string::npos) + prefix = name.substr(0, i); + return prefix; + } // prefixOf + + // Return true if we have a foreign name + bool foreign(std::string prefix, std::string namespaceName) + { + bool foreign = !((prefix == "") || (namespaceName == "") || (namespaceName == schema_->getURI())); + return foreign; + } // foreign + + /** + * Parsing the complete XML Document Type Definition is way too complex, + * but for many simple cases we can extract something useful from it. 
+ * + * doctypedecl ::= '' + * DeclSep ::= PEReference | S + * intSubset ::= (markupdecl | DeclSep)* + * markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment + * ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral + */ + + virtual void decl(const std::string& buff) + { + // std::cerr << "decl(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + std::string name; + std::string systemid; + std::string publicid; + std::vector v = split(buff); + if((v.size() > 0) && ("DOCTYPE" == v[0])) + { + if(doctypeIsPresent_) + return; // one doctype only! + doctypeIsPresent_ = true; + if(v.size() > 1) + { + name = v[1]; + if(v.size()>3 && "SYSTEM" == v[2]) + { + systemid = v[3]; + } + else if(v.size() > 3 && "PUBLIC" == v[2]) + { + publicid = v[3]; + if(v.size() > 4) + { + systemid = v[4]; + } + else + { + systemid = ""; + } + } + } + } + publicid = trimquotes(publicid); + systemid = trimquotes(systemid); + if(name != "") + { + publicid = cleanPublicid(publicid); + lexicalHandler_->startDTD(S(name), S(publicid), S(systemid)); + lexicalHandler_->endDTD(); + doctypeName_ = name; + doctypePublicId_ = publicid; + if(dynamic_cast(scanner_)) + { // Must resolve systemid + doctypeSystemId_ = string_adaptor::asStdString(dynamic_cast(scanner_)->getSystemId()); + doctypeSystemId_ = Arabica::io::URI(doctypeSystemId_, systemid).as_string(); + } // if ... + } // if ... + } // decl + + // If the String is quoted, trim the quotes. + static std::string trimquotes(const std::string& in) + { + size_t length = in.length(); + if(length == 0) + return in; + char s = in[0]; + char e = in[length - 1]; + if(s == e && (s == '\'' || s == '"')) + return in.substr(1, length - 1); + return in; + } // trimquotes + + // Split the supplied String into words or phrases seperated by spaces. + // Recognises quotes around a phrase and doesn't split it. 
+ static std::vector split(const std::string& val) + { + std::vector splits; + + std::string v = Arabica::text::normalize_whitespace >(val); + if(v.length() == 0) + { + splits.push_back(v); + return splits; + } + + size_t s = 0; + size_t e = 0; + bool sq = false; // single quote + bool dq = false; // double quote + char lastc = 0; + size_t len = v.length(); + for(e=0; e < len; ++e) + { + char c = v[e]; + if(!dq && c == '\'' && lastc != '\\') + { + sq = !sq; + if(s == std::string::npos) + s = e; + } + else if(!sq && c == '\"' && lastc != '\\') + { + dq = !dq; + if(s == std::string::npos) + s = e; + } + else if(!sq && !dq) + { + if(Arabica::XML::is_space(c)) + { + splits.push_back(v.substr(s, e)); + s = std::string::npos; + } + else if(s == std::string::npos && c != ' ') + { + s = e; + } + } + lastc = c; + } // for ... + splits.push_back(v.substr(s, e)); + + return splits; + } // split + + // Replace junk in publicids with spaces + std::string cleanPublicid(const std::string& src) + { + std::string dst; + bool suppressSpace = true; + for(std::string::const_iterator i = src.begin(), ie = src.end(); i != ie; ++i) + { + if(legal.find(*i) != std::string::npos) + { + // legal but not whitespace + dst.push_back(*i); + suppressSpace = false; + } + else if(suppressSpace) + { // normalizable whitespace or junk + ; + } + else + { + dst.push_back(' '); + suppressSpace = true; + } + } + return dst; + } // cleanPublicId + + virtual void gi(const std::string& buff) + { + // std::cerr << "gi(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ != Element::Null) + return; + std::string name = makeName(buff); + if(name == "") + return; + ElementType* type = &schema_->getElementType(name); + if(*type == ElementType::Null) + { + // Suppress unknown elements if ignore-bogons is on + if(ignoreBogons) + return; + int bogonModel = bogonsEmpty ? Schema::M_EMPTY : Schema::M_ANY; + int bogonMemberOf = rootBogons ? 
Schema::M_ANY : (Schema::M_ANY & ~Schema::M_ROOT); + schema_->elementType(name, bogonModel, bogonMemberOf, 0); + if(!rootBogons) + schema_->parent(name, schema_->rootElementType().name()); + type = &schema_->getElementType(name); + } // if ... + + newElement_ = Element(*type, defaultAttributes); + } // gi + + virtual void cdsect(const std::string& buff) + { + // std::cerr << "cdsect(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + lexicalHandler_->startCDATA(); + pcdata(buff); + lexicalHandler_->endCDATA(); + } // cdsect + + virtual void pcdata(const std::string& buff) + { + // std::cerr << "pcdata(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(buff.empty()) + return; + bool allWhite = true; + for (std::string::const_iterator i = buff.begin(), ie = buff.end(); i != ie; ++i) + { + if(!Arabica::XML::is_space(*i)) + allWhite = false; + } // for ... + if(allWhite && !stack_.canContain(pcdata_)) + { + if(ignorableWhitespace) + contentHandler_->ignorableWhitespace(S(buff)); + } + else + { + rectify(pcdata_); + contentHandler_->characters(S(buff)); + } // if ... 
+ } // pcdata + + virtual void pitarget(const std::string& buff) + { + // std::cerr << "pitarget(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ != Element::Null) + return; + std::string name = makeName(buff); + size_t colon = name.find(':'); + while(colon != std::string::npos) + { + name[colon] = '_'; + colon = name.find(':'); + } // while + piTarget_ = name; + } // pitarget + + virtual void pi(const std::string& buff) + { + // std::cerr << "pi(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ != Element::Null || piTarget_ == "") + return; + if("xml" == lower_case(piTarget_)) + return; + size_t length = buff.length(); + if((length > 0) && (buff[length - 1] == '?')) + length--; // remove trailing ? + contentHandler_->processingInstruction(S(piTarget_), + S(buff.substr(0, length))); + piTarget_ = ""; + } // pi + + virtual void stagc(const std::string& buff) + { + // std::cerr << "stagc(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ == Element::Null) + return; + rectify(newElement_); + if(stack_.model() == Schema::M_EMPTY) + { + // Force an immediate end tag + etag_basic(buff); + } // if ... 
+ } // stagc + + virtual void stage(const std::string& buff) + { + // std::cerr << "stage(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + if(newElement_ == Element::Null) + return; + rectify(newElement_); + // Force an immediate end tag + etag_basic(buff); + } // stage + + // Comment buffer is twice the size of the output buffer + virtual void cmnt(const std::string& buff) + { + // std::cerr << "cmnt(\"" << buff.substr(offset, length) << "\", " << offset << ", " << length << ")" << std::endl; + lexicalHandler_->comment(S(buff)); + } // cmnt + + // Rectify the stack, pushing and popping as needed + // so that the argument can be safely pushed + void rectify(Element e) + { + Element sp = Element::Null; + while (true) + { + for (sp = stack_; sp != Element::Null; sp = sp.next()) + { + if(sp.canContain(e)) + break; + } // for ... + if(sp != Element::Null) + break; + ElementType& parentType = e.parent(); + if(parentType == ElementType::Null) + break; + Element parent = Element(parentType, defaultAttributes); + parent.setNext(e); + e = parent; + } // while ... + if(sp == Element::Null) + return; // don't know what to do + while (stack_ != sp) + { + if(stack_ == Element::Null || stack_.next() == Element::Null || stack_.next().next() == Element::Null) + break; + restartablyPop(); + } // while ... + while (e != Element::Null) + { + Element nexte = e.next(); + if(e.name() != "") + push(e); + e = nexte; + restart(e); + } // while ... + newElement_ = Element::Null; + } // rectify + + virtual int getEntity() + { + return entity_; + } // getEntity + + // Return the argument as a valid XML name + // This no longer lowercases the result: we depend on Schema to + // canonicalize case. 
+ std::string makeName(const std::string& buff) + { + std::string dst; + bool seenColon = false; + bool start = true; +// String src = new String(buff, offset, length); // DEBUG + for(std::string::const_iterator ch = buff.begin(), che = buff.end(); ch != che; ++ch) + { + if(Arabica::XML::is_letter(*ch) || *ch == '_') + { + start = false; + dst.push_back(*ch); + } + else if(Arabica::XML::is_digit(*ch) || *ch == '-' || *ch == '.') + { + if(start) + dst.push_back('_'); + start = false; + dst.push_back(*ch); + } + else if(*ch == ':' && !seenColon) + { + seenColon = true; + if(start) + dst.push_back('_'); + start = true; + dst.push_back(translateColons ? '_' : *ch); + } + } // for ... + size_t dstLength = dst.length(); + if(dstLength == 0 || dst[dstLength - 1] == ':') + dst.push_back('_'); + return dst; + } // makeName + + static std::string lower_case(const std::string& str) + { + std::string lower; + std::transform(str.begin(), str.end(), std::back_inserter(lower), (int(*)(int))std::tolower); + return lower; + } // lower_case + + void reportError(const std::string& message, bool fatal) + { + SAXParseException e(message, + S(""), + S(""), + -1, + -1); + if(fatal) + errorHandler_->fatalError(e); + else + errorHandler_->error(e); + } // reportError + +public: + static string_type S(const std::string& s) + { + return string_adaptor::construct_from_utf8(s.c_str()); + } // S + + static string_type S(const char* s) + { + return string_adaptor::construct_from_utf8(s); + } // S +}; // class Taggle + +template +bool Taggle::DEFAULT_NAMESPACES = true; +template +bool Taggle::DEFAULT_IGNORE_BOGONS = false; +template +bool Taggle::DEFAULT_BOGONS_EMPTY = false; +template +bool Taggle::DEFAULT_ROOT_BOGONS = true; +template +bool Taggle::DEFAULT_DEFAULT_ATTRIBUTES = true; +template +bool Taggle::DEFAULT_TRANSLATE_COLONS = false; +template +bool Taggle::DEFAULT_RESTART_ELEMENTS = true; +template +bool Taggle::DEFAULT_IGNORABLE_WHITESPACE = false; +template +bool 
Taggle::DEFAULT_CDATA_ELEMENTS = true; + +template +const string_type Taggle::namespacesFeature = Taggle::S("http://xml.org/sax/features/namespaces"); +template +const string_type Taggle::namespacePrefixesFeature = Taggle::S("http://xml.org/sax/features/namespace-prefixes"); +template +const string_type Taggle::externalGeneralEntitiesFeature = Taggle::S("http://xml.org/sax/features/external-general-entities"); +template +const string_type Taggle::externalParameterEntitiesFeature = Taggle::S("http://xml.org/sax/features/external-parameter-entities"); +template +const string_type Taggle::isStandaloneFeature = Taggle::S("http://xml.org/sax/features/is-standalone"); +template +const string_type Taggle::lexicalHandlerParameterEntitiesFeature = Taggle::S("http://xml.org/sax/features/lexical-handler/parameter-entities"); +template +const string_type Taggle::resolveDTDURIsFeature = Taggle::S("http://xml.org/sax/features/resolve-dtd-uris"); +template +const string_type Taggle::stringInterningFeature = Taggle::S("http://xml.org/sax/features/string-interning"); +template +const string_type Taggle::useAttributes2Feature = Taggle::S("http://xml.org/sax/features/use-attributes2"); +template +const string_type Taggle::useLocator2Feature = Taggle::S("http://xml.org/sax/features/use-locator2"); +template +const string_type Taggle::useEntityResolver2Feature = Taggle::S("http://xml.org/sax/features/use-entity-resolver2"); +template +const string_type Taggle::validationFeature = Taggle::S("http://xml.org/sax/features/validation"); +template +const string_type Taggle::unicodeNormalizationCheckingFeature = Taggle::S("http://xml.org/sax/features/unicode-normalization-checking"); +template +const string_type Taggle::xmlnsURIsFeature = Taggle::S("http://xml.org/sax/features/xmlns-uris"); +template +const string_type Taggle::XML11Feature = Taggle::S("http://xml.org/sax/features/xml-1.1"); +template +const string_type Taggle::ignoreBogonsFeature = 
Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons"); +template +const string_type Taggle::bogonsEmptyFeature = Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/bogons-empty"); +template +const string_type Taggle::rootBogonsFeature = Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/root-bogons"); +template +const string_type Taggle::defaultAttributesFeature = Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/default-attributes"); +template +const string_type Taggle::translateColonsFeature = Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/translate-colons"); +template +const string_type Taggle::restartElementsFeature = Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/restart-elements"); +template +const string_type Taggle::ignorableWhitespaceFeature = Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace"); +template +const string_type Taggle::CDATAElementsFeature = Taggle::S("http://www.ccil.org/~cowan/tagsoup/features/cdata-elements"); +template +const string_type Taggle::lexicalHandlerProperty = Taggle::S("http://xml.org/sax/properties/lexical-handler"); +template +const string_type Taggle::scannerProperty = Taggle::S("http://www.ccil.org/~cowan/tagsoup/properties/scanner"); +template +const string_type Taggle::schemaProperty = Taggle::S("http://www.ccil.org/~cowan/tagsoup/properties/schema"); +template +const string_type Taggle::autoDetectorProperty = Taggle::S("http://www.ccil.org/~cowan/tagsoup/properties/auto-detector"); + +template +const string_type Taggle::legal = + Taggle::S("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%"); + +} // namespace SAX + +} // namespace Arabica +#endif + diff --git a/arabica/include/Taggle/impl/ScanHandler.hpp b/arabica/include/Taggle/impl/ScanHandler.hpp new file mode 100755 index 000000000..1f42c3b6c --- /dev/null +++ b/arabica/include/Taggle/impl/ScanHandler.hpp @@ -0,0 +1,108 @@ +#ifndef ARABICA_SCAN_HANDLER_HPP +#define 
ARABICA_SCAN_HANDLER_HPP + +#include + +namespace Arabica +{ + +namespace SAX +{ + +/** + An interface that Scanners use to report events in the input stream. + + This code is derived from John Cowan's splendid TagSoup package +*/ +class ScanHandler +{ +protected: + ~ScanHandler() {} + +public: + /** + Reports an attribute name without a value. + **/ + virtual void adup(const std::string& buff) = 0; + + /** + Reports an attribute name; a value will follow. + **/ + virtual void aname(const std::string& buff) = 0; + + /** + Reports an attribute value. + **/ + virtual void aval(const std::string& buff) = 0; + + /** + * Reports the content of a CDATA section (not a CDATA element) + */ + virtual void cdsect(const std::string& buff) = 0; + + /** + * Reports a declaration - typically a DOCTYPE + */ + virtual void decl(const std::string& buff) = 0; + + /** + Reports an entity reference or character reference. + **/ + virtual void entity(const std::string& buff) = 0; + + /** + Reports EOF. + **/ + virtual void eof(const std::string& buff) = 0; + + /** + Reports an end-tag. + **/ + virtual void etag(const std::string& buff) = 0; + + /** + Reports the general identifier (element type name) of a start-tag. + **/ + virtual void gi(const std::string& buff) = 0; + + /** + Reports character content. + **/ + virtual void pcdata(const std::string& buff) = 0; + + /** + Reports the data part of a processing instruction. + **/ + virtual void pi(const std::string& buff) = 0; + + /** + Reports the target part of a processing instruction. + **/ + virtual void pitarget(const std::string& buff) = 0; + + /** + Reports the close of a start-tag. + **/ + virtual void stagc(const std::string& buff) = 0; + + /** + Reports the close of an empty-tag. + **/ + virtual void stage(const std::string& buff) = 0; + + /** + Reports a comment. + **/ + virtual void cmnt(const std::string& buff) = 0; + + /** + Returns the value of the last entity or character reference reported. 
+ **/ + virtual int getEntity() = 0; +}; // class ScanHandler + +} // namespace SAX + +} // namespace Arabica + +#endif diff --git a/arabica/include/Taggle/impl/Scanner.hpp b/arabica/include/Taggle/impl/Scanner.hpp new file mode 100755 index 000000000..efc6a591d --- /dev/null +++ b/arabica/include/Taggle/impl/Scanner.hpp @@ -0,0 +1,47 @@ +#ifndef ARABICA_SAX_SCANNER_HPP +#define ARABICA_SAX_SCANNER_HPP + +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +class ScanHandler; + +/** + An interface allowing Parser to invoke scanners. + + This code is derived from John Cowan's splendid TagSoup package +*/ +class Scanner +{ +public: + virtual ~Scanner() {} + + /** + Invoke a scanner. + @param r A source of characters to scan + @param h A ScanHandler to report events to + **/ + virtual void scan(std::istream& r, ScanHandler& h) = 0; + + /** + Reset the embedded locator. + @param publicid The publicid of the source + @param systemid The systemid of the source + **/ + virtual void resetDocumentLocator(const std::string& publicid, const std::string& systemid) = 0; + + /** + Signal to the scanner to start CDATA content mode. + **/ + virtual void startCDATA() = 0; +}; // Scanner + + +} // namespace SAX +} // namespace Arabica +#endif diff --git a/arabica/include/Taggle/impl/Schema.hpp b/arabica/include/Taggle/impl/Schema.hpp new file mode 100644 index 000000000..0a4130c36 --- /dev/null +++ b/arabica/include/Taggle/impl/Schema.hpp @@ -0,0 +1,44 @@ +#ifndef ARABICA_SAX_TAGGLE_SCHEMA_HPP +#define ARABICA_SAX_TAGGLE_SCHEMA_HPP + +namespace Arabica +{ +namespace SAX +{ + +class ElementType; + +/** +Abstract class representing a TSSL schema. +Actual TSSL schemas are compiled into concrete subclasses of this class. 
+
+Based on code from John Cowan's super TagSoup package
+**/
+class Schema
+{
+public:
+  static const int M_ANY;
+  static const int M_EMPTY;
+  static const int M_PCDATA;
+  static const int M_ROOT;
+
+  static const int F_RESTART;
+  static const int F_CDATA;
+  static const int F_NOFORCE;
+
+  virtual void elementType(const std::string& name, int model, int memberOf, int flags) = 0;
+  virtual ElementType& rootElementType() = 0;
+  virtual void parent(std::string name, std::string parentName) = 0;
+
+  virtual ElementType& getElementType(const std::string& name) = 0;
+  virtual int getEntity(const std::string& name) const = 0;
+  virtual const std::string& getURI() const = 0;
+  virtual const std::string& getPrefix() const = 0;
+
+  virtual ~Schema() { }
+}; // class Schema
+
+} // namespace SAX
+
+} // namespace Arabica
+#endif
diff --git a/arabica/include/Taggle/impl/SchemaImpl.hpp b/arabica/include/Taggle/impl/SchemaImpl.hpp
new file mode 100644
index 000000000..6e35994ca
--- /dev/null
+++ b/arabica/include/Taggle/impl/SchemaImpl.hpp
@@ -0,0 +1,219 @@
+#ifndef ARABICA_SAX_TAGGLE_SCHEMAIMPL_HPP
+#define ARABICA_SAX_TAGGLE_SCHEMAIMPL_HPP
+
+#include <algorithm>
+#include <cctype>
+#include <iterator>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "ElementType.hpp"
+#include "Schema.hpp"
+
+namespace Arabica
+{
+namespace SAX
+{
+
+/**
+Concrete, map-backed implementation of the Schema interface.
+Element types are stored under their lower-cased names and are owned by this
+object: they are deleted by the destructor, so instances must not be copied.
+
+Based on code from John Cowan's super TagSoup package
+**/
+class SchemaImpl : public Schema
+{
+private:
+  std::map<std::string, int> entities_;
+  std::map<std::string, ElementType*> elementTypes_;
+  std::vector<ElementType*> replaced_; // superseded element types, kept alive until destruction
+
+  std::string URI_;
+  std::string prefix_;
+  ElementType* root_; // 0 until elementType() is called with memberOf == M_ROOT
+
+public:
+  SchemaImpl() :
+    entities_(),
+    elementTypes_(),
+    replaced_(),
+    URI_(),
+    prefix_(),
+    root_(0)
+  {
+  } // SchemaImpl
+
+  virtual ~SchemaImpl()
+  {
+    for(std::map<std::string, ElementType*>::iterator i = elementTypes_.begin(), ie = elementTypes_.end(); i != ie; ++i)
+      delete i->second;
+    for(std::vector<ElementType*>::iterator r = replaced_.begin(), re = replaced_.end(); r != re; ++r)
+      delete *r;
+  } // ~SchemaImpl
+
+  /**
+  Add or replace an element type for this schema.
+  A replaced element type is not destroyed until the schema is, because
+  references handed out by getElementType() may still be in use.
+  @param name Name (Qname) of the element
+  @param model Models of the element's content as a vector of bits
+  @param memberOf Models the element is a member of as a vector of bits
+  @param flags Flags for the element
+  **/
+  void elementType(const std::string& name, int model, int memberOf, int flags)
+  {
+    ElementType* e = new ElementType(name, model, memberOf, flags, *this);
+    ElementType*& slot = elementTypes_[lower_case(name)];
+    if(slot != 0)
+      replaced_.push_back(slot);
+    slot = e;
+    if(memberOf == M_ROOT)
+      root_ = e;
+  } // elementType
+
+  /**
+  Get the root element of this schema
+  @throws std::runtime_error if no root element type has been added yet
+  **/
+  ElementType& rootElementType()
+  {
+    if(root_ == 0)
+      throw std::runtime_error("No root element type defined for this schema");
+    return *root_;
+  } // rootElementType
+
+  /**
+  Add or replace a default attribute for an element type in this schema.
+  @param elemName Name (Qname) of the element type
+  @param attrName Name (Qname) of the attribute
+  @param type Type of the attribute
+  @param value Default value of the attribute; null if no default
+  **/
+  void attribute(const std::string& elemName, const std::string& attrName, const std::string& type, const std::string& value)
+  {
+    ElementType& e = getElementType(elemName);
+    if (e == ElementType::Null)
+    {
+      throw std::runtime_error("Attribute " + attrName +
+            " specified for unknown element type " +
+            elemName);
+    }
+    e.setAttribute(attrName, type, value);
+  } // attribute
+
+  /**
+  Specify natural parent of an element in this schema.
+  @param name Name of the child element
+  @param parentName Name of the parent element
+  **/
+  void parent(std::string name, std::string parentName)
+  {
+    ElementType& child = getElementType(name);
+    ElementType& parent = getElementType(parentName);
+    if (child == ElementType::Null)
+    {
+      throw std::runtime_error("No child " + name + " for parent " + parentName);
+    }
+    if (parent == ElementType::Null)
+    {
+      throw std::runtime_error("No parent " + parentName + " for child " + name);
+    }
+    child.setParent(parent);
+  } // parent
+
+  /**
+  Add to or replace a character entity in this schema.
+  @param name Name of the entity
+  @param value Value of the entity
+  **/
+  void entity(const std::string& name, int value)
+  {
+    entities_[name] = value;
+  } // entity
+
+  /**
+  Get an ElementType by name (the lookup ignores case).
+  @param name Name (Qname) of the element type
+  @return The corresponding ElementType, or ElementType::Null if there is none
+  **/
+  ElementType& getElementType(const std::string& name)
+  {
+    std::map<std::string, ElementType*>::iterator elemType = elementTypes_.find(lower_case(name));
+    if(elemType == elementTypes_.end())
+      return ElementType::Null;
+    return *elemType->second;
+  } // getElementType
+
+  /**
+  Get an entity value by name.
+  @param name Name of the entity
+  @return The corresponding character, or 0 if none
+  **/
+  int getEntity(const std::string& name) const
+  {
+    std::map<std::string, int>::const_iterator ent = entities_.find(name);
+    if(ent == entities_.end())
+      return 0;
+    return ent->second;
+  } // getEntity
+
+  /**
+  Return the URI (namespace name) of this schema.
+  **/
+  const std::string& getURI() const
+  {
+    return URI_;
+  } // getURI
+
+  /**
+  Return the prefix of this schema.
+  **/
+  const std::string& getPrefix() const
+  {
+    return prefix_;
+  } // getPrefix
+
+  /**
+  Change the URI (namespace name) of this schema.
+  **/
+  void setURI(std::string uri)
+  {
+    URI_ = uri;
+  } // setURI
+
+  /**
+  Change the prefix of this schema.
+  **/
+  void setPrefix(std::string prefix)
+  {
+    prefix_ = prefix;
+  } // setPrefix
+
+private:
+  /**
+  Lower-case a single char. std::tolower has undefined behaviour for negative
+  values other than EOF, so the char is converted through unsigned char.
+  **/
+  static char lower_char(char c)
+  {
+    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+  } // lower_char
+
+  /**
+  Return a lower-cased copy of str (used as the key for element types).
+  **/
+  static std::string lower_case(const std::string& str)
+  {
+    std::string lower;
+    lower.reserve(str.size());
+    std::transform(str.begin(), str.end(), std::back_inserter(lower), lower_char);
+    return lower;
+  } // lower_case
+}; // class SchemaImpl
+
+} // namespace SAX
+
+} // namespace Arabica
+#endif
diff --git a/arabica/include/Taggle/impl/html/HTMLModels.hpp b/arabica/include/Taggle/impl/html/HTMLModels.hpp
new file mode 100644
index 000000000..360fd40d1
--- /dev/null
+++ b/arabica/include/Taggle/impl/html/HTMLModels.hpp
@@ -0,0 +1,49 @@
+#ifndef ARABICA_SAX_TAGGLE_HTML_MODELS_HPP
+#define ARABICA_SAX_TAGGLE_HTML_MODELS_HPP
+
+namespace Arabica
+{
+
+namespace SAX
+{
+
+/**
+This interface contains generated constants representing HTML content
+models.  Logically, it is part of HTMLSchema, but it is more
+convenient to generate the constants into a separate interface.
+
+Based on code from John Cowan's super TagSoup package
+*/
+class HTMLModels
+{
+protected:
+  // Start of model definitions (one bit per content model)
+  static const int M_AREA = 1 << 1;
+  static const int M_BLOCK = 1 << 2;
+  static const int M_BLOCKINLINE = 1 << 3;
+  static const int M_BODY = 1 << 4;
+  static const int M_CELL = 1 << 5;
+  static const int M_COL = 1 << 6;
+  static const int M_DEF = 1 << 7;
+  static const int M_FORM = 1 << 8;
+  static const int M_FRAME = 1 << 9;
+  static const int M_HEAD = 1 << 10;
+  static const int M_HTML = 1 << 11;
+  static const int M_INLINE = 1 << 12;
+  static const int M_LEGEND = 1 << 13;
+  static const int M_LI = 1 << 14;
+  static const int M_NOLINK = 1 << 15;
+  static const int M_OPTION = 1 << 16;
+  static const int M_OPTIONS = 1 << 17;
+  static const int M_P = 1 << 18;
+  static const int M_PARAM = 1 << 19;
+  static const int M_TABLE = 1 << 20;
+  static const int M_TABULAR = 1 << 21;
+  static const int M_TR = 1 << 22;
+}; // class HTMLModels
+
+} // namespace SAX
+
+} // namespace Arabica
+#endif
+
+
diff --git a/arabica/include/Taggle/impl/html/HTMLScanner.hpp b/arabica/include/Taggle/impl/html/HTMLScanner.hpp
new file mode 100644
index 000000000..1001b2d7f
--- /dev/null
+++ b/arabica/include/Taggle/impl/html/HTMLScanner.hpp
@@ -0,0 +1,707 @@
+#ifndef ARABICA_SAX_TAGGLE_HTML_SCANNER_HPP
+#define ARABICA_SAX_TAGGLE_HTML_SCANNER_HPP
+
+#include // NOTE(review): the header names of these four #include lines are missing in this copy - restore from upstream Arabica
+#include
+#include
+#include
+#include "../Scanner.hpp"
+
+namespace Arabica
+{
+
+namespace SAX
+{
+
+/**
+This class implements a table-driven scanner for HTML, allowing for lots of
+defects.  It implements the Scanner interface: scan() reads characters from a
+std::istream and reports lexical events to a ScanHandler.  It also provides
+the Locator functions for the position last recorded by mark().
+
+Based on code from John Cowan's super TagSoup package
+*/
+class HTMLScanner : public Scanner, public SAX::Locator
+{
+private:
+  // Start of state table: S_* are scanner states (first and last column of each statetable row)
+  static const int S_ANAME = 1;
+  static const int S_APOS = 2;
+  static const int S_AVAL = 3;
+  static const int S_BB = 4;
+  static const int S_BBC = 5;
+  static const int S_BBCD = 6;
+  static const int S_BBCDA = 7;
+  static const int S_BBCDAT = 8;
+  static const int S_BBCDATA = 9;
+  static const int S_CDATA = 10;
+  static const int S_CDATA2 = 11;
+  static const int S_CDSECT = 12;
+  static const int S_CDSECT1 = 13;
+  static const int S_CDSECT2 = 14;
+  static const int S_COM = 15;
+  static const int S_COM2 = 16;
+  static const int S_COM3 = 17;
+  static const int S_COM4 = 18;
+  static const int S_DECL = 19;
+  static const int S_DECL2 = 20;
+  static const int S_DONE = 21;
+  static const int S_EMPTYTAG = 22;
+  static const int S_ENT = 23;
+  static const int S_EQ = 24;
+  static const int S_ETAG = 25;
+  static const int S_GI = 26;
+  static const int S_NCR = 27;
+  static const int S_PCDATA = 28;
+  static const int S_PI = 29;
+  static const int S_PITARGET = 30;
+  static const int S_QUOT = 31;
+  static const int S_STAGC = 32;
+  static const int S_TAG = 33;
+  static const int S_TAGWS = 34;
+  static const int S_XNCR = 35;
+  static const int A_ADUP = 1; // A_* are actions (third column of each statetable row), dispatched by the switch in scan()
+  static const int A_ADUP_SAVE = 2;
+  static const int A_ADUP_STAGC = 3;
+  static const int A_ANAME = 4;
+  static const int A_ANAME_ADUP = 5;
+  static const int A_ANAME_ADUP_STAGC = 6;
+  static const int A_AVAL = 7;
+  static const int A_AVAL_STAGC = 8;
+  static const int A_CDATA = 9;
+  static const int A_CMNT = 10;
+  static const int A_DECL = 11;
+  static const int A_EMPTYTAG = 12;
+  static const int A_ENTITY = 13;
+  static const int A_ENTITY_START = 14;
+  static const int A_ETAG = 15;
+  static const int A_GI = 16;
+  static const int A_GI_STAGC = 17;
+  static const int A_LT = 18;
+  static const int A_LT_PCDATA = 19;
+  static const int A_MINUS = 20;
+  static const int A_MINUS2 = 21;
+  static const int A_MINUS3 = 22;
+  static const int A_PCDATA = 23;
+  static const int A_PI = 24;
+  static const int A_PITARGET = 25;
+  static const int A_PITARGET_PI = 26;
+  static const int A_SAVE = 27;
+  static const int A_SKIP = 28;
+  static const int A_SP = 29;
+  static const int A_STAGC = 30;
+  static const int A_UNGET = 31;
+  static const int A_UNSAVE_PCDATA = 32;
+  static const int statetable[];
+  static const std::string debug_actionnames[];
+  static const std::string debug_statenames[];
+  // End of state table
+  static const int WinCharMap[]; // Windows char map (both lookups in scan() are commented out)
+  static const std::string hexLetters;
+
+  std::string publicId_; // Locator state
+  std::string systemId_;
+  size_t lastLine_; // position recorded by the last mark(); returned by getLineNumber()
+  size_t lastColumn_; // ditto, returned by getColumnNumber()
+  size_t currentLine_; // position of the character most recently read by scan()
+  size_t currentColumn_;
+
+  int state_; // Current state
+  int nextState_; // Next state
+  std::string outputBuffer_; // Output buffer
+
+  // Compensate for bug in PushbackReader that allows
+  // pushing back EOF.
+  //void unread(PushbackReader r, int c) throws IOException {
+  //  if (c != -1) r.unread(c);
+  //  }
+
+public:
+  HTMLScanner() :
+    publicId_(),
+    systemId_(),
+    lastLine_(0),
+    lastColumn_(0),
+    currentLine_(0),
+    currentColumn_(0),
+    state_(0),
+    nextState_(0),
+    outputBuffer_()
+  {
+    outputBuffer_.reserve(200); // save() flushes PCDATA/CDATA text once the buffer is within 20 chars of capacity
+  } // HTMLScanner
+
+  // Locator implementation - reports the position recorded by the last mark()
+  size_t getLineNumber() const
+  {
+    return lastLine_;
+  } // getLineNumber
+
+  size_t getColumnNumber() const
+  {
+    return lastColumn_;
+  } // getColumnNumber
+
+  std::string getPublicId() const
+  {
+    return publicId_;
+  } // getPublicId
+
+  std::string getSystemId() const
+  {
+    return systemId_;
+  } // getSystemId
+
+
+  // Scanner implementation
+  /**
+  Reset document locator: stores the ids and zeroes all line/column counters.
+  @param publicid Public id
+  @param systemid System id
+  */
+  virtual void resetDocumentLocator(const std::string& publicid, const std::string& systemid)
+  {
+    publicId_ = publicid;
+    systemId_ = systemid;
+    lastLine_ = lastColumn_ = currentLine_ = currentColumn_ = 0;
+  } // resetDocumentLocator
+
+  /**
+  Scan HTML source, reporting lexical events.
+  @param r Stream that provides characters
+  @param h ScanHandler that accepts lexical events.
+  */
+  virtual void scan(std::istream& r, ScanHandler& h)
+  {
+    state_ = S_PCDATA;
+/*    PushbackReader r;
+    if (r0 instanceof PushbackReader) {
+      r = (PushbackReader)r0;
+    }
+    else if (r0 instanceof BufferedReader) {
+      r = new PushbackReader(r0);
+    }
+    else {
+      r = new PushbackReader(new BufferedReader(r0));
+    }
+*/
+//    int firstChar = r.read(); // Remove any leading BOM
+//    if (firstChar != '\uFEFF') unread(r, firstChar);
+
+    while (state_ != S_DONE)
+    {
+      int ch = r.get();
+
+      // Process control characters
+      //if (ch >= 0x80 && ch <= 0x9F)
+      //ch = WinCharMap[ch-0x80];
+
+      if (ch == '\r')
+      {
+        ch = r.get(); // expect LF next
+        if (ch != '\n')
+        {
+          r.unget();
+          ch = '\n';
+        }
+      }
+
+      if (ch == '\n')
+      {
+        ++currentLine_;
+        currentColumn_ = 0;
+      }
+      else
+      {
+        ++currentColumn_;
+      }
+
+      if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1))
+        continue; // other control characters are dropped
+
+      // Search state table: rows are {state, char, action, next state}; char 0 is the default for the state
+      int action = 0;
+      for (int i = 0; statetable[i] != -1; i += 4)
+      {
+        if (state_ != statetable[i])
+        {
+          if (action != 0)
+            break;
+          continue;
+        }
+        if (statetable[i+1] == 0)
+        {
+          action = statetable[i+2];
+          nextState_ = statetable[i+3];
+        }
+        else if (statetable[i+1] == ch)
+        {
+          action = statetable[i+2];
+          nextState_ = statetable[i+3];
+          break;
+        }
+      } // for ...
+
+      switch (action)
+      {
+      case 0:
+        {
+          std::ostringstream os;
+          os << "HTMLScanner can't cope with " << ch << " in state " << state_;
+          throw std::runtime_error(os.str());
+        }
+      case A_ADUP:
+        h.adup(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_ADUP_SAVE:
+        h.adup(outputBuffer_);
+        outputBuffer_.clear();
+        save(ch, h);
+        break;
+      case A_ADUP_STAGC:
+        h.adup(outputBuffer_);
+        outputBuffer_.clear();
+        h.stagc(outputBuffer_);
+        break;
+      case A_ANAME:
+        h.aname(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_ANAME_ADUP:
+        h.aname(outputBuffer_);
+        outputBuffer_.clear();
+        h.adup(outputBuffer_);
+        break;
+      case A_ANAME_ADUP_STAGC:
+        h.aname(outputBuffer_);
+        outputBuffer_.clear();
+        h.adup(outputBuffer_);
+        h.stagc(outputBuffer_);
+        break;
+      case A_AVAL:
+        h.aval(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_AVAL_STAGC:
+        h.aval(outputBuffer_);
+        outputBuffer_.clear();
+        h.stagc(outputBuffer_);
+        break;
+      case A_CDATA:
+        mark();
+        // suppress the final "]]" in the buffer
+        if (outputBuffer_.size() > 1)
+          outputBuffer_.erase(outputBuffer_.size()-2);
+        h.pcdata(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_ENTITY_START:
+        h.pcdata(outputBuffer_);
+        outputBuffer_.clear();
+        save(ch, h);
+        break;
+      case A_ENTITY:
+        {
+          mark();
+          char ch1 = static_cast<char>(ch);
+          // System.out.println("Got " + ch1 + " in state " + ((state_ == S_ENT) ? "S_ENT" : ((state_ == S_NCR) ? "S_NCR" : "UNK")));
+          if (state_ == S_ENT && ch1 == '#')
+          {
+            nextState_ = S_NCR;
+            save(ch, h);
+            break;
+          }
+          else if (state_ == S_NCR && (ch1 == 'x' || ch1 == 'X'))
+          {
+            nextState_ = S_XNCR;
+            save(ch, h);
+            break;
+          }
+          else if (state_ == S_ENT && XML::is_letter_or_digit(ch1))
+          {
+            save(ch, h);
+            break;
+          }
+          else if (state_ == S_NCR && XML::is_digit(ch1))
+          {
+            save(ch, h);
+            break;
+          }
+          else if (state_ == S_XNCR && (XML::is_digit(ch1) || hexLetters.find(ch1) != std::string::npos))
+          {
+            save(ch, h);
+            break;
+          }
+
+          // The whole entity reference has been collected (substr(1) skips the leading '&')
+          h.entity(outputBuffer_.substr(1, outputBuffer_.size()-1));
+          int ent = h.getEntity();
+          if (ent != 0)
+          {
+            outputBuffer_.clear();
+            if (ent >= 0x80 && ent <= 0x9F)
+            {
+              //ent = WinCharMap[ent-0x80];
+            }
+            if (ent < 0x20)
+            {
+              // Control becomes space (previously ent was reassigned but nothing was emitted)
+              save(0x20, h);
+            }
+            else if (ent >= 0xD800 && ent <= 0xDFFF)
+            {
+              // Surrogates get dropped
+              ent = 0;
+            }
+            else if (ent <= 0xFFFF)
+            {
+              // BMP character
+              save(ent, h);
+            }
+            else
+            {
+              // Astral converted to two surrogates
+              ent -= 0x10000;
+              save((ent>>10) + 0xD800, h);
+              save((ent&0x3FF) + 0xDC00, h);
+            }
+            if (ch != ';')
+            {
+              r.unget();
+              currentColumn_--;
+            }
+          }
+          else
+          {
+            r.unget();
+            currentColumn_--;
+          }
+          nextState_ = S_PCDATA;
+        } // case A_ENTITY:
+        break;
+      case A_ETAG:
+        h.etag(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_DECL:
+        h.decl(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_GI:
+        h.gi(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_GI_STAGC:
+        h.gi(outputBuffer_);
+        outputBuffer_.clear();
+        h.stagc(outputBuffer_);
+        break;
+      case A_LT:
+        mark();
+        save('<', h);
+        save(ch, h);
+        break;
+      case A_LT_PCDATA:
+        mark();
+        save('<', h);
+        h.pcdata(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_PCDATA:
+        mark();
+        h.pcdata(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_CMNT:
+        mark();
+        h.cmnt(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_MINUS3:
+        save('-', h);
+        save(' ', h);
+        break;
+      case A_MINUS2:
+        save('-', h);
+        save(' ', h);
+        // fall through into A_MINUS
+      case A_MINUS:
+        save('-', h);
+        save(ch, h);
+        break;
+      case A_PI:
+        mark();
+        h.pi(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_PITARGET:
+        h.pitarget(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_PITARGET_PI:
+        h.pitarget(outputBuffer_);
+        outputBuffer_.clear();
+        h.pi(outputBuffer_);
+        break;
+      case A_SAVE:
+        save(ch, h);
+        break;
+      case A_SKIP:
+        break;
+      case A_SP:
+        save(' ', h);
+        break;
+      case A_STAGC:
+        h.stagc(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      case A_EMPTYTAG:
+        mark();
+        if (outputBuffer_.size() > 0)
+          h.gi(outputBuffer_);
+        outputBuffer_.clear();
+        h.stage(outputBuffer_);
+        break;
+      case A_UNGET:
+        r.unget();
+        currentColumn_--; // NOTE(review): wraps (size_t) when the pushed-back char was '\n' and the column is 0
+        break;
+      case A_UNSAVE_PCDATA:
+        if (outputBuffer_.size() > 0)
+          outputBuffer_.erase(outputBuffer_.size()-1);
+        h.pcdata(outputBuffer_);
+        outputBuffer_.clear();
+        break;
+      default:
+        throw std::runtime_error(
+          "Can't process action " + static_cast<std::stringstream&>(
+          std::stringstream() << action).str());
+      } // switch ...
+      state_ = nextState_;
+    } // while (state_ != S_DONE)
+    h.eof("");
+  } // scan
+
+  /**
+  A callback for the ScanHandler that allows it to force
+  the lexer state to CDATA content (no markup is recognized except
+  the end of element). Takes effect when scan() next copies nextState_ to state_.
+  */
+  void startCDATA()
+  {
+    nextState_ = S_CDATA;
+  } // startCDATA
+
+private:
+  /**
+   * Mark the current scan position as a "point of interest" - start of a tag,
+   * cdata, processing instruction etc.
+   */
+  void mark()
+  {
+    lastColumn_ = currentColumn_;
+    lastLine_ = currentLine_;
+  } // mark
+
+  void save(int ch, ScanHandler& h) // appends ch to the output buffer, flushing text first if the buffer is nearly full
+  {
+    if (outputBuffer_.size() >= outputBuffer_.capacity() - 20)
+    {
+      if (state_ == S_PCDATA || state_ == S_CDATA)
+      {
+        // Return a buffer-sized chunk of PCDATA
+        h.pcdata(outputBuffer_);
+        outputBuffer_.clear();
+      }
+    }
+    outputBuffer_ += static_cast<char>(ch); // NOTE(review): values above 0xFF (scan() passes entity values up to 0xFFFF) are truncated to one char
+  } // save
+
+  static std::string nicechar(int in) // printable form of a character, for diagnostics
+  {
+    if (in == '\n')
+      return "\\n";
+    std::ostringstream os;
+    if(in >= 32)
+      os << '\'' << static_cast<char>(in) << '\'';
+    else
+      os << std::hex << std::showbase << in;
+    return os.str();
+  } // nicechar
+
+  HTMLScanner(const HTMLScanner&); // declared private so the scanner cannot be copied from outside
+  bool operator==(const HTMLScanner&) const;
+  HTMLScanner& operator=(const HTMLScanner&);
+}; // class HTMLScanner
+
+const int HTMLScanner::statetable[] = { // rows of {state, char (0 = any other, -1 = EOF), action, next state}
+  S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
+  S_ANAME, '=', A_ANAME, S_AVAL,
+  S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
+  S_ANAME, 0, A_SAVE, S_ANAME,
+  S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
+  S_ANAME, ' ', A_ANAME, S_EQ,
+  S_ANAME, '\n', A_ANAME, S_EQ,
+  S_ANAME, '\t', A_ANAME, S_EQ,
+  S_APOS, '\'', A_AVAL, S_TAGWS,
+  S_APOS, 0, A_SAVE, S_APOS,
+  S_APOS, -1, A_AVAL_STAGC, S_DONE,
+  S_APOS, ' ', A_SP, S_APOS,
+  S_APOS, '\n', A_SP, S_APOS,
+  S_APOS, '\t', A_SP, S_APOS,
+  S_AVAL, '\'', A_SKIP, S_APOS,
+  S_AVAL, '"', A_SKIP, S_QUOT,
+  S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
+  S_AVAL, 0, A_SAVE, S_STAGC,
+  S_AVAL, -1, A_AVAL_STAGC, S_DONE,
+  S_AVAL, ' ', A_SKIP, S_AVAL,
+  S_AVAL, '\n', A_SKIP, S_AVAL,
+  S_AVAL, '\t', A_SKIP, S_AVAL,
+  S_BB, 'C', A_SKIP, S_BBC,
+  S_BB, 0, A_SKIP, S_DECL,
+  S_BB, -1, A_SKIP, S_DONE,
+  S_BBC, 'D', A_SKIP, S_BBCD,
+  S_BBC, 0, A_SKIP, S_DECL,
+  S_BBC, -1, A_SKIP, S_DONE,
+  S_BBCD, 'A', A_SKIP, S_BBCDA,
+  S_BBCD, 0, A_SKIP, S_DECL,
+  S_BBCD, -1, A_SKIP, S_DONE,
+  S_BBCDA, 'T', A_SKIP, S_BBCDAT,
+  S_BBCDA, 0, A_SKIP, S_DECL,
+  S_BBCDA, -1, A_SKIP, S_DONE,
+  S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
+  S_BBCDAT, 0, A_SKIP,
S_DECL,
+  S_BBCDAT, -1, A_SKIP, S_DONE,
+  S_BBCDATA, '[', A_SKIP, S_CDSECT,
+  S_BBCDATA, 0, A_SKIP, S_DECL,
+  S_BBCDATA, -1, A_SKIP, S_DONE,
+  S_CDATA, '<', A_SAVE, S_CDATA2,
+  S_CDATA, 0, A_SAVE, S_CDATA,
+  S_CDATA, -1, A_PCDATA, S_DONE,
+  S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
+  S_CDATA2, 0, A_SAVE, S_CDATA,
+  S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
+  S_CDSECT, ']', A_SAVE, S_CDSECT1,
+  S_CDSECT, 0, A_SAVE, S_CDSECT,
+  S_CDSECT, -1, A_SKIP, S_DONE,
+  S_CDSECT1, ']', A_SAVE, S_CDSECT2,
+  S_CDSECT1, 0, A_SAVE, S_CDSECT,
+  S_CDSECT1, -1, A_SKIP, S_DONE,
+  S_CDSECT2, '>', A_CDATA, S_PCDATA,
+  S_CDSECT2, 0, A_SAVE, S_CDSECT,
+  S_CDSECT2, -1, A_SKIP, S_DONE,
+  S_COM, '-', A_SKIP, S_COM2,
+  S_COM, 0, A_SAVE, S_COM2,
+  S_COM, -1, A_CMNT, S_DONE,
+  S_COM2, '-', A_SKIP, S_COM3,
+  S_COM2, 0, A_SAVE, S_COM2,
+  S_COM2, -1, A_CMNT, S_DONE,
+  S_COM3, '-', A_SKIP, S_COM4,
+  S_COM3, 0, A_MINUS, S_COM2,
+  S_COM3, -1, A_CMNT, S_DONE,
+  S_COM4, '-', A_MINUS3, S_COM4,
+  S_COM4, '>', A_CMNT, S_PCDATA,
+  S_COM4, 0, A_MINUS2, S_COM2,
+  S_COM4, -1, A_CMNT, S_DONE,
+  S_DECL, '-', A_SKIP, S_COM,
+  S_DECL, '[', A_SKIP, S_BB,
+  S_DECL, '>', A_SKIP, S_PCDATA,
+  S_DECL, 0, A_SAVE, S_DECL2,
+  S_DECL, -1, A_SKIP, S_DONE,
+  S_DECL2, '>', A_DECL, S_PCDATA,
+  S_DECL2, 0, A_SAVE, S_DECL2,
+  S_DECL2, -1, A_SKIP, S_DONE,
+  S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
+  S_EMPTYTAG, 0, A_SAVE, S_ANAME,
+  S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
+  S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
+  S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
+  S_ENT, 0, A_ENTITY, S_ENT,
+  S_ENT, -1, A_ENTITY, S_DONE,
+  S_EQ, '=', A_SKIP, S_AVAL,
+  S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
+  S_EQ, 0, A_ADUP_SAVE, S_ANAME,
+  S_EQ, -1, A_ADUP_STAGC, S_DONE,
+  S_EQ, ' ', A_SKIP, S_EQ,
+  S_EQ, '\n', A_SKIP, S_EQ,
+  S_EQ, '\t', A_SKIP, S_EQ,
+  S_ETAG, '>', A_ETAG, S_PCDATA,
+  S_ETAG, 0, A_SAVE, S_ETAG,
+  S_ETAG, -1, A_ETAG, S_DONE,
+  S_ETAG, ' ', A_SKIP, S_ETAG,
+  S_ETAG, '\n', A_SKIP, S_ETAG,
+  S_ETAG, '\t', A_SKIP, S_ETAG,
+  S_GI, '/', A_SKIP, S_EMPTYTAG,
+  S_GI, '>', A_GI_STAGC, S_PCDATA,
+  S_GI, 0, A_SAVE, S_GI,
+  S_GI, -1, A_SKIP, S_DONE,
+  S_GI, ' ', A_GI, S_TAGWS,
+  S_GI, '\n', A_GI, S_TAGWS,
+  S_GI, '\t', A_GI, S_TAGWS,
+  S_NCR, 0, A_ENTITY, S_NCR,
+  S_NCR, -1, A_ENTITY, S_DONE,
+  S_PCDATA, '&', A_ENTITY_START, S_ENT,
+  S_PCDATA, '<', A_PCDATA, S_TAG,
+  S_PCDATA, 0, A_SAVE, S_PCDATA,
+  S_PCDATA, -1, A_PCDATA, S_DONE,
+  S_PI, '>', A_PI, S_PCDATA,
+  S_PI, 0, A_SAVE, S_PI,
+  S_PI, -1, A_PI, S_DONE,
+  S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
+  S_PITARGET, 0, A_SAVE, S_PITARGET,
+  S_PITARGET, -1, A_PITARGET_PI, S_DONE,
+  S_PITARGET, ' ', A_PITARGET, S_PI,
+  S_PITARGET, '\n', A_PITARGET, S_PI,
+  S_PITARGET, '\t', A_PITARGET, S_PI,
+  S_QUOT, '"', A_AVAL, S_TAGWS,
+  S_QUOT, 0, A_SAVE, S_QUOT,
+  S_QUOT, -1, A_AVAL_STAGC, S_DONE,
+  S_QUOT, ' ', A_SP, S_QUOT,
+  S_QUOT, '\n', A_SP, S_QUOT,
+  S_QUOT, '\t', A_SP, S_QUOT,
+  S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
+  S_STAGC, 0, A_SAVE, S_STAGC,
+  S_STAGC, -1, A_AVAL_STAGC, S_DONE,
+  S_STAGC, ' ', A_AVAL, S_TAGWS,
+  S_STAGC, '\n', A_AVAL, S_TAGWS,
+  S_STAGC, '\t', A_AVAL, S_TAGWS,
+  S_TAG, '!', A_SKIP, S_DECL,
+  S_TAG, '?', A_SKIP, S_PITARGET,
+  S_TAG, '/', A_SKIP, S_ETAG,
+  S_TAG, '<', A_SAVE, S_TAG,
+  S_TAG, 0, A_SAVE, S_GI,
+  S_TAG, -1, A_LT_PCDATA, S_DONE,
+  S_TAG, ' ', A_LT, S_PCDATA,
+  S_TAG, '\n', A_LT, S_PCDATA,
+  S_TAG, '\t', A_LT, S_PCDATA,
+  S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
+  S_TAGWS, '>', A_STAGC, S_PCDATA,
+  S_TAGWS, 0, A_SAVE, S_ANAME,
+  S_TAGWS, -1, A_STAGC, S_DONE,
+  S_TAGWS, ' ', A_SKIP, S_TAGWS,
+  S_TAGWS, '\n', A_SKIP, S_TAGWS,
+  S_TAGWS, '\t', A_SKIP, S_TAGWS,
+  S_XNCR, 0, A_ENTITY, S_XNCR,
+  S_XNCR, -1, A_ENTITY, S_DONE,
+  -1, -1, -1, -1
+}; // HTMLScanner::statetable - the all -1 row is the sentinel that ends the lookup loop in scan()
+
+const std::string HTMLScanner::debug_actionnames[] = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
+const std::string HTMLScanner::debug_statenames[] = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
+
+const int HTMLScanner::WinCharMap[] = { // Windows chars map, indexed by (value - 0x80) for values 0x80..0x9F
+  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
+  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
+}; // HTMLScanner::WinCharMap
+
+const std::string HTMLScanner::hexLetters = "abcdefABCDEF";
+
+} // namespace SAX
+
+} // namespace Arabica
+
+#endif
+
diff --git a/arabica/include/Taggle/impl/html/HTMLSchema.hpp b/arabica/include/Taggle/impl/html/HTMLSchema.hpp
new file mode 100644
index 000000000..010714b93
--- /dev/null
+++ b/arabica/include/Taggle/impl/html/HTMLSchema.hpp
@@ -0,0 +1,2955 @@
+#ifndef ARABICA_SAX_TAGGLE_HTML_SCHEMA_HPP
+#define ARABICA_SAX_TAGGLE_HTML_SCHEMA_HPP
+
+#include "../SchemaImpl.hpp"
+#include "HTMLModels.hpp"
+
+namespace Arabica
+{
+namespace SAX
+{
+
+/**
+This class provides a Schema that has been preinitialized with HTML
+elements, attributes, and character entity declarations.  All the declarations
+normally provided with HTML 4.01 are given, plus some that are IE-specific
+and NS4-specific.  Attribute declarations of type CDATA with no default
+value are not included.
+
+Based on code from John Cowan's super TagSoup package
+*/
+class HTMLSchema : public SchemaImpl, private HTMLModels
+{
+public:
+  /**
+  Returns a newly constructed HTMLSchema object independent of
+  any existing ones.
+  */
+  HTMLSchema()
+  {
+    // Start of Schema calls
+    setURI("http://www.w3.org/1999/xhtml");
+    setPrefix("html");
+
+    // the original Java method body was generated automatically,
+    // and was huge.  unfortunately, that sent gcc into a spin,
+    // so I've had to break it up
+
+    elementTypes();
+
+    parents();
+
+    attributes();
+
+    entities();
+
+  } // HTMLSchema
+
+private:
+  void elementTypes()
+  {
+    elementType("<pcdata>", M_EMPTY, M_PCDATA, 0); // pseudo-element for character data
+    elementType("<root>", M_ROOT, M_EMPTY, 0); // pseudo-element whose content model is M_ROOT (it contains html)
+    elementType("a", M_PCDATA|M_NOLINK|M_BLOCK, M_INLINE, 0);
+    elementType("abbr", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("acronym", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("address", M_PCDATA|M_INLINE|M_P, M_BLOCK, 0);
+    elementType("applet", M_PCDATA|M_PARAM|M_INLINE|M_BLOCK, M_INLINE|M_NOLINK, 0);
+    elementType("area", M_EMPTY, M_AREA, 0);
+    elementType("b", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("base", M_EMPTY, M_HEAD, 0);
+    elementType("basefont", M_EMPTY, M_INLINE|M_NOLINK, 0);
+    elementType("bdo", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("bgsound", M_EMPTY, M_HEAD, 0);
+    elementType("big", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("blink", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("blockquote", M_PCDATA|M_INLINE|M_BLOCK, M_BLOCK, 0);
+    elementType("body", M_PCDATA|M_INLINE|M_BLOCK, M_HTML|M_BODY, 0);
+    elementType("br", M_EMPTY, M_INLINE|M_NOLINK, 0);
+    elementType("button", M_PCDATA|M_INLINE|M_BLOCK, M_INLINE|M_NOLINK, 0);
+    elementType("canvas", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("caption", M_PCDATA|M_INLINE, M_TABULAR, 0);
+    elementType("center", M_PCDATA|M_INLINE|M_BLOCK, M_BLOCK, 0);
+    elementType("cite", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("code", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("col", M_EMPTY, M_COL|M_TABULAR, 0);
+    elementType("colgroup", M_COL, M_TABULAR, 0);
+    elementType("comment", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("dd", M_PCDATA|M_INLINE|M_BLOCK, M_DEF, 0);
+    elementType("del", M_PCDATA|M_INLINE|M_BLOCK, M_INLINE|M_BLOCKINLINE|M_BLOCK, F_RESTART);
+    elementType("dfn", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("dir", M_LI, M_BLOCK, 0);
+    elementType("div", M_PCDATA|M_INLINE|M_BLOCK, M_BLOCK, 0);
+    elementType("dl", M_DEF, M_BLOCK, 0);
+    elementType("dt", M_PCDATA|M_INLINE, M_DEF, 0);
+    elementType("em", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("fieldset", M_PCDATA|M_LEGEND|M_INLINE|M_BLOCK, M_BLOCK, 0);
+    elementType("font", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("form", M_PCDATA|M_INLINE|M_NOLINK|M_BLOCK|M_TR|M_CELL, M_BLOCK|M_FORM, F_NOFORCE);
+    elementType("frame", M_EMPTY, M_FRAME, 0);
+    elementType("frameset", M_FRAME, M_FRAME|M_HTML, 0);
+    elementType("h1", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("h2", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("h3", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("h4", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("h5", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("h6", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("head", M_HEAD, M_HTML, 0);
+    elementType("hr", M_EMPTY, M_BLOCK, 0);
+    elementType("html", M_HTML, M_ROOT, 0);
+    elementType("i", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("iframe", M_PCDATA|M_INLINE|M_BLOCK, M_INLINE|M_NOLINK, 0);
+    elementType("img", M_EMPTY, M_INLINE|M_NOLINK, 0);
+    elementType("input", M_EMPTY, M_INLINE|M_NOLINK, 0);
+    elementType("ins", M_PCDATA|M_INLINE|M_BLOCK, M_INLINE|M_BLOCK, F_RESTART);
+    elementType("isindex", M_EMPTY, M_HEAD, 0);
+    elementType("kbd", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("label", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("legend", M_PCDATA|M_INLINE, M_LEGEND, 0);
+    elementType("li", M_PCDATA|M_INLINE|M_BLOCK, M_LI, 0);
+    elementType("link", M_EMPTY, M_HEAD|M_INLINE, 0);
+    elementType("listing", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("map", M_BLOCK|M_AREA, M_INLINE, 0);
+    elementType("marquee", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("menu", M_LI, M_BLOCK, 0);
+    elementType("meta", M_EMPTY, M_HEAD, 0);
+    elementType("nobr", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("noframes", M_BODY|M_BLOCK|M_INLINE, M_BLOCK|M_HTML|M_FRAME, 0);
+    elementType("noscript", M_PCDATA|M_INLINE|M_BLOCK, M_BLOCK, 0);
+    elementType("object", M_PCDATA|M_PARAM|M_INLINE|M_BLOCK, M_HEAD|M_INLINE|M_NOLINK, 0);
+    elementType("ol", M_LI, M_BLOCK, 0);
+    elementType("optgroup", M_OPTIONS, M_OPTIONS, 0);
+    elementType("option", M_PCDATA, M_OPTION|M_OPTIONS, 0);
+    elementType("p", M_PCDATA|M_INLINE|M_TABLE, M_BLOCK|M_P, 0);
+    elementType("param", M_EMPTY, M_PARAM, 0);
+    elementType("pre", M_PCDATA|M_INLINE, M_BLOCK, 0);
+    elementType("q", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("rb", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("rbc", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("rp", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("rt", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("rtc", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("ruby", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("s", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("samp", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("script", M_PCDATA, M_ANY & ~M_ROOT, F_CDATA);
+    elementType("select", M_OPTIONS, M_INLINE, 0);
+    elementType("small", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("span", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("strike", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("strong", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("style", M_PCDATA, M_HEAD|M_INLINE, F_CDATA);
+    elementType("sub", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("sup", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("table", M_FORM|M_TABULAR, M_BLOCK|M_TABLE, F_NOFORCE);
+    elementType("tbody", M_TR, M_TABULAR, 0);
+    elementType("td", M_PCDATA|M_INLINE|M_BLOCK, M_CELL, 0);
+    elementType("textarea", M_PCDATA, M_INLINE, 0);
+    elementType("tfoot", M_TR|M_FORM|M_CELL, M_TABULAR, 0);
+    elementType("th", M_PCDATA|M_INLINE|M_BLOCK, M_CELL, 0);
+    elementType("thead", M_TR|M_FORM|M_CELL, M_TABULAR, 0);
+    elementType("title", M_PCDATA, M_HEAD, 0);
+    elementType("tr", M_FORM|M_CELL, M_TR|M_TABULAR, 0);
+    elementType("tt", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("u", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, F_RESTART);
+    elementType("ul", M_LI, M_BLOCK, 0);
+    elementType("var", M_PCDATA|M_INLINE, M_INLINE|M_NOLINK, 0);
+    elementType("wbr", M_EMPTY, M_INLINE|M_NOLINK, 0);
+    elementType("xmp", M_PCDATA|M_INLINE, M_BLOCK, 0);
+  } // elementTypes
+
+  void parents()
+  {
+    parent("<pcdata>", "body");
+    parent("html", "<root>");
+    parent("a", "body");
+    parent("abbr", "body");
+    parent("acronym", "body");
+    parent("address", "body");
+    parent("applet", "body");
+    parent("area", "map");
+    parent("b", "body");
+    parent("base", "head");
+    parent("basefont", "body");
+    parent("bdo", "body");
+    parent("bgsound", "head");
+    parent("big", "body");
+    parent("blink", "body");
+    parent("blockquote", "body");
+    parent("body", "html");
+    parent("br", "body");
+    parent("button", "form");
+    parent("canvas", "body");
+    parent("caption", "table");
+    parent("center", "body");
+    parent("cite", "body");
+    parent("code", "body");
+    parent("col", "table");
+    parent("colgroup", "table");
+    parent("comment", "body");
+    parent("dd", "dl");
+    parent("del",
"body"); + parent("dfn", "body"); + parent("dir", "body"); + parent("div", "body"); + parent("dl", "body"); + parent("dt", "dl"); + parent("em", "body"); + parent("fieldset", "form"); + parent("font", "body"); + parent("form", "body"); + parent("frame", "frameset"); + parent("frameset", "html"); + parent("h1", "body"); + parent("h2", "body"); + parent("h3", "body"); + parent("h4", "body"); + parent("h5", "body"); + parent("h6", "body"); + parent("head", "html"); + parent("hr", "body"); + parent("i", "body"); + parent("iframe", "body"); + parent("img", "body"); + parent("input", "form"); + parent("ins", "body"); + parent("isindex", "head"); + parent("kbd", "body"); + parent("label", "form"); + parent("legend", "fieldset"); + parent("li", "ul"); + parent("link", "head"); + parent("listing", "body"); + parent("map", "body"); + parent("marquee", "body"); + parent("menu", "body"); + parent("meta", "head"); + parent("nobr", "body"); + parent("noframes", "html"); + parent("noscript", "body"); + parent("object", "body"); + parent("ol", "body"); + parent("optgroup", "select"); + parent("option", "select"); + parent("p", "body"); + parent("param", "object"); + parent("pre", "body"); + parent("q", "body"); + parent("rb", "body"); + parent("rbc", "body"); + parent("rp", "body"); + parent("rt", "body"); + parent("rtc", "body"); + parent("ruby", "body"); + parent("s", "body"); + parent("samp", "body"); + parent("script", "html"); + parent("select", "form"); + parent("small", "body"); + parent("span", "body"); + parent("strike", "body"); + parent("strong", "body"); + parent("style", "head"); + parent("sub", "body"); + parent("sup", "body"); + parent("table", "body"); + parent("tbody", "table"); + parent("td", "tr"); + parent("textarea", "form"); + parent("tfoot", "table"); + parent("th", "tr"); + parent("thead", "table"); + parent("title", "head"); + parent("tr", "tbody"); + parent("tt", "body"); + parent("u", "body"); + parent("ul", "body"); + parent("var", "body"); + 
parent("wbr", "body"); + parent("xmp", "body"); + } // parents + + void attributes() + { + attributes_misc(); + attributes_class(); + attributes_dir(); + attributes_id(); + attributes_lang(); + } // attributes + + void attributes_misc() + { + attribute("a", "hreflang", "NMTOKEN", ""); + attribute("a", "shape", "CDATA", "rect"); + attribute("a", "tabindex", "NMTOKEN", ""); + attribute("applet", "align", "NMTOKEN", ""); + attribute("area", "nohref", "BOOLEAN", ""); + attribute("area", "shape", "CDATA", "rect"); + attribute("area", "tabindex", "NMTOKEN", ""); + attribute("br", "clear", "CDATA", "none"); + attribute("button", "disabled", "BOOLEAN", ""); + attribute("button", "tabindex", "NMTOKEN", ""); + attribute("button", "type", "CDATA", "submit"); + attribute("caption", "align", "NMTOKEN", ""); + attribute("col", "align", "NMTOKEN", ""); + attribute("col", "span", "CDATA", "1"); + attribute("col", "valign", "NMTOKEN", ""); + attribute("colgroup", "align", "NMTOKEN", ""); + attribute("colgroup", "span", "CDATA", "1"); + attribute("colgroup", "valign", "NMTOKEN", ""); + attribute("dir", "compact", "BOOLEAN", ""); + attribute("div", "align", "NMTOKEN", ""); + attribute("dl", "compact", "BOOLEAN", ""); + attribute("form", "enctype", "CDATA", "application/x-www-form-urlencoded"); + attribute("form", "method", "CDATA", "get"); + attribute("frame", "frameborder", "CDATA", "1"); + attribute("frame", "noresize", "BOOLEAN", ""); + attribute("frame", "scrolling", "CDATA", "auto"); + attribute("h1", "align", "NMTOKEN", ""); + attribute("h2", "align", "NMTOKEN", ""); + attribute("h3", "align", "NMTOKEN", ""); + attribute("h4", "align", "NMTOKEN", ""); + attribute("h5", "align", "NMTOKEN", ""); + attribute("h6", "align", "NMTOKEN", ""); + attribute("hr", "align", "NMTOKEN", ""); + attribute("hr", "noshade", "BOOLEAN", ""); + attribute("iframe", "align", "NMTOKEN", ""); + attribute("iframe", "frameborder", "CDATA", "1"); + attribute("iframe", "scrolling", "CDATA", "auto"); + 
attribute("img", "align", "NMTOKEN", ""); + attribute("img", "ismap", "BOOLEAN", ""); + attribute("input", "align", "NMTOKEN", ""); + attribute("input", "checked", "BOOLEAN", ""); + attribute("input", "disabled", "BOOLEAN", ""); + attribute("input", "ismap", "BOOLEAN", ""); + attribute("input", "maxlength", "NMTOKEN", ""); + attribute("input", "readonly", "BOOLEAN", ""); + attribute("input", "tabindex", "NMTOKEN", ""); + attribute("input", "type", "CDATA", "text"); + attribute("label", "for", "IDREF", ""); + attribute("legend", "align", "NMTOKEN", ""); + attribute("li", "value", "NMTOKEN", ""); + attribute("link", "hreflang", "NMTOKEN", ""); + attribute("marquee", "width", "NMTOKEN", ""); + attribute("menu", "compact", "BOOLEAN", ""); + attribute("meta", "http-equiv", "NMTOKEN", ""); + attribute("meta", "name", "NMTOKEN", ""); + attribute("object", "align", "NMTOKEN", ""); + attribute("object", "declare", "BOOLEAN", ""); + attribute("object", "tabindex", "NMTOKEN", ""); + attribute("ol", "compact", "BOOLEAN", ""); + attribute("ol", "start", "NMTOKEN", ""); + attribute("optgroup", "disabled", "BOOLEAN", ""); + attribute("option", "disabled", "BOOLEAN", ""); + attribute("option", "selected", "BOOLEAN", ""); + attribute("p", "align", "NMTOKEN", ""); + attribute("param", "valuetype", "CDATA", "data"); + attribute("pre", "width", "NMTOKEN", ""); + attribute("rt", "rbspan", "CDATA", "1"); + attribute("script", "defer", "BOOLEAN", ""); + attribute("select", "disabled", "BOOLEAN", ""); + attribute("select", "multiple", "BOOLEAN", ""); + attribute("select", "size", "NMTOKEN", ""); + attribute("select", "tabindex", "NMTOKEN", ""); + attribute("table", "align", "NMTOKEN", ""); + attribute("table", "frame", "NMTOKEN", ""); + attribute("table", "rules", "NMTOKEN", ""); + attribute("tbody", "align", "NMTOKEN", ""); + attribute("tbody", "valign", "NMTOKEN", ""); + attribute("td", "align", "NMTOKEN", ""); + attribute("td", "colspan", "CDATA", "1"); + attribute("td", "headers", 
"IDREFS", ""); + attribute("td", "nowrap", "BOOLEAN", ""); + attribute("td", "rowspan", "CDATA", "1"); + attribute("td", "scope", "NMTOKEN", ""); + attribute("td", "valign", "NMTOKEN", ""); + attribute("textarea", "cols", "NMTOKEN", ""); + attribute("textarea", "disabled", "BOOLEAN", ""); + attribute("textarea", "readonly", "BOOLEAN", ""); + attribute("textarea", "rows", "NMTOKEN", ""); + attribute("textarea", "tabindex", "NMTOKEN", ""); + attribute("tfoot", "align", "NMTOKEN", ""); + attribute("tfoot", "valign", "NMTOKEN", ""); + attribute("th", "align", "NMTOKEN", ""); + attribute("th", "colspan", "CDATA", "1"); + attribute("th", "headers", "IDREFS", ""); + attribute("th", "nowrap", "BOOLEAN", ""); + attribute("th", "rowspan", "CDATA", "1"); + attribute("th", "scope", "NMTOKEN", ""); + attribute("th", "valign", "NMTOKEN", ""); + attribute("thead", "align", "NMTOKEN", ""); + attribute("thead", "valign", "NMTOKEN", ""); + attribute("tr", "align", "NMTOKEN", ""); + attribute("tr", "valign", "NMTOKEN", ""); + attribute("ul", "compact", "BOOLEAN", ""); + attribute("ul", "type", "NMTOKEN", ""); + attribute("xmp", "width", "NMTOKEN", ""); + } // attributes_misc + + void attributes_class() + { + attribute("a", "class", "NMTOKEN", ""); + attribute("abbr", "class", "NMTOKEN", ""); + attribute("acronym", "class", "NMTOKEN", ""); + attribute("address", "class", "NMTOKEN", ""); + attribute("applet", "class", "NMTOKEN", ""); + attribute("area", "class", "NMTOKEN", ""); + attribute("b", "class", "NMTOKEN", ""); + attribute("base", "class", "NMTOKEN", ""); + attribute("basefont", "class", "NMTOKEN", ""); + attribute("bdo", "class", "NMTOKEN", ""); + attribute("bgsound", "class", "NMTOKEN", ""); + attribute("big", "class", "NMTOKEN", ""); + attribute("blink", "class", "NMTOKEN", ""); + attribute("blockquote", "class", "NMTOKEN", ""); + attribute("body", "class", "NMTOKEN", ""); + attribute("br", "class", "NMTOKEN", ""); + attribute("button", "class", "NMTOKEN", ""); + 
attribute("canvas", "class", "NMTOKEN", ""); + attribute("caption", "class", "NMTOKEN", ""); + attribute("center", "class", "NMTOKEN", ""); + attribute("cite", "class", "NMTOKEN", ""); + attribute("code", "class", "NMTOKEN", ""); + attribute("col", "class", "NMTOKEN", ""); + attribute("colgroup", "class", "NMTOKEN", ""); + attribute("comment", "class", "NMTOKEN", ""); + attribute("dd", "class", "NMTOKEN", ""); + attribute("del", "class", "NMTOKEN", ""); + attribute("dfn", "class", "NMTOKEN", ""); + attribute("dir", "class", "NMTOKEN", ""); + attribute("div", "class", "NMTOKEN", ""); + attribute("dl", "class", "NMTOKEN", ""); + attribute("dt", "class", "NMTOKEN", ""); + attribute("em", "class", "NMTOKEN", ""); + attribute("fieldset", "class", "NMTOKEN", ""); + attribute("font", "class", "NMTOKEN", ""); + attribute("form", "class", "NMTOKEN", ""); + attribute("frame", "class", "NMTOKEN", ""); + attribute("frameset", "class", "NMTOKEN", ""); + attribute("h1", "class", "NMTOKEN", ""); + attribute("h2", "class", "NMTOKEN", ""); + attribute("h3", "class", "NMTOKEN", ""); + attribute("h4", "class", "NMTOKEN", ""); + attribute("h5", "class", "NMTOKEN", ""); + attribute("h6", "class", "NMTOKEN", ""); + attribute("head", "class", "NMTOKEN", ""); + attribute("hr", "class", "NMTOKEN", ""); + attribute("html", "class", "NMTOKEN", ""); + attribute("i", "class", "NMTOKEN", ""); + attribute("iframe", "class", "NMTOKEN", ""); + attribute("img", "class", "NMTOKEN", ""); + attribute("input", "class", "NMTOKEN", ""); + attribute("ins", "class", "NMTOKEN", ""); + attribute("isindex", "class", "NMTOKEN", ""); + attribute("kbd", "class", "NMTOKEN", ""); + attribute("label", "class", "NMTOKEN", ""); + attribute("legend", "class", "NMTOKEN", ""); + attribute("li", "class", "NMTOKEN", ""); + attribute("link", "class", "NMTOKEN", ""); + attribute("listing", "class", "NMTOKEN", ""); + attribute("map", "class", "NMTOKEN", ""); + attribute("marquee", "class", "NMTOKEN", ""); + attribute("menu", 
"class", "NMTOKEN", ""); + attribute("meta", "class", "NMTOKEN", ""); + attribute("nobr", "class", "NMTOKEN", ""); + attribute("noframes", "class", "NMTOKEN", ""); + attribute("noscript", "class", "NMTOKEN", ""); + attribute("object", "class", "NMTOKEN", ""); + attribute("ol", "class", "NMTOKEN", ""); + attribute("optgroup", "class", "NMTOKEN", ""); + attribute("option", "class", "NMTOKEN", ""); + attribute("p", "class", "NMTOKEN", ""); + attribute("param", "class", "NMTOKEN", ""); + attribute("pre", "class", "NMTOKEN", ""); + attribute("q", "class", "NMTOKEN", ""); + attribute("rb", "class", "NMTOKEN", ""); + attribute("rbc", "class", "NMTOKEN", ""); + attribute("rp", "class", "NMTOKEN", ""); + attribute("rt", "class", "NMTOKEN", ""); + attribute("rtc", "class", "NMTOKEN", ""); + attribute("ruby", "class", "NMTOKEN", ""); + attribute("s", "class", "NMTOKEN", ""); + attribute("samp", "class", "NMTOKEN", ""); + attribute("script", "class", "NMTOKEN", ""); + attribute("select", "class", "NMTOKEN", ""); + attribute("small", "class", "NMTOKEN", ""); + attribute("span", "class", "NMTOKEN", ""); + attribute("strike", "class", "NMTOKEN", ""); + attribute("strong", "class", "NMTOKEN", ""); + attribute("style", "class", "NMTOKEN", ""); + attribute("sub", "class", "NMTOKEN", ""); + attribute("sup", "class", "NMTOKEN", ""); + attribute("table", "class", "NMTOKEN", ""); + attribute("tbody", "class", "NMTOKEN", ""); + attribute("td", "class", "NMTOKEN", ""); + attribute("textarea", "class", "NMTOKEN", ""); + attribute("tfoot", "class", "NMTOKEN", ""); + attribute("th", "class", "NMTOKEN", ""); + attribute("thead", "class", "NMTOKEN", ""); + attribute("title", "class", "NMTOKEN", ""); + attribute("tr", "class", "NMTOKEN", ""); + attribute("tt", "class", "NMTOKEN", ""); + attribute("u", "class", "NMTOKEN", ""); + attribute("ul", "class", "NMTOKEN", ""); + attribute("var", "class", "NMTOKEN", ""); + attribute("wbr", "class", "NMTOKEN", ""); + attribute("xmp", "class", "NMTOKEN", 
""); + } // attributes_class + + void attributes_dir() + { + + attribute("a", "dir", "NMTOKEN", ""); + attribute("abbr", "dir", "NMTOKEN", ""); + attribute("acronym", "dir", "NMTOKEN", ""); + attribute("address", "dir", "NMTOKEN", ""); + attribute("applet", "dir", "NMTOKEN", ""); + attribute("area", "dir", "NMTOKEN", ""); + attribute("b", "dir", "NMTOKEN", ""); + attribute("base", "dir", "NMTOKEN", ""); + attribute("basefont", "dir", "NMTOKEN", ""); + attribute("bdo", "dir", "NMTOKEN", ""); + attribute("bgsound", "dir", "NMTOKEN", ""); + attribute("big", "dir", "NMTOKEN", ""); + attribute("blink", "dir", "NMTOKEN", ""); + attribute("blockquote", "dir", "NMTOKEN", ""); + attribute("body", "dir", "NMTOKEN", ""); + attribute("br", "dir", "NMTOKEN", ""); + attribute("button", "dir", "NMTOKEN", ""); + attribute("canvas", "dir", "NMTOKEN", ""); + attribute("caption", "dir", "NMTOKEN", ""); + attribute("center", "dir", "NMTOKEN", ""); + attribute("cite", "dir", "NMTOKEN", ""); + attribute("code", "dir", "NMTOKEN", ""); + attribute("col", "dir", "NMTOKEN", ""); + attribute("colgroup", "dir", "NMTOKEN", ""); + attribute("comment", "dir", "NMTOKEN", ""); + attribute("dd", "dir", "NMTOKEN", ""); + attribute("del", "dir", "NMTOKEN", ""); + attribute("dfn", "dir", "NMTOKEN", ""); + attribute("dir", "dir", "NMTOKEN", ""); + attribute("div", "dir", "NMTOKEN", ""); + attribute("dl", "dir", "NMTOKEN", ""); + attribute("dt", "dir", "NMTOKEN", ""); + attribute("em", "dir", "NMTOKEN", ""); + attribute("fieldset", "dir", "NMTOKEN", ""); + attribute("font", "dir", "NMTOKEN", ""); + attribute("form", "dir", "NMTOKEN", ""); + attribute("frame", "dir", "NMTOKEN", ""); + attribute("frameset", "dir", "NMTOKEN", ""); + attribute("h1", "dir", "NMTOKEN", ""); + attribute("h2", "dir", "NMTOKEN", ""); + attribute("h3", "dir", "NMTOKEN", ""); + attribute("h4", "dir", "NMTOKEN", ""); + attribute("h5", "dir", "NMTOKEN", ""); + attribute("h6", "dir", "NMTOKEN", ""); + attribute("head", "dir", 
"NMTOKEN", ""); + attribute("hr", "dir", "NMTOKEN", ""); + attribute("html", "dir", "NMTOKEN", ""); + attribute("i", "dir", "NMTOKEN", ""); + attribute("iframe", "dir", "NMTOKEN", ""); + attribute("img", "dir", "NMTOKEN", ""); + attribute("input", "dir", "NMTOKEN", ""); + attribute("ins", "dir", "NMTOKEN", ""); + attribute("isindex", "dir", "NMTOKEN", ""); + attribute("kbd", "dir", "NMTOKEN", ""); + attribute("label", "dir", "NMTOKEN", ""); + attribute("legend", "dir", "NMTOKEN", ""); + attribute("li", "dir", "NMTOKEN", ""); + attribute("link", "dir", "NMTOKEN", ""); + attribute("listing", "dir", "NMTOKEN", ""); + attribute("map", "dir", "NMTOKEN", ""); + attribute("marquee", "dir", "NMTOKEN", ""); + attribute("menu", "dir", "NMTOKEN", ""); + attribute("meta", "dir", "NMTOKEN", ""); + attribute("nobr", "dir", "NMTOKEN", ""); + attribute("noframes", "dir", "NMTOKEN", ""); + attribute("noscript", "dir", "NMTOKEN", ""); + attribute("object", "dir", "NMTOKEN", ""); + attribute("ol", "dir", "NMTOKEN", ""); + attribute("optgroup", "dir", "NMTOKEN", ""); + attribute("option", "dir", "NMTOKEN", ""); + attribute("p", "dir", "NMTOKEN", ""); + attribute("param", "dir", "NMTOKEN", ""); + attribute("pre", "dir", "NMTOKEN", ""); + attribute("q", "dir", "NMTOKEN", ""); + attribute("rb", "dir", "NMTOKEN", ""); + attribute("rbc", "dir", "NMTOKEN", ""); + attribute("rp", "dir", "NMTOKEN", ""); + attribute("rt", "dir", "NMTOKEN", ""); + attribute("rtc", "dir", "NMTOKEN", ""); + attribute("ruby", "dir", "NMTOKEN", ""); + attribute("s", "dir", "NMTOKEN", ""); + attribute("samp", "dir", "NMTOKEN", ""); + attribute("script", "dir", "NMTOKEN", ""); + attribute("select", "dir", "NMTOKEN", ""); + attribute("small", "dir", "NMTOKEN", ""); + attribute("span", "dir", "NMTOKEN", ""); + attribute("strike", "dir", "NMTOKEN", ""); + attribute("strong", "dir", "NMTOKEN", ""); + attribute("style", "dir", "NMTOKEN", ""); + attribute("sub", "dir", "NMTOKEN", ""); + attribute("sup", "dir", "NMTOKEN", 
""); + attribute("table", "dir", "NMTOKEN", ""); + attribute("tbody", "dir", "NMTOKEN", ""); + attribute("td", "dir", "NMTOKEN", ""); + attribute("textarea", "dir", "NMTOKEN", ""); + attribute("tfoot", "dir", "NMTOKEN", ""); + attribute("th", "dir", "NMTOKEN", ""); + attribute("thead", "dir", "NMTOKEN", ""); + attribute("title", "dir", "NMTOKEN", ""); + attribute("tr", "dir", "NMTOKEN", ""); + attribute("tt", "dir", "NMTOKEN", ""); + attribute("u", "dir", "NMTOKEN", ""); + attribute("ul", "dir", "NMTOKEN", ""); + attribute("var", "dir", "NMTOKEN", ""); + attribute("wbr", "dir", "NMTOKEN", ""); + attribute("xmp", "dir", "NMTOKEN", ""); + } // attributes_dir + + void attributes_id() + { + attribute("a", "id", "ID", ""); + attribute("abbr", "id", "ID", ""); + attribute("acronym", "id", "ID", ""); + attribute("address", "id", "ID", ""); + attribute("applet", "id", "ID", ""); + attribute("area", "id", "ID", ""); + attribute("b", "id", "ID", ""); + attribute("base", "id", "ID", ""); + attribute("basefont", "id", "ID", ""); + attribute("bdo", "id", "ID", ""); + attribute("bgsound", "id", "ID", ""); + attribute("big", "id", "ID", ""); + attribute("blink", "id", "ID", ""); + attribute("blockquote", "id", "ID", ""); + attribute("body", "id", "ID", ""); + attribute("br", "id", "ID", ""); + attribute("button", "id", "ID", ""); + attribute("canvas", "id", "ID", ""); + attribute("caption", "id", "ID", ""); + attribute("center", "id", "ID", ""); + attribute("cite", "id", "ID", ""); + attribute("code", "id", "ID", ""); + attribute("col", "id", "ID", ""); + attribute("colgroup", "id", "ID", ""); + attribute("comment", "id", "ID", ""); + attribute("dd", "id", "ID", ""); + attribute("del", "id", "ID", ""); + attribute("dfn", "id", "ID", ""); + attribute("dir", "id", "ID", ""); + attribute("div", "id", "ID", ""); + attribute("dl", "id", "ID", ""); + attribute("dt", "id", "ID", ""); + attribute("em", "id", "ID", ""); + attribute("fieldset", "id", "ID", ""); + attribute("font", "id", 
"ID", ""); + attribute("form", "id", "ID", ""); + attribute("frame", "id", "ID", ""); + attribute("frameset", "id", "ID", ""); + attribute("h1", "id", "ID", ""); + attribute("h2", "id", "ID", ""); + attribute("h3", "id", "ID", ""); + attribute("h4", "id", "ID", ""); + attribute("h5", "id", "ID", ""); + attribute("h6", "id", "ID", ""); + attribute("head", "id", "ID", ""); + attribute("hr", "id", "ID", ""); + attribute("html", "id", "ID", ""); + attribute("i", "id", "ID", ""); + attribute("iframe", "id", "ID", ""); + attribute("img", "id", "ID", ""); + attribute("input", "id", "ID", ""); + attribute("ins", "id", "ID", ""); + attribute("isindex", "id", "ID", ""); + attribute("kbd", "id", "ID", ""); + attribute("label", "id", "ID", ""); + attribute("legend", "id", "ID", ""); + attribute("li", "id", "ID", ""); + attribute("link", "id", "ID", ""); + attribute("listing", "id", "ID", ""); + attribute("map", "id", "ID", ""); + attribute("marquee", "id", "ID", ""); + attribute("menu", "id", "ID", ""); + attribute("meta", "id", "ID", ""); + attribute("nobr", "id", "ID", ""); + attribute("noframes", "id", "ID", ""); + attribute("noscript", "id", "ID", ""); + attribute("object", "id", "ID", ""); + attribute("ol", "id", "ID", ""); + attribute("optgroup", "id", "ID", ""); + attribute("option", "id", "ID", ""); + attribute("p", "id", "ID", ""); + attribute("param", "id", "ID", ""); + attribute("pre", "id", "ID", ""); + attribute("q", "id", "ID", ""); + attribute("rb", "id", "ID", ""); + attribute("rbc", "id", "ID", ""); + attribute("rp", "id", "ID", ""); + attribute("rt", "id", "ID", ""); + attribute("rtc", "id", "ID", ""); + attribute("ruby", "id", "ID", ""); + attribute("s", "id", "ID", ""); + attribute("samp", "id", "ID", ""); + attribute("script", "id", "ID", ""); + attribute("select", "id", "ID", ""); + attribute("small", "id", "ID", ""); + attribute("span", "id", "ID", ""); + attribute("strike", "id", "ID", ""); + attribute("strong", "id", "ID", ""); + attribute("style", 
"id", "ID", ""); + attribute("sub", "id", "ID", ""); + attribute("sup", "id", "ID", ""); + attribute("table", "id", "ID", ""); + attribute("tbody", "id", "ID", ""); + attribute("td", "id", "ID", ""); + attribute("textarea", "id", "ID", ""); + attribute("tfoot", "id", "ID", ""); + attribute("th", "id", "ID", ""); + attribute("thead", "id", "ID", ""); + attribute("title", "id", "ID", ""); + attribute("tr", "id", "ID", ""); + attribute("tt", "id", "ID", ""); + attribute("u", "id", "ID", ""); + attribute("ul", "id", "ID", ""); + attribute("var", "id", "ID", ""); + attribute("wbr", "id", "ID", ""); + attribute("xmp", "id", "ID", ""); + } // attributes_id + + void attributes_lang() + { + attribute("a", "lang", "NMTOKEN", ""); + attribute("abbr", "lang", "NMTOKEN", ""); + attribute("acronym", "lang", "NMTOKEN", ""); + attribute("address", "lang", "NMTOKEN", ""); + attribute("applet", "lang", "NMTOKEN", ""); + attribute("area", "lang", "NMTOKEN", ""); + attribute("b", "lang", "NMTOKEN", ""); + attribute("base", "lang", "NMTOKEN", ""); + attribute("basefont", "lang", "NMTOKEN", ""); + attribute("bdo", "lang", "NMTOKEN", ""); + attribute("bgsound", "lang", "NMTOKEN", ""); + attribute("big", "lang", "NMTOKEN", ""); + attribute("blink", "lang", "NMTOKEN", ""); + attribute("blockquote", "lang", "NMTOKEN", ""); + attribute("body", "lang", "NMTOKEN", ""); + attribute("br", "lang", "NMTOKEN", ""); + attribute("button", "lang", "NMTOKEN", ""); + attribute("canvas", "lang", "NMTOKEN", ""); + attribute("caption", "lang", "NMTOKEN", ""); + attribute("center", "lang", "NMTOKEN", ""); + attribute("cite", "lang", "NMTOKEN", ""); + attribute("code", "lang", "NMTOKEN", ""); + attribute("col", "lang", "NMTOKEN", ""); + attribute("colgroup", "lang", "NMTOKEN", ""); + attribute("comment", "lang", "NMTOKEN", ""); + attribute("dd", "lang", "NMTOKEN", ""); + attribute("del", "lang", "NMTOKEN", ""); + attribute("dfn", "lang", "NMTOKEN", ""); + attribute("dir", "lang", "NMTOKEN", ""); + 
attribute("div", "lang", "NMTOKEN", ""); + attribute("dl", "lang", "NMTOKEN", ""); + attribute("dt", "lang", "NMTOKEN", ""); + attribute("em", "lang", "NMTOKEN", ""); + attribute("fieldset", "lang", "NMTOKEN", ""); + attribute("font", "lang", "NMTOKEN", ""); + attribute("form", "lang", "NMTOKEN", ""); + attribute("frame", "lang", "NMTOKEN", ""); + attribute("frameset", "lang", "NMTOKEN", ""); + attribute("h1", "lang", "NMTOKEN", ""); + attribute("h2", "lang", "NMTOKEN", ""); + attribute("h3", "lang", "NMTOKEN", ""); + attribute("h4", "lang", "NMTOKEN", ""); + attribute("h5", "lang", "NMTOKEN", ""); + attribute("h6", "lang", "NMTOKEN", ""); + attribute("head", "lang", "NMTOKEN", ""); + attribute("hr", "lang", "NMTOKEN", ""); + attribute("html", "lang", "NMTOKEN", ""); + attribute("i", "lang", "NMTOKEN", ""); + attribute("iframe", "lang", "NMTOKEN", ""); + attribute("img", "lang", "NMTOKEN", ""); + attribute("input", "lang", "NMTOKEN", ""); + attribute("ins", "lang", "NMTOKEN", ""); + attribute("isindex", "lang", "NMTOKEN", ""); + attribute("kbd", "lang", "NMTOKEN", ""); + attribute("label", "lang", "NMTOKEN", ""); + attribute("legend", "lang", "NMTOKEN", ""); + attribute("li", "lang", "NMTOKEN", ""); + attribute("link", "lang", "NMTOKEN", ""); + attribute("listing", "lang", "NMTOKEN", ""); + attribute("map", "lang", "NMTOKEN", ""); + attribute("marquee", "lang", "NMTOKEN", ""); + attribute("menu", "lang", "NMTOKEN", ""); + attribute("meta", "lang", "NMTOKEN", ""); + attribute("nobr", "lang", "NMTOKEN", ""); + attribute("noframes", "lang", "NMTOKEN", ""); + attribute("noscript", "lang", "NMTOKEN", ""); + attribute("object", "lang", "NMTOKEN", ""); + attribute("ol", "lang", "NMTOKEN", ""); + attribute("optgroup", "lang", "NMTOKEN", ""); + attribute("option", "lang", "NMTOKEN", ""); + attribute("p", "lang", "NMTOKEN", ""); + attribute("param", "lang", "NMTOKEN", ""); + attribute("pre", "lang", "NMTOKEN", ""); + attribute("q", "lang", "NMTOKEN", ""); + attribute("rb", 
"lang", "NMTOKEN", ""); + attribute("rbc", "lang", "NMTOKEN", ""); + attribute("rp", "lang", "NMTOKEN", ""); + attribute("rt", "lang", "NMTOKEN", ""); + attribute("rtc", "lang", "NMTOKEN", ""); + attribute("ruby", "lang", "NMTOKEN", ""); + attribute("s", "lang", "NMTOKEN", ""); + attribute("samp", "lang", "NMTOKEN", ""); + attribute("script", "lang", "NMTOKEN", ""); + attribute("select", "lang", "NMTOKEN", ""); + attribute("small", "lang", "NMTOKEN", ""); + attribute("span", "lang", "NMTOKEN", ""); + attribute("strike", "lang", "NMTOKEN", ""); + attribute("strong", "lang", "NMTOKEN", ""); + attribute("style", "lang", "NMTOKEN", ""); + attribute("sub", "lang", "NMTOKEN", ""); + attribute("sup", "lang", "NMTOKEN", ""); + attribute("table", "lang", "NMTOKEN", ""); + attribute("tbody", "lang", "NMTOKEN", ""); + attribute("td", "lang", "NMTOKEN", ""); + attribute("textarea", "lang", "NMTOKEN", ""); + attribute("tfoot", "lang", "NMTOKEN", ""); + attribute("th", "lang", "NMTOKEN", ""); + attribute("thead", "lang", "NMTOKEN", ""); + attribute("title", "lang", "NMTOKEN", ""); + attribute("tr", "lang", "NMTOKEN", ""); + attribute("tt", "lang", "NMTOKEN", ""); + attribute("u", "lang", "NMTOKEN", ""); + attribute("ul", "lang", "NMTOKEN", ""); + attribute("var", "lang", "NMTOKEN", ""); + attribute("wbr", "lang", "NMTOKEN", ""); + attribute("xmp", "lang", "NMTOKEN", ""); + } // attributes_lang + + void entities() + { + //entity("aacgr", 0x03AC); + //entity("Aacgr", 0x0386); + entity("aacute", 0x00E1); + entity("Aacute", 0x00C1); + //entity("abreve", 0x0103); + //entity("Abreve", 0x0102); + //entity("ac", 0x223E); + //entity("acd", 0x223F); + entity("acirc", 0x00E2); + entity("Acirc", 0x00C2); + entity("acute", 0x00B4); + //entity("acy", 0x0430); + //entity("Acy", 0x0410); + entity("aelig", 0x00E6); + entity("AElig", 0x00C6); + //entity("af", 0x2061); + //entity("afr", 0x1D51E); + //entity("Afr", 0x1D504); + //entity("agr", 0x03B1); + //entity("Agr", 0x0391); + entity("agrave", 
0x00E0); + entity("Agrave", 0x00C0); + //entity("alefsym", 0x2135); + //entity("aleph", 0x2135); + //entity("alpha", 0x03B1); + //entity("Alpha", 0x0391); + //entity("amacr", 0x0101); + //entity("Amacr", 0x0100); + //entity("amalg", 0x2A3F); + entity("amp", 0x0026); + //entity("and", 0x2227); + //entity("And", 0x2A53); + //entity("andand", 0x2A55); + //entity("andd", 0x2A5C); + //entity("andslope", 0x2A58); + //entity("andv", 0x2A5A); + //entity("ang", 0x2220); + //entity("ange", 0x29A4); + //entity("angle", 0x2220); + //entity("angmsd", 0x2221); + //entity("angmsdaa", 0x29A8); + //entity("angmsdab", 0x29A9); + //entity("angmsdac", 0x29AA); + //entity("angmsdad", 0x29AB); + //entity("angmsdae", 0x29AC); + //entity("angmsdaf", 0x29AD); + //entity("angmsdag", 0x29AE); + //entity("angmsdah", 0x29AF); + //entity("angrt", 0x221F); + //entity("angrtvb", 0x22BE); + //entity("angrtvbd", 0x299D); + //entity("angsph", 0x2222); + //entity("angst", 0x212B); + //entity("angzarr", 0x237C); + //entity("aogon", 0x0105); + //entity("Aogon", 0x0104); + //entity("aopf", 0x1D552); + //entity("Aopf", 0x1D538); + //entity("ap", 0x2248); + //entity("apacir", 0x2A6F); + //entity("ape", 0x224A); + //entity("apE", 0x2A70); + //entity("apid", 0x224B); + entity("apos", 0x0027); + //entity("ApplyFunction", 0x2061); + //entity("approx", 0x2248); + //entity("approxeq", 0x224A); + entity("aring", 0x00E5); + entity("Aring", 0x00C5); + //entity("ascr", 0x1D4B6); + //entity("Ascr", 0x1D49C); + //entity("Assign", 0x2254); + entity("ast", 0x002A); + //entity("asymp", 0x2248); + //entity("asympeq", 0x224D); + entity("atilde", 0x00E3); + entity("Atilde", 0x00C3); + entity("auml", 0x00E4); + entity("Auml", 0x00C4); + //entity("awconint", 0x2233); + //entity("awint", 0x2A11); + //entity("b.alpha", 0x1D6C2); + //entity("b.beta", 0x1D6C3); + //entity("b.chi", 0x1D6D8); + //entity("b.delta", 0x1D6C5); + //entity("b.Delta", 0x1D6AB); + //entity("b.epsi", 0x1D6C6); + //entity("b.epsiv", 0x1D6DC); + 
//entity("b.eta", 0x1D6C8); + //entity("b.gamma", 0x1D6C4); + //entity("b.Gamma", 0x1D6AA); + //entity("b.gammad", 0x1D7CB); + //entity("b.Gammad", 0x1D7CA); + //entity("b.iota", 0x1D6CA); + //entity("b.kappa", 0x1D6CB); + //entity("b.kappav", 0x1D6DE); + //entity("b.lambda", 0x1D6CC); + //entity("b.Lambda", 0x1D6B2); + //entity("b.mu", 0x1D6CD); + //entity("b.nu", 0x1D6CE); + //entity("b.omega", 0x1D6DA); + //entity("b.Omega", 0x1D6C0); + //entity("b.phi", 0x1D6D7); + //entity("b.Phi", 0x1D6BD); + //entity("b.phiv", 0x1D6DF); + //entity("b.pi", 0x1D6D1); + //entity("b.Pi", 0x1D6B7); + //entity("b.piv", 0x1D6E1); + //entity("b.psi", 0x1D6D9); + //entity("b.Psi", 0x1D6BF); + //entity("b.rho", 0x1D6D2); + //entity("b.rhov", 0x1D6E0); + //entity("b.sigma", 0x1D6D4); + //entity("b.Sigma", 0x1D6BA); + //entity("b.sigmav", 0x1D6D3); + //entity("b.tau", 0x1D6D5); + //entity("b.Theta", 0x1D6AF); + //entity("b.thetas", 0x1D6C9); + //entity("b.thetav", 0x1D6DD); + //entity("b.upsi", 0x1D6D6); + //entity("b.Upsi", 0x1D6BC); + //entity("b.xi", 0x1D6CF); + //entity("b.Xi", 0x1D6B5); + //entity("b.zeta", 0x1D6C7); + //entity("backcong", 0x224C); + //entity("backepsilon", 0x03F6); + //entity("backprime", 0x2035); + //entity("backsim", 0x223D); + //entity("backsimeq", 0x22CD); + //entity("Backslash", 0x2216); + //entity("Barv", 0x2AE7); + //entity("barvee", 0x22BD); + //entity("barwed", 0x2305); + //entity("Barwed", 0x2306); + //entity("barwedge", 0x2305); + //entity("bbrk", 0x23B5); + //entity("bbrktbrk", 0x23B6); + //entity("bcong", 0x224C); + //entity("bcy", 0x0431); + //entity("Bcy", 0x0411); + //entity("bdquo", 0x201E); + //entity("becaus", 0x2235); + //entity("because", 0x2235); + //entity("bemptyv", 0x29B0); + //entity("bepsi", 0x03F6); + //entity("bernou", 0x212C); + //entity("Bernoullis", 0x212C); + //entity("beta", 0x03B2); + //entity("Beta", 0x0392); + //entity("beth", 0x2136); + //entity("between", 0x226C); + //entity("bfr", 0x1D51F); + //entity("Bfr", 0x1D505); + 
//entity("bgr", 0x03B2); + //entity("Bgr", 0x0392); + //entity("bigcap", 0x22C2); + //entity("bigcirc", 0x25EF); + //entity("bigcup", 0x22C3); + //entity("bigodot", 0x2A00); + //entity("bigoplus", 0x2A01); + //entity("bigotimes", 0x2A02); + //entity("bigsqcup", 0x2A06); + //entity("bigstar", 0x2605); + //entity("bigtriangledown", 0x25BD); + //entity("bigtriangleup", 0x25B3); + //entity("biguplus", 0x2A04); + //entity("bigvee", 0x22C1); + //entity("bigwedge", 0x22C0); + //entity("bkarow", 0x290D); + //entity("blacklozenge", 0x29EB); + //entity("blacksquare", 0x25AA); + //entity("blacktriangle", 0x25B4); + //entity("blacktriangledown", 0x25BE); + //entity("blacktriangleleft", 0x25C2); + //entity("blacktriangleright", 0x25B8); + //entity("blank", 0x2423); + //entity("blk12", 0x2592); + //entity("blk14", 0x2591); + //entity("blk34", 0x2593); + //entity("block", 0x2588); + //entity("bnot", 0x2310); + //entity("bNot", 0x2AED); + //entity("bopf", 0x1D553); + //entity("Bopf", 0x1D539); + //entity("bot", 0x22A5); + //entity("bottom", 0x22A5); + //entity("bowtie", 0x22C8); + //entity("boxbox", 0x29C9); + //entity("boxdl", 0x2510); + //entity("boxdL", 0x2555); + //entity("boxDl", 0x2556); + //entity("boxDL", 0x2557); + //entity("boxdr", 0x250C); + //entity("boxdR", 0x2552); + //entity("boxDr", 0x2553); + //entity("boxDR", 0x2554); + //entity("boxh", 0x2500); + //entity("boxH", 0x2550); + //entity("boxhd", 0x252C); + //entity("boxhD", 0x2565); + //entity("boxHd", 0x2564); + //entity("boxHD", 0x2566); + //entity("boxhu", 0x2534); + //entity("boxhU", 0x2568); + //entity("boxHu", 0x2567); + //entity("boxHU", 0x2569); + //entity("boxminus", 0x229F); + //entity("boxplus", 0x229E); + //entity("boxtimes", 0x22A0); + //entity("boxul", 0x2518); + //entity("boxuL", 0x255B); + //entity("boxUl", 0x255C); + //entity("boxUL", 0x255D); + //entity("boxur", 0x2514); + //entity("boxuR", 0x2558); + //entity("boxUr", 0x2559); + //entity("boxUR", 0x255A); + //entity("boxv", 0x2502); + 
//entity("boxV", 0x2551); + //entity("boxvh", 0x253C); + //entity("boxvH", 0x256A); + //entity("boxVh", 0x256B); + //entity("boxVH", 0x256C); + //entity("boxvl", 0x2524); + //entity("boxvL", 0x2561); + //entity("boxVl", 0x2562); + //entity("boxVL", 0x2563); + //entity("boxvr", 0x251C); + //entity("boxvR", 0x255E); + //entity("boxVr", 0x255F); + //entity("boxVR", 0x2560); + //entity("bprime", 0x2035); + //entity("breve", 0x02D8); + entity("brvbar", 0x00A6); + //entity("bscr", 0x1D4B7); + //entity("Bscr", 0x212C); + //entity("bsemi", 0x204F); + //entity("bsim", 0x223D); + //entity("bsime", 0x22CD); + entity("bsol", 0x005C); + //entity("bsolb", 0x29C5); + //entity("bull", 0x2022); + //entity("bullet", 0x2022); + //entity("bump", 0x224E); + //entity("bumpe", 0x224F); + //entity("bumpE", 0x2AAE); + //entity("bumpeq", 0x224F); + //entity("Bumpeq", 0x224E); + //entity("cacute", 0x0107); + //entity("Cacute", 0x0106); + //entity("cap", 0x2229); + //entity("Cap", 0x22D2); + //entity("capand", 0x2A44); + //entity("capbrcup", 0x2A49); + //entity("capcap", 0x2A4B); + //entity("capcup", 0x2A47); + //entity("capdot", 0x2A40); + //entity("CapitalDifferentialD", 0x2145); + //entity("caret", 0x2041); + //entity("caron", 0x02C7); + //entity("Cayleys", 0x212D); + //entity("ccaps", 0x2A4D); + //entity("ccaron", 0x010D); + //entity("Ccaron", 0x010C); + entity("ccedil", 0x00E7); + entity("Ccedil", 0x00C7); + //entity("ccirc", 0x0109); + //entity("Ccirc", 0x0108); + //entity("Cconint", 0x2230); + //entity("ccups", 0x2A4C); + //entity("ccupssm", 0x2A50); + //entity("cdot", 0x010B); + //entity("Cdot", 0x010A); + entity("cedil", 0x00B8); + entity("Cedilla", 0x00B8); + //entity("cemptyv", 0x29B2); + entity("cent", 0x00A2); + entity("centerdot", 0x00B7); + //entity("cfr", 0x1D520); + //entity("Cfr", 0x212D); + //entity("chcy", 0x0447); + //entity("CHcy", 0x0427); + //entity("check", 0x2713); + //entity("checkmark", 0x2713); + //entity("chi", 0x03C7); + //entity("Chi", 0x03A7); + 
//entity("cir", 0x25CB); + //entity("circ", 0x02C6); + //entity("circeq", 0x2257); + //entity("circlearrowleft", 0x21BA); + //entity("circlearrowright", 0x21BB); + //entity("circledast", 0x229B); + //entity("circledcirc", 0x229A); + //entity("circleddash", 0x229D); + //entity("CircleDot", 0x2299); + entity("circledR", 0x00AE); + //entity("circledS", 0x24C8); + //entity("CircleMinus", 0x2296); + //entity("CirclePlus", 0x2295); + //entity("CircleTimes", 0x2297); + //entity("cire", 0x2257); + //entity("cirE", 0x29C3); + //entity("cirfnint", 0x2A10); + //entity("cirmid", 0x2AEF); + //entity("cirscir", 0x29C2); + //entity("ClockwiseContourIntegral", 0x2232); + //entity("CloseCurlyDoubleQuote", 0x201D); + //entity("CloseCurlyQuote", 0x2019); + //entity("clubs", 0x2663); + //entity("clubsuit", 0x2663); + entity("colon", 0x003A); + //entity("Colon", 0x2237); + //entity("colone", 0x2254); + //entity("Colone", 0x2A74); + //entity("coloneq", 0x2254); + entity("comma", 0x002C); + entity("commat", 0x0040); + //entity("comp", 0x2201); + //entity("compfn", 0x2218); + //entity("complement", 0x2201); + //entity("complexes", 0x2102); + //entity("cong", 0x2245); + //entity("congdot", 0x2A6D); + //entity("Congruent", 0x2261); + //entity("conint", 0x222E); + //entity("Conint", 0x222F); + //entity("ContourIntegral", 0x222E); + //entity("copf", 0x1D554); + //entity("Copf", 0x2102); + //entity("coprod", 0x2210); + //entity("Coproduct", 0x2210); + entity("copy", 0x00A9); + //entity("copysr", 0x2117); + //entity("CounterClockwiseContourIntegral", 0x2233); + //entity("crarr", 0x21B5); + //entity("cross", 0x2717); + //entity("Cross", 0x2A2F); + //entity("cscr", 0x1D4B8); + //entity("Cscr", 0x1D49E); + //entity("csub", 0x2ACF); + //entity("csube", 0x2AD1); + //entity("csup", 0x2AD0); + //entity("csupe", 0x2AD2); + //entity("ctdot", 0x22EF); + //entity("cudarrl", 0x2938); + //entity("cudarrr", 0x2935); + //entity("cuepr", 0x22DE); + //entity("cuesc", 0x22DF); + //entity("cularr", 0x21B6); + 
//entity("cularrp", 0x293D); + //entity("cup", 0x222A); + //entity("Cup", 0x22D3); + //entity("cupbrcap", 0x2A48); + //entity("cupcap", 0x2A46); + //entity("CupCap", 0x224D); + //entity("cupcup", 0x2A4A); + //entity("cupdot", 0x228D); + //entity("cupor", 0x2A45); + //entity("curarr", 0x21B7); + //entity("curarrm", 0x293C); + //entity("curlyeqprec", 0x22DE); + //entity("curlyeqsucc", 0x22DF); + //entity("curlyvee", 0x22CE); + //entity("curlywedge", 0x22CF); + entity("curren", 0x00A4); + //entity("curvearrowleft", 0x21B6); + //entity("curvearrowright", 0x21B7); + //entity("cuvee", 0x22CE); + //entity("cuwed", 0x22CF); + //entity("cwconint", 0x2232); + //entity("cwint", 0x2231); + //entity("cylcty", 0x232D); + //entity("dagger", 0x2020); + //entity("Dagger", 0x2021); + //entity("daleth", 0x2138); + //entity("darr", 0x2193); + //entity("dArr", 0x21D3); + //entity("Darr", 0x21A1); + //entity("dash", 0x2010); + //entity("dashv", 0x22A3); + //entity("Dashv", 0x2AE4); + //entity("dbkarow", 0x290F); + //entity("dblac", 0x02DD); + //entity("dcaron", 0x010F); + //entity("Dcaron", 0x010E); + //entity("dcy", 0x0434); + //entity("Dcy", 0x0414); + //entity("dd", 0x2146); + //entity("DD", 0x2145); + //entity("ddagger", 0x2021); + //entity("ddarr", 0x21CA); + //entity("DDotrahd", 0x2911); + //entity("ddotseq", 0x2A77); + entity("deg", 0x00B0); + //entity("Del", 0x2207); + //entity("delta", 0x03B4); + //entity("Delta", 0x0394); + //entity("demptyv", 0x29B1); + //entity("dfisht", 0x297F); + //entity("dfr", 0x1D521); + //entity("Dfr", 0x1D507); + //entity("dgr", 0x03B4); + //entity("Dgr", 0x0394); + //entity("dHar", 0x2965); + //entity("dharl", 0x21C3); + //entity("dharr", 0x21C2); + entity("DiacriticalAcute", 0x00B4); + //entity("DiacriticalDot", 0x02D9); + //entity("DiacriticalDoubleAcute", 0x02DD); + entity("DiacriticalGrave", 0x0060); + //entity("DiacriticalTilde", 0x02DC); + //entity("diam", 0x22C4); + //entity("diamond", 0x22C4); + //entity("diamondsuit", 0x2666); + 
//entity("diams", 0x2666); + entity("die", 0x00A8); + //entity("DifferentialD", 0x2146); + //entity("digamma", 0x03DD); + //entity("disin", 0x22F2); + entity("div", 0x00F7); + entity("divide", 0x00F7); + //entity("divideontimes", 0x22C7); + //entity("divonx", 0x22C7); + //entity("djcy", 0x0452); + //entity("DJcy", 0x0402); + //entity("dlcorn", 0x231E); + //entity("dlcrop", 0x230D); + entity("dollar", 0x0024); + //entity("dopf", 0x1D555); + //entity("Dopf", 0x1D53B); + //entity("dot", 0x02D9); + entity("Dot", 0x00A8); + //entity("doteq", 0x2250); + //entity("doteqdot", 0x2251); + //entity("DotEqual", 0x2250); + //entity("dotminus", 0x2238); + //entity("dotplus", 0x2214); + //entity("dotsquare", 0x22A1); + //entity("doublebarwedge", 0x2306); + //entity("DoubleContourIntegral", 0x222F); + entity("DoubleDot", 0x00A8); + //entity("DoubleDownArrow", 0x21D3); + //entity("DoubleLeftArrow", 0x21D0); + //entity("DoubleLeftRightArrow", 0x21D4); + //entity("DoubleLeftTee", 0x2AE4); + //entity("DoubleLongLeftArrow", 0x27F8); + //entity("DoubleLongLeftRightArrow", 0x27FA); + //entity("DoubleLongRightArrow", 0x27F9); + //entity("DoubleRightArrow", 0x21D2); + //entity("DoubleRightTee", 0x22A8); + //entity("DoubleUpArrow", 0x21D1); + //entity("DoubleUpDownArrow", 0x21D5); + //entity("DoubleVerticalBar", 0x2225); + //entity("downarrow", 0x2193); + //entity("Downarrow", 0x21D3); + //entity("DownArrowBar", 0x2913); + //entity("DownArrowUpArrow", 0x21F5); + //entity("downdownarrows", 0x21CA); + //entity("downharpoonleft", 0x21C3); + //entity("downharpoonright", 0x21C2); + //entity("DownLeftRightVector", 0x2950); + //entity("DownLeftTeeVector", 0x295E); + //entity("DownLeftVector", 0x21BD); + //entity("DownLeftVectorBar", 0x2956); + //entity("DownRightTeeVector", 0x295F); + //entity("DownRightVector", 0x21C1); + //entity("DownRightVectorBar", 0x2957); + //entity("DownTee", 0x22A4); + //entity("DownTeeArrow", 0x21A7); + //entity("drbkarow", 0x2910); + //entity("drcorn", 0x231F); + 
//entity("drcrop", 0x230C); + //entity("dscr", 0x1D4B9); + //entity("Dscr", 0x1D49F); + //entity("dscy", 0x0455); + //entity("DScy", 0x0405); + //entity("dsol", 0x29F6); + //entity("dstrok", 0x0111); + //entity("Dstrok", 0x0110); + //entity("dtdot", 0x22F1); + //entity("dtri", 0x25BF); + //entity("dtrif", 0x25BE); + //entity("duarr", 0x21F5); + //entity("duhar", 0x296F); + //entity("dwangle", 0x29A6); + //entity("dzcy", 0x045F); + //entity("DZcy", 0x040F); + //entity("dzigrarr", 0x27FF); + //entity("eacgr", 0x03AD); + //entity("Eacgr", 0x0388); + entity("eacute", 0x00E9); + entity("Eacute", 0x00C9); + //entity("easter", 0x2A6E); + //entity("ecaron", 0x011B); + //entity("Ecaron", 0x011A); + //entity("ecir", 0x2256); + entity("ecirc", 0x00EA); + entity("Ecirc", 0x00CA); + //entity("ecolon", 0x2255); + //entity("ecy", 0x044D); + //entity("Ecy", 0x042D); + //entity("eDDot", 0x2A77); + //entity("edot", 0x0117); + //entity("eDot", 0x2251); + //entity("Edot", 0x0116); + //entity("ee", 0x2147); + //entity("eeacgr", 0x03AE); + //entity("EEacgr", 0x0389); + //entity("eegr", 0x03B7); + //entity("EEgr", 0x0397); + //entity("efDot", 0x2252); + //entity("efr", 0x1D522); + //entity("Efr", 0x1D508); + //entity("eg", 0x2A9A); + //entity("egr", 0x03B5); + //entity("Egr", 0x0395); + entity("egrave", 0x00E8); + entity("Egrave", 0x00C8); + //entity("egs", 0x2A96); + //entity("egsdot", 0x2A98); + //entity("el", 0x2A99); + //entity("Element", 0x2208); + //entity("elinters", 0x23E7); + //entity("ell", 0x2113); + //entity("els", 0x2A95); + //entity("elsdot", 0x2A97); + //entity("emacr", 0x0113); + //entity("Emacr", 0x0112); + //entity("empty", 0x2205); + //entity("emptyset", 0x2205); + //entity("EmptySmallSquare", 0x25FB); + //entity("emptyv", 0x2205); + //entity("EmptyVerySmallSquare", 0x25AB); + //entity("emsp", 0x2003); + //entity("emsp13", 0x2004); + //entity("emsp14", 0x2005); + //entity("eng", 0x014B); + //entity("ENG", 0x014A); + //entity("ensp", 0x2002); + //entity("eogon", 
0x0119); + //entity("Eogon", 0x0118); + //entity("eopf", 0x1D556); + //entity("Eopf", 0x1D53C); + //entity("epar", 0x22D5); + //entity("eparsl", 0x29E3); + //entity("eplus", 0x2A71); + //entity("epsi", 0x03F5); + //entity("epsilon", 0x03B5); + //entity("Epsilon", 0x0395); + //entity("epsiv", 0x03B5); + //entity("eqcirc", 0x2256); + //entity("eqcolon", 0x2255); + //entity("eqsim", 0x2242); + //entity("eqslantgtr", 0x2A96); + //entity("eqslantless", 0x2A95); + //entity("Equal", 0x2A75); + entity("equals", 0x003D); + //entity("EqualTilde", 0x2242); + //entity("equest", 0x225F); + //entity("Equilibrium", 0x21CC); + //entity("equiv", 0x2261); + //entity("equivDD", 0x2A78); + //entity("eqvparsl", 0x29E5); + //entity("erarr", 0x2971); + //entity("erDot", 0x2253); + //entity("escr", 0x212F); + //entity("Escr", 0x2130); + //entity("esdot", 0x2250); + //entity("esim", 0x2242); + //entity("Esim", 0x2A73); + //entity("eta", 0x03B7); + //entity("Eta", 0x0397); + entity("eth", 0x00F0); + entity("ETH", 0x00D0); + entity("euml", 0x00EB); + entity("Euml", 0x00CB); + //entity("euro", 0x20AC); + entity("excl", 0x0021); + //entity("exist", 0x2203); + //entity("Exists", 0x2203); + //entity("expectation", 0x2130); + //entity("exponentiale", 0x2147); + //entity("fallingdotseq", 0x2252); + //entity("fcy", 0x0444); + //entity("Fcy", 0x0424); + //entity("female", 0x2640); + //entity("ffilig", 0xFB03); + //entity("fflig", 0xFB00); + //entity("ffllig", 0xFB04); + //entity("ffr", 0x1D523); + //entity("Ffr", 0x1D509); + //entity("filig", 0xFB01); + //entity("FilledSmallSquare", 0x25FC); + //entity("FilledVerySmallSquare", 0x25AA); + //entity("flat", 0x266D); + //entity("fllig", 0xFB02); + //entity("fltns", 0x25B1); + //entity("fnof", 0x0192); + //entity("fopf", 0x1D557); + //entity("Fopf", 0x1D53D); + //entity("forall", 0x2200); + //entity("fork", 0x22D4); + //entity("forkv", 0x2AD9); + //entity("Fouriertrf", 0x2131); + //entity("fpartint", 0x2A0D); + entity("frac12", 0x00BD); + 
//entity("frac13", 0x2153); + entity("frac14", 0x00BC); + //entity("frac15", 0x2155); + //entity("frac16", 0x2159); + //entity("frac18", 0x215B); + //entity("frac23", 0x2154); + //entity("frac25", 0x2156); + entity("frac34", 0x00BE); + //entity("frac35", 0x2157); + //entity("frac38", 0x215C); + //entity("frac45", 0x2158); + //entity("frac56", 0x215A); + //entity("frac58", 0x215D); + //entity("frac78", 0x215E); + //entity("frasl", 0x2044); + //entity("frown", 0x2322); + //entity("fscr", 0x1D4BB); + //entity("Fscr", 0x2131); + //entity("gacute", 0x01F5); + //entity("gamma", 0x03B3); + //entity("Gamma", 0x0393); + //entity("gammad", 0x03DD); + //entity("Gammad", 0x03DC); + //entity("gap", 0x2A86); + //entity("gbreve", 0x011F); + //entity("Gbreve", 0x011E); + //entity("Gcedil", 0x0122); + //entity("gcirc", 0x011D); + //entity("Gcirc", 0x011C); + //entity("gcy", 0x0433); + //entity("Gcy", 0x0413); + //entity("gdot", 0x0121); + //entity("Gdot", 0x0120); + //entity("ge", 0x2265); + //entity("gE", 0x2267); + //entity("gel", 0x22DB); + //entity("gEl", 0x2A8C); + //entity("geq", 0x2265); + //entity("geqq", 0x2267); + //entity("geqslant", 0x2A7E); + //entity("ges", 0x2A7E); + //entity("gescc", 0x2AA9); + //entity("gesdot", 0x2A80); + //entity("gesdoto", 0x2A82); + //entity("gesdotol", 0x2A84); + //entity("gesles", 0x2A94); + //entity("gfr", 0x1D524); + //entity("Gfr", 0x1D50A); + //entity("gg", 0x226B); + //entity("Gg", 0x22D9); + //entity("ggg", 0x22D9); + //entity("ggr", 0x03B3); + //entity("Ggr", 0x0393); + //entity("gimel", 0x2137); + //entity("gjcy", 0x0453); + //entity("GJcy", 0x0403); + //entity("gl", 0x2277); + //entity("gla", 0x2AA5); + //entity("glE", 0x2A92); + //entity("glj", 0x2AA4); + //entity("gnap", 0x2A8A); + //entity("gnapprox", 0x2A8A); + //entity("gne", 0x2A88); + //entity("gnE", 0x2269); + //entity("gneq", 0x2A88); + //entity("gneqq", 0x2269); + //entity("gnsim", 0x22E7); + //entity("gopf", 0x1D558); + //entity("Gopf", 0x1D53E); + entity("grave", 0x0060); 
+ //entity("GreaterEqual", 0x2265); + //entity("GreaterEqualLess", 0x22DB); + //entity("GreaterFullEqual", 0x2267); + //entity("GreaterGreater", 0x2AA2); + //entity("GreaterLess", 0x2277); + //entity("GreaterSlantEqual", 0x2A7E); + //entity("GreaterTilde", 0x2273); + //entity("gscr", 0x210A); + //entity("Gscr", 0x1D4A2); + //entity("gsim", 0x2273); + //entity("gsime", 0x2A8E); + //entity("gsiml", 0x2A90); + entity("gt", 0x003E); + //entity("Gt", 0x226B); + //entity("gtcc", 0x2AA7); + //entity("gtcir", 0x2A7A); + //entity("gtdot", 0x22D7); + //entity("gtlPar", 0x2995); + //entity("gtquest", 0x2A7C); + //entity("gtrapprox", 0x2A86); + //entity("gtrarr", 0x2978); + //entity("gtrdot", 0x22D7); + //entity("gtreqless", 0x22DB); + //entity("gtreqqless", 0x2A8C); + //entity("gtrless", 0x2277); + //entity("gtrsim", 0x2273); + //entity("Hacek", 0x02C7); + //entity("hairsp", 0x200A); + entity("half", 0x00BD); + //entity("hamilt", 0x210B); + //entity("hardcy", 0x044A); + //entity("HARDcy", 0x042A); + //entity("harr", 0x2194); + //entity("hArr", 0x21D4); + //entity("harrcir", 0x2948); + //entity("harrw", 0x21AD); + entity("Hat", 0x005E); + //entity("hbar", 0x210F); + //entity("hcirc", 0x0125); + //entity("Hcirc", 0x0124); + //entity("hearts", 0x2665); + //entity("heartsuit", 0x2665); + //entity("hellip", 0x2026); + //entity("hercon", 0x22B9); + //entity("hfr", 0x1D525); + //entity("Hfr", 0x210C); + //entity("HilbertSpace", 0x210B); + //entity("hksearow", 0x2925); + //entity("hkswarow", 0x2926); + //entity("hoarr", 0x21FF); + //entity("homtht", 0x223B); + //entity("hookleftarrow", 0x21A9); + //entity("hookrightarrow", 0x21AA); + //entity("hopf", 0x1D559); + //entity("Hopf", 0x210D); + //entity("horbar", 0x2015); + //entity("HorizontalLine", 0x2500); + //entity("hscr", 0x1D4BD); + //entity("Hscr", 0x210B); + //entity("hslash", 0x210F); + //entity("hstrok", 0x0127); + //entity("Hstrok", 0x0126); + //entity("HumpDownHump", 0x224E); + //entity("HumpEqual", 0x224F); + 
//entity("hybull", 0x2043); + //entity("hyphen", 0x2010); + //entity("iacgr", 0x03AF); + //entity("Iacgr", 0x038A); + entity("iacute", 0x00ED); + entity("Iacute", 0x00CD); + //entity("ic", 0x2063); + entity("icirc", 0x00EE); + entity("Icirc", 0x00CE); + //entity("icy", 0x0438); + //entity("Icy", 0x0418); + //entity("idiagr", 0x0390); + //entity("idigr", 0x03CA); + //entity("Idigr", 0x03AA); + //entity("Idot", 0x0130); + //entity("iecy", 0x0435); + //entity("IEcy", 0x0415); + entity("iexcl", 0x00A1); + //entity("iff", 0x21D4); + //entity("ifr", 0x1D526); + //entity("Ifr", 0x2111); + //entity("igr", 0x03B9); + //entity("Igr", 0x0399); + entity("igrave", 0x00EC); + entity("Igrave", 0x00CC); + //entity("ii", 0x2148); + //entity("iiiint", 0x2A0C); + //entity("iiint", 0x222D); + //entity("iinfin", 0x29DC); + //entity("iiota", 0x2129); + //entity("ijlig", 0x0133); + //entity("IJlig", 0x0132); + //entity("Im", 0x2111); + //entity("imacr", 0x012B); + //entity("Imacr", 0x012A); + //entity("image", 0x2111); + //entity("ImaginaryI", 0x2148); + //entity("imagline", 0x2110); + //entity("imagpart", 0x2111); + //entity("imath", 0x0131); + //entity("imof", 0x22B7); + //entity("imped", 0x01B5); + //entity("Implies", 0x21D2); + //entity("in", 0x2208); + //entity("incare", 0x2105); + //entity("infin", 0x221E); + //entity("infintie", 0x29DD); + //entity("inodot", 0x0131); + //entity("int", 0x222B); + //entity("Int", 0x222C); + //entity("intcal", 0x22BA); + //entity("integers", 0x2124); + //entity("Integral", 0x222B); + //entity("intercal", 0x22BA); + //entity("Intersection", 0x22C2); + //entity("intlarhk", 0x2A17); + //entity("intprod", 0x2A3C); + //entity("InvisibleComma", 0x2063); + //entity("InvisibleTimes", 0x2062); + //entity("iocy", 0x0451); + //entity("IOcy", 0x0401); + //entity("iogon", 0x012F); + //entity("Iogon", 0x012E); + //entity("iopf", 0x1D55A); + //entity("Iopf", 0x1D540); + //entity("iota", 0x03B9); + //entity("Iota", 0x0399); + //entity("iprod", 0x2A3C); + 
entity("iquest", 0x00BF); + //entity("iscr", 0x1D4BE); + //entity("Iscr", 0x2110); + //entity("isin", 0x2208); + //entity("isindot", 0x22F5); + //entity("isinE", 0x22F9); + //entity("isins", 0x22F4); + //entity("isinsv", 0x22F3); + //entity("isinv", 0x2208); + //entity("it", 0x2062); + //entity("itilde", 0x0129); + //entity("Itilde", 0x0128); + //entity("iukcy", 0x0456); + //entity("Iukcy", 0x0406); + entity("iuml", 0x00EF); + entity("Iuml", 0x00CF); + //entity("jcirc", 0x0135); + //entity("Jcirc", 0x0134); + //entity("jcy", 0x0439); + //entity("Jcy", 0x0419); + //entity("jfr", 0x1D527); + //entity("Jfr", 0x1D50D); + //entity("jmath", 0x0237); + //entity("jopf", 0x1D55B); + //entity("Jopf", 0x1D541); + //entity("jscr", 0x1D4BF); + //entity("Jscr", 0x1D4A5); + //entity("jsercy", 0x0458); + //entity("Jsercy", 0x0408); + //entity("jukcy", 0x0454); + //entity("Jukcy", 0x0404); + //entity("kappa", 0x03BA); + //entity("Kappa", 0x039A); + //entity("kappav", 0x03F0); + //entity("kcedil", 0x0137); + //entity("Kcedil", 0x0136); + //entity("kcy", 0x043A); + //entity("Kcy", 0x041A); + //entity("kfr", 0x1D528); + //entity("Kfr", 0x1D50E); + //entity("kgr", 0x03BA); + //entity("Kgr", 0x039A); + //entity("kgreen", 0x0138); + //entity("khcy", 0x0445); + //entity("KHcy", 0x0425); + //entity("khgr", 0x03C7); + //entity("KHgr", 0x03A7); + //entity("kjcy", 0x045C); + //entity("KJcy", 0x040C); + //entity("kopf", 0x1D55C); + //entity("Kopf", 0x1D542); + //entity("kscr", 0x1D4C0); + //entity("Kscr", 0x1D4A6); + //entity("lAarr", 0x21DA); + //entity("lacute", 0x013A); + //entity("Lacute", 0x0139); + //entity("laemptyv", 0x29B4); + //entity("lagran", 0x2112); + //entity("lambda", 0x03BB); + //entity("Lambda", 0x039B); + //entity("lang", 0x2329); + //entity("Lang", 0x27EA); + //entity("langd", 0x2991); + //entity("langle", 0x2329); + //entity("lap", 0x2A85); + //entity("Laplacetrf", 0x2112); + entity("laquo", 0x00AB); + //entity("larr", 0x2190); + //entity("lArr", 0x21D0); + 
//entity("Larr", 0x219E); + //entity("larrb", 0x21E4); + //entity("larrbfs", 0x291F); + //entity("larrfs", 0x291D); + //entity("larrhk", 0x21A9); + //entity("larrlp", 0x21AB); + //entity("larrpl", 0x2939); + //entity("larrsim", 0x2973); + //entity("larrtl", 0x21A2); + //entity("lat", 0x2AAB); + //entity("latail", 0x2919); + //entity("lAtail", 0x291B); + //entity("late", 0x2AAD); + //entity("lbarr", 0x290C); + //entity("lBarr", 0x290E); + //entity("lbbrk", 0x2997); + entity("lbrace", 0x007B); + entity("lbrack", 0x005B); + //entity("lbrke", 0x298B); + //entity("lbrksld", 0x298F); + //entity("lbrkslu", 0x298D); + //entity("lcaron", 0x013E); + //entity("Lcaron", 0x013D); + //entity("lcedil", 0x013C); + //entity("Lcedil", 0x013B); + //entity("lceil", 0x2308); + entity("lcub", 0x007B); + //entity("lcy", 0x043B); + //entity("Lcy", 0x041B); + //entity("ldca", 0x2936); + //entity("ldquo", 0x201C); + //entity("ldquor", 0x201E); + //entity("ldrdhar", 0x2967); + //entity("ldrushar", 0x294B); + //entity("ldsh", 0x21B2); + //entity("le", 0x2264); + //entity("lE", 0x2266); + //entity("LeftAngleBracket", 0x2329); + //entity("leftarrow", 0x2190); + //entity("Leftarrow", 0x21D0); + //entity("LeftArrowBar", 0x21E4); + //entity("LeftArrowRightArrow", 0x21C6); + //entity("leftarrowtail", 0x21A2); + //entity("LeftCeiling", 0x2308); + //entity("LeftDoubleBracket", 0x27E6); + //entity("LeftDownTeeVector", 0x2961); + //entity("LeftDownVector", 0x21C3); + //entity("LeftDownVectorBar", 0x2959); + //entity("LeftFloor", 0x230A); + //entity("leftharpoondown", 0x21BD); + //entity("leftharpoonup", 0x21BC); + //entity("leftleftarrows", 0x21C7); + //entity("leftrightarrow", 0x2194); + //entity("Leftrightarrow", 0x21D4); + //entity("leftrightarrows", 0x21C6); + //entity("leftrightharpoons", 0x21CB); + //entity("leftrightsquigarrow", 0x21AD); + //entity("LeftRightVector", 0x294E); + //entity("LeftTee", 0x22A3); + //entity("LeftTeeArrow", 0x21A4); + //entity("LeftTeeVector", 0x295A); + 
//entity("leftthreetimes", 0x22CB); + //entity("LeftTriangle", 0x22B2); + //entity("LeftTriangleBar", 0x29CF); + //entity("LeftTriangleEqual", 0x22B4); + //entity("LeftUpDownVector", 0x2951); + //entity("LeftUpTeeVector", 0x2960); + //entity("LeftUpVector", 0x21BF); + //entity("LeftUpVectorBar", 0x2958); + //entity("LeftVector", 0x21BC); + //entity("LeftVectorBar", 0x2952); + //entity("leg", 0x22DA); + //entity("lEg", 0x2A8B); + //entity("leq", 0x2264); + //entity("leqq", 0x2266); + //entity("leqslant", 0x2A7D); + //entity("les", 0x2A7D); + //entity("lescc", 0x2AA8); + //entity("lesdot", 0x2A7F); + //entity("lesdoto", 0x2A81); + //entity("lesdotor", 0x2A83); + //entity("lesges", 0x2A93); + //entity("lessapprox", 0x2A85); + //entity("lessdot", 0x22D6); + //entity("lesseqgtr", 0x22DA); + //entity("lesseqqgtr", 0x2A8B); + //entity("LessEqualGreater", 0x22DA); + //entity("LessFullEqual", 0x2266); + //entity("LessGreater", 0x2276); + //entity("lessgtr", 0x2276); + //entity("LessLess", 0x2AA1); + //entity("lesssim", 0x2272); + //entity("LessSlantEqual", 0x2A7D); + //entity("LessTilde", 0x2272); + //entity("lfisht", 0x297C); + //entity("lfloor", 0x230A); + //entity("lfr", 0x1D529); + //entity("Lfr", 0x1D50F); + //entity("lg", 0x2276); + //entity("lgE", 0x2A91); + //entity("lgr", 0x03BB); + //entity("Lgr", 0x039B); + //entity("lHar", 0x2962); + //entity("lhard", 0x21BD); + //entity("lharu", 0x21BC); + //entity("lharul", 0x296A); + //entity("lhblk", 0x2584); + //entity("ljcy", 0x0459); + //entity("LJcy", 0x0409); + //entity("ll", 0x226A); + //entity("Ll", 0x22D8); + //entity("llarr", 0x21C7); + //entity("llcorner", 0x231E); + //entity("Lleftarrow", 0x21DA); + //entity("llhard", 0x296B); + //entity("lltri", 0x25FA); + //entity("lmidot", 0x0140); + //entity("Lmidot", 0x013F); + //entity("lmoust", 0x23B0); + //entity("lmoustache", 0x23B0); + //entity("lnap", 0x2A89); + //entity("lnapprox", 0x2A89); + //entity("lne", 0x2A87); + //entity("lnE", 0x2268); + //entity("lneq", 
0x2A87); + //entity("lneqq", 0x2268); + //entity("lnsim", 0x22E6); + //entity("loang", 0x27EC); + //entity("loarr", 0x21FD); + //entity("lobrk", 0x27E6); + //entity("longleftarrow", 0x27F5); + //entity("Longleftarrow", 0x27F8); + //entity("longleftrightarrow", 0x27F7); + //entity("Longleftrightarrow", 0x27FA); + //entity("longmapsto", 0x27FC); + //entity("longrightarrow", 0x27F6); + //entity("Longrightarrow", 0x27F9); + //entity("looparrowleft", 0x21AB); + //entity("looparrowright", 0x21AC); + //entity("lopar", 0x2985); + //entity("lopf", 0x1D55D); + //entity("Lopf", 0x1D543); + //entity("loplus", 0x2A2D); + //entity("lotimes", 0x2A34); + //entity("lowast", 0x2217); + entity("lowbar", 0x005F); + //entity("LowerLeftArrow", 0x2199); + //entity("LowerRightArrow", 0x2198); + //entity("loz", 0x25CA); + //entity("lozenge", 0x25CA); + //entity("lozf", 0x29EB); + entity("lpar", 0x0028); + //entity("lparlt", 0x2993); + //entity("lrarr", 0x21C6); + //entity("lrcorner", 0x231F); + //entity("lrhar", 0x21CB); + //entity("lrhard", 0x296D); + //entity("lrm", 0x200E); + //entity("lrtri", 0x22BF); + //entity("lsaquo", 0x2039); + //entity("lscr", 0x1D4C1); + //entity("Lscr", 0x2112); + //entity("lsh", 0x21B0); + //entity("lsim", 0x2272); + //entity("lsime", 0x2A8D); + //entity("lsimg", 0x2A8F); + entity("lsqb", 0x005B); + //entity("lsquo", 0x2018); + //entity("lsquor", 0x201A); + //entity("lstrok", 0x0142); + //entity("Lstrok", 0x0141); + entity("lt", 0x003C); + //entity("Lt", 0x226A); + //entity("ltcc", 0x2AA6); + //entity("ltcir", 0x2A79); + //entity("ltdot", 0x22D6); + //entity("lthree", 0x22CB); + //entity("ltimes", 0x22C9); + //entity("ltlarr", 0x2976); + //entity("ltquest", 0x2A7B); + //entity("ltri", 0x25C3); + //entity("ltrie", 0x22B4); + //entity("ltrif", 0x25C2); + //entity("ltrPar", 0x2996); + //entity("lurdshar", 0x294A); + //entity("luruhar", 0x2966); + entity("macr", 0x00AF); + //entity("male", 0x2642); + //entity("malt", 0x2720); + //entity("maltese", 0x2720); + 
//entity("map", 0x21A6); + //entity("Map", 0x2905); + //entity("mapsto", 0x21A6); + //entity("mapstodown", 0x21A7); + //entity("mapstoleft", 0x21A4); + //entity("mapstoup", 0x21A5); + //entity("marker", 0x25AE); + //entity("mcomma", 0x2A29); + //entity("mcy", 0x043C); + //entity("Mcy", 0x041C); + //entity("mdash", 0x2014); + //entity("mDDot", 0x223A); + //entity("measuredangle", 0x2221); + //entity("MediumSpace", 0x205F); + //entity("Mellintrf", 0x2133); + //entity("mfr", 0x1D52A); + //entity("Mfr", 0x1D510); + //entity("mgr", 0x03BC); + //entity("Mgr", 0x039C); + //entity("mho", 0x2127); + entity("micro", 0x00B5); + //entity("mid", 0x2223); + //entity("midast", 0x002A); + //entity("midcir", 0x2AF0); + entity("middot", 0x00B7); + //entity("minus", 0x2212); + //entity("minusb", 0x229F); + //entity("minusd", 0x2238); + //entity("minusdu", 0x2A2A); + //entity("MinusPlus", 0x2213); + //entity("mlcp", 0x2ADB); + //entity("mldr", 0x2026); + //entity("mnplus", 0x2213); + //entity("models", 0x22A7); + //entity("mopf", 0x1D55E); + //entity("Mopf", 0x1D544); + //entity("mp", 0x2213); + //entity("mscr", 0x1D4C2); + //entity("Mscr", 0x2133); + //entity("mstpos", 0x223E); + //entity("mu", 0x03BC); + //entity("Mu", 0x039C); + //entity("multimap", 0x22B8); + //entity("mumap", 0x22B8); + //entity("nabla", 0x2207); + //entity("nacute", 0x0144); + //entity("Nacute", 0x0143); + //entity("nap", 0x2249); + //entity("napos", 0x0149); + //entity("napprox", 0x2249); + //entity("natur", 0x266E); + //entity("natural", 0x266E); + //entity("naturals", 0x2115); + entity("nbsp", 0x00A0); + //entity("ncap", 0x2A43); + //entity("ncaron", 0x0148); + //entity("Ncaron", 0x0147); + //entity("ncedil", 0x0146); + //entity("Ncedil", 0x0145); + //entity("ncong", 0x2247); + //entity("ncup", 0x2A42); + //entity("ncy", 0x043D); + //entity("Ncy", 0x041D); + //entity("ndash", 0x2013); + //entity("ne", 0x2260); + //entity("nearhk", 0x2924); + //entity("nearr", 0x2197); + //entity("neArr", 0x21D7); + 
//entity("nearrow", 0x2197); + //entity("NegativeMediumSpace", 0x200B); + //entity("NegativeThickSpace", 0x200B); + //entity("NegativeThinSpace", 0x200B); + //entity("NegativeVeryThinSpace", 0x200B); + //entity("nequiv", 0x2262); + //entity("nesear", 0x2928); + //entity("NestedGreaterGreater", 0x226B); + //entity("NestedLessLess", 0x226A); + entity("NewLine", 0x000A); + //entity("nexist", 0x2204); + //entity("nexists", 0x2204); + //entity("nfr", 0x1D52B); + //entity("Nfr", 0x1D511); + //entity("nge", 0x2271); + //entity("ngeq", 0x2271); + //entity("ngr", 0x03BD); + //entity("Ngr", 0x039D); + //entity("ngsim", 0x2275); + //entity("ngt", 0x226F); + //entity("ngtr", 0x226F); + //entity("nharr", 0x21AE); + //entity("nhArr", 0x21CE); + //entity("nhpar", 0x2AF2); + //entity("ni", 0x220B); + //entity("nis", 0x22FC); + //entity("nisd", 0x22FA); + //entity("niv", 0x220B); + //entity("njcy", 0x045A); + //entity("NJcy", 0x040A); + //entity("nlarr", 0x219A); + //entity("nlArr", 0x21CD); + //entity("nldr", 0x2025); + //entity("nle", 0x2270); + //entity("nleftarrow", 0x219A); + //entity("nLeftarrow", 0x21CD); + //entity("nleftrightarrow", 0x21AE); + //entity("nLeftrightarrow", 0x21CE); + //entity("nleq", 0x2270); + //entity("nless", 0x226E); + //entity("nlsim", 0x2274); + //entity("nlt", 0x226E); + //entity("nltri", 0x22EA); + //entity("nltrie", 0x22EC); + //entity("nmid", 0x2224); + //entity("NoBreak", 0x2060); + entity("NonBreakingSpace", 0x00A0); + //entity("nopf", 0x1D55F); + //entity("Nopf", 0x2115); + entity("not", 0x00AC); + //entity("Not", 0x2AEC); + //entity("NotCongruent", 0x2262); + //entity("NotCupCap", 0x226D); + //entity("NotDoubleVerticalBar", 0x2226); + //entity("NotElement", 0x2209); + //entity("NotEqual", 0x2260); + //entity("NotExists", 0x2204); + //entity("NotGreater", 0x226F); + //entity("NotGreaterEqual", 0x2271); + //entity("NotGreaterLess", 0x2279); + //entity("NotGreaterTilde", 0x2275); + //entity("notin", 0x2209); + //entity("notinva", 0x2209); + 
//entity("notinvb", 0x22F7); + //entity("notinvc", 0x22F6); + //entity("NotLeftTriangle", 0x22EA); + //entity("NotLeftTriangleEqual", 0x22EC); + //entity("NotLess", 0x226E); + //entity("NotLessEqual", 0x2270); + //entity("NotLessGreater", 0x2278); + //entity("NotLessTilde", 0x2274); + //entity("notni", 0x220C); + //entity("notniva", 0x220C); + //entity("notnivb", 0x22FE); + //entity("notnivc", 0x22FD); + //entity("NotPrecedes", 0x2280); + //entity("NotPrecedesSlantEqual", 0x22E0); + //entity("NotReverseElement", 0x220C); + //entity("NotRightTriangle", 0x22EB); + //entity("NotRightTriangleEqual", 0x22ED); + //entity("NotSquareSubsetEqual", 0x22E2); + //entity("NotSquareSupersetEqual", 0x22E3); + //entity("NotSubsetEqual", 0x2288); + //entity("NotSucceeds", 0x2281); + //entity("NotSucceedsSlantEqual", 0x22E1); + //entity("NotSupersetEqual", 0x2289); + //entity("NotTilde", 0x2241); + //entity("NotTildeEqual", 0x2244); + //entity("NotTildeFullEqual", 0x2247); + //entity("NotTildeTilde", 0x2249); + //entity("NotVerticalBar", 0x2224); + //entity("npar", 0x2226); + //entity("nparallel", 0x2226); + //entity("npolint", 0x2A14); + //entity("npr", 0x2280); + //entity("nprcue", 0x22E0); + //entity("nprec", 0x2280); + //entity("nrarr", 0x219B); + //entity("nrArr", 0x21CF); + //entity("nrightarrow", 0x219B); + //entity("nRightarrow", 0x21CF); + //entity("nrtri", 0x22EB); + //entity("nrtrie", 0x22ED); + //entity("nsc", 0x2281); + //entity("nsccue", 0x22E1); + //entity("nscr", 0x1D4C3); + //entity("Nscr", 0x1D4A9); + //entity("nshortmid", 0x2224); + //entity("nshortparallel", 0x2226); + //entity("nsim", 0x2241); + //entity("nsime", 0x2244); + //entity("nsimeq", 0x2244); + //entity("nsmid", 0x2224); + //entity("nspar", 0x2226); + //entity("nsqsube", 0x22E2); + //entity("nsqsupe", 0x22E3); + //entity("nsub", 0x2284); + //entity("nsube", 0x2288); + //entity("nsubseteq", 0x2288); + //entity("nsucc", 0x2281); + //entity("nsup", 0x2285); + //entity("nsupe", 0x2289); + 
//entity("nsupseteq", 0x2289); + //entity("ntgl", 0x2279); + entity("ntilde", 0x00F1); + entity("Ntilde", 0x00D1); + //entity("ntlg", 0x2278); + //entity("ntriangleleft", 0x22EA); + //entity("ntrianglelefteq", 0x22EC); + //entity("ntriangleright", 0x22EB); + //entity("ntrianglerighteq", 0x22ED); + //entity("nu", 0x03BD); + //entity("Nu", 0x039D); + entity("num", 0x0023); + //entity("numero", 0x2116); + //entity("numsp", 0x2007); + //entity("nvdash", 0x22AC); + //entity("nvDash", 0x22AD); + //entity("nVdash", 0x22AE); + //entity("nVDash", 0x22AF); + //entity("nvHarr", 0x2904); + //entity("nvinfin", 0x29DE); + //entity("nvlArr", 0x2902); + //entity("nvrArr", 0x2903); + //entity("nwarhk", 0x2923); + //entity("nwarr", 0x2196); + //entity("nwArr", 0x21D6); + //entity("nwarrow", 0x2196); + //entity("nwnear", 0x2927); + //entity("oacgr", 0x03CC); + //entity("Oacgr", 0x038C); + entity("oacute", 0x00F3); + entity("Oacute", 0x00D3); + //entity("oast", 0x229B); + //entity("ocir", 0x229A); + entity("ocirc", 0x00F4); + entity("Ocirc", 0x00D4); + //entity("ocy", 0x043E); + //entity("Ocy", 0x041E); + //entity("odash", 0x229D); + //entity("odblac", 0x0151); + //entity("Odblac", 0x0150); + //entity("odiv", 0x2A38); + //entity("odot", 0x2299); + //entity("odsold", 0x29BC); + //entity("oelig", 0x0153); + //entity("OElig", 0x0152); + //entity("ofcir", 0x29BF); + //entity("ofr", 0x1D52C); + //entity("Ofr", 0x1D512); + //entity("ogon", 0x02DB); + //entity("ogr", 0x03BF); + //entity("Ogr", 0x039F); + entity("ograve", 0x00F2); + entity("Ograve", 0x00D2); + //entity("ogt", 0x29C1); + //entity("ohacgr", 0x03CE); + //entity("OHacgr", 0x038F); + //entity("ohbar", 0x29B5); + //entity("ohgr", 0x03C9); + //entity("OHgr", 0x03A9); + //entity("ohm", 0x2126); + //entity("oint", 0x222E); + //entity("olarr", 0x21BA); + //entity("olcir", 0x29BE); + //entity("olcross", 0x29BB); + //entity("oline", 0x203E); + //entity("olt", 0x29C0); + //entity("omacr", 0x014D); + //entity("Omacr", 0x014C); + 
//entity("omega", 0x03C9); + //entity("Omega", 0x03A9); + //entity("omicron", 0x03BF); + //entity("Omicron", 0x039F); + //entity("omid", 0x29B6); + //entity("ominus", 0x2296); + //entity("oopf", 0x1D560); + //entity("Oopf", 0x1D546); + //entity("opar", 0x29B7); + //entity("OpenCurlyDoubleQuote", 0x201C); + //entity("OpenCurlyQuote", 0x2018); + //entity("operp", 0x29B9); + //entity("oplus", 0x2295); + //entity("or", 0x2228); + //entity("Or", 0x2A54); + //entity("orarr", 0x21BB); + //entity("ord", 0x2A5D); + //entity("order", 0x2134); + //entity("orderof", 0x2134); + entity("ordf", 0x00AA); + entity("ordm", 0x00BA); + //entity("origof", 0x22B6); + //entity("oror", 0x2A56); + //entity("orslope", 0x2A57); + //entity("orv", 0x2A5B); + //entity("oS", 0x24C8); + //entity("oscr", 0x2134); + //entity("Oscr", 0x1D4AA); + entity("oslash", 0x00F8); + entity("Oslash", 0x00D8); + //entity("osol", 0x2298); + entity("otilde", 0x00F5); + entity("Otilde", 0x00D5); + //entity("otimes", 0x2297); + //entity("Otimes", 0x2A37); + //entity("otimesas", 0x2A36); + entity("ouml", 0x00F6); + entity("Ouml", 0x00D6); + //entity("ovbar", 0x233D); + entity("OverBar", 0x00AF); + //entity("OverBrace", 0xFE37); + //entity("OverBracket", 0x23B4); + //entity("OverParenthesis", 0xFE35); + //entity("par", 0x2225); + entity("para", 0x00B6); + //entity("parallel", 0x2225); + //entity("parsim", 0x2AF3); + //entity("parsl", 0x2AFD); + //entity("part", 0x2202); + //entity("PartialD", 0x2202); + //entity("pcy", 0x043F); + //entity("Pcy", 0x041F); + entity("percnt", 0x0025); + entity("period", 0x002E); + //entity("permil", 0x2030); + //entity("perp", 0x22A5); + //entity("pertenk", 0x2031); + //entity("pfr", 0x1D52D); + //entity("Pfr", 0x1D513); + //entity("pgr", 0x03C0); + //entity("Pgr", 0x03A0); + //entity("phgr", 0x03C6); + //entity("PHgr", 0x03A6); + //entity("phi", 0x03D5); + //entity("Phi", 0x03A6); + //entity("phiv", 0x03C6); + //entity("phmmat", 0x2133); + //entity("phone", 0x260E); + //entity("pi", 
0x03C0); + //entity("Pi", 0x03A0); + //entity("pitchfork", 0x22D4); + //entity("piv", 0x03D6); + //entity("planck", 0x210F); + //entity("planckh", 0x210E); + //entity("plankv", 0x210F); + entity("plus", 0x002B); + //entity("plusacir", 0x2A23); + //entity("plusb", 0x229E); + //entity("pluscir", 0x2A22); + //entity("plusdo", 0x2214); + //entity("plusdu", 0x2A25); + //entity("pluse", 0x2A72); + entity("PlusMinus", 0x00B1); + entity("plusmn", 0x00B1); + //entity("plussim", 0x2A26); + //entity("plustwo", 0x2A27); + entity("pm", 0x00B1); + //entity("Poincareplane", 0x210C); + //entity("pointint", 0x2A15); + //entity("popf", 0x1D561); + //entity("Popf", 0x2119); + entity("pound", 0x00A3); + //entity("pr", 0x227A); + //entity("Pr", 0x2ABB); + //entity("prap", 0x2AB7); + //entity("prcue", 0x227C); + //entity("pre", 0x2AAF); + //entity("prE", 0x2AB3); + //entity("prec", 0x227A); + //entity("precapprox", 0x2AB7); + //entity("preccurlyeq", 0x227C); + //entity("Precedes", 0x227A); + //entity("PrecedesEqual", 0x2AAF); + //entity("PrecedesSlantEqual", 0x227C); + //entity("PrecedesTilde", 0x227E); + //entity("preceq", 0x2AAF); + //entity("precnapprox", 0x2AB9); + //entity("precneqq", 0x2AB5); + //entity("precnsim", 0x22E8); + //entity("precsim", 0x227E); + //entity("prime", 0x2032); + //entity("Prime", 0x2033); + //entity("primes", 0x2119); + //entity("prnap", 0x2AB9); + //entity("prnE", 0x2AB5); + //entity("prnsim", 0x22E8); + //entity("prod", 0x220F); + //entity("Product", 0x220F); + //entity("profalar", 0x232E); + //entity("profline", 0x2312); + //entity("profsurf", 0x2313); + //entity("prop", 0x221D); + //entity("Proportion", 0x2237); + //entity("Proportional", 0x221D); + //entity("propto", 0x221D); + //entity("prsim", 0x227E); + //entity("prurel", 0x22B0); + //entity("pscr", 0x1D4C5); + //entity("Pscr", 0x1D4AB); + //entity("psgr", 0x03C8); + //entity("PSgr", 0x03A8); + //entity("psi", 0x03C8); + //entity("Psi", 0x03A8); + //entity("puncsp", 0x2008); + //entity("qfr", 
0x1D52E); + //entity("Qfr", 0x1D514); + //entity("qint", 0x2A0C); + //entity("qopf", 0x1D562); + //entity("Qopf", 0x211A); + //entity("qprime", 0x2057); + //entity("qscr", 0x1D4C6); + //entity("Qscr", 0x1D4AC); + //entity("quaternions", 0x210D); + //entity("quatint", 0x2A16); + entity("quest", 0x003F); + //entity("questeq", 0x225F); + entity("quot", 0x0022); + //entity("rAarr", 0x21DB); + //entity("race", 0x29DA); + //entity("racute", 0x0155); + //entity("Racute", 0x0154); + //entity("radic", 0x221A); + //entity("raemptyv", 0x29B3); + //entity("rang", 0x232A); + //entity("Rang", 0x27EB); + //entity("rangd", 0x2992); + //entity("range", 0x29A5); + //entity("rangle", 0x232A); + entity("raquo", 0x00BB); + //entity("rarr", 0x2192); + //entity("rArr", 0x21D2); + //entity("Rarr", 0x21A0); + //entity("rarrap", 0x2975); + //entity("rarrb", 0x21E5); + //entity("rarrbfs", 0x2920); + //entity("rarrc", 0x2933); + //entity("rarrfs", 0x291E); + //entity("rarrhk", 0x21AA); + //entity("rarrlp", 0x21AC); + //entity("rarrpl", 0x2945); + //entity("rarrsim", 0x2974); + //entity("rarrtl", 0x21A3); + //entity("Rarrtl", 0x2916); + //entity("rarrw", 0x219D); + //entity("ratail", 0x291A); + //entity("rAtail", 0x291C); + //entity("ratio", 0x2236); + //entity("rationals", 0x211A); + //entity("rbarr", 0x290D); + //entity("rBarr", 0x290F); + //entity("RBarr", 0x2910); + //entity("rbbrk", 0x2998); + entity("rbrace", 0x007D); + entity("rbrack", 0x005D); + //entity("rbrke", 0x298C); + //entity("rbrksld", 0x298E); + //entity("rbrkslu", 0x2990); + //entity("rcaron", 0x0159); + //entity("Rcaron", 0x0158); + //entity("rcedil", 0x0157); + //entity("Rcedil", 0x0156); + //entity("rceil", 0x2309); + entity("rcub", 0x007D); + //entity("rcy", 0x0440); + //entity("Rcy", 0x0420); + //entity("rdca", 0x2937); + //entity("rdldhar", 0x2969); + //entity("rdquo", 0x201D); + //entity("rdquor", 0x201D); + //entity("rdsh", 0x21B3); + //entity("Re", 0x211C); + //entity("real", 0x211C); + //entity("realine", 0x211B); + 
//entity("realpart", 0x211C); + //entity("reals", 0x211D); + //entity("rect", 0x25AD); + entity("reg", 0x00AE); + //entity("ReverseElement", 0x220B); + //entity("ReverseEquilibrium", 0x21CB); + //entity("ReverseUpEquilibrium", 0x296F); + //entity("rfisht", 0x297D); + //entity("rfloor", 0x230B); + //entity("rfr", 0x1D52F); + //entity("Rfr", 0x211C); + //entity("rgr", 0x03C1); + //entity("Rgr", 0x03A1); + //entity("rHar", 0x2964); + //entity("rhard", 0x21C1); + //entity("rharu", 0x21C0); + //entity("rharul", 0x296C); + //entity("rho", 0x03C1); + //entity("Rho", 0x03A1); + //entity("rhov", 0x03F1); + //entity("RightAngleBracket", 0x232A); + //entity("rightarrow", 0x2192); + //entity("Rightarrow", 0x21D2); + //entity("RightArrowBar", 0x21E5); + //entity("RightArrowLeftArrow", 0x21C4); + //entity("rightarrowtail", 0x21A3); + //entity("RightCeiling", 0x2309); + //entity("RightDoubleBracket", 0x27E7); + //entity("RightDownTeeVector", 0x295D); + //entity("RightDownVector", 0x21C2); + //entity("RightDownVectorBar", 0x2955); + //entity("RightFloor", 0x230B); + //entity("rightharpoondown", 0x21C1); + //entity("rightharpoonup", 0x21C0); + //entity("rightleftarrows", 0x21C4); + //entity("rightleftharpoons", 0x21CC); + //entity("rightrightarrows", 0x21C9); + //entity("rightsquigarrow", 0x219D); + //entity("RightTee", 0x22A2); + //entity("RightTeeArrow", 0x21A6); + //entity("RightTeeVector", 0x295B); + //entity("rightthreetimes", 0x22CC); + //entity("RightTriangle", 0x22B3); + //entity("RightTriangleBar", 0x29D0); + //entity("RightTriangleEqual", 0x22B5); + //entity("RightUpDownVector", 0x294F); + //entity("RightUpTeeVector", 0x295C); + //entity("RightUpVector", 0x21BE); + //entity("RightUpVectorBar", 0x2954); + //entity("RightVector", 0x21C0); + //entity("RightVectorBar", 0x2953); + //entity("ring", 0x02DA); + //entity("risingdotseq", 0x2253); + //entity("rlarr", 0x21C4); + //entity("rlhar", 0x21CC); + //entity("rlm", 0x200F); + //entity("rmoust", 0x23B1); + 
//entity("rmoustache", 0x23B1); + //entity("rnmid", 0x2AEE); + //entity("roang", 0x27ED); + //entity("roarr", 0x21FE); + //entity("robrk", 0x27E7); + //entity("ropar", 0x2986); + //entity("ropf", 0x1D563); + //entity("Ropf", 0x211D); + //entity("roplus", 0x2A2E); + //entity("rotimes", 0x2A35); + //entity("RoundImplies", 0x2970); + entity("rpar", 0x0029); + //entity("rpargt", 0x2994); + //entity("rppolint", 0x2A12); + //entity("rrarr", 0x21C9); + //entity("Rrightarrow", 0x21DB); + //entity("rsaquo", 0x203A); + //entity("rscr", 0x1D4C7); + //entity("Rscr", 0x211B); + //entity("rsh", 0x21B1); + entity("rsqb", 0x005D); + //entity("rsquo", 0x2019); + //entity("rsquor", 0x2019); + //entity("rthree", 0x22CC); + //entity("rtimes", 0x22CA); + //entity("rtri", 0x25B9); + //entity("rtrie", 0x22B5); + //entity("rtrif", 0x25B8); + //entity("rtriltri", 0x29CE); + //entity("RuleDelayed", 0x29F4); + //entity("ruluhar", 0x2968); + //entity("rx", 0x211E); + //entity("sacute", 0x015B); + //entity("Sacute", 0x015A); + //entity("sbquo", 0x201A); + //entity("sc", 0x227B); + //entity("Sc", 0x2ABC); + //entity("scap", 0x2AB8); + //entity("scaron", 0x0161); + //entity("Scaron", 0x0160); + //entity("sccue", 0x227D); + //entity("sce", 0x2AB0); + //entity("scE", 0x2AB4); + //entity("scedil", 0x015F); + //entity("Scedil", 0x015E); + //entity("scirc", 0x015D); + //entity("Scirc", 0x015C); + //entity("scnap", 0x2ABA); + //entity("scnE", 0x2AB6); + //entity("scnsim", 0x22E9); + //entity("scpolint", 0x2A13); + //entity("scsim", 0x227F); + //entity("scy", 0x0441); + //entity("Scy", 0x0421); + //entity("sdot", 0x22C5); + //entity("sdotb", 0x22A1); + //entity("sdote", 0x2A66); + //entity("searhk", 0x2925); + //entity("searr", 0x2198); + //entity("seArr", 0x21D8); + //entity("searrow", 0x2198); + entity("sect", 0x00A7); + entity("semi", 0x003B); + //entity("seswar", 0x2929); + //entity("setminus", 0x2216); + //entity("setmn", 0x2216); + //entity("sext", 0x2736); + //entity("sfgr", 0x03C2); + 
//entity("sfr", 0x1D530); + //entity("Sfr", 0x1D516); + //entity("sfrown", 0x2322); + //entity("sgr", 0x03C3); + //entity("Sgr", 0x03A3); + //entity("sharp", 0x266F); + //entity("shchcy", 0x0449); + //entity("SHCHcy", 0x0429); + //entity("shcy", 0x0448); + //entity("SHcy", 0x0428); + //entity("ShortDownArrow", 0x2193); + //entity("ShortLeftArrow", 0x2190); + //entity("shortmid", 0x2223); + //entity("shortparallel", 0x2225); + //entity("ShortRightArrow", 0x2192); + //entity("ShortUpArrow", 0x2191); + entity("shy", 0x00AD); + //entity("sigma", 0x03C3); + //entity("Sigma", 0x03A3); + //entity("sigmaf", 0x03C2); + //entity("sigmav", 0x03C2); + //entity("sim", 0x223C); + //entity("simdot", 0x2A6A); + //entity("sime", 0x2243); + //entity("simeq", 0x2243); + //entity("simg", 0x2A9E); + //entity("simgE", 0x2AA0); + //entity("siml", 0x2A9D); + //entity("simlE", 0x2A9F); + //entity("simne", 0x2246); + //entity("simplus", 0x2A24); + //entity("simrarr", 0x2972); + //entity("slarr", 0x2190); + //entity("SmallCircle", 0x2218); + //entity("smallsetminus", 0x2216); + //entity("smashp", 0x2A33); + //entity("smeparsl", 0x29E4); + //entity("smid", 0x2223); + //entity("smile", 0x2323); + //entity("smt", 0x2AAA); + //entity("smte", 0x2AAC); + //entity("softcy", 0x044C); + //entity("SOFTcy", 0x042C); + entity("sol", 0x002F); + //entity("solb", 0x29C4); + //entity("solbar", 0x233F); + //entity("sopf", 0x1D564); + //entity("Sopf", 0x1D54A); + //entity("spades", 0x2660); + //entity("spadesuit", 0x2660); + //entity("spar", 0x2225); + //entity("sqcap", 0x2293); + //entity("sqcup", 0x2294); + //entity("Sqrt", 0x221A); + //entity("sqsub", 0x228F); + //entity("sqsube", 0x2291); + //entity("sqsubset", 0x228F); + //entity("sqsubseteq", 0x2291); + //entity("sqsup", 0x2290); + //entity("sqsupe", 0x2292); + //entity("sqsupset", 0x2290); + //entity("sqsupseteq", 0x2292); + //entity("squ", 0x25A1); + //entity("square", 0x25A1); + //entity("SquareIntersection", 0x2293); + //entity("SquareSubset", 
0x228F); + //entity("SquareSubsetEqual", 0x2291); + //entity("SquareSuperset", 0x2290); + //entity("SquareSupersetEqual", 0x2292); + //entity("SquareUnion", 0x2294); + //entity("squarf", 0x25AA); + //entity("squf", 0x25AA); + //entity("srarr", 0x2192); + //entity("sscr", 0x1D4C8); + //entity("Sscr", 0x1D4AE); + //entity("ssetmn", 0x2216); + //entity("ssmile", 0x2323); + //entity("sstarf", 0x22C6); + //entity("star", 0x2606); + //entity("Star", 0x22C6); + //entity("starf", 0x2605); + //entity("straightepsilon", 0x03F5); + //entity("straightphi", 0x03D5); + entity("strns", 0x00AF); + //entity("sub", 0x2282); + //entity("Sub", 0x22D0); + //entity("subdot", 0x2ABD); + //entity("sube", 0x2286); + //entity("subE", 0x2AC5); + //entity("subedot", 0x2AC3); + //entity("submult", 0x2AC1); + //entity("subne", 0x228A); + //entity("subnE", 0x2ACB); + //entity("subplus", 0x2ABF); + //entity("subrarr", 0x2979); + //entity("subset", 0x2282); + //entity("Subset", 0x22D0); + //entity("subseteq", 0x2286); + //entity("subseteqq", 0x2AC5); + //entity("SubsetEqual", 0x2286); + //entity("subsetneq", 0x228A); + //entity("subsetneqq", 0x2ACB); + //entity("subsim", 0x2AC7); + //entity("subsub", 0x2AD5); + //entity("subsup", 0x2AD3); + //entity("succ", 0x227B); + //entity("succapprox", 0x2AB8); + //entity("succcurlyeq", 0x227D); + //entity("Succeeds", 0x227B); + //entity("SucceedsEqual", 0x2AB0); + //entity("SucceedsSlantEqual", 0x227D); + //entity("SucceedsTilde", 0x227F); + //entity("succeq", 0x2AB0); + //entity("succnapprox", 0x2ABA); + //entity("succneqq", 0x2AB6); + //entity("succnsim", 0x22E9); + //entity("succsim", 0x227F); + //entity("SuchThat", 0x220B); + //entity("sum", 0x2211); + //entity("sung", 0x266A); + //entity("sup", 0x2283); + //entity("Sup", 0x22D1); + entity("sup1", 0x00B9); + entity("sup2", 0x00B2); + entity("sup3", 0x00B3); + //entity("supdot", 0x2ABE); + //entity("supdsub", 0x2AD8); + //entity("supe", 0x2287); + //entity("supE", 0x2AC6); + //entity("supedot", 0x2AC4); + 
//entity("Superset", 0x2283); + //entity("SupersetEqual", 0x2287); + //entity("suphsub", 0x2AD7); + //entity("suplarr", 0x297B); + //entity("supmult", 0x2AC2); + //entity("supne", 0x228B); + //entity("supnE", 0x2ACC); + //entity("supplus", 0x2AC0); + //entity("supset", 0x2283); + //entity("Supset", 0x22D1); + //entity("supseteq", 0x2287); + //entity("supseteqq", 0x2AC6); + //entity("supsetneq", 0x228B); + //entity("supsetneqq", 0x2ACC); + //entity("supsim", 0x2AC8); + //entity("supsub", 0x2AD4); + //entity("supsup", 0x2AD6); + //entity("swarhk", 0x2926); + //entity("swarr", 0x2199); + //entity("swArr", 0x21D9); + //entity("swarrow", 0x2199); + //entity("swnwar", 0x292A); + entity("szlig", 0x00DF); + entity("Tab", 0x0009); + //entity("target", 0x2316); + //entity("tau", 0x03C4); + //entity("Tau", 0x03A4); + //entity("tbrk", 0x23B4); + //entity("tcaron", 0x0165); + //entity("Tcaron", 0x0164); + //entity("tcedil", 0x0163); + //entity("Tcedil", 0x0162); + //entity("tcy", 0x0442); + //entity("Tcy", 0x0422); + //entity("telrec", 0x2315); + //entity("tfr", 0x1D531); + //entity("Tfr", 0x1D517); + //entity("tgr", 0x03C4); + //entity("Tgr", 0x03A4); + //entity("there4", 0x2234); + //entity("therefore", 0x2234); + //entity("theta", 0x03B8); + //entity("Theta", 0x0398); + //entity("thetasym", 0x03D1); + //entity("thetav", 0x03D1); + //entity("thgr", 0x03B8); + //entity("THgr", 0x0398); + //entity("thickapprox", 0x2248); + //entity("thicksim", 0x223C); + //entity("thinsp", 0x2009); + //entity("ThinSpace", 0x2009); + //entity("thkap", 0x2248); + //entity("thksim", 0x223C); + entity("thorn", 0x00FE); + entity("THORN", 0x00DE); + //entity("tilde", 0x02DC); + //entity("Tilde", 0x223C); + //entity("TildeEqual", 0x2243); + //entity("TildeFullEqual", 0x2245); + //entity("TildeTilde", 0x2248); + entity("times", 0x00D7); + //entity("timesb", 0x22A0); + //entity("timesbar", 0x2A31); + //entity("timesd", 0x2A30); + //entity("tint", 0x222D); + //entity("toea", 0x2928); + //entity("top", 
0x22A4); + //entity("topbot", 0x2336); + //entity("topcir", 0x2AF1); + //entity("topf", 0x1D565); + //entity("Topf", 0x1D54B); + //entity("topfork", 0x2ADA); + //entity("tosa", 0x2929); + //entity("tprime", 0x2034); + //entity("trade", 0x2122); + //entity("triangle", 0x25B5); + //entity("triangledown", 0x25BF); + //entity("triangleleft", 0x25C3); + //entity("trianglelefteq", 0x22B4); + //entity("triangleq", 0x225C); + //entity("triangleright", 0x25B9); + //entity("trianglerighteq", 0x22B5); + //entity("tridot", 0x25EC); + //entity("trie", 0x225C); + //entity("triminus", 0x2A3A); + //entity("triplus", 0x2A39); + //entity("trisb", 0x29CD); + //entity("tritime", 0x2A3B); + //entity("trpezium", 0x23E2); + //entity("tscr", 0x1D4C9); + //entity("Tscr", 0x1D4AF); + //entity("tscy", 0x0446); + //entity("TScy", 0x0426); + //entity("tshcy", 0x045B); + //entity("TSHcy", 0x040B); + //entity("tstrok", 0x0167); + //entity("Tstrok", 0x0166); + //entity("twixt", 0x226C); + //entity("twoheadleftarrow", 0x219E); + //entity("twoheadrightarrow", 0x21A0); + //entity("uacgr", 0x03CD); + //entity("Uacgr", 0x038E); + entity("uacute", 0x00FA); + entity("Uacute", 0x00DA); + //entity("uarr", 0x2191); + //entity("uArr", 0x21D1); + //entity("Uarr", 0x219F); + //entity("Uarrocir", 0x2949); + //entity("ubrcy", 0x045E); + //entity("Ubrcy", 0x040E); + //entity("ubreve", 0x016D); + //entity("Ubreve", 0x016C); + entity("ucirc", 0x00FB); + entity("Ucirc", 0x00DB); + //entity("ucy", 0x0443); + //entity("Ucy", 0x0423); + //entity("udarr", 0x21C5); + //entity("udblac", 0x0171); + //entity("Udblac", 0x0170); + //entity("udhar", 0x296E); + //entity("udiagr", 0x03B0); + //entity("udigr", 0x03CB); + //entity("Udigr", 0x03AB); + //entity("ufisht", 0x297E); + //entity("ufr", 0x1D532); + //entity("Ufr", 0x1D518); + //entity("ugr", 0x03C5); + //entity("Ugr", 0x03A5); + entity("ugrave", 0x00F9); + entity("Ugrave", 0x00D9); + //entity("uHar", 0x2963); + //entity("uharl", 0x21BF); + //entity("uharr", 0x21BE); + 
//entity("uhblk", 0x2580); + //entity("ulcorn", 0x231C); + //entity("ulcorner", 0x231C); + //entity("ulcrop", 0x230F); + //entity("ultri", 0x25F8); + //entity("umacr", 0x016B); + //entity("Umacr", 0x016A); + entity("uml", 0x00A8); + //entity("UnderBrace", 0xFE38); + //entity("UnderBracket", 0x23B5); + //entity("UnderParenthesis", 0xFE36); + //entity("Union", 0x22C3); + //entity("UnionPlus", 0x228E); + //entity("uogon", 0x0173); + //entity("Uogon", 0x0172); + //entity("uopf", 0x1D566); + //entity("Uopf", 0x1D54C); + //entity("uparrow", 0x2191); + //entity("Uparrow", 0x21D1); + //entity("UpArrowBar", 0x2912); + //entity("UpArrowDownArrow", 0x21C5); + //entity("updownarrow", 0x2195); + //entity("Updownarrow", 0x21D5); + //entity("UpEquilibrium", 0x296E); + //entity("upharpoonleft", 0x21BF); + //entity("upharpoonright", 0x21BE); + //entity("uplus", 0x228E); + //entity("UpperLeftArrow", 0x2196); + //entity("UpperRightArrow", 0x2197); + //entity("upsi", 0x03C5); + //entity("Upsi", 0x03D2); + //entity("upsih", 0x03D2); + //entity("upsilon", 0x03C5); + //entity("Upsilon", 0x03A5); + //entity("UpTee", 0x22A5); + //entity("UpTeeArrow", 0x21A5); + //entity("upuparrows", 0x21C8); + //entity("urcorn", 0x231D); + //entity("urcorner", 0x231D); + //entity("urcrop", 0x230E); + //entity("uring", 0x016F); + //entity("Uring", 0x016E); + //entity("urtri", 0x25F9); + //entity("uscr", 0x1D4CA); + //entity("Uscr", 0x1D4B0); + //entity("utdot", 0x22F0); + //entity("utilde", 0x0169); + //entity("Utilde", 0x0168); + //entity("utri", 0x25B5); + //entity("utrif", 0x25B4); + //entity("uuarr", 0x21C8); + entity("uuml", 0x00FC); + entity("Uuml", 0x00DC); + //entity("uwangle", 0x29A7); + //entity("vangrt", 0x299C); + //entity("varepsilon", 0x03B5); + //entity("varkappa", 0x03F0); + //entity("varnothing", 0x2205); + //entity("varphi", 0x03C6); + //entity("varpi", 0x03D6); + //entity("varpropto", 0x221D); + //entity("varr", 0x2195); + //entity("vArr", 0x21D5); + //entity("varrho", 0x03F1); + 
//entity("varsigma", 0x03C2); + //entity("vartheta", 0x03D1); + //entity("vartriangleleft", 0x22B2); + //entity("vartriangleright", 0x22B3); + //entity("vBar", 0x2AE8); + //entity("Vbar", 0x2AEB); + //entity("vBarv", 0x2AE9); + //entity("vcy", 0x0432); + //entity("Vcy", 0x0412); + //entity("vdash", 0x22A2); + //entity("vDash", 0x22A8); + //entity("Vdash", 0x22A9); + //entity("VDash", 0x22AB); + //entity("Vdashl", 0x2AE6); + //entity("vee", 0x2228); + //entity("Vee", 0x22C1); + //entity("veebar", 0x22BB); + //entity("veeeq", 0x225A); + //entity("vellip", 0x22EE); + entity("verbar", 0x007C); + //entity("Verbar", 0x2016); + entity("vert", 0x007C); + //entity("Vert", 0x2016); + //entity("VerticalBar", 0x2223); + entity("VerticalLine", 0x007C); + //entity("VerticalSeparator", 0x2758); + //entity("VerticalTilde", 0x2240); + //entity("VeryThinSpace", 0x200A); + //entity("vfr", 0x1D533); + //entity("Vfr", 0x1D519); + //entity("vltri", 0x22B2); + //entity("vopf", 0x1D567); + //entity("Vopf", 0x1D54D); + //entity("vprop", 0x221D); + //entity("vrtri", 0x22B3); + //entity("vscr", 0x1D4CB); + //entity("Vscr", 0x1D4B1); + //entity("Vvdash", 0x22AA); + //entity("vzigzag", 0x299A); + //entity("wcirc", 0x0175); + //entity("Wcirc", 0x0174); + //entity("wedbar", 0x2A5F); + //entity("wedge", 0x2227); + //entity("Wedge", 0x22C0); + //entity("wedgeq", 0x2259); + //entity("weierp", 0x2118); + //entity("wfr", 0x1D534); + //entity("Wfr", 0x1D51A); + //entity("wopf", 0x1D568); + //entity("Wopf", 0x1D54E); + //entity("wp", 0x2118); + //entity("wr", 0x2240); + //entity("wreath", 0x2240); + //entity("wscr", 0x1D4CC); + //entity("Wscr", 0x1D4B2); + //entity("xcap", 0x22C2); + //entity("xcirc", 0x25EF); + //entity("xcup", 0x22C3); + //entity("xdtri", 0x25BD); + //entity("xfr", 0x1D535); + //entity("Xfr", 0x1D51B); + //entity("xgr", 0x03BE); + //entity("Xgr", 0x039E); + //entity("xharr", 0x27F7); + //entity("xhArr", 0x27FA); + //entity("xi", 0x03BE); + //entity("Xi", 0x039E); + //entity("xlarr", 
0x27F5); + //entity("xlArr", 0x27F8); + //entity("xmap", 0x27FC); + //entity("xnis", 0x22FB); + //entity("xodot", 0x2A00); + //entity("xopf", 0x1D569); + //entity("Xopf", 0x1D54F); + //entity("xoplus", 0x2A01); + //entity("xotime", 0x2A02); + //entity("xrarr", 0x27F6); + //entity("xrArr", 0x27F9); + //entity("xscr", 0x1D4CD); + //entity("Xscr", 0x1D4B3); + //entity("xsqcup", 0x2A06); + //entity("xuplus", 0x2A04); + //entity("xutri", 0x25B3); + //entity("xvee", 0x22C1); + //entity("xwedge", 0x22C0); + entity("yacute", 0x00FD); + entity("Yacute", 0x00DD); + //entity("yacy", 0x044F); + //entity("YAcy", 0x042F); + //entity("ycirc", 0x0177); + //entity("Ycirc", 0x0176); + //entity("ycy", 0x044B); + //entity("Ycy", 0x042B); + entity("yen", 0x00A5); + //entity("yfr", 0x1D536); + //entity("Yfr", 0x1D51C); + //entity("yicy", 0x0457); + //entity("YIcy", 0x0407); + //entity("yopf", 0x1D56A); + //entity("Yopf", 0x1D550); + //entity("yscr", 0x1D4CE); + //entity("Yscr", 0x1D4B4); + //entity("yucy", 0x044E); + //entity("YUcy", 0x042E); + entity("yuml", 0x00FF); + //entity("Yuml", 0x0178); + //entity("zacute", 0x017A); + //entity("Zacute", 0x0179); + //entity("zcaron", 0x017E); + //entity("Zcaron", 0x017D); + //entity("zcy", 0x0437); + //entity("Zcy", 0x0417); + //entity("zdot", 0x017C); + //entity("Zdot", 0x017B); + //entity("zeetrf", 0x2128); + //entity("ZeroWidthSpace", 0x200B); + //entity("zeta", 0x03B6); + //entity("Zeta", 0x0396); + //entity("zfr", 0x1D537); + //entity("Zfr", 0x2128); + //entity("zgr", 0x03B6); + //entity("Zgr", 0x0396); + //entity("zhcy", 0x0436); + //entity("ZHcy", 0x0416); + //entity("zigrarr", 0x21DD); + //entity("zopf", 0x1D56B); + //entity("Zopf", 0x2124); + //entity("zscr", 0x1D4CF); + //entity("Zscr", 0x1D4B5); + //entity("zwj", 0x200D); + //entity("zwnj", 0x200C); + } // entities + + HTMLSchema(const HTMLSchema&); + HTMLSchema& operator=(const HTMLSchema&); + bool operator==(const HTMLSchema&) const; +}; // class HTMLSchema + +} // namespace SAX + 
+} // namespace Arabica +#endif + diff --git a/arabica/include/XML/QName.hpp b/arabica/include/XML/QName.hpp new file mode 100644 index 000000000..1e697a12e --- /dev/null +++ b/arabica/include/XML/QName.hpp @@ -0,0 +1,194 @@ +#ifndef ARABICA_XML_QNAME_HPP +#define ARABICA_XML_QNAME_HPP + +#include +#include +#include +#include +#include + +template class QualifiedNameTest; + +namespace Arabica +{ +namespace XML +{ +namespace impl +{ +/* MapMapper: function object that looks a prefix up in a caller-supplied std::map and returns the mapped value, or string_adaptor::empty_string() when the prefix is not in the map. */ +template +class MapMapper +{ + typedef typename std::map string_map; + typedef typename std::map::const_iterator string_map_iterator; +public: + MapMapper(const std::map& namespaces) : namespaces_(namespaces) { } + MapMapper(const MapMapper& rhs) : namespaces_(rhs.namespaces_) { } + + string_type operator()(const string_type& prefix) const + { + string_map_iterator ns = namespaces_.find(prefix); + if(ns == namespaces_.end()) + return string_adaptor::empty_string(); + return ns->second; + } //operator() +/* namespaces_ is a reference, not a copy: the map handed to the constructor must outlive this object. */ +private: + const string_map& namespaces_; + /* comparison and assignment are declared private and are not defined in this file */ + bool operator==(const MapMapper&) const; + MapMapper& operator=(const MapMapper&); +}; // class MapMapper + +} // namespace impl + +template > +class QualifiedName +{ + typedef string_adaptor SA; + +public: + /** + *

This function processes a raw XML 1.0 name in the current + * context by removing the prefix and looking it up among the + * prefixes currently declared. + * + *

If the raw name has a prefix that has not been declared, + * then the return value will be empty.

+ * + *

Note that attribute names are processed differently than + * element names: an unprefixed element name will receive the + * default Namespace (if any), while an unprefixed attribute name + * will not.

+ */ + template + static QualifiedName parseQName(const string_type& rawname, + bool is_attribute, + const UriMapper& mapper) + { + if(!Arabica::XML::is_qname(rawname)) + throw std::runtime_error("Bad qname : '" + SA::asStdString(rawname) +"'"); + + static string_type COLON = SA::construct_from_utf8(":"); + + typename string_adaptor::size_type index = string_adaptor::find(rawname, COLON); + /* no colon: unprefixed name. Attributes get an empty namespace URI; anything else gets whatever the mapper returns for the empty prefix. */ + if(index == string_adaptor::npos()) + return QualifiedName(rawname, + is_attribute ? SA::empty_string() : mapper(SA::empty_string())); + + // prefix + string_type prefix = string_adaptor::substr(rawname, 0, index); + string_type localName = string_adaptor::substr(rawname, index + 1); + string_type uri = mapper(prefix); + + return QualifiedName(prefix, localName, uri, rawname); + } // parseQName + /* overload taking a std::map: wraps it in impl::MapMapper and forwards to the overload above */ + static QualifiedName parseQName(const string_type& rawname, + bool is_attribute, + const std::map& namespaces) + { + return parseQName(rawname, is_attribute, impl::MapMapper(namespaces)); + } // parseQName + +public: + QualifiedName(const QualifiedName& rhs) : + prefix_(rhs.prefix_), + localName_(rhs.localName_), + namespaceUri_(rhs.namespaceUri_), + rawName_(rhs.rawName_) + { + } // QualifiedName + /* assignment copies rhs into a temporary and swaps each member with it */ + QualifiedName& operator=(const QualifiedName& rhs) + { + QualifiedName qn(rhs); + std::swap(prefix_, qn.prefix_); + std::swap(localName_, qn.localName_); + std::swap(namespaceUri_, qn.namespaceUri_); + std::swap(rawName_, qn.rawName_); + return *this; + } // operator= + /* equality looks at localName_ and namespaceUri_ only; prefix_ and rawName_ do not take part */ + bool operator==(const QualifiedName& rhs) const + { + return (localName_ == rhs.localName_) && + (namespaceUri_ == rhs.namespaceUri_); + } // operator== + + bool operator!=(const QualifiedName& rhs) const + { + return !(operator==(rhs)); + } // operator!= + /* returns localName_ alone when namespaceUri_ is empty, otherwise "{" + namespaceUri_ + "}" + localName_ */ + string_type clarkName() const + { + if(SA::empty(namespaceUri_)) + return localName_; + + string_type cn; + SA::append(cn, SA::construct_from_utf8("{")); + SA::append(cn, namespaceUri_); + SA::append(cn, SA::construct_from_utf8("}")); + SA::append(cn, 
localName_); + return cn; + } // clarkName + + bool has_prefix() const { return !SA::empty(prefix_); } + void set_prefix(const string_type& prefix) { prefix_ = prefix; } + bool has_namespaceUri() const { return !SA::empty(namespaceUri_); } + + const string_type& prefix() const { return prefix_; } + const string_type& localName() const { return localName_; } + const string_type& namespaceUri() const { return namespaceUri_; } + + bool has_rawName() const { return !SA::empty(rawName_); } + const string_type& rawName() const { return rawName_; } + +private: + string_type prefix_; + string_type localName_; + string_type namespaceUri_; + string_type rawName_; + /* unprefixed name: prefix_ is default-constructed and rawName_ is a copy of localName */ + QualifiedName(const string_type& localName, + const string_type& namespaceUri) : + prefix_(), + localName_(localName), + namespaceUri_(namespaceUri), + rawName_(localName) + { + } // QualifiedName + /* prefixed name without a raw name: rawName_ is default-constructed */ + QualifiedName(const string_type& prefix, + const string_type& localName, + const string_type& namespaceUri) : + prefix_(prefix), + localName_(localName), + namespaceUri_(namespaceUri), + rawName_() + { + } // QualifiedName + /* all four parts supplied by the caller */ + QualifiedName(const string_type& prefix, + const string_type& localName, + const string_type& namespaceUri, + const string_type& rawName) : + prefix_(prefix), + localName_(localName), + namespaceUri_(namespaceUri), + rawName_(rawName) + { + } // QualifiedName + + /* default constructor is declared private; no definition appears in this file */ + QualifiedName(); + + friend class QualifiedNameTest; +}; // class QualifiedName + +} // namespace XML + +} // namespace Arabica +#endif diff --git a/arabica/include/XML/XMLCharacterClasses.hpp b/arabica/include/XML/XMLCharacterClasses.hpp new file mode 100644 index 000000000..caf85eef5 --- /dev/null +++ b/arabica/include/XML/XMLCharacterClasses.hpp @@ -0,0 +1,28 @@ +#ifndef ARABICA_XML_CHARACTER_CLASSES_H +#define ARABICA_XML_CHARACTER_CLASSES_H + +#ifdef _MSC_VER +# include +#endif + +namespace Arabica +{ +/* character-class predicates over wchar_t; this header only declares them */ +namespace XML +{ + bool is_char(wchar_t c); + bool is_space(wchar_t c); + bool is_name_char(wchar_t c); + bool 
is_ncname_char(wchar_t c); + bool is_letter(wchar_t c); + bool is_base_char(wchar_t c); + bool is_ideographic(wchar_t c); + bool is_digit(wchar_t c); + bool is_combining_char(wchar_t c); + bool is_extender(wchar_t c); + bool is_letter_or_digit(wchar_t c); + +} // namespace XML + +} // namespace Arabica +#endif diff --git a/arabica/include/XML/strings.hpp b/arabica/include/XML/strings.hpp new file mode 100755 index 000000000..fb01057b9 --- /dev/null +++ b/arabica/include/XML/strings.hpp @@ -0,0 +1,77 @@ +#ifndef ARABICA_XML_STRINGS_HPP +#define ARABICA_XML_STRINGS_HPP + +#include +#include + + // QName + //[7] QName ::= PrefixedName | UnprefixedName + //[8] PrefixedName ::= Prefix ':' LocalPart + //[9] UnprefixedName ::= LocalPart + //[10] Prefix ::= NCName + //[11] LocalPart ::= NCName + + // NCName + // [4] NCName ::= NCNameStartChar NCNameChar* //An XML Name, minus the ":" + // [5] NCNameChar ::= NameChar - ':' + // [6] NCNameStartChar ::= Letter | '_' } // namespace XML + +namespace Arabica +{ +namespace XML +{ + template + inline bool is_ncname(const typename string_adaptor::const_iterator& b, + const typename string_adaptor::const_iterator& e) + { + using namespace Arabica::text; + typedef typename string_adaptor::const_iterator const_iterator; + typedef typename string_adaptor::value_type value_type; + + if(b == e) + return false; // zero length + /* the first character must be a letter or LOW_LINE */ + const_iterator s = b; + if(!(is_letter(*s) || (*s == Unicode::LOW_LINE))) + return false; + /* every remaining character must satisfy is_ncname_char */ + ++s; + for( ; s != e; ++s) + { + value_type c = *s; + if(!is_ncname_char(c)) + return false; + } + return true; + } // is_ncname + /* string overload: checks the range [begin(str), end(str)) with the iterator overload above */ + template + inline bool is_ncname(const typename string_adaptor::string_type& str) + { + return is_ncname(string_adaptor::begin(str), + string_adaptor::end(str)); + } // is_ncname + /* is_qname: with no colon the whole string must pass is_ncname; otherwise the part before the colon and the part after it must each pass */ + template + inline bool is_qname(const typename string_adaptor::string_type& str) + { + using namespace Arabica::text; + typedef typename string_adaptor::const_iterator const_iterator; + typedef typename 
string_adaptor::value_type value_type; + + size_t colon_index = string_adaptor::find(str, Unicode::COLON); + + if(colon_index == string_adaptor::npos()) + return is_ncname(str); + + const_iterator b = string_adaptor::begin(str); + const_iterator e = string_adaptor::end(str); + return is_ncname(b, b+colon_index) && + is_ncname(b+(colon_index+1), e); + } // is_qname + +} // namespace XML +} // namespace Arabica + + +#endif diff --git a/arabica/include/convert/impl/codecvt_specialisations.hpp b/arabica/include/convert/impl/codecvt_specialisations.hpp new file mode 100644 index 000000000..c52cdc2e8 --- /dev/null +++ b/arabica/include/convert/impl/codecvt_specialisations.hpp @@ -0,0 +1,131 @@ +#ifndef ARABICA_IMPL_CODECVT_SPECIALISATIONS_H +#define ARABICA_IMPL_CODECVT_SPECIALISATIONS_H + +#include + +namespace std +{ +/* codecvt specialisation whose conversions are plain element-by-element casts between char and wchar_t (see do_out and do_in); the public members only forward to the protected do_* virtuals. */ +template<> +class codecvt : + public locale::facet, public codecvt_base +{ +public: + static locale::id id; + + codecvt_base::result out(std::mbstate_t& state, + const char* from, + const char* from_end, + const char*& from_next, + wchar_t* to, + wchar_t* to_limit, + wchar_t*& to_next) const + { + return this->do_out(state, from, from_end, from_next, to, to_limit, to_next); + } // out + + + codecvt_base::result in(std::mbstate_t& state, + const wchar_t* from, + const wchar_t* from_end, + const wchar_t*& from_next, + char* to, + char* to_limit, + char*& to_next) const + { + return this->do_in(state, from, from_end, from_next, to, to_limit, to_next); + } // in + + int encoding() const throw() + { + return this->do_encoding(); + } // encoding + + bool always_noconv() const throw() + { + return this->do_always_noconv(); + } // always_noconv + + int length(std::mbstate_t& state, + const wchar_t* from, + const wchar_t* end, + size_t max) const + { + return this->do_length(state, from, end, max); + } // length + +protected: + virtual ~codecvt() { } + + virtual codecvt_base::result do_out(std::mbstate_t&, + const char* from, + const char* from_end, + const 
char*& from_next, + wchar_t* to, + wchar_t* to_limit, + wchar_t*& to_next) const + { + int limit = std::min(from_end - from, to_limit - to); + from_next = from; + to_next = to; + /* copy no more elements than BOTH ranges hold: the smaller of the input length and the output capacity (std::max here overran the shorter range) */ + while(limit-- > 0) + *to_next++ = static_cast(*from_next++); + /* input left over means the output range filled up first: report partial so the caller can supply more room and call again */ + return (from_next == from_end) ? codecvt_base::ok : codecvt_base::partial; + } // do_out + + + virtual codecvt_base::result do_in(std::mbstate_t&, + const wchar_t* from, + const wchar_t* from_end, + const wchar_t*& from_next, + char* to, + char* to_limit, + char*& to_next) const + { + int limit = std::min(from_end - from, to_limit - to); + from_next = from; + to_next = to; + /* copy no more elements than BOTH ranges hold: the smaller of the input length and the output capacity (std::max here overran the shorter range) */ + while(limit-- > 0) + *to_next++ = static_cast(*from_next++); + /* input left over means the output range filled up first: report partial so the caller can supply more room and call again */ + return (from_next == from_end) ? codecvt_base::ok : codecvt_base::partial; + } // do_in + + virtual codecvt_base::result do_unshift(std::mbstate_t&, + wchar_t* to, + wchar_t* /*to_limit*/, + wchar_t*& to_next) const + { + to_next = to; + return codecvt_base::noconv; + } // do_unshift + + virtual int do_encoding() const throw() + { + return 1; + } // do_encoding + + virtual bool do_always_noconv() const throw() + { + return false; + } // do_always_noconv + + virtual int do_length(std::mbstate_t&, + const wchar_t* from, + const wchar_t* end, + size_t max) const + { + return std::min(max, (end - from)); + } // do_length + + virtual int do_max_length() const throw() + { + return 1; + } // do_max_length +}; // class codecvt : + +} // namespace std +#endif diff --git a/arabica/include/convert/impl/ucs2_utf16.hpp b/arabica/include/convert/impl/ucs2_utf16.hpp new file mode 100644 index 000000000..bb111baea --- /dev/null +++ b/arabica/include/convert/impl/ucs2_utf16.hpp @@ -0,0 +1,27 @@ +#ifndef ARABICA_UTILS_UCS2_UTF16_H +#define ARABICA_UTILS_UCS2_UTF16_H + +#include + +namespace Arabica +{ +namespace convert +{ +namespace impl +{ + +std::codecvt_base::result ucs2_2_utf16(bool be, + wchar_t const* from, wchar_t const* from_end, wchar_t const*& from_next, + char* to, char* to_limit, char*& to_next); +std::codecvt_base::result utf16_2_ucs2(bool be, + char const* from, char const* from_end, char const*& 
from_next, + wchar_t* to, wchar_t* to_limit, wchar_t*& to_next); + +} // namespace impl +} // namespace convert +} // namespace Arabica + +#endif + + + diff --git a/arabica/include/convert/impl/ucs2_utf8.hpp b/arabica/include/convert/impl/ucs2_utf8.hpp new file mode 100644 index 000000000..e2ed06f20 --- /dev/null +++ b/arabica/include/convert/impl/ucs2_utf8.hpp @@ -0,0 +1,23 @@ +#ifndef ARABICA_UTILS_UCS2_UTF8_H +#define ARABICA_UTILS_UCS2_UTF8_H + +#include + +namespace Arabica +{ +namespace convert +{ +namespace impl +{ + +std::codecvt_base::result ucs2_2_utf8(const wchar_t* from, const wchar_t* from_end, const wchar_t*& from_next, + char* to, char* to_limit, char*& to_next); + +std::codecvt_base::result utf8_2_ucs2(const char* from, const char* from_end, const char*& from_next, + wchar_t* to, wchar_t* to_limit, wchar_t*& to_next); + +} // namespace impl +} // namespace convert +} // namespace Arabica + +#endif diff --git a/arabica/include/convert/utf8ucs2codecvt.hpp b/arabica/include/convert/utf8ucs2codecvt.hpp new file mode 100644 index 000000000..c30a9694f --- /dev/null +++ b/arabica/include/convert/utf8ucs2codecvt.hpp @@ -0,0 +1,70 @@ +#ifndef ARABICA_UTF8UCS2_CODECVT_H +#define ARABICA_UTF8UCS2_CODECVT_H +//--------------------------------------------------------------------------- +// class utf8ucs2codecvt +// This facet converts from Unicode (UCS-2) wchar_ts to +// char using the UTF-8 encoding. +// +// For the full guff on codecvts see section 22.2.1.5 of +// The C++ Standard (ISO/IEC 14882 to be pedantic). +// +// I got my information about UTF-8 from RFC 2044.
+//--------------------------------------------------------------------------- +#include + +#ifndef ARABICA_NO_WCHAR_T +#include +#include + +#ifndef ARABICA_NO_CODECVT_SPECIALISATIONS +#include +#endif + +namespace Arabica +{ +namespace convert +{ + +class utf8ucs2codecvt : public std::codecvt +{ +protected: + virtual ~utf8ucs2codecvt() { } + + virtual result do_out(std::mbstate_t&, + const wchar_t* from, + const wchar_t* from_end, + const wchar_t*& from_next, + char* to, + char* to_limit, + char*& to_next) const; + + virtual result do_in(std::mbstate_t&, + const char* from, + const char* from_end, + const char*& from_next, + wchar_t* to, + wchar_t* to_limit, + wchar_t*& to_next) const; + + virtual result do_unshift(std::mbstate_t&, + char*, + char*, + char*&) const; + + virtual int do_encoding() const throw() { return 0; } + + virtual bool do_always_noconv() const throw() { return false; } + + virtual int do_length(const std::mbstate_t&, + const char* from, + const char* end, + size_t max) const throw(); + + virtual int do_max_length() const throw() { return 3; } +}; // class utf8ucs2codecvt + +} // namespace convert +} // namespace Arabica + +#endif +#endif diff --git a/arabica/include/io/convertstream.hpp b/arabica/include/io/convertstream.hpp new file mode 100644 index 000000000..26bb317d7 --- /dev/null +++ b/arabica/include/io/convertstream.hpp @@ -0,0 +1,282 @@ +#ifndef ARABICA_CONVERT_STREAM_H +#define ARABICA_CONVERT_STREAM_H +////////////////////////////////////////////////////// +// +// $Id$ +// +////////////////////////////////////////////////////// +// +// basic_iconvertstream, basic_oconvertstream +// +// Written by Jez Higgins +// Copyright 1999-2005 Jez UK Ltd, http://www.jezuk.co.uk/ +// +// Normal basic_stringstream do not apply the codecvt facet +// of their locale. 
These two streams act exactly like +// basic_stringstream except that they do apply the imbued codecvt +// facet to their input (in the case of basic_iconvertstream) +// or output (int the case of basic_oconvertstream). +// +// This means you can to cool things like this +// +// std::locale loc(std::_Addfac(std::locale(), new base64_codecvt)); +// converting_ostringstream os; +// os.imbue(loc); +// +// os << "stuff"; +// ... lots more stuff streamed into os +// +// std::cout << os.str() << std::endl; +// os.str() contains the Base64 encoded byte sequence +// +// Decoding is just as simple. +// +// std::locale loc(std::_Addfac(std::locale(), new base64_codecvt)); +// +// converting_istringstream is; +// is.imbue(loc); +// is.str(a_base64_byte_stream); +// std::cout << is.str(); +// ... is.str() is the decode byte stream, which can also be extracted +// ... using >> operators (is >> byte; etc) +// +//////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include + +namespace Arabica +{ +namespace io +{ + +template +class convertstreambuf_init +{ +public: + typedef std::basic_stringbuf stringbufT; + + convertstreambuf_init(std::ios_base::openmode mode) : + buf_(mode) + { + } // convertstreambuf_init + + stringbufT const* buf() const + { + return &buf_; + } // buf() + + stringbufT* buf() + { + return &buf_; + } // buf() + +private: + stringbufT buf_; +}; // class convertstreambuf_init + +template, + typename fromCharT = charT, + typename fromTraitsT = std::char_traits > +class basic_iconvertstream : + private virtual convertstreambuf_init, + public std::basic_istream +{ + typedef convertstreambuf_init convertstreambuf_initT; +public: + typedef std::basic_istream istreamT; + typedef typename convertstreambuf_initT::stringbufT stringbufT; + typedef std::basic_string stringT; + typedef std::basic_string fromStringT; + + explicit basic_iconvertstream(std::ios_base::openmode mode = std::ios_base::in) : + 
convertstreambuf_initT(mode | std::ios_base::in), + std::basic_istream(convertstreambuf_initT::buf()) + { + } // basic_iconvertstream + + explicit basic_iconvertstream(const stringT& str, std::ios_base::openmode mode = std::ios_base::in) : + convertstreambuf_initT(mode | std::ios_base::in), + std::basic_istream(convertstreambuf_initT::buf()) + { + str(str); + } // basic_iconvertstream + + virtual ~basic_iconvertstream() + {} + + stringbufT* rdbuf() const + { + return const_cast(convertstreambuf_initT::buf()); + } // rdbuf + + stringT str() const + { + return convertstreambuf_initT::buf()->str(); + } // str + + void str(const fromStringT& str) + { + // do conversion + const std::codecvt& cvt = + std::use_facet >(this->getloc()); + + if(cvt.always_noconv()) + { + convertstreambuf_initT::buf()->str(no_conversion(str)); + return; + } + + // we must do code conversion + stringT converted; + const fromCharT* from_next = str.data(); + typename std::codecvt_base::result r; + typename traitsT::state_type state; + + do + { + charT* to_next; + r = cvt.in(state, from_next, str.data() + str.length(), from_next, + to_, to_ + toSize_, to_next); + if(r == std::codecvt_base::noconv) + { + converted.append(no_conversion(str)); + break; + } + converted.append(to_, (to_next - to_)); + } + while(r == std::codecvt_base::partial); + + // naughty! 
ignore (r == std::codecvt_base::error) + convertstreambuf_initT::buf()->str(converted); + } // str + +private: + stringT no_conversion(const fromStringT& str) + { + stringT dest; + + std::back_insert_iterator id(dest); + for(typename fromStringT::const_iterator i = str.begin(); i != str.end(); ++i, ++id) + *id = static_cast(*i); + + return dest; + } // no_conversion + + static const int toSize_ = 4096; + charT to_[toSize_]; +}; // basic_iconvertstream + +template, + typename toCharT = charT, + typename toTraitsT = std::char_traits > +class basic_oconvertstream : + private virtual convertstreambuf_init, + public std::basic_ostream +{ + typedef convertstreambuf_init convertstreambuf_initT; +public: + typedef std::basic_ostream ostreamT; + typedef typename convertstreambuf_initT::stringbufT stringbufT; + typedef std::basic_string stringT; + typedef std::basic_string toStringT; + + explicit basic_oconvertstream(std::ios_base::openmode mode = std::ios_base::out) : + convertstreambuf_initT(mode | std::ios_base::out), + std::basic_ostream(convertstreambuf_initT::buf()) + { + } // basic_oconvertstream + + explicit basic_oconvertstream(const stringT& str, std::ios_base::openmode mode = std::ios_base::out) : + convertstreambuf_initT(mode | std::ios_base::out), + std::basic_ostream(convertstreambuf_initT::buf()) + { + convertstreambuf_initT::buf()->str(str); + } // basic_oconvertstream + + virtual ~basic_oconvertstream() + {} + + stringbufT* rdbuf() const + { + return const_cast(convertstreambuf_initT::buf()); + } // rdbuf + + toStringT str() + { + toStringT out; + stringT newstuff(convertstreambuf_initT::buf()->str()); + + if(newstuff.length() == 0) + return out; + + // convert it here + const std::codecvt& cvt = + std::use_facet >(this->getloc()); + + if(cvt.always_noconv()) + out.append(no_conversion(newstuff)); + else + { + // we must do code conversion + const charT* from_next = newstuff.data(); + typename std::codecvt_base::result r; + typename traitsT::state_type 
state; + + do + { + toCharT* to_next; + r = cvt.out(state, from_next, newstuff.data() + newstuff.length(), from_next, + to_, to_ + toSize_, to_next); + if(r == std::codecvt_base::noconv) + { + out.append(no_conversion(newstuff)); + break; + } + out.append(to_, (to_next - to_)); + } + while(r == std::codecvt_base::partial); + // naughty! ignore (r == std::codecvt_base::error) + } // if(cvt.always_noconv()) + + convertstreambuf_initT::buf()->str(stringT()); + + return out; + } // str + + void str(const stringT& str) + { + convertstreambuf_initT::buf()->str(str); + } // str + +private: + toStringT no_conversion(const stringT& str) + { + toStringT dest; + + std::back_insert_iterator id(dest); + for(typename stringT::const_iterator i = str.begin(); i != str.end(); ++i, ++id) + *id = static_cast(*i); + + return dest; + } // no_conversion + + static const int toSize_ = 4096; + toCharT to_[toSize_]; +}; // basic_oconvertstream + +typedef basic_iconvertstream converting_istringstream; +typedef basic_oconvertstream converting_ostringstream; +#ifndef ARABICA_NO_WSTRING_T +typedef basic_iconvertstream converting_iwstringstream; +typedef basic_oconvertstream converting_owstringstream; +#endif + +} // namespace io +} // namespace Arabica +#endif diff --git a/arabica/include/io/socket_stream.hpp b/arabica/include/io/socket_stream.hpp new file mode 100644 index 000000000..48bba0f8b --- /dev/null +++ b/arabica/include/io/socket_stream.hpp @@ -0,0 +1,436 @@ +#ifndef ARABICA_SOCKET_STREAM_H +#define ARABICA_SOCKET_STREAM_H +/////////////////////////////////////////////////////////////////////// +// +// socket_stream.h +// +// Written by Jez Higgins +// Copyright 1999-2003 Jez UK Ltd, http://www.jezuk.co.uk/ +// +/////////////////////////////////////////////////////////////////////// +// $Id$ +/////////////////////////////////////////////////////////////////////// + +#include +#ifndef ARABICA_USE_WINSOCK +#include +#include +#include +#include +#include +#include +#include +#else 
+#include +#endif +#include +#include +#include +#include +#include + +#ifndef INADDR_NONE +# define INADDR_NONE ((in_addr_t) -1) +#endif + +#ifdef _MSC_VER +#pragma warning(disable: 4250) +// See http://connect.microsoft.com/VisualStudio/feedback/details/733720/inheriting-from-std-fstream-produces-c4250-warning +#endif +namespace Arabica +{ +namespace io +{ + +/////////////////////////////////////////////////////////// +// basic_socketbuf declaration +template +class basic_socketbuf : public std::basic_streambuf +{ + public: + typedef typename traitsT::int_type int_type; + + using std::basic_streambuf::setp; + using std::basic_streambuf::setg; + using std::basic_streambuf::underflow; + using std::basic_streambuf::gptr; + using std::basic_streambuf::gbump; + using std::basic_streambuf::egptr; + using std::basic_streambuf::eback; + using std::basic_streambuf::pptr; + using std::basic_streambuf::sputc; + + basic_socketbuf(); + virtual ~basic_socketbuf(); + + bool is_open() const; + + basic_socketbuf* open(const char* hostname, unsigned short port); + basic_socketbuf* close(); + + protected: + virtual int_type overflow(int_type c = traitsT::eof()); + virtual int sync(); + virtual int_type underflow(); + virtual int_type pbackfail(int_type c); + + private: + typedef typename traitsT::state_type state_t; + + int sock_; + std::vector outBuffer_; + state_t outState_; + std::vector inBuffer_; + state_t inState_; + + void growOutBuffer(); + bool writeSocket(); + void growInBuffer(); + int readSocket(); + int closeSocket(int sock) const; + + static const size_t bufferSize_; + static const size_t pbSize_; + +#ifndef ARABICA_USE_WINSOCK + static const int INVALID_SOCKET; + static const int SOCKET_ERROR; +#endif +}; // class basic_socketbuf + +template +const size_t basic_socketbuf::bufferSize_ = 1024; +template +const size_t basic_socketbuf::pbSize_ = 4; + // why 4? both Josuttis and Langer&Kreft use 4. 
+#ifndef ARABICA_USE_WINSOCK +template +const int basic_socketbuf::INVALID_SOCKET = -1; +template +const int basic_socketbuf::SOCKET_ERROR = -1; +#endif + +/////////////////////////////////////////////////////////// +// basic_socketbuf definition +template +basic_socketbuf::basic_socketbuf() + : std::basic_streambuf(), + sock_(INVALID_SOCKET), + outBuffer_(0), + inBuffer_(0) +{ +// outState_ = 0; +// inState_ = 0; + + setp(0, 0); + setg(0, 0, 0); +} // basic_socketbuf + +template +basic_socketbuf::~basic_socketbuf() +{ + if(is_open()) + { + sync(); + closeSocket(sock_); + } // if(is_open()) +} // ~basic_socketbuf + +template +bool basic_socketbuf::is_open() const +{ + return (sock_ != INVALID_SOCKET); +} // is_open + +template +basic_socketbuf* basic_socketbuf::open(const char* hostname, + unsigned short port) +{ + // already open? and crappy data + if((sock_ != INVALID_SOCKET) || + (hostname == 0) || + (strlen(hostname) == 0)) + return 0; + + // set up address + sockaddr_in sockAddr; + memset(&sockAddr, 0, sizeof(sockAddr)); + + sockAddr.sin_family = AF_INET; + sockAddr.sin_addr.s_addr = inet_addr(hostname); + if(sockAddr.sin_addr.s_addr == INADDR_NONE) + { + hostent* host = gethostbyname(hostname); + if(!host) + return 0; + + sockAddr.sin_addr.s_addr = + reinterpret_cast(host->h_addr)->s_addr; + } // if(sockAddr.sin_addr.s_addr == INADDR_NONE) + sockAddr.sin_port = htons(port); + + // connect + int tmpsock = socket(AF_INET, SOCK_STREAM, 0); + if(tmpsock == INVALID_SOCKET) + return 0; + if(connect(tmpsock, reinterpret_cast(&sockAddr), sizeof(sockaddr_in)) != 0) + { + closeSocket(tmpsock); + return 0; + } + + // hurray, we've connected so initialise everything else we need to + sock_ = tmpsock; + + return this; +} // open + +template +basic_socketbuf* basic_socketbuf::close() +{ + if(!is_open()) + return 0; + + if(closeSocket(sock_) == SOCKET_ERROR) + return 0; + + sock_ = INVALID_SOCKET; + setg(0,0,0); + + return this; +} // close; + +template +typename 
basic_socketbuf::int_type basic_socketbuf::overflow(typename basic_socketbuf::int_type c) +{ + if(traitsT::eq_int_type(traitsT::eof(), c)) + return traitsT::not_eof(c); + if(!is_open()) + return traitsT::eof(); + + growOutBuffer(); + sputc(traitsT::to_char_type(c)); + + return traitsT::not_eof(c); +} // overflow + +template +int basic_socketbuf::sync() +{ + return writeSocket() ? 0 : -1; +} // sync + +template +typename basic_socketbuf::int_type basic_socketbuf::underflow() +{ + if(!is_open()) + return traitsT::eof(); + if(gptr() != 0 && gptr() < egptr()) + return (traitsT::to_int_type(*gptr())); + + size_t length = readSocket(); + if(!length) + return traitsT::eof(); + + return traitsT::to_int_type(*gptr()); +} // underflow + +template +typename basic_socketbuf::int_type basic_socketbuf::pbackfail(int_type c) +{ + if(gptr() == eback()) + return traitsT::eof(); + + gbump(-1); + if(!traitsT::eq_int_type(c, traitsT::eof())) + *(gptr()) = traitsT::to_char_type(c); + return traitsT::not_eof(c); +} // pbackfail + +template +void basic_socketbuf::growOutBuffer() +{ + size_t oldsize = outBuffer_.capacity(); + size_t newsize = (oldsize ? oldsize*2 : bufferSize_); + outBuffer_.resize(newsize); + + char* out_begin = &(outBuffer_[0]); + setp(out_begin + oldsize, out_begin + newsize); +} // growOutBuffer + +template +bool basic_socketbuf::writeSocket() +{ + // write to the socket + charT* from_next = &(outBuffer_[0]); + size_t length = pptr() - from_next; + if(!length) + return true; + + bool ok = (send(sock_, from_next, length, 0) != SOCKET_ERROR); + + if(ok) + setp(from_next, from_next + outBuffer_.capacity()); + + return ok; +} // writeSocket + +template +void basic_socketbuf::growInBuffer() +{ + size_t oldsize = inBuffer_.capacity(); + size_t newsize = (oldsize ? 
oldsize*2 : bufferSize_+pbSize_); + inBuffer_.resize(newsize); +} // growInBuffer + +template +int basic_socketbuf::readSocket() +{ + if(!inBuffer_.capacity()) + growInBuffer(); + + size_t pbCount = std::min(gptr() - eback(), pbSize_); + + memcpy(&(inBuffer_[0]) + (pbSize_-pbCount)*sizeof(charT), + gptr() - pbCount*sizeof(charT), + pbCount*sizeof(charT)); + + int res = recv(sock_, &(inBuffer_[0]) + pbSize_, inBuffer_.capacity() - pbSize_, 0); + if(res == 0) + { + // server closed the socket + close(); + return 0; + } // if(res == 0) + else if(res == SOCKET_ERROR) + { +#ifdef ARABICA_USE_WINSOCK + if(GetLastError() == WSAEMSGSIZE) + { + // buffer was too small, so make it bigger + growInBuffer(); + return readSocket(); + } // if(GetLastError() != WSAEMSGSIZE) +#endif + + // unclever error handling + close(); + return 0; + } // if(res == SOCKET_ERROR) + + charT* to_begin = &(inBuffer_[0]) + pbSize_; + setg(to_begin - pbCount, to_begin, to_begin + res); + + return res; +} // readSocket + +template +int basic_socketbuf::closeSocket(int sock) const +{ +#ifdef ARABICA_USE_WINSOCK + return closesocket(sock); +#else + return ::close(sock); +#endif +} // closeSocket + +/////////////////////////////////////////////////////////// +// basic_socketstream declaration +template +class socketstreambuf_init +{ +public: + typedef basic_socketbuf sockbuf; + + sockbuf* buf() const + { + return &buf_; + } // buf() + +private: + mutable sockbuf buf_; +}; // class socketstreambuf_init + +template +class basic_socketstream : + private virtual socketstreambuf_init, + public std::basic_iostream +{ + public: + using std::basic_iostream::setstate; + using std::basic_iostream::badbit; + + basic_socketstream(); + explicit basic_socketstream(const char* hostname, int port); + + virtual ~basic_socketstream(); + + basic_socketbuf* rdbuf() const; + bool is_open() const; + void open(const char* hostname, unsigned short port); + void close(); +}; // class basic_socketstream + 
+//////////////////////////////////////////////////////////////// +// basic_socketstream definition +template +basic_socketstream::basic_socketstream() : + socketstreambuf_init(), + std::basic_iostream(socketstreambuf_init::buf()) +{ +} // basic_socketstream + +template +basic_socketstream::basic_socketstream(const char* hostname, int port) : + socketstreambuf_init(), + std::basic_iostream(socketstreambuf_init::buf()) +{ + open(hostname, port); +} // basic_socketstream + +template +basic_socketstream::~basic_socketstream() +{ +} // ~basic_socketstream + +template +basic_socketbuf* basic_socketstream::rdbuf() const +{ + return socketstreambuf_init::buf(); +} // rdbuf + +template +bool basic_socketstream::is_open() const +{ + return socketstreambuf_init::buf()->is_open(); +} // is_open + +template +void basic_socketstream::open(const char* hostname, unsigned short port) +{ + if(socketstreambuf_init::buf()->open(hostname, port) == 0) + setstate(badbit); +} // open + +template +void basic_socketstream::close() +{ + if(!is_open()) + return; + + if(socketstreambuf_init::buf()->close() == 0) + setstate(badbit); +} // close + +typedef basic_socketbuf > socketbuf; +typedef basic_socketstream > socketstream; +#ifndef ARABICA_NO_WSTRING_T +typedef basic_socketbuf > wsocketbuf; +typedef basic_socketstream > wsocketstream; +#endif + +} // namespace io +} // namespace Arabica + +#endif +//end of file diff --git a/arabica/include/io/uri.hpp b/arabica/include/io/uri.hpp new file mode 100644 index 000000000..0bd9ef8e8 --- /dev/null +++ b/arabica/include/io/uri.hpp @@ -0,0 +1,84 @@ +#ifndef ARABICA_IO_URI_HPP +#define ARABICA_IO_URI_HPP + +#include + +namespace Arabica +{ + namespace io + { + class URI + { + public: + URI() { } + + URI(const std::string& URI); + + URI(const URI& base, const std::string& relativeURI); + + URI(const URI& rhs) : + scheme_(rhs.scheme_), + host_(rhs.host_), + path_(rhs.path_), + port_(rhs.port_), + is_absolute_(rhs.is_absolute_) + { + } // URI + + URI& 
operator=(const URI& rhs) + { + URI t(rhs); + swap(t); + return *this; + } // operator= + + bool operator==(const URI& rhs) const + { + return scheme_ == rhs.scheme_ && + host_ == rhs.host_ && + path_ == rhs.path_ && + port_ == rhs.port_ && + is_absolute_ == rhs.is_absolute_; + } // operator== + + bool operator!=(const URI& rhs) const + { + return !(operator==(rhs)); + } // operator!= + + ~URI() { } + + void swap(URI& rhs) + { + std::swap(scheme_, rhs.scheme_); + std::swap(host_, rhs.host_); + std::swap(path_, rhs.path_); + std::swap(port_, rhs.port_); + std::swap(is_absolute_, rhs.is_absolute_); + } // swap + + const std::string& scheme() const { return scheme_; } + const std::string& host() const { return host_; } + const std::string& port() const; + const std::string& path() const { return path_; } + const bool& is_absolute() const { return is_absolute_; } + + std::string as_string() const; + + private: + void parse(const std::string& URI); + void parse_uri(const std::string& URI); + std::string::const_iterator parseAuthority(const std::string::const_iterator& u, const std::string::const_iterator& ue); + void absolutise(URI& relURI); + void combinePath(const std::string& path); + + std::string scheme_; + std::string host_; + std::string path_; + std::string port_; + bool is_absolute_; + }; // class URI + } // namespace io +} // namespace Arabica + +#endif diff --git a/arabica/include/text/UnicodeCharacters.hpp b/arabica/include/text/UnicodeCharacters.hpp new file mode 100644 index 000000000..4a61a4b25 --- /dev/null +++ b/arabica/include/text/UnicodeCharacters.hpp @@ -0,0 +1,292 @@ +#ifndef ARABICA_XML_UNICODE_CHARACTERS_H +#define ARABICA_XML_UNICODE_CHARACTERS_H + +namespace Arabica +{ +namespace text +{ + +template +struct Unicode +{ + static const charT HORIZONTAL_TABULATION; + static const charT LINE_FEED; + static const charT CARRIAGE_RETURN; + static const charT SPACE; + static const charT EXCLAMATION_MARK; + static const charT QUOTATION_MARK; + static 
const charT NUMBER_SIGN; + static const charT PERCENT_SIGN; + static const charT AMPERSAND; + static const charT APOSTROPHE; + static const charT LEFT_PARENTHESIS; + static const charT RIGHT_PARENTHESIS; + static const charT ASTERISK; + static const charT PLUS_SIGN; + static const charT COMMA; + static const charT HYPHEN_MINUS; + static const charT FULL_STOP; + static const charT SLASH; + static const charT NUMBER_0; + static const charT NUMBER_1; + static const charT NUMBER_2; + static const charT NUMBER_3; + static const charT NUMBER_4; + static const charT NUMBER_5; + static const charT NUMBER_6; + static const charT NUMBER_7; + static const charT NUMBER_8; + static const charT NUMBER_9; + static const charT COLON; + static const charT SEMI_COLON; + static const charT LESS_THAN_SIGN; + static const charT EQUALS_SIGN; + static const charT GREATER_THAN_SIGN; + static const charT QUESTION_MARK; + static const charT CAPITAL_A; + static const charT CAPITAL_B; + static const charT CAPITAL_C; + static const charT CAPITAL_D; + static const charT CAPITAL_E; + static const charT CAPITAL_F; + static const charT CAPITAL_G; + static const charT CAPITAL_H; + static const charT CAPITAL_I; + static const charT CAPITAL_J; + static const charT CAPITAL_K; + static const charT CAPITAL_L; + static const charT CAPITAL_M; + static const charT CAPITAL_N; + static const charT CAPITAL_O; + static const charT CAPITAL_P; + static const charT CAPITAL_Q; + static const charT CAPITAL_R; + static const charT CAPITAL_S; + static const charT CAPITAL_T; + static const charT CAPITAL_U; + static const charT CAPITAL_V; + static const charT CAPITAL_W; + static const charT CAPITAL_X; + static const charT CAPITAL_Y; + static const charT CAPITAL_Z; + static const charT LEFT_SQUARE_BRACKET; + static const charT BACK_SLASH; + static const charT RIGHT_SQUARE_BRACKET; + static const charT LOW_LINE; + static const charT LOWERCASE_A; + static const charT LOWERCASE_B; + static const charT LOWERCASE_C; + static 
const charT LOWERCASE_D; + static const charT LOWERCASE_E; + static const charT LOWERCASE_F; + static const charT LOWERCASE_G; + static const charT LOWERCASE_H; + static const charT LOWERCASE_I; + static const charT LOWERCASE_J; + static const charT LOWERCASE_K; + static const charT LOWERCASE_L; + static const charT LOWERCASE_M; + static const charT LOWERCASE_N; + static const charT LOWERCASE_O; + static const charT LOWERCASE_P; + static const charT LOWERCASE_Q; + static const charT LOWERCASE_R; + static const charT LOWERCASE_S; + static const charT LOWERCASE_T; + static const charT LOWERCASE_U; + static const charT LOWERCASE_V; + static const charT LOWERCASE_W; + static const charT LOWERCASE_X; + static const charT LOWERCASE_Y; + static const charT LOWERCASE_Z; + static const charT VERTICAL_BAR; +}; // namespace XML + +template +const charT Unicode::HORIZONTAL_TABULATION = 0x09; +template +const charT Unicode::LINE_FEED = 0x0A; +template +const charT Unicode::CARRIAGE_RETURN = 0x0D; +template +const charT Unicode::SPACE = 0x20; +template +const charT Unicode::EXCLAMATION_MARK = 0x21; // ! +template +const charT Unicode::QUOTATION_MARK = 0x22; // " +template +const charT Unicode::NUMBER_SIGN = 0x23; // # +template +const charT Unicode::PERCENT_SIGN = 0x25; // % +template +const charT Unicode::AMPERSAND = 0x26; // & +template +const charT Unicode::APOSTROPHE = 0x27; // ' +template +const charT Unicode::LEFT_PARENTHESIS = 0x28; // ( +template +const charT Unicode::RIGHT_PARENTHESIS = 0x29; // ) +template +const charT Unicode::ASTERISK = 0x2A; // * +template +const charT Unicode::PLUS_SIGN = 0x2B; // + +template +const charT Unicode::COMMA = 0x2C; // , +template +const charT Unicode::HYPHEN_MINUS = 0x2D; // - +template +const charT Unicode::FULL_STOP = 0x2E; // . 
+template +const charT Unicode::SLASH = 0x2F; // / +template +const charT Unicode::NUMBER_0 = 0x30; +template +const charT Unicode::NUMBER_1 = 0x31; +template +const charT Unicode::NUMBER_2 = 0x32; +template +const charT Unicode::NUMBER_3 = 0x33; +template +const charT Unicode::NUMBER_4 = 0x34; +template +const charT Unicode::NUMBER_5 = 0x35; +template +const charT Unicode::NUMBER_6 = 0x36; +template +const charT Unicode::NUMBER_7 = 0x37; +template +const charT Unicode::NUMBER_8 = 0x38; +template +const charT Unicode::NUMBER_9 = 0x39; +template +const charT Unicode::COLON = 0x3A; // : +template +const charT Unicode::SEMI_COLON = 0x3B; // ; +template +const charT Unicode::LESS_THAN_SIGN = 0x3C; // < +template +const charT Unicode::EQUALS_SIGN = 0x3D; // = +template +const charT Unicode::GREATER_THAN_SIGN = 0x3E; // > +template +const charT Unicode::QUESTION_MARK = 0x3F; // ? +template +const charT Unicode::CAPITAL_A = 0x41; +template +const charT Unicode::CAPITAL_B = 0x42; +template +const charT Unicode::CAPITAL_C = 0x43; +template +const charT Unicode::CAPITAL_D = 0x44; +template +const charT Unicode::CAPITAL_E = 0x45; +template +const charT Unicode::CAPITAL_F = 0x46; +template +const charT Unicode::CAPITAL_G = 0x47; +template +const charT Unicode::CAPITAL_H = 0x48; +template +const charT Unicode::CAPITAL_I = 0x49; +template +const charT Unicode::CAPITAL_J = 0x4A; +template +const charT Unicode::CAPITAL_K = 0x4B; +template +const charT Unicode::CAPITAL_L = 0x4C; +template +const charT Unicode::CAPITAL_M = 0x4D; +template +const charT Unicode::CAPITAL_N = 0x4E; +template +const charT Unicode::CAPITAL_O = 0x4F; +template +const charT Unicode::CAPITAL_P = 0x50; +template +const charT Unicode::CAPITAL_Q = 0x51; +template +const charT Unicode::CAPITAL_R = 0x52; +template +const charT Unicode::CAPITAL_S = 0x53; +template +const charT Unicode::CAPITAL_T = 0x54; +template +const charT Unicode::CAPITAL_U = 0x55; +template +const charT Unicode::CAPITAL_V = 0x56; +template 
+const charT Unicode::CAPITAL_W = 0x57; +template +const charT Unicode::CAPITAL_X = 0x58; +template +const charT Unicode::CAPITAL_Y = 0x59; +template +const charT Unicode::CAPITAL_Z = 0x5A; +template +const charT Unicode::LEFT_SQUARE_BRACKET = 0x5B; // [ +template +const charT Unicode::BACK_SLASH = 0x5C; // +template +const charT Unicode::RIGHT_SQUARE_BRACKET = 0x5D; // ] +template +const charT Unicode::LOW_LINE = 0x5F; // _ +template +const charT Unicode::LOWERCASE_A = 0x61; +template +const charT Unicode::LOWERCASE_B = 0x62; +template +const charT Unicode::LOWERCASE_C = 0x63; +template +const charT Unicode::LOWERCASE_D = 0x64; +template +const charT Unicode::LOWERCASE_E = 0x65; +template +const charT Unicode::LOWERCASE_F = 0x66; +template +const charT Unicode::LOWERCASE_G = 0x67; +template +const charT Unicode::LOWERCASE_H = 0x68; +template +const charT Unicode::LOWERCASE_I = 0x69; +template +const charT Unicode::LOWERCASE_J = 0x6A; +template +const charT Unicode::LOWERCASE_K = 0x6B; +template +const charT Unicode::LOWERCASE_L = 0x6C; +template +const charT Unicode::LOWERCASE_M = 0x6D; +template +const charT Unicode::LOWERCASE_N = 0x6E; +template +const charT Unicode::LOWERCASE_O = 0x6F; +template +const charT Unicode::LOWERCASE_P = 0x70; +template +const charT Unicode::LOWERCASE_Q = 0x71; +template +const charT Unicode::LOWERCASE_R = 0x72; +template +const charT Unicode::LOWERCASE_S = 0x73; +template +const charT Unicode::LOWERCASE_T = 0x74; +template +const charT Unicode::LOWERCASE_U = 0x75; +template +const charT Unicode::LOWERCASE_V = 0x76; +template +const charT Unicode::LOWERCASE_W = 0x77; +template +const charT Unicode::LOWERCASE_X = 0x78; +template +const charT Unicode::LOWERCASE_Y = 0x79; +template +const charT Unicode::LOWERCASE_Z = 0x7A; +template +const charT Unicode::VERTICAL_BAR = 0x7C; // | + +} // namespace text +} // namespace Arabica + +#endif + diff --git a/arabica/include/text/normalize_whitespace.hpp 
b/arabica/include/text/normalize_whitespace.hpp new file mode 100644 index 000000000..97b282388 --- /dev/null +++ b/arabica/include/text/normalize_whitespace.hpp @@ -0,0 +1,50 @@ +#ifndef ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP +#define ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP + +#include +#include +#include + +namespace Arabica +{ +namespace text +{ + +template +inline string_type normalize_whitespace(const string_type& ch) +{ + std::string value = string_adaptor::asStdString(ch); + std::string stripped = normalize_whitespace >(value); + return string_adaptor::construct_from_utf8(stripped.c_str()); +} // normalize_whitespace + +template<> +inline std::string normalize_whitespace >(const std::string& ch) +{ + std::string value(ch); + std::string::const_iterator i = value.begin(), ie = value.end(); + std::string::iterator p = value.begin(), pe = value.end(); + + // string leading space + while((i != ie) && (Arabica::XML::is_space(static_cast(*i)))) + ++i; + + while(i != ie) + { + while((i != ie) && (!Arabica::XML::is_space(static_cast(*i)))) + *p++ = *i++; + while((i != ie) && (Arabica::XML::is_space(static_cast(*i)))) + ++i; + if(i != ie) + *p++ = Arabica::text::Unicode::SPACE; + } // while ... + if(p != pe) + value.erase(p, pe); + + return value; +} // normalize_whitespace + +} // namespace text +} // namespace Arabica +#endif + diff --git a/arabica/src/SAX/helpers/InputSourceResolver.cpp b/arabica/src/SAX/helpers/InputSourceResolver.cpp new file mode 100644 index 000000000..b33ff099c --- /dev/null +++ b/arabica/src/SAX/helpers/InputSourceResolver.cpp @@ -0,0 +1,143 @@ +/* + * $Id$ + */ + +#include +#include +#include +#include +#include +#include + +using namespace Arabica::SAX; + +void InputSourceResolver::open(const std::string& /* publicId */, + const std::string& systemId, + std::istream* byteStream) +{ + if(byteStream != 0) + { + byteStream_ = byteStream; + return; + } + + // does it look like a URI? 
+ Arabica::io::URI url(systemId); + if(!url.scheme().empty()) + { + URIResolver res = findResolver(url.scheme()); + if(res) + byteStream_ = res(systemId); + if(byteStream_) + { + deleteStream_ = true; + return; + } // if ... + } // if ... + + // try and open it as a file + std::ifstream* ifs = new std::ifstream(url.path().c_str()); + if(ifs->is_open()) + { + deleteStream_ = true; + byteStream_ = ifs; + } + else + delete ifs; +} // InputSourceResolver + +InputSourceResolver::~InputSourceResolver() +{ + if(deleteStream_) + delete byteStream_; +} // ~InputSourceResolver + +////////////////////////////////////////////////////// +// resolverMap register/unregister +bool InputSourceResolver::registerResolver(const std::string& method, URIResolver resolver) +{ + resolverMap()[method] = resolver; + return true; +} // registerResolver + +bool InputSourceResolver::unRegisterResolver(const std::string& method) +{ + resolverMapT::iterator i = resolverMap().find(method); + if(i != resolverMap().end()) + resolverMap().erase(i); + return true; +} // unRegisterResolver + +InputSourceResolver::URIResolver InputSourceResolver::findResolver(std::string method) +{ + resolverMapT::iterator i = resolverMap().find(method); + return (i != resolverMap().end()) ? 
i->second : 0;
+} // findResolver
+
+namespace
+{
+  // Resolver for the "file" scheme.
+  //
+  // Opens the path component of fileURI, first exactly as given and then,
+  // if that fails, rewritten in Windows form (forward slashes turned into
+  // backslashes, and the separator in front of a drive letter removed).
+  //
+  // Returns a heap-allocated stream, or 0 when neither form can be opened.
+  // InputSourceResolver::open() marks a non-null result for deletion.
+  std::istream* fileResolver(const std::string& fileURI)
+  {
+    Arabica::io::URI url(fileURI);
+    std::string fileName = url.path();
+
+    std::ifstream* ifs = new std::ifstream(fileName.c_str());
+    if(ifs->is_open())
+      return ifs;
+    delete ifs;
+
+    // WIN32 specific stuff
+    for(std::string::iterator i = fileName.begin(); i != fileName.end(); ++i)
+      if(*i == '/')
+        *i = '\\';
+
+    // "/C:/dir/file" is now "\C:\dir\file"; drop the leading separator so
+    // the drive letter comes first.  The size check keeps fileName[2] in
+    // range for empty and one-character paths.
+    if((fileName.size() > 2) && (fileName[0] == '\\') && (fileName[2] == ':'))
+      fileName.erase(0, 1);
+
+    ifs = new std::ifstream(fileName.c_str());
+    if(ifs->is_open())
+      return ifs;
+    delete ifs;
+
+    return 0;
+  } // fileResolver
+
+  static bool fileReg = InputSourceResolver::registerResolver("file", fileResolver);
+
+  // Resolver for the "http" scheme.
+  //
+  // Connects to the host and port named in httpURI, sends an HTTP/1.0 GET
+  // for the path, then reads and discards the status line and headers so
+  // that the returned stream is positioned at the response body.
+  //
+  // Returns a heap-allocated stream, or 0 when the connection cannot be
+  // opened or the response ends before the blank line closing the headers.
+  // InputSourceResolver::open() marks a non-null result for deletion.
+  std::istream* httpResolver(const std::string& httpURI)
+  {
+#ifdef ARABICA_USE_WINSOCK
+    WORD wVersionRequested;
+    WSADATA wsaData;
+    int err;
+
+    wVersionRequested = MAKEWORD(1, 1);
+    err = WSAStartup( wVersionRequested, &wsaData );
+    if(err != 0)
+      return 0;
+#endif
+
+    Arabica::io::URI url(httpURI);
+
+    Arabica::io::socketstream* ifs = new Arabica::io::socketstream(url.host().c_str(), std::atoi(url.port().c_str()));
+    if(!ifs->is_open())
+    {
+      delete ifs;  // previously leaked when the connection failed
+      return 0;
+    }
+    // NOTE(review): std::endl writes a bare LF; HTTP asks for CRLF line
+    // endings.  Left unchanged here - confirm against the servers in use.
+    *ifs << "GET " << url.path() << " HTTP/1.0" << std::endl;
+    *ifs << "Host: " << url.host() << std::endl;
+    *ifs << "Connection: close" << std::endl;
+    *ifs << std::endl;
+
+    // Skip the status line and headers.  The headers end at the first line
+    // that is empty or starts with '\r'.  Reading into a std::string copes
+    // with header lines of any length, and a failed read (EOF or socket
+    // error) now ends the loop instead of spinning forever on a stale buffer.
+    std::string header;
+    do
+    {
+      if(!std::getline(*ifs, header))
+      {
+        delete ifs;
+        return 0;
+      }
+    }
+    while(!header.empty() && (header[0] != '\r'));
+
+    return ifs;
+  } // httpResolver
+
+  static bool httpReg = InputSourceResolver::registerResolver("http", httpResolver);
+} // namespace
+
+// end of file
diff --git a/arabica/src/SAX/wrappers/saxlibxml2.cpp b/arabica/src/SAX/wrappers/saxlibxml2.cpp
new file mode 100644
index 000000000..c2eeb68c6
--- /dev/null
+++ b/arabica/src/SAX/wrappers/saxlibxml2.cpp
@@ -0,0 +1,249 @@
+/*@
+ * $Id$
+ */
+
+#ifdef _MSC_VER
+#pragma warning(disable: 4786 4800)
+#endif
+
+#include
+
+#include
+
+namespace Arabica
+{
+namespace SAX
+{
+namespace 
libxml2_wrapper_impl_tiddle
+{
+
+// Expands a printf-style format string and its argument list into a
+// std::string.  Used by the warning/error/fatalError callbacks below.
+//
+// The text is formatted into a fixed 4096-byte buffer; vsnprintf bounds the
+// write, so a longer message is truncated rather than overrunning the stack
+// (the previous vsprintf call had no bound).  Returns an empty string if
+// formatting fails.
+std::string formatErrorMsg(const char* fmt, va_list arg)
+{
+  char buff[4096];
+  const int written = vsnprintf(buff, sizeof(buff), fmt, arg);
+  if(written < 0)
+    return std::string();
+  return std::string(buff);
+} // formatErrorMsg
+
+// The lwit_* functions are the C callbacks handed to libxml2.  Each one
+// recovers the libxml2_base object from the opaque user_data pointer and
+// forwards to the matching SAX* member.
+
+void lwit_startDocument(void* user_data)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->parserContext()->myDoc = xmlNewDoc(p->parserContext()->version);
+  p->parserContext()->myDoc->intSubset = xmlNewDtd(p->parserContext()->myDoc, BAD_CAST "fake", NULL, NULL);
+  p->SAXstartDocument();
+} // lwit_startDocument
+
+void lwit_endDocument(void* user_data)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXendDocument();
+  xmlFreeDoc(p->parserContext()->myDoc);
+  p->parserContext()->myDoc = 0;
+} // lwit_endDocument
+
+void lwit_startElement(void *user_data, const xmlChar* name, const xmlChar** attrs)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXstartElement(name, attrs);
+} // lwit_startElement
+
+void lwit_endElement(void *user_data, const xmlChar* name)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXendElement(name);
+} // lwit_endElement
+
+void lwit_characters(void* user_data, const xmlChar* ch, int len)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXcharacters(ch, len);
+} // lwit_characters
+
+void lwit_cdata(void* user_data, const xmlChar* ch, int len)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXstartCdataSection();
+  p->SAXcdata(ch, len);
+// everyone else will call endCData if we are in cdata
+// p->SAXendCdataSection();
+} // lwit_cdata
+
+void lwit_ignorableWhitespace(void *user_data, const xmlChar* ch, int len)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXignorableWhitespace(ch, len);
+} // lwit_ignorableWhitespace
+
+void lwit_processingInstruction(void *user_data, const xmlChar* target, const xmlChar* data)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXprocessingInstruction(target, data);
+} // lwit_processingInstruction
+
+void lwit_comment(void *user_data, const 
xmlChar* comment) +{ + libxml2_base* p = reinterpret_cast(user_data); + p->SAXcomment(comment); +} // lwit_comment + +void lwit_warning(void *user_data, const char* fmt, ...) +{ + va_list arg; + va_start(arg, fmt); + std::string msg(formatErrorMsg(fmt, arg)); + va_end(arg); + + libxml2_base* p = reinterpret_cast(user_data); + p->SAXwarning(msg); +} // lwit_warning + +void lwit_error(void* user_data, const char* fmt, ...) +{ + va_list arg; + va_start(arg, fmt); + std::string msg(formatErrorMsg(fmt, arg)); + va_end(arg); + + libxml2_base* p = reinterpret_cast(user_data); + p->SAXerror(msg); +} // lwit_error + +void lwit_fatalError(void* user_data, const char* fmt, ...) +{ + va_list arg; + va_start(arg, fmt); + std::string msg(formatErrorMsg(fmt, arg)); + va_end(arg); + + libxml2_base* p = reinterpret_cast(user_data); + p->SAXfatalError(msg); +} // lwit_fatalError + +void lwit_locator(void* user_data, xmlSAXLocatorPtr locator) +{ + libxml2_base* p = reinterpret_cast(user_data); + p->SAXlocator(locator); +} // lwit_locator + +void lwit_notationDecl(void* user_data, const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId) +{ + libxml2_base* p = reinterpret_cast(user_data); + p->SAXnotationDecl(name, publicId, systemId); +} // lwit_notationDecl + +void lwit_unparsedEntityDecl(void* user_data, + const xmlChar *name, const xmlChar *publicId, + const xmlChar *systemId, const xmlChar *notationName) +{ + libxml2_base* p = reinterpret_cast(user_data); + p->SAXunparsedEntityDecl(name, publicId, systemId, notationName); +} // lwit_unparsedEntityDecl + +void lwit_elementDecl(void* user_data, const xmlChar *name, int type, xmlElementContentPtr content) +{ + libxml2_base* p = reinterpret_cast(user_data); + p->SAXelementDecl(name, type, content); +} // lwit_elementDecl + +void lwit_attributeDecl(void *user_data, const xmlChar *elem, const xmlChar *fullname, int type, int def, const xmlChar *defaultValue, xmlEnumerationPtr tree) +{ + libxml2_base* p = 
reinterpret_cast(user_data); + p->SAXattributeDecl(elem, fullname, type, def, defaultValue, tree); +} // lwit_attributeDecl + +void lwit_entityDecl(void* user_data, const xmlChar *name, int type, const xmlChar *publicId, const xmlChar *systemId, xmlChar *content) +{ + libxml2_base* p = reinterpret_cast(user_data); + p->SAXentityDecl(name, type, publicId, systemId, content); + xmlSAX2EntityDecl(p->parserContext(), name, type, publicId, systemId, content); +} // lwit_entityDecl + +xmlParserInputPtr lwit_resolveEntity(void* user_data, const xmlChar* publicId, const xmlChar* systemId) +{ + libxml2_base* p = reinterpret_cast(user_data); + return p->SAXresolveEntity(publicId, systemId); +} // lwit_resolveEntity + +xmlEntityPtr lwit_getEntity(void* user_data, const xmlChar* name) +{ + libxml2_base* p = reinterpret_cast(user_data); + xmlEntityPtr ent = xmlSAX2GetEntity(p->parserContext(), name); + return ent; +} // lwit_getEntity + +class libxmlInitialiser +{ +public: + libxmlInitialiser() { xmlInitParser(); } +}; + +static xmlSAXHandler saxHandler = { + 0, // internalSubsetSAXFunc internalSubset; + 0, // isStandaloneSAXFunc isStandalone; + 0, // hasInternalSubsetSAXFunc hasInternalSubset; + 0, // hasExternalSubsetSAXFunc hasExternalSubset; + lwit_resolveEntity, // resolveEntitySAXFunc resolveEntity; + lwit_getEntity, // getEntitySAXFunc getEntity; + lwit_entityDecl, // entityDeclSAXFunc entityDecl; + lwit_notationDecl, // notationDeclSAXFunc notationDecl; + lwit_attributeDecl, // attributeDeclSAXFunc attributeDecl; + lwit_elementDecl, // elementDeclSAXFunc elementDecl; + lwit_unparsedEntityDecl, // unparsedEntityDeclSAXFunc unparsedEntityDecl; + lwit_locator, // setDocumentLocatorSAXFunc setDocumentLocator; + lwit_startDocument, // startDocumentSAXFunc startDocument; + lwit_endDocument, // endDocumentSAXFunc endDocument; + lwit_startElement, // startElementSAXFunc startElement; + lwit_endElement, // endElementSAXFunc endElement; + 0, // referenceSAXFunc reference; + 
lwit_characters, // charactersSAXFunc characters; + lwit_ignorableWhitespace, // ignorableWhitespaceSAXFunc ignorableWhitespace; + lwit_processingInstruction, // processingInstructionSAXFunc processingInstruction; + lwit_comment, // commentSAXFunc comment; + lwit_warning, // warningSAXFunc warning; + lwit_error, // errorSAXFunc error; + lwit_fatalError, // fatalErrorSAXFunc fatalError; + 0, // getParameterEntitySAXFunc getParameterEntity; + lwit_cdata, // cdataBlockSAXFunc cdataBlock; + 0, // externalSubsetSAXFunc externalSubset; + 0, // initialized; + /* The following fields are extensions available only on version 2 */ + 0, // _private + 0, // startElementNs + 0, // endElementNs; + 0 // serror; + }; + +xmlSAXHandler* lwit_SaxHandler() +{ + static libxmlInitialiser init; + return &saxHandler; +} + +void lwit_setFeature(xmlParserCtxtPtr context, const char* name, bool value) +{ + int v = value; + if(xmlSetFeature(context, name, reinterpret_cast(&v)) == -1) + { + std::ostringstream os; + os << "Feature not recognized " << name; + throw SAX::SAXNotRecognizedException(os.str()); + } // if ... +} // lwitSetFeature + +bool lwit_getFeature(xmlParserCtxtPtr context, const char* name) +{ + int v; + if(xmlGetFeature(context, name, reinterpret_cast(&v)) == -1) + { + std::ostringstream os; + os << "Feature not recognized " << name; + throw SAX::SAXNotRecognizedException(os.str()); + } // if ... 
+ return static_cast(v); +} // lwit_getFeature + +} // namespace libxml2_wrapper_impl_tiddle + +} // namespace SAX +} // namespace Arabica + +// end of file diff --git a/arabica/src/XML/XMLCharacterClasses.cpp b/arabica/src/XML/XMLCharacterClasses.cpp new file mode 100644 index 000000000..b3ae23c96 --- /dev/null +++ b/arabica/src/XML/XMLCharacterClasses.cpp @@ -0,0 +1,288 @@ + +#include +#include +#include + +const wchar_t base_char_ranges[][2] = +{ + { 0x0041, 0x005A }, { 0x0061, 0x007A }, { 0x00C0, 0x00D6 }, + { 0x00D8, 0x00F6 }, { 0x00F8, 0x00FF }, { 0x0100, 0x0131 }, + { 0x0134, 0x013E }, { 0x0141, 0x0148 }, { 0x014A, 0x017E }, + { 0x0180, 0x01C3 }, { 0x01CD, 0x01F0 }, { 0x01F4, 0x01F5 }, + { 0x01FA, 0x0217 }, { 0x0250, 0x02A8 }, { 0x02BB, 0x02C1 }, + { 0x0386, 0x0386 }, { 0x0388, 0x038A }, { 0x038C, 0x038C }, + { 0x038E, 0x03A1 }, { 0x03A3, 0x03CE }, { 0x03D0, 0x03D6 }, + { 0x03DA, 0x03DA }, { 0x03DC, 0x03DC }, { 0x03DE, 0x03DE }, + { 0x03E0, 0x03E0 }, { 0x03E2, 0x03F3 }, { 0x0401, 0x040C }, + { 0x040E, 0x044F }, { 0x0451, 0x045C }, { 0x045E, 0x0481 }, + { 0x0490, 0x04C4 }, { 0x04C7, 0x04C8 }, { 0x04CB, 0x04CC }, + { 0x04D0, 0x04EB }, { 0x04EE, 0x04F5 }, { 0x04F8, 0x04F9 }, + { 0x0531, 0x0556 }, { 0x0559, 0x0559 }, { 0x0561, 0x0586 }, + { 0x05D0, 0x05EA }, { 0x05F0, 0x05F2 }, { 0x0621, 0x063A }, + { 0x0641, 0x064A }, { 0x0671, 0x06B7 }, { 0x06BA, 0x06BE }, + { 0x06C0, 0x06CE }, { 0x06D0, 0x06D3 }, { 0x06D5, 0x06D5 }, + { 0x06E5, 0x06E6 }, { 0x0905, 0x0939 }, { 0x093D, 0x093D }, + { 0x0958, 0x0961 }, { 0x0985, 0x098C }, { 0x098F, 0x0990 }, + { 0x0993, 0x09A8 }, { 0x09AA, 0x09B0 }, { 0x09B2, 0x09B2 }, + { 0x09B6, 0x09B9 }, { 0x09DC, 0x09DD }, { 0x09DF, 0x09E1 }, + { 0x09F0, 0x09F1 }, { 0x0A05, 0x0A0A }, { 0x0A0F, 0x0A10 }, + { 0x0A13, 0x0A28 }, { 0x0A2A, 0x0A30 }, { 0x0A32, 0x0A33 }, + { 0x0A35, 0x0A36 }, { 0x0A38, 0x0A39 }, { 0x0A59, 0x0A5C }, + { 0x0A5E, 0x0A5E }, { 0x0A72, 0x0A74 }, { 0x0A85, 0x0A8B }, + { 0x0A8D, 0x0A8D }, { 0x0A8F, 0x0A91 }, { 0x0A93, 
0x0AA8 }, + { 0x0AAA, 0x0AB0 }, { 0x0AB2, 0x0AB3 }, { 0x0AB5, 0x0AB9 }, + { 0x0ABD, 0x0ABD }, { 0x0AE0, 0x0AE0 }, { 0x0B05, 0x0B0C }, + { 0x0B0F, 0x0B10 }, { 0x0B13, 0x0B28 }, { 0x0B2A, 0x0B30 }, + { 0x0B32, 0x0B33 }, { 0x0B36, 0x0B39 }, { 0x0B3D, 0x0B3D }, + { 0x0B5C, 0x0B5D }, { 0x0B5F, 0x0B61 }, { 0x0B85, 0x0B8A }, + { 0x0B8E, 0x0B90 }, { 0x0B92, 0x0B95 }, { 0x0B99, 0x0B9A }, + { 0x0B9C, 0x0B9C }, { 0x0B9E, 0x0B9F }, { 0x0BA3, 0x0BA4 }, + { 0x0BA8, 0x0BAA }, { 0x0BAE, 0x0BB5 }, { 0x0BB7, 0x0BB9 }, + { 0x0C05, 0x0C0C }, { 0x0C0E, 0x0C10 }, { 0x0C12, 0x0C28 }, + { 0x0C2A, 0x0C33 }, { 0x0C35, 0x0C39 }, { 0x0C60, 0x0C61 }, + { 0x0C85, 0x0C8C }, { 0x0C8E, 0x0C90 }, { 0x0C92, 0x0CA8 }, + { 0x0CAA, 0x0CB3 }, { 0x0CB5, 0x0CB9 }, { 0x0CDE, 0x0CDE }, + { 0x0CE0, 0x0CE1 }, { 0x0D05, 0x0D0C }, { 0x0D0E, 0x0D10 }, + { 0x0D12, 0x0D28 }, { 0x0D2A, 0x0D39 }, { 0x0D60, 0x0D61 }, + { 0x0E01, 0x0E2E }, { 0x0E30, 0x0E30 }, { 0x0E32, 0x0E33 }, + { 0x0E40, 0x0E45 }, { 0x0E81, 0x0E82 }, { 0x0E84, 0x0E84 }, + { 0x0E87, 0x0E88 }, { 0x0E8A, 0x0E8A }, { 0x0E8D, 0x0E8D }, + { 0x0E94, 0x0E97 }, { 0x0E99, 0x0E9F }, { 0x0EA1, 0x0EA3 }, + { 0x0EA5, 0x0EA5 }, { 0x0EA7, 0x0EA7 }, { 0x0EAA, 0x0EAB }, + { 0x0EAD, 0x0EAE }, { 0x0EB0, 0x0EB0 }, { 0x0EB2, 0x0EB3 }, + { 0x0EBD, 0x0EBD }, { 0x0EC0, 0x0EC4 }, { 0x0F40, 0x0F47 }, + { 0x0F49, 0x0F69 }, { 0x10A0, 0x10C5 }, { 0x10D0, 0x10F6 }, + { 0x1100, 0x1100 }, { 0x1102, 0x1103 }, { 0x1105, 0x1107 }, + { 0x1109, 0x1109 }, { 0x110B, 0x110C }, { 0x110E, 0x1112 }, + { 0x113C, 0x113C }, { 0x113E, 0x113E }, { 0x1140, 0x1140 }, + { 0x114C, 0x114C }, { 0x114E, 0x114E }, { 0x1150, 0x1150 }, + { 0x1154, 0x1155 }, { 0x1159, 0x1159 }, { 0x115F, 0x1161 }, + { 0x1163, 0x1163 }, { 0x1165, 0x1165 }, { 0x1167, 0x1167 }, + { 0x1169, 0x1169 }, { 0x116D, 0x116E }, { 0x1172, 0x1173 }, + { 0x1175, 0x1175 }, { 0x119E, 0x119E }, { 0x11A8, 0x11A8 }, + { 0x11AB, 0x11AB }, { 0x11AE, 0x11AF }, { 0x11B7, 0x11B8 }, + { 0x11BA, 0x11BA }, { 0x11BC, 0x11C2 }, { 0x11EB, 0x11EB }, + { 
0x11F0, 0x11F0 }, { 0x11F9, 0x11F9 }, { 0x1E00, 0x1E9B }, + { 0x1EA0, 0x1EF9 }, { 0x1F00, 0x1F15 }, { 0x1F18, 0x1F1D }, + { 0x1F20, 0x1F45 }, { 0x1F48, 0x1F4D }, { 0x1F50, 0x1F57 }, + { 0x1F59, 0x1F59 }, { 0x1F5B, 0x1F5B }, { 0x1F5D, 0x1F5D }, + { 0x1F5F, 0x1F7D }, { 0x1F80, 0x1FB4 }, { 0x1FB6, 0x1FBC }, + { 0x1FBE, 0x1FBE }, { 0x1FC2, 0x1FC4 }, { 0x1FC6, 0x1FCC }, + { 0x1FD0, 0x1FD3 }, { 0x1FD6, 0x1FDB }, { 0x1FE0, 0x1FEC }, + { 0x1FF2, 0x1FF4 }, { 0x1FF6, 0x1FFC }, { 0x2126, 0x2126 }, + { 0x212A, 0x212B }, { 0x212E, 0x212E }, { 0x2180, 0x2182 }, + { 0x3041, 0x3094 }, { 0x30A1, 0x30FA }, { 0x3105, 0x312C }, + { 0xAC00, 0xD7A3 }, { 0, 0 } +}; // base_char_ranges + +bool Arabica::XML::is_char(wchar_t c) +{ + return (c == text::Unicode::HORIZONTAL_TABULATION) || + (c == text::Unicode::LINE_FEED) || + (c == text::Unicode::CARRIAGE_RETURN) || + ((c >= 0x0020) && (c <= 0xD7FF)) || + ((c >= 0xE000) && (c <= 0xFFFD)) +#ifndef ARABICA_NO_WCHAR_T + || ((c >= 0x10000) && (c <= 0x10FFFF)) +#endif + ; +} // is_char + +bool Arabica::XML::is_space(wchar_t c) +{ + return (c == text::Unicode::SPACE) || + (c == text::Unicode::HORIZONTAL_TABULATION) || + (c == text::Unicode::CARRIAGE_RETURN) || + (c == text::Unicode::LINE_FEED); +} // is_space + +bool Arabica::XML::is_name_char(wchar_t c) +{ + return is_letter(c) || + is_digit(c) || + (c == text::Unicode::FULL_STOP) || // . + (c == text::Unicode::HYPHEN_MINUS) || // - + (c == text::Unicode::LOW_LINE) || // _ + (c == text::Unicode::COLON) || // : + is_combining_char(c) || + is_extender(c); +} // is_name_char + +bool Arabica::XML::is_ncname_char(wchar_t c) +{ + return is_letter(c) || + is_digit(c) || + (c == text::Unicode::FULL_STOP) || // . 
+ (c == text::Unicode::HYPHEN_MINUS) || // - + (c == text::Unicode::LOW_LINE) || // _ + is_combining_char(c) || + is_extender(c); +} // is_ncname_char + +bool Arabica::XML::is_letter(wchar_t c) +{ + return is_base_char(c) || + is_ideographic(c); +} // is_letter + +bool Arabica::XML::is_base_char(wchar_t c) +{ + for(int i=0; base_char_ranges[i][0]; ++i) + { + if(c < base_char_ranges[i][0]) + return false; + + if((c >= base_char_ranges[i][0]) && (c <= base_char_ranges[i][1])) + return true; + } // for ... + + return false; +} // is_base_char + +bool Arabica::XML::is_ideographic(wchar_t c) +{ + return ((c >= 0x4E00) && (c <= 0x9FA5)) || + c == 0x3007 || + ((c >= 0x3021) && (c <= 0x3029)); +} // is_ideographic + +bool Arabica::XML::is_digit(wchar_t c) +{ + return ((c >= 0x0030) && (c <= 0x0039)) || + ((c >= 0x0660) && (c <= 0x0669)) || + ((c >= 0x06F0) && (c <= 0x06F9)) || + ((c >= 0x0966) && (c <= 0x096F)) || + ((c >= 0x09E6) && (c <= 0x09EF)) || + ((c >= 0x0A66) && (c <= 0x0A6F)) || + ((c >= 0x0AE6) && (c <= 0x0AEF)) || + ((c >= 0x0B66) && (c <= 0x0B6F)) || + ((c >= 0x0BE7) && (c <= 0x0BEF)) || + ((c >= 0x0C66) && (c <= 0x0C6F)) || + ((c >= 0x0CE6) && (c <= 0x0CEF)) || + ((c >= 0x0D66) && (c <= 0x0D6F)) || + ((c >= 0x0E50) && (c <= 0x0E59)) || + ((c >= 0x0ED0) && (c <= 0x0ED9)) || + ((c >= 0x0F20) && (c <= 0x0F29)); +} // is_digit + +bool Arabica::XML::is_combining_char(wchar_t c) +{ + return ((c >= 0x0300) && (c <= 0x0345)) || + ((c >= 0x0360) && (c <= 0x0361)) || + ((c >= 0x0483) && (c <= 0x0486)) || + ((c >= 0x0591) && (c <= 0x05A1)) || + ((c >= 0x05A3) && (c <= 0x05B9)) || + ((c >= 0x05BB) && (c <= 0x05BD)) || + (c == 0x05BF) || + ((c >= 0x05C1) && (c <= 0x05C2)) || + (c == 0x05C4) || + ((c >= 0x064B) && (c <= 0x0652)) || + (c == 0x0670) || + ((c >= 0x06D6) && (c <= 0x06DC)) || + ((c >= 0x06DD) && (c <= 0x06DF)) || + ((c >= 0x06E0) && (c <= 0x06E4)) || + ((c >= 0x06E7) && (c <= 0x06E8)) || + ((c >= 0x06EA) && (c <= 0x06ED)) || + ((c >= 0x0901) && (c <= 0x0903)) 
|| + (c == 0x093C) || + ((c >= 0x093E) && (c <= 0x094C)) || + (c == 0x094D) || + ((c >= 0x0951) && (c <= 0x0954)) || + ((c >= 0x0962) && (c <= 0x0963)) || + ((c >= 0x0981) && (c <= 0x0983)) || + (c == 0x09BC) || + (c == 0x09BE) || + (c == 0x09BF) || + ((c >= 0x09C0) && (c <= 0x09C4)) || + ((c >= 0x09C7) && (c <= 0x09C8)) || + ((c >= 0x09CB) && (c <= 0x09CD)) || + (c == 0x09D7) || + ((c >= 0x09E2) && (c <= 0x09E3)) || + (c == 0x0A02) || + (c == 0x0A3C) || + (c == 0x0A3E) || + (c == 0x0A3F) || + ((c >= 0x0A40) && (c <= 0x0A42)) || + ((c >= 0x0A47) && (c <= 0x0A48)) || + ((c >= 0x0A4B) && (c <= 0x0A4D)) || + ((c >= 0x0A70) && (c <= 0x0A71)) || + ((c >= 0x0A81) && (c <= 0x0A83)) || + (c == 0x0ABC) || + ((c >= 0x0ABE) && (c <= 0x0AC5)) || + ((c >= 0x0AC7) && (c <= 0x0AC9)) || + ((c >= 0x0ACB) && (c <= 0x0ACD)) || + ((c >= 0x0B01) && (c <= 0x0B03)) || + (c == 0x0B3C) || + ((c >= 0x0B3E) && (c <= 0x0B43)) || + ((c >= 0x0B47) && (c <= 0x0B48)) || + ((c >= 0x0B4B) && (c <= 0x0B4D)) || + ((c >= 0x0B56) && (c <= 0x0B57)) || + ((c >= 0x0B82) && (c <= 0x0B83)) || + ((c >= 0x0BBE) && (c <= 0x0BC2)) || + ((c >= 0x0BC6) && (c <= 0x0BC8)) || + ((c >= 0x0BCA) && (c <= 0x0BCD)) || + (c == 0x0BD7) || + ((c >= 0x0C01) && (c <= 0x0C03)) || + ((c >= 0x0C3E) && (c <= 0x0C44)) || + ((c >= 0x0C46) && (c <= 0x0C48)) || + ((c >= 0x0C4A) && (c <= 0x0C4D)) || + ((c >= 0x0C55) && (c <= 0x0C56)) || + ((c >= 0x0C82) && (c <= 0x0C83)) || + ((c >= 0x0CBE) && (c <= 0x0CC4)) || + ((c >= 0x0CC6) && (c <= 0x0CC8)) || + ((c >= 0x0CCA) && (c <= 0x0CCD)) || + ((c >= 0x0CD5) && (c <= 0x0CD6)) || + ((c >= 0x0D02) && (c <= 0x0D03)) || + ((c >= 0x0D3E) && (c <= 0x0D43)) || + ((c >= 0x0D46) && (c <= 0x0D48)) || + ((c >= 0x0D4A) && (c <= 0x0D4D)) || + (c == 0x0D57) || + (c == 0x0E31) || + ((c >= 0x0E34) && (c <= 0x0E3A)) || + ((c >= 0x0E47) && (c <= 0x0E4E)) || + (c == 0x0EB1) || + ((c >= 0x0EB4) && (c <= 0x0EB9)) || + ((c >= 0x0EBB) && (c <= 0x0EBC)) || + ((c >= 0x0EC8) && (c <= 0x0ECD)) || + ((c >= 0x0F18) && 
(c <= 0x0F19)) || + (c == 0x0F35) || + (c == 0x0F37) || + (c == 0x0F39) || + (c == 0x0F3E) || + (c == 0x0F3F) || + ((c >= 0x0F71) && (c <= 0x0F84)) || + ((c >= 0x0F86) && (c <= 0x0F8B)) || + ((c >= 0x0F90) && (c <= 0x0F95)) || + (c == 0x0F97) || + ((c >= 0x0F99) && (c <= 0x0FAD)) || + ((c >= 0x0FB1) && (c <= 0x0FB7)) || + (c == 0x0FB9) || + ((c >= 0x20D0) && (c <= 0x20DC)) || + (c == 0x20E1) || + ((c >= 0x302A) && (c <= 0x302F)) || + (c == 0x3099) || + (c == 0x309A); +} // is_combining + +bool Arabica::XML::is_extender(wchar_t c) +{ + return (c == 0x00B7) || + (c == 0x02D0) || + (c == 0x02D1) || + (c == 0x0387) || + (c == 0x0640) || + (c == 0x0E46) || + (c == 0x0EC6) || + (c == 0x3005) || + ((c >= 0x3031) && (c <= 0x3035)) || + ((c >= 0x309D) && (c <= 0x309E)) || + ((c >= 0x30FC) && (c <= 0x30FE)); +} // is_extender + +bool Arabica::XML::is_letter_or_digit(wchar_t c) +{ + return is_letter(c) || is_digit(c); +} // is_letter_or_digit + + +// end of file diff --git a/arabica/src/arabica.cpp b/arabica/src/arabica.cpp new file mode 100644 index 000000000..ff809beaa --- /dev/null +++ b/arabica/src/arabica.cpp @@ -0,0 +1,9 @@ + +#include + +#ifdef ARABICA_USE_LIBXML2 +#include "SAX/wrappers/saxlibxml2.cpp" +#ifdef _MSC_VER +#pragma message("Pulling in libxml2 wrappers") +#endif +#endif diff --git a/arabica/src/convert/impl/ucs2_utf16.cpp b/arabica/src/convert/impl/ucs2_utf16.cpp new file mode 100644 index 000000000..ca1e0502b --- /dev/null +++ b/arabica/src/convert/impl/ucs2_utf16.cpp @@ -0,0 +1,57 @@ +// -------------------------------------------------------------------------- +// -------------------------------------------------------------------------- +#include +// -------------------------------------------------------------------------- + +std::codecvt_base::result Arabica::convert::impl::utf16_2_ucs2(bool be, + char const* from, char const* from_end, char const*& from_next, + wchar_t* to, wchar_t* to_limit, wchar_t*& to_next) +{ + from_next = from; + to_next = to; 
+
+  while((from_next+1 < from_end) && (to_next < to_limit))
+  {
+    // Widen each byte through unsigned char so that values of 0x80 and
+    // above cannot sign-extend before the shift where plain char is signed.
+    // NOTE(review): the original cast target is not visible in this copy
+    // of the patch; going via unsigned char is correct whatever it was.
+    wchar_t b1 = static_cast<wchar_t>(static_cast<unsigned char>(*from_next++));
+    wchar_t b2 = static_cast<wchar_t>(static_cast<unsigned char>(*from_next++));
+
+    *to_next++ = be ? ((b1 << 8) + b2) : ((b2 << 8) + b1);
+  } // while
+
+  return (from_next == from_end) ? std::codecvt_base::ok : std::codecvt_base::partial;
+} // utf16_2_ucs2
+
+// Writes each wide character in [from, from_end) to the output as two bytes,
+// high byte first when be is true and low byte first otherwise.
+//
+// from_next / to_next are set to the first unconverted input character and
+// the first unwritten output byte.  Returns partial, leaving both just past
+// the last complete character, when fewer than two output bytes remain;
+// returns ok once all input has been written.  Only the low 16 bits of each
+// wide character are emitted.
+std::codecvt_base::result Arabica::convert::impl::ucs2_2_utf16(bool be,
+    wchar_t const* from, wchar_t const* from_end, wchar_t const*& from_next,
+    char* to, char* to_limit, char*& to_next)
+{
+  from_next = from;
+  to_next = to;
+
+  while(from_next < from_end)
+  {
+    // Two bytes are written per character.  The previous test,
+    // to_next + 2 >= to_limit, also rejected a buffer with exactly two
+    // bytes free, so an exactly-sized buffer was never filled.
+    if((to_limit - to_next) < 2)
+      return std::codecvt_base::partial;
+
+    wchar_t ch = *from_next;
+    unsigned char lb = static_cast<unsigned char>(ch & 0xFF);
+    unsigned char hb = static_cast<unsigned char>((ch >> 8) & 0xFF);
+
+    if(be)
+    { // big endian
+      *to_next++ = hb;
+      *to_next++ = lb;
+    }
+    else
+    { // little endian
+      *to_next++ = lb;
+      *to_next++ = hb;
+    }
+
+    ++from_next;
+  } // while(from_next < from_end)
+
+  return std::codecvt_base::ok;
+} // ucs2_2_utf16
+
+// end of file
diff --git a/arabica/src/convert/impl/ucs2_utf8.cpp b/arabica/src/convert/impl/ucs2_utf8.cpp
new file mode 100644
index 000000000..4b9f5d864
--- /dev/null
+++ b/arabica/src/convert/impl/ucs2_utf8.cpp
@@ -0,0 +1,103 @@
+//---------------------------------------------------------------------------
+// $Id$
+//---------------------------------------------------------------------------
+#include
+//---------------------------------------------------------------------------
+// Some of this code is derived from work done by Ken Thompson,
+// provided to the X/Open Group.
+//
+// I got my information about UTF-8 from RFC 2044.
+
+
+namespace {
+  // One row per UTF-8 sequence length; the row index is the number of
+  // continuation bytes.
+  //   char_mask / char_value : a lead byte b belongs to the row when
+  //                            (b & char_mask) == char_value
+  //   shift                  : bit position of the payload carried by the
+  //                            lead byte
+  //   wide_mask              : largest code point the row can encode
+  struct Tab
+  {
+    unsigned char char_mask;
+    unsigned char char_value;
+    int shift;
+    unsigned long wide_mask;
+  };
+
+  static const Tab tab[] =
+  {
+    { (unsigned char)(0x80), (unsigned char)(0x00), 0*6, 0x7F, }, // 1 byte sequence
+    { (unsigned char)(0xE0), (unsigned char)(0xC0), 1*6, 0x7FF, }, // 2 byte sequence
+    { (unsigned char)(0xF0), (unsigned char)(0xE0), 2*6, 0xFFFF, }, // 3 byte sequence
+    { 0, 0, 0, 0, } // end of table
+  };
+} // namespace
+
+
+// Encodes the wide characters in [from, from_end) as UTF-8 into [to, to_limit).
+//
+// from_next / to_next are set to the first unconverted input character and
+// the first unwritten output byte.  Returns partial, without writing any
+// part of the current character, when the output cannot hold its whole
+// sequence; returns ok once all input has been consumed.
+//
+// NOTE(review): a value above 0xFFFF matches no table row and is consumed
+// without producing output.  Left as is - confirm whether callers expect
+// codecvt_base::error for such input.
+std::codecvt_base::result Arabica::convert::impl::ucs2_2_utf8(
+    const wchar_t* from, const wchar_t* from_end, const wchar_t*& from_next,
+    char* to, char* to_limit, char*& to_next)
+{
+  from_next = from;
+  to_next = to;
+
+  while(from_next < from_end)
+  {
+    unsigned long fn = static_cast<unsigned long>(*from_next);
+
+    for(const Tab *t = tab; t->char_mask; t++)
+    {
+      if(fn > t->wide_mask )
+        continue;
+
+      // is there enough room in outbuffer?
+      // The sequence is (t - tab) + 1 bytes long.  The previous test,
+      // to_next + (t - tab) + 1 >= to_limit, also refused a buffer with
+      // exactly that many bytes free and could form a pointer past its end.
+      if((to_limit - to_next) < ((t - tab) + 1))
+        return std::codecvt_base::partial;
+
+      int c = t->shift;
+      *to_next++ = static_cast<char>(t->char_value | (fn >> c));
+      while(c > 0)
+      {
+        c -= 6;
+        *to_next++ = static_cast<char>(0x80 | ((fn >> c) & 0x3F));
+      } // while(c > 0)
+      break;
+    } // for(Tab *t = tab; t->char_mask; t++)
+    ++from_next;
+  } // while(from_next < from_end)
+
+  return std::codecvt_base::ok;
+} // ucs2_2_utf8
+
+std::codecvt_base::result Arabica::convert::impl::utf8_2_ucs2(
+    const char* from, const char* from_end, const char*& from_next,
+    wchar_t* to, wchar_t* to_limit, wchar_t*& to_next)
+{
+  from_next = from;
+  to_next = to;
+
+  while((from_next < from_end) && (to_next < to_limit))
+  {
+    unsigned char start = static_cast<unsigned char>(*from_next);
+
+    const Tab *t = tab;
+    for(; t->char_mask; ++t)
+    {
+      if((start & t->char_mask) == t->char_value)
+        break;
+    }
+
+    if((from_next + (t - tab)) >= from_end)
+      break;
+
+    unsigned long wide_mask = t->wide_mask;
+
+    *to_next = start;
+    for(; t != tab; --t)
+    {
+      from_next++;
+      *to_next = (*to_next << 6) | ((*from_next ^ 0x80) & 0xff);
+    
} + *to_next &= wide_mask; + + ++from_next; + ++to_next; + } // while + + return (from_next == from_end) ? std::codecvt_base::ok : std::codecvt_base::partial; +} // utf8_2_ucs2 +// end of file diff --git a/arabica/src/convert/utf8ucs2codecvt.cpp b/arabica/src/convert/utf8ucs2codecvt.cpp new file mode 100644 index 000000000..b51a3d003 --- /dev/null +++ b/arabica/src/convert/utf8ucs2codecvt.cpp @@ -0,0 +1,85 @@ +//--------------------------------------------------------------------------- +// $Id$ +//--------------------------------------------------------------------------- +#include +#include +#include +//--------------------------------------------------------------------------- +#ifndef ARABICA_NO_WCHAR_T + +using namespace Arabica::convert; + +std::codecvt_base::result utf8ucs2codecvt::do_out(std::mbstate_t& /* state */, + const wchar_t* from, + const wchar_t* from_end, + const wchar_t*& from_next, + char* to, + char* to_limit, + char*& to_next) const +{ + return impl::ucs2_2_utf8(from, from_end, from_next, to, to_limit, to_next); +} // do_out + +std::codecvt_base::result utf8ucs2codecvt::do_in(std::mbstate_t& /* state */, + const char* from, + const char* from_end, + const char*& from_next, + wchar_t* to, + wchar_t* to_limit, + wchar_t*& to_next) const +{ + return impl::utf8_2_ucs2(from, from_end, from_next, to, to_limit, to_next); +} // do_in + +std::codecvt_base::result utf8ucs2codecvt::do_unshift(std::mbstate_t& /* state */, + char* to, + char* /* to_limit */, + char*& to_next) const +{ + to_next = to; + return noconv; +} // do_unshift + +int utf8ucs2codecvt::do_length(const std::mbstate_t&, + const char* from, + const char* end, + size_t max) const throw() +{ + size_t count(0); + const char* from_next = from; + + while((from_next < end) && (count < max)) + { + if(!(*from_next & 0x80)) + { + ++count; + ++from_next; + } + else if((*from_next&0xc0) == 0xc0) + { + if(from_next+2 < end) + { + ++count; + from_next += 2; + } + else + break; + } + else 
if((*from_next&0xe0) == 0xe0) + { + if(from_next+3 < end) + { + ++count; + from_next += 3; + } + else + break; + } + } // while + + return (from_next-from); +} // do_length + +#endif +// end of file + diff --git a/arabica/src/io/uri.cpp b/arabica/src/io/uri.cpp new file mode 100644 index 000000000..f28351ee3 --- /dev/null +++ b/arabica/src/io/uri.cpp @@ -0,0 +1,197 @@ + +#include +#include + +using namespace Arabica::io; + +namespace { + const std::string ZERO = "0"; + const std::string PORT_EIGHTY = "80"; + const std::string PORT_443 = "443"; + + const std::string SCHEME_HTTP = "http"; + const std::string SCHEME_HTTPS = "https"; + const std::string SCHEME_FILE = "file"; + + const std::string COLON = ":"; + const char FORWARD_SLASH = '/'; + + const std::string& wellKnownPort(const std::string& scheme) + { + if(scheme.empty()) + return ZERO; + + if(scheme == SCHEME_HTTP) + return PORT_EIGHTY; + if(scheme == SCHEME_HTTPS) + return PORT_443; + + return ZERO; + } // wellKnownPort +} // namespace + +URI::URI(const std::string& uri) : + is_absolute_(false) +{ + parse(uri); +} // URI + +URI::URI(const URI& base, const std::string& relativeUrl) : + scheme_(base.scheme_), + host_(base.host_), + path_(base.path_), + port_(base.port_), + is_absolute_(base.is_absolute_) +{ + if(!relativeUrl.empty()) + { + URI relUrl(relativeUrl); + absolutise(relUrl); + } // if ... 
+} // URI + +const std::string& URI::port() const +{ + if(port_.empty()) + return wellKnownPort(scheme_); + return port_; +} // port() + +std::string URI::as_string() const +{ + std::string str; + if(!scheme_.empty()) + str.append(scheme_).append(COLON); + if(is_absolute_) + str.append("//"); + if(!host_.empty()) + { + str.append(host_); + if(!port_.empty()) + str.append(COLON).append(port_); + } + str.append(path_); + return str; +} // as_string + +void fixSlashes(std::string& path) +{ + for(size_t i = path.find('\\'); i != std::string::npos; i = path.find('\\', i)) + path[i] = FORWARD_SLASH; +} // fixSlashes + +void URI::parse(const std::string& uri) +{ + parse_uri(uri); + + is_absolute_ = (!scheme_.empty() && !host_.empty()) || + (((scheme_ == SCHEME_FILE) && (!path_.empty())) && + ((path_[0] == FORWARD_SLASH) || (path_[1] == ':'))); +} // parse + +void URI::parse_uri(const std::string& uri) +{ + // I'd like to use something a bit stronger - http://code.google.com/p/uri-grammar/ + // but that would put a Boost Spirit dependence right in the core, which I'm not prepared to do at the moment + + size_t d = uri.find_first_of(COLON); + if(d == std::string::npos) + { + path_ = uri; + fixSlashes(path_); + return; + } // if ... + + if(d == 1) + { + // looks like a windows file path + path_ = uri; + fixSlashes(path_); + scheme_ = SCHEME_FILE; + return; + } // if ... + + scheme_ = uri.substr(0, d); + + std::string::const_iterator u = uri.begin() + d; + std::string::const_iterator ue = uri.end(); + + ++u; + if(*u == FORWARD_SLASH && *(u+1) == FORWARD_SLASH) + { + u += 2; + u = parseAuthority(u, ue); + } // if ... 
+ + path_.append(u, ue); +} // parse + +std::string::const_iterator URI::parseAuthority(const std::string::const_iterator& u, + const std::string::const_iterator& ue) +{ + std::string::const_iterator slash = std::find(u, ue, FORWARD_SLASH); + if(slash == ue) + { + // ah, this is easy + host_.append(u, ue); + return ue; + } // if(slash == we) + + std::string::const_iterator colon = std::find(u, slash, ':'); + host_.append(u, colon); + + if(colon != slash) + port_.append(colon+1, slash); + + return slash; +} // parseAuthority + +bool compatible_schemes(const std::string& scheme, + const std::string& relative) +{ + if(scheme.empty() && (relative == "file")) + return true; + if(relative.empty()) + return true; + return (scheme == relative); +} // compatible_schemes + +void URI::absolutise(URI& relative) +{ + if((relative.is_absolute()) || + !compatible_schemes(scheme_, relative.scheme())) + { + swap(relative); + return; + } + + if(relative.path_[0] == FORWARD_SLASH) + path_ = relative.path_; + else + combinePath(relative.path_); +} // absolutise + +void URI::combinePath(const std::string& relPath) +{ + if(*(path_.rbegin()) != FORWARD_SLASH) + path_.erase(path_.rfind(FORWARD_SLASH)+1); + + std::string::size_type from = path_.length() - 1; + path_.append(relPath); + + size_t dots = path_.find("/../", from); + while(dots != std::string::npos) + { + int preceding_slash = (dots > 0) ? 
path_.rfind(FORWARD_SLASH, dots-1) : 0; + path_.erase(preceding_slash, dots+3-preceding_slash); + dots = path_.find("/../", preceding_slash); + } // while + + size_t dot = path_.find("/./"); + while(dot != std::string::npos) + { + path_.erase(dot, 2); + dot = path_.find("/./", dot); + } +} // combinePath + diff --git a/arabica/src/taggle/Schema.cpp b/arabica/src/taggle/Schema.cpp new file mode 100644 index 000000000..15aad905a --- /dev/null +++ b/arabica/src/taggle/Schema.cpp @@ -0,0 +1,14 @@ + +#include +#include + +const int Arabica::SAX::Schema::M_ANY = 0xFFFFFFFF; +const int Arabica::SAX::Schema::M_EMPTY = 0; +const int Arabica::SAX::Schema::M_PCDATA = 1 << 30; +const int Arabica::SAX::Schema::M_ROOT = 1 << 31; + +const int Arabica::SAX::Schema::F_RESTART = 1; +const int Arabica::SAX::Schema::F_CDATA = 2; +const int Arabica::SAX::Schema::F_NOFORCE = 4; + +