Introduce App.StringHasher and StringID

Split the Document::mapStringToID function out into StringHasher. StringHasher functions as a persistent string hash/indexer. It stores each string in an internal map and indexes the string with an incremental integer ID. The ID is returned as a reference-counted object, StringID. By default, StringHasher only persists strings that are in use. Strings longer than a configurable length threshold are internally hashed with SHA1 and the original text is discarded. Both StringHasher and StringID are exposed to Python. The Document object has a default hasher object. Other properties can easily embed their own string hasher.
realthunder · Mar 5, 2018 · e057cc1 · e057cc1
1 parent 2ddd4b4
commit e057cc1
Show file tree

Hide file tree

Showing 12 changed files with 801 additions and 110 deletions.
diff --git a/src/App/Application.cpp b/src/App/Application.cpp
@@ -81,6 +81,7 @@
 #include <Base/QuantityPy.h>
 #include <Base/UnitPy.h>
 
+#include "StringHasherPy.h"
 #include "GeoFeature.h"
 #include "FeatureTest.h"
 #include "FeaturePython.h"
@@ -231,6 +232,7 @@ Application::Application(std::map<std::string,std::string> &mConfig)
  Base::Interpreter().addType(&Base::PlacementPy::Type, pAppModule, "Placement");
  Base::Interpreter().addType(&Base::RotationPy::Type, pAppModule, "Rotation");
  Base::Interpreter().addType(&Base::AxisPy::Type, pAppModule, "Axis");
+ Base::Interpreter().addType(&App::StringHasherPy::Type, pAppModule, "StringHasher");
 
  // Note: Create an own module 'Base' which should provide the python
  // binding classes from the base module. At a later stage we should
@@ -1312,6 +1314,9 @@ void Application::initTypes(void)
  Data::ComplexGeoData ::init();
  Data::Segment ::init();
 
+ App::StringID ::init();
+ App::StringHasher ::init();
+
  // Properties
  App ::Property ::init();
  App ::PropertyContainer ::init();

diff --git a/src/App/CMakeLists.txt b/src/App/CMakeLists.txt
@@ -71,6 +71,8 @@ else()
  )
 endif()
 
+generate_from_xml(StringIDPy)
+generate_from_xml(StringHasherPy)
 generate_from_xml(DocumentPy)
 generate_from_xml(DocumentObjectPy)
 generate_from_xml(ExtensionPy)
@@ -92,6 +94,8 @@ generate_from_py(FreeCADInit InitScript.h)
 generate_from_py(FreeCADTest TestScript.h)
 
 SET(FreeCADApp_XML_SRCS
+ StringIDPy.xml
+ StringHasherPy.xml
  ExtensionPy.xml
  ExtensionContainerPy.xml
  DocumentObjectExtensionPy.xml
@@ -113,6 +117,9 @@ SOURCE_GROUP("XML" FILES ${FreeCADApp_XML_SRCS})
 # The document stuff
 SET(Document_CPP_SRCS
  Annotation.cpp
+ StringHasher.cpp
+ StringHasherPyImp.cpp
+ StringIDPyImp.cpp
  Document.cpp
  DocumentObject.cpp
  Extension.cpp
@@ -160,6 +167,7 @@ SET(Document_CPP_SRCS
 
 SET(Document_HPP_SRCS
  Annotation.h
+ StringHasher.h
  Document.h
  DocumentObject.h
  Extension.h

diff --git a/src/App/Document.cpp b/src/App/Document.cpp
@@ -62,6 +62,7 @@ recompute path. Also enables more complicated dependencies beyond trees.
 #include <boost/graph/adjacency_list.hpp>
 #include <boost/graph/subgraph.hpp>
 #include <boost/graph/graphviz.hpp>
+#include <boost/bimap.hpp>
 
 #ifdef USE_OLD_DAG
 #include <boost/graph/topological_sort.hpp>
@@ -143,6 +144,8 @@ typedef std::vector <size_t> Path;
 
 namespace App {
 
+typedef boost::bimap<StringHasherRef,int> HasherMap;
+
 // Pimpl class
 struct DocumentP
 {
@@ -158,8 +161,7 @@ struct DocumentP
  int iUndoMode;
  unsigned int UndoMemSize;
  unsigned int UndoMaxStackSize;
- std::map<QByteArray,Document::StringID> stringHashes;
- long stringHashID;
+ mutable HasherMap hashers;
 #ifdef USE_OLD_DAG
  DependencyList DepList;
  std::map<DocumentObject*,Vertex> VertexObjectList;
@@ -178,7 +180,6 @@ struct DocumentP
  iUndoMode = 0;
  UndoMemSize = 0;
  UndoMaxStackSize = 20;
- stringHashID = 0;
  }
 
  static
@@ -194,19 +195,6 @@ struct DocumentP
 
 PROPERTY_SOURCE(App::Document, App::PropertyContainer)
 
-Document::StringID Document::mapStringToID(const char *text) {
- return mapStringToID(QByteArray(text));
-}
-
-Document::StringID Document::mapStringToID(const QByteArray &data) {
- QCryptographicHash hash(QCryptographicHash::Sha1);
- hash.addData(data);
- auto &id = d->stringHashes[hash.result()];
- if(!id) 
- id = std::make_shared<const long>(++d->stringHashID);
- return id;
-}
-
 bool Document::testStatus(Status pos) const
 {
  return d->StatusBits.test((size_t)pos);
@@ -1310,6 +1298,7 @@ void Document::setTransactionMode(int iMode)
 // constructor
 //--------------------------------------------------------------------------
 Document::Document(void)
+ :Hasher(new StringHasher)
 {
  // Remark: In a constructor we should never increment a Python object as we cannot be sure
  // if the Python interpreter gets a reference of it. E.g. if we increment but Python don't
@@ -1401,8 +1390,6 @@ Document::Document(void)
  ADD_PROPERTY_TYPE(LicenseURL,(licenseUrl.c_str()),0,Prop_None,"URL to the license text/contract");
  ADD_PROPERTY_TYPE(ShowHidden,(false), 0,PropertyType(Prop_None), 
  "Whether to show hidden object items in the tree view");
- ADD_PROPERTY_TYPE(SaveAllStringIDs,(false), 0,PropertyType(Prop_None), 
- "Whether to preserve unreferenced string IDs");
 
  // this creates and sets 'TransientDir' in onChanged()
  ADD_PROPERTY_TYPE(TransientDir,(""),0,PropertyType(Prop_Transient|Prop_ReadOnly),
@@ -1473,48 +1460,37 @@ std::string Document::getTransientDirectoryName(const std::string& uuid, const s
 
 void Document::Save (Base::Writer &writer) const
 {
+ d->hashers.clear();
+ addStringHasher(Hasher);
+
  writer.Stream() << "<?xml version='1.0' encoding='utf-8'?>" << endl
  << "<!--" << endl
  << " FreeCAD Document, see http://www.freecadweb.org for more information..." << endl
  << "-->" << endl;
 
- size_t count = 0;
- if(SaveAllStringIDs.getValue())
- count = d->stringHashes.size();
- for(auto &v : d->stringHashes)
- if(v.second.use_count()>1)
- ++count;
-
  writer.Stream() << "<Document SchemaVersion=\"4\" ProgramVersion=\""
  << App::Application::Config()["BuildVersionMajor"] << "."
  << App::Application::Config()["BuildVersionMinor"] << "R"
  << App::Application::Config()["BuildRevision"]
  << "\" FileVersion=\"" << writer.getFileVersion() 
- << "\" StringHashCount=\"" << count << "\">" << endl;
+ << "\" StringHasher=\"1\">" << endl;
 
- writer.incInd();
- count = 0;
- for(auto &v : d->stringHashes) {
- if(SaveAllStringIDs.getValue() || v.second.use_count()>1)
- writer.Stream() << "<Hash value=\""<< v.first.toBase64().constData()
- << "\" id=\""<<*v.second<<"\"/>" << endl;
- else
- ++count;
- }
- writer.decInd();
+ Hasher->Save(writer);
 
- FC_LOG("string hash size " << d->stringHashes.size() << ", unused " << count);
-
  PropertyContainer::Save(writer);
 
  // writing the features types
  writeObjects(d->objectArray, writer);
  writer.Stream() << "</Document>" << endl;
+
+ d->hashers.clear();
 }
 
 void Document::Restore(Base::XMLReader &reader)
 {
  int i,Cnt;
+ d->hashers.clear();
+ addStringHasher(Hasher);
 
  reader.readElement("Document");
  long scheme = reader.getAttributeAsInteger("SchemaVersion");
@@ -1530,19 +1506,10 @@ void Document::Restore(Base::XMLReader &reader)
  reader.FileVersion = 0;
  }
 
- d->stringHashes.clear();
- d->stringHashID = 0;
- if (reader.hasAttribute("StringHashCount")) {
- int count = reader.getAttributeAsInteger("StringHashCount");
- for(i=0;i<count;++i) {
- reader.readElement("Hash");
- QByteArray value(reader.getAttribute("value"));
- long id = reader.getAttributeAsInteger("id");
- if(d->stringHashID < id)
- d->stringHashID = id;
- d->stringHashes[QByteArray::fromBase64(value)] = std::make_shared<const long>(id);
- }
- }
+ if (reader.hasAttribute("StringHasher"))
+ Hasher->Restore(reader);
+ else
+ Hasher->clear();
 
  // When this document was created the FileName and Label properties
  // were set to the absolute path or file name, respectively. To save
@@ -1571,7 +1538,6 @@ void Document::Restore(Base::XMLReader &reader)
  reader.readElement("Feature");
  string type = reader.getAttribute("type");
  string name = reader.getAttribute("name");
-
  try {
  addObject(type.c_str(), name.c_str(), /*isNew=*/ false);
  }
@@ -1607,6 +1573,23 @@ void Document::Restore(Base::XMLReader &reader)
  }
 
  reader.readEndElement("Document");
+ d->hashers.clear();
+}
+
+std::pair<bool,int> Document::addStringHasher(StringHasherRef hasher) const {
+ auto ret = d->hashers.left.insert(HasherMap::left_map::value_type(hasher,(int)d->hashers.size()));
+ return std::make_pair(ret.second,ret.first->second);
+}
+
+StringHasherRef Document::getStringHasher(int idx) const {
+ auto it = d->hashers.right.find(idx);
+ StringHasherRef hasher;
+ if(it == d->hashers.right.end()) {
+ hasher = new StringHasher;
+ d->hashers.right.insert(HasherMap::right_map::value_type(idx,hasher));
+ }else
+ hasher = it->second;
+ return hasher;
 }
 
 static Document::ExportStatus _DocExporting;
@@ -1636,6 +1619,7 @@ void Document::exportObjects(const std::vector<App::DocumentObject*>& obj,
  std::ostream& out, bool keepExternal)
 {
  DocumentExporting exporting(keepExternal);
+ d->hashers.clear();
 
  if(FC_LOG_INSTANCE.isEnabled(FC_LOGLEVEL_LOG)) {
  for(auto o : obj) {
@@ -1666,6 +1650,7 @@ void Document::exportObjects(const std::vector<App::DocumentObject*>& obj,
 
  // write additional files
  writer.writeFiles();
+ d->hashers.clear();
 }
 
 void Document::writeObjects(const std::vector<App::DocumentObject*>& obj,
@@ -1794,6 +1779,7 @@ Document::readObjects(Base::XMLReader& reader)
 std::vector<App::DocumentObject*>
 Document::importObjects(Base::XMLReader& reader)
 {
+ d->hashers.clear();
  Base::ObjectStatusLocker<Status, Document> restoreBit(Status::Restoring, this);
  Base::ObjectStatusLocker<Status, Document> restoreBit2(Status::Importing, this);
  reader.readElement("Document");
@@ -1824,6 +1810,7 @@ Document::importObjects(Base::XMLReader& reader)
 
  afterRestore(objs);
  signalFinishImportObjects(objs);
+ d->hashers.clear();
  return objs;
 }
 
@@ -1836,6 +1823,8 @@ unsigned int Document::getMemSize (void) const
  for (it = d->objectArray.begin(); it != d->objectArray.end(); ++it)
  size += (*it)->getMemSize();
 
+ size += Hasher->getMemSize();
+
  // size of the document properties...
  size += PropertyContainer::getMemSize();
 

diff --git a/src/App/Document.h b/src/App/Document.h
@@ -28,6 +28,7 @@
 #include <Base/Persistence.h>
 #include <Base/Type.h>
 
+#include "StringHasher.h"
 #include "PropertyContainer.h"
 #include "PropertyStandard.h"
 #include "PropertyLinks.h"
@@ -114,10 +115,10 @@ class AppExport Document : public App::PropertyContainer
  PropertyString TipName;
  /// Whether to show hidden items in TreeView
  PropertyBool ShowHidden;
- /// Whether to preserve unreferences string ID
- PropertyBool SaveAllStringIDs;
  //@}
 
+ StringHasherRef Hasher;
+
  /** @name Signals of the document */
  //@{
  /// signal on new Object
@@ -430,6 +431,34 @@ class AppExport Document : public App::PropertyContainer
  (const App::DocumentObject* from, const App::DocumentObject* to) const;
  //@}
 
+ /** Called by a property during save to register its contained StringHasher
+ *
+ * @param hasher: the input hasher
+ * @return Returns a pair<bool,int>. The boolean member is true if the
+ * StringHasher was newly added, i.e. it has not been saved before. The
+ * integer is the hasher index.
+ *
+ * The StringHasher object is designed to be shared among multiple objects.
+ * So, we must not save duplicate copies of the same hasher. And must be
+ * able to restore with the same sharing relationship. This function returns
+ * whether the hasher has been saved before by other objects, and the index
+ * of the hasher. If the hasher has not been saved before, the object must
+ * save the hasher by calling StringHasher::Save
+ */
+ std::pair<bool,int> addStringHasher(StringHasherRef hasher) const;
+
+ /** Called by property to restore its containing StringHasher
+ *
+ * @param index: the index previously returned by calling
+ * addStringHasher() during save.
+ *
+ * @return Return the resulting string hasher.
+ *
+ * The caller is responsible for restoring the hasher itself if it is the
+ * first owner of the hasher, i.e. if addStringHasher() returned true during
+ * save
+ */
+ StringHasherRef getStringHasher(int index) const;
+
  /** Return the object linked to this object
  *
  * @param links: holds the links found
@@ -445,28 +474,11 @@ class AppExport Document : public App::PropertyContainer
  return !links.empty();
  }
 
+ void addRemapProperty(Property *prop);
+
  /// Function called to signal that an object identifier has been renamed
  void renameObjectIdentifiers(const std::map<App::ObjectIdentifier, App::ObjectIdentifier> & paths, const std::function<bool(const App::DocumentObject*)> &selector = [](const App::DocumentObject *) { return true; });
 
- /** @name Maps an arbitary string to an integer
- *
- * These function internally hashes the string, and stroes the hash in a
- * map to integer. The hashes of the strings passed to this function are
- * persisted, which means the returned ID is an unique identifier of the
- * string. The function return the interger as a shared pointer to
- * reference count the ID so that it is possible to prune any unused hash,
- * depending on the value of Document.SaveAllStringIDs
- *
- * The purpose of function is to provide a short form of a stable string
- * hash.
- */
- //@{
- typedef std::shared_ptr<const long> StringID;
- StringID mapStringToID(const char *text);
- StringID mapStringToID(const QByteArray &data);
- static inline long stringID(StringID id) {return id?*id:-1;}
- //@}
-
  virtual PyObject *getPyObject(void);
 
  friend class Application;
@@ -507,7 +519,6 @@ class AppExport Document : public App::PropertyContainer
 
  std::string getTransientDirectoryName(const std::string& uuid, const std::string& filename) const;
 
-
 private:
  // # Data Member of the document +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  std::list<Transaction*> mUndoTransactions;