Recovering of malformed ENEX file #12

Open · wants to merge 10 commits into main
50 changes: 50 additions & 0 deletions README.md
@@ -25,6 +25,56 @@ You can convert that file to SQLite like so:

This will display a progress bar and create a SQLite database file called `evernote.db`.

In situations where the ENEX file is malformed, or individual notes have grown
too large for the optimised XML parser, you have the option of running in
recovery mode, which uses techniques that allow the process to carry on
through all of the notes.

    $ evernote-to-sqlite recover-enex evernote.db MyNotes.enex

If you have a very large file you can also supply a resume file, which allows
the process to pick up where it left off after an interruption.
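
The on-disk format of the resume file is an implementation detail of the
recovery module, but conceptually it is just a record of the byte ranges of
the export that have already been processed. As a purely hypothetical
illustration, one `start,end` pair per note:

```
0,524288
524288,3670016
3670016,3721002
```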

```shell script
$ evernote-to-sqlite recover-enex --help
Usage: evernote-to-sqlite recover-enex [OPTIONS] DB_PATH ENEX_FILE

  Use recovery techniques that allow malformed Evernote exports to be
  transformed to SQLite. Especially useful for very large ENEX files. Be
  warned that this can take a very long time for large ENEX files.

Options:
  --max_note_size INTEGER  Maximum size in MB when attempting to discover the
                           end-tag of a recognised note before skipping to the
                           next.
  --resume_file PATH       Allows resuming where a conversion was
                           aborted/failed. The file will be created if it does
                           not exist and records start and end byte offsets in
                           the ENEX file.
  --help                   Show this message and exit.

$ evernote-to-sqlite recover-enex evernote.db MyNotes.enex --max_note_size 30 --resume_file my_resume_file

...

5763: 0.3 MB, recovered: 0, exceed max size: 16
processing current content 1: 1 MB
processing current content 1: 2 MB
processing current content 1: 3 MB
5764: 3.2 MB, recovered: 0, exceed max size: 16
5765: 0.0 MB, recovered: 0, exceed max size: 16
processing current content 1: 1 MB
processing current content 1: 2 MB
processing current content 1: 3 MB
5766: 3.4 MB, recovered: 0, exceed max size: 16
[07:22:40] INFO Notes with new start generated: 0 cli.py:150
[07:22:41] INFO Notes that exceeded the maximum size: 16 cli.py:151
INFO Notes that were found but required splitting: 51 cli.py:152
INFO Notes found where <content> tag required to be escaped: 7 cli.py:154
Processing Evernote export file 5GB... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸ 100% 0:00:01
Parsing note... ━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10% 0:00:00
```
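
Each numbered line of output reports the running note count, the size of the
note just extracted, and running totals of notes whose start tag had to be
recovered and notes that exceeded the maximum size.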


### Limitations

Unfortunately the ENEX export format does not include a unique identifier for each note. This means you cannot use this tool to re-import notes after they have been updated - you should consider this tool to be a one-time transformation of an ENEX file into an equivalent SQLite database.
123 changes: 120 additions & 3 deletions evernote_to_sqlite/cli.py
@@ -1,15 +1,36 @@
import sqlite_utils
import click
import os
import logging
from rich.logging import RichHandler
from rich.progress import Progress
import lxml
from lxml import etree
import sys

try:
    from .utils import find_all_tags, save_note, save_note_recovery, ensure_indexes, human_size
    from .hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file
except ModuleNotFoundError:
    # workaround for PyCharm
    from utils import find_all_tags, save_note, save_note_recovery, ensure_indexes, human_size
    from hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file

FORMAT = "%(message)s"
logging.basicConfig(
    level="NOTSET", format=FORMAT, datefmt="[%X]", handlers=[RichHandler()]
)
logger = logging.getLogger(__name__)

MEGABYTE = 1_000_000


@click.group()
@click.version_option()
def cli():
    """Tools for converting Evernote content to SQLite"""


# noinspection SpellCheckingInspection
@cli.command()
@click.argument(
    "db_path",
@@ -22,7 +22,7 @@ def cli():
    required=True,
)
def enex(db_path, enex_file):
    """Convert Evernote .enex exports to SQLite"""
    file_length = os.path.getsize(enex_file)
    fp = open(enex_file, "r", encoding="utf-8")
    db = sqlite_utils.Database(db_path)
@@ -31,3 +52,99 @@ def enex(db_path, enex_file):
        save_note(db, note)
    fp.close()
    ensure_indexes(db)


@cli.command()
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.argument(
    "enex_file",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.option(
    "--max_note_size",
    type=click.INT,
    required=False,
    default=30,
    help="Maximum size in MB when attempting to discover the end-tag of a recognised note before skipping to the next.",
)
@click.option(
    "--resume_file",
    type=click.Path(),
    required=False,
    help="Allows resuming where a conversion was aborted/failed. "
    "The file will be created if it does not exist and records start and end byte offsets in the ENEX file.",
)
def recover_enex(db_path, enex_file, max_note_size=30, resume_file=None):
    """Use recovery techniques that allow malformed Evernote exports to be
    transformed to SQLite. Especially useful for very large ENEX files. Be
    warned that this can take a very long time for large ENEX files."""

    file_length = os.path.getsize(enex_file)
    db = sqlite_utils.Database(db_path)

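    # Resume bookkeeping: the resume file records a (start, end) byte range
    # for every note already extracted, so an interrupted run can restart
    # from the start offset of the most recent record.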
    records = read_recovery_file(resume_file)
    current_position = sorted(records)[-1][0] if records else 0
    count = len(records) - 1
    splitted = 0
    content_escaped = 0

    with Progress() as progress:
        all_tasks = progress.add_task(
            f"[red]Processing Evernote export file {human_size(file_length)}...",
            total=file_length,
        )
        xml_parser = HugeXmlParser(enex_file, max_size_mb=max_note_size, progress_bar=progress)

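        # yield_tag scans the export for the next <note> element starting at
        # start_pos and yields its byte range plus the raw bytes; recreating
        # the generator on each pass keeps it aligned with current_position.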
        while not progress.finished:
            try:
                start_pos, end_pos, data = next(xml_parser.yield_tag(start_pos=current_position))
            except StopIteration:
                break

            progress.update(all_tasks, completed=end_pos)
            current_position = end_pos

            progress.console.print(
                f"{count}: {round(len(data) / MEGABYTE, 1)} MB, "
                f"recovered: {xml_parser.new_start}, exceed max size: {xml_parser.exceed_max}"
            )
            records.add((start_pos, end_pos))
            if resume_file:
                update_recovery_file(records, resume_file)
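            # Recovery strategy: first try to parse the extracted bytes as a
            # single <note>; if that fails, split the chunk into individual
            # notes, and escape the <content> element of any chunk that still
            # refuses to parse.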
            notes = []
            try:
                notes.append(lxml.etree.fromstring(data))
            except lxml.etree.XMLSyntaxError as e:
                progress.console.print(e)
                progress.console.print("potential multiple notes, breaking these up")
                splitted += 1
                for data_chunk in xml_parser.split_multiple_tag_chunk(data):
                    try:
                        data_chunk = lxml.etree.fromstring(data_chunk)
                    except lxml.etree.XMLSyntaxError as e:
                        progress.console.print(e)
                        progress.console.print("invalid xml, attempting to escape content-tag")
                        data_chunk = xml_parser.escape_single_tag(data_chunk, "content")
                        content_escaped += 1
                        data_chunk = lxml.etree.fromstring(data_chunk)
                    notes.append(data_chunk)
            for note in notes:
                save_note_recovery(db, note)
                count += 1

    logger.info(f"Notes with new start generated: {xml_parser.new_start}")
    logger.info(f"Notes that exceeded the maximum size: {xml_parser.exceed_max}")
    logger.info(f"Notes that were found but required splitting: {splitted}")
    logger.info(
        f"Notes found where <content> tag required to be escaped: {content_escaped}"
    )

    ensure_indexes(db)


if __name__ == '__main__':
    cli(sys.argv[1:])
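
The `hugexmlparser.py` module added by this PR is not shown in the rendered
diff. As orientation only, here is a minimal hypothetical sketch of the two
recovery-file helpers, inferred purely from how `cli.py` calls them
(`read_recovery_file` returning a set of `(start, end)` tuples,
`update_recovery_file` persisting them); the PR's real implementation may
differ:

```python
# Hypothetical sketch (not the PR's actual hugexmlparser.py): the CLI treats
# the resume file as a set of (start, end) byte offsets, so a plain-text
# "start,end" line per processed note would satisfy its usage.
import os


def read_recovery_file(path):
    """Return the set of (start, end) byte ranges already processed."""
    records = set()
    if path is None or not os.path.exists(path):
        return records
    with open(path) as fp:
        for line in fp:
            start, end = line.strip().split(",")
            records.add((int(start), int(end)))
    return records


def update_recovery_file(records, path):
    """Rewrite the resume file with one 'start,end' line per record."""
    with open(path, "w") as fp:
        for start, end in sorted(records):
            fp.write(f"{start},{end}\n")
```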