Recovering of malformed ENEX file #12

Open · wants to merge 10 commits into main
50 changes: 50 additions & 0 deletions README.md
@@ -25,6 +25,56 @@ You can convert that file to SQLite like so:

This will display a progress bar and create a SQLite database file called `evernote.db`.

In situations where the ENEX file is malformed, or individual notes have grown
too large for the optimised XML parser, you have the option of running in
recovery mode, which uses techniques that allow the process to carry on
through all of the notes.

    $ evernote-to-sqlite recover-enex evernote.db MyNotes.enex

If you have a very large file you can also supply a resume file, which allows
the process to pick up where it left off after an interruption.
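
The on-disk format of the resume file is an implementation detail of the
recovery module, but conceptually it is just a record of the byte ranges of
the export that have already been processed. As a purely hypothetical
illustration, one `start,end` pair per note:

```
0,524288
524288,3670016
3670016,3721002
```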

```shell script
$ evernote-to-sqlite recover-enex --help
Usage: evernote-to-sqlite recover-enex [OPTIONS] DB_PATH ENEX_FILE

  Use recovery techniques that allow malformed Evernote exports to be
  transformed to SQLite. Especially useful for very large ENEX files. Be
  warned that this can take a very long time for large ENEX files.

Options:
  --max_note_size INTEGER  Maximum size in MB when attempting to discover the
                           end-tag of a recognised note before skipping to the
                           next.
  --resume_file PATH       Allows resuming where a conversion was
                           aborted/failed. The file will be created if it does
                           not exist and records start and end byte offsets in
                           the ENEX file.
  --help                   Show this message and exit.

$ evernote-to-sqlite recover-enex evernote.db MyNotes.enex --max_note_size 30 --resume_file my_resume_file

...

5763: 0.3 MB, recovered: 0, exceed max size: 16
processing current content 1: 1 MB
processing current content 1: 2 MB
processing current content 1: 3 MB
5764: 3.2 MB, recovered: 0, exceed max size: 16
5765: 0.0 MB, recovered: 0, exceed max size: 16
processing current content 1: 1 MB
processing current content 1: 2 MB
processing current content 1: 3 MB
5766: 3.4 MB, recovered: 0, exceed max size: 16
[07:22:40] INFO Notes with new start generated: 0 cli.py:150
[07:22:41] INFO Notes that exceeded the maximum size: 16 cli.py:151
INFO Notes that were found but required splitting: 51 cli.py:152
INFO Notes found where <content> tag required to be escaped: 7 cli.py:154
Processing Evernote export file 5GB... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸ 100% 0:00:01
Parsing note... ━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10% 0:00:00
```
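
Each numbered line of output reports the running note count, the size of the
note just extracted, and running totals of notes whose start tag had to be
recovered and notes that exceeded the maximum size.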


### Limitations

Unfortunately the ENEX export format does not include a unique identifier for each note. This means you cannot use this tool to re-import notes after they have been updated - you should consider this tool to be a one-time transformation of an ENEX file into an equivalent SQLite database.
123 changes: 120 additions & 3 deletions evernote_to_sqlite/cli.py
@@ -1,15 +1,36 @@
import sqlite_utils
import click
import os
import logging
from rich.logging import RichHandler
from rich.progress import Progress
import lxml
from lxml import etree
import sys

try:
    from .utils import find_all_tags, save_note, save_note_recovery, ensure_indexes, human_size
    from .hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file
except ModuleNotFoundError:
    # workaround for PyCharm
    from utils import find_all_tags, save_note, save_note_recovery, ensure_indexes, human_size
    from hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file

FORMAT = "%(message)s"
logging.basicConfig(
    level="NOTSET", format=FORMAT, datefmt="[%X]", handlers=[RichHandler()]
)
logger = logging.getLogger(__name__)

MEGABYTE = 1_000_000


@click.group()
@click.version_option()
def cli():
    """Tools for converting Evernote content to SQLite"""


# noinspection SpellCheckingInspection
@cli.command()
@click.argument(
    "db_path",
@@ -22,7 +22,7 @@ def cli():
    required=True,
)
def enex(db_path, enex_file):
    """Convert Evernote .enex exports to SQLite"""
    file_length = os.path.getsize(enex_file)
    fp = open(enex_file, "r", encoding="utf-8")
    db = sqlite_utils.Database(db_path)
@@ -31,3 +52,99 @@ def enex(db_path, enex_file):
        save_note(db, note)
    fp.close()
    ensure_indexes(db)


@cli.command()
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.argument(
    "enex_file",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.option(
    "--max_note_size",
    type=click.INT,
    required=False,
    default=30,
    help="Maximum size in MB when attempting to discover the end-tag of a recognised note before skipping to the next.",
)
@click.option(
    "--resume_file",
    type=click.Path(),
    required=False,
    help="Allows resuming where a conversion was aborted/failed. "
    "The file will be created if it does not exist and records start and end byte offsets in the ENEX file.",
)
def recover_enex(db_path, enex_file, max_note_size=30, resume_file=None):
    """Use recovery techniques that allow malformed Evernote exports to be
    transformed to SQLite. Especially useful for very large ENEX files. Be
    warned that this can take a very long time for large ENEX files."""

    file_length = os.path.getsize(enex_file)
    db = sqlite_utils.Database(db_path)

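    # Resume bookkeeping: the resume file records a (start, end) byte range
    # for every note already extracted, so an interrupted run can restart
    # from the start offset of the most recent record.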
    records = read_recovery_file(resume_file)
    current_position = sorted(records)[-1][0] if records else 0
    count = len(records) - 1
    splitted = 0
    content_escaped = 0

    with Progress() as progress:
        all_tasks = progress.add_task(
            f"[red]Processing Evernote export file {human_size(file_length)}...",
            total=file_length,
        )
        xml_parser = HugeXmlParser(enex_file, max_size_mb=max_note_size, progress_bar=progress)

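        # yield_tag scans the export for the next <note> element starting at
        # start_pos and yields its byte range plus the raw bytes; recreating
        # the generator on each pass keeps it aligned with current_position.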
        while not progress.finished:
            try:
                start_pos, end_pos, data = next(xml_parser.yield_tag(start_pos=current_position))
            except StopIteration:
                break

            progress.update(all_tasks, completed=end_pos)
            current_position = end_pos

            progress.console.print(
                f"{count}: {round(len(data) / MEGABYTE, 1)} MB, "
                f"recovered: {xml_parser.new_start}, exceed max size: {xml_parser.exceed_max}"
            )
            records.add((start_pos, end_pos))
            if resume_file:
                update_recovery_file(records, resume_file)
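            # Recovery strategy: first try to parse the extracted bytes as a
            # single <note>; if that fails, split the chunk into individual
            # notes, and escape the <content> element of any chunk that still
            # refuses to parse.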
            notes = []
            try:
                notes.append(lxml.etree.fromstring(data))
            except lxml.etree.XMLSyntaxError as e:
                progress.console.print(e)
                progress.console.print("potential multiple notes, breaking these up")
                splitted += 1
                for data_chunk in xml_parser.split_multiple_tag_chunk(data):
                    try:
                        data_chunk = lxml.etree.fromstring(data_chunk)
                    except lxml.etree.XMLSyntaxError as e:
                        progress.console.print(e)
                        progress.console.print("invalid xml, attempting to escape content-tag")
                        data_chunk = xml_parser.escape_single_tag(data_chunk, "content")
                        content_escaped += 1
                        data_chunk = lxml.etree.fromstring(data_chunk)
                    notes.append(data_chunk)
            for note in notes:
                save_note_recovery(db, note)
                count += 1

    logger.info(f"Notes with new start generated: {xml_parser.new_start}")
    logger.info(f"Notes that exceeded the maximum size: {xml_parser.exceed_max}")
    logger.info(f"Notes that were found but required splitting: {splitted}")
    logger.info(
        f"Notes found where <content> tag required to be escaped: {content_escaped}"
    )

    ensure_indexes(db)


if __name__ == '__main__':
    cli(sys.argv[1:])
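
The `hugexmlparser.py` module added by this PR is not shown in the rendered
diff. As orientation only, here is a minimal hypothetical sketch of the two
recovery-file helpers, inferred purely from how `cli.py` calls them
(`read_recovery_file` returning a set of `(start, end)` tuples,
`update_recovery_file` persisting them); the PR's real implementation may
differ:

```python
# Hypothetical sketch (not the PR's actual hugexmlparser.py): the CLI treats
# the resume file as a set of (start, end) byte offsets, so a plain-text
# "start,end" line per processed note would satisfy its usage.
import os


def read_recovery_file(path):
    """Return the set of (start, end) byte ranges already processed."""
    records = set()
    if path is None or not os.path.exists(path):
        return records
    with open(path) as fp:
        for line in fp:
            start, end = line.strip().split(",")
            records.add((int(start), int(end)))
    return records


def update_recovery_file(records, path):
    """Rewrite the resume file with one 'start,end' line per record."""
    with open(path, "w") as fp:
        for start, end in sorted(records):
            fp.write(f"{start},{end}\n")
```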