From 38259765fb49c58ab5166c019d337bf0addd23ea Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Fri, 14 May 2021 18:10:28 +0200 Subject: [PATCH 01/10] Unable to find a XML parser allow full recovery of XML so this module should to the trick even though not most efficient way. --- evernote_to_sqlite/hugexmlparser.py | 226 ++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 evernote_to_sqlite/hugexmlparser.py diff --git a/evernote_to_sqlite/hugexmlparser.py b/evernote_to_sqlite/hugexmlparser.py new file mode 100644 index 0000000..bdc7654 --- /dev/null +++ b/evernote_to_sqlite/hugexmlparser.py @@ -0,0 +1,226 @@ +import re +from typing import Union, List, BinaryIO, Annotated, Tuple + + +def read_recovery_file(fn="/tmp/records.pickle") -> set: + try: + with open(fn, "rb") as f: + records = pickle.load(f) + except (FileNotFoundError, EOFError): + records = set() + return records + + +def update_recovery_file(records, fn="/tmp/records.pickle"): + with open(fn, "wb") as f: + pickle.dump(records, f) + + +class HugeXmlParser: + def __init__(self, filename: str, tag: str = "note", max_size_mb: int = 30): + """ + Class for handling big malformed XML files + Args: + filename: Input file + tag: The "root" tag you like to retrieve from the XML + max_size_mb: The maximum size allowed once discovering the tag before carrying on to next + """ + self.exceed_max = 0 + self.new_start = 0 + self.filename = filename + self.tag = tag + self.max_size_mb = max_size_mb + + @staticmethod + def split_and_strip(whole_chunk: Union[str, bytes], tag: str = "note") -> List: + """ + + Args: + whole_chunk: Input str or bytes + tag: The tag to split upon + + Returns: List of chunk based on tag + + """ + if type(whole_chunk) is bytes: + whole_chunk = whole_chunk.decode() + chunks = re.split(fr"", whole_chunk) + chunks = [_ for _ in chunks if _.strip()] + return chunks + + def split_multiple_tag_chunk( + self, whole_chunk: Union[str, bytes], tag: str = "note" + ) -> str: + """ + Split and yield tags from str or bytes + Args: + whole_chunk: Input str or bytes + tag: Tag to split out from whole_chunk + + Returns: yields str + + """ + chunks = self.split_and_strip(whole_chunk, tag) + for chunk in chunks: + yield "".join([f"<{tag}>", chunk, f""]) + + def escape_single_tag(self, whole_chunk, tag="content"): + chunks = self.split_and_strip(whole_chunk, tag) + if len(chunks) == 3: + return "".join( + [ + chunks[0], + f"<{tag}>", + "", + f"", + chunks[2], + ] + ) + + def tag_in_chunk(self, chunk: bytes) -> str: + """ + Checks whether either start or end tag exists in tag used in class constructor. + Args: + chunk: The input bytes + + Returns: + + """ + if f"<{self.tag}>".encode() in chunk: + return "start" + if f"".encode() in chunk: + return "end" + + @staticmethod + def get_chunk_size(tag: str) -> int: + """ + Return the number of bytes required to capture either start or end tag + Args: + tag: Tag to estimate size of + + Returns: Number of bytes + + """ + start, end = [f"<{tag}>", f""] + return max([len(_) for _ in (start, end)]) + + def yield_tag(self, start_pos: int = 0) -> bytes: + """ + Yield chunks of bytes covering the tag within the XML + Args: + start_pos: Instead of starting from beginning start the byte position + + Returns: Bytes including start and end tag + + """ + chunk_size = self.get_chunk_size(self.tag) + index_content = 0 + with open(self.filename, "rb") as f: + pos = start_pos + while True: + pos += 1 + f.seek(pos) + chunk = f.read(chunk_size) + if chunk == b"": + break + if self.tag_in_chunk(chunk): + if self.tag_in_chunk(chunk) == "start": + index_content += 1 + pos = yield from self.yield_content_until_end( + chunk, + chunk_size, + f, + index_content, + pos, + ) + + def get_next_chunk_without_end( + self, f: BinaryIO, pos: int, big_chunk: int = 1_000, margin: int = 10 + ) -> Tuple[int, Union[int, bytes]]: + """ + Returns chunk of bytes that doesn't have a end-tag within the big_chunk size + Args: + f: File-pointer + pos: Byte pointer + big_chunk: Size in bytes to check + margin: The margin at end excluded to avoid miss-match on end-tag + + Returns: + + """ + f.seek(pos) + read_chunk_excluding_margin = f.read(big_chunk)[:-margin] + if not self.tag_in_chunk(read_chunk_excluding_margin): + pos += big_chunk - margin + return pos, read_chunk_excluding_margin + else: + return pos, None + + def yield_content_until_end( + self, chunk: bytes, chunk_size: int, f: BinaryIO, index_content: int, pos: int + ) -> List[Annotated[int, "Start byte"], Annotated[int, "End byte"], bytes]: + """ + Yields bytes until end tag reached + Args: + chunk: Current chunk + chunk_size: Size in bytes to iterate + f: Input file-pointer + index_content: Current index number of content recovered + pos: Current byte position + + Returns: List [start position, end position, bytes] + + """ + result = b"" + abort = False + current_pos = 0 + last_megabyte_progress = 0 + start_pos = pos + while self.tag_in_chunk(chunk) != "end": + pos += 1 + # break if no data left + if f.read(1) == "": + break + # get next big chunks without end-tag + while True: + pos, big_chunk_with_no_end = self.get_next_chunk_without_end(f, pos) + # print(f"pos: {pos}, big_chunk_with_no_end: {big_chunk_with_no_end}") + if big_chunk_with_no_end: + result += big_chunk_with_no_end + current_pos += len(big_chunk_with_no_end) + new_megabyte_progress = int(round(current_pos, -6) / 1_000_000) + if new_megabyte_progress is not last_megabyte_progress: + print( + f"processing current content {index_content}: {new_megabyte_progress} MB" + ) + last_megabyte_progress = new_megabyte_progress + else: + break + if last_megabyte_progress >= self.max_size_mb: + print(f"Exceeding max size of {self.max_size_mb}, breaking") + self.exceed_max += 1 + abort = True + break + # carry on byte per byte now when end tag discovered + f.seek(pos) + result += f.read(1) + chunk = f.read(chunk_size) + if ( + self.tag_in_chunk(chunk) == "start" + and f"" not in result.decode() + ): + print("Found new start, ending previous") + end_tag = f"".encode() + yield start_pos, pos + len(end_tag), result + end_tag + pos -= 1 + abort = True + self.new_start += 1 + break + if abort: + break + if not abort: + # Return start, end and chunk + yield start_pos + 1, pos + len(chunk) + 1, result + chunk + return pos From 23d04ed2132fae6478a4fec0670b5c174da2736b Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Fri, 14 May 2021 18:10:59 +0200 Subject: [PATCH 02/10] Integrating hugexml parser and techniques allowing recovering large Enex files. --- evernote_to_sqlite/cli.py | 109 +++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 2 deletions(-) diff --git a/evernote_to_sqlite/cli.py b/evernote_to_sqlite/cli.py index 5c720f3..6bd712e 100644 --- a/evernote_to_sqlite/cli.py +++ b/evernote_to_sqlite/cli.py @@ -2,12 +2,27 @@ import click import os from .utils import find_all_tags, save_note, ensure_indexes +import time +import datetime as dt +from .hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file +import logging +from rich.logging import RichHandler +from rich.progress import Progress +import lxml + +FORMAT = "%(message)s" +logging.basicConfig( + level="NOTSET", format=FORMAT, datefmt="[%X]", handlers=[RichHandler()] +) +logger = logging.getLogger(__name__) + +MEGABYTE = 1_000_000 @click.group() @click.version_option() def cli(): - "Tools for converting Evernote content to SQLite" + """Tools for converting Evernote content to SQLite""" @cli.command() @@ -22,7 +37,7 @@ def cli(): required=True, ) def enex(db_path, enex_file): - "Convert Evernote .enex exports to SQLite" + """Convert Evernote .enex exports to SQLite""" file_length = os.path.getsize(enex_file) fp = open(enex_file, "r", encoding="utf-8") db = sqlite_utils.Database(db_path) @@ -31,3 +46,93 @@ def enex(db_path, enex_file): save_note(db, note) fp.close() ensure_indexes(db) + + +@cli.command() +@click.argument( + "db_path", + type=click.Path(file_okay=True, dir_okay=False, allow_dash=False), + required=True, +) +@click.argument( + "enex_file", + type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False), + required=True, +) +@click.option( + "--max_note_size", + type=click.INT, + required=False, + default=30, + help="This maximum size on MB attempting to discover end-tag of recognised note before skipping to next.", +) +@click.option( + "--resume_file", + type=click.Path(), + required=False, + help="Allows resume where conversion was aborted/failed. File will be created if it does not exist and will register start, end byte in Enex file.", +) +def recover_enex(db_path, enex_file, max_note_size=30, resume_file=None): + """Use recover techniques allowing malformed Evernote exports to be transformed to SQLite and specficially useful for very large Enex file. Be warned that this takes a very long time for larges Enex files.""" + + with Progress() as progress: + task1 = progress.add_task("[red]Downloading...", total=1000) + task2 = progress.add_task("[green]Processing...", total=1000) + task3 = progress.add_task("[cyan]Cooking...", total=1000) + + while not progress.finished: + progress.update(task1, advance=0.5) + progress.update(task2, advance=0.3) + progress.update(task3, advance=0.9) + progress.console.print(f"Working on job #{dt.datetime.now().isoformat()}") + time.sleep(0.01) + + file_length = os.path.getsize(enex_file) + db = sqlite_utils.Database(db_path) + fp = open(enex_file, "r", encoding="utf-8") + + records = read_recovery_file() + last_start = sorted(records)[-1][0] if records else 0 + count = len(records) - 1 + splitted = 0 + content_escaped = 0 + + xml_parser = HugeXmlParser(enex_file) + + for start_pos, end_pos, data in xml_parser.yield_tag(start_pos=last_start): + logger.info( + f"{count}: {round(len(data) / MEGABYTE, 1)} MB," + f"recovered: {xml_parser.new_start}, exceed max size: {xml_parser.exceed_max}" + ) + records.add((start_pos, end_pos)) + update_recovery_file(records) + notes = [] + try: + notes.append(etree.fromstring(data)) + except lxml.etree.XMLSyntaxError as e: + logger.error(e) + logger.warning("potential multiple notes breaking these up") + splitted += 1 + for data_chunk in xml_parser.split_multiple_tag_chunk(data): + try: + data_chunk = etree.fromstring(data_chunk) + except lxml.etree.XMLSyntaxError as e: + logger.debug(e) + logger.warning("invalid xml, attempt to escaping content-tag") + data_chunk = xml_parser.escape_single_tag(data_chunk, "content") + content_escaped += 1 + data_chunk = lxml.etree.fromstring(data_chunk) + notes.append(data_chunk) + for note in notes: + save_note(db, note) + print(f"saved {count}") + count += 1 + logger.info(f"Notes with new start generated: {xml_parser.new_start}") + logger.info(f"Notes that exceeded the maximum size: {xml_parser.exceed_max}") + logger.info(f"Notes that were found but required splitting: {splitted}") + logger.info( + f"Notes found where tag required to be escaped: {content_escaped}" + ) + + fp.close() + ensure_indexes(db) From 317c01d23025ff7f005af72e2c32e0e6577c0597 Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Fri, 14 May 2021 18:11:13 +0200 Subject: [PATCH 03/10] Adding dependencies to requirements.txt --- requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..19a7e59 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +click~=8.0.0 +setuptools~=56.2.0 +rich~=10.2.0 +lxml~=4.6.3 \ No newline at end of file From d04a3b7bc08c8e5e2317e0ab2139f0c84d1f3483 Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Sat, 15 May 2021 08:40:39 +0200 Subject: [PATCH 04/10] Adding support for progressbar while parsing large individual notes --- evernote_to_sqlite/hugexmlparser.py | 33 +++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/evernote_to_sqlite/hugexmlparser.py b/evernote_to_sqlite/hugexmlparser.py index bdc7654..d72f96e 100644 --- a/evernote_to_sqlite/hugexmlparser.py +++ b/evernote_to_sqlite/hugexmlparser.py @@ -1,12 +1,14 @@ import re -from typing import Union, List, BinaryIO, Annotated, Tuple +from typing import Union, List, BinaryIO, Tuple +from typing_extensions import Annotated +import pickle def read_recovery_file(fn="/tmp/records.pickle") -> set: try: with open(fn, "rb") as f: records = pickle.load(f) - except (FileNotFoundError, EOFError): + except (FileNotFoundError, EOFError, TypeError): records = set() return records @@ -17,7 +19,7 @@ def update_recovery_file(records, fn="/tmp/records.pickle"): class HugeXmlParser: - def __init__(self, filename: str, tag: str = "note", max_size_mb: int = 30): + def __init__(self, filename: str, tag: str = "note", max_size_mb: int = 30, progress_bar=None): """ Class for handling big malformed XML files Args: @@ -30,6 +32,15 @@ def __init__(self, filename: str, tag: str = "note", max_size_mb: int = 30): self.filename = filename self.tag = tag self.max_size_mb = max_size_mb + self.progess_bar = progress_bar + if self.progess_bar: + self.note_progress = self.progess_bar.add_task("[red]Parsing note...", total=self.max_size_mb) + + def print(self, text): + if self.progess_bar: + self.progess_bar.console.print(text) + else: + print(text) @staticmethod def split_and_strip(whole_chunk: Union[str, bytes], tag: str = "note") -> List: @@ -158,9 +169,17 @@ def get_next_chunk_without_end( else: return pos, None + def update_progress_bar(self, current_megabyte): + if not self.progess_bar: + return + if current_megabyte > 0: + self.progess_bar.update(self.note_progress, completed=current_megabyte) + else: + self.progess_bar.reset(self.note_progress) + def yield_content_until_end( self, chunk: bytes, chunk_size: int, f: BinaryIO, index_content: int, pos: int - ) -> List[Annotated[int, "Start byte"], Annotated[int, "End byte"], bytes]: + ) -> Tuple[Annotated[int, "Start byte"], Annotated[int, "End byte"], bytes]: """ Yields bytes until end tag reached Args: @@ -186,20 +205,20 @@ def yield_content_until_end( # get next big chunks without end-tag while True: pos, big_chunk_with_no_end = self.get_next_chunk_without_end(f, pos) - # print(f"pos: {pos}, big_chunk_with_no_end: {big_chunk_with_no_end}") if big_chunk_with_no_end: result += big_chunk_with_no_end current_pos += len(big_chunk_with_no_end) new_megabyte_progress = int(round(current_pos, -6) / 1_000_000) if new_megabyte_progress is not last_megabyte_progress: - print( + self.print( f"processing current content {index_content}: {new_megabyte_progress} MB" ) + self.update_progress_bar(new_megabyte_progress) last_megabyte_progress = new_megabyte_progress else: break if last_megabyte_progress >= self.max_size_mb: - print(f"Exceeding max size of {self.max_size_mb}, breaking") + self.print(f"Exceeding max size of {self.max_size_mb}, breaking") self.exceed_max += 1 abort = True break From 7883f3dc436935e5b60e94e08406b94b712c8c3d Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Sat, 15 May 2021 08:42:02 +0200 Subject: [PATCH 05/10] Changes made to recover_enex to support parsing of large ENEX files --- evernote_to_sqlite/cli.py | 132 ++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 54 deletions(-) diff --git a/evernote_to_sqlite/cli.py b/evernote_to_sqlite/cli.py index 6bd712e..2f03dfd 100644 --- a/evernote_to_sqlite/cli.py +++ b/evernote_to_sqlite/cli.py @@ -1,14 +1,19 @@ import sqlite_utils import click import os -from .utils import find_all_tags, save_note, ensure_indexes -import time -import datetime as dt -from .hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file import logging from rich.logging import RichHandler from rich.progress import Progress import lxml +from lxml import etree +import sys +try: + from .utils import find_all_tags, save_note, save_note_recovery, ensure_indexes, human_size + from .hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file +except ModuleNotFoundError: + # workaround for PyCharm + from utils import find_all_tags, save_note, save_note_recovery, ensure_indexes, human_size + from hugexmlparser import HugeXmlParser, read_recovery_file, update_recovery_file FORMAT = "%(message)s" logging.basicConfig( @@ -25,6 +30,7 @@ def cli(): """Tools for converting Evernote content to SQLite""" +# noinspection SpellCheckingInspection @cli.command() @click.argument( "db_path", @@ -70,69 +76,87 @@ def enex(db_path, enex_file): "--resume_file", type=click.Path(), required=False, - help="Allows resume where conversion was aborted/failed. File will be created if it does not exist and will register start, end byte in Enex file.", + help="Allows resume where conversion was aborted/failed." + "File will be created if it does not exist and will register start, end byte in Enex file.", ) def recover_enex(db_path, enex_file, max_note_size=30, resume_file=None): - """Use recover techniques allowing malformed Evernote exports to be transformed to SQLite and specficially useful for very large Enex file. Be warned that this takes a very long time for larges Enex files.""" + """Use recover techniques allowing malformed Evernote exports to be transformed to SQLite + and specifically useful for very large Enex file. Be warned that this takes + a very long time for larges Enex files.""" - with Progress() as progress: - task1 = progress.add_task("[red]Downloading...", total=1000) - task2 = progress.add_task("[green]Processing...", total=1000) - task3 = progress.add_task("[cyan]Cooking...", total=1000) - - while not progress.finished: - progress.update(task1, advance=0.5) - progress.update(task2, advance=0.3) - progress.update(task3, advance=0.9) - progress.console.print(f"Working on job #{dt.datetime.now().isoformat()}") - time.sleep(0.01) + # with Progress() as progress: + # task1 = progress.add_task("[red]Downloading...", total=1000) + # task2 = progress.add_task("[green]Processing...", total=1000) + # task3 = progress.add_task("[cyan]Cooking...", total=1000) + # + # while not progress.finished: + # progress.update(task1, advance=0.5) + # progress.update(task2, advance=0.3) + # progress.update(task3, advance=0.9) + # progress.console.print(f"Working on job #{dt.datetime.now().isoformat()}") + # time.sleep(0.01) file_length = os.path.getsize(enex_file) db = sqlite_utils.Database(db_path) fp = open(enex_file, "r", encoding="utf-8") - records = read_recovery_file() - last_start = sorted(records)[-1][0] if records else 0 + records = read_recovery_file(resume_file) + current_position = sorted(records)[-1][0] if records else 0 count = len(records) - 1 splitted = 0 content_escaped = 0 - xml_parser = HugeXmlParser(enex_file) + with Progress() as progress: + all_tasks = progress.add_task(f"[red]Processing Evernote export file {human_size(file_length)}...", total=file_length) + xml_parser = HugeXmlParser(enex_file, max_size_mb=max_note_size, progress_bar=progress) + + while not progress.finished: + try: + start_pos, end_pos, data = next(xml_parser.yield_tag(start_pos=current_position)) + except StopIteration: + break + + progress.update(all_tasks, completed=end_pos) + current_position = end_pos - for start_pos, end_pos, data in xml_parser.yield_tag(start_pos=last_start): + progress.console.print( + f"{count}: {round(len(data) / MEGABYTE, 1)} MB," + f"recovered: {xml_parser.new_start}, exceed max size: {xml_parser.exceed_max}" + ) + records.add((start_pos, end_pos)) + if resume_file: + update_recovery_file(records, resume_file) + notes = [] + try: + notes.append(lxml.etree.fromstring(data)) + except lxml.etree.XMLSyntaxError as e: + progress.console.print(e) + progress.console.print("potential multiple notes breaking these up") + splitted += 1 + for data_chunk in xml_parser.split_multiple_tag_chunk(data): + try: + data_chunk = lxml.etree.fromstring(data_chunk) + except lxml.etree.XMLSyntaxError as e: + progress.console.print(e) + progress.console.print("invalid xml, attempt to escaping content-tag") + data_chunk = xml_parser.escape_single_tag(data_chunk, "content") + content_escaped += 1 + data_chunk = lxml.etree.fromstring(data_chunk) + notes.append(data_chunk) + for note in notes: + save_note_recovery(db, note) + count += 1 + + logger.info(f"Notes with new start generated: {xml_parser.new_start}") + logger.info(f"Notes that exceeded the maximum size: {xml_parser.exceed_max}") + logger.info(f"Notes that were found but required splitting: {splitted}") logger.info( - f"{count}: {round(len(data) / MEGABYTE, 1)} MB," - f"recovered: {xml_parser.new_start}, exceed max size: {xml_parser.exceed_max}" + f"Notes found where tag required to be escaped: {content_escaped}" ) - records.add((start_pos, end_pos)) - update_recovery_file(records) - notes = [] - try: - notes.append(etree.fromstring(data)) - except lxml.etree.XMLSyntaxError as e: - logger.error(e) - logger.warning("potential multiple notes breaking these up") - splitted += 1 - for data_chunk in xml_parser.split_multiple_tag_chunk(data): - try: - data_chunk = etree.fromstring(data_chunk) - except lxml.etree.XMLSyntaxError as e: - logger.debug(e) - logger.warning("invalid xml, attempt to escaping content-tag") - data_chunk = xml_parser.escape_single_tag(data_chunk, "content") - content_escaped += 1 - data_chunk = lxml.etree.fromstring(data_chunk) - notes.append(data_chunk) - for note in notes: - save_note(db, note) - print(f"saved {count}") - count += 1 - logger.info(f"Notes with new start generated: {xml_parser.new_start}") - logger.info(f"Notes that exceeded the maximum size: {xml_parser.exceed_max}") - logger.info(f"Notes that were found but required splitting: {splitted}") - logger.info( - f"Notes found where tag required to be escaped: {content_escaped}" - ) - fp.close() - ensure_indexes(db) + fp.close() + ensure_indexes(db) + + +if __name__ == '__main__': + cli(sys.argv[1:]) From bb65c5ec1a4bffd7275be39d00043bee89220448 Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Sat, 15 May 2021 08:42:47 +0200 Subject: [PATCH 06/10] Adding function to allow resuming of an already started recovery process --- evernote_to_sqlite/utils.py | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/evernote_to_sqlite/utils.py b/evernote_to_sqlite/utils.py index 398cba7..7b6d96f 100644 --- a/evernote_to_sqlite/utils.py +++ b/evernote_to_sqlite/utils.py @@ -113,3 +113,49 @@ def resolve_entities(s): return _entities_re.sub( lambda m: html.entities.entitydefs.get(m.group(1), m.group(1)), s ) + + +def save_note_recovery(db, note): + title = note.find("title").text + created = note.find("created").text + if note.find("updated") is not None: + updated = note.find("updated").text + else: + updated = created + content = note.find("content").text + row = { + "title": title, + "content": content, + "created": convert_datetime(created), + "updated": convert_datetime(updated), + } + attributes = note.find("note-attributes") + if attributes is not None: + row.update({attribute.tag: attribute.text for attribute in attributes}) + # If any of those attributes end in -date, e.g. 'subject-date', convert them + for key in row: + if key.endswith("-date"): + row[key] = convert_datetime(row[key]) + note_id = db["notes"].insert(row, hash_id="id", replace=True, alter=True).last_pk + # Now do the resources + for resource in note.findall("resource"): + resource_id = save_resource(db, resource) + db["note_resources"].insert( + { + "note_id": note_id, + "resource_id": resource_id, + }, + pk=("note_id", "resource_id"), + foreign_keys=("note_id", "resource_id"), + replace=True, + ) + + +def human_size(bytes, units=[" bytes", "KB", "MB", "GB", "TB", "PB", "EB"]): + return ( + str(bytes) + units[0] + if bytes < 1024 + else human_size(bytes >> 10, units[1:]) + if units[1:] + else f"{bytes>>10}ZB" + ) From 27610cc999c293ab87284dff098c3ad560c5e94e Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Sat, 15 May 2021 08:43:27 +0200 Subject: [PATCH 07/10] Adding help to describing the recover enex command --- README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/README.md b/README.md index 08ee34d..fcdeea4 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,56 @@ You can convert that file to SQLite like so: This will display a progress bar and create a SQLite database file called `evernote.db`. +In situations where the ENEX file being malformed +or size of notes grown bigger than the optimised XML parser +you have an option to run in recovery mode that will use methods +that will allow the process to carry on through all notes. + + $ evernote-to-sqlite recover-enex evernote.db MyNotes.enex + +If you have very large file you can also supply a resume-file that allows +the process to process where it left of in such case of interruption. + +```shell script +$ evernote-to-sqlite recover-enex --help +Usage: evernote-to-sqlite recover-enex [OPTIONS] DB_PATH ENEX_FILE + + Use recover techniques allowing malformed Evernote exports to be transformed + to SQLite and specifically useful for very large Enex file. Be warned that + this takes a very long time for larges Enex files. + +Options: + --max_note_size INTEGER This maximum size on MB attempting to discover end- + tag of recognised note before skipping to next. + --resume_file PATH Allows resume where conversion was + aborted/failed.File will be created if it does not + exist and will register start, end byte in Enex + file. + --help Show this message and exit. + +$ evernote-to-sqlite recover-enex evernote.db MyNotes.enex --max_note_size 30 --resume_file my_resume_file + +... + +5763: 0.3 MB,recovered: 0, exceed max size: 16 +processing current content 1: 1 MB +processing current content 1: 2 MB +processing current content 1: 3 MB +5764: 3.2 MB,recovered: 0, exceed max size: 16 +5765: 0.0 MB,recovered: 0, exceed max size: 16 +processing current content 1: 1 MB +processing current content 1: 2 MB +processing current content 1: 3 MB +5766: 3.4 MB,recovered: 0, exceed max size: 16 +[07:22:40] INFO Notes with new start generated: 0 cli.py:150 +[07:22:41] INFO Notes that exceeded the maximum size: 16 cli.py:151 + INFO Notes that were found but required splitting: 51 cli.py:152 + INFO Notes found where tag required to be escaped: 7 cli.py:154 +Processing Evernote export file 5GB... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸ 100% 0:00:01 +Parsing note... ━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10% 0:00:00 +``` + + ### Limitations Unfortunately the ENEX export format does not include a unique identifier for each note. This means you cannot use this tool to re-import notes after they have been updated - you should consider this tool to be a one-time transformation of an ENEX file into an equivalent SQLite database. From a47a5a868aa7d00146fc1ac910f513245cf4baf4 Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Sat, 15 May 2021 09:11:56 +0200 Subject: [PATCH 08/10] Adding tests for recover-enex --- tests/example-note_broken.enex | 177 +++++++++++++++++++++++++++ tests/test_evernote_to_sqlite.py | 197 +++++++++++++++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 tests/example-note_broken.enex diff --git a/tests/example-note_broken.enex b/tests/example-note_broken.enex new file mode 100644 index 0000000..583e92a --- /dev/null +++ b/tests/example-note_broken.enex @@ -0,0 +1,177 @@ + + + +Example note with images
This note includes two images. š.

The Python logo



The Evernote logo



This image contains text:

PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8v +d3d3LnczLm9yZy8xOTk5L3hsaW5rIgphcmlhLWxhYmVsPSJQeXRob24iIHJvbGU9ImltZyIKdmlld0Jv +eD0iMCAwIDUxMiA1MTIiPjxyZWN0CndpZHRoPSI1MTIiIGhlaWdodD0iNTEyIgpyeD0iMTUlIgpmaWxs +PSIjZmZmIi8+PGcgZmlsbD0iIzVhOWZkNCI+PHBhdGggaWQ9InAiIGQ9Ik0yNTQgNjRjLTE2IDAtMzEg +MS00NCA0LTM5IDctNDYgMjEtNDYgNDd2MzVoOTJ2MTJIMTMwYy0yNyAwLTUwIDE2LTU4IDQ2LTggMzUt +OCA1NyAwIDkzIDcgMjggMjMgNDcgNDkgNDdoMzJ2LTQyYzAtMzAgMjYtNTcgNTctNTdoOTFjMjYgMCA0 +Ni0yMSA0Ni00NnYtODhjMC0yNC0yMS00My00Ni00Ny0xNS0zLTMyLTQtNDctNHptLTUwIDI4YzEwIDAg +MTcgOCAxNyAxOCAwIDktNyAxNy0xNyAxNy05IDAtMTctOC0xNy0xNyAwLTEwIDgtMTggMTctMTh6Ii8+ +PC9nPjx1c2UgeGxpbms6aHJlZj0iI3AiIGZpbGw9IiNmZmQ0M2IiIHRyYW5zZm9ybT0icm90YXRlKDE4 +MCwyNTYsMjU1KSIvPjwvc3ZnPg==image/svg+xml00019700101T000000ZPHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciCmFyaWEtbGFiZWw9IkV2ZXJub3Rl +IiByb2xlPSJpbWciCnZpZXdCb3g9IjAgMCA1MTIgNTEyIj48cmVjdAp3aWR0aD0iNTEyIiBoZWlnaHQ9 +IjUxMiIKcng9IjE1JSIKZmlsbD0iIzQ2Yzg1MCIvPjxwYXRoIGQ9Im0xMjEgMTQzaDM1YzMgMCA0LTEg +NC00bC0xLTM4YzAtMTAgNi0xOSA2LTE5aC0xbC02OCA2N3YxczEwLTcgMjUtN3ptMjcxLTZjLTMtMTUt +MTItMjMtMjAtMjUtMzItOC02NS0xMi05OC0xMS0yLTE5LTE4LTI5LTU0LTI5LTMxLTEtNDkgNi00OSAy +OXYzOWMwIDgtNSAxMy0xNCAxM2gtMzRjLTcgMC0xMyAyLTE4IDQtNCAyLTE0IDctMTQgMzAtMSAxOSAx +MyA5NSAyMyAxMTUgMyA5IDYgMTIgMTQgMTUgMTYgOCA1NCAxNSA3MyAxOCAxNyAyIDI4IDYgMzYtOCAy +LTQgMTAtMzAgOS01MiAwLTEgMi0yIDIgMCAwIDctMiAzNiAxOSA0M2w0NSA5YzE2IDEgMjggNyAyOCA0 +OSAwIDI1LTYgMjgtMzQgMjgtMjIgMC0zMCAxLTMwLTE3IDAtMTQgMTQtMTMgMjUtMTMgNCAwIDEtMyAx +LTEyczUtMTQgMC0xNGMtMzYtMS01OCAwLTU4IDQ1IDAgNDIgMTYgNDkgNjggNDkgNDAgMCA1NS0xIDcx +LTUyIDI1LTc4IDE4LTIwNSA5LTI1M3ptLTQ2IDExNWMtNS02LTMxLTgtNDAtNCAyLTEwIDYtMjIgMjIt +MjIgMTUgMCAxOCAxNiAxOCAyNnoiIGZpbGw9IiM0YjRiNGIiLz48L3N2Zz4=image/svg+xml00019700101T000000ZiVBORw0KGgoAAAANSUhEUgAAAp4AAACACAIAAAAtV/4GAAAAAXNSR0IArs4c6QAAAGxlWElmTU0AKgAA +AAgABAEaAAUAAAABAAAAPgEbAAUAAAABAAAARgEoAAMAAAABAAIAAIdpAAQAAAABAAAATgAAAAAAAACQ +AAAAAQAAAJAAAAABAAKgAgAEAAAAAQAAAp6gAwAEAAAAAQAAAIAAAAAAcQXnGQAAAAlwSFlzAAAWJQAA +FiUBSVIk8AAAAgppVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRv +YmU6bnM6bWV0YS8iIHg6eG1wdGs9IlhNUCBDb3JlIDUuNC4wIj4KICAgPHJkZjpSREYgeG1sbnM6cmRm +PSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4KICAgICAgPHJkZjpE +ZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgICAgICAgICAgeG1sbnM6ZXhpZj0iaHR0cDovL25zLmFk +b2JlLmNvbS9leGlmLzEuMC8iCiAgICAgICAgICAgIHhtbG5zOnRpZmY9Imh0dHA6Ly9ucy5hZG9iZS5j +b20vdGlmZi8xLjAvIj4KICAgICAgICAgPGV4aWY6UGl4ZWxYRGltZW5zaW9uPjY3MDwvZXhpZjpQaXhl +bFhEaW1lbnNpb24+CiAgICAgICAgIDxleGlmOlBpeGVsWURpbWVuc2lvbj4xMjg8L2V4aWY6UGl4ZWxZ +RGltZW5zaW9uPgogICAgICAgICA8dGlmZjpSZXNvbHV0aW9uVW5pdD4yPC90aWZmOlJlc29sdXRpb25V +bml0PgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KEW+9 +lAAAIEJJREFUeAHtnWW0JDW3hi8wuLsP7q6Du7s7DO4M7u7u7izc7eKw8MHdfXB3d+7Dl+9u9kpV96nu +ktNdvOfHTCoVfSrJTnZ20oP17dv3f/QnAiIgAiIgAiJQFwKD16UiqocIiIAIiIAIiMDfBCTa1Q5EQARE +QAREoFYEJNpr9TlVGREQAREQARGQaFcbEAEREAEREIFaEZBor9XnVGVEQAREQAREQKJdbUAEREAEREAE +akVAor1Wn1OVEQEREAEREAGJdrUBERABERABEagVAYn2Wn1OVUYEREAEREAEJNrVBkRABERABESgVgQk +2mv1OVUZERABERABEZBoVxsQAREQAREQgVoRkGiv1edUZURABERABERAol1tQAREQAREQARqRUCivVaf +U5URAREQAREQAYl2tQEREAEREAERqBUBifZafU5VRgREQAREQAQk2tUGREAEREAERKBWBCTaa/U5VRkR +EAEREAERkGhXGxABERABERCBWhGQaK/V51RlREAEREAERECiXW1ABERABERABGpFQKK9Vp9TlREBERAB +ERABiXa1AREQAREQARGoFQGJ9lp9TlVGBERABERABCTa1QZEQAREQAREoFYEJNpr9TlVGREQAREQARGQ +aFcbEAEREAEREIFaEZBor9XnVGVEQAREQAREQKJdbUAEREAEREAEakVAor1Wn1OVEQEREAEREAGJdrUB +ERABERABEagVAYn2Wn1OVUYEREAEREAEJNrVBkRABERABESgVgQk2mv1OVUZERABERABEZBoVxsQAREQ +AREQgVoRkGiv1edUZURABERABERAol1tQAREQAREQARqRUCivVafU5URAREQAREQAYl2tQEREAEREAER +qBUBifZafU5VRgREQAREQAQk2tUGREAEREAERKBWBCTaa/U5VRkREAEREAER6DPccMOJggiIgAiIgAiI +QOEEDjv8UEtzrz33NnfZDq3ayyas9EVABERABESgUgIS7ZXiVmYiIAIiIAIiUDYBifayCSt9ERABERAB +EaiUgER7pbiVmQiIgAiIgAiUTUCivWzCSl8EREAEREAEKiXQ58cff6w0Q2UmAiIgAiIgAv8+AlVKW63a +/33tSzUWAREQARGoNQGJ9lp/XlVOBERABETg30egT2dWefnll19xxRUp26+//rrzzjv/9NNP+cu52267 +jTvuuKTz1FNPXXTRRfkTrCyFGWeccaONNiK7P//88/DDD//8888ry1oZiYAI1IzALrvsMvXUU1Opd999 +96CDDur22g055JB9+vTJKSOGGmooOCBuup2Glb940b7hhhvONttslkF2x++//77jjjuG8JNOOul4440X +3GOMMcZ7772XPZ1GIeeYY47hhx+et2OPPXZ3ifZJJplkqqmmCvVCzN99992N6ij/9ghMN910W265Zca4 +e++997fffpsxsIKJQKcRQK6H0XXMMcfstLJlLM9ggw22zjrrLLHEEkgH3MT666+/kO6PPfbYOeec8+WX +X2ZJZ9ppp11jjTVmmGGGYYYZxifCjOeSSy558sknfSKLL774Cius4H2Cm0wRT2+++eYLL7xAxGSA3vIp +XrTPN998JpV7q1bKVwRaIjD99NNPNtlkGaNwN7NEe0ZW2YOtuuqqyy67bAg/YMCA77//Pnvc3g3JtHv3 +3XcPZTj//PMfeOCBKsvTvdzapoR8OeaYY0YZZRSfArKZjrnQQgshgE466aS77rrLv43crPH23XdflkmR +f0iEqc/BBx/88ccfb7zxxhZg0UUXbTREMHqEYB988MFxxx338ssvW6xedBQv2nuxMspaBESgSwkwbo4z +zjih8EE72i0VGXXUUa3k1a9qupdbe98X+X3yyScPO+ywjaKjnEf7+9VXX0XLbguPXD/33HNHGmkk80l1 +RFOH1DCR5/jjj3/sscfeeOONZ5xxRvSq+sfiRftrr702xBBDJGsy1lhjBaUHr7744gvU71GYP/74I/Ip +9hGdycQTT0yahaj3iy1b89Qo8Ndffz344H/bPOJoHlhv2yDw4YcffvLJJxaRhkpztUf/Cs+ff/7ZXskh +AiJQJYH999/fy/V33nnn0UcfZVSca665UK0HEcO/LMpXWWUVjJOSZTvllFO8XP/uu++QWcwDUK2zMYcO +ZsIJJ0zG8j5ECQZPSDpmACOOOKKJNoKht6dITz/9tI9Svbt40X7UUUelVuPyyy83oLvuuivqjtRg5Xnu +scce5SVeasrY/bGxVGoW//LEUaJ6PerQQw993XXXBSZ0+GDD+C9HpOqLQK8TQCmC/LZiPPjgg4cddlh4 +vP766+edd17sYMIjip/+/fuzP2KBg4Mw2FqZ53PPPeflwu23386raaaZZq+99mrym6hXXHHFtddea4mw +6Np8880x/TYBv88++7BRYgF6xfH3QlB/IiACIiACItDhBNZbbz0r4WeffWZyPXgOHDjwpptusgBLLrmk +uc2BDDY3Nnderps/m+Xrr7/+FltsYT7NHegG0MB702z0Ctj3NY9V9tviV+0llRhThbXXXpvTa0ym0Oe/ +/vrrbGm89dZbTbLr27fv5JNPngyA9oZ1cNI/6cO2zZprrolVBepZtC5sGWA/RfQnnnjitttuYz2XjFKI +D7O/+eefn9yTqaE4+uabb5L+SR/mnkwkITbaaKOxDMUuiYjvv//+Lbfc8uqrrybDF+uDQSkqMibI7ERS +HVRY7H6Fr0afbJ4X2h1sYYJpG58b4G+88cazzz5L1/3tt9+ax63BW/YCOc2BbpBzImziMnCg/cP49ppr +rmligou183LLLcfO6+ijjz7yyCPTVjHqQdOI4S66wUbcQB1GQAyMb775Zpo0n4zlDumQCG1m0KBB9957 +L+QLB0sFMVG2ZMNmWXhEpfnDDz/Yq+CgeE0u82I9169fP9IkHdrMRx99xACNXvT555+P0kk+ttrNQY25 +lqUz0UQTmRvLrKQSmOECjBYmp6NYbqEwtIGlllqKgYKDZKB75ZVXLrvssiyWjHxBzkMxQjLYUnG2Dmlv +KMBoezmrmRp95plnNn+apbnNgW6YXhAeadtoyxk97C3dyg4F0OA5SGyvko5WzxiTNfb2GNuHpGadddY7 +7rgjmWxlPinCo7K8s2fEkXTfl/hmnAdDeJx11lk33HBDo3T23HNP3+ssGB87i36bU3zs1kTylXGTdGaZ +ZZbNNtuMFty8cViOrToY01Onk6SDaWiPh9/oZiimoh0joDH8Ie/hxvFNznqVtCfCpGSbbbYhO19r+hjl +YeBDT8XWdRMV9wILLMC5W48d5kgaRh+Ge0yRm8/nfKbd6F5mmWX4NL761AICbAHy4fhkRx55ZDQzW331 +1VdbbTWmnlF9iQVwXiGwUTBGsUJg+pF9CyQiMz/gWzqhzaDD9JpPe5vTQbHnnHPO1EQYIpP+TGpTPz0T +R8aHBRdc0EehvdHU6b/MCNl2TVr2WOA2ujkCxqBZOsHByMBf5MkkiRNZkWfbj0VxCwWgpVE2b1kS0NEO +GYJS20yISNtg75WhxlcEwYlIY8mLnPOrWB+mbTdFpWwWPSjP7TE4OPbG/I+WHB4XWWQRrznfdNNNLTzT +l19++cUeC3Gw5sSSLiRF8+td0d4FCnkaipfr9g3o0uhM2jtDb4k0ctAIGF+iETYKHDXr6G1vPdLlTjjh +hEiuR4VhIyoc8Y/88z/SsRkRIrkeJRumR5FneNxpp52I3gg7ZcY4NtxllBq9qz1HGGEEDu1su+22japP +7VjEzzPPPFE18UnKdR8G9SBnclI3/1i7WEjWXlwPZY/egRJl++239z4d4qadcwQ5kuu+bDPNNNOll17q +t1f92+7t5r4Wedwnnniil+uWFHo+VhH8az7ewfz74osvbjQAMjKjYaUx+yj53czvLRGkMqav9ugdaKrs +MdLaeiU5rcKCFeXw9uNlW4X3WOYuWLUzrFAN5t2YuPM5mRZNOeWUVjE6Z6NDDo8//rifl00xxRS0OYvY +xIFkYr5vAVDCP/LII+ipGCJRAzCalH3EBc25b6DIwibi0MoZHJgomlKIgRu15IsvvohuDW6UnO5R6ski +VtUGGSUwqkhWWrRy1KQTTDABa0R7GxWbR2Zpiy22mPljiM7MGrUYEemiQeARnd2ye+65p34nyw888EA/ +ElH9t99+mzbP90KLA4RG46wR43PTSlGKoLeHOa2UFWSIBbdNNtnkmWeeIUELHzmCtpNEiE7W7KSgbrV5 +Bud6mVcltc1RItkf2VBD/WvhWeVYu0Wpm9xB8JpVi8VZI6ZE9oi9NB0HpRTEmKmExkYAzKq33nprCxYc +bXdzBhYato3j7HnZ4TfUKnyCKKMsmwJRlCaPhXAL6cOfLokbfRjcGCUYJK0uVHCDDTY4++yzo8IQwHdz +Ggz9lHbF5/Pm5TRmmhzHzKLobT/SQixuk60Zf56FXmNRcFgDw91IavjwrbrZ0bAofgA3zyodXSDawYGi +g9WM7TEjA7gfMfRbZC1amtRuf95553mUV111Vcalql8XRiaUIUEaNwKGzWOffoFu5NkOO+xgCVKe7DYd +ZkFKl2OlxS61pRMc7EWRIEgj//yPTB1s+cjmOqrOyByBuRGDBf0/NS/U+ObPELnVVlvZzAzgp556KtEJ +wHffbrvtDj30UAtcAwcy2I9cbBJzaYaXo1jh8jX91MdqzTDH36233spCyoiFt7A65JBDTEVMo7XLVSy6 +dyBQ2cpBsgZP9KvYBwXsyHhMN5rsf/l0srixd/EmL37T7YgjjshyoRgNzOQ64pwmgdC1rJHuRx99dCg8 +M0v2ifwhCIK13c0Zbfbbbz/LCJsSe0RLjPm0vSrDkZ9bVCqmKUx9GC6CP0ONkWEXLCnaaSFh7CU8djO0 +KL+1RxPlWHkIwALp6quvtnE7yrfVR9smJ2LUzn1SftLv1Ye4rdhNNmh8Ui252UezVRMwy5g6tFSeLlDI +g4lB37cPqPlNIGaaLdW5x8Be9lx44YXJ8DRl5hYsYpKveteH8deaFxqOpFyneBiRYiiQOhnKWXibVZDO +/fffH8l1PPE588wz0bonM0KTb8sFOh7KGN97AY44t1iooH2nNf/udaBrscLfeeedrOC9XOcVj8cffzwQ +2Pa2kMHBSRs21FkeeWLhFX3ngAMOsBWwnz1EiYRH1LMm1/Fh4GZCbCEL72iWchsOpum2xUA1ETBerpMg +6iJvsJK8SLh7u3kbuBpFCdMUAFoAeqjd3BAmRvYKB+YRphgnGJNFL9cJwDVwbLSHKIhS3219Om24bQed +uMxlG6XgRbtXC1mxichEsFH0NvzZmGAy6q+hRctbxtqppbJ1gWhn3ey/VqieNyWLtC4t1T81sG/oya3N +1Cgd4mnTUsrD/qJ/rKCEXhphx9tSjn49yprVJxXSYabCX3BTr7nnnrul9Ds5MOtLsw9iWnPaaac1Ki3i +ilMG0VvfXKNXPCLXzVzZtOvJYPiglfXdKoSx8/08+q3K1BSq9FxppZVMJf7QQw/5ub4VA1bmD+GoO3hu +3dXNrYL5Hak6BrsqFWKmFwl5sYlumWLTk5xN8hYDOpscFDgd9KI9eYDCSuUXgb7Bh62HEKxJdEuniQMj +StRX/LE5woEjNFv+zlqaHGYKTaJX86oLFPKp1/rYaAUmM0osChn7Rmadx73WFKDXtSsZq8Y4zl+Yq9Ks +UZ1hP9Vkhpsx2YzBXnrpJQuJCnTllVf2gsFepTpsyc7b1OEGfxYE6PND9EYmPKmJd7inX0yzKE8dLjNW +gU8PGXap4GmSzBQ5JMI6LKlNCYmnnqyjMIjAkFR2g4+Mpc0TzC/CkPHYjYfUrNbBYfKbR3bfTWgRuHu7 +eR5uUdzUI2T+eCrNyfdrb5BIG2uEHdEeNrb99nOUdauPNpMjYhMjNS/OfTC/G8sg2WruPjwl8YWxV/Qs +Dm2lmu5bmMocXSDaMY1J4vj000/N0xtHmGceB2sXmmwYGrBCYteTb4ZhEQKeWw5Stdx5sis2LuYbtmJm +LcLqlo6KJR3qSv5yTlebFxXZAChT4qH2Z3rLJIwDSHBD+9Jki8uLDVCnZuT9C5/PpeZYjae3nrNVZktZ +sxO/1lprsbXpVzapKTQR7ZFm1aLz1cJk0U8R7G1vOXwDYLebvx5LgrW8F+1d3c17rGyWAMx7/BrXonhD +h2jV7h+xdbAojRyIQLp2Idpp0wSQlw0yyXy9aPdv/f3ivhY+TE43FpTFmkzmKU8XiPakxSkVtsl4nso3 +iouIwiiJk50WgJbE0oo/rkNipENSshr2c1sL2esOJiIoxGyTiQkKh1v4W3jhhSkbZUaJxOUnJZUTwyWO +EYdZEVnQzVhB8sdGFJ8M2YxtY6oKxOzvCNZoBmAKeVKmRiVVofpkvQaiiQV7o4Khz6c1ZpS7qauNkHKj +OzpK7WuNKtWjfxvLwWgN0NXdvEc+WQL4Ra0P3+iLM3FsJDh99MjNcrkQ0e61j02msNwiYAXwFfSXIkQt +wcJndDCOsUwiMHoL5otm90NPRDnPae3UCVPGxIsK1gWivVE7KwpBajr8hADaAqalyekhjZvPiYjimsOH +H344NXovemKXQNvCDosjgiZirTxYmXIiBWthb4Fvb/M7sB/hzhMuSPGr8JAshWG7i5kHWvqk2W2WIcOb +/jXp2/lrUXEKNjSQbyP52qhIDGSRXGdxw6oLVjYUoojOskzplY7WqF49+vujgKl7dskUmJFHnt3bzaOK +tPfY6hf3NurMvzOuUIuScxy6sWo2kc2+N/mFPtcxWXRb+ZhPSw52Bv1NOIx16OGDpGAc4x6z5EnLltIv +JHAXiPZC6tlGIv/7nz+kOLeAYUzLBM2veHBjlMu1NsUaW7ZRzmQUpDvnT5gsc1qJg09se9uaOARG6hd7 +5NSXAZ0ncwv0peTO1kDyQDZ78Owoe9Uo0dnT7VG6+5V60rLSl6G73KhSTL3MCr6RYjy1Usw+bb1OU0TM +czYhCokqxfZoolfd+8ghbLM9ZCLbdjfs3m5e/bfzFk5ITUx5qixDxmW3X7X7roRlLtORMMiwzCBYUXMO +dBIDBgxAuoelFOMtN9qGZX2VfKK8usBCPipxxY/sE2PuyLYxggpVs9+wYUj1N9tUXLAes2NbnVMozD+4 +Bh+7VvTwXtFtNy33mE57ARgF0E1x9AVBzrkjlPB+icCZmShZ62Z0j6SmJAT2RyFSt2miNLvl0VuTeDve +LOW3kwLg5VRhUq6TiJ8SZUmzK8J49QaXxuQsc/d285wVbyk6A4hNoZqsm1tKM3tgvxhAC9Vo5e0tV7x1 +DhmZHgv3uuuumz3rHkMy3HmDxE64ulGivcev9k8A5BP3OfjR09+L90+4znMhOBG0CFST7ugze1wlF1UP +NjWZFbF2tAS9nW3w9CM1mhIL6R3YOtijV6+ZZ5c6vOmcHc3IWBfTx6CB98saH91Wt96z09x+5md6iCaF +pFHZW3YczJ3f0Wo3b7Xk+UvoU6g4d5uCM3p4m3NfpJLcbDPZ8MUCIPV2Yfz9PM/fiUSp2C60sgXbI3vM +7+BiXSsehiCpl6PnzyV7ChLt2Vn9N6TfJ/bKn5YTqjwC6ilvpcVNolUWgVmtHetKrsu9rg81Q2rBfG/0 +K93UwF3k6feA+b07U863VAVjG8Viz6jR+iYK2buPfm6H3WWPheFyUwvDBpC5i3Jk7+ZegcS2XVEFyJhO +q9wyJtsomJ9So4VuFKwkfz95TRXtHJQIWnEKgKCNjDC4jccM65iXNPp1ovYKTwf0vweDore9dIqKJdGe +QpL9ziYLndlnn93ieElpnr3r8FfCJUvip7SFX5SLaZvXhkW5Y95iq7GkHPK3nqELSYo3ZsFmC4ZWMHkp +W5RdFz2y1+5vnObW0kYKFTaAuLbPV80OqXOfTNK0kPHL3zHiI3aa28/Vsqx4/G+8skji/okmNeLH67zK +J4Qsqpt7aeevLmlSngJftcotZ9bcp2R6An7do8mvadCGaa7WZ3PmG6J72UlHiGijifTWwb5Pheh0Fn/Z +MKpEu4M5Kt7SSy/ttYzR20aP3AhpcFg4ZWnGjZLK7y/RnsKQ66z5XSAaMZbk0Wuagr/qPPUcVxSlykdG +K370E+tNNnsijQLtnuvEzXwU0w9THxVVQhaIaKW4yZxldyScWM3A0ybU0WyaAqBR4OB7KAnBTj/9dH+P +FZaM/irW66+/PnldXZ5asK5FNmBOEf68IQKv/t/7v//bBCVPjlFcPo0NClzFn/xZLQCyjuT+3Wi3wn4M +A+D8VorXkWI6RzrekjzKtKMe/S17/LAYdxM1Lzm4uHnXqkCv5PBFFAUa7KBx1BODr6QqqKhuTlO0HWiM +pbnZt42DeVaRVh2tcms1/Sg82rX77rsveNJPaZNJlQnzcqanHIShuZotSJROe49c/Wb75eROr8FgLSRF +r+G8g+1P4Zn603N+aoIpNCnwA992zSKdnaU8p58wEvJ3ImUsLVMHv+To3YV7bS3kL7jgAhNj4cOY3Qcy +zx9d4C3Wnv4X3IMEwtCRX3znGmoOXSAIGUpoAT5NPLmJJeNXzx4MARL9GrQXk0xL+aUcn9qVV17pL23m +FTXldx34Y3HM3hirc1o86/VQrxCXn23wiRTi5uIU0mFc69+/P0NzOIWFNR8TWH9sBpKpv6jIvZX2iz5U +mfvM6SoUHuxemjKMpl7sn6cK/B7GxhtvnJoCJeGHavwrbF+95a1/1bYbtSrLUJtS0MywjaCm2IGTJo++ +DfhcaOfYhwcf9Ni0BMpGA2AFn9z18BE7zY2BKn8M0BSMhsoNPPzRVEyDSrP3++sEGzhwIGozG4IxU6Bf +M/TT7ImF4q35TkSB3fy2226zK8SRDcyofMn5KKkNvpBP0Aa3nPkyo0Jgh1kUDFELMadnkIQ5wpJleqO2 +mjPfEB35zZwsuMmIxg9qVinRt2ZbPWotIQodClN2+7Utys9VJfxxPx3lNxnRdlGZOqDMCE2LcY9JattJ +5YxY21U7soTv5P+MFNy9P24/17NgwUFgBlbslrmOwMt1mgIrY1pVFD7/IwftouL5roI7ettkiUD3wzqa +BNGbhdYWise8mwuQ8xe1SQpkh2hh/o523ct1orC4ZDxKxkUmcSTU6xJIgZJ7uc5EATvwYpfslMQfa0wW +rBofBgWmLL5FUXE+Ln++AUSFYZ7BDermSUVgztrR5DoNFfNvC9DJDi6K8F+fotKKqLv9JQtPH7RFZAjP +Sh1iNLlorE/G9T45uznSwl+6EEpixU5ulPis87vb4JYnU1oUR2e98TmtjokUrY5/qXWexHuMi8lOdOMW +3y761gh1fr6rUVIMfSgbfEcjJCkwrjaKkt2fpRQ/nWzhk0eB7FXZjupEOw2ipMr4ewkKyQJVEvYa0Shj +KdMm2LDh7IS/jtHe9q6DXVsG+miU8UVC04D4RGnvPYtygwWluu3+JpNl94sVcPI3SCwkKaCxSF0Tg53E +Ua54UxqLWA8HyzuUB+g8o3En1I7FKApJb5QQ/FEqsnZPba6DBg1CfeJ5Jq0csqMztXP2KC2FZNsY7S4q +TS4tSBIwTaxPk0kejZllXJM2z54F0NAJ+Yi4C+zmFIMBAZ0BF2DzIaLCN+kRUZHae2yDW/aMmEwnAzP0 +cSsLE5pGTQICKFQwW/NHwpLptOfDljY/4JtaMOAjuZO/8hdlxKenozEcRV/KgjFxSTYYe5uatb31vwjK +LNP8K3YMxu5sxVl2UXasd1l0snAMEzqGSMZKfvUydZTpqHqxUMDcj4PgHDNjUsUfjRWjYm90U16ByXTm +mWdmCwCdAZo6egK58yMT3pq3ee4sWLEHJBHIM2qzN8+p1kb9sHlS3fiWhQh6ZqyEaH6MVtBjGDJbhNQa +sUwHF8fA+Ogwx2ybH4ftwNlnauEL8WS9iK0czYa1I/oqZkis3oDWY2/t3m5eCLeciaAmoaFi2EFXpYdi +VEhzBXsFvZUP169fP8xQULOjTmeIYG3QUr4s1rG0Z+81aBYpPApFjswVeCPWCSf+YxGyw/Y75qSdPbpE +e3ZWCikCIiACIiACLRDoLdFenUK+BRgKKgIiIAIiIAIi0C4BifZ2ySmeCIiACIiACHQkAYn2jvwsKpQI +iIAIiIAItEtAor1dcoonAiIgAiIgAh1JQKK9Iz+LCiUCIiACIiAC7RKQaG+XnOKJgAiIgAiIQEcSkGjv +yM+iQomACIiACIhAuwQk2tslp3giIAIiIAIi0JEE+pR9v3FH1lqFEgEREAEREIFKCVQpbbVqr/TTKjMR +EAEREAERKJuARHvZhJW+CIiACIiACFRKQKK9UtzKTAREQAREQATKJiDRXjZhpS8CIiACIiAClRKQaK8U +tzITAREQAREQgbIJ6Eddyyas9EVABERABESgUgJatVeKW5mJgAiIgAiIQNkEJNrLJqz0RUAEREAERKBS +AhLtleJWZiIgAiIgAiJQNgGJ9rIJK30REAEREAERqJSARHuluJWZCIiACIiACJRNQKK9bMJKXwREQARE +QAQqJSDRXiluZSYCIiACIiACZROQaC+bsNIXAREQAREQgUoJSLRXiluZiYAIiIAIiEDZBCTayyas9EVA +BERABESgUgIS7ZXiVmYiIAIiIAIiUDYBifayCSt9ERABERABEaiUgER7pbiVmQiIgAiIgAiUTUCivWzC +Sl8EREAEREAEKiUg0V4pbmUmAiIgAiIgAmUTkGgvm7DSFwEREAEREIFKCUi0V4pbmYmACIiACIhA2QQk +2ssmrPRFQAREQAREoFICEu2V4lZmIiACIiACIlA2AYn2sgkrfREQAREQARGolIBEe6W4lZkIiIAIiIAI +lE1Aor1swkpfBERABERABColINFeKW5lJgIiIAIiIAJlE5BoL5uw0hcBERABERCBSglItFeKW5mJgAiI +gAiIQNkEJNrLJqz0RUAEREAERKBSAhLtleJWZiIgAiIgAiJQNgGJ9rIJK30REAEREAERqJSARHuluJWZ +CIiACIiACJRNQKK9bMJKXwREQAREQAQqJSDRXiluZSYCIiACIiACZRP4P6721h1vhDZmAAAAAElFTkSu +QmCCimage/png6701280 + +ThisissocantesttheOCRDeROCR!OCR]]]>19700101T000000ZunknownUntitled-1.png + diff --git a/tests/test_evernote_to_sqlite.py b/tests/test_evernote_to_sqlite.py index eb9666e..22c3a51 100644 --- a/tests/test_evernote_to_sqlite.py +++ b/tests/test_evernote_to_sqlite.py @@ -4,6 +4,7 @@ import pathlib example_enex = pathlib.Path(__file__).parent / "example-note.enex" +example_broken_enex = pathlib.Path(__file__).parent / "example-note_broken.enex" def test_version(): @@ -110,3 +111,199 @@ def test_enex(tmpdir): ] # Check we enabled Porter stemming assert "tokenize='porter'" in db["notes_fts"].schema + + +def test_recover_proper_enex(tmpdir): + output = str(tmpdir / "output.db") + result = CliRunner().invoke( + cli, ["recover-enex", output, str(example_enex)], catch_exceptions=False + ) + assert 0 == result.exit_code + db = sqlite_utils.Database(output) + assert set(db.table_names()) == { + "notes", + "resources", + "resources_data", + "note_resources", + "notes_fts_idx", + "notes_fts", + "notes_fts_config", + "notes_fts_docsize", + "notes_fts_data", + "resources_fts_config", + "resources_fts", + "resources_fts_idx", + "resources_fts_data", + "resources_fts_docsize", + } + assert list(db["notes"].rows) == [ + { + "id": "e2d3f11777001291c06f20a1de05772fe0ba5a2c", + "title": "Example note with images", + "content": '
This note includes two images. š.

The Python logo



The Evernote logo



This image contains text:


', + "created": "2020-10-11T21:28:22", + "updated": "2020-10-11T23:30:38", + "latitude": "37.77742571705006", + "longitude": "-122.4256495114116", + "altitude": "23.16121864318848", + "author": "Simon Willison", + "source": "desktop.mac", + "reminder-order": "0", + } + ] + + assert list(db["resources"].rows) == [ + { + "md5": "61098c2c541de7f0a907c301dd6542da", + "mime": "image/svg+xml", + "width": "0", + "height": "0", + "duration": "0", + "timestamp": "19700101T000000Z", + "ocr": None, + "reco-type": None, + "file-name": None, + }, + { + "md5": "91bd26175acac0b2ffdb6efac199f8ca", + "mime": "image/svg+xml", + "width": "0", + "height": "0", + "duration": "0", + "timestamp": "19700101T000000Z", + "ocr": None, + "reco-type": None, + "file-name": None, + }, + { + "md5": "76dd28b07797cc9f3f129c4871c5293c", + "mime": "image/png", + "width": "670", + "height": "128", + "duration": "0", + "timestamp": "19700101T000000Z", + "ocr": "This is so can test the OCR", + "reco-type": "unknown", + "file-name": "Untitled-1.png", + }, + ] + resource_md5s = [rd["md5"] for rd in db["resources_data"].rows] + assert resource_md5s == [ + "61098c2c541de7f0a907c301dd6542da", + "91bd26175acac0b2ffdb6efac199f8ca", + "76dd28b07797cc9f3f129c4871c5293c", + ] + assert list(db["note_resources"].rows) == [ + { + "note_id": "e2d3f11777001291c06f20a1de05772fe0ba5a2c", + "resource_id": "61098c2c541de7f0a907c301dd6542da", + }, + { + "note_id": "e2d3f11777001291c06f20a1de05772fe0ba5a2c", + "resource_id": "91bd26175acac0b2ffdb6efac199f8ca", + }, + { + "note_id": "e2d3f11777001291c06f20a1de05772fe0ba5a2c", + "resource_id": "76dd28b07797cc9f3f129c4871c5293c", + }, + ] + # Check we enabled Porter stemming + assert "tokenize='porter'" in db["notes_fts"].schema + + +def test_recover_broken_enex(tmpdir): + output = str(tmpdir / "output.db") + result = CliRunner().invoke( + cli, ["recover-enex", output, str(example_broken_enex)], catch_exceptions=False + ) + assert 0 == result.exit_code + db = sqlite_utils.Database(output) + assert set(db.table_names()) == { + "notes", + "resources", + "resources_data", + "note_resources", + "notes_fts_idx", + "notes_fts", + "notes_fts_config", + "notes_fts_docsize", + "notes_fts_data", + "resources_fts_config", + "resources_fts", + "resources_fts_idx", + "resources_fts_data", + "resources_fts_docsize", + } + assert list(db["notes"].rows) == [ + { + "id": "0c59e90500da181d5518ec94c68956f23bfd79c2", + "title": "Example note with images", + "content": '
This note includes two images. š.

The Python logo



The Evernote logo



This image contains text:

Date: Sat, 15 May 2021 09:22:10 +0200 Subject: [PATCH 09/10] Removing comment --- evernote_to_sqlite/cli.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/evernote_to_sqlite/cli.py b/evernote_to_sqlite/cli.py index 2f03dfd..931aa72 100644 --- a/evernote_to_sqlite/cli.py +++ b/evernote_to_sqlite/cli.py @@ -84,18 +84,6 @@ def recover_enex(db_path, enex_file, max_note_size=30, resume_file=None): and specifically useful for very large Enex file. Be warned that this takes a very long time for larges Enex files.""" - # with Progress() as progress: - # task1 = progress.add_task("[red]Downloading...", total=1000) - # task2 = progress.add_task("[green]Processing...", total=1000) - # task3 = progress.add_task("[cyan]Cooking...", total=1000) - # - # while not progress.finished: - # progress.update(task1, advance=0.5) - # progress.update(task2, advance=0.3) - # progress.update(task3, advance=0.9) - # progress.console.print(f"Working on job #{dt.datetime.now().isoformat()}") - # time.sleep(0.01) - file_length = os.path.getsize(enex_file) db = sqlite_utils.Database(db_path) fp = open(enex_file, "r", encoding="utf-8") From a5839dadaa43694f208ad74a53670cebbe756956 Mon Sep 17 00:00:00 2001 From: Daniel Engvall Date: Sat, 15 May 2021 21:57:44 +0200 Subject: [PATCH 10/10] Missing some packages in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f0ce878..cbbab48 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def get_long_description(): [console_scripts] evernote-to-sqlite=evernote_to_sqlite.cli:cli """, - install_requires=["click", "sqlite-utils>=3.0"], + install_requires=["click", "sqlite-utils>=3.0", "rich~=10.2.0", "lxml~=4.6.3", "typing_extensions"], extras_require={"test": ["pytest"]}, tests_require=["evernote-to-sqlite[test]"], )