diff --git a/getter/cache.py b/getter/cache.py
index 593beae..dbeb515 100644
--- a/getter/cache.py
+++ b/getter/cache.py
@@ -6,51 +6,66 @@
 DATABASE_NAME = "cache_datagetter.db"
 CACHE_DIR = "cache_dir"
 
+class DatagetterCacheError(Exception):
+    pass
+
 
 def setup_database():
-    con = sqlite3.connect(DATABASE_NAME)
-    cur = con.cursor()
-    cur.execute(
-        """CREATE TABLE IF NOT EXISTS cache
-        (original_file_name TEXT NOT NULL UNIQUE,
-        hash TEXT NOT NULL UNIQUE,
-        json_file TEXT NOT NULL UNIQUE);"""
-    )
-    con.commit()
-    con.close()
+    try:
+        con = sqlite3.connect(DATABASE_NAME)
+        cur = con.cursor()
+        cur.execute(
+            """CREATE TABLE IF NOT EXISTS cache
+            (original_file_name TEXT NOT NULL UNIQUE,
+            hash TEXT NOT NULL UNIQUE,
+            json_file TEXT NOT NULL UNIQUE);"""
+        )
+        con.commit()
+        con.close()
+    except Exception as e:
+        raise DatagetterCacheError(e)
 
 
 def setup_cache_dir():
-    os.makedirs(CACHE_DIR, exist_ok=True)
+    try:
+        os.makedirs(CACHE_DIR, exist_ok=True)
+    except Exception as e:
+        raise DatagetterCacheError(e)
 
 
 def hash_file(original_file_path):
-    file_hash = hashlib.sha1()
+    try:
+        file_hash = hashlib.sha1()
 
-    with open(original_file_path, "rb") as fp:
-        while True:
-            data = fp.read(6000)
-            if not data:
-                break
+        with open(original_file_path, "rb") as fp:
+            while True:
+                data = fp.read(6000)
+                if not data:
+                    break
 
-            file_hash.update(data)
+                file_hash.update(data)
+    except Exception as e:
+        raise DatagetterCacheError(e)
 
     return file_hash.hexdigest()
 
 
 def get_file(file_hash_str):
-    con = sqlite3.connect(DATABASE_NAME)
-    cur = con.cursor()
-    cur.execute("SELECT json_file FROM cache WHERE hash = ?", (file_hash_str,))
-    row = cur.fetchone()
-    con.close()
+    try:
+        con = sqlite3.connect(DATABASE_NAME)
+        cur = con.cursor()
+        cur.execute("SELECT json_file FROM cache WHERE hash = ?", (file_hash_str,))
+        row = cur.fetchone()
+        con.close()
 
-    # We haven't already converted the file
-    if not row:
-        return False
+        # We haven't already converted the file
+        if not row:
+            return False
 
-    # We have converted the file before so copy from the CACHE_DIR
-    return os.path.join(CACHE_DIR, row[0])
+        # We have converted the file before so copy from the CACHE_DIR
+        return os.path.join(CACHE_DIR, row[0])
+    except Exception as e:
+        raise DatagetterCacheError(e)
 
 
 def update_cache(json_file_name, file_hash_str, file_identifier, file_type):
@@ -58,25 +73,31 @@ def update_cache(json_file_name, file_hash_str, file_identifier, file_type):
     Updates the cache database and copies the data into the cache dir
     json_file_name: Output destination for the file
     """
-    con = sqlite3.connect(DATABASE_NAME)
-    cur = con.cursor()
-
-    cur.execute(
-        """
-        INSERT INTO cache (original_file_name, hash, json_file)
-        VALUES (?, ?, ?)
-        ON CONFLICT(original_file_name) DO UPDATE SET hash=?
-        WHERE original_file_name=?
-        """,
-        (
-            file_identifier + "." + file_type,
-            file_hash_str,
-            file_identifier + ".json",
-            file_hash_str,
-            file_identifier + "." + file_type,
-        ),
-    )
-
-    con.commit()
-
-    shutil.copy(json_file_name, CACHE_DIR)
+    # TODO: clean up cache functionality for orphaned files or to reset the cache
+
+    try:
+        con = sqlite3.connect(DATABASE_NAME)
+        cur = con.cursor()
+
+        cur.execute(
+            """
+            INSERT INTO cache (original_file_name, hash, json_file)
+            VALUES (?, ?, ?)
+            ON CONFLICT(original_file_name) DO UPDATE SET hash=?
+            ON CONFLICT(json_file) DO UPDATE SET hash=?, original_file_name=?
+            """,
+            (
+                file_identifier + "." + file_type,
+                file_hash_str,
+                file_identifier + ".json",
+                file_hash_str,
+                file_hash_str,
+                file_identifier + "." + file_type,
+            ),
+        )
+
+        con.commit()
+
+        shutil.copy(json_file_name, CACHE_DIR)
+    except Exception as e:
+        raise DatagetterCacheError(e)
diff --git a/getter/get.py b/getter/get.py
index e7672ed..05ddba5 100644
--- a/getter/get.py
+++ b/getter/get.py
@@ -211,18 +211,22 @@ def fetch_and_convert(args, dataset, schema_path, package_schema):
 
     print("Running convert on %s to %s" % (original_file_path, json_file_name))
 
-    # Hash the file
-    file_hash_str = cache.hash_file(original_file_path)
-
-    # Check if we have already converted the file
-    cached_file_path = cache.get_file(file_hash_str)
-
-    # We have converted the file before so copy from the CACHE_DIR
-    if cached_file_path:
-        try:
-            shutil.copy(cached_file_path, json_file_name)
-            print("Cache hit")
-        except FileNotFoundError:
-            cached_file_path = False
+    try:
+        # Hash the file
+        file_hash_str = cache.hash_file(original_file_path)
+        # Check if we have already converted the file
+        cached_file_path = cache.get_file(file_hash_str)
+
+        # We have converted the file before so copy from the CACHE_DIR
+        if cached_file_path:
+            try:
+                shutil.copy(cached_file_path, json_file_name)
+                print("Cache hit")
+            except (FileNotFoundError, PermissionError):
+                cached_file_path = False
+    except cache.DatagetterCacheError as e:
+        print(e)
+        print("Continuing without cache")
+        cached_file_path = False
 
     if not cached_file_path:
@@ -233,12 +237,15 @@ def fetch_and_convert(args, dataset, schema_path, package_schema):
                 schema_path,
                 args.schema_branch
             )
-
-            cache.update_cache(
-                json_file_name,
-                file_hash_str,
-                dataset['identifier'],
-                file_type)
+            try:
+                cache.update_cache(
+                    json_file_name,
+                    file_hash_str,
+                    dataset['identifier'],
+                    file_type)
+            except cache.DatagetterCacheError as e:
+                print(e)
+                print("Continuing without cache")
 
     except KeyboardInterrupt:
         raise
@@ -309,8 +316,12 @@ def file_cache_schema(schema_branch):
 
 
 def get(args):
-    cache.setup_database()
-    cache.setup_cache_dir()
+    try:
+        cache.setup_database()
+        cache.setup_cache_dir()
+    except cache.DatagetterCacheError as e:
+        print(e)
+        print("Continuing without cache")
 
     if not args.download:
         mkdirs(args.data_dir, True)
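Note (reviewer sketch, not part of the patch): the rewritten upsert in update_cache() uses two ON CONFLICT clauses, one targeting original_file_name and one targeting json_file. SQLite only accepts multiple ON CONFLICT clauses from version 3.35.0 (2021-03-12); on an older bundled library the statement raises sqlite3.OperationalError, which the new try/except wraps in DatagetterCacheError and the callers in get.py report as "Continuing without cache". A minimal standalone sketch of the behaviour, using an in-memory database and made-up file names (grants.xlsx etc. are illustrative only):

    import sqlite3

    # Multiple ON CONFLICT clauses need SQLite >= 3.35.0.
    print("SQLite library version:", sqlite3.sqlite_version)

    con = sqlite3.connect(":memory:")
    con.execute(
        """CREATE TABLE cache
        (original_file_name TEXT NOT NULL UNIQUE,
        hash TEXT NOT NULL UNIQUE,
        json_file TEXT NOT NULL UNIQUE);"""
    )

    upsert = """
        INSERT INTO cache (original_file_name, hash, json_file)
        VALUES (?, ?, ?)
        ON CONFLICT(original_file_name) DO UPDATE SET hash=?
        ON CONFLICT(json_file) DO UPDATE SET hash=?, original_file_name=?
    """

    # Initial insert, then the same source file again with new content:
    # a matching conflict clause fires and only the stored hash changes.
    con.execute(upsert, ("grants.xlsx", "hash-1", "grants.json",
                         "hash-1", "hash-1", "grants.xlsx"))
    con.execute(upsert, ("grants.xlsx", "hash-2", "grants.json",
                         "hash-2", "hash-2", "grants.xlsx"))
    print(con.execute("SELECT * FROM cache").fetchall())
    # -> [('grants.xlsx', 'hash-2', 'grants.json')]

One observation: neither clause targets the hash column, which is also UNIQUE, so inserting identical content under a brand-new file name would still fail with sqlite3.IntegrityError; that case too ends up wrapped in DatagetterCacheError rather than crashing the run.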