Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alter the way missing-column errors are caught #469 #470

Merged
merged 8 commits into from
Nov 20, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/
- Add date=existing parameter to Mastr.download [#452](https://github.com/OpenEnergyPlatform/open-MaStR/pull/452)
### Changed
- Using sphinx version <7 to build documentation [#454](https://github.com/OpenEnergyPlatform/open-MaStR/pull/454)
- Get missing column names in xml download in a robust way [#470](https://github.com/OpenEnergyPlatform/open-MaStR/pull/470)
### Removed
- Delete `on push` for github workflow [#445](https://github.com/OpenEnergyPlatform/open-MaStR/pull/445)

Expand Down
77 changes: 37 additions & 40 deletions open_mastr/xml_download/utils_write_to_database.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import sqlite3
from shutil import Error
from zipfile import ZipFile

import lxml
import numpy as np
import pandas as pd
import sqlite3
import sqlalchemy
from sqlalchemy import select
from sqlalchemy.sql import text

from open_mastr.utils.config import setup_logger
from open_mastr.utils.helpers import data_to_include_tables
from open_mastr.utils.orm import tablename_mapping
from open_mastr.xml_download.utils_cleansing_bulk import cleanse_bulk_data
from open_mastr.utils.config import setup_logger


def write_mastr_xml_to_database(
Expand Down Expand Up @@ -98,7 +99,6 @@ def is_first_file(file_name: str) -> bool:


def cast_date_columns_to_datetime(xml_tablename: str, df: pd.DataFrame) -> pd.DataFrame:

sqlalchemy_columnlist = tablename_mapping[xml_tablename][
"__class__"
].__table__.columns.items()
Expand Down Expand Up @@ -186,7 +186,6 @@ def add_table_to_database(
if_exists: str,
engine: sqlalchemy.engine.Engine,
) -> None:

# get a dictionary for the data types

table_columns_list = list(
Expand All @@ -199,7 +198,8 @@ def add_table_to_database(
}

continueloop = True
while continueloop:
add_missing_columns_to_table(engine, xml_tablename, df)
for _ in range(10000):
try:
with engine.connect() as con:
with con.begin():
Expand All @@ -210,20 +210,12 @@ def add_table_to_database(
if_exists=if_exists,
dtype=dtypes_for_writing_sql,
)
continueloop = False
except sqlalchemy.exc.OperationalError as err:
add_missing_column_to_table(err, engine, xml_tablename)

except sqlalchemy.exc.ProgrammingError as err:
add_missing_column_to_table(err, engine, xml_tablename)

except sqlite3.OperationalError as err:
add_missing_column_to_table(err, engine, xml_tablename)
break

except sqlalchemy.exc.DataError as err:
delete_wrong_xml_entry(err, df)

except sqlalchemy.exc.IntegrityError as err:
except sqlalchemy.exc.IntegrityError:
# error resulting from Unique constraint failed
df = write_single_entries_until_not_unique_comes_up(
df=df, xml_tablename=xml_tablename, engine=engine
Expand All @@ -232,7 +224,8 @@ def add_table_to_database(

def add_zero_as_first_character_for_too_short_string(df: pd.DataFrame) -> pd.DataFrame:
"""Some columns are read as integer even though they are actually strings starting with
a 0. This function converts those columns back to strings and adds a 0 as first character."""
a 0. This function converts those columns back to strings and adds a 0 as first character.
"""

dict_of_columns_and_string_length = {
"Gemeindeschluessel": 8,
Expand Down Expand Up @@ -301,45 +294,49 @@ def write_single_entries_until_not_unique_comes_up(
return df


def add_missing_column_to_table(
err: Error, engine: sqlalchemy.engine.Engine, xml_tablename: str
def add_missing_columns_to_table(
engine: sqlalchemy.engine.Engine,
xml_tablename: str,
df: pd.DataFrame,
FlorianK13 marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""
Some files introduce new columns for existing tables.
If this happens, the error from writing entries into
non-existing columns is caught and the column is created.
If the pandas dataframe contains columns that do not
exist in the database, they are added to the database.
Parameters
----------
err
engine
xml_tablename
df

Returns
-------

"""
log = setup_logger()

if engine.name == "postgresql":
missing_column = err.args[0].split("»")[1].split("«")[0]
elif engine.name == "sqlite":
missing_column = err.args[0].split()[-1]
else:
# only a guess, can fail with other db systems
missing_column = err.args[0].split()[-1]
table = tablename_mapping[xml_tablename]["__class__"].__table__
# get the column names from the existing database
inspector = sqlalchemy.inspect(engine)
table_name = tablename_mapping[xml_tablename]["__class__"].__table__.name
columns = inspector.get_columns(table_name)
column_names_from_database = [column["name"] for column in columns]

alter_query = 'ALTER TABLE %s ADD "%s" VARCHAR NULL;' % (
table.name,
missing_column,
)
with engine.connect().execution_options(autocommit=True) as con:
with con.begin():
con.execute(text(alter_query).execution_options(autocommit=True))
log.info(
"From the downloaded xml files following new attribute was "
f"introduced: {table.name}.{missing_column}"
)
column_names_from_df = df.columns.tolist()

missing_columns = set(column_names_from_df) - set(column_names_from_database)

for column_name in missing_columns:
alter_query = 'ALTER TABLE %s ADD "%s" VARCHAR NULL;' % (
table_name,
column_name,
)
with engine.connect().execution_options(autocommit=True) as con:
with con.begin():
con.execute(text(alter_query).execution_options(autocommit=True))
log.info(
"From the downloaded xml files following new attribute was "
f"introduced: {table_name}.{column_name}"
)


def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> None:
Expand Down
Loading