From fcfa476b05ca75113a7af73abff4f2ae26d3de6d Mon Sep 17 00:00:00 2001 From: Ratchet Date: Wed, 17 Apr 2024 10:37:34 +0300 Subject: [PATCH] fix text exporter unify excel and text importer char stripping --- tablite/export_utils.py | 6 ++++-- tablite/file_reader_utils.py | 5 +++-- tablite/import_utils.py | 6 +++--- tablite/utils.py | 32 +++++++++++++++++++++++++++++--- tablite/version.py | 2 +- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/tablite/export_utils.py b/tablite/export_utils.py index e8bb6b0a..faec3f82 100644 --- a/tablite/export_utils.py +++ b/tablite/export_utils.py @@ -1,3 +1,4 @@ +import csv from tablite.utils import sub_cls_check, type_check from tablite.base import BaseTable from tablite.config import Config @@ -180,9 +181,10 @@ def txt(value): # helper for text writer delimiter = delimiters.get(path.suffix) with path.open("w", encoding="utf-8") as fo: - fo.write(delimiter.join(c for c in table.columns) + "\n") + w = csv.writer(fo, delimiter=delimiter) + w.writerow(c for c in table.columns) for row in tqdm(table.rows, total=len(table), disable=Config.TQDM_DISABLE): - fo.write(delimiter.join(txt(c) for c in row) + "\n") + w.writerow(txt(c) for c in row) def sql_writer(table, path): diff --git a/tablite/file_reader_utils.py b/tablite/file_reader_utils.py index 2f3ff1b7..7cc9412a 100644 --- a/tablite/file_reader_utils.py +++ b/tablite/file_reader_utils.py @@ -6,9 +6,8 @@ from tablite.datatypes import DataTypes import csv from io import StringIO -from tablite.utils import fixup_worksheet from tablite.nimlite import get_headers as _get_headers -from tablite.utils import py_to_nim_encoding +from tablite.utils import fixup_worksheet, py_to_nim_encoding, strip_escape ENCODING_GUESS_BYTES = 10000 @@ -219,6 +218,8 @@ def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, line if i < 0: # NOTE: for some reason `iter_rows` specifying a start row starts reading cells as binary, instead skip the rows that are before our first read row continue + + row_data = [strip_escape(r) for r in row_data] # NOTE: text readers do not cast types and give back strings, neither should xlsx reader, can't find documentation if it's possible to ignore this via `iter_rows` instead of casting back to string container[i] = [DataTypes.to_json(v) for v in row_data] diff --git a/tablite/import_utils.py b/tablite/import_utils.py index 02655678..7ca36d6a 100644 --- a/tablite/import_utils.py +++ b/tablite/import_utils.py @@ -27,7 +27,7 @@ from tablite.datatypes import DataTypes, list_to_np_array from tablite.config import Config from tablite.file_reader_utils import TextEscape, get_encoding, get_delimiter, ENCODING_GUESS_BYTES -from tablite.utils import type_check, unique_name, fixup_worksheet +from tablite.utils import type_check, unique_name, fixup_worksheet, strip_escape from tablite.base import BaseTable, Page, Column from tqdm import tqdm as _tqdm @@ -215,7 +215,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet= it_header = worksheet.iter_rows(min_row=header_row_index + 1) while True: # get the first row to know our headers or the number of columns - row = [c.value for c in next(it_header)] + row = [strip_escape(c.value) for c in next(it_header)] break fields = [str(c) if c is not None else "" for c in row] # excel is offset by 1 except StopIteration: @@ -259,7 +259,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet= it_used_indices = list(field_dict.values()) # filter columns that we're not going to use - it_rows_filtered = ([row[idx].value for idx in it_used_indices] for row in it_rows) + it_rows_filtered = ([strip_escape(row[idx].value) for idx in it_used_indices] for row in it_rows) # create page directory workdir = Path(Config.workdir) / Config.pid diff --git a/tablite/utils.py b/tablite/utils.py index d98583a8..9112562c 100644 --- a/tablite/utils.py +++ b/tablite/utils.py @@ -348,7 +348,7 @@ def _time_statistics_summary(v, c): timestamp -= minutes * 60 seconds = int(timestamp) microseconds = int(1e6 * (timestamp-seconds)) - + d[k] = time(hours, minutes, seconds, microseconds) elif k in {"stdev", "iqr", "sum"}: d[k] = f"{d[k]} seconds" @@ -420,6 +420,7 @@ def dict_to_rows(d): rows.append(row) return rows + def calc_col_count(letters: str): ord_nil = ord("A") - 1 cols_per_letter = ord("Z") - ord_nil @@ -430,12 +431,13 @@ def calc_col_count(letters: str): return col_count + def calc_true_dims(sheet): src = sheet._get_source() max_col, max_row = 0, 0 regex = re.compile("\d+") - + def handleStartElement(name, attrs): nonlocal max_col, max_row @@ -455,6 +457,7 @@ def handleStartElement(name, attrs): return max_col, max_row + def fixup_worksheet(worksheet): try: ws_cols, ws_rows = calc_true_dims(worksheet) @@ -464,16 +467,19 @@ def fixup_worksheet(worksheet): except Exception as e: logging.error(f"Failed to fetch true dimensions: {e}") + def update_access_time(path): path = Path(path) stat = path.stat() os.utime(path, (now(), stat.st_mtime)) + def load_numpy(path): update_access_time(path) return np.load(path, allow_pickle=True, fix_imports=False) + def select_type_name(dtypes: dict): dtypes = [t for t in dtypes.items() if t[0] != NoneType] @@ -496,6 +502,7 @@ def get_predominant_types(table, all_dtypes=None): return dtypes + def py_to_nim_encoding(encoding: str) -> str: if encoding is None or encoding.lower() in ["ascii", "utf8", "utf-8", "utf-8-sig"]: return "ENC_UTF8" @@ -503,5 +510,24 @@ def py_to_nim_encoding(encoding: str) -> str: return "ENC_UTF16" elif encoding in Config.NIM_SUPPORTED_CONV_TYPES: return f"ENC_CONV|{encoding}" - + raise NotImplementedError(f"encoding not implemented: {encoding}") + + +def strip_escape(str_: str) -> str: + if not isinstance(str_, str): + return str_ + + seqs = ( + ("\t", ""), + ("\n", ""), + ("\r", ""), + ("\t", ""), + ("\n", ""), + ("\r", "") + ) + + for (i, o) in seqs: + str_ = str_.replace(i, o) + + return str_ diff --git a/tablite/version.py b/tablite/version.py index 5edfe41f..136eb73f 100644 --- a/tablite/version.py +++ b/tablite/version.py @@ -1,3 +1,3 @@ -major, minor, patch = 2023, 11, 3 +major, minor, patch = 2023, 11, 4 __version_info__ = (major, minor, patch) __version__ = ".".join(str(i) for i in __version_info__)