fix text exporter

unify excel and text importer char stripping
root-11 · Apr 17, 2024 · fcfa476 · fcfa476
1 parent fc36813
commit fcfa476
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 11 deletions.
diff --git a/tablite/export_utils.py b/tablite/export_utils.py
@@ -1,3 +1,4 @@
+import csv
 from tablite.utils import sub_cls_check, type_check
 from tablite.base import BaseTable
 from tablite.config import Config
@@ -180,9 +181,10 @@ def txt(value):  # helper for text writer
     delimiter = delimiters.get(path.suffix)
 
     with path.open("w", encoding="utf-8") as fo:
-        fo.write(delimiter.join(c for c in table.columns) + "\n")
+        w = csv.writer(fo, delimiter=delimiter)
+        w.writerow(c for c in table.columns)
         for row in tqdm(table.rows, total=len(table), disable=Config.TQDM_DISABLE):
-            fo.write(delimiter.join(txt(c) for c in row) + "\n")
+            w.writerow(txt(c) for c in row)
 
 
 def sql_writer(table, path):

diff --git a/tablite/file_reader_utils.py b/tablite/file_reader_utils.py
@@ -6,9 +6,8 @@
 from tablite.datatypes import DataTypes
 import csv
 from io import StringIO
-from tablite.utils import fixup_worksheet
 from tablite.nimlite import get_headers as _get_headers
-from tablite.utils import py_to_nim_encoding
+from tablite.utils import fixup_worksheet, py_to_nim_encoding, strip_escape
 
 ENCODING_GUESS_BYTES = 10000
 
@@ -219,6 +218,8 @@ def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, line
                 if i < 0:
                     # NOTE: for some reason `iter_rows` specifying a start row starts reading cells as binary, instead skip the rows that are before our first read row
                     continue
+
+                row_data = [strip_escape(r) for r in row_data]
 
                 # NOTE: text readers do not cast types and give back strings, neither should xlsx reader, can't find documentation if it's possible to ignore this via `iter_rows` instead of casting back to string
                 container[i] = [DataTypes.to_json(v) for v in row_data]

diff --git a/tablite/import_utils.py b/tablite/import_utils.py
@@ -27,7 +27,7 @@
 from tablite.datatypes import DataTypes, list_to_np_array
 from tablite.config import Config
 from tablite.file_reader_utils import TextEscape, get_encoding, get_delimiter, ENCODING_GUESS_BYTES
-from tablite.utils import type_check, unique_name, fixup_worksheet
+from tablite.utils import type_check, unique_name, fixup_worksheet, strip_escape
 from tablite.base import BaseTable, Page, Column
 
 from tqdm import tqdm as _tqdm
@@ -215,7 +215,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=
         it_header = worksheet.iter_rows(min_row=header_row_index + 1)
         while True:
             # get the first row to know our headers or the number of columns
-            row = [c.value for c in next(it_header)]
+            row = [strip_escape(c.value) for c in next(it_header)]
             break
         fields = [str(c) if c is not None else "" for c in row] # excel is offset by 1
     except StopIteration:
@@ -259,7 +259,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=
     it_used_indices = list(field_dict.values())
 
     # filter columns that we're not going to use
-    it_rows_filtered = ([row[idx].value for idx in it_used_indices] for row in it_rows)
+    it_rows_filtered = ([strip_escape(row[idx].value) for idx in it_used_indices] for row in it_rows)
 
     # create page directory
     workdir = Path(Config.workdir) / Config.pid

diff --git a/tablite/utils.py b/tablite/utils.py
@@ -348,7 +348,7 @@ def _time_statistics_summary(v, c):
             timestamp -= minutes * 60
             seconds = int(timestamp)
             microseconds = int(1e6 * (timestamp-seconds))
-            
+
             d[k] = time(hours, minutes, seconds, microseconds)
         elif k in {"stdev", "iqr", "sum"}:
             d[k] = f"{d[k]} seconds"
@@ -420,6 +420,7 @@ def dict_to_rows(d):
         rows.append(row)
     return rows
 
+
 def calc_col_count(letters: str):
     ord_nil = ord("A") - 1
     cols_per_letter = ord("Z") - ord_nil
@@ -430,12 +431,13 @@ def calc_col_count(letters: str):
 
     return col_count
 
+
 def calc_true_dims(sheet):
     src = sheet._get_source()
     max_col, max_row = 0, 0
 
     regex = re.compile("\d+")
-    
+
     def handleStartElement(name, attrs):
         nonlocal max_col, max_row
 
@@ -455,6 +457,7 @@ def handleStartElement(name, attrs):
 
     return max_col, max_row
 
+
 def fixup_worksheet(worksheet):
     try:
         ws_cols, ws_rows = calc_true_dims(worksheet)
@@ -464,16 +467,19 @@ def fixup_worksheet(worksheet):
     except Exception as e:
         logging.error(f"Failed to fetch true dimensions: {e}")
 
+
 def update_access_time(path):
     path = Path(path)
     stat = path.stat()
     os.utime(path, (now(), stat.st_mtime))
 
+
 def load_numpy(path):
     update_access_time(path)
 
     return np.load(path, allow_pickle=True, fix_imports=False)
 
+
 def select_type_name(dtypes: dict):
     dtypes = [t for t in dtypes.items() if t[0] != NoneType]
 
@@ -496,12 +502,32 @@ def get_predominant_types(table, all_dtypes=None):
 
     return dtypes
 
+
 def py_to_nim_encoding(encoding: str) -> str:
     if encoding is None or encoding.lower() in ["ascii", "utf8", "utf-8", "utf-8-sig"]:
         return "ENC_UTF8"
     elif encoding.lower() in ["utf16", "utf-16"]:
         return "ENC_UTF16"
     elif encoding in Config.NIM_SUPPORTED_CONV_TYPES:
         return f"ENC_CONV|{encoding}"
-    
+
     raise NotImplementedError(f"encoding not implemented: {encoding}")
+
+
+def strip_escape(str_: str) -> str:
+    """Return ``str_`` with tab, line-feed and carriage-return characters removed.
+
+    Non-``str`` values (numbers, dates, ``None``, ...) are returned unchanged.
+    """
+    if not isinstance(str_, str):
+        return str_
+
+    # One translate pass deletes all three characters instead of rescanning
+    # the string once per character with chained ``replace`` calls.
+    #
+    # NOTE(review): only the real control characters are stripped; literal
+    # backslash sequences (a backslash followed by t, n or r) are left intact.
+    # Confirm that this matches the text importer's behaviour.
+    control_chars = str.maketrans("", "", "\t\n\r")
+
+    return str_.translate(control_chars)
diff --git a/tablite/version.py b/tablite/version.py
@@ -1,3 +1,3 @@
-major, minor, patch = 2023, 11, 3
+major, minor, patch = 2023, 11, 4
 __version_info__ = (major, minor, patch)
 __version__ = ".".join(str(i) for i in __version_info__)