Skip to content

Commit

Permalink
fix text exporter
Browse files Browse the repository at this point in the history
unify excel and text importer char stripping
  • Loading branch information
realratchet committed Apr 17, 2024
1 parent fc36813 commit fcfa476
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 11 deletions.
6 changes: 4 additions & 2 deletions tablite/export_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
from tablite.utils import sub_cls_check, type_check
from tablite.base import BaseTable
from tablite.config import Config
Expand Down Expand Up @@ -180,9 +181,10 @@ def txt(value): # helper for text writer
delimiter = delimiters.get(path.suffix)

with path.open("w", encoding="utf-8") as fo:
fo.write(delimiter.join(c for c in table.columns) + "\n")
w = csv.writer(fo, delimiter=delimiter)
w.writerow(c for c in table.columns)
for row in tqdm(table.rows, total=len(table), disable=Config.TQDM_DISABLE):
fo.write(delimiter.join(txt(c) for c in row) + "\n")
w.writerow(txt(c) for c in row)


def sql_writer(table, path):
Expand Down
5 changes: 3 additions & 2 deletions tablite/file_reader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from tablite.datatypes import DataTypes
import csv
from io import StringIO
from tablite.utils import fixup_worksheet
from tablite.nimlite import get_headers as _get_headers
from tablite.utils import py_to_nim_encoding
from tablite.utils import fixup_worksheet, py_to_nim_encoding, strip_escape

ENCODING_GUESS_BYTES = 10000

Expand Down Expand Up @@ -219,6 +218,8 @@ def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, line
if i < 0:
# NOTE: for some reason, specifying a start row in `iter_rows` makes it read cells as binary; instead, skip the rows that come before our first read row
continue

row_data = [strip_escape(r) for r in row_data]

# NOTE: text readers do not cast types and give back strings, and neither should the xlsx reader; we couldn't find documentation on whether this can be disabled via `iter_rows`, so we cast back to string instead
container[i] = [DataTypes.to_json(v) for v in row_data]
Expand Down
6 changes: 3 additions & 3 deletions tablite/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from tablite.datatypes import DataTypes, list_to_np_array
from tablite.config import Config
from tablite.file_reader_utils import TextEscape, get_encoding, get_delimiter, ENCODING_GUESS_BYTES
from tablite.utils import type_check, unique_name, fixup_worksheet
from tablite.utils import type_check, unique_name, fixup_worksheet, strip_escape
from tablite.base import BaseTable, Page, Column

from tqdm import tqdm as _tqdm
Expand Down Expand Up @@ -215,7 +215,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=
it_header = worksheet.iter_rows(min_row=header_row_index + 1)
while True:
# get the first row to know our headers or the number of columns
row = [c.value for c in next(it_header)]
row = [strip_escape(c.value) for c in next(it_header)]
break
fields = [str(c) if c is not None else "" for c in row] # excel is offset by 1
except StopIteration:
Expand Down Expand Up @@ -259,7 +259,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=
it_used_indices = list(field_dict.values())

# filter columns that we're not going to use
it_rows_filtered = ([row[idx].value for idx in it_used_indices] for row in it_rows)
it_rows_filtered = ([strip_escape(row[idx].value) for idx in it_used_indices] for row in it_rows)

# create page directory
workdir = Path(Config.workdir) / Config.pid
Expand Down
32 changes: 29 additions & 3 deletions tablite/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def _time_statistics_summary(v, c):
timestamp -= minutes * 60
seconds = int(timestamp)
microseconds = int(1e6 * (timestamp-seconds))

d[k] = time(hours, minutes, seconds, microseconds)
elif k in {"stdev", "iqr", "sum"}:
d[k] = f"{d[k]} seconds"
Expand Down Expand Up @@ -420,6 +420,7 @@ def dict_to_rows(d):
rows.append(row)
return rows


def calc_col_count(letters: str):
ord_nil = ord("A") - 1
cols_per_letter = ord("Z") - ord_nil
Expand All @@ -430,12 +431,13 @@ def calc_col_count(letters: str):

return col_count


def calc_true_dims(sheet):
src = sheet._get_source()
max_col, max_row = 0, 0

regex = re.compile("\d+")

def handleStartElement(name, attrs):
nonlocal max_col, max_row

Expand All @@ -455,6 +457,7 @@ def handleStartElement(name, attrs):

return max_col, max_row


def fixup_worksheet(worksheet):
try:
ws_cols, ws_rows = calc_true_dims(worksheet)
Expand All @@ -464,16 +467,19 @@ def fixup_worksheet(worksheet):
except Exception as e:
logging.error(f"Failed to fetch true dimensions: {e}")


def update_access_time(path):
    """Refresh the access time of ``path`` while keeping its modification time.

    Args:
        path: anything accepted by ``pathlib.Path`` that points at an
            existing filesystem entry.

    The file's current ``st_mtime`` is read first and passed back to
    ``os.utime`` unchanged, so only the access time moves. The new access
    time is whatever ``now()`` returns (``now`` is defined elsewhere in this
    module -- presumably the current epoch time; confirm against its import).
    """
    target = Path(path)
    # Capture the existing mtime so that os.utime does not disturb it.
    modified = target.stat().st_mtime
    os.utime(target, (now(), modified))


def load_numpy(path):
    """Load a numpy file from ``path``, touching its access time first.

    Args:
        path: location of the file to load; forwarded to both
            ``update_access_time`` and ``np.load``.

    Returns:
        Whatever ``np.load`` returns for the file.
    """
    # Record the access before reading the file.
    update_access_time(path)

    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays;
    # this presumably only ever sees files written by this library -- confirm
    # that untrusted files can never reach this call.
    loaded = np.load(path, allow_pickle=True, fix_imports=False)
    return loaded


def select_type_name(dtypes: dict):
dtypes = [t for t in dtypes.items() if t[0] != NoneType]

Expand All @@ -496,12 +502,32 @@ def get_predominant_types(table, all_dtypes=None):

return dtypes


def py_to_nim_encoding(encoding: str) -> str:
    """Translate a Python encoding name into the identifier passed to nim.

    Args:
        encoding: Python encoding name, or ``None``.

    Returns:
        ``"ENC_UTF8"`` for ``None`` and the ASCII/UTF-8 family,
        ``"ENC_UTF16"`` for the UTF-16 family (both matched
        case-insensitively), or ``"ENC_CONV|<encoding>"`` when the name is
        listed verbatim in ``Config.NIM_SUPPORTED_CONV_TYPES``.

    Raises:
        NotImplementedError: if the encoding matches none of the above.
    """
    # A missing encoding is treated as UTF-8.
    if encoding is None:
        return "ENC_UTF8"

    lowered = encoding.lower()
    if lowered in ("ascii", "utf8", "utf-8", "utf-8-sig"):
        return "ENC_UTF8"
    if lowered in ("utf16", "utf-16"):
        return "ENC_UTF16"

    # The conversion lookup uses the name exactly as given (case-sensitive).
    if encoding not in Config.NIM_SUPPORTED_CONV_TYPES:
        raise NotImplementedError(f"encoding not implemented: {encoding}")

    return f"ENC_CONV|{encoding}"


# Translation table that deletes tab, newline and carriage-return characters.
# Built once at import time so each call is a single C-level pass.
_STRIP_ESCAPE_TABLE = str.maketrans("", "", "\t\n\r")


def strip_escape(str_: str) -> str:
    """Remove tab, newline and carriage-return characters from a string.

    Args:
        str_: the value to clean. Non-string values (``None``, numbers,
            dates, ...) are accepted and passed through untouched.

    Returns:
        ``str_`` with every ``"\\t"``, ``"\\n"`` and ``"\\r"`` character
        removed when it is a ``str``; otherwise ``str_`` itself, unchanged.
    """
    if not isinstance(str_, str):
        return str_

    # NOTE(review): the previous replacement table listed each of the three
    # characters twice. If the second set was meant to be the literal
    # two-character sequences (backslash + t/n/r), that is not handled here
    # -- confirm against the upstream intent.
    return str_.translate(_STRIP_ESCAPE_TABLE)
2 changes: 1 addition & 1 deletion tablite/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
major, minor, patch = 2023, 11, 3
major, minor, patch = 2023, 11, 4
__version_info__ = (major, minor, patch)
__version__ = ".".join(str(i) for i in __version_info__)

0 comments on commit fcfa476

Please sign in to comment.