Skip to content

Commit

Permalink
Only use empty string as an NA value when reading dataframes
Browse files (browse the repository at this point in the history)
  • Loading branch information
kevinschaper committed Apr 3, 2024
1 parent dcf801f commit 5360e2a
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 11 deletions.
9 changes: 8 additions & 1 deletion cat_merge/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,14 @@ def read_df(fh: Union[str, IO[bytes]],
Returns:
pandas.DataFrame: Dataframe.
"""
df = pd.read_csv(fh, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE, comment=comment_character)
df = pd.read_csv(fh,
sep="\t",
dtype="string",
lineterminator="\n",
quoting=csv.QUOTE_NONE,
comment=comment_character,
keep_default_na=False,
na_values=[''])
if add_source_col is not None:
df[add_source_col] = source_col_value
return df
Expand Down
16 changes: 8 additions & 8 deletions tests/integration/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ def nodes_and_edges() -> Tuple[List[DataFrame], List[DataFrame]]:
edges = []

gene_nodes = u"""\
id category xref symbol
Gene:1 Gene NCBI:10 nan
Gene:2 Gene ZFIN:123
Gene:3 Gene HGNC:11
Gene:4 Gene
Gene:4 Gene
id category symbol xref
Gene:1 Gene nan NCBI:10
Gene:2 Gene fgf ZFIN:123
Gene:3 Gene pax HGNC:11
Gene:4 Gene foo
Gene:4 Gene bar
"""
nodes.append(string_df(gene_nodes))

Expand Down Expand Up @@ -89,5 +89,5 @@ def test_merge_kg_duplicate_node_count(nodes_and_edges):

def test_merged_node_name(nodes_and_edges):
kg, qc = merge_kg(node_dfs=nodes_and_edges[0], edge_dfs=nodes_and_edges[1])
gene_1_name = kg.nodes[kg.nodes.id == 'Gene:1'].name.values[0]
assert(gene_1_name == 'nan', "Gene:1 should have name 'nan' and not be replaced by an empty string")
gene_1_symbol = kg.nodes[kg.nodes.id == 'Gene:1'].symbol.values[0]
assert gene_1_symbol == 'nan', "Gene:1 should have name 'nan' and not be replaced by an empty string"
4 changes: 2 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
# Borrowed from https://stackoverflow.com/questions/58771331/cleanly-hard-code-a-pandas-dataframe-into-a-python-script
def string_df(data: str, index_column_is_id=True):
if index_column_is_id:
df = pd.read_csv(StringIO(data), sep=r"\s+", engine='python')
df = pd.read_csv(StringIO(data), sep=r"\s+", engine='python', keep_default_na=False, na_values=[''])
else:
df = pd.read_csv(StringIO(data), sep=r"\s+", engine='python')
df = pd.read_csv(StringIO(data), sep=r"\s+", engine='python', keep_default_na=False, na_values=[''])
return df


Expand Down

0 comments on commit 5360e2a

Please sign in to comment.