Skip to content

Commit

Permalink
update crow functions
Browse files Browse the repository at this point in the history
  • Loading branch information
c-tomlin committed Nov 20, 2023
1 parent e3e6597 commit a482742
Showing 1 changed file with 51 additions and 25 deletions.
76 changes: 51 additions & 25 deletions src/pes_match/crow.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import glob
import os
import pandas as pd

import numpy as np
import pandas as pd

from pes_match.cluster import cluster_number
from pes_match.parameters import CLERICAL_PATH, CLERICAL_VARIABLES
from pes_match.parameters import CLERICAL_VARIABLES


def collect_uniques(df, id_1, id_2, match_type):
Expand Down Expand Up @@ -47,8 +49,11 @@ def collect_uniques(df, id_1, id_2, match_type):
2 A5 B4 0 Stage_X_Matchkeys
3 A6 B5 0 Stage_X_Matchkeys
"""
if not ((isinstance(id_1, str)) and (isinstance(id_2, str)) and
(isinstance(match_type, str))):
if not (
(isinstance(id_1, str))
and (isinstance(id_2, str))
and (isinstance(match_type, str))
):
raise TypeError("id_1, id_2 and match_type must be strings")
pd.options.mode.chained_assignment = None
df["ID_count_1"] = df.groupby([id_1])[id_2].transform("count")
Expand Down Expand Up @@ -110,7 +115,9 @@ def collect_conflicts(df, id_1, id_2):
return df


def crow_output_updater(output_df, id_column, source_column, suffix_1, suffix_2, match_type):
def crow_output_updater(
output_df, id_column, source_column, suffix_1, suffix_2, match_type
):
"""
Returns the outputs of CROW in a pairwise linked format.
Only matched pairs are retained.
Expand Down Expand Up @@ -225,7 +232,7 @@ def crow_output_updater(output_df, id_column, source_column, suffix_1, suffix_2,
return df


def combine_crow_results(stage):
def combine_crow_results(stage, results_path):
"""
Takes all matches made in CROW from a chosen stage and combines
them into a single pandas DataFrame. All matching in CROW for
Expand All @@ -234,29 +241,33 @@ def combine_crow_results(stage):
Parameters
----------
stage: str
Chosen stage of matching e.g., 'Stage_1'. The function will look inside CLERICAL_PATH and
combine all clerically matched CSV files that contain this string. File names for completed
matches must also end in '_DONE.csv', otherwise they will not be included in the final set of
combined clerical matches.
Chosen stage of matching e.g., 'Stage_1'.
The function will look inside results_path and combine all clerically
matched CSV files that contain this string. File names for completed
matches must also end in '_DONE.csv', otherwise they will not be
included in the final set of combined clerical matches.
results_path: str
Location of outputs from CROW matching.
Returns
-------
pandas.DataFrame
Pandas dataframe with all clerically matched records from a selected stage combined.
Pandas dataframe with all clerically matched records from a selected
stage combined.
"""
if not isinstance(stage, str):
raise TypeError("stage must be a string")
if not os.path.exists(CLERICAL_PATH):
os.makedirs(CLERICAL_PATH)
all_files = glob.glob(os.path.join(CLERICAL_PATH, "*.csv"))
all_files = glob.glob(os.path.join(results_path, "*.csv"))
completed_files = []
for filename in all_files:
if stage in filename:
if filename.endswith("_DONE.csv"):
df = pd.read_csv(filename, index_col=None, iterator=False, header=0)
completed_files.append(df)
assert len(completed_files) > 0, f"No completed clerical matching files" \
f" (ending with _DONE) found from {stage}"
assert len(completed_files) > 0, (
f"No completed clerical matching files"
f" (ending with _DONE) found from {stage}"
)
df = pd.concat(completed_files, axis=0, ignore_index=True)
return df

Expand Down Expand Up @@ -313,7 +324,9 @@ def remove_large_clusters(df, n):
return df


def save_for_crow(df, id_column, suffix_1, suffix_2, file_name, no_of_files=1):
def save_for_crow(
df, id_column, suffix_1, suffix_2, output_folder, file_name, no_of_files=1
):
"""
Takes candidate matches, updates their format ready for CROW
and then saves them. Split matches into multiple files if desired.
Expand All @@ -329,6 +342,8 @@ def save_for_crow(df, id_column, suffix_1, suffix_2, file_name, no_of_files=1):
Suffix used for the first data source.
suffix_2: str
Suffix used for the second data source.
output_folder: str
Path to save CROW input data.
file_name: str
Name of file that will be saved. If multiple files are
saved, each file will have a different suffix
Expand All @@ -342,8 +357,12 @@ def save_for_crow(df, id_column, suffix_1, suffix_2, file_name, no_of_files=1):
remove_large_clusters
split_save
"""
if not ((isinstance(id_column, str)) and (isinstance(suffix_1, str)) and
(isinstance(suffix_2, str)) and (isinstance(file_name, str))):
if not (
(isinstance(id_column, str))
and (isinstance(suffix_1, str))
and (isinstance(suffix_2, str))
and (isinstance(file_name, str))
):
raise TypeError("id_column, file_name and suffixes must be strings")

crow_variables = CLERICAL_VARIABLES
Expand All @@ -367,10 +386,15 @@ def save_for_crow(df, id_column, suffix_1, suffix_2, file_name, no_of_files=1):
crow_input = pd.concat([crow_records_1, crow_records_2], axis=0).sort_values(
["Cluster_Number", "Source_Dataset"]
)
split_save(crow_input, file_name=file_name, no_of_files=no_of_files)
split_save(
crow_input,
file_name=file_name,
no_of_files=no_of_files,
output_folder=output_folder,
)


def split_save(df, file_name, no_of_files):
def split_save(df, file_name, no_of_files, output_folder):
"""
Splits clusters (that are already in a format ready for
CROW) into multiple smaller files.
Expand All @@ -384,20 +408,22 @@ def split_save(df, file_name, no_of_files):
different suffix e.g. "_1", "_2", etc.
no_of_files: int
Number of csv files that the output will be split into.
output_folder: str
Folder where the CROW input CSV files are saved; created if it does not exist.
See Also
--------
save_for_crow
"""
if not os.path.exists(output_folder):
os.makedirs(output_folder)
if not ((isinstance(file_name, str)) and (isinstance(no_of_files, int))):
raise TypeError("file_name must be a string, no_of_files must be an integer")
clusters_split = np.array_split(df["Cluster_Number"].unique(), no_of_files)
for i, group in enumerate(clusters_split):
df_split = df[
df["Cluster_Number"].isin(list(group))
]
df_split = df[df["Cluster_Number"].isin(list(group))]
df_split.to_csv(
CLERICAL_PATH + file_name + "_" + str(i + 1) + ".csv",
output_folder + "/" + file_name + "_" + str(i + 1) + ".csv",
header=True,
index=False,
)

0 comments on commit a482742

Please sign in to comment.