Skip to content

Commit

Permalink
Merge pull request #21 from eastgenomics/EBH-3338_Pull_out_a_list_of_samples_without_results_JSONs_from_Medicover
Browse files Browse the repository at this point in the history

EBH-3338 Pull out a list of samples without results JSONs from Medicover - BunnyNo
  • Loading branch information
mattgarner authored Jan 7, 2025
2 parents fb9a632 + 46c21fb commit f09407f
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 0 deletions.
140 changes: 140 additions & 0 deletions EBH-3338/medicover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""
Created on Mon Dec 16 11:50:04 2024
@author: arun
Code logic:
1. get a list of bam files from the project of Staging area 055?
2. get the list of jsons from the same project.
3. For each JSON, determine the sample name using regex.
4. For each BAM, determine the sample name using regex
5. Merge JSON and BAM files into a dataframe where the common samplename
"""
import re
from typing import Optional

import dxpy
import pandas as pd


# Set Default project ID
PROJECT_ID = "project-GXx8Gg8468bZ072Py1Bb1p0F"

def extract_sample_name(file_name: str) -> str:
"""
Extracts a sample name from a file name based on a regex.
"""
pattern = r'(SP[a-zA-Z0-9]+|[gG][mM][0-9]+_?[0-9]+)'
match = re.search(pattern, file_name)
return match.group() if match else None


def find_dx_files(project_id: str, extension: str, outfile: str) -> pd.DataFrame:
""" Finds dx files matching extension in project ID
"""
print(f"Looking for {extension} files in {project_id}")
result = list(dxpy.bindings.search.find_data_objects(
classname="file",
name=f"*.{extension}",
name_mode="glob",
project=project_id,
describe={"fields": {"name": True,
"folder": True}}
))

print(f"Found {len(result)} files with extension: {extension}")

df = pd.DataFrame([
{
"sample_name": extract_sample_name(x["describe"]["name"]),
f"{extension}_file_name": x["describe"]["name"],
f"{extension}_file_id": x["id"],
f"{extension}_folder_path": x["describe"]["folder"]
} for x in result
])
df.to_csv(outfile, sep="\t", index=False)
print(f"Files written to {outfile}\n")

return clean_df(df, extension, outfile)


def clean_df(df: pd.DataFrame, extension: str, outfile: str) -> pd.DataFrame:
"""_
Log and handle duplicate and missing sample_name(s) in a df.
"""
sample_col = "sample_name"
files = f"{extension}_files"
outfile = outfile.removesuffix(".tsv")
outfile = f"{outfile}_cleaned.tsv"
print(f"Cleaning {files} dataframe ...\n")

rows_with_na = df[df[sample_col].isna()]
if rows_with_na.empty:
print(f"All {len(df)} {files} matched a samplename.")
else:
print(
f"Removing the following {len(rows_with_na)} {files} "
f"with no matched samples:"
)
print(rows_with_na)
df = df.dropna(subset=[sample_col])

duplicate_rows = df[df.duplicated(subset=sample_col, keep='first')]
if duplicate_rows.empty:
print("\nNo duplicate sample name in {files}.")
else:
print(
f"\nRemoving the following {len(duplicate_rows)} rows "
f"with duplicated sample names in {files}, keeping only "
f"the first instance."
)
print(duplicate_rows)
df = df.drop_duplicates(subset=sample_col, keep='first')

df.to_csv(outfile, sep="\t", index=False)
print(f"Saved cleaned data in {outfile} \n")

return df


def main():
"""Entry point
"""
outfile_without_json = "medicover_without_json_file.tsv"
outfile_with_json = "medicover_with_json_file.tsv"

bam_df = find_dx_files(PROJECT_ID, "bam", "medicover_bam_files.tsv")
json_df = find_dx_files(PROJECT_ID, "json", "medicover_json_files.tsv")

samples_without_json = bam_df[~bam_df.sample_name.isin(json_df.sample_name)]
samples_without_json.to_csv(outfile_without_json, sep="\t", index=False)

samples_with_json = pd.merge(bam_df, json_df,
on="sample_name", how="left").dropna()
samples_with_json.to_csv(outfile_with_json, sep="\t", index=False)

samples_with_json_in_json_missing = samples_with_json[
samples_with_json.json_folder_path.str.contains("JSON_MISSING")]

print(
f"Found {len(samples_with_json_in_json_missing)} Medicover samples "
"with a json folder in a JSON_MISSING folder. "
)

print(samples_with_json_in_json_missing)

print("\nSUMMARY...\n")
print(f"Found {len(bam_df)} unique Medicover samples with sequence data")
print(
f"{len(samples_without_json)} "
"Medicover samples have no JSON result "
f"files.\n"
f"{len(samples_with_json)} "
"Medicover samples have a JSON result file. Where "
f"{len(samples_with_json_in_json_missing)} "
"Medicover samples have a JSON result file in a JSON_MISSING folder.\n"
f"See {outfile_without_json} and {outfile_with_json} for full "
"information.\n"
)

if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Repository for code related to small tasks for supporting the RD service.

| Ticket | Summary |
| --- | --- |
| [EBH-3338] | Pull out a list of medicover samples for which we do not have medicover results JSONs in DNAnexus |
| [EBH-3050] | Determine how many workbooks were released/not released (based on number of variants filtered) per clinical indication since new filtering introduced |
| [DI-773] | Gather and plot QC metrics |
| [DI-1057] | Validate East GLH RD Test Directory hosted on AWS RDS |
Expand All @@ -13,6 +14,7 @@ Repository for code related to small tasks for supporting the RD service.
| [DI-1299] | Compare values between GRCh37 and GRCh38 for Somalier |


[EBH-3338]: https://cuhbioinformatics.atlassian.net/browse/EBH-3338
[EBH-3050]: https://cuhbioinformatics.atlassian.net/browse/EBH-3050
[DI-773]: https://cuhbioinformatics.atlassian.net/browse/DI-773
[DI-1057]: https://cuhbioinformatics.atlassian.net/browse/DI-1057
Expand Down

0 comments on commit f09407f

Please sign in to comment.