Skip to content

Commit

Permalink
FASTA loader
Browse files Browse the repository at this point in the history
  • Loading branch information
nickzoic committed Sep 20, 2024
1 parent c274c96 commit 7c74148
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 6 deletions.
58 changes: 52 additions & 6 deletions countess/plugins/fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@

import pandas as pd
from fqfa.fastq.fastq import parse_fastq_reads # type: ignore
from fqfa.fasta.fasta import parse_fasta_records # type: ignore

from countess import VERSION
from countess.core.parameters import BooleanParam, FloatParam
from countess.core.parameters import BooleanParam, FloatParam, StringParam
from countess.core.plugins import PandasInputFilesPlugin
from countess.utils.files import clean_filename


def _file_reader(
def _fastq_reader(
file_handle, min_avg_quality: float, row_limit: Optional[int] = None, filename: str = ""
) -> Iterable[dict[str, str]]:
for fastq_read in islice(parse_fastq_reads(file_handle), 0, row_limit):
Expand All @@ -23,7 +24,6 @@ def _file_reader(
"filename": clean_filename(filename),
}


class LoadFastqPlugin(PandasInputFilesPlugin):
"""Load counts from one or more FASTQ files, by first building a dask dataframe of raw sequences
with count=1 and then grouping by sequence and summing counts. It supports counting
Expand All @@ -47,13 +47,13 @@ def read_file_to_dataframe(self, file_params, row_limit=None):

if filename.endswith(".gz"):
with gzip.open(filename, mode="rt", encoding="utf-8") as fh:
dataframe = pd.DataFrame(_file_reader(fh, min_avg_quality, row_limit, filename))
dataframe = pd.DataFrame(_fastq_reader(fh, min_avg_quality, row_limit, filename))
elif filename.endswith(".bz2"):
with bz2.open(filename, mode="rt", encoding="utf-8") as fh:
dataframe = pd.DataFrame(_file_reader(fh, min_avg_quality, row_limit, filename))
dataframe = pd.DataFrame(_fastq_reader(fh, min_avg_quality, row_limit, filename))
else:
with open(filename, "r", encoding="utf-8") as fh:
dataframe = pd.DataFrame(_file_reader(fh, min_avg_quality, row_limit, filename))
dataframe = pd.DataFrame(_fastq_reader(fh, min_avg_quality, row_limit, filename))

group_columns = ["sequence"]

Expand All @@ -78,3 +78,49 @@ def read_file_to_dataframe(self, file_params, row_limit=None):
return dataframe.assign(count=1).groupby(group_columns).count()
else:
return dataframe


def _fasta_reader(file_handle, row_limit: Optional[int] = None) -> Iterable[dict[str, str]]:
    """Lazily yield one dict per FASTA record read from *file_handle*.

    Each yielded dict stores the record's sequence under the temporary key
    ``"__s"`` and its header under ``"__h"``.  At most *row_limit* records
    are produced; ``None`` means no limit.
    """
    records = parse_fasta_records(file_handle)
    for header, sequence in islice(records, 0, row_limit):
        # Key order matters: it becomes the column order of any dataframe
        # built from these dicts.
        yield {"__s": sequence, "__h": header}

class LoadFastaPlugin(PandasInputFilesPlugin):
    """Load sequences from FASTA files into a pandas dataframe.

    Every FASTA record becomes one row.  The sequence column is always
    present; the header column and the filename column are only included
    when their column-name parameters are non-blank.  Filenames ending in
    ``.gz`` or ``.bz2`` are opened through ``gzip`` / ``bz2`` in text mode.
    """

    name = "FASTA Load"
    description = "Loads sequences from FASTA files"
    link = "https://countess-project.github.io/CountESS/included-plugins/#fasta-load"
    version = VERSION

    file_types = [("FASTA", [".fasta", ".fa", ".fasta.gz", ".fa.gz", ".fasta.bz2", ".fa.bz2"])]

    # Names of the output columns.
    sequence_column = StringParam("Sequence Column", "sequence")
    header_column = StringParam("Header Column", "header")
    filename_column = StringParam("Filename Column", "filename")

    def read_file_to_dataframe(self, file_params, row_limit=None):
        """Read one FASTA file and return its records as a dataframe.

        Args:
            file_params: parameter group whose ``filename.value`` is the path
                of the file to read.
            row_limit: maximum number of records to read, or ``None`` for all.

        Returns:
            A dataframe with the sequence column, plus the header and
            filename columns when those are enabled.
        """
        filename = file_params.filename.value

        # Read the records while the file is open; build the frame afterwards
        # so the construction logic is written only once.
        if filename.endswith(".gz"):
            with gzip.open(filename, mode="rt", encoding="utf-8") as fh:
                records = list(_fasta_reader(fh, row_limit))
        elif filename.endswith(".bz2"):
            with bz2.open(filename, mode="rt", encoding="utf-8") as fh:
                records = list(_fasta_reader(fh, row_limit))
        else:
            with open(filename, "r", encoding="utf-8") as fh:
                records = list(_fasta_reader(fh, row_limit))

        # Naming the columns explicitly guarantees both temporary columns
        # exist even when the file holds no records, so the rename/drop
        # below cannot fail on a missing "__h".
        dataframe = pd.DataFrame(records, columns=["__s", "__h"])
        dataframe = dataframe.rename(columns={"__s": self.sequence_column.value})

        # Test the configured name, not the parameter object, so that a
        # blank name reliably disables the optional column.
        header_name = self.header_column.value
        if header_name:
            dataframe = dataframe.rename(columns={"__h": header_name})
        else:
            dataframe = dataframe.drop(columns=["__h"])

        filename_name = self.filename_column.value
        if filename_name:
            dataframe[filename_name] = clean_filename(filename)
        return dataframe
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ dev = [
[project.entry-points.countess_plugins]
collate = "countess.plugins.collate:CollatePlugin"
load_fastq = "countess.plugins.fastq:LoadFastqPlugin"
load_fasta = "countess.plugins.fastq:LoadFastaPlugin"
load_csv = "countess.plugins.csv:LoadCsvPlugin"
group_by = "countess.plugins.group_by:GroupByPlugin"
expression = "countess.plugins.expression:ExpressionPlugin"
Expand Down

0 comments on commit 7c74148

Please sign in to comment.