Commit e731308

Merge branch 'main' into nick/genomic-variants
nickzoic committed Sep 20, 2024
2 parents 3d097ec + 10694c9 commit e731308
Showing 3 changed files with 88 additions and 5 deletions.
countess/plugins/fastq.py (51 additions, 5 deletions)

@@ -4,15 +4,16 @@
 from typing import Iterable, Optional

 import pandas as pd
+from fqfa.fasta.fasta import parse_fasta_records  # type: ignore
 from fqfa.fastq.fastq import parse_fastq_reads  # type: ignore

 from countess import VERSION
-from countess.core.parameters import BooleanParam, FloatParam
+from countess.core.parameters import BooleanParam, FloatParam, StringParam
 from countess.core.plugins import PandasInputFilesPlugin
 from countess.utils.files import clean_filename


-def _file_reader(
+def _fastq_reader(
     file_handle, min_avg_quality: float, row_limit: Optional[int] = None, filename: str = ""
 ) -> Iterable[dict[str, str]]:
     for fastq_read in islice(parse_fastq_reads(file_handle), 0, row_limit):
@@ -47,13 +48,13 @@ def read_file_to_dataframe(self, file_params, row_limit=None):

         if filename.endswith(".gz"):
             with gzip.open(filename, mode="rt", encoding="utf-8") as fh:
-                dataframe = pd.DataFrame(_file_reader(fh, min_avg_quality, row_limit, filename))
+                dataframe = pd.DataFrame(_fastq_reader(fh, min_avg_quality, row_limit, filename))
         elif filename.endswith(".bz2"):
             with bz2.open(filename, mode="rt", encoding="utf-8") as fh:
-                dataframe = pd.DataFrame(_file_reader(fh, min_avg_quality, row_limit, filename))
+                dataframe = pd.DataFrame(_fastq_reader(fh, min_avg_quality, row_limit, filename))
         else:
             with open(filename, "r", encoding="utf-8") as fh:
-                dataframe = pd.DataFrame(_file_reader(fh, min_avg_quality, row_limit, filename))
+                dataframe = pd.DataFrame(_fastq_reader(fh, min_avg_quality, row_limit, filename))

         group_columns = ["sequence"]

@@ -78,3 +79,48 @@ def read_file_to_dataframe(self, file_params, row_limit=None):
             return dataframe.assign(count=1).groupby(group_columns).count()
         else:
             return dataframe
+
+
+def _fasta_reader(file_handle, row_limit: Optional[int] = None) -> Iterable[dict[str, str]]:
+    for header, sequence in islice(parse_fasta_records(file_handle), 0, row_limit):
+        yield {
+            "__s": sequence,
+            "__h": header,
+        }
+
+
+class LoadFastaPlugin(PandasInputFilesPlugin):
+    name = "FASTA Load"
+    description = "Loads sequences from FASTA files"
+    link = "https://countess-project.github.io/CountESS/included-plugins/#fasta-load"
+    version = VERSION
+
+    file_types = [("FASTA", [".fasta", ".fa", ".fasta.gz", ".fa.gz", ".fasta.bz2", ".fa.bz2"])]
+
+    sequence_column = StringParam("Sequence Column", "sequence")
+    header_column = StringParam("Header Column", "header")
+    filename_column = StringParam("Filename Column", "filename")
+
+    def read_file_to_dataframe(self, file_params, row_limit=None):
+        filename = file_params.filename.value
+
+        if filename.endswith(".gz"):
+            with gzip.open(filename, mode="rt", encoding="utf-8") as fh:
+                dataframe = pd.DataFrame(_fasta_reader(fh, row_limit))
+        elif filename.endswith(".bz2"):
+            with bz2.open(filename, mode="rt", encoding="utf-8") as fh:
+                dataframe = pd.DataFrame(_fasta_reader(fh, row_limit))
+        else:
+            with open(filename, "r", encoding="utf-8") as fh:
+                dataframe = pd.DataFrame(_fasta_reader(fh, row_limit))
+
+        dataframe.rename(columns={"__s": self.sequence_column.value}, inplace=True)
+
+        if self.header_column:
+            dataframe.rename(columns={"__h": self.header_column.value}, inplace=True)
+        else:
+            dataframe.drop(columns=["__h"], inplace=True)
+
+        if self.filename_column:
+            dataframe[self.filename_column.value] = clean_filename(filename)
+        return dataframe
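
As a point of reference, here is a minimal sketch of driving the new FASTA reader directly, outside the plugin framework. It only uses names that appear in the diff above (_fasta_reader and the internal "__s"/"__h" columns); the file path example.fa and the hand-rolled rename are illustrative, not part of the commit.

# Sketch only: exercises _fasta_reader by hand; "example.fa" is a placeholder path.
import pandas as pd

from countess.plugins.fastq import _fasta_reader

with open("example.fa", "r", encoding="utf-8") as fh:
    dataframe = pd.DataFrame(_fasta_reader(fh, row_limit=100))

# LoadFastaPlugin renames the internal columns via its StringParams; here we apply
# the same default names manually.
dataframe = dataframe.rename(columns={"__s": "sequence", "__h": "header"})
print(dataframe.head())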
pyproject.toml (1 addition, 0 deletions)

@@ -52,6 +52,7 @@ dev = [
 [project.entry-points.countess_plugins]
 collate = "countess.plugins.collate:CollatePlugin"
 load_fastq = "countess.plugins.fastq:LoadFastqPlugin"
+load_fasta = "countess.plugins.fastq:LoadFastaPlugin"
 load_csv = "countess.plugins.csv:LoadCsvPlugin"
 group_by = "countess.plugins.group_by:GroupByPlugin"
 expression = "countess.plugins.expression:ExpressionPlugin"
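
The new load_fasta entry point is what makes LoadFastaPlugin discoverable at runtime. A quick way to confirm the registration in an installed environment is via importlib.metadata; this is a generic sketch (Python 3.10+ shown), not necessarily the exact code CountESS uses for plugin discovery.

# Sketch: list the countess_plugins entry points declared in pyproject.toml.
# Requires Python 3.10+ for entry_points(group=...); CountESS's own discovery code may differ.
from importlib.metadata import entry_points

for ep in entry_points(group="countess_plugins"):
    print(ep.name, "->", ep.value)
    if ep.name == "load_fasta":
        plugin_class = ep.load()  # countess.plugins.fastq.LoadFastaPlugin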
tests/test_datatable.ini (36 additions, 0 deletions)

@@ -0,0 +1,36 @@
+[DataTable]
+_module = countess.plugins.data_table
+_class = DataTablePlugin
+_version = 0.0.69
+_hash = 5d54face788a55338e1873cd0c29820317e805c06f2fe9d95450f3f38153392c
+_sort = 0 0
+_position = 500 500
+columns.0.name = 'aaa'
+columns.0.type = 'string'
+columns.0.index = False
+columns.1.name = 'bbb'
+columns.1.type = 'number'
+columns.1.index = False
+columns.2.name = 'ccc'
+columns.2.type = 'boolean'
+columns.2.index = False
+rows.0.aaa = ''
+rows.0.bbb = None
+rows.0.ccc = False
+rows.1.aaa = ''
+rows.1.bbb = None
+rows.1.ccc = False
+
+[CSV Save]
+_module = countess.plugins.csv
+_class = SaveCsvPlugin
+_version = 0.0.69
+_hash = eeb7325e4c7649ab971759c5120eb0ffc8684495a4ddc7a4dfb395c3b9cf1e60
+_sort = 0 0
+_position = 581 559
+_parent.0 = DataTable
+header = True
+filename = None
+delimiter = ','
+quoting = False
+
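
The new test fixture chains a DataTable source into a CSV Save sink, with columns and rows flattened into dotted keys. For a quick look at that structure, the file parses with the standard-library configparser; this is a hypothetical inspection sketch, not CountESS's own config loader.

# Sketch: inspect tests/test_datatable.ini with configparser (values come back as raw
# strings, quotes included); CountESS's real loader interprets these keys itself.
from configparser import ConfigParser

config = ConfigParser()
config.read("tests/test_datatable.ini")

print(config["DataTable"]["columns.0.name"])   # prints 'aaa' (with quotes)
print(config["CSV Save"]["_parent.0"])         # prints DataTable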