From 34ef373ab2cd8015f5ef2914450196153e33ccb0 Mon Sep 17 00:00:00 2001 From: Andrea Telatin <15690844+telatin@users.noreply.github.com> Date: Mon, 13 Nov 2023 16:37:51 +0000 Subject: [PATCH] Update bibliography and paper markdown (#21) * Update paper.bib * Update paper.md * typos * add SeqFu to the list, since it's in the benchmark --- paper/paper.bib | 37 +++++++++++++++++++++++++------------ paper/paper.md | 10 +++++----- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 77cbbd51..74d36565 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -9,19 +9,32 @@ @software{Peter_hyperfine_2023 } @article{seqkit, - doi = {10.1371/journal.pone.0163962}, - author = {Shen, Wei AND Le, Shuai AND Li, Yan AND Hu, Fuquan}, - journal = {PLOS ONE}, - publisher = {Public Library of Science}, - title = {SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation}, - year = {2016}, - month = {10}, - volume = {11}, - url = {https://doi.org/10.1371/journal.pone.0163962}, - pages = {1-10}, - abstract = {FASTA and FASTQ are basic and ubiquitous formats for storing nucleotide and protein sequences. Common manipulations of FASTA/Q file include converting, searching, filtering, deduplication, splitting, shuffling, and sampling. Existing tools only implement some of these manipulations, and not particularly efficiently, and some are only available for certain operating systems. Furthermore, the complicated installation process of required packages and running environments can render these programs less user friendly. This paper describes a cross-platform ultrafast comprehensive toolkit for FASTA/Q processing. SeqKit provides executable binary files for all major operating systems, including Windows, Linux, and Mac OSX, and can be directly used without any dependencies or pre-configurations. SeqKit demonstrates competitive performance in execution time and memory usage compared to similar tools. 
The efficiency and usability of SeqKit enable researchers to rapidly accomplish common FASTA/Q file manipulations. SeqKit is open source and available on Github at https://github.com/shenwei356/seqkit.}, - number = {10}, + doi = {10.1371/journal.pone.0163962}, + url = {https://doi.org/10.1371/journal.pone.0163962}, + year = {2016}, + month = oct, + publisher = {Public Library of Science ({PLoS})}, + volume = {11}, + number = {10}, + pages = {e0163962}, + author = {Wei Shen and Shuai Le and Yan Li and Fuquan Hu}, + editor = {Quan Zou}, + title = {{SeqKit}: A Cross-Platform and Ultrafast Toolkit for {FASTA}/Q File Manipulation}, + journal = {{PLOS} {ONE}} +} +@article{seqfu, + doi = {10.3390/bioengineering8050059}, + url = {https://doi.org/10.3390/bioengineering8050059}, + year = {2021}, + month = may, + publisher = {{MDPI} {AG}}, + volume = {8}, + number = {5}, + pages = {59}, + author = {Andrea Telatin and Piero Fariselli and Giovanni Birolo}, + title = {{SeqFu}: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files}, + journal = {Bioengineering} } @software{seqtk, diff --git a/paper/paper.md b/paper/paper.md index 61cfc527..b03cafab 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -22,9 +22,9 @@ bibliography: paper.bib ## Statement of need There are still many gaps in basic command line tools for the handling of standard file formats in the field of bioinformatics. -Bioinformaticians have been able to use many tools to manipulate sequence data files in the fastq format, such as `seqkit` [@seqkit], `seqtk` [@seqtk] or FASTX-Toolkit [@fastx]. -These tools only accept paired end (PE) sequence data when split into multiple files per sample. -Additionally, these tools do not always allow for Unix-style pipe file control. Sometimes they require explicity input/output options instead of using `stdin` and `stdout`. 
+Bioinformaticians have been able to use many tools to manipulate sequence data files in the fastq format, such as `seqkit` [@seqkit], `seqtk` [@seqtk], FASTX-Toolkit [@fastx], or `seqfu` [@seqfu]. +These tools only accept paired-end (PE) sequence data when split into multiple files per sample. +Additionally, these tools do not always allow for Unix-style pipe file control. Sometimes they require explicit input/output options instead of using `stdin` and `stdout`. However, some bioinformaticians prefer to combine PE data from a single sample into one file using the interleaved fastq file format, but this format is not always well supported in mainstream tools. Here, we provide Fasten to the community to address these needs. @@ -46,14 +46,14 @@ Continuous integration was implemented in GitHub Actions for unit testing. Each executable is tested to make sure the expected output is obtained with each `git push` event. We also used GitHub Actions to automatically create a Docker container which is also available on the GitHub repo. -![Benchmarks comparing fasten with other analagous tools. From left to right, then to bottom: Trimming with a minimum quality score; converting fastq to fasta; interleaving R1 and R2 reads; kmer counting; normalizing read depth using kmer coverage; Searching for a sequence in a fastq file; downsampling reads; sorting fastq entries by either sequence or ID; and converting nonstandard fastq files to a format whose entries are four lines each, and selecting the first 100.\label{fig:benchmarks}](benchmarks.png) +![Benchmarks comparing fasten with other analogous tools. 
From left to right, then to bottom: trimming with a minimum quality score; converting fastq to fasta; interleaving R1 and R2 reads; kmer counting; normalizing read depth using kmer coverage; searching for a sequence in a fastq file; downsampling reads; sorting fastq entries by either sequence or ID; and converting nonstandard fastq files to a format whose entries are four lines each, and selecting the first 100.\label{fig:benchmarks}](benchmarks.png) ## Conclusions Fasten is a powerful manipulation suite for interleaved fastq files, written in Rust. We benchmarked Fasten on several categories. It has strengths as shown in Figure 1 but it does not occupy the fastest position in all cases. -Its major strengths include its competetive speeds, +Its major strengths include its competitive speeds, Unix-style pipes, paired-end handling, and the advantages afforded by the Rust language including documentation and stability.