Skip to content

Commit

Permalink
Trivial Fasta Reader implementation
Browse files Browse the repository at this point in the history
- The old implementation read the entire file into a buffer.
  That's not feasible for huge files.
  • Loading branch information
peri4n committed Sep 27, 2024
1 parent 51a2cfb commit cb64315
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 52 deletions.
92 changes: 48 additions & 44 deletions src/fasta.rs
Original file line number Diff line number Diff line change
@@ -1,58 +1,62 @@
use nom::{
bytes::complete::{tag, take_while},
multi::many0,
IResult,
};
use std::io::{BufReader, BufRead, Read};

use crate::dna::Dna;
pub struct FastaReader<R: Read> {
reader: BufReader<R>,
next_id: Option<String>,

pub fn parse_records(input: &str) -> IResult<&str, Vec<FastaDna>> {
many0(parse_record)(input)
}

fn from_fasta_body(ascii: &str) -> Dna {
let mut dna = Dna {
length: ascii.len(),
nucleotides: vec![0; Dna::bytes_to_store(ascii.len())],
};

let mut i = 0;
for c in ascii.chars() {
match c {
'\n' => {
continue;
}
'C' | 'c' => dna.init_with(i, 1),
'G' | 'g' => dna.init_with(i, 2),
'T' | 't' => dna.init_with(i, 3),
_ => dna.init_with(i, 0),
}

i += 1;
impl<R: Read> FastaReader<R> {
pub fn new(reader: R) -> Self {
Self { reader: BufReader::new(reader), next_id: None }
}

dna.trim(i);
dna
}

fn parse_record(input: &str) -> IResult<&str, FastaDna> {
let (input, id) = parse_id(input)?;
let (input, sequence) = parse_sequence(input)?;

Ok((input, FastaDna { id, sequence }))
}
impl<R: Read> Iterator for FastaReader<R> {
type Item = FastaDna;

fn parse_id(input: &str) -> IResult<&str, String> {
let (input, _) = tag(">")(input)?;
let (input, id) = take_while(|c| c != '\n')(input)?;
fn next(&mut self) -> Option<Self::Item> {
let mut id = self.next_id.take().unwrap_or_default();
let mut sequence = String::new();
//if let Ok(buffer) = self.reader.fill_buf() {
//
//} else {
//
//}

Ok((&input[1..], id.to_string()))
loop {
let mut line = String::new();
match self.reader.read_line(&mut line) {
// buffer is completely read
Ok(0) => {
if id.is_empty() {
return None;
} else {
return Some(FastaDna { id, sequence: Dna::from_ascii(&sequence) });
}
}
// buffer is not empty yet
Ok(_) => {
if line.starts_with('>') {
self.next_id = Some(line[1..].trim().to_string());
if !id.is_empty() {
return Some(FastaDna { id, sequence: Dna::from_ascii(&sequence) });
}
id = line[1..].trim().to_string();
sequence.clear();
} else {
sequence.push_str(&line.trim());
}
}
Err(_) => {
return None;
}
}
}
}
}

fn parse_sequence(input: &str) -> IResult<&str, Dna> {
let (input, sequence) = take_while(|c| c != '>')(input)?;
Ok((input, from_fasta_body(sequence)))
}
use crate::dna::Dna;

#[derive(Debug, PartialEq)]
pub struct FastaDna {
Expand Down
2 changes: 1 addition & 1 deletion tests/dna_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ fn can_be_appended() {

#[test]
fn kmer_cache_contains_correct_value() {
let cache = nuc::dna::KMER_CACHE;
let cache = &nuc::dna::KMER_CACHE;
assert_eq!(cache[0], [4, 0, 0, 0]);
assert_eq!(cache[1], [3, 1, 0, 0]);
assert_eq!(cache[2], [3, 0, 1, 0]);
Expand Down
16 changes: 9 additions & 7 deletions tests/fasta_test.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@

use nuc::{
dna::Dna,
fasta::{self, FastaDna},
fasta::{self, FastaDna, FastaReader},
};

#[test]
fn can_read_an_example_fasta_file() {
let fasta_content = ">Some Identifier\n\
let reader = FastaReader::new(
">Some Identifier\n\
ATGCCGTA\n\
>Another Identifier\n\
CTAACGAA\n\
>Yet Another Identifier\n\
ATAC\n\
ATAAGTAGGG";
let records = fasta::parse_records(fasta_content).unwrap().1;
ATAAGTAGGG"
.as_bytes()
);

assert_eq!(
records,
reader.into_iter().collect::<Vec<_>>(),
vec![
FastaDna {
id: "Some Identifier".to_string(),
Expand All @@ -35,6 +37,6 @@ fn can_read_an_example_fasta_file() {

#[test]
fn can_read_an_empty_fasta_file() {
let records = fasta::parse_records("").unwrap().1;
let records = FastaReader::new("".as_bytes()).into_iter().collect::<Vec<_>>();
assert_eq!(records, vec![]);
}

0 comments on commit cb64315

Please sign in to comment.