Skip to content

Commit

Permalink
Prototype FASTA parser
Browse files Browse the repository at this point in the history
Closes #10
  • Loading branch information
peri4n committed Aug 6, 2024
1 parent 7c93a5c commit 08d0432
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 55 deletions.
25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
nom = "7.1.3"

[lib]
name = "nuc"
Expand Down
89 changes: 34 additions & 55 deletions src/dna.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ impl Dna {
}
}

/// Returns the value of the `length` field, i.e. the sequence length that the
/// constructors record (for `from_fasta_body`, the number of nucleotides stored).
pub fn len(&self) -> usize {
self.length
}

pub fn from_bytes(bytes: &[u8]) -> Dna {
Dna {
length: bytes.len(),
Expand All @@ -46,6 +50,36 @@ impl Dna {
dna
}

/// Builds a [`Dna`] from the body of a FASTA record.
///
/// The body may span several lines. Line terminators (`\n` and `\r`) are
/// skipped and do not count towards the sequence length. `C`, `G` and `T`
/// (either case) are stored as codes 1, 2 and 3; every other character is
/// stored as code 0.
pub(crate) fn from_fasta_body(ascii: &str) -> Dna {
    // `ascii.len()` is an upper bound on the number of nucleotides, because
    // line terminators are dropped. The storage is shrunk to fit afterwards.
    let mut dna = Dna {
        length: ascii.len(),
        nucleotides: vec![0; Dna::bytes_to_store(ascii.len())],
    };

    // Number of nucleotides written so far; also the next write position.
    let mut count = 0;
    for c in ascii.chars() {
        let code = match c {
            // Line terminators are layout, not sequence data.
            '\n' | '\r' => continue,
            'C' | 'c' => 1,
            'G' | 'g' => 2,
            'T' | 't' => 3,
            _ => 0,
        };
        dna.init_with(count, code);
        count += 1;
    }

    // `count` already excludes line terminators, so it must not be reduced
    // again: doing so would cut off stored nucleotides (or underflow).
    dna.nucleotides.truncate(Dna::bytes_to_store(count));
    dna.length = count;

    dna
}

#[inline(always)]
pub fn address(&self, index: usize) -> (usize, u8) {
let block = index / NUCS_PER_BLOCK;
Expand Down Expand Up @@ -173,58 +207,3 @@ impl Ord for Dna {
PartialOrd::partial_cmp(self, other).unwrap()
}
}

#[cfg(test)]
mod test {
    use super::Dna;

    /// Converts a list of ASCII sequences into `Dna` values, preserving order.
    fn dna_list(sequences: &[&str]) -> Vec<Dna> {
        sequences
            .iter()
            .copied()
            .map(|ascii| Dna::from_ascii(ascii))
            .collect()
    }

    /// Parsing a string yields the expected per-position codes and length.
    #[test]
    fn can_be_created_from_a_string() {
        let parsed = Dna::try_from("ATGCCGTA").unwrap();

        // Expected codes for A, T, G, C, C, G, T, A in that order.
        let expected_codes = [0, 3, 2, 1, 1, 2, 3, 0];
        for (position, code) in (0..).zip(expected_codes) {
            assert_eq!(parsed.get(position), code);
        }
        assert_eq!(parsed.length, 8);
    }

    /// Sorting a vector of sequences yields the expected order.
    #[test]
    fn can_be_sorted() {
        let mut shuffled = dna_list(&["ATGCCGTA", "CTAACGAA", "ATAC", "ATAA", "GTAGGG"]);
        shuffled.sort();

        let ordered = dna_list(&["ATAA", "ATAC", "ATGCCGTA", "CTAACGAA", "GTAGGG"]);
        assert_eq!(shuffled, ordered);
    }

    /// Appending several sequences (including an empty one) concatenates them.
    #[test]
    fn can_be_appended() {
        let mut combined = Dna::from_ascii("ATGCCGTA");
        for chunk in ["AAA", "CCC", "GGG", "TTT", ""] {
            combined.append(&Dna::from_ascii(chunk));
        }

        assert_eq!(combined.to_string(), "ATGCCGTAAAACCCGGGTTT");
    }
}
36 changes: 36 additions & 0 deletions src/fasta.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use nom::{
bytes::complete::{tag, take_while},
multi::many0,
IResult,
};

use crate::dna::Dna;

/// Parses zero or more consecutive FASTA records from the start of `input`.
///
/// Returns the unparsed remainder together with the records collected so far;
/// parsing stops at the first position where `parse_record` no longer matches.
pub fn parse_records(input: &str) -> IResult<&str, Vec<FastaDna>> {
    let mut all_records = many0(parse_record);
    all_records(input)
}

/// Parses one FASTA record: a header line followed by its sequence body.
fn parse_record(input: &str) -> IResult<&str, FastaDna> {
    let (after_header, id) = parse_id(input)?;

    parse_sequence(after_header).map(|(rest, sequence)| (rest, FastaDna { id, sequence }))
}

/// Parses a FASTA header line and returns its identifier.
///
/// The header must start with `>`. The identifier is everything after that
/// marker up to (not including) the end of the line. The returned remainder
/// starts on the line after the header, or is empty when the header is the
/// last thing in the input.
fn parse_id(input: &str) -> IResult<&str, String> {
    let (input, _) = tag(">")(input)?;
    let (input, id) = take_while(|c| c != '\n')(input)?;

    // `take_while` stops either at '\n' or at end of input. Only skip the
    // newline when it is actually there; slicing `[1..]` on an empty
    // remainder would panic for a header without a trailing newline.
    let rest = input.strip_prefix('\n').unwrap_or(input);
    // Drop the carriage return left behind by CRLF line endings.
    let id = id.strip_suffix('\r').unwrap_or(id);

    Ok((rest, id.to_string()))
}

/// Parses a record body: everything up to the next `>` (or end of input),
/// converted with `Dna::from_fasta_body`.
fn parse_sequence(input: &str) -> IResult<&str, Dna> {
    let is_body_char = |c: char| c != '>';

    take_while(is_body_char)(input).map(|(rest, body)| (rest, Dna::from_fasta_body(body)))
}

/// A single FASTA record as produced by `parse_record`.
#[derive(Debug, PartialEq)]
pub struct FastaDna {
/// Header text following the leading `>` on the record's header line.
pub id: String,
/// Sequence body of the record, built with `Dna::from_fasta_body`.
pub sequence: Dna,
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pub mod dna;
pub mod fasta;
51 changes: 51 additions & 0 deletions tests/dna_test.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use nuc::dna::Dna;

/// Parsing a string yields the expected per-position codes and length.
#[test]
fn can_be_created_from_a_string() {
    let parsed = Dna::try_from("ATGCCGTA").unwrap();

    // Expected codes for A, T, G, C, C, G, T, A in that order.
    let expected_codes = [0, 3, 2, 1, 1, 2, 3, 0];
    for (position, code) in (0..).zip(expected_codes) {
        assert_eq!(parsed.get(position), code);
    }
    assert_eq!(parsed.len(), 8);
}

/// Sorting a vector of sequences yields the expected order.
#[test]
fn can_be_sorted() {
    let to_dna = |ascii: &str| Dna::from_ascii(ascii);

    let mut shuffled: Vec<Dna> = ["ATGCCGTA", "CTAACGAA", "ATAC", "ATAA", "GTAGGG"]
        .into_iter()
        .map(to_dna)
        .collect();
    shuffled.sort();

    let ordered: Vec<Dna> = ["ATAA", "ATAC", "ATGCCGTA", "CTAACGAA", "GTAGGG"]
        .into_iter()
        .map(to_dna)
        .collect();

    assert_eq!(shuffled, ordered);
}

/// Appending several sequences (including an empty one) concatenates them.
#[test]
fn can_be_appended() {
    let mut combined = Dna::from_ascii("ATGCCGTA");
    for chunk in ["AAA", "CCC", "GGG", "TTT", ""] {
        combined.append(&Dna::from_ascii(chunk));
    }

    assert_eq!(combined.to_string(), "ATGCCGTAAAACCCGGGTTT");
}
40 changes: 40 additions & 0 deletions tests/fasta_test.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

use nuc::{
dna::Dna,
fasta::{self, FastaDna},
};

/// A three-record FASTA document, the last record spanning two lines, parses
/// into the expected identifiers and sequences.
#[test]
fn can_read_an_example_fasta_file() {
    let fasta_content = ">Some Identifier\n\
                         ATGCCGTA\n\
                         >Another Identifier\n\
                         CTAACGAA\n\
                         >Yet Another Identifier\n\
                         ATAC\n\
                         ATAAGTAGGG";

    let (_, records) = fasta::parse_records(fasta_content).unwrap();

    // (identifier, sequence with line breaks removed) for each record.
    let expected: Vec<FastaDna> = [
        ("Some Identifier", "ATGCCGTA"),
        ("Another Identifier", "CTAACGAA"),
        ("Yet Another Identifier", "ATACATAAGTAGGG"),
    ]
    .into_iter()
    .map(|(id, sequence)| FastaDna {
        id: id.to_string(),
        sequence: Dna::from_ascii(sequence),
    })
    .collect();

    assert_eq!(records, expected);
}

/// Empty input parses successfully into an empty record list.
#[test]
fn can_read_an_empty_fasta_file() {
    let (_, records) = fasta::parse_records("").unwrap();

    assert_eq!(records, Vec::<FastaDna>::new());
}

0 comments on commit 08d0432

Please sign in to comment.