From 29882a0d2a18de0595e38555d19389982e3937de Mon Sep 17 00:00:00 2001 From: Kelly Westbrooks Date: Tue, 10 Jun 2014 14:39:13 -0700 Subject: [PATCH] Fixed some FASTA parsing bugs --- .../cs/amplab/fastaparser/FastaReader.java | 134 ++++++++---------- .../amplab/fastaparser/FastaReaderTest.java | 12 +- 2 files changed, 61 insertions(+), 85 deletions(-) diff --git a/smash4j/fastaparser/src/main/java/edu/berkeley/cs/amplab/fastaparser/FastaReader.java b/smash4j/fastaparser/src/main/java/edu/berkeley/cs/amplab/fastaparser/FastaReader.java index 0cdc009..be3e8e2 100644 --- a/smash4j/fastaparser/src/main/java/edu/berkeley/cs/amplab/fastaparser/FastaReader.java +++ b/smash4j/fastaparser/src/main/java/edu/berkeley/cs/amplab/fastaparser/FastaReader.java @@ -1,15 +1,14 @@ package edu.berkeley.cs.amplab.fastaparser; import com.google.common.base.Function; -import com.google.common.base.Optional; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; +import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import java.nio.charset.StandardCharsets; import java.util.Map; public class FastaReader { @@ -17,13 +16,10 @@ public class FastaReader { public interface Callback { public interface FastaFile { - - Map info(); - - Optional get(String contig, int beginIndex, int endIndex); + String get(String contigName, int beginIndex, int endIndex); } - X read(FastaFile fastaFile) throws Exception; + X read(Map info, FastaFile fastaFile) throws Exception; } public static FastaReader create(File fastaFile) throws IOException { @@ -51,80 +47,62 @@ private FastaReader(File fastaFile, FastaIndex index) { public X read(Callback callback) throws Exception { try (RandomAccessFile file = new RandomAccessFile(fastaFile, "r")) { try (FileChannel channel = file.getChannel()) { + class Contig { + + private final ByteBuffer contig; + private final String contigName; + private final int basesPerLine; + + Contig(ByteBuffer contig, String contigName, int basesPerLine) { + this.contig = contig; + this.contigName = contigName; + this.basesPerLine = basesPerLine; + } + + String get(int beginIndex, int endIndex) { + StringBuilder builder = new StringBuilder(); + for (int end = accountForNewlines(endIndex), i = accountForNewlines(beginIndex); + i < end; ++i) { + char c = (char) contig.get(i); + if (Character.isAlphabetic(c) || '-' == c) { + builder.append(c); + } else if ('\n' != c) { + throw new IllegalStateException(String.format( + "Illegal character %c on contig \"%s\" at position %d", c, contigName, i)); + } + } + return builder.toString(); + } + + int length() { + int limit = contig.limit(); + return limit - limit / basesPerLine; + } + + private int accountForNewlines(int i) { + return i + i / basesPerLine; + } + } + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (FastaIndex.Entry entry : index.entries()) { + long length = entry.length(); + int bases = entry.bases(); + String name = entry.name(); + builder.put(name, new Contig(channel.map(FileChannel.MapMode.READ_ONLY, entry.offset(), + (length / bases) * entry.bytes() + length % bases), name, entry.bases())); + } + final Map chromosomes = builder.build(); return callback.read( + Maps.transformValues(chromosomes, + new Function() { + @Override public Integer apply(Contig contig) { + return contig.length(); + } + }), new Callback.FastaFile() { - - class Contig { - - final CharSequence contig; - final int basesPerLine; - - Contig(CharSequence contig, int basesPerLine) { - this.contig = contig; - this.basesPerLine = basesPerLine; - } - } - - private final Function length = - new Function() { - @Override public Integer apply(Contig contig) { - return contig.contig.length(); - } - }; - - private final Map chromosomes = chromosomes(); - @Override - public Map info() { - return Maps.transformValues(chromosomes, length); - } - - @Override public Optional get( - String contig, final int beginIndex, final int endIndex) { - return Optional.fromNullable(chromosomes.get(contig)) - .transform( - new Function() { - @Override public String apply(Contig contig) { - CharSequence string = contig.contig.subSequence( - beginIndex + beginIndex / contig.basesPerLine, - endIndex + endIndex / contig.basesPerLine); - int length = string.length(); - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < length; ++i) { - char c = string.charAt(i); - switch (c) { - case 'A': case 'C': case 'G': case 'T': case 'U': case 'R': - case 'Y': case 'K': case 'M': case 'S': case 'W': case 'B': - case 'D': case 'H': case 'V': case 'N': case 'X': case '-': - builder.append(c); - break; - case '\n': - break; - default: - throw new IllegalStateException( - String.format("Illegal character: %c", c)); - } - } - return builder.toString(); - } - }); - } - - private Map chromosomes() throws IOException { - final ImmutableMap.Builder chromosomes = ImmutableMap.builder(); - for (FastaIndex.Entry entry : index.entries()) { - long length = entry.length(); - int bases = entry.bases(); - chromosomes.put( - entry.name(), - new Contig( - StandardCharsets.UTF_8.decode(channel.map( - FileChannel.MapMode.READ_ONLY, - entry.offset(), - (length / bases) * entry.bytes() + length % bases)), - entry.bases())); - } - return chromosomes.build(); + public String get(String contigName, final int beginIndex, final int endIndex) { + return chromosomes.get(contigName).get(beginIndex, endIndex); } }); } diff --git a/smash4j/fastaparser/src/test/java/edu/berkeley/cs/amplab/fastaparser/FastaReaderTest.java b/smash4j/fastaparser/src/test/java/edu/berkeley/cs/amplab/fastaparser/FastaReaderTest.java index 8e2d381..a55b01b 100644 --- a/smash4j/fastaparser/src/test/java/edu/berkeley/cs/amplab/fastaparser/FastaReaderTest.java +++ b/smash4j/fastaparser/src/test/java/edu/berkeley/cs/amplab/fastaparser/FastaReaderTest.java @@ -2,9 +2,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertSame; -import static org.junit.Assert.assertTrue; -import com.google.common.base.Optional; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -58,8 +56,8 @@ public void testFastaReader() throws Exception { index = ImmutableList.copyOf(fasta.entrySet()); private final int indexSize = index.size(); - @Override public FastaReaderTest read(FastaReader.Callback.FastaFile fastaFile) - throws Exception { + @Override public FastaReaderTest read(Map info, + FastaReader.Callback.FastaFile fastaFile) throws Exception { int numberOfQueries = NUMBER_OF_QUERIES.nextInt(); for (int i = 0; i < numberOfQueries; ++i) { Map.Entry entry = index.get(RANDOM.nextInt(indexSize)); @@ -67,9 +65,9 @@ public void testFastaReader() throws Exception { int queryLength = QUERY_LENGTH.nextInt(), queryStart = Random.create(0, contig.length() - queryLength - 1).nextInt(), queryEnd = queryStart + queryLength; - Optional optional = fastaFile.get(contigName, queryStart, queryEnd); - assertTrue(optional.isPresent()); - assertEquals(fasta.get(contigName).substring(queryStart, queryEnd), optional.get()); + assertEquals( + fasta.get(contigName).substring(queryStart, queryEnd), + fastaFile.get(contigName, queryStart, queryEnd)); } return FastaReaderTest.this; }