From 1254f35ff2f755acb6006ac65bd1ddbf8f3c066c Mon Sep 17 00:00:00 2001 From: "Marcus R. Breese" Date: Tue, 10 Dec 2024 01:09:57 -0500 Subject: [PATCH] updated fasta-split, made bgzwriter an outputstream --- .../ngsutils/cli/fasta/FastaSplit.java | 75 +++++++++++++++---- .../compgen/ngsutils/cli/tab/TabixSplit.java | 8 +- ...{BGZWriter.java => BGZipOutputStream.java} | 29 +++++-- 3 files changed, 87 insertions(+), 25 deletions(-) rename src/java/io/compgen/ngsutils/tabix/{BGZWriter.java => BGZipOutputStream.java} (81%) diff --git a/src/java/io/compgen/ngsutils/cli/fasta/FastaSplit.java b/src/java/io/compgen/ngsutils/cli/fasta/FastaSplit.java index 90c111a..66dd804 100644 --- a/src/java/io/compgen/ngsutils/cli/fasta/FastaSplit.java +++ b/src/java/io/compgen/ngsutils/cli/fasta/FastaSplit.java @@ -15,19 +15,32 @@ import io.compgen.cmdline.impl.AbstractCommand; import io.compgen.common.IterUtils; import io.compgen.common.StringLineReader; +import io.compgen.ngsutils.tabix.BGZipOutputStream; -@Command(name="fasta-split", desc="Split a FASTA file into a new file for each sequence present", category="fasta") +@Command(name="fasta-split", desc="Split a FASTA file into a new file for each sequence or a number of sequences", category="fasta") public class FastaSplit extends AbstractCommand { private String filename = null; private String template = ""; private Set valid = null; + private int maxSeqPerFile = -1; + private boolean gzip = false; - @Option(desc="Output template (new files will be named: template${name}.fa, set to - for stdout)", name="template") + @Option(desc="Output template (new files will be named: template${name}.fa or template${num}.fa , set to - for stdout)", name="template") public void setTemplate(String template) { this.template = template; } + @Option(desc="Compress output with bgzip", name="gz") + public void setGzip(boolean gzip) { + this.gzip = gzip; + } + + @Option(desc="Split into files of N sequences per file (default: split by each sequence)", name="split-count") + public void setMaxSeqPerFile(int maxSeqPerFile) { + this.maxSeqPerFile = maxSeqPerFile; + } + @UnnamedArg(name = "FILE [seq1 seq2...]") public void setFilename(String[] filename) throws CommandArgumentException { this.filename = filename[0]; @@ -44,33 +57,65 @@ public void exec() throws IOException, CommandArgumentException { if (filename == null) { throw new CommandArgumentException("Missing/invalid arguments!"); } - + StringLineReader reader = new StringLineReader(filename); OutputStream bos = null; + int seqCount = 0; + int fileCount = 0; + boolean isValidSeq = false; + for (String line: IterUtils.wrap(reader.iterator())) { if (line.charAt(0) == '>') { String name = line.substring(1).split("\\s",2)[0]; - - if (bos != null) { - if (bos != System.out) { - bos.close(); - } - } - + if (valid == null || valid.contains(name)) { - if (template.equals("-")) { + seqCount++; + isValidSeq = true; + + if (template.equals("-")) { bos = System.out; } else { - bos = new BufferedOutputStream(new FileOutputStream(template+name+".fa")); + if (maxSeqPerFile > 0) { + if (bos == null || seqCount > maxSeqPerFile) { + if (bos != null) { + if (bos != System.out) { + bos.close(); + } + } + fileCount++; + if (gzip) { + bos = new BufferedOutputStream(new BGZipOutputStream(template+fileCount+".fa.gz")); + } else { + bos = new BufferedOutputStream(new FileOutputStream(template+fileCount+".fa")); + } + seqCount = 1; + } + } else { + if (bos != null) { + if (bos != System.out) { + bos.close(); + } + } + if (gzip) { + bos = new BufferedOutputStream(new BGZipOutputStream(template+name+".fa.gz")); + } else { + bos = new BufferedOutputStream(new FileOutputStream(template+name+".fa")); + } + } } - System.err.println(name); + if (verbose) { + System.err.println(name); + } } else { bos = null; - System.err.println(name + " (skip)"); + isValidSeq = false; + if (verbose) { + System.err.println(name + " (skip)"); + } } } - if (bos != null) { + if (isValidSeq) { bos.write((line+"\n").getBytes()); } } diff --git a/src/java/io/compgen/ngsutils/cli/tab/TabixSplit.java b/src/java/io/compgen/ngsutils/cli/tab/TabixSplit.java index 2b86a29..ebd22e0 100644 --- a/src/java/io/compgen/ngsutils/cli/tab/TabixSplit.java +++ b/src/java/io/compgen/ngsutils/cli/tab/TabixSplit.java @@ -10,7 +10,7 @@ import io.compgen.cmdline.exceptions.CommandArgumentException; import io.compgen.cmdline.impl.AbstractOutputCommand; import io.compgen.common.IterUtils; -import io.compgen.ngsutils.tabix.BGZWriter; +import io.compgen.ngsutils.tabix.BGZipOutputStream; import io.compgen.ngsutils.tabix.TabixFile; @Command(name = "tabix-split", desc = "Splits a tabix file by ref/chrom", category = "annotation") @@ -69,7 +69,7 @@ public void exec() throws Exception { TabixFile tabix = new TabixFile(infile, verbose); String curSeq = null; - BGZWriter bgz = null; + BGZipOutputStream bgz = null; List headerLines = new ArrayList(); @@ -97,7 +97,7 @@ public void exec() throws Exception { bgz.close(); } curSeq = seq; - bgz = new BGZWriter(templFilename.replaceAll("\\{\\}", curSeq)); + bgz = new BGZipOutputStream(templFilename.replaceAll("\\{\\}", curSeq)); if (header) { for (String hl: headerLines) { bgz.writeString(hl+"\n"); @@ -112,7 +112,7 @@ public void exec() throws Exception { } curLineNum=0; fileno++; - bgz = new BGZWriter(templFilename.replaceAll("\\{\\}", ""+fileno)); + bgz = new BGZipOutputStream(templFilename.replaceAll("\\{\\}", ""+fileno)); if (header) { for (String hl: headerLines) { bgz.writeString(hl+"\n"); diff --git a/src/java/io/compgen/ngsutils/tabix/BGZWriter.java b/src/java/io/compgen/ngsutils/tabix/BGZipOutputStream.java similarity index 81% rename from src/java/io/compgen/ngsutils/tabix/BGZWriter.java rename to src/java/io/compgen/ngsutils/tabix/BGZipOutputStream.java index 93b8db3..dd8b2da 100644 --- a/src/java/io/compgen/ngsutils/tabix/BGZWriter.java +++ b/src/java/io/compgen/ngsutils/tabix/BGZipOutputStream.java @@ -11,7 +11,7 @@ import io.compgen.common.io.DataIO; -public class BGZWriter { +public class BGZipOutputStream extends OutputStream { private String filename; public String getFilename() { return filename; @@ -25,22 +25,34 @@ public String getFilename() { private int curpos = 0; - public BGZWriter(String filename) throws IOException { + public BGZipOutputStream(String filename) throws IOException { super(); this.filename = filename; this.os = new FileOutputStream(filename); } + // Each string/line should be written to the same block, so if + // this string would cause a block to get written in the middle, + // then flush the current block first. + public void writeString(String line) throws IOException { write(line.getBytes(Charset.defaultCharset()), true); } + public void writeBytes(byte[] bytes) throws IOException { write(bytes, false); } public void write(byte[] bytes, boolean keepTogether) throws IOException { + + // if we need to keep these bytes in the same block, flush the current block if + // {bytes} is large enough to cause a split block. + // + // note: writeBlock only operates if we have written anything (curpos > 0), so + // no need to worry about writing an empty block. + if (keepTogether && curpos + bytes.length >= this.uncompressedMaxBlock) { - flush(); + writeBlock(); } for (int i=0; i= this.uncompressedMaxBlock) { - flush(); + writeBlock(); } } public void close() throws IOException { - flush(); + writeBlock(); // The last block is empty and has a fixed 28 bytes int[] last = new int[] {0x1f,0x8b,0x08,0x04,0x00,0x00,0x00,0x00,0x00,0xff,0x06,0x00,0x42,0x43,0x02,0x00,0x1b,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; @@ -71,7 +83,7 @@ public void close() throws IOException { * Writes a block to the BGZip file * @throws IOException */ - private void flush() throws IOException { + private void writeBlock() throws IOException { // System.err.println("Flushing (curpos="+curpos+") "); if (curpos == 0 ) { return; @@ -115,4 +127,9 @@ private void flush() throws IOException { curpos = 0; } + + @Override + public void write(int b) throws IOException { + write((byte)b); + } }