Skip to content

Commit

Permalink
updated fasta-split, made bgzwriter an outputstream
Browse files Browse the repository at this point in the history
  • Loading branch information
mbreese committed Dec 10, 2024
1 parent d55a7ba commit 1254f35
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 25 deletions.
75 changes: 60 additions & 15 deletions src/java/io/compgen/ngsutils/cli/fasta/FastaSplit.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,32 @@
import io.compgen.cmdline.impl.AbstractCommand;
import io.compgen.common.IterUtils;
import io.compgen.common.StringLineReader;
import io.compgen.ngsutils.tabix.BGZipOutputStream;

@Command(name="fasta-split", desc="Split a FASTA file into a new file for each sequence present", category="fasta")
@Command(name="fasta-split", desc="Split a FASTA file into a new file for each sequence or a number of sequences", category="fasta")
public class FastaSplit extends AbstractCommand {

private String filename = null;
private String template = "";
private Set<String> valid = null;
private int maxSeqPerFile = -1;
private boolean gzip = false;

/**
 * Sets the output filename template (the option named "template").
 *
 * In exec() the template is used as a filename prefix: the sequence name or
 * the running file number is appended, followed by ".fa" (or ".fa.gz").
 * The special value "-" makes exec() write to System.out instead of files.
 *
 * @param template filename prefix, or "-" for stdout
 */
@Option(desc="Output template (new files will be named: template${name}.fa, set to - for stdout)", name="template")
@Option(desc="Output template (new files will be named: template${name}.fa or template${num}.fa , set to - for stdout)", name="template")
public void setTemplate(String template) {
this.template = template;
}

/**
 * Enables bgzip-compressed output (the option named "gz").
 *
 * When set, exec() wraps each output file in a BGZipOutputStream and names
 * it with a ".fa.gz" suffix instead of ".fa". The flag is not consulted
 * when the template is "-" (stdout output).
 *
 * @param gzip true to compress output files
 */
@Option(desc="Compress output with bgzip", name="gz")
public void setGzip(boolean gzip) {
this.gzip = gzip;
}

/**
 * Sets how many sequences go into each output file (the option named
 * "split-count").
 *
 * exec() only uses this mode when the value is greater than zero; output
 * files are then numbered (template + file number). With the default of -1,
 * or any other non-positive value, each sequence gets its own file named
 * after the sequence.
 *
 * @param maxSeqPerFile maximum number of sequences per output file
 */
@Option(desc="Split into files of N sequences per file (default: split by each sequence)", name="split-count")
public void setMaxSeqPerFile(int maxSeqPerFile) {
this.maxSeqPerFile = maxSeqPerFile;
}

@UnnamedArg(name = "FILE [seq1 seq2...]")
public void setFilename(String[] filename) throws CommandArgumentException {
this.filename = filename[0];
Expand All @@ -44,33 +57,65 @@ public void exec() throws IOException, CommandArgumentException {
if (filename == null) {
throw new CommandArgumentException("Missing/invalid arguments!");
}

StringLineReader reader = new StringLineReader(filename);
OutputStream bos = null;

int seqCount = 0;
int fileCount = 0;
boolean isValidSeq = false;

for (String line: IterUtils.wrap(reader.iterator())) {
if (line.charAt(0) == '>') {
String name = line.substring(1).split("\\s",2)[0];

if (bos != null) {
if (bos != System.out) {
bos.close();
}
}


if (valid == null || valid.contains(name)) {
if (template.equals("-")) {
seqCount++;
isValidSeq = true;

if (template.equals("-")) {
bos = System.out;
} else {
bos = new BufferedOutputStream(new FileOutputStream(template+name+".fa"));
if (maxSeqPerFile > 0) {
if (bos == null || seqCount > maxSeqPerFile) {
if (bos != null) {
if (bos != System.out) {
bos.close();
}
}
fileCount++;
if (gzip) {
bos = new BufferedOutputStream(new BGZipOutputStream(template+fileCount+".fa.gz"));
} else {
bos = new BufferedOutputStream(new FileOutputStream(template+fileCount+".fa"));
}
seqCount = 1;
}
} else {
if (bos != null) {
if (bos != System.out) {
bos.close();
}
}
if (gzip) {
bos = new BufferedOutputStream(new BGZipOutputStream(template+name+".fa.gz"));
} else {
bos = new BufferedOutputStream(new FileOutputStream(template+name+".fa"));
}
}
}
System.err.println(name);
if (verbose) {
System.err.println(name);
}
} else {
bos = null;
System.err.println(name + " (skip)");
isValidSeq = false;
if (verbose) {
System.err.println(name + " (skip)");
}
}
}
if (bos != null) {
if (isValidSeq) {
bos.write((line+"\n").getBytes());
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/java/io/compgen/ngsutils/cli/tab/TabixSplit.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import io.compgen.cmdline.exceptions.CommandArgumentException;
import io.compgen.cmdline.impl.AbstractOutputCommand;
import io.compgen.common.IterUtils;
import io.compgen.ngsutils.tabix.BGZWriter;
import io.compgen.ngsutils.tabix.BGZipOutputStream;
import io.compgen.ngsutils.tabix.TabixFile;

@Command(name = "tabix-split", desc = "Splits a tabix file by ref/chrom", category = "annotation")
Expand Down Expand Up @@ -69,7 +69,7 @@ public void exec() throws Exception {
TabixFile tabix = new TabixFile(infile, verbose);
String curSeq = null;

BGZWriter bgz = null;
BGZipOutputStream bgz = null;

List<String> headerLines = new ArrayList<String>();

Expand Down Expand Up @@ -97,7 +97,7 @@ public void exec() throws Exception {
bgz.close();
}
curSeq = seq;
bgz = new BGZWriter(templFilename.replaceAll("\\{\\}", curSeq));
bgz = new BGZipOutputStream(templFilename.replaceAll("\\{\\}", curSeq));
if (header) {
for (String hl: headerLines) {
bgz.writeString(hl+"\n");
Expand All @@ -112,7 +112,7 @@ public void exec() throws Exception {
}
curLineNum=0;
fileno++;
bgz = new BGZWriter(templFilename.replaceAll("\\{\\}", ""+fileno));
bgz = new BGZipOutputStream(templFilename.replaceAll("\\{\\}", ""+fileno));
if (header) {
for (String hl: headerLines) {
bgz.writeString(hl+"\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import io.compgen.common.io.DataIO;

public class BGZWriter {
public class BGZipOutputStream extends OutputStream {
private String filename;
public String getFilename() {
return filename;
Expand All @@ -25,22 +25,34 @@ public String getFilename() {
private int curpos = 0;


/**
 * Creates a writer for the given file: remembers the filename (returned by
 * getFilename()) and opens a FileOutputStream on it.
 *
 * @param filename path of the file to write
 * @throws IOException if the underlying FileOutputStream cannot be opened
 */
public BGZWriter(String filename) throws IOException {
public BGZipOutputStream(String filename) throws IOException {
super();
this.filename = filename;
this.os = new FileOutputStream(filename);
}

// Each string/line should be written to the same block, so if
// this string would cause a block to get written in the middle,
// then flush the current block first.

/**
 * Writes a string, keeping all of its bytes together in one block.
 *
 * Delegates to write(bytes, true), which writes out the current block first
 * if appending these bytes would reach the block size limit.
 *
 * NOTE(review): the string is encoded with Charset.defaultCharset(), so the
 * bytes written depend on the platform/JVM default encoding -- confirm this
 * is intended rather than a fixed charset such as UTF-8.
 *
 * @param line text to write (no newline is appended)
 * @throws IOException if writing a block fails
 */
public void writeString(String line) throws IOException {
write(line.getBytes(Charset.defaultCharset()), true);
}

/**
 * Writes raw bytes without the keep-together guarantee.
 *
 * Delegates to write(bytes, false): the bytes are appended one at a time and
 * a block is written whenever the buffer fills, so the data may end up
 * split across two blocks.
 *
 * @param bytes bytes to write
 * @throws IOException if writing a block fails
 */
public void writeBytes(byte[] bytes) throws IOException {
write(bytes, false);
}

public void write(byte[] bytes, boolean keepTogether) throws IOException {

// if we need to keep these bytes in the same block, flush the current block if
// {bytes} is large enough to cause a split block.
//
// note: writeBlock only operates if we have written anything (curpos > 0), so
// no need to worry about writing an empty block.

if (keepTogether && curpos + bytes.length >= this.uncompressedMaxBlock) {
flush();
writeBlock();
}
for (int i=0; i<bytes.length; i++) {
write(bytes[i]);
Expand All @@ -50,13 +62,13 @@ public void write(byte[] bytes, boolean keepTogether) throws IOException {
/**
 * Appends a single byte to the current uncompressed block buffer.
 *
 * This is an overload taking a byte; the int variant required by
 * OutputStream delegates here.
 *
 * @param b byte to buffer
 * @throws IOException if writing a full block fails
 */
public void write(byte b) throws IOException {
curBuffer[curpos++] = b;
// Buffer has reached the uncompressed block limit: write it out as a block.
if (curpos >= this.uncompressedMaxBlock) {
flush();
writeBlock();
}
}


public void close() throws IOException {
flush();
writeBlock();

// The last block is empty and has a fixed 28 bytes
int[] last = new int[] {0x1f,0x8b,0x08,0x04,0x00,0x00,0x00,0x00,0x00,0xff,0x06,0x00,0x42,0x43,0x02,0x00,0x1b,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
Expand All @@ -71,7 +83,7 @@ public void close() throws IOException {
* Writes a block to the BGZip file
* @throws IOException
*/
private void flush() throws IOException {
private void writeBlock() throws IOException {
// System.err.println("Flushing (curpos="+curpos+") ");
if (curpos == 0 ) {
return;
Expand Down Expand Up @@ -115,4 +127,9 @@ private void flush() throws IOException {

curpos = 0;
}

/**
 * Single-byte write required by OutputStream.
 *
 * Narrows the int to a byte (keeping only the low-order 8 bits) and hands it
 * to write(byte), which buffers it.
 *
 * @param b value whose low-order 8 bits are written
 * @throws IOException if writing a full block fails
 */
@Override
public void write(int b) throws IOException {
write((byte)b);
}
}

0 comments on commit 1254f35

Please sign in to comment.