From c5bdb600fd171a6b6b9e9b5a9b59c8c2b298a7e4 Mon Sep 17 00:00:00 2001 From: Luca Pireddu Date: Thu, 3 Jul 2014 12:12:51 +0200 Subject: [PATCH 1/4] Return original id line with fastq tuple --- src/fi/aalto/seqpig/io/FastqLoader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fi/aalto/seqpig/io/FastqLoader.java b/src/fi/aalto/seqpig/io/FastqLoader.java index 7ed8c55..a77c4b8 100644 --- a/src/fi/aalto/seqpig/io/FastqLoader.java +++ b/src/fi/aalto/seqpig/io/FastqLoader.java @@ -95,8 +95,6 @@ public Tuple getNext() throws IOException { Text fastqrec_name = ((FastqRecordReader)in).getCurrentKey(); SequencedFragment fastqrec = ((FastqRecordReader)in).getCurrentValue(); - //mProtoTuple.add(new String(fastqrec_name.toString())); - mProtoTuple.add(fastqrec.getInstrument()); mProtoTuple.add(fastqrec.getRunNumber()); mProtoTuple.add(fastqrec.getFlowcellId()); @@ -110,6 +108,7 @@ public Tuple getNext() throws IOException { mProtoTuple.add(fastqrec.getIndexSequence()); mProtoTuple.add(fastqrec.getSequence().toString()); mProtoTuple.add(fastqrec.getQuality().toString()); + mProtoTuple.add(fastqrec_name.toString()); Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple); mProtoTuple = null; @@ -152,6 +151,7 @@ public ResourceSchema getSchema(String location, Job job) throws IOException { s.add(new Schema.FieldSchema("index_sequence", DataType.CHARARRAY)); s.add(new Schema.FieldSchema("sequence", DataType.CHARARRAY)); s.add(new Schema.FieldSchema("quality", DataType.CHARARRAY)); + s.add(new Schema.FieldSchema("id", DataType.CHARARRAY)); return new ResourceSchema(s); } From 6aa96b165b4af22a6146a6447715563f9762af3d Mon Sep 17 00:00:00 2001 From: Luca Pireddu Date: Thu, 3 Jul 2014 12:15:36 +0200 Subject: [PATCH 2/4] If present in the tuple, use the "id" field as the fastq id line --- src/fi/aalto/seqpig/io/FastqStorer.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/fi/aalto/seqpig/io/FastqStorer.java 
b/src/fi/aalto/seqpig/io/FastqStorer.java index e09e362..9b49376 100644 --- a/src/fi/aalto/seqpig/io/FastqStorer.java +++ b/src/fi/aalto/seqpig/io/FastqStorer.java @@ -171,8 +171,14 @@ public void putNext(Tuple f) throws IOException { fastqrec.setQuality(new Text((String)f.get(index))); } + Text key = null; + index = getFieldIndex("id", allFastqFieldNames); + if(index > -1 && DataType.findType(f.get(index)) == DataType.CHARARRAY) { + key = new Text((String)f.get(index)); + } + try { - writer.write(null, fastqrec); + writer.write(key, fastqrec); } catch (InterruptedException e) { throw new IOException(e); } From 6f7b3e8578da6ada2a8e1e32b674a2e6904be58b Mon Sep 17 00:00:00 2001 From: Luca Pireddu Date: Thu, 7 Aug 2014 10:09:20 +0200 Subject: [PATCH 3/4] Update comment --- src/fi/aalto/seqpig/io/FastqLoader.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fi/aalto/seqpig/io/FastqLoader.java b/src/fi/aalto/seqpig/io/FastqLoader.java index a77c4b8..86e2295 100644 --- a/src/fi/aalto/seqpig/io/FastqLoader.java +++ b/src/fi/aalto/seqpig/io/FastqLoader.java @@ -77,6 +77,7 @@ public class FastqLoader extends LoadFunc implements LoadMetadata { // index_sequence: string // sequence: string // quality: string (note: we assume that encoding chosen on command line!!!) 
+ // id: string public FastqLoader() {} From 3da8b77137f8e514c4f5c6544ba536d4601260e9 Mon Sep 17 00:00:00 2001 From: Luca Pireddu Date: Thu, 7 Aug 2014 10:09:34 +0200 Subject: [PATCH 4/4] Update SeqPig reference to document new fastq "id" field --- doc/seqpig_reference.tex | 48 ++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/doc/seqpig_reference.tex b/doc/seqpig_reference.tex index 804d2f0..5f4117e 100644 --- a/doc/seqpig_reference.tex +++ b/doc/seqpig_reference.tex @@ -199,12 +199,10 @@ \subsubsection{\texttt{BamStorer} and \texttt{SamStorer}} & \emph{attributes} & \texttt{map} & SAMRecord attributes \end{tabular} -\subsubsection{\texttt{FastqLoader} and \texttt{QseqLoader}} +\subsubsection{\texttt{FastqLoader}} -Both loaders for unaligned read file formats Fastq and Qseq -essentially provide the same output schema for the tuple field names -they produce. Note that some fields that are not present in the input -data may remain empty. +The loader for Fastq files provides the following output schema. Note that +fields that are not present in the input data remain empty. \begin{tabular}{lp{0.15\textwidth}p{0.15\textwidth}p{0.4\textwidth}} Usage: & \multicolumn{3}{l}{} @@ -227,17 +225,23 @@ \subsubsection{\texttt{FastqLoader} and \texttt{QseqLoader}} & \emph{control\_number} & \texttt{integer} & control number\\ & \emph{index\_sequence} & \texttt{chararray} & index sequence\\ & \emph{sequence} & \texttt{chararray} & read bases\\ -& \emph{quality} & \texttt{chararray} & base qualities +& \emph{quality} & \texttt{chararray} & base qualities (ASCII-encoded Sanger +format)\\ +& \emph{id} & \texttt{chararray} & entire read id (as found in input) \end{tabular} -\subsubsection{\texttt{FastqStorer} and \texttt{QseqStorer}} +\subsubsection{\texttt{FastqStorer}} -The Fastq and Qseq storer input schemas are identical and both are -essentially equal to the output schema of the corresponding loader -functions. 
Note that the order of the fields inside tuples does not -matter, only their field names need to be present. All fields except +The \texttt{FastqStorer}'s input schema is identical to the corresponding loader +function's output schema. Note that the order of the fields inside tuples does +not matter -- only their field names need to be present. All fields except \emph{sequence} and \emph{quality} are optional. +The ``id'' field is optional and should be used only to override the default +read id format (which follows the standard used by Illumina's Fastq format) with +a custom one; if ``id'' is set the provided string will be used as the read id +and the other metadata fields will be ignored. + \begin{tabular}{lp{0.15\textwidth}p{0.15\textwidth}p{0.4\textwidth}} Usage: & \multicolumn{3}{l}{} \hspace*{-0.55cm}\begin{minipage}{0.8\textwidth} @@ -259,9 +263,29 @@ \subsubsection{\texttt{FastqStorer} and \texttt{QseqStorer}} & \emph{control\_number} & \texttt{integer} & control number\\ & \emph{index\_sequence} & \texttt{chararray} & index sequence\\ & \emph{sequence} & \texttt{chararray} & read bases\\ -& \emph{quality} & \texttt{chararray} & base qualities +& \emph{quality} & \texttt{chararray} & base qualities\\ +& \emph{id} & \texttt{chararray} & id of the read (overrides metadata fields) \end{tabular} +\subsubsection{\texttt{QseqLoader}} + +The \texttt{QseqLoader} essentially produces the same output schema as the +\texttt{FastqLoader}, with two differences: it does not produce the +``control\_number'' and ``id'' fields, since these fields are not present in +Qseq files. + +Note that the base qualities coming from the Qseq format are transformed into +Sanger $q + 33$ format. + +\subsubsection{\texttt{QseqStorer}} + +Analogously to the \texttt{FastqStorer}, the \texttt{QseqStorer}'s +input schema is identical to the output schema of the corresponding loader function. 
+Note that the order of the fields inside tuples does not +matter; only their field names need to be present. All fields except +\emph{sequence} and \emph{quality} are optional. + + \subsubsection{\texttt{FastaLoader}} The FastaLoader loads reference sequence data in FASTA format and