From c5bdb600fd171a6b6b9e9b5a9b59c8c2b298a7e4 Mon Sep 17 00:00:00 2001
From: Luca Pireddu <pireddu@crs4.it>
Date: Thu, 3 Jul 2014 12:12:51 +0200
Subject: [PATCH 1/4] Return original id line with fastq tuple

---
 src/fi/aalto/seqpig/io/FastqLoader.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fi/aalto/seqpig/io/FastqLoader.java b/src/fi/aalto/seqpig/io/FastqLoader.java
index 7ed8c55..a77c4b8 100644
--- a/src/fi/aalto/seqpig/io/FastqLoader.java
+++ b/src/fi/aalto/seqpig/io/FastqLoader.java
@@ -95,8 +95,6 @@ public Tuple getNext() throws IOException {
             Text fastqrec_name = ((FastqRecordReader)in).getCurrentKey();
             SequencedFragment fastqrec = ((FastqRecordReader)in).getCurrentValue();
 
-            //mProtoTuple.add(new String(fastqrec_name.toString()));
-
             mProtoTuple.add(fastqrec.getInstrument());
             mProtoTuple.add(fastqrec.getRunNumber());
             mProtoTuple.add(fastqrec.getFlowcellId());
@@ -110,6 +108,7 @@ public Tuple getNext() throws IOException {
             mProtoTuple.add(fastqrec.getIndexSequence());
             mProtoTuple.add(fastqrec.getSequence().toString());
             mProtoTuple.add(fastqrec.getQuality().toString());
+            mProtoTuple.add(fastqrec_name.toString());
 
             Tuple t =  mTupleFactory.newTupleNoCopy(mProtoTuple);
             mProtoTuple = null;
@@ -152,6 +151,7 @@ public ResourceSchema getSchema(String location, Job job) throws IOException {
         s.add(new Schema.FieldSchema("index_sequence", DataType.CHARARRAY));
         s.add(new Schema.FieldSchema("sequence", DataType.CHARARRAY));
         s.add(new Schema.FieldSchema("quality", DataType.CHARARRAY));
+        s.add(new Schema.FieldSchema("id", DataType.CHARARRAY));
 
         return new ResourceSchema(s);
     }

From 6aa96b165b4af22a6146a6447715563f9762af3d Mon Sep 17 00:00:00 2001
From: Luca Pireddu <pireddu@crs4.it>
Date: Thu, 3 Jul 2014 12:15:36 +0200
Subject: [PATCH 2/4] If present in the tuple, use the "id" field as the fastq
 id line

---
 src/fi/aalto/seqpig/io/FastqStorer.java | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/fi/aalto/seqpig/io/FastqStorer.java b/src/fi/aalto/seqpig/io/FastqStorer.java
index e09e362..9b49376 100644
--- a/src/fi/aalto/seqpig/io/FastqStorer.java
+++ b/src/fi/aalto/seqpig/io/FastqStorer.java
@@ -171,8 +171,14 @@ public void putNext(Tuple f) throws IOException {
             fastqrec.setQuality(new Text((String)f.get(index)));
         }
 
+        Text key = null;
+        index = getFieldIndex("id", allFastqFieldNames);
+        if(index > -1 && DataType.findType(f.get(index)) == DataType.CHARARRAY) {
+            key = new Text((String)f.get(index));
+        }
+
         try {
-            writer.write(null, fastqrec);
+            writer.write(key, fastqrec);
         } catch (InterruptedException e) {
             throw new IOException(e);
         }

From 6f7b3e8578da6ada2a8e1e32b674a2e6904be58b Mon Sep 17 00:00:00 2001
From: Luca Pireddu <pireddu@crs4.it>
Date: Thu, 7 Aug 2014 10:09:20 +0200
Subject: [PATCH 3/4] Update comment

---
 src/fi/aalto/seqpig/io/FastqLoader.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fi/aalto/seqpig/io/FastqLoader.java b/src/fi/aalto/seqpig/io/FastqLoader.java
index a77c4b8..86e2295 100644
--- a/src/fi/aalto/seqpig/io/FastqLoader.java
+++ b/src/fi/aalto/seqpig/io/FastqLoader.java
@@ -77,6 +77,7 @@ public class FastqLoader extends LoadFunc implements LoadMetadata {
     //   index_sequence: string
     //   sequence: string
     //   quality: string (note: we assume that encoding chosen on command line!!!)
+    //   id: string
 
     public FastqLoader() {}
 

From 3da8b77137f8e514c4f5c6544ba536d4601260e9 Mon Sep 17 00:00:00 2001
From: Luca Pireddu <pireddu@crs4.it>
Date: Thu, 7 Aug 2014 10:09:34 +0200
Subject: [PATCH 4/4] Update SeqPig reference to document new fastq "id" field

---
 doc/seqpig_reference.tex | 48 ++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/doc/seqpig_reference.tex b/doc/seqpig_reference.tex
index 804d2f0..5f4117e 100644
--- a/doc/seqpig_reference.tex
+++ b/doc/seqpig_reference.tex
@@ -199,12 +199,10 @@ \subsubsection{\texttt{BamStorer} and \texttt{SamStorer}}
 & \emph{attributes} & \texttt{map} & SAMRecord attributes
 \end{tabular}
 
-\subsubsection{\texttt{FastqLoader} and \texttt{QseqLoader}}
+\subsubsection{\texttt{FastqLoader}}
 
-Both loaders for unaligned read file formats Fastq and Qseq
-essentially provide the same output schema for the tuple field names
-they produce.  Note that some fields that are not present in the input
-data may remain empty.
+The loader for Fastq files provides the following output schema.  Note that
+fields that are not present in the input data remain empty.
 
 \begin{tabular}{lp{0.15\textwidth}p{0.15\textwidth}p{0.4\textwidth}}
 Usage: & \multicolumn{3}{l}{}
@@ -227,17 +225,23 @@ \subsubsection{\texttt{FastqLoader} and \texttt{QseqLoader}}
 & \emph{control\_number} & \texttt{integer} & control number\\
 & \emph{index\_sequence} & \texttt{chararray} & index sequence\\
 & \emph{sequence} & \texttt{chararray} & read bases\\
-& \emph{quality} & \texttt{chararray} & base qualities
+& \emph{quality} & \texttt{chararray} & base qualities (ASCII-encoded Sanger
+format)\\
+& \emph{id} & \texttt{chararray} & entire read id (as found in input)
 \end{tabular}
 
-\subsubsection{\texttt{FastqStorer} and \texttt{QseqStorer}}
+\subsubsection{\texttt{FastqStorer}}
 
-The Fastq and Qseq storer input schemas are identical and both are
-essentially equal to the output schema of the corresponding loader
-functions. Note that the order of the fields inside tuples does not
-matter, only their field names need to be present. All fields except
+The \texttt{FastqStorer}'s input schema is identical to the corresponding loader
+function's output schema. Note that the order of the fields inside tuples does
+not matter -- only their field names need to be present. All fields except
 \emph{sequence} and \emph{quality} are optional.
 
+The ``id'' field is optional and should be used only to override the default
+read id format (which follows the standard used by Illumina's Fastq format) with
+a custom one; if ``id'' is set, the provided string will be used as the read id
+and the other metadata fields will be ignored.
+
 \begin{tabular}{lp{0.15\textwidth}p{0.15\textwidth}p{0.4\textwidth}}
 Usage: & \multicolumn{3}{l}{}
 \hspace*{-0.55cm}\begin{minipage}{0.8\textwidth}
@@ -259,9 +263,29 @@ \subsubsection{\texttt{FastqStorer} and \texttt{QseqStorer}}
 & \emph{control\_number} & \texttt{integer} & control number\\
 & \emph{index\_sequence} & \texttt{chararray} & index sequence\\
 & \emph{sequence} & \texttt{chararray} & read bases\\
-& \emph{quality} & \texttt{chararray} & base qualities
+& \emph{quality} & \texttt{chararray} & base qualities\\
+& \emph{id} & \texttt{chararray} & id of the read (overrides metadata fields)
 \end{tabular}
 
+\subsubsection{\texttt{QseqLoader}}
+
+The \texttt{QseqLoader} essentially produces the same output schema as the
+\texttt{FastqLoader}, with two differences:  it does not produce the
+``control\_number'' and ``id'' fields, since these fields are not present in
+Qseq files.
+
+Note that the base qualities coming from the Qseq format are transformed into
+Sanger $q + 33$ format.
+
+\subsubsection{\texttt{QseqStorer}}
+
+Analogously to the \texttt{FastqStorer}, the \texttt{QseqStorer}'s
+input schema is identical to the output schema of the corresponding loader function.
+Note that the order of the fields inside tuples does not
+matter; only their field names need to be present. All fields except
+\emph{sequence} and \emph{quality} are optional.
+
+
 \subsubsection{\texttt{FastaLoader}}
 
 The FastaLoader loads reference sequence data in FASTA format and