From 9ec3e57cb23ad3183c608259df2ac0c016aea16d Mon Sep 17 00:00:00 2001 From: Graham Gower Date: Wed, 30 May 2018 09:53:55 +0930 Subject: [PATCH] Add -C parameter for sampe/samse, to copy fastq comments into sam output. This follows the BWA-MEM -C parameter with the same function. --- bwa.1 | 12 ++++++++++++ bwape.c | 5 ++++- bwase.c | 10 ++++++++-- bwaseqio.c | 5 +++++ bwtaln.h | 2 +- 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/bwa.1 b/bwa.1 index d30ac331..cf321b50 100644 --- a/bwa.1 +++ b/bwa.1 @@ -513,6 +513,12 @@ written. [3] .TP .BI -r \ STR Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] +.TP +.BI -C +Append FASTA/Q comment to SAM output. This option can be used to +transfer read meta information (e.g. barcode) to the SAM output. Note that the +FASTA/Q comment (the string after a space in the header line) must conform the SAM +spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. .RE .TP @@ -553,6 +559,12 @@ XA tag will not be written. [10] .TP .BI -r \ STR Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] +.TP +.BI -C +Append FASTA/Q comment to SAM output. This option can be used to +transfer read meta information (e.g. barcode) to the SAM output. Note that the +FASTA/Q comment (the string after a space in the header line) must conform the SAM +spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. .RE .TP diff --git a/bwape.c b/bwape.c index fa4f7f7c..36544e98 100644 --- a/bwape.c +++ b/bwape.c @@ -735,9 +735,10 @@ int bwa_sai2sam_pe(int argc, char *argv[]) int c; pe_opt_t *popt; char *prefix, *rg_line = 0; + extern int copy_comment; popt = bwa_init_pe_opt(); - while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { + while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:C")) >= 0) { switch (c) { case 'r': if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; @@ -751,6 +752,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) case 'c': popt->ap_prior = atof(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; case 'A': popt->force_isize = 1; break; + case 'C': copy_comment = 1; break; default: return 1; } } @@ -768,6 +770,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, "Notes: 1. For SOLiD reads, corresponds R3 reads and to F3.\n"); fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); diff --git a/bwase.c b/bwase.c index 18e86718..f7f6575e 100644 --- a/bwase.c +++ b/bwase.c @@ -477,6 +477,8 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } } + if (p->comment) + err_printf("\t%s", p->comment); err_putchar('\n'); } else { // this read has no match //ubyte_t *s = p->strand? p->rseq : p->seq; @@ -494,6 +496,8 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); + if (p->comment) + err_printf("\t%s", p->comment); err_putchar('\n'); } } @@ -580,7 +584,8 @@ int bwa_sai2sam_se(int argc, char *argv[]) { int c, n_occ = 3; char *prefix, *rg_line = 0; - while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { + extern int copy_comment; + while ((c = getopt(argc, argv, "hn:f:r:C")) >= 0) { switch (c) { case 'h': break; case 'r': @@ -588,12 +593,13 @@ int bwa_sai2sam_se(int argc, char *argv[]) break; case 'n': n_occ = atoi(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; + case 'C': copy_comment = 1; break; default: return 1; } } if (optind + 3 > argc) { - fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); + fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] [-C] \n"); return 1; } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { diff --git a/bwaseqio.c b/bwaseqio.c index d850307c..0cc700ac 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -13,6 +13,7 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; +int copy_comment = 0; struct __bwa_seqio_t { // for BAM input @@ -199,6 +200,8 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri p->qual = (ubyte_t*)strdup((char*)seq->qual.s); if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); } + if (copy_comment && seq->comment.l) + p->comment = strdup(seq->comment.s); p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() @@ -228,6 +231,8 @@ void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) for (j = 0; j < p->n_multi; ++j) if (p->multi[j].cigar) free(p->multi[j].cigar); free(p->name); + if (p->comment) + free(p->comment); free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); free(p->cigar); } diff --git a/bwtaln.h b/bwtaln.h index 4616ff5a..402f0788 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -64,7 +64,7 @@ typedef struct { } bwt_multi1_t; typedef struct { - char *name; + char *name, *comment; ubyte_t *seq, *rseq, *qual; uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8; uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;