From 9b62ed91384096e700fdf17c31374e691db8191b Mon Sep 17 00:00:00 2001 From: Tobias Rausch Date: Thu, 26 Oct 2023 15:08:07 +0200 Subject: [PATCH] pan-genomes --- README.md | 41 +++++++++++++++++++++++++-------- src/convert.h | 63 ++++++++++++++++++++++++++++++++------------------- src/gaf.h | 27 ++++++++++++++++++++-- 3 files changed, 97 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 5289551..ba29857 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ [![GitHub license](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://github.com/tobiasrausch/lorax/blob/master/LICENSE) [![GitHub Releases](https://img.shields.io/github/release/tobiasrausch/lorax.svg)](https://github.com/tobiasrausch/lorax/releases) -# Lorax: A long-read analysis toolbox for cancer genomics +# Lorax: A long-read analysis toolbox for cancer and population genomics -In cancer genomics, long-read de novo assembly approaches may not be applicable because of tumor heterogeneity, normal cell contamination and aneuploid chromosomes. Generating sufficiently high coverage for each derivative, potentially sub-clonal, chromosome is not feasible. Lorax is a targeted approach to reveal complex cancer genomic structures such as telomere fusions, templated insertions or chromothripsis rearrangements. Lorax is NOT a long-read SV caller, this functionality is implemented in [delly](https://github.com/dellytools/delly). Lorax requires matched tumor-normal data sequenced using long-reads. +In cancer genomics, long-read de novo assembly approaches may not be applicable because of tumor heterogeneity, normal cell contamination and aneuploid chromosomes. Generating sufficiently high coverage for each derivative, potentially sub-clonal, chromosome is not feasible. Lorax is a targeted approach to reveal complex cancer genomic structures such as telomere fusions, templated insertions or chromothripsis rearrangements. Lorax is NOT a long-read SV caller, this functionality is implemented in [delly](https://github.com/dellytools/delly). ## Installing lorax @@ -19,7 +19,11 @@ Lorax is available as a [statically linked binary](https://github.com/tobiasraus `make all` -## Templated insertion threads +## Linear reference genomes + +Lorax has several subcommands for alignments to linear reference genomes. + +### Templated insertion threads Templated insertions threads can be identified using @@ -37,7 +41,7 @@ The `out.reads` file lists unique assignments of reads to templated insertion so `lorax extract -a -g hg38.fa -r reads.lst tumor.bam` -## Telomere repeats associated with complex rearrangements +### Telomere repeats associated with complex rearrangements Telomere-associated SVs can be identified with lorax using @@ -45,7 +49,7 @@ Telomere-associated SVs can be identified with lorax using The output files cluster reads into distinct telomere junctions that can be locally assembled. Since telomeres are repetitive, common mis-mapping artifacts found in a panel of normal samples are provided in the `maps` subdirectory. It is recommended to use the telomere-to-telomere assembly as the reference genome for `lorax telomere`. -## Read selection for targeted assembly of amplicons +### Read selection for targeted assembly of amplicons Given a list of amplicon regions and a phased VCF file, lorax can be used to extract amplicon reads for targeted assembly approaches. @@ -59,7 +63,7 @@ The amplicon subcommand outputs the selected reads (as a hash list `out.reads`) To extract the FASTA sequences for all reads use the `lorax extract` subcommand (below) with the `-a` option. -## Extracting pairwise matches and FASTA sequences of reads +### Extracting pairwise matches and FASTA sequences of reads To get FASTA sequences and pairwise read to genome matches for a list of reads (`list.reads`) use @@ -69,15 +73,34 @@ If the read list contains hashes instead of read names as from the `lorax amplic `lorax extract -a -g hg38.fa -r list.reads tumor.bam` -## Converting pan-genome graph alignments to BAM + +## Pan-genome graphs + +For pan-genome graphs and pan-genome graph alignments, lorax supports the below subcommands, some are work-in-progress. + +### Connected components of a pan-genome graph + +`lorax components pangenome.gfa.gz > comp.tsv` + +### Converting a pan-genome (sub-)graph to dot format + +`lorax gfa2dot -s s103 -r 3 pangenome.gfa.gz > graph.dot` + +`dot -Tpng graph.dot > graph.png` + +### Converting pan-genome graph alignments to BAM With long reads aligned to a pan-genome graph -`minigraph --vc -cx lr GRCh38-90c.r518.gfa.gz input.fastq.gz | bgzip > sample.gaf.gz` +`minigraph --vc -cx lr pangenome.gfa.gz input.fastq.gz | bgzip > sample.gaf.gz` lorax can be used to convert the graph alignment to BAM -`lorax convert -g GRCh38-90c.r518.gfa.gz -f input.fastq.gz sample.gaf.gz | samtools sort -o sample.bam -` +`lorax convert -g pangenome.gfa.gz -f input.fastq.gz sample.gaf.gz | samtools sort -o sample.bam -` + +### Node coverage of pan-genome graph alignments + +`lorax ncov -g pangenome.gfa.gz sample.gaf.gz > ncov.tsv` ## Citation diff --git a/src/convert.h b/src/convert.h index aacd18f..22e296d 100644 --- a/src/convert.h +++ b/src/convert.h @@ -36,6 +36,7 @@ namespace lorax struct ConvertConfig { bool hasFastq; + uint32_t chunk; boost::filesystem::path outfile; boost::filesystem::path gfafile; boost::filesystem::path seqfile; @@ -336,7 +337,7 @@ namespace lorax template inline bool - convertToBamViaFASTQ(TConfig const& c, Graph const& g, std::vector const& aln) { + convertToBamViaFASTQ(TConfig const& c, Graph const& g, std::vector const& aln, bool const firstPass) { typedef std::vector TAlignRecords; // Vertex map @@ -358,10 +359,12 @@ namespace lorax faidx_t* fai = fai_load(c.seqfile.string().c_str()); // Write header - for(int32_t refIndex = 0; refIndex < faidx_nseq(fai); ++refIndex) { - std::string qname(faidx_iseq(fai, refIndex)); - int32_t seqlen = faidx_seq_len(fai, qname.c_str()); - sfile << "@SQ\tSN:" << qname << "\tLN:" << seqlen << std::endl; + if (firstPass) { + for(int32_t refIndex = 0; refIndex < faidx_nseq(fai); ++refIndex) { + std::string qname(faidx_iseq(fai, refIndex)); + int32_t seqlen = faidx_seq_len(fai, qname.c_str()); + sfile << "@SQ\tSN:" << qname << "\tLN:" << seqlen << std::endl; + } } // Load FASTA/FASTQ @@ -410,7 +413,7 @@ namespace lorax template inline bool - convertToBamViaCRAM(TConfig const& c, Graph const& g, std::vector const& aln) { + convertToBamViaCRAM(TConfig const& c, Graph const& g, std::vector const& aln, bool const firstPass) { typedef std::vector TAlignRecords; // Vertex map @@ -437,10 +440,12 @@ namespace lorax faidx_t* fai = fai_load(c.seqfile.string().c_str()); // Write header - for(int32_t refIndex = 0; refIndex < faidx_nseq(fai); ++refIndex) { - std::string qname(faidx_iseq(fai, refIndex)); - int32_t seqlen = faidx_seq_len(fai, qname.c_str()); - sfile << "@SQ\tSN:" << qname << "\tLN:" << seqlen << std::endl; + if (firstPass) { + for(int32_t refIndex = 0; refIndex < faidx_nseq(fai); ++refIndex) { + std::string qname(faidx_iseq(fai, refIndex)); + int32_t seqlen = faidx_seq_len(fai, qname.c_str()); + sfile << "@SQ\tSN:" << qname << "\tLN:" << seqlen << std::endl; + } } // Convert to BAM @@ -500,22 +505,33 @@ namespace lorax parseGfa(c, g, true); // Parse alignments + std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] Convert alignments" << std::endl; std::vector aln; - parseGaf(c, g, aln); - - //if (!plotGraphAlignments(c, g, aln)) return -1; - if (c.hasFastq) { - if (!convertToBamViaFASTQ(c, g, aln)) { - std::cerr << "Error converting to BAM!" << std::endl; - return -1; - } - } else { - if (!convertToBamViaCRAM(c, g, aln)) { - std::cerr << "Error converting to BAM!" << std::endl; - return -1; + bool firstPass = true; + std::set processed; + bool gafdone = true; + do { + gafdone = parseGaf(c, g, aln, c.chunk, processed); + std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] Parsed " << aln.size() << " alignments" << std::endl; + //if (!plotGraphAlignments(c, g, aln)) return -1; + if (c.hasFastq) { + if (!convertToBamViaFASTQ(c, g, aln, firstPass)) { + std::cerr << "Error converting to BAM!" << std::endl; + return -1; + } + } else { + if (!convertToBamViaCRAM(c, g, aln, firstPass)) { + std::cerr << "Error converting to BAM!" << std::endl; + return -1; + } } - } + + // Clean-up + std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] Processed " << aln.size() << " alignments" << std::endl; + firstPass = false; + aln.clear(); + } while (!gafdone); #ifdef PROFILE ProfilerStop(); @@ -535,6 +551,7 @@ namespace lorax boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") + ("chunk,c", boost::program_options::value(&c.chunk)->default_value(0), "chunk size [0: all at once]") ("graph,g", boost::program_options::value(&c.gfafile), "GFA pan-genome graph") ("reference,r", boost::program_options::value(&c.genome), "FASTA reference") ("align,a", boost::program_options::value(&c.readsfile), "BAM/CRAM file") diff --git a/src/gaf.h b/src/gaf.h index ff48306..a716b2a 100644 --- a/src/gaf.h +++ b/src/gaf.h @@ -160,7 +160,7 @@ namespace lorax template inline bool - parseGaf(TConfig const& c, Graph const& g, std::vector& aln) { + parseGaf(TConfig const& c, Graph const& g, std::vector& aln, uint32_t const maxread, std::set& seedSet) { // Open GAF std::ifstream gafFile; boost::iostreams::filtering_streambuf dataIn; @@ -171,16 +171,32 @@ namespace lorax dataIn.push(gafFile); // Parse GAF + std::set newReads; std::istream instream(&dataIn); bool parseAR = true; + bool gafdone = true; while (parseAR) { AlignRecord ar; if (parseAlignRecord(instream, g, ar)) { + if (maxread) { + if (seedSet.find(ar.seed) == seedSet.end()) { + // New read + if (newReads.find(ar.seed) == newReads.end()) { + if (newReads.size() >= maxread) { + gafdone = false; + continue; // No break here because all records are required for primary/secondary alignments + } + newReads.insert(ar.seed); + } + } + } aln.push_back(ar); //std::cerr << ar.seed << ',' << ar.qlen << ',' << ar.qstart << ',' << ar.qend << ',' << ar.strand << ',' << ar.plen << ',' << ar.pstart << ',' << ar.pend << ',' << ar.matches << ',' << ar.alignlen << ',' << ar.mapq << std::endl; } else parseAR = false; } + // Keep track of processed reads + if (!maxread) seedSet.insert(newReads.begin(), newReads.end()); // Close file dataIn.pop(); @@ -190,7 +206,14 @@ namespace lorax // Sort alignment records by read std::sort(aln.begin(), aln.end(), SortAlignRecord()); - return true; + return gafdone; + } + + template + inline bool + parseGaf(TConfig const& c, Graph const& g, std::vector& aln) { + std::set empty; + return parseGaf(c, g, aln, 0, empty); } }