Skip to content

Commit

Permalink
Report metrics and phasing information (#40)
Browse files Browse the repository at this point in the history
* Output phasing info into a single file
* Improve names
* Remove ostream from header
* Add code comment
* Make phasing info non-optional
  • Loading branch information
egor-dolzhenko authored Dec 29, 2021
1 parent 3962a7f commit d02c534
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 148 deletions.
76 changes: 69 additions & 7 deletions reviewer/app/GenotypePaths.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "app/GenotypePaths.hh"

#include <algorithm>
#include <cassert>
#include <fstream>
#include <map>
Expand Down Expand Up @@ -202,8 +203,7 @@ static vector<NodeVectors> extendDiplotype(const vector<NodeVectors>& genotypes,
return extendedGenotype;
}

vector<Diplotype>
getCandidateDiplotypePaths(int meanFragLen, const string& vcfPath, const LocusSpecification& locusSpec)
vector<Diplotype> getCandidateDiplotypes(int meanFragLen, const string& vcfPath, const LocusSpecification& locusSpec)
{
auto genotypeNodesByNodeRange = getGenotypeNodesByNodeRange(meanFragLen, vcfPath, locusSpec);

Expand Down Expand Up @@ -235,18 +235,80 @@ getCandidateDiplotypePaths(int meanFragLen, const string& vcfPath, const LocusSp
++node;
}

vector<Diplotype> pathsByDiplotype;
vector<Diplotype> diplotypes;
const NodeId rightFlankNode = locusSpec.regionGraph().numNodes() - 1;
const int rightFlankLength = locusSpec.regionGraph().nodeSeq(rightFlankNode).length();
for (const auto& diplotypeNodes : nodesByDiplotype)
{
Diplotype genotypePaths;
Diplotype diplotype;
for (const auto& haplotypeNodes : diplotypeNodes)
{
genotypePaths.emplace_back(&locusSpec.regionGraph(), 0, haplotypeNodes, rightFlankLength);
diplotype.emplace_back(&locusSpec.regionGraph(), 0, haplotypeNodes, rightFlankLength);
}
pathsByDiplotype.push_back(genotypePaths);

// The code so far considers diplotypes that differ by the order of constituent haplotypes to be distinct.
// To overcome this issue, we enforce a consistent haplotype order.
if (diplotype.front() < diplotype.back())
{
std::iter_swap(diplotype.begin(), diplotype.end() - 1);
}

diplotypes.push_back(diplotype);
}

return pathsByDiplotype;
std::sort(diplotypes.begin(), diplotypes.end());
diplotypes.erase(std::unique(diplotypes.begin(), diplotypes.end()), diplotypes.end());

return diplotypes;
}

static string summarizePath(const graphtools::Path& path)
{
const auto& graph = *path.graphRawPtr();
string summary;
std::set<graphtools::NodeId> observedNodes;
for (const auto nodeId : path.nodeIds())
{
if (observedNodes.find(nodeId) != observedNodes.end())
{
continue;
}

const bool isLoopNode = graph.hasEdge(nodeId, nodeId);

if (nodeId == 0)
{
summary += "(LF)";
}
else if (nodeId + 1 == graph.numNodes())
{
summary += "(RF)";
}
else
{
assert(graph.numNodes() != 0);
const string& nodeSeq = graph.nodeSeq(nodeId);
summary += "(" + nodeSeq + ")";

if (isLoopNode)
{
int numMotifs = std::count(path.nodeIds().begin(), path.nodeIds().end(), nodeId);
summary += "{" + std::to_string(numMotifs) + "}";
}
}

observedNodes.emplace(nodeId);
}

return summary;
}

std::ostream& operator<<(std::ostream& out, const Diplotype& diplotype)
{
out << summarizePath(diplotype.front());
if (diplotype.size() == 2)
{
out << "/" << summarizePath(diplotype.back());
}
return out;
}
3 changes: 2 additions & 1 deletion reviewer/app/GenotypePaths.hh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "core/LocusSpecification.hh"

using Diplotype = std::vector<graphtools::Path>;
std::ostream& operator<<(std::ostream& out, const Diplotype& diplotype);

/// Computes all possible diplotype paths at the given locus
/// \param meanFragLen: Mean fragment length
Expand All @@ -41,4 +42,4 @@ using Diplotype = std::vector<graphtools::Path>;
/// (last node)
///
std::vector<Diplotype>
getCandidateDiplotypePaths(int meanFragLen, const std::string& vcfPath, const LocusSpecification& locusSpec);
getCandidateDiplotypes(int meanFragLen, const std::string& vcfPath, const LocusSpecification& locusSpec);
2 changes: 2 additions & 0 deletions reviewer/app/Phasing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,5 @@ ScoredDiplotypes scoreDiplotypes(const FragById& fragById, const vector<Diplotyp
assert(!scoredDiplotypes.empty());
return scoredDiplotypes;
}

std::ostream& operator<<(std::ostream& out, const ScoredDiplotype& diplotype) { return out; }
1 change: 0 additions & 1 deletion reviewer/app/REViewer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ optional<WorkflowArguments> getCommandLineArguments(int argc, char** argv)
("catalog", po::value<string>(&args.catalogPath)->required(), "Variant catalog")
("locus", po::value<string>(&args.locusId)->required(), "Locus to analyze (or a list of comma-separated loci)")
("region-extension-length", po::value<int>(&args.locusExtensionLength)->default_value(1000), "Length of flanking region (must match corresponding ExpansionHunter setting)")
("output-phasing-info", po::bool_switch(&args.outputPhasingInfo), "Output results of the haplotype estimation algorithm")
("output-prefix", po::value<string>(&args.outputPrefix)->required(), "Prefix for the output files");
// clang-format on

Expand Down
166 changes: 54 additions & 112 deletions reviewer/app/Workflow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

#include "Workflow.hh"

#include <fstream>
#include <set>

#include <boost/algorithm/string.hpp>
Expand Down Expand Up @@ -50,6 +49,24 @@ using std::string;
using std::unordered_map;
using std::vector;

template <typename T> static string encode(const vector<T>& depths)
{
std::ostringstream encoding;
encoding.precision(2);
encoding << std::fixed;

for (auto depth : depths)
{
if (encoding.tellp())
{
encoding << "/";
}
encoding << depth;
}

return encoding.str();
}

class LocusResults
{
public:
Expand All @@ -71,8 +88,6 @@ class LocusResults
MetricsByVariant metricsByVariant_;
};

using ResultsByLocus = map<string, LocusResults>;

static LocusResults analyzeLocus(
const string& referencePath, const string& readsPath, const string& vcfPath, const string& locusId,
const LocusSpecification& locusSpec)
Expand All @@ -87,10 +102,9 @@ static LocusResults analyzeLocus(
spdlog::info("Fragment length is estimated to be {}", meanFragLen);

spdlog::info("Extracting genotype paths");
auto pathsByDiplotype = getCandidateDiplotypePaths(meanFragLen, vcfPath, locusSpec);
auto pathsByDiplotype = getCandidateDiplotypes(meanFragLen, vcfPath, locusSpec);

spdlog::info("Phasing");

auto scoredDiplotypes = scoreDiplotypes(fragById, pathsByDiplotype);
auto topDiplotype = scoredDiplotypes.front().first; // scoredDiplotypes are sorted
spdlog::info("Found {} paths defining diplotype", topDiplotype.size());
Expand Down Expand Up @@ -137,137 +151,65 @@ vector<string> getLocusIds(const RegionCatalog& catalog, const string& encoding)
return locusIds;
}

static string summarizePath(const graphtools::Graph& graph, const graphtools::Path& path)
std::ofstream initPhasingFile(const std::string& prefix)
{
string summary;
std::set<graphtools::NodeId> observedNodes;
for (const auto nodeId : path.nodeIds())
const auto path = prefix + ".phasing.tsv";
std::ofstream file(path);
if (!file.is_open())
{
if (observedNodes.find(nodeId) != observedNodes.end())
{
continue;
}

const bool isLoopNode = graph.hasEdge(nodeId, nodeId);

if (nodeId == 0)
{
summary += "(LF)";
}
else if (nodeId + 1 == graph.numNodes())
{
summary += "(RF)";
}
else
{
assert(graph.numNodes() != 0);
const string& nodeSeq = graph.nodeSeq(nodeId);
summary += "(" + nodeSeq + ")";

if (isLoopNode)
{
int numMotifs = std::count(path.nodeIds().begin(), path.nodeIds().end(), nodeId);
summary += "{" + std::to_string(numMotifs) + "}";
}
}

observedNodes.emplace(nodeId);
throw std::runtime_error("Unable to open " + path);
}

return summary;
}

void outputPhasingInfo(const Graph& graph, const ScoredDiplotypes& scoredDiplotypes, const string& phasingInfoPath)
{
using Diplotypes = std::set<graphtools::Path>;
file << "LocusId\tDiplotype\tScore" << std::endl;

ofstream phasingInfoFile(phasingInfoPath);
if (phasingInfoFile.is_open())
{
std::set<Diplotypes> observedGenotypes;
for (const auto& scoredGenotype : scoredDiplotypes)
{
const auto& genotypePaths = scoredGenotype.first;
const int score = scoredGenotype.second;

Diplotypes genotypePathSet;
genotypePathSet.emplace(genotypePaths.front());
genotypePathSet.emplace(genotypePaths.back());
if (observedGenotypes.find(genotypePathSet) != observedGenotypes.end())
{
continue;
}

phasingInfoFile << summarizePath(graph, genotypePaths.front());
if (genotypePaths.size() == 2)
{
phasingInfoFile << "/" << summarizePath(graph, genotypePaths.back());
}
phasingInfoFile << "\t" << score << std::endl;
observedGenotypes.emplace(genotypePathSet);
}
}
else
{
throw std::runtime_error("Unable to open " + phasingInfoPath);
}
return file;
}

void outputResults(
const RegionCatalog& locusCatalog, const ResultsByLocus& resultsByLocus, const string& outputPrefix,
bool phasingInfoNeeded)
std::ofstream initMetricsFile(const std::string& prefix)
{
spdlog::info("Writing output to disk");
nlohmann::json metricsReport;
for (const auto& locusIdAndResults : resultsByLocus)
const auto path = prefix + ".metrics.tsv";
std::ofstream metricsFile(path);
if (!metricsFile.is_open())
{
const auto& locusId = locusIdAndResults.first;
const auto& locusResults = locusIdAndResults.second;
const auto& locusSpec = locusCatalog.at(locusId);
const string locusOutputPrefix = outputPrefix + "." + locusId;
generateSvg(locusResults.lanePlots(), locusOutputPrefix + ".svg");
if (phasingInfoNeeded)
{
const auto phasingInfoPath = locusOutputPrefix + ".phasing.txt";
const Graph& graph = locusSpec.regionGraph();
outputPhasingInfo(graph, locusResults.scoredDiplotypes(), phasingInfoPath);
}

for (const auto& variantMetrics : locusResults.metricsByVariant())
{
metricsReport[variantMetrics.variantId]["Genotype"] = variantMetrics.genotype;
metricsReport[variantMetrics.variantId]["AlleleDepths"] = variantMetrics.alleleDepth;
}
throw std::runtime_error("Unable to open " + path);
}

const string metricsFilePath = outputPrefix + ".metrics.json";
ofstream metricsFile(metricsFilePath);
if (metricsFile.is_open())
{
metricsFile << metricsReport.dump(4) << std::endl;
}
else
{
throw std::runtime_error("Unable to open " + metricsFilePath);
}
metricsFile.close();
metricsFile << "VariantId\tGenotype\tAlleleDepth" << std::endl;

return metricsFile;
}

int runWorkflow(const WorkflowArguments& args)
{
Reference reference(args.referencePath);
auto locusCatalog = loadLocusCatalogFromDisk(args.catalogPath, reference, args.locusExtensionLength);

ResultsByLocus resultsByLocus;
auto locusIds = getLocusIds(locusCatalog, args.locusId);
auto phasingFile = initPhasingFile(args.outputPrefix);
auto metricsFile = initMetricsFile(args.outputPrefix);

for (const auto& locusId : locusIds)
{
auto locusSpec = locusCatalog.at(locusId);
auto locusResults = analyzeLocus(args.referencePath, args.readsPath, args.vcfPath, locusId, locusSpec);
resultsByLocus.emplace(locusId, locusResults);

const auto svgPath = args.outputPrefix + "." + locusId + ".svg";
generateSvg(locusResults.lanePlots(), svgPath);

for (const auto& metrics : locusResults.metricsByVariant())
{
const auto& genotype = encode(metrics.genotype);
const auto& alleleDepth = encode(metrics.alleleDepth);
metricsFile << metrics.variantId << "\t" << genotype << "\t" << alleleDepth << std::endl;
}

for (const auto& diplotypeAndScore : locusResults.scoredDiplotypes())
{
phasingFile << locusId << "\t" << diplotypeAndScore.first << "\t" << diplotypeAndScore.second << std::endl;
}
}

outputResults(locusCatalog, resultsByLocus, args.outputPrefix, args.outputPhasingInfo);
phasingFile.close();
metricsFile.close();

return 0;
}
4 changes: 3 additions & 1 deletion reviewer/app/Workflow.hh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@

#pragma once

#include <fstream>
#include <string>

#include "app/CatalogLoading.hh"

struct WorkflowArguments
{
std::string readsPath;
Expand All @@ -31,7 +34,6 @@ struct WorkflowArguments
std::string locusId;
std::string outputPrefix;
int locusExtensionLength;
bool outputPhasingInfo;
};

int runWorkflow(const WorkflowArguments& args);
Loading

0 comments on commit d02c534

Please sign in to comment.