Skip to content

Commit

Permalink
Merge pull request #588 from PatrickDeelen/master
Browse files Browse the repository at this point in the history
fix build issues
  • Loading branch information
PatrickDeelen authored Dec 4, 2020
2 parents 2892a5b + 9d45f0f commit da081f0
Show file tree
Hide file tree
Showing 16 changed files with 1,239 additions and 157 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public class DownstreamerOptions {
private final List<PathwayDatabase> pathwayDatabases;
private List<PathwayDatabase> pathwayDatabases2 = null;
private final File conversionColumnIncludeFilter;
private final File conversionRowIncludeFilter;
private final boolean correctForLambdaInflation;
private final int permutationPathwayEnrichment;
private final int permutationGeneCorrelations;
Expand All @@ -81,6 +82,7 @@ public class DownstreamerOptions {
private final String y;
private List<String> pathwayDatabasesToAnnotateWithGwas;
private Map<String, File> alternativeTopHitFiles;
private final boolean trimGeneNames; //for convert expression data mode

public boolean isDebugMode() {
return debugMode;
Expand Down Expand Up @@ -238,6 +240,12 @@ public boolean isDebugMode() {
OptionBuilder.withDescription("Optional file with columns to select during conversion");
OptionBuilder.withLongOpt("cols");
OPTIONS.addOption(OptionBuilder.create("co"));

OptionBuilder.withArgName("path");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Optional file with rows to select during conversion");
OptionBuilder.withLongOpt("rows");
OPTIONS.addOption(OptionBuilder.create("ro"));

OptionBuilder.withArgName("boolean");
OptionBuilder.withDescription("Correct the GWAS for the lambda inflation");
Expand Down Expand Up @@ -343,6 +351,12 @@ public boolean isDebugMode() {
OptionBuilder.withLongOpt("regress-gene-lengths");
OPTIONS.addOption(OptionBuilder.create("rgl"));

OptionBuilder.withArgName("boolean");
OptionBuilder.withDescription("For mode CONVERT_EXP trim train .## from the gene names");
OptionBuilder.withLongOpt("trimGeneNames");
OPTIONS.addOption(OptionBuilder.create("tgn"));


OptionBuilder.withArgName("int");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Only relevant for MODE: R_2_Z_SCORE to specify the number of samples used to create the correlation matrix");
Expand Down Expand Up @@ -407,6 +421,7 @@ public DownstreamerOptions(String... args) throws ParseException {
quantileNormalizePermutations = commandLine.hasOption("qn");
regressGeneLengths = commandLine.hasOption("rgl");
run1BasePath = commandLine.hasOption("soo") ? new File(commandLine.getOptionValue("soo")) : outputBasePath;
trimGeneNames = commandLine.hasOption("tgn");

if (quantileNormalizePermutations && forceNormalGenePvalues) {
throw new ParseException("Can't combine -qn with -fngp");
Expand Down Expand Up @@ -438,17 +453,22 @@ public DownstreamerOptions(String... args) throws ParseException {
gwasZscoreMatrixPath = null;
}

if (mode == DownstreamerMode.CONVERT_TXT || mode == DownstreamerMode.CONVERT_TXT_MERGE) {
if (mode == DownstreamerMode.CONVERT_TXT || mode == DownstreamerMode.CONVERT_TXT_MERGE || mode == DownstreamerMode.CONVERT_EXP) {
pvalueToZscore = commandLine.hasOption("p2z");
if (commandLine.hasOption("co")) {
conversionColumnIncludeFilter = new File(commandLine.getOptionValue("co"));
} else {
conversionColumnIncludeFilter = null;
}
if (commandLine.hasOption("ro")) {
conversionRowIncludeFilter = new File(commandLine.getOptionValue("ro"));
} else {
conversionRowIncludeFilter = null;
}
} else {
pvalueToZscore = false;
conversionColumnIncludeFilter = null;

conversionRowIncludeFilter = null;
}

switch (mode) {
Expand Down Expand Up @@ -513,15 +533,14 @@ public DownstreamerOptions(String... args) throws ParseException {
pathwayDatabasesToAnnotateWithGwas = new ArrayList<>();
}
if (commandLine.hasOption("ath")) {


String[] athValues = commandLine.getOptionValues("ath");

if (athValues.length % 2 != 0) {
throw new ParseException("Error parsing --alternaitveTopHits. Must be in name=path format");
}
alternativeTopHitFiles = new HashMap<>(athValues.length/2);

alternativeTopHitFiles = new HashMap<>(athValues.length / 2);

for (int i = 0; i < athValues.length; i += 2) {

Expand All @@ -530,11 +549,11 @@ public DownstreamerOptions(String... args) throws ParseException {
}

File hitsFile = new File(athValues[i + 1]);
if(!hitsFile.canRead()){

if (!hitsFile.canRead()) {
throw new ParseException("Error parsing --alternaitveTopHits. Can't find: " + hitsFile.getAbsolutePath());
}

alternativeTopHitFiles.put(athValues[i], hitsFile);

}
Expand Down Expand Up @@ -606,6 +625,7 @@ public DownstreamerOptions(String... args) throws ParseException {
pathwayDatabasesToAnnotateWithGwas = new ArrayList<>();
break;
case REMOVE_CIS_COEXP:
case INVESTIGATE_NETWORK:
if (!commandLine.hasOption("ge")) {
throw new ParseException("--genes not specified");
} else {
Expand Down Expand Up @@ -1036,6 +1056,9 @@ public void printOptions() {
if (conversionColumnIncludeFilter != null) {
LOGGER.info(" * Columns to include: " + conversionColumnIncludeFilter.getAbsolutePath());
}
if (conversionRowIncludeFilter != null){
LOGGER.info(" * Rows to include: " + conversionRowIncludeFilter.getAbsolutePath());
}
LOGGER.info(" * Convert p-values to Z-score: " + (pvalueToZscore ? "on" : "off"));
break;
case CONVERT_TXT_MERGE:
Expand Down Expand Up @@ -1073,8 +1096,14 @@ public void printOptions() {
break;
case CONVERT_EXP:
LOGGER.info(" * Expression matrix: " + gwasZscoreMatrixPath.getAbsolutePath());
if (columnsToExtract != null) {
LOGGER.info(" * Columns to extract: " + String.join(" ", columnsToExtract));
if (conversionColumnIncludeFilter != null) {
LOGGER.info(" * Columns to include: " + conversionColumnIncludeFilter.getAbsolutePath());
}
if (conversionRowIncludeFilter != null){
LOGGER.info(" * Rows to include: " + conversionRowIncludeFilter.getAbsolutePath());
}
if(trimGeneNames){
LOGGER.info("Trimming gene names to remove .## from the name");
}
break;
case TRANSPOSE:
Expand Down Expand Up @@ -1373,5 +1402,13 @@ public List<String> getPathwayDatabasesToAnnotateWithGwas() {
public Map<String, File> getAlternativeTopHitFiles() {
return alternativeTopHitFiles;
}


/**
 * @return file listing the row names to keep during matrix conversion
 * (set via the -ro / --rows option), or null when no row filter was given.
 */
public File getConversionRowIncludeFilter() {
return conversionRowIncludeFilter;
}

/**
 * @return true when -tgn / --trimGeneNames was given; only used by the
 * convert-expression-data mode to strip version suffixes from gene names.
 */
public boolean isTrimGeneNames() {
return trimGeneNames;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nl.systemsgenetics.downstreamer.development;

import cern.jet.math.tdouble.DoubleFunctions;
import com.opencsv.CSVParser;
import com.opencsv.CSVParserBuilder;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.molgenis.genotype.RandomAccessGenotypeData;
import org.molgenis.genotype.RandomAccessGenotypeDataReaderFormats;
import org.molgenis.genotype.util.LdCalculatorException;
import org.molgenis.genotype.variant.GeneticVariant;
import org.molgenis.genotype.variantFilter.VariantIdIncludeFilter;
import umcg.genetica.math.matrix2.DoubleMatrixDataset;
import static umcg.genetica.math.matrix2.DoubleMatrixDataset.loadDoubleBinaryData;

/**
*
* @author patri
*/
public class TransEqtlEnrichment {

	/**
	 * Developer utility: for each GWAS trait, computes per gene the sum of
	 * squared trans-eQTL Z-scores (sum chi2) over the trait's LD-pruned top
	 * variants, and writes the resulting gene x trait matrix to disk.
	 *
	 * NOTE(review): input/output locations are hard-coded Windows paths; this
	 * is a one-off development tool, not production code.
	 *
	 * @param args the command line arguments (ignored)
	 */
	public static void main(String[] args) throws Exception {

		String transEqtlMatrixPath = "D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\eqtlgen\\ZScoreMatrix";
		File gwasFile = new File("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\eqtlgen\\gwas.txt");
		String outputPath = "D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\eqtlgen\\SumChi2PerGwas.txt";

		// One-time conversion of the text matrix to the binary format; flip
		// this to true to regenerate the binary files from the .txt.gz source.
		if (false) {
			HashSet<String> colsToExclude = new HashSet<>();
			colsToExclude.add("Alleles");
			colsToExclude.add("AlleleAssessed");

			DoubleMatrixDataset<String, String> transEqtlDataset = DoubleMatrixDataset.loadDoubleTextDoubleDataExlcudeCols(transEqtlMatrixPath + ".txt.gz", '\t', colsToExclude);

			fixHeader(transEqtlDataset);
			// Do NOT call removeNaNRows here: every row contains some NaN
			// values, so it would discard the entire matrix.
			//transEqtlDataset = removeNaNRows(transEqtlDataset);
			transEqtlDataset.saveBinary(transEqtlMatrixPath);
		}

		DoubleMatrixDataset<String, String> transEqtlDataset = loadDoubleBinaryData(transEqtlMatrixPath);
		System.out.println("trans matrix " + transEqtlDataset.rows() + " x " + transEqtlDataset.columns());

		// Replace NaN by 0 so missing Z-scores contribute nothing to the
		// chi2 sums instead of poisoning them.
		for (int r = 0; r < transEqtlDataset.rows(); ++r) {
			for (int c = 0; c < transEqtlDataset.columns(); ++c) {
				if (Double.isNaN(transEqtlDataset.getElementQuick(r, c))) {
					transEqtlDataset.setElementQuick(r, c, 0);
				}
			}
		}

		HashMap<String, HashSet<String>> gwasVariants = readGwasFile(gwasFile);

		HashSet<String> allGwasVariants = new HashSet<>();
		for (HashSet<String> variants : gwasVariants.values()) {
			allGwasVariants.addAll(variants);
		}

		RandomAccessGenotypeData referenceGenotypeData = RandomAccessGenotypeDataReaderFormats.VCF_FOLDER.createFilteredGenotypeData("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\reference_datasets\\human_b37\\VCF_1Kgp1v3_EUR\\", 20000, new VariantIdIncludeFilter(allGwasVariants), null);

		// Spot-check the effect of pruning on one trait.
		System.out.println(gwasVariants.get("Inflammatory bowel disease").size());
		prune(gwasVariants, referenceGenotypeData.getVariantIdMap());
		System.out.println(gwasVariants.get("Inflammatory bowel disease").size());

		DoubleMatrixDataset<String, String> sumChi2Dataset = new DoubleMatrixDataset<String, String>(transEqtlDataset.getHashCols().keySet(), gwasVariants.keySet());

		for (String trait : gwasVariants.keySet()) {

			System.out.println(trait);

			HashSet<String> variants = gwasVariants.get(trait);

			// Keep only the trait variants that are rows in the eQTL matrix.
			variants.retainAll(transEqtlDataset.getRowObjects());

			DoubleMatrixDataset<String, String> transEffectOfGwasVariants = transEqtlDataset.viewRowSelection(variants);

			for (String gene : transEffectOfGwasVariants.getHashCols().keySet()) {
				// Sum of Z^2 over the trait's variants for this gene.
				double sumchi2 = transEffectOfGwasVariants.getCol(gene).aggregate(DoubleFunctions.plus, DoubleFunctions.square);
				sumChi2Dataset.setElement(gene, trait, sumchi2);
			}

		}

		sumChi2Dataset.save(outputPath);

	}

	/**
	 * Parses a tab-separated GWAS file with one header line, the variant id
	 * in column 0 and the trait name in column 8.
	 *
	 * @param gwas tab-delimited input file
	 * @return map from trait name to the set of associated variant ids
	 * @throws FileNotFoundException if the file does not exist
	 * @throws IOException on read errors
	 */
	private static HashMap<String, HashSet<String>> readGwasFile(File gwas) throws FileNotFoundException, IOException {

		final CSVParser gwasParser = new CSVParserBuilder().withSeparator('\t').withIgnoreQuotations(true).build();

		HashMap<String, HashSet<String>> gwasVariants = new HashMap<>();

		// BUGFIX: the reader was never closed (file handle leak); use
		// try-with-resources so it is released even on a parse error.
		try (CSVReader gwasReader = new CSVReaderBuilder(new BufferedReader(new FileReader(gwas))).withSkipLines(1).withCSVParser(gwasParser).build()) {

			String[] inputLine;
			while ((inputLine = gwasReader.readNext()) != null) {

				String trait = inputLine[8];
				String snp = inputLine[0];

				HashSet<String> variants = gwasVariants.get(trait);
				if (variants == null) {
					variants = new HashSet<>();
					gwasVariants.put(trait, variants);
				}
				variants.add(snp);
			}
		}

		// Log the number of variants found per trait.
		for (Map.Entry<String, HashSet<String>> e : gwasVariants.entrySet()) {
			System.out.println(e.getKey() + "\t" + e.getValue().size());
		}

		return gwasVariants;

	}

	/**
	 * Rewrites the dataset's column (gene) names in place, keeping everything
	 * before the first '_'.
	 *
	 * NOTE(review): the variable name "indexOfPoint" and the related
	 * --trimGeneNames option suggest '.' (".##" version suffixes) may have
	 * been intended, but the code splits on '_' — confirm against the actual
	 * input header before changing.
	 *
	 * @param dataset matrix whose column names are trimmed
	 * @throws Exception if trimming would produce duplicate gene names
	 */
	public static void fixHeader(DoubleMatrixDataset dataset) throws Exception {

		LinkedHashMap<String, Integer> oldHash = dataset.getHashCols();
		LinkedHashMap<String, Integer> newHash = new LinkedHashMap<>(oldHash.size());

		for (Map.Entry<String, Integer> oldEntry : oldHash.entrySet()) {

			String oldGeneName = oldEntry.getKey();
			int indexOfPoint = oldGeneName.indexOf('_');

			// Keep the full name when there is no separator, otherwise trim.
			String newGeneName = indexOfPoint < 0 ? oldGeneName : oldGeneName.substring(0, indexOfPoint);

			// put() returns the previous mapping; non-null means a collision.
			if (newHash.put(newGeneName, oldEntry.getValue()) != null) {
				throw new Exception("Can't trim gene names if this causes duplicate genes: " + oldGeneName);
			}

		}

		dataset.setHashCols(newHash);

	}

	/**
	 * Returns a view of the dataset without rows that contain any NaN or that
	 * are all zero. Currently unused (see the note in main): every row of the
	 * eQTLGen matrix contains some NaN, so this would drop everything.
	 *
	 * @param dataset input matrix
	 * @return the same dataset, or a row-selection view when rows were dropped
	 */
	private static DoubleMatrixDataset removeNaNRows(DoubleMatrixDataset dataset) {

		ArrayList<String> rowNames = dataset.getRowObjects();
		ArrayList<String> nonNanRowNames = new ArrayList<>(dataset.rows());

		rows:
		for (int r = 0; r < dataset.rows(); ++r) {
			boolean nonZeroValue = false;
			double e;
			for (int c = 0; c < dataset.columns(); ++c) {
				e = dataset.getElementQuick(r, c);
				if (Double.isNaN(e)) {
					continue rows; // any NaN disqualifies the whole row
				} else if (e != 0) {
					nonZeroValue = true;
				}
			}
			// Reaching here means no NaN in this row; keep it only when it
			// also has at least one non-zero value.
			if (nonZeroValue) {
				nonNanRowNames.add(rowNames.get(r));
			}

		}

		if (nonNanRowNames.size() < rowNames.size()) {
			dataset = dataset.viewRowSelection(nonNanRowNames);
			System.out.println("Removing " + (rowNames.size() - nonNanRowNames.size()) + " rows with only zero's or NaN");
		}

		return dataset;

	}

	/**
	 * LD-prunes each trait's variant set in place: variants missing from the
	 * reference or not biallelic are removed, and a variant is removed when it
	 * is in LD (r2 >= 0.1) with any later variant in the list.
	 *
	 * Each trait's set is touched by exactly one worker thread, so the
	 * parallel stream over the map values is safe.
	 *
	 * @param gwasVariants map from trait to its variant ids; modified in place
	 * @param varMap reference-panel variants keyed by variant id
	 * @throws IOException declared for interface stability; nothing here throws it
	 */
	public static void prune(final HashMap<String, HashSet<String>> gwasVariants, final HashMap<String, GeneticVariant> varMap) throws IOException {

		final double variantLdThreshold = 0.1;

		gwasVariants.values().parallelStream().forEach((HashSet<String> variants) -> {

			ArrayList<String> variantsList = new ArrayList<>(variants);

			variants:
			for (int i = 0; i < variantsList.size(); ++i) {
				String var1Name = variantsList.get(i);
				GeneticVariant var1 = varMap.get(var1Name);
				if (var1 == null) {
					// Not in the reference panel; can't compute LD, drop it.
					variants.remove(var1Name);
					continue variants;
				}
				if (!var1.isBiallelic()) {
					variants.remove(var1Name);
					continue variants;
				}

				// BUGFIX: the inner loop started at "i + i" and read
				// variantsList.get(i), so every variant was LD-compared
				// against ITSELF (r2 == 1) and pruned. It must compare
				// against each LATER variant j.
				for (int j = i + 1; j < variantsList.size(); ++j) {
					String var2Name = variantsList.get(j);
					GeneticVariant var2 = varMap.get(var2Name);

					// Absent variants are skipped here; they are removed
					// from the set when the outer loop reaches them.
					if (var2 == null) {
						continue;
					}

					try {
						if (var2.isBiallelic() && var1.calculateLd(var2).getR2() >= variantLdThreshold) {
							// var1 is in LD with a later variant; remove var1.
							variants.remove(var1Name);
							continue variants;
						}
					} catch (LdCalculatorException ex) {
						throw new RuntimeException(ex);
					}

				}

			}

		});

	}

}
Loading

0 comments on commit da081f0

Please sign in to comment.