Skip to content

Commit

Permalink
Merge pull request #636 from PatrickDeelen/master
Browse files Browse the repository at this point in the history
Replaced .dat matrix with lz4 row compressed .datg matrix.
  • Loading branch information
harmjanwestra authored Nov 14, 2022
2 parents a7b88b7 + bf2f099 commit ce7e048
Show file tree
Hide file tree
Showing 32 changed files with 1,782 additions and 402 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ deconvolutionTestResults/
.Rhistory
/DEPICT2/src/main/r/downstreamer_main/downstreamer_main.Rproj
Downstreamer/src/main/r/downstreamer_main/.remoterserverlog
Genotype-Harmonizer/nb-configuration.xml
Genotype-Harmonizer/nb-configuration.xml
2 changes: 1 addition & 1 deletion Downstreamer/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<version>1.0.4-SNAPSHOT</version>
</parent>
<artifactId>Downstreamer</artifactId>
<version>1.30-SNAPSHOT</version>
<version>1.31-SNAPSHOT</version>
<packaging>jar</packaging>
<build>
<resources>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import nl.systemsgenetics.downstreamer.development.CorrelateExpressionToPredictions;
import nl.systemsgenetics.downstreamer.development.First1000qtl;
import nl.systemsgenetics.downstreamer.io.DatToDatg;
import nl.systemsgenetics.downstreamer.io.ExcelWriter;
import nl.systemsgenetics.downstreamer.pathway.PredictedPathwayAnnotations;

Expand Down Expand Up @@ -239,7 +240,10 @@ public static void main(String[] args) throws InterruptedException {
break;
case PREPARE_GENE_PVALUES:
PrepareExternalGenePvalues.prepare(options);

break;
case CONVERT_DAT_TO_DATG:
DatToDatg.convert(options);
break;
}
} catch (TabixFileNotFoundException e) {
System.err.println("Problem running mode: " + options.getMode());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ public enum DownstreamerMode {
STEP2,
CONVERT_TXT,
CONVERT_BIN,
CONVERT_DAT_TO_DATG,
CONVERT_EQTL,
CONVERT_GTEX,
CONVERT_TXT_MERGE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ public DownstreamerOptions(String... args) throws ParseException {
throw new ParseException("Could not parse -cwe as integerer: " + commandLine.getOptionValue("cwe"));
}

if (mode == DownstreamerMode.STEP2 || mode == DownstreamerMode.CONVERT_TXT || mode == DownstreamerMode.CONVERT_TXT_MERGE || mode == DownstreamerMode.STEP1 || mode == DownstreamerMode.GET_NORMALIZED_GENEP || mode == DownstreamerMode.CONVERT_EQTL || mode == DownstreamerMode.FIRST1000 || mode == DownstreamerMode.CONVERT_GTEX || mode == DownstreamerMode.CONVERT_BIN || mode == DownstreamerMode.SPECIAL || mode == DownstreamerMode.CORRELATE_GENES || mode == DownstreamerMode.TRANSPOSE || mode == DownstreamerMode.CONVERT_EXP || mode == DownstreamerMode.MERGE_BIN || mode == DownstreamerMode.PCA || mode == DownstreamerMode.INVESTIGATE_NETWORK || mode == DownstreamerMode.PTOZSCORE || mode == DownstreamerMode.R_2_Z_SCORE || mode == DownstreamerMode.TOP_HITS || mode == DownstreamerMode.GET_PATHWAY_LOADINGS || mode == DownstreamerMode.REMOVE_CIS_COEXP || mode == DownstreamerMode.SUBSET_MATRIX || mode == DownstreamerMode.GET_MARKER_GENES || mode == DownstreamerMode.PREPARE_GENE_PVALUES) {
if (mode == DownstreamerMode.STEP2 || mode == DownstreamerMode.CONVERT_TXT || mode == DownstreamerMode.CONVERT_TXT_MERGE || mode == DownstreamerMode.STEP1 || mode == DownstreamerMode.GET_NORMALIZED_GENEP || mode == DownstreamerMode.CONVERT_EQTL || mode == DownstreamerMode.FIRST1000 || mode == DownstreamerMode.CONVERT_GTEX || mode == DownstreamerMode.CONVERT_BIN || mode == DownstreamerMode.SPECIAL || mode == DownstreamerMode.CORRELATE_GENES || mode == DownstreamerMode.TRANSPOSE || mode == DownstreamerMode.CONVERT_EXP || mode == DownstreamerMode.MERGE_BIN || mode == DownstreamerMode.PCA || mode == DownstreamerMode.INVESTIGATE_NETWORK || mode == DownstreamerMode.PTOZSCORE || mode == DownstreamerMode.R_2_Z_SCORE || mode == DownstreamerMode.TOP_HITS || mode == DownstreamerMode.GET_PATHWAY_LOADINGS || mode == DownstreamerMode.REMOVE_CIS_COEXP || mode == DownstreamerMode.SUBSET_MATRIX || mode == DownstreamerMode.GET_MARKER_GENES || mode == DownstreamerMode.PREPARE_GENE_PVALUES || mode == DownstreamerMode.CONVERT_DAT_TO_DATG) {

if (!commandLine.hasOption("g")) {
throw new ParseException("Please provide --gwas for mode: " + mode.name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Set;
import nl.systemsgenetics.downstreamer.DownstreamerOptions;
import org.apache.log4j.Logger;
import umcg.genetica.math.matrix2.DoubleMatrixDataset;
Expand All @@ -29,8 +30,8 @@ public static void run(DownstreamerOptions options) throws IOException, Exceptio

DoubleMatrixDatasetFastSubsetLoader expressionDataLoader = new DoubleMatrixDatasetFastSubsetLoader(options.getY());

LinkedHashSet<String> sharedGenes = new LinkedHashSet<>(corePredictionZscoresLoader.getOriginalRowMap().keySet());
sharedGenes.retainAll(expressionDataLoader.getOriginalRowMap().keySet());
Set<String> sharedGenes = new LinkedHashSet<>(corePredictionZscoresLoader.getOriginalRowMap());
sharedGenes.retainAll(expressionDataLoader.getOriginalRowMap());

DoubleMatrixDataset<String, String> corePredictionZscores = corePredictionZscoresLoader.loadSubsetOfRowsBinaryDoubleData(sharedGenes);
DoubleMatrixDataset<String, String> expressionData = expressionDataLoader.loadSubsetOfRowsBinaryDoubleData(sharedGenes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
import nl.systemsgenetics.downstreamer.DownstreamerOptions;
import umcg.genetica.math.matrix2.DoubleMatrixDataset;
import umcg.genetica.math.matrix2.DoubleMatrixDatasetFastSubsetLoader;
Expand All @@ -32,10 +32,10 @@ public static void printFirst1000(DownstreamerOptions options) throws IOExceptio

ArrayList<String> rowsToOutput = new ArrayList<>();

Map<String, Integer> rowMap = loader.getOriginalRowMap();
Set<String> rowMap = loader.getOriginalRowMap();

int i = 0;
for(String row : rowMap.keySet()){
for(String row : rowMap){
if(i++ > 1000){
break;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nl.systemsgenetics.downstreamer.io;

import com.google.common.io.Files;
import java.io.File;
import java.io.IOException;
import java.util.Random;
import nl.systemsgenetics.downstreamer.DownstreamerOptions;
import org.apache.commons.math3.util.Precision;
import org.apache.log4j.Logger;
import umcg.genetica.math.matrix2.DoubleMatrixDataset;

/**
 * Re-saves a binary matrix that is stored as {@code <base>.dat} with its
 * {@code <base>.rows.txt} and {@code <base>.cols.txt} companions, and verifies
 * that the re-saved matrix is identical to the original before the original
 * files are removed.
 *
 * @author patri
 */
public class DatToDatg {

	private static final Logger LOGGER = Logger.getLogger(DatToDatg.class);

	/**
	 * Converts the matrix found at {@code options.getGwasZscoreMatrixPath()}.
	 *
	 * The three original files are first moved to a freshly created
	 * {@code tmpdir_<number>} directory next to the matrix, the matrix is loaded
	 * from there and saved again under the original base path, and the saved
	 * result is loaded once more and compared to the in-memory original. Only
	 * when that comparison succeeds are the moved originals deleted.
	 *
	 * @param options the options supplying the matrix path; a trailing
	 * {@code .dat} on the path is optional
	 * @throws IOException if an input file is missing, the tmp dir cannot be
	 * created, moving / reading / writing fails, or the re-saved matrix differs
	 * from the original. When the failure happens after files were moved, the
	 * originals are left in the tmp dir
	 */
	public static void convert(DownstreamerOptions options) throws IOException {

		String inputMatrix = options.getGwasZscoreMatrixPath();

		if (inputMatrix.endsWith(".dat")) {
			inputMatrix = inputMatrix.substring(0, inputMatrix.length() - 4);
		}

		final File originalDat = new File(inputMatrix + ".dat");
		final File originalRow = new File(inputMatrix + ".rows.txt");
		final File originalCol = new File(inputMatrix + ".cols.txt");

		LOGGER.info("Original " + originalDat.getAbsolutePath());

		// Verify everything is present before touching anything, otherwise a missing
		// rows / cols file would leave the matrix half-moved into the tmp dir.
		for (File required : new File[]{originalDat, originalRow, originalCol}) {
			if (!required.isFile()) {
				throw new IOException("Input file not found: " + required.getAbsolutePath());
			}
		}

		// getAbsoluteFile() so that a bare relative file name still yields a parent directory.
		final File workdir = new File(inputMatrix).getAbsoluteFile().getParentFile();

		// nextInt(bound) is never negative; Math.abs(nextInt()) is negative for Integer.MIN_VALUE.
		final File tmpDir = new File(workdir, "tmpdir_" + new Random().nextInt(Integer.MAX_VALUE));

		// mkdir() also returns false when the directory already exists; do not reuse such a
		// directory because its content would be mixed with (and deleted alongside) our files.
		if (!tmpDir.mkdir()) {
			throw new IOException("Could not create tmp dir: " + tmpDir.getAbsolutePath());
		}

		LOGGER.info("Tmp dir: " + tmpDir.getAbsolutePath());

		final File originalDatTmp = new File(tmpDir, originalDat.getName());
		final File originalRowTmp = new File(tmpDir, originalRow.getName());
		final File originalColTmp = new File(tmpDir, originalCol.getName());

		try {

			Files.move(originalDat, originalDatTmp);
			Files.move(originalRow, originalRowTmp);
			Files.move(originalCol, originalColTmp);

			DoubleMatrixDataset<String, String> data = DoubleMatrixDataset.loadDoubleBinaryData(originalDatTmp.getAbsolutePath());

			data.saveBinary(inputMatrix);

			DoubleMatrixDataset<String, String> newData = DoubleMatrixDataset.loadDoubleBinaryData(inputMatrix);

			compareTwoMatrices(data, newData, 0);//This will throw IO exception if not equal

		} catch (IOException | RuntimeException e) {
			// Nothing is deleted on failure; tell the user where the original data now lives.
			LOGGER.error("Conversion failed, original files that were already moved are kept in: " + tmpDir.getAbsolutePath());
			throw e;
		}

		LOGGER.info("New file is identical to original");

		deleteOrWarn(originalDatTmp);
		deleteOrWarn(originalRowTmp);
		deleteOrWarn(originalColTmp);

		// listFiles() returns null on an I/O error or when the path is not a directory.
		final File[] leftovers = tmpDir.listFiles();
		if (leftovers != null && leftovers.length == 0) {
			deleteOrWarn(tmpDir);
		}

	}

	/**
	 * Deletes a file or empty directory and logs a warning instead of failing silently.
	 */
	private static void deleteOrWarn(File file) {
		if (!file.delete()) {
			LOGGER.warn("Could not delete: " + file.getAbsolutePath());
		}
	}

	/**
	 * Compares two matrices using a tolerance of {@code 0.00000001} per element.
	 *
	 * @param m1 first matrix
	 * @param m2 second matrix
	 * @throws IOException if the matrices differ, see
	 * {@link #compareTwoMatrices(DoubleMatrixDataset, DoubleMatrixDataset, double)}
	 */
	public static void compareTwoMatrices(DoubleMatrixDataset<String, String> m1, DoubleMatrixDataset<String, String> m2) throws IOException {

		compareTwoMatrices(m1, m2, 0.00000001);

	}

	/**
	 * Compares dimensions, row names, column names and every element of two matrices.
	 *
	 * Elements are compared with {@code Precision.equalsIncludingNaN}, so two NaN
	 * values at the same position count as equal.
	 *
	 * @param m1 first matrix
	 * @param m2 second matrix
	 * @param delta tolerance passed to {@code Precision.equalsIncludingNaN} for each element
	 * @throws IOException on the first difference found; the message names what
	 * differs and, for element differences, the row and column index
	 */
	public static void compareTwoMatrices(DoubleMatrixDataset<String, String> m1, DoubleMatrixDataset<String, String> m2, double delta) throws IOException {

		final int rowCount = m1.rows();
		final int colCount = m1.columns();

		if (rowCount != m2.rows()) {
			throw new IOException("Rows not equal");
		}
		if (colCount != m2.columns()) {
			throw new IOException("Cols not equal");
		}

		if (!m1.getRowObjects().equals(m2.getRowObjects())) {
			throw new IOException("Row names not equal");
		}

		if (!m1.getColObjects().equals(m2.getColObjects())) {
			throw new IOException("Col names not equal");
		}

		for (int r = 0; r < rowCount; ++r) {
			for (int c = 0; c < colCount; ++c) {
				if (!Precision.equalsIncludingNaN(m1.getElementQuick(r, c), m2.getElementQuick(r, c), delta)) {
					throw new IOException("Difference at r: " + r + " c: " + c);
				}
			}
		}

	}

}
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ public CholeskyBasedPathwayEnrichments(final PathwayDatabase pathwayDatabase,
}

// Determine final set of genes to analyze and overlap with genes in pathway matrix
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap().keySet();
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap();
sharedGenes = new LinkedHashSet<>();

for (String gene : genesWithPvalue) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ public PathwayEnrichments(final PathwayDatabase pathwayDatabase,
}

// Determine final set of genes to analyze and overlap with genes in pathway matrix
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap().keySet();
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap();
sharedGenes = new LinkedHashSet<>();

for (String gene : genesWithPvalue) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ private static void calculateBetas(PathwayDatabase pathwayDatabase, final HashSe

pathwayMatrixLoader = new DoubleMatrixDatasetFastSubsetLoader(pathwayDatabase.getLocation());

Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap().keySet();
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap();

sharedGenes = new LinkedHashSet<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,20 @@ public static void expandAnnotations(DownstreamerOptions options) throws Excepti
final ArrayList<String> overlappingGenes;
if (fromScratch) {

overlappingGenes = new ArrayList<>(predictionMatrixLoader.getOriginalRowMap().keySet());
overlappingGenes = new ArrayList<>(predictionMatrixLoader.getOriginalRowMap());
overlappingGenes.retainAll(genes.keySet());

pathwayMatrix = new DoubleMatrixDataset<>(overlappingGenes, predictionMatrixLoader.getOriginalColMap().keySet());
pathwayMatrix = new DoubleMatrixDataset<>(overlappingGenes, predictionMatrixLoader.getOriginalColMap());
pathwayMatrix2 = pathwayMatrix;

} else {
DoubleMatrixDatasetFastSubsetLoader pathwayMatrixLoader = new DoubleMatrixDatasetFastSubsetLoader(pd.getLocation());
overlappingGenes = new ArrayList<>(pathwayMatrixLoader.getOriginalRowMap().keySet());
overlappingGenes = new ArrayList<>(pathwayMatrixLoader.getOriginalRowMap());
overlappingGenes.retainAll(genes.keySet());
//First load all genes in gene file so that they will be in output file
pathwayMatrix = pathwayMatrixLoader.loadSubsetOfRowsBinaryDoubleData(overlappingGenes);

overlappingGenes.retainAll(predictionMatrixLoader.getOriginalRowMap().keySet());
overlappingGenes.retainAll(predictionMatrixLoader.getOriginalRowMap());

//subset pathways to overlap with predictions
pathwayMatrix2 = pathwayMatrix.viewRowSelection(overlappingGenes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,12 +428,12 @@ public static void mergeBinMatrix(DownstreamerOptions options) throws IOExceptio
for (DoubleMatrixDatasetFastSubsetLoader datasetLoader : binMatrices) {
// Put the variant set in memory to avoid having to loop it later on
if (rowNameIntersection.isEmpty()) {
rowNameIntersection.addAll(datasetLoader.getOriginalRowMap().keySet());
rowNameIntersection.addAll(datasetLoader.getOriginalRowMap());
} else {
rowNameIntersection.retainAll(datasetLoader.getOriginalRowMap().keySet());
rowNameIntersection.retainAll(datasetLoader.getOriginalRowMap());
}

for (String newCol : datasetLoader.getOriginalColMap().keySet()) {
for (String newCol : datasetLoader.getOriginalColMap()) {

if (mergedColNames.contains(newCol)) {
int i = 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ public static DownstreamerStep3Results step3(DownstreamerOptions options) throws

// GWAS pvalues
//DoubleMatrixDataset<String, String> gwasSnpZscores = DoubleMatrixDataset.loadDoubleBinaryData(options.getGwasZscoreMatrixPath());
Set<String> traits = new DoubleMatrixDatasetFastSubsetLoader(options.getOutputBasePath() + "_genePvalues").getOriginalColMap().keySet();
Set<String> traits = new DoubleMatrixDatasetFastSubsetLoader(options.getOutputBasePath() + "_genePvalues").getOriginalColMap();

// Gene info
IntervalTreeMap<Gene> genes = IoUtils.readGenesAsIntervalTree(options.getGeneInfoFile());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ public static void correlateGenes(DownstreamerOptions options) throws FileNotFou
} else {

DoubleMatrixDatasetFastSubsetLoader loader = new DoubleMatrixDatasetFastSubsetLoader(options.getGwasZscoreMatrixPath());
Map<String, Integer> rows = loader.getOriginalRowMap();
Set<String> rows = loader.getOriginalRowMap();

genes.retainAll(rows.keySet());
genes.retainAll(rows);

expressionMatrix = loader.loadSubsetOfRowsBinaryDoubleData(genes);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public static void investigateNetworkPatrick(DownstreamerOptions options) throws

DoubleMatrixDatasetFastSubsetLoader networkLoader = new DoubleMatrixDatasetFastSubsetLoader(options.getGwasZscoreMatrixPath());

Set<String> genesInNetwork = networkLoader.getOriginalRowMap().keySet();
Set<String> genesInNetwork = networkLoader.getOriginalRowMap();

ArrayList<String> geneNames = new ArrayList<>(genes.size());
for(Gene gene : genes){
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ private static Map<String, Map<String, List<PathwayDatabaseEnrichmentRecord>>> t

// Determine genes in the pathway
DoubleMatrixDatasetFastSubsetLoader pathwayMatrixLoader = new DoubleMatrixDatasetFastSubsetLoader(curTarget.getLocation());
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap().keySet();
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap();

// Determine the overlapping genes
Set<String> overlappingGenes = new HashSet<>(queryGenes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ private static void testPredictions(PathwayEnrichments step2Enrichment, List<Pat

final DoubleMatrixDatasetFastSubsetLoader pathwayMatrixLoader = new DoubleMatrixDatasetFastSubsetLoader(pathwayDatabase2.getLocation());

Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap().keySet();
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap();

final LinkedHashSet<String> sharedGenes = new LinkedHashSet<>();

Expand Down Expand Up @@ -337,7 +337,7 @@ private static void testPredictionsGenePvalues(DoubleMatrixDataset<String, Strin

final DoubleMatrixDatasetFastSubsetLoader pathwayMatrixLoader = new DoubleMatrixDatasetFastSubsetLoader(pathwayDatabase2.getLocation());

Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap().keySet();
Set<String> pathwayGenes = pathwayMatrixLoader.getOriginalRowMap();

final LinkedHashSet<String> sharedGenes = new LinkedHashSet<>();
final double bonfSigThreshold = 0.05d / sharedGenes.size();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1616,9 +1616,9 @@ combinedMeta$Tissue[combinedMeta$Cohort == "GSA"]



#save(combinedMeta, file = "combinedMeta_2022_09_15.RData")
#save(combinedMeta, file = "Metadata/combinedMeta_2022_09_15.RData")

load(file = "combinedMeta_2022_08_19.RData")
load(file = "Metadata/combinedMeta_2022_09_15.RData")

pcsAndMeta <- merge(expPcs[,1:100], combinedMeta, by = 0, all.x = T)
dim(pcsAndMeta)
Expand Down
Loading

0 comments on commit ce7e048

Please sign in to comment.