EBIvariation · apriltuesday · Oct 2, 2023 · Sep 29, 2023 · Sep 29, 2023 · Sep 29, 2023
diff --git a/README.md b/README.md
@@ -10,26 +10,43 @@ For instructions on how to process ClinVar data for the Open Targets platform, s
 
 ## Install
 
-The code requires Python 3.8+. You can install the library and its dependencies as follows (e.g. in a virtual environment):
+The code requires Python 3.8+, and you will also need Nextflow 21.10+ to run the pipelines. Refer to [Nextflow documentation](https://www.nextflow.io/docs/latest/getstarted.html) for specifics on installing Nextflow on your system.
 
+To install CMAT, first either clone the repository or download the latest released version from [here](https://github.com/EBIvariation/CMAT/releases):
 ```bash
 git clone git@github.com:EBIvariation/CMAT.git
+# OR
+wget -O CMAT.zip https://github.com/EBIvariation/CMAT/archive/refs/tags/v3.0.3.zip
+unzip CMAT.zip
+```
+
+Then install the library and its dependencies as follows (e.g. in a virtual environment):
+```bash
 cd CMAT
 pip install -r requirements.txt
 python setup.py install
 ```
 
-Running the pipelines also requires Nextflow 21.10+. Refer to [Nextflow documentation](https://www.nextflow.io/docs/latest/getstarted.html) for specifics on installing Nextflow on your system.
+You then need to set the `PYTHON_BIN` variable in the [Nextflow config](pipelines/nextflow.config), which will allow the
+Nextflow processes to access the correct Python executable.
 
-Finally, the pipelines currently require that the following environment variables be set:
+Finally, the instructions in this readme use the following environment variables as a convenience; they are not needed for the pipelines to run.
 ```bash
-# Path to directory where this repo is cloned
+# Path to directory where source code is downloaded
 export CODE_ROOT=
-# Path to python executable (allows nextflow processes to access python)
-export PYTHON_BIN=
 # Path to ontology mapping file (the provided path points to the version included in this repo)
 export LATEST_MAPPINGS=${CODE_ROOT}/mappings/latest_mappings.tsv
-````
+```
+
+To confirm everything is set up properly, you can run the annotation pipeline on the small dataset included with the tests.
+It should take a couple of minutes to run and will generate a file `annotated_clinvar.xml.gz` in the test directory.
+```bash
+mkdir testdir && cd testdir
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
+  --output_dir . \
+  --clinvar ${CODE_ROOT}/tests/output_generation/resources/end2end/input.xml.gz \
+  --mappings ${LATEST_MAPPINGS}
+```
 
 ## Run
 
@@ -51,7 +68,7 @@ cd ${ANNOTATION_ROOT}
 mkdir -p gene_mapping logs
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
   --output_dir ${ANNOTATION_ROOT} \
   --mappings ${LATEST_MAPPINGS} \
   -resume
@@ -81,7 +98,7 @@ mkdir -p ${CURATION_ROOT}
 cd ${CURATION_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/generate.nf \
+nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \
   --curation_root ${CURATION_ROOT} \
   --mappings ${LATEST_MAPPINGS} \
   --comments ${CURATOR_COMMENTS} \
@@ -108,7 +125,7 @@ Download the spreadsheet as a CSV file, making sure that all the data is visible
 cd ${CURATION_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/export.nf \
+nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \
   --input_csv ${CURATION_ROOT}/finished_curation_spreadsheet.csv \
   --curation_root ${CURATION_ROOT} \
   --mappings ${LATEST_MAPPINGS} \

diff --git a/docs/manual-curation/step1-fetch-clinvar-data.md b/docs/manual-curation/step1-fetch-clinvar-data.md
@@ -11,7 +11,7 @@ mkdir -p ${CURATION_RELEASE_ROOT}
 cd ${CURATION_RELEASE_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/generate.nf \
+nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \
   --curation_root ${CURATION_RELEASE_ROOT} \
   -resume
 ```

diff --git a/docs/manual-curation/step3-export-results.md b/docs/manual-curation/step3-export-results.md
@@ -14,7 +14,7 @@ Once the manual curation is completed, download the spreadsheet as a CSV file, m
 cd ${CURATION_RELEASE_ROOT}
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/trait_mapping/export.nf \
+nextflow run ${CODE_ROOT}/pipelines/export_curation_spreadsheet.nf \
   --input_csv ${CURATION_RELEASE_ROOT}/finished_curation_spreadsheet.csv \
   --curation_root ${CURATION_RELEASE_ROOT} \
   --with_feedback \

diff --git a/docs/open-targets/generate-evidence-strings.md b/docs/open-targets/generate-evidence-strings.md
@@ -32,7 +32,7 @@ cd ${BATCH_ROOT}
 mkdir -p clinvar gene_mapping evidence_strings logs
 
 # Run the nextflow pipeline, resuming execution of previous attempt if possible.
-nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
   --output_dir ${BATCH_ROOT} \
   --schema ${OT_SCHEMA_VERSION} \
   -resume

diff --git a/cmat/output_generation/pipeline.nf → pipelines/annotation_pipeline.nf b/cmat/output_generation/pipeline.nf → pipelines/annotation_pipeline.nf
@@ -30,6 +30,7 @@ if (!params.output_dir) {
     exit 1, helpMessage()
 }
 batchRoot = params.output_dir
+codeRoot = "${projectDir}/.."
 
 
 /*
@@ -125,15 +126,15 @@ process runSnpIndel {
 
     script:
     """
-    \${PYTHON_BIN} "\${CODE_ROOT}/bin/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \
+    \${PYTHON_BIN} "${codeRoot}/bin/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \
     | sort -u \
     | parallel \
         --halt now,fail=1    `# If any job fails, kill the remaining ones immediately and report failure` \
         --pipe               `# Input is read from STDIN and split by chunks`                             \
         -j 20                `# Number of concurrent workers`                                             \
         -N 200               `# Number of records (lines) per worker`                                     \
         --tmpdir .           `# Store temporary files in the current directory to avoid /tmp overflow`    \
-        \${PYTHON_BIN} "\${CODE_ROOT}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \
+        \${PYTHON_BIN} "${codeRoot}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \
     | sort -u > consequences_snp.tsv
     """
 }
@@ -158,7 +159,7 @@ process runRepeat {
 
    script:
    """
-   \${PYTHON_BIN} \${CODE_ROOT}/bin/consequence_prediction/run_repeat_expansion_variants.py \
+   \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_repeat_expansion_variants.py \
         --clinvar-xml ${clinvarXml} \
         --output-consequences consequences_repeat.tsv
 
@@ -188,7 +189,7 @@ process runStructural {
 
    script:
    """
-   \${PYTHON_BIN} \${CODE_ROOT}/bin/consequence_prediction/run_structural_variants.py \
+   \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_structural_variants.py \
         --clinvar-xml ${clinvarXml} \
         --output-consequences consequences_structural.tsv
 
@@ -227,7 +228,7 @@ process mapGenes {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_genes.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evaluation/map_genes.py \
         --clinvar-xml ${clinvarXml} \
         --output-file output_gene_mappings.tsv
     """
@@ -245,7 +246,7 @@ process mapXrefs {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_xrefs.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evaluation/map_xrefs.py \
         --clinvar-xml ${clinvarXml} \
         --output-file output_xref_mappings.tsv
     """
@@ -260,7 +261,7 @@ process checkLatestMappings {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/check_latest_mappings.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evaluation/check_latest_mappings.py \
         --latest-mappings ${params.mappings} \
         --output-file output_eval_latest.tsv
     """
@@ -293,7 +294,7 @@ process generateAnnotatedXml {
     def evalXrefFlag = evalXrefMapping != file("empty2")? "--eval-xref-file ${evalXrefMapping}" : ""
     def evalLatestFlag = evalLatest != file("empty3")? "--eval-latest-file ${evalLatest}" : ""
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/generate_annotated_xml.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/generate_annotated_xml.py \
         --clinvar-xml ${clinvarXml} \
         --efo-mapping ${params.mappings} \
         --gene-mapping ${consequenceMappings} \
@@ -324,7 +325,7 @@ process generateEvidence {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/evidence_string_generation.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/evidence_string_generation.py \
         --clinvar-xml ${clinvarXml} \
         --efo-mapping ${params.mappings} \
         --gene-mapping ${consequenceMappings} \
@@ -368,7 +369,7 @@ process convertXrefs {
     path "clinvar_xrefs.txt", emit: clinvarXrefs
 
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/traits_to_zooma_format.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/traits_to_zooma_format.py \
         --clinvar-xml ${clinvarXml} \
         --zooma-feedback clinvar_xrefs.txt
     """

diff --git a/cmat/trait_mapping/export.nf → pipelines/export_curation_spreadsheet.nf b/cmat/trait_mapping/export.nf → pipelines/export_curation_spreadsheet.nf
@@ -28,6 +28,7 @@ if (!params.curation_root or !params.input_csv) {
     exit 1, helpMessage()
 }
 curationRoot = params.curation_root
+codeRoot = "${projectDir}/.."
 
 
 /*
@@ -63,7 +64,7 @@ process exportTable {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/export_curation_table.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/export_curation_table.py \
         -i ${params.input_csv} \
         -d finished_mappings_curation.tsv \
         -m terms_for_efo_import.txt \
@@ -143,7 +144,7 @@ process createEfoTable {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/create_efo_table.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_efo_table.py \
         -i ${importTerms} \
         -o efo_import_table.tsv
     """

diff --git a/cmat/trait_mapping/generate.nf → pipelines/generate_curation_spreadsheet.nf b/cmat/trait_mapping/generate.nf → pipelines/generate_curation_spreadsheet.nf
@@ -32,6 +32,7 @@ if (!params.curation_root) {
     exit 1, helpMessage()
 }
 curationRoot = params.curation_root
+codeRoot = "${projectDir}/.."
 
 
 /*
@@ -78,7 +79,7 @@ process parseTraits {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/parse_traits.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/parse_traits.py \
         -i ${clinvarXml} \
         -o parsed_traits.csv
     """
@@ -116,7 +117,7 @@ process processTraits {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/process_traits.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/process_traits.py \
         -i ${traitChunk} \
         -o automated_traits_${traitChunk}.tsv \
         -c curation_traits_${traitChunk}.tsv
@@ -178,7 +179,7 @@ process createCurationTable {
 
     script:
     """
-    \${PYTHON_BIN} \${CODE_ROOT}/bin/trait_mapping/create_table_for_manual_curation.py \
+    \${PYTHON_BIN} ${codeRoot}/bin/trait_mapping/create_table_for_manual_curation.py \
         --traits-for-curation ${curationTraits} \
         --previous-mappings ${params.mappings} \
         --previous-comments ${params.comments} \

diff --git a/pipelines/nextflow.config b/pipelines/nextflow.config
@@ -0,0 +1,3 @@
+env {
+    PYTHON_BIN = 'python'
+}
diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@ def get_requires():
     long_description = fh.read()
 
 setup(name='cmat',
-      version='3.0.3',
+      version='3.0.4.dev0',
       author_email='opentargets-clinvar@ebi.ac.uk',
       url='https://github.com/EBIvariation/eva-opentargets',
       packages=find_packages(),

diff --git a/tests/output_generation/evaluation/test_ols_utils.py b/tests/output_generation/evaluation/test_ols_utils.py
@@ -2,12 +2,12 @@
 
 
 def test_fetch_eval_data():
-    expected = ('MONDO:0004975', False, {'MONDO:0004975', 'Orphanet:238616'})
+    expected = ('MONDO:0004975', False, {'MONDO:0004975'})
     assert fetch_eval_data(db_iden=('MONDO', 'MONDO:0004975')) == expected
     assert fetch_eval_data(uri='http://purl.obolibrary.org/obo/MONDO_0004975') == expected
 
 
 def test_fetch_eval_data_include_neighbors():
-    expected = ('MONDO:0004975', False, {'MONDO:0004975', 'Orphanet:238616'},
+    expected = ('MONDO:0004975', False, {'MONDO:0004975'},
                 {'EFO:0005815', 'MONDO:0001627'}, {'MONDO:0100087', 'EFO:1001870'})
     assert fetch_eval_data(db_iden=('MONDO', 'MONDO:0004975'), include_neighbors=True) == expected
diff --git a/tests/output_generation/test_pipeline.sh b/tests/output_generation/test_pipeline.sh
@@ -6,15 +6,14 @@ export LC_COLLATE=C
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 export CODE_ROOT="$(dirname $(dirname "${SCRIPT_DIR}"))"
 
-export PYTHON_BIN=python
 export BATCH_ROOT_BASE=${SCRIPT_DIR}/resources/end2end
 
 CWD=${PWD}
 BATCH_ROOT=${BATCH_ROOT_BASE}/test_batch
 mkdir -p ${BATCH_ROOT}
 cd ${BATCH_ROOT}
 
-nextflow run ${CODE_ROOT}/cmat/output_generation/pipeline.nf \
+nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
   --output_dir ${BATCH_ROOT} \
   --schema $(cat "${CODE_ROOT}/OT_SCHEMA_VERSION") \
   --clinvar ${BATCH_ROOT_BASE}/input.xml.gz \