Skip to content

Commit

Permalink
refactor and add some documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
tcezard committed Jan 29, 2024
1 parent da83513 commit 2c259d8
Show file tree
Hide file tree
Showing 7 changed files with 335 additions and 5 deletions.
55 changes: 55 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,57 @@
# eva-sub-cli
EVA Submission Command Line Interface for Validation




## Installation


## Input files for the validation and submission tool

### The VCF file and association with reference genome

The paths to the VCF files are provided via a CSV file that links each VCF to its respective fasta sequence. This allows
us to support different assemblies for each VCF file.
The CSV file `vcf_mapping.csv` contains the following columns: vcf, fasta and report, providing respectively:
- The VCF to validate/upload
- The assembly in fasta format that was used to derive the VCF
- The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)


Example:
```shell
vcf,fasta,report
/full/path/to/vcf_file1.vcf,/full/path/to/genome.fa,/full/path/to/genome_assembly_report.txt
/full/path/to/vcf_file2.vcf,/full/path/to/genome.fa,/full/path/to/genome_assembly_report.txt
/full/path/to/vcf_file3.vcf,/full/path/to/genome2.fa,/full/path/to/genome_assembly_report2.txt
```

### The metadata spreadsheet

The metadata template can be found within the etc folder at `eva_sub_cli/etc/EVA_Submission_template.xlsx`.
It should be populated following the instructions provided within the template.

## Execution

### Validate and submit your dataset

To validate and submit, run the following command:

```shell
eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
--vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir
```

### Validate only

To validate and not submit, run the following command:

```shell
eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
--vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir \
--tasks VALIDATE
```
### Submit only

All submissions must have been validated. You cannot run the submission without validation.
Binary file added eva_sub_cli/etc/EVA_Submission_template.xlsx
Binary file not shown.
275 changes: 275 additions & 0 deletions eva_sub_cli/native_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
import argparse
import csv
import os
import re
import subprocess
import time

from ebi_eva_common_pyutils.command_utils import run_command_with_output

from eva_sub_cli import ETC_DIR
from eva_sub_cli.reporter import Reporter
from ebi_eva_common_pyutils.logger import logging_config

# Module-level logger, named after this module and obtained from ebi_eva_common_pyutils' logging_config.
logger = logging_config.get_logger(__name__)

# Default docker executable; resolved through the PATH.
docker_path = "docker"

# Image (repository and tag) that the validation container is started from.
container_image = "ebivariation/eva-sub-cli"
container_tag = "v0.0.1.dev2"

# Locations inside the container: where the input files are copied to, where the
# validation pipeline writes its results, and where the schema/configuration files live.
container_validation_dir = "/opt/vcf_validation"
container_validation_output_dir = f"{container_validation_dir}/vcf_validation_output"
container_etc_dir = "/opt/eva_sub_cli/etc"

# Name of the sub-directory of the user's output directory that receives the validation results.
VALIDATION_OUTPUT_DIR = 'validation_output'


class NativeValidator(Reporter):
    """Validate the VCF files listed in a mapping file by running the Nextflow validation in a docker container.

    The mapping file is a CSV with the columns ``vcf``, ``fasta`` and ``report``. All referenced files, together
    with the metadata (JSON or Excel spreadsheet), are copied into the container, the validation workflow is
    executed there with ``docker exec`` and its output directory is copied back to the host.
    """

    def __init__(self, mapping_file, output_dir, metadata_json=None,
                 metadata_xlsx=None, submission_config=None, container_name=None, docker_path='docker'):
        """
        :param mapping_file: CSV file linking each VCF to its fasta and assembly report
        :param output_dir: directory under which the ``VALIDATION_OUTPUT_DIR`` sub-directory is used for the results
        :param metadata_json: JSON metadata file (used when provided, even if metadata_xlsx is also set)
        :param metadata_xlsx: Excel metadata file (used only when metadata_json is not set)
        :param submission_config: forwarded unchanged to the Reporter constructor
        :param container_name: name of the docker container; derived from the image name and tag when not set
        :param docker_path: docker executable to invoke; falls back to 'docker' when empty or None
        """
        self.mapping_file = mapping_file
        self.metadata_json = metadata_json
        self.metadata_xlsx = metadata_xlsx
        self.docker_path = docker_path or 'docker'
        # The image name contains a '/', which docker does not accept in container names, so the default
        # name is built from the last component of the image name and the tag.
        self.container_name = container_name or (container_image.split('/')[-1] + '.' + container_tag)

        self.spreadsheet2json_conf = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
        # validator write to the validation output directory
        # If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
        super().__init__(self._find_vcf_file(), os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
                         submission_config=submission_config)

    def _validate(self):
        """Run the validation (presumably the hook invoked by Reporter.validate() - confirm in Reporter)."""
        self.run_docker_validator()

    def _find_vcf_file(self):
        """Return the list of paths found in the ``vcf`` column of the mapping file."""
        with open(self.mapping_file) as open_file:
            return [row['vcf'] for row in csv.DictReader(open_file, delimiter=',')]

    def get_docker_validation_cmd(self):
        """Build the ``docker exec`` command line that runs the Nextflow validation inside the container.

        :raises RuntimeError: when neither a JSON nor an Excel metadata file was provided
        """
        if self.metadata_xlsx and not self.metadata_json:
            metadata_options = (
                f"--metadata_xlsx {container_validation_dir}/{self.metadata_xlsx} "
                f"--conversion_configuration {container_validation_dir}/{self.spreadsheet2json_conf} "
            )
        elif self.metadata_json:
            metadata_options = f"--metadata_json {container_validation_dir}/{self.metadata_json} "
        else:
            # Without this guard the command would be built with the literal path ".../None"
            raise RuntimeError("Either a json or an excel metadata file is required to run the validation")
        docker_cmd = (
            f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
            f"--vcf_files_mapping {container_validation_dir}/{self.mapping_file} "
            f"{metadata_options}"
            f"--schema_dir {container_etc_dir} "
            f"--output_dir {container_validation_output_dir}"
        )
        logger.info(f"Validation command: {docker_cmd}")
        return docker_cmd

    def run_docker_validator(self):
        """Check the inputs, prepare the container, run the validation and copy the results back to the host.

        :raises RuntimeError: when the mapping file or one of the files it references is missing, or when the
            docker environment cannot be set up
        """
        # verify mapping file exists
        if not os.path.exists(self.mapping_file):
            raise RuntimeError(f'Mapping file {self.mapping_file} not found')

        # verify all files mentioned in metadata files exist
        files_missing, missing_files_list = self.check_if_file_missing()
        if files_missing:
            raise RuntimeError(f"some files (vcf/fasta) mentioned in metadata file could not be found. "
                               f"Missing files list {missing_files_list}")

        # check if docker container is ready for running validation
        self.verify_docker_env()

        try:
            # remove all existing files from container
            run_command_with_output(
                "Remove existing files from validation directory in container",
                f"{self.docker_path} exec {self.container_name} rm -rf work {container_validation_dir}"
            )

            # copy all required files to container (mapping file, vcf and fasta)
            self.copy_files_to_container()

            docker_cmd = self.get_docker_validation_cmd()
            # start validation
            # FIXME: If nextflow fails in the docker exec still exit with error code 0
            run_command_with_output("Run Validation using Nextflow", docker_cmd)
            # copy validation result to user host
            run_command_with_output(
                "Copy validation output from container to host",
                f"{self.docker_path} cp {self.container_name}:{container_validation_output_dir} {self.output_dir}"
            )
        except subprocess.CalledProcessError as ex:
            # Failures of the docker commands are only logged, they are not propagated to the caller
            logger.error(ex)

    def check_if_file_missing(self):
        """Check that every file referenced in the mapping file exists on the host.

        :return: tuple (True if at least one file is missing, list of the missing paths)
        """
        # NOTE(review): the README describes the assembly report as optional ("if available") but this check
        # and copy_files_to_container both treat the report column as mandatory - confirm the intended behaviour.
        missing_files_list = []
        with open(self.mapping_file) as open_file:
            for row in csv.DictReader(open_file, delimiter=','):
                missing_files_list.extend(
                    row[column] for column in ('vcf', 'fasta', 'report') if not os.path.exists(row[column])
                )
        return bool(missing_files_list), missing_files_list

    def verify_docker_is_installed(self):
        """Raise a RuntimeError if ``<docker_path> --version`` cannot be executed successfully."""
        try:
            run_command_with_output(
                "check docker is installed and available on the path",
                f"{self.docker_path} --version"
            )
        except subprocess.CalledProcessError as ex:
            logger.error(ex)
            raise RuntimeError(
                f"Please make sure docker ({self.docker_path}) is installed and available on the path"
            ) from ex

    def verify_container_is_running(self):
        """Return True if the container name appears in the output of ``docker ps``."""
        running_containers = run_command_with_output(
            "check if container is running",
            f"{self.docker_path} ps",
            return_process_output=True
        )
        if running_containers is not None and self.container_name in running_containers:
            logger.info(f"Container ({self.container_name}) is running")
            return True
        logger.info(f"Container ({self.container_name}) is not running")
        return False

    def verify_container_is_stopped(self):
        """Return True if the container name appears in the output of ``docker ps -a``.

        ``docker ps -a`` lists running containers as well, so the result only means "stopped" when the caller
        has already established that the container is not running (as verify_docker_env does).
        """
        all_containers = run_command_with_output(
            "check if container is stopped",
            f"{self.docker_path} ps -a",
            return_process_output=True
        )
        if all_containers is not None and self.container_name in all_containers:
            logger.info(f"Container ({self.container_name}) is in stop state")
            return True
        logger.info(f"Container ({self.container_name}) is not in stop state")
        return False

    def try_restarting_container(self):
        """Start the existing container again and raise a RuntimeError if it is not running afterwards."""
        logger.info(f"Trying to restart container {self.container_name}")
        try:
            run_command_with_output("Try restarting container", f"{self.docker_path} start {self.container_name}")
        except subprocess.CalledProcessError as ex:
            logger.error(ex)
            raise RuntimeError(f"Container ({self.container_name}) could not be restarted") from ex
        if not self.verify_container_is_running():
            raise RuntimeError(f"Container ({self.container_name}) could not be restarted")

    def verify_image_available_locally(self):
        """Return True if the image with the expected tag is listed in the output of ``docker images``."""
        local_images = run_command_with_output(
            "Check if validator image is present",
            f"{self.docker_path} images",
            return_process_output=True
        )
        # Escape name and tag so that characters such as '.' are matched literally
        image_pattern = re.escape(container_image) + r'\s+' + re.escape(container_tag)
        if local_images is not None and re.search(image_pattern, local_images):
            logger.info(f"Container ({container_image}) image is available locally")
            return True
        logger.info(f"Container ({container_image}) image is not available locally")
        return False

    def run_container(self):
        """Start a detached container from the image and raise a RuntimeError if it is not running afterwards."""
        logger.info(f"Trying to run container {self.container_name}")
        try:
            run_command_with_output(
                "Try running container",
                f"{self.docker_path} run -it --rm -d --name {self.container_name} {container_image}:{container_tag}"
            )
        except subprocess.CalledProcessError as ex:
            logger.error(ex)
            raise RuntimeError(f"Container ({self.container_name}) could not be started") from ex
        # stopping execution to give some time to container to get up and running
        time.sleep(5)
        if not self.verify_container_is_running():
            raise RuntimeError(f"Container ({self.container_name}) could not be started")

    def stop_running_container(self):
        """Stop the container if it is currently running."""
        # verify_container_is_stopped() is also True for a running container (docker ps -a lists it), so the
        # running state has to be checked directly to decide whether there is anything to stop.
        if self.verify_container_is_running():
            run_command_with_output(
                "Stop the running container",
                f"{self.docker_path} stop {self.container_name}"
            )

    def download_container_image(self):
        """Pull the image from the registry, then start the container from it.

        :raises RuntimeError: when the image cannot be pulled or the container cannot be started
        """
        logger.info(f"Pulling container ({container_image}) image")
        try:
            run_command_with_output("pull container image", f"{self.docker_path} pull {container_image}:{container_tag}")
        except subprocess.CalledProcessError as ex:
            logger.error(ex)
            raise RuntimeError(f"Cannot pull container ({container_image}) image") from ex
        # Give the pull command some time to complete
        time.sleep(5)
        self.run_container()

    def verify_docker_env(self):
        """Make sure docker is usable and the validation container is running.

        A container that exists but is not running is restarted; otherwise a new one is started from the local
        image, pulling the image first when it is not available locally.
        """
        self.verify_docker_is_installed()

        if self.verify_container_is_running():
            return
        if self.verify_container_is_stopped():
            self.try_restarting_container()
        elif self.verify_image_available_locally():
            self.run_container()
        else:
            # download_container_image() also starts the container once the image is pulled
            self.download_container_image()

    def copy_files_to_container(self):
        """Copy the mapping file, the metadata and every file listed in the mapping file into the container.

        Each file is placed under the container validation directory at the same path it has on the host.
        """
        def _copy(file_description, file_path):
            run_command_with_output(
                f"Create directory structure for copying {file_description} into container",
                (f"{self.docker_path} exec {self.container_name} "
                 f"mkdir -p {container_validation_dir}/{os.path.dirname(file_path)}")
            )
            run_command_with_output(
                f"Copy {file_description} to container",
                (f"{self.docker_path} cp {file_path} "
                 f"{self.container_name}:{container_validation_dir}/{file_path}")
            )

        _copy('vcf metadata file', self.mapping_file)
        if self.metadata_json:
            _copy('json metadata file', self.metadata_json)
        if self.metadata_xlsx:
            _copy('excel metadata file', self.metadata_xlsx)
        _copy('configuration', self.spreadsheet2json_conf)
        with open(self.mapping_file) as open_file:
            for row in csv.DictReader(open_file, delimiter=','):
                _copy('vcf files', row['vcf'])
                _copy('fasta files', row['fasta'])
                _copy('assembly report files', row['report'])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run pre-submission validation checks on VCF files', add_help=False)
    parser.add_argument("--docker_path", help="Full path to the docker installation, "
                                              "not required if docker is available on path", required=False)
    parser.add_argument("--container_name", help="Name of the docker container", required=False)
    parser.add_argument("--vcf_files_mapping",
                        help="csv file with the mappings for vcf files, fasta and assembly report", required=True)
    parser.add_argument("--output_dir", help="Directory where the validation output reports will be made available",
                        required=True)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--metadata_json",
                       help="Json file that describe the project, analysis, samples and files")
    group.add_argument("--metadata_xlsx",
                       help="Excel spreadsheet that describe the project, analysis, samples and files")
    args = parser.parse_args()

    logging_config.add_stdout_handler()
    # Options that were not provided are None; the validator then applies its own defaults for the docker
    # executable and the container name.
    validator = NativeValidator(
        args.vcf_files_mapping, args.output_dir,
        metadata_json=args.metadata_json, metadata_xlsx=args.metadata_xlsx,
        container_name=args.container_name, docker_path=args.docker_path
    )
    validator.validate()
    validator.create_reports()
File renamed without changes.
2 changes: 1 addition & 1 deletion tests/test_docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def setUp(self):
container_name='eva-sub-cli-test'
)
shutil.copyfile(
os.path.join(self.resources_folder, 'EVA_Submission_template.V1.1.4.xlsx'),
os.path.join(self.resources_folder, 'EVA_Submission_test.xlsx'),
self.metadata_xlsx
)

Expand Down
6 changes: 3 additions & 3 deletions tests/test_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def setUp(self) -> None:
self.token = 'a token'
with patch('eva_sub_cli.submit.get_auth', return_value=Mock(token=self.token)):
vcf_files = [os.path.join(self.resource_dir, 'vcf_files', 'example2.vcf.gz')]
metadata_file = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.xlsx')
metadata_file = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx')
self.submitter = StudySubmitter(submission_dir=self.test_sub_dir, vcf_files=vcf_files,
metadata_file=metadata_file)

Expand Down Expand Up @@ -102,7 +102,7 @@ def test_upload_submission(self):
mock_submit_response = MagicMock()
mock_submit_response.status_code = 200
test_url = 'http://example.com/'
with patch.object(StudySubmitter, 'upload_file') as mock_upload_file, \
with patch.object(StudySubmitter, '_upload_file') as mock_upload_file, \
patch.object(self.submitter, 'sub_config', {READY_FOR_SUBMISSION_TO_EVA: True}):
self.submitter.sub_config[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] = test_url
self.submitter._upload_submission()
Expand All @@ -113,7 +113,7 @@ def test_upload_submission(self):
def test_upload_file(self):
test_url = 'http://example.com/'
with patch('eva_sub_cli.submit.requests.put') as mock_put:
file_to_upload = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.xlsx')
file_to_upload = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx')
self.submitter._upload_file(submission_upload_url=test_url, input_file=file_to_upload)
assert mock_put.mock_calls[0][1][0] == test_url + os.path.basename(file_to_upload)
# Cannot test the content of the upload as opening the same file twice give different object
2 changes: 1 addition & 1 deletion tests/test_xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class TestXlsReader(TestCase):
biosample_schema = os.path.abspath(os.path.join(__file__, "../../eva_sub_cli/etc/eva-biosamples.json", ))

def test_conversion_2_json(self) -> None:
xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.xlsx')
xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx')
self.parser = XlsxParser(xls_filename, self.conf_filename)
output_json = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.json')
self.parser.json(output_json)
Expand Down

0 comments on commit 2c259d8

Please sign in to comment.