Skip to content

Commit

Permalink
Merge pull request #20 from tcezard/EVA3488_remove_report
Browse files Browse the repository at this point in the history
EVA-3488 - Remove requirement for assembly report / other refactor
  • Loading branch information
tcezard authored Feb 14, 2024
2 parents cc4644d + d5296c0 commit 6b19408
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 79 deletions.
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,27 @@
EVA Submission Command Line Interface for Validation




## Installation

TBD

## Input files for the validation and submission tool

### The VCF file and association with reference genome
There are two ways of specifying the VCF files and their associated assembly:

### Using `--vcf_files` and `--assembly_fasta`

This allows you to provide multiple VCF files to validate and a single associated genome file.
The VCF files and the associated genome file must use the same chromosome naming convention.

### Using `--vcf_files_mapping`

The paths to the VCF files are provided via a CSV file that links each VCF to its respective fasta sequence. This allows
us to support a different assembly for each VCF file.
The CSV file `vcf_mapping.csv` contains the following columns: vcf, fasta, report, providing respectively:
- The VCF to validate/upload
- The assembly in fasta format that was used to derive the VCF
- The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)

- (Optional) The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)

Example:
```shell
Expand Down Expand Up @@ -48,7 +52,7 @@ To validate and submit run the following command

```shell
eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
--vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir
--vcf_files vcf_file1.vcf vcf_file2.vcf --assembly_fasta assembly.fa --submission_dir submission_dir
```

### Validate only
Expand Down
65 changes: 48 additions & 17 deletions bin/eva-sub-cli.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,63 @@
#!/usr/bin/env python

import os
import sys
from argparse import ArgumentParser

from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli import main
from eva_sub_cli.main import VALIDATE, SUBMIT

def validate_command_line_arguments(args, argparser):
    """Enforce the cross-argument rules that argparse cannot express by itself.

    On the first violated rule, an explanatory message and the parser usage
    are printed and the process exits with status 1. The rules are:

    - ``--vcf_files_mapping`` cannot be combined with ``--vcf_files`` or
      ``--assembly_fasta``;
    - ``--vcf_files`` and ``--assembly_fasta`` must be given together;
    - at least one way of specifying the VCF input must be used;
    - the SUBMIT task needs a Webin username and password, taken from the
      command line or from the ENAWEBINACCOUNT / ENAWEBINPASSWORD
      environment variables.

    :param args: namespace of parsed command line arguments
    :param argparser: parser that produced ``args``; used to print the usage
    """

    def _fail(message):
        # Single exit path so every rule reports the same way.
        print(message)
        argparser.print_usage()
        sys.exit(1)

    if args.vcf_files_mapping and (args.vcf_files or args.assembly_fasta):
        _fail("Specify vcf_files and assembly_fasta OR a vcf_files_mapping in CSV. Not both")

    # Exactly one of the pair being set is an error.
    if bool(args.vcf_files) != bool(args.assembly_fasta):
        _fail("When using --vcf_files and --assembly_fasta, both need to be specified")

    # --vcf_files_mapping is optional, so a command line without any VCF input
    # would otherwise only fail later when the mapping file is opened.
    if not (args.vcf_files_mapping or args.vcf_files):
        _fail("Specify the VCF files with --vcf_files and --assembly_fasta, or provide a --vcf_files_mapping CSV")

    if SUBMIT in args.tasks:
        username = args.username or os.environ.get('ENAWEBINACCOUNT')
        password = args.password or os.environ.get('ENAWEBINPASSWORD')
        if not (username and password):
            _fail("To submit your data, you need to provide a Webin username and password")


if __name__ == "__main__":
argparser = ArgumentParser(description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT],
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
'successfully before and then run the submission.')
argparser.add_argument('--submission_dir', required=True, type=str,
help='Full path to the directory where all processing will be done '
'and submission info is/will be stored')
argparser.add_argument("--vcf_files_mapping", required=True,
# Group the three input options together in --help. The description must name
# the real option (--vcf_files_mapping) so users can copy it verbatim.
vcf_group = argparser.add_argument_group(
    'Input VCF and assembly',
    "Specify the VCF files and associated assembly with the following options. If you used different assemblies "
    "for different VCF files then use --vcf_files_mapping"
)
vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
vcf_group.add_argument('--assembly_fasta',
help="The fasta file containing the reference genome from which the variants were derived")
vcf_group.add_argument("--vcf_files_mapping",
help="csv file with the mappings for vcf files, fasta and assembly report")
group = argparser.add_mutually_exclusive_group(required=True)
group.add_argument("--metadata_json",
help="Json file that describe the project, analysis, samples and files")
group.add_argument("--metadata_xlsx",
help="Excel spreadsheet that describe the project, analysis, samples and files")
argparser.add_argument("--username",
help="Username used for connecting to the ENA webin account")
argparser.add_argument("--password",
help="Password used for connecting to the ENA webin account")

metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet or in a JSON file')
metadata_group = metadata_group.add_mutually_exclusive_group(required=True)
metadata_group.add_argument("--metadata_json",
help="Json file that describe the project, analysis, samples and files")
metadata_group.add_argument("--metadata_xlsx",
help="Excel spreadsheet that describe the project, analysis, samples and files")
argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT],
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
'successfully before and then run the submission.')
credential_group = argparser.add_argument_group('Credential', 'Specify the Webin credential you want to use to '
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
argparser.add_argument("--resume", default=False, action='store_true',
help="Resume the process execution from where it left of. This is currently only supported "
"for the upload part of the SUBMIT task.")
Expand All @@ -36,5 +66,6 @@

logging_config.add_stdout_handler()

main.orchestrate_process(args.submission_dir, args.vcf_files_mapping, args.metadata_json, args.metadata_xlsx,
args.tasks, args.resume)
validate_command_line_arguments(args, argparser)
# Pass on all the arguments
main.orchestrate_process(**args.__dict__)
2 changes: 0 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ RUN curl -LJo /usr/local/bin/vcf_validator https://github.com/EBIvariation/vcf-
# Install biovalidator and make it executable
RUN git clone https://github.com/elixir-europe/biovalidator.git \
&& cd biovalidator \
&& chmod +x src/biovalidator.js \
&& sed -i 's/dist/src/' package.json \
&& npm install \
&& npm link

Expand Down
34 changes: 13 additions & 21 deletions eva_sub_cli/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,16 @@

from ebi_eva_common_pyutils.command_utils import run_command_with_output

from eva_sub_cli import ETC_DIR
from eva_sub_cli.reporter import Reporter
from ebi_eva_common_pyutils.logger import logging_config

logger = logging_config.get_logger(__name__)

docker_path = 'docker'
container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev3'
container_tag = 'v0.0.1.dev4'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = '/opt/vcf_validation/vcf_validation_output'
container_etc_dir = '/opt/eva_sub_cli/etc'
container_validation_output_dir = 'vcf_validation_output'

VALIDATION_OUTPUT_DIR = "validation_output"

Expand All @@ -31,14 +29,12 @@ def __init__(self, mapping_file, output_dir, metadata_json=None,
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
submission_config=submission_config)

self.docker_path = docker_path
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
self.container_name = container_name
if self.container_name is None:
self.container_name = container_image.split('/')[1] + '.' + container_tag
self.spreadsheet2json_conf = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")

def _validate(self):
self.run_docker_validator()
Expand All @@ -47,21 +43,19 @@ def get_docker_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
docker_cmd = (
f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
f"--vcf_files_mapping {container_validation_dir}/{self.mapping_file} "
f"--metadata_xlsx {container_validation_dir}/{self.metadata_xlsx} "
f"--conversion_configuration {container_validation_dir}/{self.spreadsheet2json_conf} "
f"--schema_dir {container_etc_dir} "
f"--base_dir {container_validation_dir} "
f"--vcf_files_mapping {self.mapping_file} "
f"--metadata_xlsx {self.metadata_xlsx} "
f"--output_dir {container_validation_output_dir}"
)
else:
docker_cmd = (
f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
f"--vcf_files_mapping {container_validation_dir}/{self.mapping_file} "
f"--metadata_json {container_validation_dir}/{self.metadata_json} "
f"--schema_dir {container_etc_dir} "
f"--base_dir {container_validation_dir} "
f"--vcf_files_mapping {self.mapping_file} "
f"--metadata_json {self.metadata_json} "
f"--output_dir {container_validation_output_dir}"
)
print(docker_cmd)
return docker_cmd

def run_docker_validator(self):
Expand Down Expand Up @@ -89,14 +83,13 @@ def run_docker_validator(self):
self.copy_files_to_container()

docker_cmd = self.get_docker_validation_cmd()
print(docker_cmd)
# start validation
# FIXME: If nextflow fails in the docker exec still exit with error code 0
run_command_with_output("Run Validation using Nextflow", docker_cmd)
# copy validation result to user host
run_command_with_output(
"Copy validation output from container to host",
f"{self.docker_path} cp {self.container_name}:{container_validation_output_dir} {self.output_dir}"
f"{self.docker_path} cp {self.container_name}:{container_validation_dir}/{container_validation_output_dir} {self.output_dir}"
)
except subprocess.CalledProcessError as ex:
logger.error(ex)
Expand Down Expand Up @@ -140,9 +133,7 @@ def verify_container_is_running(self):

def verify_container_is_stopped(self):
container_stop_cmd_output = run_command_with_output(
"check if container is stopped",
f"{self.docker_path} ps -a"
, return_process_output=True
"check if container is stopped", f"{self.docker_path} ps -a", return_process_output=True
)
if container_stop_cmd_output is not None and self.container_name in container_stop_cmd_output:
logger.info(f"Container ({self.container_name}) is in stop state")
Expand Down Expand Up @@ -236,13 +227,14 @@ def _copy(file_description, file_path):
_copy('json metadata file', self.metadata_json)
if self.metadata_xlsx:
_copy('excel metadata file', self.metadata_xlsx)
_copy('configuration', self.spreadsheet2json_conf)
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
_copy('vcf files', row['vcf'])
_copy('fasta files', row['fasta'])
_copy('assembly report files', row['report'])
# report is optional
if row['report']:
_copy('assembly report files', row['report'])


if __name__ == "__main__":
Expand Down
22 changes: 20 additions & 2 deletions eva_sub_cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
VALIDATE = 'validate'
SUBMIT = 'submit'


def get_vcf_files(mapping_file):
vcf_files = []
with open(mapping_file) as open_file:
Expand All @@ -21,12 +22,28 @@ def get_vcf_files(mapping_file):
return vcf_files


def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metadata_xlsx, tasks, resume):
def create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta):
    """Write a VCF mapping CSV associating every VCF file with one assembly.

    The file is created as ``vcf_mapping_file.csv`` inside ``submission_dir``
    with the columns ``vcf``, ``fasta`` and ``report``. Paths are stored as
    absolute paths and the ``report`` column is left empty.

    :param submission_dir: directory in which the mapping file is written
    :param vcf_files: iterable of paths to the VCF files
    :param assembly_fasta: path to the fasta file shared by all the VCF files
    :return: path to the mapping file that was written
    """
    mapping_file = os.path.join(submission_dir, 'vcf_mapping_file.csv')
    fasta_path = os.path.abspath(assembly_fasta)
    # newline='' lets the csv module control line endings, as its documentation requires.
    with open(mapping_file, 'w', newline='') as open_file:
        writer = csv.writer(open_file, delimiter=',')
        writer.writerow(['vcf', 'fasta', 'report'])
        # Emit an explicit empty report field so every row matches the header.
        writer.writerows([os.path.abspath(vcf_file), fasta_path, ''] for vcf_file in vcf_files)
    return mapping_file


def orchestrate_process(submission_dir, vcf_files_mapping, vcf_files, assembly_fasta, metadata_json, metadata_xlsx,
tasks, resume, username=None, password=None, **kwargs):
# load config
config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE)
sub_config = WritableConfig(config_file_path, version=__version__)

# Get the provided metadata
metadata_file = metadata_json or metadata_xlsx

# Get the provided VCF and assembly
if vcf_files and assembly_fasta:
vcf_files_mapping = create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta)
vcf_files = get_vcf_files(vcf_files_mapping)

# Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
Expand All @@ -41,5 +58,6 @@ def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metada
validator.create_reports()
validator.update_config_with_validation_result()
if SUBMIT in tasks:
with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config) as submitter:
with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config,
username=username, password=password) as submitter:
submitter.submit(resume=resume)
Loading

0 comments on commit 6b19408

Please sign in to comment.