Merge pull request #23 from apriltuesday/EVA-3225
EVA-3225 - Add native execution for validation and refactor
apriltuesday authored Feb 20, 2024
2 parents 22cc935 + 0d25132 commit b37f3b8
Showing 16 changed files with 164 additions and 101 deletions.
8 changes: 5 additions & 3 deletions bin/eva-sub-cli.py
@@ -5,11 +5,11 @@

from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli import main
from eva_sub_cli.main import VALIDATE, SUBMIT
from eva_sub_cli import main
from eva_sub_cli.main import VALIDATE, SUBMIT, DOCKER, NATIVE

def validate_command_line_arguments(args, argparser):

def validate_command_line_arguments(args, argparser):
if args.vcf_files_mapping and (args.vcf_files or args.assembly_fasta):
print("Specify vcf_files and assembly_fasta OR a vcf_files_mapping in CSV. Not both")
argparser.print_usage()
@@ -54,6 +54,8 @@ def validate_command_line_arguments(args, argparser):
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
'successfully before and then run the submission.')
argparser.add_argument('--executor', choices=[DOCKER, NATIVE], default=NATIVE,
help='Select an execution type for running validation')
credential_group = argparser.add_argument_group('Credential', 'Specify the Webin credential you want to use to '
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
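
The new --executor option makes native execution the default. A minimal sketch of the resulting command-line behaviour, using only the argument added in this file (the rest of the parser is omitted):

    import argparse

    # Constants mirroring DOCKER / NATIVE introduced in eva_sub_cli/main.py
    DOCKER = 'docker'
    NATIVE = 'native'

    argparser = argparse.ArgumentParser(description='reduced eva-sub-cli parser')
    argparser.add_argument('--executor', choices=[DOCKER, NATIVE], default=NATIVE,
                           help='Select an execution type for running validation')

    # Omitting --executor now selects the native toolchain rather than Docker
    assert argparser.parse_args([]).executor == NATIVE
    assert argparser.parse_args(['--executor', 'docker']).executor == DOCKER
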
24 changes: 15 additions & 9 deletions eva_sub_cli/main.py
@@ -2,15 +2,17 @@
import csv
import os
from ebi_eva_common_pyutils.config import WritableConfig
from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__
from eva_sub_cli.docker_validator import DockerValidator
from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA
from eva_sub_cli.validators.docker_validator import DockerValidator
from eva_sub_cli.validators.native_validator import NativeValidator
from eva_sub_cli.validators.validator import READY_FOR_SUBMISSION_TO_EVA
from eva_sub_cli.submit import StudySubmitter

VALIDATE = 'validate'
SUBMIT = 'submit'
DOCKER = 'docker'
NATIVE = 'native'


def get_vcf_files(mapping_file):
@@ -33,7 +35,7 @@ def create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta):


def orchestrate_process(submission_dir, vcf_files_mapping, vcf_files, assembly_fasta, metadata_json, metadata_xlsx,
tasks, resume, username=None, password=None, **kwargs):
tasks, executor, resume, username=None, password=None, **kwargs):
# load config
config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE)
sub_config = WritableConfig(config_file_path, version=__version__)
@@ -52,11 +54,15 @@ def orchestrate_process(submission_dir, vcf_files_mapping, vcf_files, assembly_f
tasks.append(VALIDATE)

if VALIDATE in tasks:
with DockerValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
submission_config=sub_config) as validator:
validator.validate()
validator.create_reports()
validator.update_config_with_validation_result()
if executor == DOCKER:
with DockerValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
submission_config=sub_config) as validator:
validator.validate_and_report()
# default to native execution
else:
with NativeValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
submission_config=sub_config) as validator:
validator.validate_and_report()
if SUBMIT in tasks:
with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config,
username=username, password=password) as submitter:
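
Both branches above differ only in the validator class they instantiate. An equivalent way to express the dispatch, shown here as a sketch rather than code from this commit, keeps a single with-block:

    from eva_sub_cli.validators.docker_validator import DockerValidator
    from eva_sub_cli.validators.native_validator import NativeValidator

    DOCKER = 'docker'
    NATIVE = 'native'

    def run_validation(executor, vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx, sub_config):
        # Same behaviour as the if/else in orchestrate_process: anything other
        # than DOCKER falls through to native execution.
        validator_class = DockerValidator if executor == DOCKER else NativeValidator
        with validator_class(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
                             submission_config=sub_config) as validator:
            validator.validate_and_report()
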
16 changes: 9 additions & 7 deletions eva_sub_cli/nextflow/validation.nf
@@ -19,14 +19,17 @@ params.output_dir = null
params.metadata_json = null
params.metadata_xlsx = null

// executables
// executables - external tools
params.executable = [
"vcf_validator": "vcf_validator",
"vcf_assembly_checker": "vcf_assembly_checker",
"biovalidator": "biovalidator"
]
// python scripts - installed as part of eva-sub-cli
params.python_scripts = [
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"biovalidator": "biovalidator"
"xlsx2json": "xlsx2json.py"
]
// validation tasks
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check", "insdc_check"]
@@ -60,7 +63,6 @@ def joinBasePath(path) {
output_dir = joinBasePath(params.output_dir)

workflow {

// Prepare the file path
vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
@@ -170,7 +172,7 @@ process convert_xlsx_2_json {
metadata_json = metadata_xlsx.getBaseName() + '.json'

"""
$params.executable.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --conversion_configuration $conversion_configuration
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --conversion_configuration $conversion_configuration
"""
}

@@ -205,7 +207,7 @@ process sample_name_concordance {

script:
"""
$params.executable.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
"""
}

@@ -224,6 +226,6 @@ process insdc_checker {

script:
"""
$params.executable.fasta_checker --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
$params.python_scripts.fasta_checker --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
"""
}
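
Separating params.executable from params.python_scripts means external tool locations can be overridden without touching the packaged Python scripts. A hypothetical standalone launch of the workflow, built the same way NativeValidator assembles its command further down (all paths are placeholders):

    import subprocess

    # Hypothetical invocation; every path below is a placeholder.
    cmd = (
        "nextflow run eva_sub_cli/nextflow/validation.nf "
        "--vcf_files_mapping /data/sub1/vcf_mapping.csv "
        "--metadata_json /data/sub1/metadata.json "
        "--output_dir /data/sub1/validation_output "
        "--executable.vcf_validator /opt/vcf-validator/bin/vcf_validator"  # override an external tool only
    )
    subprocess.run(cmd, shell=True, check=True)
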
2 changes: 1 addition & 1 deletion eva_sub_cli/submit.py
@@ -9,7 +9,7 @@

from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__, SUBMISSION_WS_VAR
from eva_sub_cli.auth import get_auth
from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA
from eva_sub_cli.validators.validator import READY_FOR_SUBMISSION_TO_EVA

SUB_CLI_CONFIG_KEY_SUBMISSION_ID = "submission_id"
SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL = "submission_upload_url"
Empty file.
eva_sub_cli/docker_validator.py → eva_sub_cli/validators/docker_validator.py
@@ -6,32 +6,28 @@
import time

from ebi_eva_common_pyutils.command_utils import run_command_with_output

from eva_sub_cli.reporter import Reporter
from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli.validators.validator import Validator, VALIDATION_OUTPUT_DIR

logger = logging_config.get_logger(__name__)

docker_path = 'docker'
container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev4'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

VALIDATION_OUTPUT_DIR = "validation_output"


class DockerValidator(Reporter):
class DockerValidator(Validator):

def __init__(self, mapping_file, output_dir, metadata_json=None,
metadata_xlsx=None, container_name=None, docker_path='docker', submission_config=None):
# validator write to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
metadata_json=metadata_json, metadata_xlsx=metadata_xlsx,
submission_config=submission_config)
self.docker_path = docker_path
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
self.container_name = container_name
if self.container_name is None:
self.container_name = container_image.split('/')[1] + '.' + container_tag
@@ -59,16 +55,6 @@ def get_docker_validation_cmd(self):
return docker_cmd

def run_docker_validator(self):
# verify mapping file exists
if not os.path.exists(self.mapping_file):
raise RuntimeError(f'Mapping file {self.mapping_file} not found')

# verify all files mentioned in metadata files exist
files_missing, missing_files_list = self.check_if_file_missing()
if files_missing:
raise RuntimeError(f"some files (vcf/fasta) mentioned in metadata file could not be found. "
f"Missing files list {missing_files_list}")

# check if docker container is ready for running validation
self.verify_docker_env()

@@ -94,24 +80,6 @@ def run_docker_validator(self):
except subprocess.CalledProcessError as ex:
logger.error(ex)

def check_if_file_missing(self):
files_missing = False
missing_files_list = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
if not os.path.exists(row['vcf']):
files_missing = True
missing_files_list.append(row['vcf'])
if not os.path.exists(row['fasta']):
files_missing = True
missing_files_list.append(row['fasta'])
if not os.path.exists(row['report']):
files_missing = True
missing_files_list.append(row['report'])

return files_missing, missing_files_list

def verify_docker_is_installed(self):
try:
run_command_with_output(
@@ -123,8 +91,8 @@ def verify_docker_is_installed(self):
raise RuntimeError(f"Please make sure docker ({self.docker_path}) is installed and available on the path")

def verify_container_is_running(self):
container_run_cmd_ouptut = run_command_with_output("check if container is running", f"{self.docker_path} ps", return_process_output=True)
if container_run_cmd_ouptut is not None and self.container_name in container_run_cmd_ouptut:
container_run_cmd_output = run_command_with_output("check if container is running", f"{self.docker_path} ps", return_process_output=True)
if container_run_cmd_output is not None and self.container_name in container_run_cmd_output:
logger.info(f"Container ({self.container_name}) is running")
return True
else:
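
The mapping-file and file-existence checks removed above now live in the shared Validator base class, so an execution backend only has to implement _validate(). A hypothetical minimal subclass, purely for illustration and not part of this commit:

    from ebi_eva_common_pyutils.command_utils import run_command_with_output

    from eva_sub_cli.validators.validator import Validator

    class HypotheticalClusterValidator(Validator):
        """Illustration only: the single hook a new backend has to supply."""

        def _validate(self):
            # File-existence checks and result collection are inherited from Validator;
            # this method only needs to launch the validation workflow somewhere.
            # run_validation.sh is a made-up script name.
            run_command_with_output('Run validation on a cluster',
                                    f'bash run_validation.sh {self.mapping_file} {self.output_dir}')
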
62 changes: 62 additions & 0 deletions eva_sub_cli/validators/native_validator.py
@@ -0,0 +1,62 @@
import os
import subprocess

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli.validators.validator import Validator

logger = logging_config.get_logger(__name__)


class NativeValidator(Validator):

def __init__(self, mapping_file, output_dir, metadata_json=None, metadata_xlsx=None,
vcf_validator_path='vcf_validator', assembly_checker_path='vcf_assembly_checker',
biovalidator_path='biovalidator', submission_config=None):
super().__init__(mapping_file, output_dir, metadata_json=metadata_json, metadata_xlsx=metadata_xlsx,
submission_config=submission_config)
self.vcf_validator_path = vcf_validator_path
self.assembly_checker_path = assembly_checker_path
self.biovalidator_path = biovalidator_path

def _validate(self):
self.run_validator()

def run_validator(self):
self.verify_executables_installed()
try:
command = self.get_validation_cmd()
run_command_with_output("Run Validation using Nextflow", command)
except subprocess.CalledProcessError as ex:
logger.error(ex)

def get_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
metadata_flag = f"--metadata_xlsx {self.metadata_xlsx}"
else:
metadata_flag = f"--metadata_json {self.metadata_json}"
path_to_workflow = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'nextflow/validation.nf')
return (
f"nextflow run {path_to_workflow} "
f"--vcf_files_mapping {self.mapping_file} "
f"{metadata_flag} "
f"--output_dir {self.output_dir} "
f"--executable.vcf_validator {self.vcf_validator_path} "
f"--executable.vcf_assembly_checker {self.assembly_checker_path} "
f"--executable.biovalidator {self.biovalidator_path}"
)

def verify_executables_installed(self):
for name, path in [('vcf-validator', self.vcf_validator_path),
('vcf-assembly-checker', self.assembly_checker_path),
('biovalidator', self.biovalidator_path)]:
try:
run_command_with_output(
f"Check {name} is installed and available on the path",
f"{path} --version"
)
except subprocess.CalledProcessError as ex:
logger.error(ex)
raise RuntimeError(f"Please make sure {name} ({path}) is installed and available on the path")
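
A usage sketch for the new class (paths are placeholders; nextflow, vcf_validator, vcf_assembly_checker and biovalidator must already be on the PATH):

    from eva_sub_cli.validators.native_validator import NativeValidator

    # Placeholder paths for a hypothetical submission directory
    with NativeValidator(mapping_file='/data/sub1/vcf_mapping.csv',
                         output_dir='/data/sub1',
                         metadata_json='/data/sub1/metadata.json') as validator:
        validator.validate_and_report()  # runs the Nextflow workflow, then writes reports and updates the config
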
48 changes: 45 additions & 3 deletions eva_sub_cli/reporter.py → eva_sub_cli/validators/validator.py
@@ -13,11 +13,13 @@
from eva_sub_cli.report import generate_html_report
from ebi_eva_common_pyutils.logger import logging_config

VALIDATION_OUTPUT_DIR = "validation_output"
VALIDATION_RESULTS = 'validation_results'
READY_FOR_SUBMISSION_TO_EVA = 'ready_for_submission_to_eva'

logger = logging_config.get_logger(__name__)


def resolve_single_file_path(file_path):
files = glob.glob(file_path)
if len(files) == 0:
@@ -26,9 +28,10 @@ def resolve_single_file_path(file_path):
return files[0]


class Reporter:
class Validator:

def __init__(self, mapping_file, output_dir, submission_config: WritableConfig = None):
def __init__(self, mapping_file, output_dir, metadata_json=None, metadata_xlsx=None,
submission_config: WritableConfig = None):
self.output_dir = output_dir
self.mapping_file = mapping_file
vcf_files, fasta_files = self._find_vcf_and_fasta_files()
@@ -37,6 +40,8 @@ def __init__(self, mapping_file, output_dir, submission_config: WritableConfig =
self.results = {}
self.project_title = None # TODO fill this from metadata?
self.validation_date = datetime.datetime.now()
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
if submission_config:
self.sub_config = submission_config
else:
@@ -60,13 +65,50 @@ def _find_vcf_and_fasta_files(self):
fasta_files.append(row['fasta'])
return vcf_files, fasta_files

def validate_and_report(self):
self.validate()
self.report()

def validate(self):
self.verify_files_present()
self._validate()
self._collect_validation_workflow_results()

def report(self):
self.create_reports()
self.update_config_with_validation_result()

def _validate(self):
raise NotImplementedError

def verify_files_present(self):
# verify mapping file exists
if not os.path.exists(self.mapping_file):
raise RuntimeError(f'Mapping file {self.mapping_file} not found')

# verify all files mentioned in metadata files exist
files_missing, missing_files_list = self.check_if_file_missing()
if files_missing:
raise RuntimeError(f"some files (vcf/fasta) mentioned in metadata file could not be found. "
f"Missing files list {missing_files_list}")

def check_if_file_missing(self):
files_missing = False
missing_files_list = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
if not os.path.exists(row['vcf']):
files_missing = True
missing_files_list.append(row['vcf'])
if not os.path.exists(row['fasta']):
files_missing = True
missing_files_list.append(row['fasta'])
if not os.path.exists(row['report']):
files_missing = True
missing_files_list.append(row['report'])
return files_missing, missing_files_list

def update_config_with_validation_result(self):
self.sub_config.set(VALIDATION_RESULTS, value=self.results)
self.sub_config.set(READY_FOR_SUBMISSION_TO_EVA, value=self.verify_ready_for_submission_to_eva())
@@ -160,7 +202,7 @@ def vcf_check_errors_is_critical(self, error):
return False
return True

def _collect_validation_workflow_results(self, ):
def _collect_validation_workflow_results(self):
# Collect information from the output and summarise in the config
self._collect_vcf_check_results()
self._collect_assembly_check_results()
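
check_if_file_missing and _find_vcf_and_fasta_files both read the mapping file with csv.DictReader and expect vcf, fasta and report columns. A small sketch of producing such a file (file names are placeholders):

    import csv

    # Placeholder rows: each entry pairs a VCF with its reference FASTA and assembly report.
    rows = [
        {'vcf': '/data/sub1/cohort1.vcf.gz', 'fasta': '/data/sub1/GCA_000001.fa',
         'report': '/data/sub1/GCA_000001_assembly_report.txt'},
        {'vcf': '/data/sub1/cohort2.vcf.gz', 'fasta': '/data/sub1/GCA_000001.fa',
         'report': '/data/sub1/GCA_000001_assembly_report.txt'},
    ]
    with open('vcf_mapping.csv', 'w', newline='') as out:
        writer = csv.DictWriter(out, fieldnames=['vcf', 'fasta', 'report'])
        writer.writeheader()
        writer.writerows(rows)
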
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name='eva_sub_cli',
packages=['eva_sub_cli'],
packages=['eva_sub_cli', 'eva_sub_cli.validators'],
package_data={'eva_sub_cli': ['nextflow/*', 'etc/*', 'VERSION', 'jinja_templates/*']},
version=version,
license='Apache',