From 44a48155280eedd03b86e4cd627352c01a88eb81 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 16 Sep 2024 12:30:29 +0100 Subject: [PATCH 1/4] update help message and vcf-validator version --- docker/Dockerfile | 2 +- eva_sub_cli/executables/cli.py | 39 +++++++++++++++++----------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 608aa79..fb6093f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.10 -ENV vcf_validator_version=0.9.4 +ENV vcf_validator_version=0.9.7 ENV NXF_VER=22.10.6 WORKDIR /opt diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py index e3d029c..0b1433b 100755 --- a/eva_sub_cli/executables/cli.py +++ b/eva_sub_cli/executables/cli.py @@ -35,40 +35,42 @@ def validate_command_line_arguments(args, argparser): print(f"'{args.submission_dir}' does not have write permissions or is not a directory.") sys.exit(1) + def parse_args(cmd_line_args): - argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA') + argparser = ArgumentParser(prog='eva-sub-cli', + description='EVA Submission CLI - validate and submit data to EVA. ' + 'For full details, please see https://github.com/EBIvariation/eva-sub-cli') argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}') argparser.add_argument('--submission_dir', required=True, type=str, - help='Path to the directory where all processing will be done ' - 'and submission info is/will be stored') + help='Path to the directory where all processing is done and submission info is stored') vcf_group = argparser.add_argument_group( 'Input VCF and assembly', "Specify the VCF files and associated assembly with the following options. If you used different assemblies " - "for different VCF files then include these in the metadata file." + "for different VCF files, then you must include these in the metadata file rather than specifying them here." ) - vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate") + vcf_group.add_argument('--vcf_files', nargs='+', help="One or more VCF files to validate") vcf_group.add_argument('--reference_fasta', - help="The fasta file containing the reference genome from which the variants were derived") + help="The FASTA file containing the reference genome from which the variants were derived") metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet or in a JSON file') metadata_group = metadata_group.add_mutually_exclusive_group(required=True) metadata_group.add_argument("--metadata_json", - help="Json file that describe the project, analysis, samples and files") + help="JSON file that describes the project, analysis, samples and files") metadata_group.add_argument("--metadata_xlsx", - help="Excel spreadsheet that describe the project, analysis, samples and files") + help="Excel spreadsheet that describes the project, analysis, samples and files") argparser.add_argument('--tasks', nargs='+', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower, - help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the' - ' outcome of previous runs. Selecting SUBMIT will run validate only if the validation' - ' was not performed successfully before and then run the submission.') + help='Select a task to perform (default SUBMIT). VALIDATE will run the validation' + ' regardless of the outcome of previous runs. SUBMIT will run validate only if' + ' the validation was not performed successfully before and then run the submission.') argparser.add_argument('--executor', choices=[DOCKER, NATIVE], default=NATIVE, type=str.lower, - help='Select an execution type for running validation (default native)') - credential_group = argparser.add_argument_group('Credential', 'Specify the Webin credential you want to use to ' - 'upload to the EVA') - credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account") - credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account") + help='Select the execution type for running validation (default native)') + credential_group = argparser.add_argument_group('Credentials', 'Specify the ENA Webin credentials you want to use ' + 'to submit to the EVA') + credential_group.add_argument("--username", help="Username for your ENA Webin account") + credential_group.add_argument("--password", help="Password for your ENA Webin account") argparser.add_argument('--shallow', action='store_true', default=False, help='Set the validation to be performed on the first 10000 records of the VCF. ' - 'Only applies if the number of record exceed 10000') + 'Only applies if the number of records exceed 10000') argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages') args = argparser.parse_args(cmd_line_args) @@ -77,7 +79,6 @@ def parse_args(cmd_line_args): def main(): - args = parse_args(sys.argv[1:]) args.submission_dir = os.path.abspath(args.submission_dir) @@ -96,4 +97,4 @@ def main(): except SubmissionNotFoundException as snfe: print(f'{snfe}. Please contact EVA Helpdesk') except SubmissionStatusException as sse: - print(f'{sse}. Please try again later. If the problem persists, please contact EVA Helpdesk') \ No newline at end of file + print(f'{sse}. Please try again later. If the problem persists, please contact EVA Helpdesk') From 05bf175f29715fdb140ef1b048915de289b82a77 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 16 Sep 2024 15:01:11 +0100 Subject: [PATCH 2/4] document shallow validation and add final result statement --- README.md | 10 ++++++++-- eva_sub_cli/validators/docker_validator.py | 2 +- eva_sub_cli/validators/validator.py | 3 ++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 84b499b..81dbd9e 100644 --- a/README.md +++ b/README.md @@ -68,12 +68,12 @@ The path to the VCF files are provided in the Files section of the metadata and This allows us to support different assemblies for each VCF file. Please check the below sections `The metadata spreadsheet` and `The metadata JSON` for the format and options available in metadata files. -### The metadata spreadsheet +#### The metadata spreadsheet The metadata template can be found within the etc folder at `eva_sub_cli/etc/EVA_Submission_template.xlsx` It should be populated following the instruction provided within the template -### The metadata JSON +#### The metadata JSON The metadata can also be provided via a JSON file which should conform to the schema located at `eva_sub_cli/etc/eva_schema.json` @@ -114,3 +114,9 @@ or eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx --submission_dir submission_dir --tasks SUBMIT ``` Will only submit the data and not validate. + +### Shallow validation + +If you are working with extremely large VCF files and find that validation takes a very long time, you can add the +argument `--shallow` to the command, which will validate only the first 10,000 lines in each VCF. Note that running +shallow validation will **not** be sufficient for actual submission. diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index c9e370b..2294979 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -11,7 +11,7 @@ logger = logging_config.get_logger(__name__) container_image = 'ebivariation/eva-sub-cli' -container_tag = 'v0.0.1.dev17' +container_tag = 'v0.0.1' container_validation_dir = '/opt/vcf_validation' container_validation_output_dir = 'vcf_validation_output' diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 79e23b2..b5a0729 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -478,5 +478,6 @@ def create_reports(self): file_path = os.path.join(self.output_dir, 'report.html') with open(file_path, "w") as f: f.write(report_html) - self.info(f'View the validation report in your browser: {file_path}') + self.info(f'Validation result: {"SUCCESS" if self.verify_ready_for_submission_to_eva() else "FAILURE"}') + self.info(f'View the full report in your browser: {file_path}') return file_path From 17a00cdcf26cc32c13f9342e7f3e10122996a390 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 16 Sep 2024 16:17:41 +0100 Subject: [PATCH 3/4] fix test --- tests/test_docker_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index 5eca0e1..7db488c 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -109,8 +109,8 @@ def test_validate(self): with open(assembly_check_log_file) as assembly_check_log_file: assembly_check_logs = assembly_check_log_file.readlines() - self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[5]) - self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[6]) + self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[4]) + self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[5]) # Assert Samples concordance expected_checker = { From 4058a1e951d4e15d5fe87e26d4372801dd9a65d6 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 17 Sep 2024 08:45:29 +0100 Subject: [PATCH 4/4] Update README.md Co-authored-by: Timothee Cezard --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 81dbd9e..61d8231 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,6 @@ Will only submit the data and not validate. ### Shallow validation -If you are working with extremely large VCF files and find that validation takes a very long time, you can add the +If you are working with large VCF files and find that validation takes a very long time, you can add the argument `--shallow` to the command, which will validate only the first 10,000 lines in each VCF. Note that running shallow validation will **not** be sufficient for actual submission.