EVA-3635 - Fixes from Testing #53

Merged: 8 commits, Aug 30, 2024
Changes from 6 commits
12 changes: 9 additions & 3 deletions eva_sub_cli/executables/cli.py
@@ -40,7 +40,7 @@ def main():
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
help='Full path to the directory where all processing will be done '
help='Path to the directory where all processing will be done '
'and submission info is/will be stored')
vcf_group = argparser.add_argument_group(
'Input VCF and assembly',
@@ -57,7 +57,7 @@ def main():
help="Json file that describe the project, analysis, samples and files")
metadata_group.add_argument("--metadata_xlsx",
help="Excel spreadsheet that describe the project, analysis, samples and files")
argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
argparser.add_argument('--tasks', nargs='+', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the'
' outcome of previous runs. Selecting SUBMIT will run validate only if the validation'
' was not performed successfully before and then run the submission.')
@@ -67,12 +67,18 @@ def main():
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')

args = argparser.parse_args()

validate_command_line_arguments(args, argparser)

logging_config.add_stdout_handler()
args.submission_dir = os.path.abspath(args.submission_dir)

if args.debug:
logging_config.add_stdout_handler(logging.DEBUG)
else:
logging_config.add_stdout_handler()
logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)

try:
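As context for the nargs change above, a minimal self-contained argparse sketch (not the actual CLI module) shows the practical difference: with nargs='+' a bare --tasks with no value is rejected, whereas the previous nargs='*' would silently accept it and yield an empty task list.

from argparse import ArgumentParser

parser = ArgumentParser(prog='demo')
# nargs='+' requires at least one value after --tasks; nargs='*' would allow zero.
parser.add_argument('--tasks', nargs='+', choices=['validate', 'submit'],
                    default=['submit'], type=str.lower)

print(parser.parse_args([]).tasks)                       # ['submit'] (default applies)
print(parser.parse_args(['--tasks', 'VALIDATE']).tasks)  # ['validate'] (lower-cased before the choices check)
# parser.parse_args(['--tasks'])  # would exit with "expected at least one argument"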
2 changes: 1 addition & 1 deletion eva_sub_cli/executables/xlsx2json.py
@@ -372,6 +372,6 @@ def main():
try:
parser.json(args.metadata_json)
except Exception as e:
parser.add_error(e)
parser.add_error(f'An Error was raised while converting the spreadsheet to JSON: {repr(e)}')
finally:
parser.save_errors(args.errors_yaml)
3 changes: 1 addition & 2 deletions eva_sub_cli/metadata_utils.py
@@ -22,8 +22,7 @@ def get_files_per_analysis(metadata):
"""Returns mapping of analysis alias to filenames, based on metadata."""
files_per_analysis = defaultdict(list)
for file_info in metadata.get('files', []):
if file_info.get('fileType') == 'vcf':
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
return {
analysis_alias: set(filepaths)
for analysis_alias, filepaths in files_per_analysis.items()
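A quick illustration of the behavioural change above, as a standalone sketch that reuses the function body from the diff (the sample metadata dict is made up): with the fileType filter removed, every file listed in the metadata is grouped under its analysis alias, not only the VCFs.

from collections import defaultdict

def get_files_per_analysis(metadata):
    """Return a mapping of analysis alias to the set of all file names (no fileType filter)."""
    files_per_analysis = defaultdict(list)
    for file_info in metadata.get('files', []):
        files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
    return {alias: set(names) for alias, names in files_per_analysis.items()}

metadata = {'files': [
    {'analysisAlias': 'A1', 'fileName': 'variants.vcf.gz', 'fileType': 'vcf'},
    {'analysisAlias': 'A1', 'fileName': 'variants.vcf.gz.csi', 'fileType': 'csi'},
]}
print(get_files_per_analysis(metadata))  # {'A1': {'variants.vcf.gz', 'variants.vcf.gz.csi'}}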
14 changes: 9 additions & 5 deletions eva_sub_cli/nextflow/validation.nf
@@ -195,12 +195,13 @@ process convert_xlsx_2_json {
output:
path "metadata.json", emit: metadata_json, optional: true
path "metadata_conversion_errors.yml", emit: errors_yaml
path "xlsx2json.log", emit: xlsx2json_log

script:
metadata_json = metadata_xlsx.getBaseName() + '.json'

"""
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration > xlsx2json.log 2>&1
"""
}

@@ -217,7 +218,7 @@ process metadata_json_validation {

script:
"""
$params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt
$params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt 2>&1
"""
}

@@ -232,10 +233,11 @@ process sample_name_concordance {

output:
path "sample_checker.yml", emit: sample_checker_yml
path "sample_checker.log", emit: sample_checker_log

script:
"""
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log
"""
}

@@ -251,10 +253,11 @@ process insdc_checker {

output:
path "${fasta_file}_check.yml", emit: fasta_checker_yml
path "fasta_checker.log", emit: fasta_checker_log

script:
"""
$params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
$params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml > fasta_checker.log 2>&1
"""
}

@@ -269,9 +272,10 @@ process metadata_semantic_check {

output:
path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml
path "semantic_checker.log", emit: semantic_checker_log

script:
"""
$params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml
$params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log
"""
}
28 changes: 20 additions & 8 deletions eva_sub_cli/orchestrator.py
@@ -54,7 +54,7 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re
project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, True)

for mapping in vcf_files_mapping:
writer.writerow(mapping);
writer.writerow(mapping)

return project_title, mapping_file

@@ -80,6 +80,7 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_

return project_title, vcf_fasta_report_mapping


def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_req=False):
workbook = load_workbook(metadata_xlsx)

@@ -108,23 +109,27 @@ def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_
files_headers[cell.value] = cell.column - 1

for row in files_sheet.iter_rows(min_row=2, values_only=True):
file_name = row[files_headers['File Name']]
file_name = os.path.abspath(row[files_headers['File Name']])
analysis_alias = row[files_headers['Analysis Alias']]
reference_fasta = analysis_alias_dict[analysis_alias]
reference_fasta = os.path.abspath(analysis_alias_dict[analysis_alias])
if not (file_name and os.path.isfile(file_name)):
raise FileNotFoundError(f'The variant file {file_name} provided in spreadsheet {metadata_xlsx} does not exist')
if not (reference_fasta and os.path.isfile(reference_fasta)):
raise FileNotFoundError(f'The reference fasta {reference_fasta} in spreadsheet {metadata_xlsx} does not exist')
vcf_fasta_report_mapping.append([os.path.abspath(file_name), os.path.abspath(reference_fasta), ''])

return project_title, vcf_fasta_report_mapping


def check_validation_required(tasks, sub_config):
def check_validation_required(tasks, sub_config, username=None, password=None):
# Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
if SUBMIT in tasks:
if not sub_config.get(READY_FOR_SUBMISSION_TO_EVA, False):
return True
submission_id = sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_ID, None)
if submission_id:
try:
submission_status = SubmissionWSClient().get_submission_status(submission_id)
submission_status = SubmissionWSClient(username, password).get_submission_status(submission_id)
Member Author: Passing the username and password is required because the auth is a global variable; subsequent calls will not have the username/password if the auth is initially created without them.

Contributor: I can't remember, did you check whether it worked without the global auth?

Member Author: I did try briefly and it didn't work the first time, so I didn't push it since it wasn't part of this ticket.

(A minimal sketch of this global-auth behaviour appears after this file's diff.)

if submission_status == 'FAILED':
return True
else:
Expand All @@ -150,13 +155,20 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso

metadata_file = metadata_json or metadata_xlsx
if not os.path.exists(os.path.abspath(metadata_file)):
raise FileNotFoundError(f'The provided metadata file {metadata_file} does not exist')
raise FileNotFoundError(f'The provided metadata file {os.path.abspath(metadata_file)} does not exist')

if metadata_json:
metadata_json = os.path.abspath(metadata_json)
if metadata_xlsx:
metadata_xlsx = os.path.abspath(metadata_xlsx)

# Get the provided Project Title and VCF files mapping (VCF, Fasta and Report)
project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx)
project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(
submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx
)
vcf_files = get_vcf_files(vcf_files_mapping)

if VALIDATE not in tasks and check_validation_required(tasks, sub_config):
if VALIDATE not in tasks and check_validation_required(tasks, sub_config, username, password):
tasks.append(VALIDATE)

if VALIDATE in tasks:
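A minimal, hypothetical sketch of the global-auth behaviour discussed in the review thread above (illustrative names only, not the real eva-sub-cli auth module): because the shared auth object is created once and then reused, credentials supplied on later calls are ignored, which is why check_validation_required now threads username and password through to SubmissionWSClient.

# Hypothetical module-level auth singleton mirroring the pattern described in the thread.
_auth = None

class _WebinAuth:
    def __init__(self, username=None, password=None):
        self.username = username
        self.password = password

def get_auth(username=None, password=None):
    """Create the shared auth object on first use; later calls reuse it unchanged."""
    global _auth
    if _auth is None:
        _auth = _WebinAuth(username, password)
    return _auth

first = get_auth()                   # first caller passes no credentials
second = get_auth('user', 'secret')  # same instance returned, credentials ignored
assert first is second and second.username is None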
8 changes: 4 additions & 4 deletions eva_sub_cli/submission_ws.py
@@ -43,19 +43,19 @@ def _submission_status_url(self, submission_id):

def mark_submission_uploaded(self, submission_id, metadata_json):
response = requests.put(self._submission_uploaded_url(submission_id),
headers={'Accept': 'application/hal+json', 'Authorization': 'Bearer ' + self.auth.token},
data=metadata_json)
headers={'Accept': 'application/json', 'Authorization': 'Bearer ' + self.auth.token},
json=metadata_json)
response.raise_for_status()
return response.json()

def initiate_submission(self):
response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/hal+json',
response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/json',
'Authorization': 'Bearer ' + self.auth.token})
response.raise_for_status()
return response.json()

@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_submission_status(self, submission_id):
response = requests.get(self.get_submission_status_url(submission_id))
response = requests.get(self._submission_status_url(submission_id))
response.raise_for_status()
return response.text
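A small sketch of the requests behaviour behind the data= to json= change above (placeholder URL and payload): json= serialises the object to a JSON body and sets Content-Type: application/json, whereas data= given a dict would form-encode it, which a JSON endpoint would not expect.

import requests

payload = {'example': 'value'}  # placeholder payload, not the real metadata JSON

# json= serialises the dict and sets the JSON content type.
with_json = requests.Request('PUT', 'https://example.org/upload', json=payload).prepare()
# data= with a dict is form-encoded instead.
with_data = requests.Request('PUT', 'https://example.org/upload', data=payload).prepare()

print(with_json.headers['Content-Type'])  # application/json
print(with_data.headers['Content-Type'])  # application/x-www-form-urlencoded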
3 changes: 2 additions & 1 deletion eva_sub_cli/submit.py
@@ -64,7 +64,8 @@ def _upload_submission(self):
def _upload_file(self, submission_upload_url, input_file):
base_name = os.path.basename(input_file)
self.debug(f'Transfer {base_name} to EVA FTP')
r = requests.put(os.path.join(submission_upload_url, base_name), data=open(input_file, 'rb'))
with open(input_file, 'rb') as f:
r = requests.put(os.path.join(submission_upload_url, base_name), data=f)
r.raise_for_status()
self.debug(f'Upload of {base_name} completed')

18 changes: 9 additions & 9 deletions eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev14'
container_tag = 'v0.0.1.dev15'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

@@ -100,10 +100,10 @@ def verify_container_is_running(self):
try:
container_run_cmd_output = self._run_quiet_command("check if container is running", f"{self.docker_path} ps", return_process_output=True)
if container_run_cmd_output is not None and self.container_name in container_run_cmd_output:
logger.info(f"Container ({self.container_name}) is running")
logger.debug(f"Container ({self.container_name}) is running")
return True
else:
logger.info(f"Container ({self.container_name}) is not running")
logger.debug(f"Container ({self.container_name}) is not running")
return False
except subprocess.CalledProcessError as ex:
logger.error(ex)
@@ -114,10 +114,10 @@ def verify_container_is_stopped(self):
"check if container is stopped", f"{self.docker_path} ps -a", return_process_output=True
)
if container_stop_cmd_output is not None and self.container_name in container_stop_cmd_output:
logger.info(f"Container ({self.container_name}) is in stop state")
logger.debug(f"Container ({self.container_name}) is in stop state")
return True
else:
logger.info(f"Container ({self.container_name}) is not in stop state")
logger.debug(f"Container ({self.container_name}) is not in stop state")
return False

def try_restarting_container(self):
@@ -137,14 +137,14 @@ def verify_image_available_locally(self):
return_process_output=True
)
if container_images_cmd_ouptut is not None and re.search(container_image + r'\s+' + container_tag, container_images_cmd_ouptut):
logger.info(f"Container ({container_image}) image is available locally")
logger.debug(f"Container ({container_image}) image is available locally")
return True
else:
logger.info(f"Container ({container_image}) image is not available locally")
logger.debug(f"Container ({container_image}) image is not available locally")
return False

def run_container(self):
logger.info(f"Trying to run container {self.container_name}")
logger.debug(f"Trying to run container {self.container_name}")
try:
self._run_quiet_command(
"Try running container",
@@ -166,7 +166,7 @@ def stop_running_container(self):
)

def download_container_image(self):
logger.info(f"Pulling container ({container_image}) image")
logger.debug(f"Pulling container ({container_image}) image")
try:
self._run_quiet_command("pull container image", f"{self.docker_path} pull {container_image}:{container_tag}")
except subprocess.CalledProcessError as ex:
4 changes: 4 additions & 0 deletions eva_sub_cli/validators/native_validator.py
@@ -24,11 +24,15 @@ def _validate(self):

def run_validator(self):
self.verify_executables_installed()
curr_wd = os.getcwd()
try:
command = self.get_validation_cmd()
os.chdir(self.submission_dir)
self._run_quiet_command("Run Validation using Nextflow", command)
except subprocess.CalledProcessError as ex:
logger.error(ex)
finally:
os.chdir(curr_wd)

def get_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
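The save-and-restore working-directory pattern added above can also be packaged as a small context manager; a generic sketch follows (Python 3.11 ships an equivalent contextlib.chdir, and the call site shown is hypothetical).

import os
from contextlib import contextmanager

@contextmanager
def working_directory(path):
    """Temporarily change the working directory, restoring the original on exit."""
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)

# Usage sketch (hypothetical call site):
# with working_directory(submission_dir):
#     run_nextflow_validation()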
17 changes: 11 additions & 6 deletions eva_sub_cli/validators/validator.py
@@ -545,6 +545,10 @@ def _collect_file_info_to_metadata(self):
file_name_2_md5[os.path.basename(vcf_file)] = md5sum
file_path_2_file_size[vcf_file] = file_size
file_name_2_file_size[os.path.basename(vcf_file)] = file_size
else:
self.error(
f"Cannot locate file_info.txt at {os.path.join(self.output_dir, 'other_validations', 'file_info.txt')}"
)
if self.metadata_json_post_validation:
with open(self.metadata_json_post_validation) as open_file:
try:
@@ -553,12 +557,11 @@
files_from_metadata = json_data.get('files', [])
if files_from_metadata:
for file_dict in json_data.get('files', []):
if file_dict.get('fileType') == 'vcf':
file_path = self._validation_file_path_for(file_dict.get('fileName'))
file_dict['md5'] = file_path_2_md5.get(file_path) or \
file_name_2_md5.get(file_dict.get('fileName')) or ''
file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
file_name_2_file_size.get(file_dict.get('fileName')) or ''
file_path = self._validation_file_path_for(file_dict.get('fileName'))
file_dict['md5'] = file_path_2_md5.get(file_path) or \
file_name_2_md5.get(file_dict.get('fileName')) or ''
file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
file_name_2_file_size.get(file_dict.get('fileName')) or ''
file_rows.append(file_dict)
else:
self.error('No file found in metadata and multiple analysis alias exist: '
@@ -570,6 +573,8 @@
if json_data:
with open(self.metadata_json_post_validation, 'w') as open_file:
json.dump(json_data, open_file)
else:
self.error(f'Cannot locate the metadata in JSON format in {os.path.join(self.output_dir, "metadata.json")}')

def get_vcf_fasta_analysis_mapping(self):
vcf_fasta_analysis_mapping = []