From a78d1cc5855fe21e4118339c1b8ce5484aba6117 Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Thu, 31 Oct 2019 12:01:51 -0400 Subject: [PATCH 1/6] add splitting of gcextract2 sqlite; JIRA: PGAPX-584 --- expr/supplemental_data_split_dir.cwl | 8 +++++++- wf_common.cwl | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/expr/supplemental_data_split_dir.cwl b/expr/supplemental_data_split_dir.cwl index 12eab2e..8d4cc11 100644 --- a/expr/supplemental_data_split_dir.cwl +++ b/expr/supplemental_data_split_dir.cwl @@ -107,7 +107,11 @@ expression: | break; case 'TaxSynon.tsv': r['tax_synon'] = l[i]; - break; } + break; + case 'GCExtract2.sqlite': + r['gcextract2_sqlite'] = l[i]; + break; + } } return r; } @@ -133,6 +137,8 @@ outputs: defline_cleanup_rules: # defline_cleanup_rules # ${GP_HOME}/etc/product_rules.prt type: File + gcextract2_sqlite: + type: File gene_master_ini: type: File genemark_path: diff --git a/wf_common.cwl b/wf_common.cwl index ff78825..a488516 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -112,6 +112,7 @@ steps: - CDDdata2 - CDDdata - defline_cleanup_rules + - gcextract2_sqlite - gene_master_ini - genemark_path - hmm_path From a627cfd1cabf095bc5cddcbb1aa257af13d6343b Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Thu, 31 Oct 2019 12:18:11 -0400 Subject: [PATCH 2/6] pass new resource GCExtract2 to the kmer plane; JIRA: PGAPX-584 --- bacterial_kmer/wf_bacterial_kmer.cwl | 2 ++ progs/ani_top_identification.cwl | 4 ++++ task_types/tt_ani_top_n.cwl | 2 ++ 3 files changed, 8 insertions(+) diff --git a/bacterial_kmer/wf_bacterial_kmer.cwl b/bacterial_kmer/wf_bacterial_kmer.cwl index a9f7dba..32a3de1 100644 --- a/bacterial_kmer/wf_bacterial_kmer.cwl +++ b/bacterial_kmer/wf_bacterial_kmer.cwl @@ -17,6 +17,7 @@ inputs: ANI_cutoff: File kmer_reference_assemblies: File tax_synon: File + gcextract2_sqlite: File outputs: Identify_Top_N_ANI_annot: type: File @@ -223,4 +224,5 @@ steps: blast_align: Assembly_Assembly_BLASTn/blast_align 
ref_assembly_taxid: ref_assembly_taxid tax_synon: tax_synon + gcextract2_sqlite: gcextract2_sqlite out: [top,annot] diff --git a/progs/ani_top_identification.cwl b/progs/ani_top_identification.cwl index 3e5c704..ea2f1ae 100644 --- a/progs/ani_top_identification.cwl +++ b/progs/ani_top_identification.cwl @@ -45,6 +45,10 @@ inputs: type: File inputBinding: prefix: -tax-syn-table + gcextract2_sqlite: + type: File + inputBinding: + prefix: -gcextract2-sqlite o: type: string? default: ani-tax-report.xml diff --git a/task_types/tt_ani_top_n.cwl b/task_types/tt_ani_top_n.cwl index cab5cb9..b339553 100644 --- a/task_types/tt_ani_top_n.cwl +++ b/task_types/tt_ani_top_n.cwl @@ -9,6 +9,7 @@ inputs: asn_cache: Directory ref_assembly_taxid: int tax_synon: File + gcextract2_sqlite: File outputs: top: type: File @@ -58,4 +59,5 @@ steps: default: 0 ref_assembly_taxid: ref_assembly_taxid tax_synon: tax_synon + gcextract2_sqlite: gcextract2_sqlite out: [annot, top] From b9046809e78bc99dffde40f9aae8dde21651edec Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Fri, 1 Nov 2019 10:06:30 -0400 Subject: [PATCH 3/6] successful testing, CWL part is done; more testing needed to make sure actual output is correct; JIRA: PGAPX-242 --- bacterial_kmer/wf_bacterial_kmer.cwl | 22 +++++--- expr/supplemental_data_split_dir.cwl | 4 +- progs/ani_top_identification.cwl | 10 ++-- progs/assm_assm_blastn_create_jobs.cwl | 31 ++++++++--- progs/assm_assm_blastn_wnode.cwl | 35 +++++++++---- progs/gc_extract_ids.cwl | 21 ++++++++ progs/gc_get_assembly.cwl | 4 ++ progs/gpx_qsubmit-xml.cwl | 67 ++++++++++++++++++++++++ task_types/tt_ani_top_n.cwl | 27 ++-------- task_types/tt_assm_assm_blastn_wnode.cwl | 5 +- task_types/tt_gcaccess_from_list.cwl | 2 + 11 files changed, 175 insertions(+), 53 deletions(-) create mode 100644 progs/gc_extract_ids.cwl create mode 100644 progs/gpx_qsubmit-xml.cwl diff --git a/bacterial_kmer/wf_bacterial_kmer.cwl b/bacterial_kmer/wf_bacterial_kmer.cwl index 32a3de1..80e72ae 
100644 --- a/bacterial_kmer/wf_bacterial_kmer.cwl +++ b/bacterial_kmer/wf_bacterial_kmer.cwl @@ -17,7 +17,8 @@ inputs: ANI_cutoff: File kmer_reference_assemblies: File tax_synon: File - gcextract2_sqlite: File + taxon_db: File + gcextract2_sqlite: File outputs: Identify_Top_N_ANI_annot: type: File @@ -167,13 +168,21 @@ steps: run: ../task_types/tt_gcaccess_from_list.cwl in: gc_id_list: Extract_Top_Assemblies/gc_id_list + gc_cache: gc_cache out: [gencoll_asn] + Extract_Input_GenColl_IDs: + label: Extract Input GenColl IDs + doc: Input is input ASN.1 file for our target assembly + run: ../progs/gc_extract_ids.cwl + in: + input: gencoll_asn + out: [output] Assembly_Assembly_BLASTn: label: Assembly Assembly BLASTn doc: This is rather standard blast run: ../task_types/tt_assm_assm_blastn_wnode.cwl in: - queries_gc_id_list: List_sqlite/keys + queries_gc_id_list: Extract_Input_GenColl_IDs/output subjects_gc_id_list: Extract_Top_Assemblies/gc_id_list # this will brea here ref_gencoll_asn: Get_Top_Assemblies_GenColl_ASN/gencoll_asn @@ -185,7 +194,7 @@ steps: gc_seq_cache: gc_seq_cache gc_cache: gc_cache compart: - default: "true" + default: true evalue: default: 0.0001 gapextend: @@ -201,11 +210,11 @@ steps: merge_engine: default: "tree-merger" soft_masking: - default: "true" + default: 'true' task: default: megablast use_common_components: - default: "true" + default: true window_size: default: 150 word_size: @@ -224,5 +233,6 @@ steps: blast_align: Assembly_Assembly_BLASTn/blast_align ref_assembly_taxid: ref_assembly_taxid tax_synon: tax_synon - gcextract2_sqlite: gcextract2_sqlite + gcextract2_sqlite: gcextract2_sqlite + taxon_db: taxon_db out: [top,annot] diff --git a/expr/supplemental_data_split_dir.cwl b/expr/supplemental_data_split_dir.cwl index 8d4cc11..183c00b 100644 --- a/expr/supplemental_data_split_dir.cwl +++ b/expr/supplemental_data_split_dir.cwl @@ -111,7 +111,7 @@ expression: | case 'GCExtract2.sqlite': r['gcextract2_sqlite'] = l[i]; break; - } + } } return 
r; } @@ -138,7 +138,7 @@ outputs: # defline_cleanup_rules # ${GP_HOME}/etc/product_rules.prt type: File gcextract2_sqlite: - type: File + type: File gene_master_ini: type: File genemark_path: diff --git a/progs/ani_top_identification.cwl b/progs/ani_top_identification.cwl index ea2f1ae..be3834a 100644 --- a/progs/ani_top_identification.cwl +++ b/progs/ani_top_identification.cwl @@ -32,7 +32,7 @@ inputs: query_assembly: type: File inputBinding: - prefix: -ANI_cutoff + prefix: -query-assembly ref_assembly_id: type: int? inputBinding: @@ -45,8 +45,12 @@ inputs: type: File inputBinding: prefix: -tax-syn-table - gcextract2_sqlite: - type: File + taxon_db: + type: File + inputBinding: + prefix: -taxon-db + gcextract2_sqlite: + type: File inputBinding: prefix: -gcextract2-sqlite o: diff --git a/progs/assm_assm_blastn_create_jobs.cwl b/progs/assm_assm_blastn_create_jobs.cwl index 23148aa..9422f6c 100644 --- a/progs/assm_assm_blastn_create_jobs.cwl +++ b/progs/assm_assm_blastn_create_jobs.cwl @@ -1,20 +1,39 @@ #!/usr/bin/env cwl-runner label: "assm_assm_blastn_create_jobs" class: CommandLineTool -baseCommand: submit_kmer_compare +baseCommand: assm_assm_blastn_create_jobs cwlVersion: v1.0 - +requirements: + - class: InlineJavascriptRequirement + - class: InitialWorkDirRequirement + listing: + - entryname: q.mft + entry: |- + ${ + var blob = '# q.mft created for assm_assm_blastn_create_jobs from input "queries_gc_id_list" File\n'; + if(inputs.queries_gc_id_list != null) { blob += inputs.queries_gc_id_list.path + '\n'; } + return blob; + } + - entryname: t.mft + entry: |- + ${ + var blob = '# t.mft created for assm_assm_blastn_create_jobs from input "subjects_gc_id_list" File\n'; + if(inputs.subjects_gc_id_list != null) { blob += inputs.subjects_gc_id_list.path + '\n'; } + return blob; + } +arguments: [ -query-assemblies-manifest, q.mft, -target-assemblies-manifest, t.mft ] +# ~/gpipe-debug-bin/assm_assm_blastn_create_jobs -affinity-bin 10 +# -query-assemblies-manifest 
inp/query_ids.mft -target-assemblies-manifest inp/subject_ids.mft -output inp/jobs.xml +# inputs: affinity_bin: type: int? + inputBinding: + prefix: -affinity-bin queries_gc_id_list: type: File? - inputBinding: - prefix: -query-assemblies subjects_gc_id_list: type: File? - inputBinding: - prefix: -target-assemblies output_xml_file_name: type: string? default: jobs.xml diff --git a/progs/assm_assm_blastn_wnode.cwl b/progs/assm_assm_blastn_wnode.cwl index aacd4ca..7bb814e 100644 --- a/progs/assm_assm_blastn_wnode.cwl +++ b/progs/assm_assm_blastn_wnode.cwl @@ -2,23 +2,36 @@ cwlVersion: v1.0 label: "assm_assm_blastn_wnode" class: CommandLineTool -# -# You might need something like this: -# -# requirements: -# - class: InitialWorkDirRequirement -# listing: -# - entry: $(inputs.asn_cache) -# writable: True -# - entry: $(inputs.blastdb_dir) -# writable: False +requirements: + - class: InlineJavascriptRequirement + - class: InitialWorkDirRequirement + listing: + + - entryname: queries-and-targets.mft + entry: |- + ${ + var blob = '# queries-and-targets.mft created for assm_assm_blastn_wnode from input "target_set" File\n'; + if(inputs.target_set != null) { + for(var i=0; i + This workflow is specialized for the case when there is an XML input + +class: CommandLineTool +requirements: + - class: InlineJavascriptRequirement + - class: InitialWorkDirRequirement + listing: + - entry: ${ var cs=0; var s=inputs.asn_cache.length-1; var as = cs; if(as >= s) {as = s }; return inputs.asn_cache[as]; } + writable: False + - entry: ${ var cs=1; var s=inputs.asn_cache.length-1; var as = cs; if(as >= s) {as = s }; return inputs.asn_cache[as]; } + writable: False + +baseCommand: gpx_qsubmit +inputs: + affinity: + type: string? + default: subject + inputBinding: + prefix: -affinity + asn_cache: + type: Directory[]? + inputBinding: + prefix: -asn-cache + itemSeparator: "," + batch_size: + type: int? + inputBinding: + prefix: -batch-size + max_batch_length: + type: int? 
+ inputBinding: + prefix: -max-batch-length + nogenbank: + type: boolean? + inputBinding: + # prefix: -nogenbank # commenting this as a hail mary + NxM_threshold: + type: int? + inputBinding: + prefix: -NxM-threshold + overlap: + type: int? + inputBinding: + prefix: -overlap + subseq_size: + type: int? + inputBinding: + prefix: -subseq-size + xml_jobs: + type: File? + inputBinding: + prefix: -xml-jobs + output_xml_jobs: + type: string + default: jobs.xml + inputBinding: + prefix: -o + + +outputs: + jobs: + type: File + outputBinding: + glob: $(inputs.output_xml_jobs) diff --git a/task_types/tt_ani_top_n.cwl b/task_types/tt_ani_top_n.cwl index b339553..b6416f1 100644 --- a/task_types/tt_ani_top_n.cwl +++ b/task_types/tt_ani_top_n.cwl @@ -8,6 +8,7 @@ inputs: ANI_cutoff: File asn_cache: Directory ref_assembly_taxid: int + taxon_db: File tax_synon: File gcextract2_sqlite: File outputs: @@ -17,29 +18,6 @@ outputs: annot: type: File outputSource: ani_top_identification/annot -#/panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/bacterial_pipeline/system/2018-03-13.build2663/bin/ani_top_identification \ -# -ANI_cutoff \ -# /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/bacterial_pipeline/system/2018-03-13.build2663/third-party/data/BacterialPipeline/ANI_cutoff/ANI_cutoff.xml \ -# -N \ -# 25 \ -# -asn-cache \ -# /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/bacterial_pipeline/data56/Mycoplasma_genitalium_G37/Mycoplasma_genitalium_External_PGAP.4585524/sequence_cache,/panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe_id_cache/full_id_cache \ -# -input-manifest \ -# /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/bacterial_pipeline/data56/Mycoplasma_genitalium_G37/Mycoplasma_genitalium_External_PGAP.4585524/4829637/ani_top_n.455674852/inp/assm_aligns.mft \ -# -min-gap \ -# 10000 \ -# -min-region \ -# 1000 \ -# -o \ -# /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/bacterial_pipeline/data56/Mycoplasma_genitalium_G37/Mycoplasma_genitalium_External_PGAP.4585524/4829637/ani_top_n.455674852/out/ani-tax-report.xml \ -# 
-o-annot \ -# /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/bacterial_pipeline/data56/Mycoplasma_genitalium_G37/Mycoplasma_genitalium_External_PGAP.4585524/4829637/ani_top_n.455674852/out/annot.asn \ -# -query-assembly \ -# /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/bacterial_pipeline/data56/Mycoplasma_genitalium_G37/Mycoplasma_genitalium_External_PGAP.4585524/4829637/gc_create_from_sequences.455674892/out/gencoll.asn \ -# -ref-assembly-id \ -# 0 \ -# -ref-assembly-taxid \ -# 243273 steps: ani_top_identification: @@ -59,5 +37,6 @@ steps: default: 0 ref_assembly_taxid: ref_assembly_taxid tax_synon: tax_synon - gcextract2_sqlite: gcextract2_sqlite + taxon_db: taxon_db + gcextract2_sqlite: gcextract2_sqlite out: [annot, top] diff --git a/task_types/tt_assm_assm_blastn_wnode.cwl b/task_types/tt_assm_assm_blastn_wnode.cwl index 929321b..5e097f1 100644 --- a/task_types/tt_assm_assm_blastn_wnode.cwl +++ b/task_types/tt_assm_assm_blastn_wnode.cwl @@ -43,7 +43,7 @@ steps: subjects_gc_id_list: subjects_gc_id_list out: [output] gpx_qsubmit: - run: ../progs/gpx_qsubmit.cwl + run: ../progs/gpx_qsubmit-xml.cwl in: affinity: affinity asn_cache: @@ -55,6 +55,9 @@ steps: assm_assm_blastn_wnode: run: ../progs/assm_assm_blastn_wnode.cwl in: + target_set: + source: [gencoll_asn, ref_gencoll_asn] + linkMerge: merge_flattened asn_cache: source: [asn_cache, gc_seq_cache] linkMerge: merge_flattened diff --git a/task_types/tt_gcaccess_from_list.cwl b/task_types/tt_gcaccess_from_list.cwl index 2df55e2..012fd55 100644 --- a/task_types/tt_gcaccess_from_list.cwl +++ b/task_types/tt_gcaccess_from_list.cwl @@ -4,6 +4,7 @@ label: "gcaccess_from_list" class: Workflow # task type inputs: gc_id_list: File + gc_cache: File outputs: gencoll_asn: type: File @@ -15,6 +16,7 @@ steps: mode: default: AllSequences release_id_list: gc_id_list + gc_cache: gc_cache out: [gencoll_asn] # this is for the future we might need this in general case # gc_get_molecules: From ecc4c4c6130a3d12edbb9ad65a3a96242c67c71e Mon Sep 
17 00:00:00 2001 From: "Badretdin, Azat" Date: Fri, 1 Nov 2019 10:06:54 -0400 Subject: [PATCH 4/6] fixing tab; JIRA: PGAPX-242 --- wf_common.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wf_common.cwl b/wf_common.cwl index a488516..d3cf4aa 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -112,7 +112,7 @@ steps: - CDDdata2 - CDDdata - defline_cleanup_rules - - gcextract2_sqlite + - gcextract2_sqlite - gene_master_ini - genemark_path - hmm_path From 4d7f3f8e6ba8f4789d1fa5d1b25acda6c710e41c Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Tue, 5 Nov 2019 08:32:26 -0500 Subject: [PATCH 5/6] added proc_annot_stats as output of the workflow for Pathogen Detect; JIRA: PGAPX-593 --- wf_common.cwl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wf_common.cwl b/wf_common.cwl index d3cf4aa..2f06c07 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -778,7 +778,7 @@ steps: val_res_den_xml: passdata/val_res_den_xml it: default: true - out: + out: - id: var_proc_annot_stats_xml - id: var_proc_annot_details_xml Validate_Annotation_xsltproc_asnvalidate: @@ -925,5 +925,6 @@ outputs: sqn: type: File outputSource: add_checksum_sqn/output - - + proc_annot_stats: + type: File + outputSource: Validate_Annotation_proc_annot_stats/var_proc_annot_stats_xml From f438561b673ae2f7198019f2f1bdcf115da5a482 Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Thu, 7 Nov 2019 12:22:00 -0500 Subject: [PATCH 6/6] pass ref_assembly_id from the top; JIRA: PGAPX-602 --- bacterial_kmer/wf_bacterial_kmer.cwl | 2 ++ task_types/tt_kmer_top_n_extract.cwl | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bacterial_kmer/wf_bacterial_kmer.cwl b/bacterial_kmer/wf_bacterial_kmer.cwl index 80e72ae..9a161e8 100644 --- a/bacterial_kmer/wf_bacterial_kmer.cwl +++ b/bacterial_kmer/wf_bacterial_kmer.cwl @@ -14,6 +14,7 @@ inputs: gc_cache: File kmer_cache_sqlite: File ref_assembly_taxid: int + ref_assembly_id: int ANI_cutoff: File 
kmer_reference_assemblies: File tax_synon: File @@ -148,6 +149,7 @@ steps: in: top_distances: Identify_Top_N/top_distances ref_assembly_taxid: ref_assembly_taxid + ref_assembly_id: ref_assembly_id out: [tax_report, gc_id_list] Build_Kmer_Tree: label: Build Kmer Tree diff --git a/task_types/tt_kmer_top_n_extract.cwl b/task_types/tt_kmer_top_n_extract.cwl index bc5fb3e..b2a406a 100644 --- a/task_types/tt_kmer_top_n_extract.cwl +++ b/task_types/tt_kmer_top_n_extract.cwl @@ -5,6 +5,7 @@ class: Workflow # task type inputs: top_distances: File ref_assembly_taxid: int + ref_assembly_id: int outputs: tax_report: type: File @@ -17,8 +18,7 @@ steps: run: ../progs/kmer_top_n_extract.cwl in: input: top_distances - ref_assembly_id: - default: 0 # because input is FASTA, and original value is ${GP_gencoll_release} + ref_assembly_id: ref_assembly_id ref_assembly_taxid: ref_assembly_taxid threshold: default: 0.1