From 0209a4615f3c10449bb77b4136405a113734a7a8 Mon Sep 17 00:00:00 2001
From: ericjove <123645716+ericjove@users.noreply.github.com>
Date: Fri, 6 Oct 2023 09:04:30 -0400
Subject: [PATCH 01/16] JIRA: PGAPX-1175 Added new --prefix command line
 option.

---
 scripts/pgap.py | 60 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/scripts/pgap.py b/scripts/pgap.py
index 65eb7b9..d018c95 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -892,6 +892,56 @@ def create_simple_input_yaml_file(fasta_location, genus_species, output_filename
 
     return os.path.abspath(output_filename)
 
+def validate_prefix(prefix):
+    """
+    Validate that the given prefix is safe to use in filenames on Linux, macOS, and Windows.
+
+    Exits the program with an error message if the prefix is not valid.
+
+    Valid prefix:
+    - Contains only ASCII alphanumeric characters, underscores, or hyphens.
+    e.g., "my_prefix", "prefix123", "123_prefix", "prefix-123"
+
+    Invalid prefix:
+    - Is empty or contains any other character (including a trailing newline).
+    e.g., "my prefix", "prefix#", "prefix@", "prefix!"
+
+    Returns True when the prefix is valid.
+    """
+    # Raw string avoids the invalid "\-" escape; fullmatch (unlike match + "$") rejects a trailing newline.
+    if not re.fullmatch(r"[a-zA-Z0-9_-]+", prefix):
+        sys.exit(f"The provided prefix '{prefix}' is invalid. A valid prefix should only contain alphanumeric characters, underscores, and hyphens.")
+    return True
+
+def apply_prefix_to_output_dir(output_dir, prefix):
+    """
+    Renames each regular file directly inside the directory: a leading "annot" (the default prefix) is stripped, then the given prefix is prepended.
+
+    Parameters:
+    - output_dir (str): The path of the directory containing the files to rename; subdirectories are left untouched.
+    - prefix (str): The prefix to add to each file name.
+
+    Returns:
+    - None (a message is printed and nothing is renamed if output_dir does not exist).
+    """
+    if not os.path.exists(output_dir):
+        print(f"The directory {output_dir} does not exist.")
+        return
+
+    for filename in os.listdir(output_dir):
+        file_path = os.path.join(output_dir, filename)
+        if os.path.isfile(file_path):
+            # Drop the default 'annot' prefix (5 characters) if present
+            new_filename = filename
+            if filename.startswith("annot"):
+                new_filename = filename[5:]
+
+            # Prepend the new prefix to whatever remains of the name
+            new_file_path = os.path.join(output_dir, prefix + new_filename)
+            
+            # Rename in place within output_dir
+            os.rename(file_path, new_file_path)
+
 def main():
 
     parser = argparse.ArgumentParser(description="Input must be provided as:\n"
@@ -958,6 +1008,8 @@ def main():
                         #help='Set a maximum time for pipeline to run, format is D:H:M:S, H:M:S, or M:S, or S (default: %(default)s)')
     parser.add_argument('-q', '--quiet', action='store_true',
                         help='Quiet mode, for scripts')
+    parser.add_argument('--prefix', type=str,
+                        help='Set the prefix for output files (default: "annot")')
     parser.add_argument('--no-self-update', action='store_true',
                         dest='no_self_up',
                         help='Do not attempt to update this script')
@@ -971,6 +1023,10 @@ def main():
                         
     args = parser.parse_args()
 
+    # Ensure that user provided prefix is valid.
+    if args.prefix:
+        validate_prefix(args.prefix) 
+
     # const storing the initial working directory.
     # Please do not modify this variable's value.
     ORIGINAL_WORKSPACE = os.getcwd()
@@ -1054,6 +1110,10 @@ def main():
                             os.remove(submol_modified)
             remove_empty_files(outputdir)
 
+                       
+            if args.prefix:
+                apply_prefix_to_output_dir(outputdir, args.prefix)
+
     except (Exception, KeyboardInterrupt) as exc:
         if args.debug:
             raise

From af468ae34c82dc88399aeedfd8f12f1e87052367 Mon Sep 17 00:00:00 2001
From: ericjove <123645716+ericjove@users.noreply.github.com>
Date: Fri, 6 Oct 2023 09:16:46 -0400
Subject: [PATCH 02/16] JIRA: PGAPX-1175 Removed erroneous whitespace-only line

---
 scripts/pgap.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/pgap.py b/scripts/pgap.py
index d018c95..761677c 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -1110,7 +1110,6 @@ def main():
                             os.remove(submol_modified)
             remove_empty_files(outputdir)
 
-                       
             if args.prefix:
                 apply_prefix_to_output_dir(outputdir, args.prefix)
 

From 1b874287c38b67ef5fc47cde2c436b39ae8ac00a Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Tue, 28 Nov 2023 13:32:05 -0500
Subject: [PATCH 03/16] Organism name: genus or any level below genus; JIRA:
 PGAPX-1197

---
 scripts/pgap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/pgap.py b/scripts/pgap.py
index 761677c..60ae899 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -954,7 +954,7 @@ def main():
 
     parser.add_argument('-g', '--genome', type=str, help='Path to genomic fasta')
 
-    parser.add_argument('-s', '--organism', type=str, help='Binomial name')
+    parser.add_argument('-s', '--organism', type=str, help='Organism name: genus or any level below genus')
     parser.add_argument('input', nargs='?', help=argparse.SUPPRESS)
                         
 

From aeeace979246305e7efdc9299d37181b615990de Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Thu, 30 Nov 2023 11:12:48 -0500
Subject: [PATCH 04/16] changed help to Organism name: genus, genus species, or
 more specific and known to NCBI Taxonomy, see
 https://github.com/ncbi/pgap/wiki/Input-Files#taxonomy-information for more
 information; JIRA: PGAPX-1197

---
 scripts/pgap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/pgap.py b/scripts/pgap.py
index 60ae899..662fdd7 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -954,7 +954,7 @@ def main():
 
     parser.add_argument('-g', '--genome', type=str, help='Path to genomic fasta')
 
-    parser.add_argument('-s', '--organism', type=str, help='Organism name: genus or any level below genus')
+    parser.add_argument('-s', '--organism', type=str, help='Organism name: genus, genus species, or more specific and known to NCBI Taxonomy, see https://github.com/ncbi/pgap/wiki/Input-Files#taxonomy-information for more information')
     parser.add_argument('input', nargs='?', help=argparse.SUPPRESS)
                         
 

From bd17caff0120150fbcfe4c00623ee7d31ca4f7fc Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Mon, 4 Dec 2023 13:07:32 -0500
Subject: [PATCH 05/16] panfs->VAST; JIRA: PGAPX-1226

---
 bacterial_trna/trnascan_wnode.cwl |  2 +-
 progs/gencode2trnamodel.cwl       |  2 +-
 wf_common.cwl                     | 20 +++++++++++++++++---
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/bacterial_trna/trnascan_wnode.cwl b/bacterial_trna/trnascan_wnode.cwl
index 8446fe8..f6db487 100644
--- a/bacterial_trna/trnascan_wnode.cwl
+++ b/bacterial_trna/trnascan_wnode.cwl
@@ -40,7 +40,7 @@ inputs:
       prefix: -g
   binary:
     type: string?
-    default: /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/ThirdParty/tRNAscan-SE/production/bin/tRNAscan-SE
+    default: /netmnt/vast01/gp/ThirdParty/tRNAscan-SE/production/bin/tRNAscan-SE
     inputBinding:
       prefix: -tRNAscan
   taxid:
diff --git a/progs/gencode2trnamodel.cwl b/progs/gencode2trnamodel.cwl
index b6eaa90..385bcae 100644
--- a/progs/gencode2trnamodel.cwl
+++ b/progs/gencode2trnamodel.cwl
@@ -26,7 +26,7 @@ inputs:
              # genetic_codes = "ystmito";
          # }
 expression: |
-  ${ var gc = inputs.gencode; var gc2 = ""; if( gc == 4 ) { gc2="othmito"; } else if ( gc==6 ) { gc2="cilnuc"; } else if ( gc == 9) { gc2 = "echdmito" } else if ( gc == 5) { gc2 = "invmito"  } else if ( gc == 2) { gc2 = "vertmito"  } else if ( gc == 3) { gc2 = "ystmito"  } ; if ( gc2 != "" ) { return { "output": "/panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/ThirdParty/tRNAscan-SE/production/lib/tRNAscan-SE/gcode/gcode."+gc2 }; } else { return { "output": null }; } }
+  ${ var gc = inputs.gencode; var gc2 = ""; if( gc == 4 ) { gc2="othmito"; } else if ( gc==6 ) { gc2="cilnuc"; } else if ( gc == 9) { gc2 = "echdmito" } else if ( gc == 5) { gc2 = "invmito"  } else if ( gc == 2) { gc2 = "vertmito"  } else if ( gc == 3) { gc2 = "ystmito"  } ; if ( gc2 != "" ) { return { "output": "/netmnt/vast01/gp/ThirdParty/tRNAscan-SE/production/lib/tRNAscan-SE/gcode/gcode."+gc2 }; } else { return { "output": null }; } }
 
 outputs:
   output: string?
diff --git a/wf_common.cwl b/wf_common.cwl
index 3832b59..fedba6b 100755
--- a/wf_common.cwl
+++ b/wf_common.cwl
@@ -120,6 +120,7 @@ steps:
       - 23s_model_path
       - AntiFamLib
       - all_order_specific_blastdb_file
+      - amr_finder_plus_database
       - asn2pas_xsl
       - identification_db_dir
       - CDDdata2
@@ -398,7 +399,7 @@ steps:
       Generate_23S_rRNA_Annotation_annotation: bacterial_noncoding/annotations_23s
       Post_process_CMsearch_annotations_annots_5S: bacterial_noncoding/annotations_5s
       genemark_path: 
-        default: /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/ThirdParty/GeneMark/
+        default: /netmnt/vast01/gp/ThirdParty/GeneMark/
       thresholds: passdata/thresholds
     out: [lds2,seqids,proteins, aligns, annotation, out_hmm_params, outseqs, prot_ids, models1]
 
@@ -472,7 +473,7 @@ steps:
         wp_hashes: passdata/wp_hashes
         taxon_db: passdata/taxon_db
         genemark_path: 
-          default: /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/ThirdParty/GeneMark/
+          default: /netmnt/vast01/gp/ThirdParty/GeneMark/
     out:
         - id: Find_Best_Evidence_Alignments_aligns
         - id: Run_GeneMark_Post_models
@@ -561,8 +562,18 @@ steps:
     # # tasktype coded, input/output matches
     # # application not coded
   # ###############################################
-  # # AMR plane is for later stages skipping
+  # # AMR plane 
   # ###############################################
+  AMR_naming:
+    run: amr_naming/wf_amr_naming.cwl
+    in:
+      annotation: bacterial_annot_4/out_annotation
+      # aka Bacterial_Annot_Filter/out_annotation
+      database: passdata/amr_finder_plus_database
+      passdata: passdata/taxon_db
+      taxid: taxid
+    out: [amr_report]
+      
   bacterial_orthology_conditional:
     run: bacterial_orthology/wf_bacterial_orthology_conditional.cwl
     in:
@@ -1062,4 +1073,7 @@ outputs:
   checkm_results: 
     type: File
     outputSource: checkm/checkm_results
+  amr_report:
+    type: File
+    outputSource: AMR_naming/amr_report
   

From 7569d6ed7b62a2f0f600c12f6579b69d0073bb7d Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Mon, 4 Dec 2023 14:50:44 -0500
Subject: [PATCH 06/16] previous commit accidentally grabbed changes for AMR
 that are not ready yet; JIRA: PGAPX-1226

---
 wf_common.cwl | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)
 mode change 100755 => 100644 wf_common.cwl

diff --git a/wf_common.cwl b/wf_common.cwl
old mode 100755
new mode 100644
index fedba6b..598d5a7
--- a/wf_common.cwl
+++ b/wf_common.cwl
@@ -120,7 +120,6 @@ steps:
       - 23s_model_path
       - AntiFamLib
       - all_order_specific_blastdb_file
-      - amr_finder_plus_database
       - asn2pas_xsl
       - identification_db_dir
       - CDDdata2
@@ -562,18 +561,8 @@ steps:
     # # tasktype coded, input/output matches
     # # application not coded
   # ###############################################
-  # # AMR plane 
+  # # AMR plane is for later stages skipping 
   # ###############################################
-  AMR_naming:
-    run: amr_naming/wf_amr_naming.cwl
-    in:
-      annotation: bacterial_annot_4/out_annotation
-      # aka Bacterial_Annot_Filter/out_annotation
-      database: passdata/amr_finder_plus_database
-      passdata: passdata/taxon_db
-      taxid: taxid
-    out: [amr_report]
-      
   bacterial_orthology_conditional:
     run: bacterial_orthology/wf_bacterial_orthology_conditional.cwl
     in:
@@ -1073,7 +1062,4 @@ outputs:
   checkm_results: 
     type: File
     outputSource: checkm/checkm_results
-  amr_report:
-    type: File
-    outputSource: AMR_naming/amr_report
   

From 3b8acd25a515b365b39d258fbb14a69cbb5a743a Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Tue, 19 Dec 2023 10:36:13 -0500
Subject: [PATCH 07/16] remove clade_assign; JIRA: PGAPX-1227

---
 clade_assign/align_sort.cwl                | 28 ----------
 clade_assign/assign_clade_bacteria.cwl     | 65 ----------------------
 clade_assign/gpx_make_outputs.cwl          | 28 ----------
 clade_assign/gpx_qsubmit.cwl               | 52 -----------------
 clade_assign/tblastn_wnode.cwl             | 49 ----------------
 clade_assign/wf_assign_clade.cwl           | 40 -------------
 clade_assign/wf_clade_assign.cwl           | 44 ---------------
 clade_assign/wf_find_marker_alignments.cwl | 44 ---------------
 8 files changed, 350 deletions(-)
 delete mode 100644 clade_assign/align_sort.cwl
 delete mode 100644 clade_assign/assign_clade_bacteria.cwl
 delete mode 100644 clade_assign/gpx_make_outputs.cwl
 delete mode 100644 clade_assign/gpx_qsubmit.cwl
 delete mode 100644 clade_assign/tblastn_wnode.cwl
 delete mode 100644 clade_assign/wf_assign_clade.cwl
 delete mode 100644 clade_assign/wf_clade_assign.cwl
 delete mode 100755 clade_assign/wf_find_marker_alignments.cwl

diff --git a/clade_assign/align_sort.cwl b/clade_assign/align_sort.cwl
deleted file mode 100644
index f8edbff..0000000
--- a/clade_assign/align_sort.cwl
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: CommandLineTool
-
-label: "Assign Clade, align_sort"
-
-
-baseCommand: align_sort
-arguments: [ -ifmt, seq-align-set,  -k, "query,subject", -nogenbank ]
-
-inputs:
-  hits:
-    type: File
-    inputBinding:
-      prefix: -input
-  output:
-    type: string?
-    default: sorted-aligns.asn
-    inputBinding:
-      prefix: -o
-
-
-outputs: 
-  sorted_aligns:
-    type: File
-    outputBinding:
-      glob: $(inputs.output)
diff --git a/clade_assign/assign_clade_bacteria.cwl b/clade_assign/assign_clade_bacteria.cwl
deleted file mode 100644
index ee8eaea..0000000
--- a/clade_assign/assign_clade_bacteria.cwl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: CommandLineTool
-
-label: "Assign Clade, assign_clade_bacteria"
-
-
-baseCommand: assign_clade_bacteria
-arguments: [ -comp_based_stats, "F", -lower-threshold, "0.004", -matrix, BLOSUM80, -min-markers, "17", -release-id, "0",  -seg, "22 2.2 2.5", -soft_masking, "true", -task, tblastn, -threshold, "18", -upper-threshold, "0.01", -word_size, "6", -nogenbank ]
-
-inputs:
-  conffile:
-    type: File?
-    default:
-      class: File
-      location: /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/etc/bact/ncbi.ini
-    inputBinding:
-      prefix: -conffile
-  assembly_id:
-    type: string
-    inputBinding:
-      prefix: -assembly-taxid
-  sorted_aligns:
-    type: File
-    inputBinding:
-      prefix: -hits 
-  asn_cache:
-    type: Directory
-    inputBinding:
-      prefix: -asn-cache
-      valueFrom: $(inputs.asn_cache.basename),$(inputs.CladeMarkers_asn_cache.basename)
-  CladeMarkers_asn_cache:
-    type: Directory
-  ani:
-    type: File
-    inputBinding:
-      prefix: -ani 
-#  clade_tree:
-#    type: File
-#      inputBinding:
-#      prefix: -clade-tree 
-#  clade_tree_manifest:
-#    type: File?
-#    default:
-#      class: File
-#      location: ../input/dummy.mft
-#    inputBinding:
-#      prefix: -clade-tree-manifest
-  reference_set:
-    type: File
-    inputBinding:
-      prefix: -reference-set
-  output:
-    type: string?
-    default: clade_assignment.xml
-    inputBinding:
-      prefix: -o
-
-outputs: 
-  clade_assignment:
-    type: File
-    outputBinding:
-      glob: $(inputs.output)
-
diff --git a/clade_assign/gpx_make_outputs.cwl b/clade_assign/gpx_make_outputs.cwl
deleted file mode 100644
index f2599d7..0000000
--- a/clade_assign/gpx_make_outputs.cwl
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: CommandLineTool
-
-label: "Find Marker Alignments, gather"
-
-    
-baseCommand: gpx_make_outputs
-arguments: [ -unzip, '*', -num-partitions, "1" ]
-
-inputs:
-  input_path:
-    type: Directory
-    inputBinding:
-      prefix: -input-path
-  output_name:
-    type: string?
-    default: "blast.#.asn"
-    inputBinding:
-      prefix: -output
-
-outputs:
-  blast_align:
-    type: File
-    outputBinding:
-      #glob: $(inputs.output_name)
-      glob: blast.*.asn
diff --git a/clade_assign/gpx_qsubmit.cwl b/clade_assign/gpx_qsubmit.cwl
deleted file mode 100644
index 69bd13a..0000000
--- a/clade_assign/gpx_qsubmit.cwl
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: CommandLineTool
-
-label: "Find Marker Alignments, scatter"
-
-
-requirements:
-  - class: InitialWorkDirRequirement
-    listing:
-      - entry: $(inputs.asn_cache)
-        writable: False
-      - entry: $(inputs.CladeMarkers_asn_cache)
-        writable: False
-      - entry: $(inputs.blastdb_dir)
-        writable: False
-    
-baseCommand: gpx_qsubmit
-arguments: [ -affinity, "subject", -max-batch-length, "10000", -nogenbank ]
-
-inputs:
-  asn_cache:
-    type: Directory
-    inputBinding:
-      prefix: -asn-cache
-      valueFrom: $(inputs.asn_cache.basename),$(inputs.CladeMarkers_asn_cache.basename)
-  CladeMarkers_asn_cache:
-    type: Directory
-  seqids:
-    type: File
-    inputBinding:
-      prefix: -ids
-  blastdb_dir:
-    type: Directory
-  blastdb:
-    type: string?
-    default: blastdb
-    inputBinding:
-      prefix: -db
-      valueFrom: $(inputs.blastdb_dir.path)/$(inputs.blastdb)
-  output:
-    type: string?
-    default: jobs.xml
-    inputBinding:
-      prefix: -output
-
-outputs:
-  jobs:
-    type: File
-    outputBinding:
-      glob: $(inputs.output)
diff --git a/clade_assign/tblastn_wnode.cwl b/clade_assign/tblastn_wnode.cwl
deleted file mode 100644
index 2f40c70..0000000
--- a/clade_assign/tblastn_wnode.cwl
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: CommandLineTool
-
-label: Find Marker Alignments, execute"
-
-
-requirements:
-  - class: InitialWorkDirRequirement
-    listing:
-      - entry: $(inputs.asn_cache)
-        writable: False
-      - entry: $(inputs.CladeMarkers_asn_cache)
-        writable: False
-      - entry: $(inputs.blastdb_dir)
-        writable: False
-    
-baseCommand: tblastn_wnode
-
-arguments: [  -backlog, "1", -comp_based_stats, "F", -db_gencode, "4", -delay, "0", -evalue, "0.001", -matrix, BLOSUM80, -max-jobs, "1", -seg, "22 2.2 2.5", -soft_masking, "true", -threshold, "18", -word_size, "6", -nogenbank ]
-
-inputs:
-  asn_cache:
-    type: Directory
-    inputBinding:
-      prefix: -asn-cache
-      valueFrom: $(inputs.asn_cache.basename),$(inputs.CladeMarkers_asn_cache.basename)
-  CladeMarkers_asn_cache:
-    type: Directory
-  input_jobs:
-    type: File?
-    default: 
-      class: File 
-      location: jobs.xml
-    inputBinding:
-      prefix: -input-jobs
-  output_dir:
-    type: string?
-    default: output
-    inputBinding:
-      prefix: -O
-      
-outputs:
-  outdir:
-    type: Directory
-    outputBinding:
-      glob: $(inputs.output_dir)
-
diff --git a/clade_assign/wf_assign_clade.cwl b/clade_assign/wf_assign_clade.cwl
deleted file mode 100644
index 19d6c5a..0000000
--- a/clade_assign/wf_assign_clade.cwl
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: Workflow
-
-label: "Assign Clade"
-
-inputs:
-  asn_cache: Directory
-  CladeMarkers_asn_cache: Directory
-  assembly_id: string
-  hits: File
-  ani: File
-#  clade_tree: File
-  reference_set: File
-
-outputs:
-  clade_assignment:
-    type: File
-    outputSource: assign_clade_bacteria/clade_assignment
-
-steps:
-  align_sort:
-    run: align_sort.cwl
-    in:
-      hits: hits
-    out: [ sorted_aligns ]
-
-  assign_clade_bacteria:
-    run: assign_clade_bacteria.cwl
-    in:
-      sorted_aligns: align_sort/sorted_aligns
-      assembly_id: assembly_id
-      asn_cache: asn_cache
-      CladeMarkers_asn_cache: CladeMarkers_asn_cache
-      ani: ani
-#      clade_tree: clade_tree
-      reference_set: reference_set
-    out: [ clade_assignment ]
-
diff --git a/clade_assign/wf_clade_assign.cwl b/clade_assign/wf_clade_assign.cwl
deleted file mode 100644
index c1295e3..0000000
--- a/clade_assign/wf_clade_assign.cwl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: Workflow
-
-label: "Assign Clade plane complete workflow"
-
-requirements: 
-  - class: SubworkflowFeatureRequirement 
- 
-inputs:
-  asn_cache: Directory
-  CladeMarkers_asn_cache: Directory
-  seqids: File
-  blastdb_dir: Directory
-  assembly_id: string
-  ani: File
-  reference_set: File
-
-outputs:
-  clade_assignment:
-    type: File
-    outputSource: wf_assign_clade/clade_assignment
-
-steps:
-  wf_assign_clade:
-    run: wf_assign_clade.cwl
-    in:
-      hits: wf_find_marker_alignments/blast_align
-      assembly_id: assembly_id
-      asn_cache: asn_cache
-      CladeMarkers_asn_cache: CladeMarkers_asn_cache
-      ani: ani
-      reference_set: reference_set
-    out: [ clade_assignment ]
-    
-  wf_find_marker_alignments:
-    run: wf_find_marker_alignments.cwl
-    in:
-      asn_cache: asn_cache
-      CladeMarkers_asn_cache: CladeMarkers_asn_cache
-      seqids: seqids
-      blastdb_dir: blastdb_dir
-    out: [blast_align]
diff --git a/clade_assign/wf_find_marker_alignments.cwl b/clade_assign/wf_find_marker_alignments.cwl
deleted file mode 100755
index 5cc801b..0000000
--- a/clade_assign/wf_find_marker_alignments.cwl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: Workflow
-
-label: "Find Marker Alignments"
-
-inputs:
-  asn_cache: Directory
-  CladeMarkers_asn_cache: Directory
-  seqids: File
-  blastdb_dir: Directory
-
-outputs: 
-  blast_align:
-    type: File
-    outputSource: gpx_make_outputs/blast_align
-
-steps:
-  gpx_qsubmit:
-    run: gpx_qsubmit.cwl
-    in:
-      asn_cache: asn_cache
-      CladeMarkers_asn_cache: CladeMarkers_asn_cache
-      seqids: seqids
-      blastdb_dir: blastdb_dir
-    out: [jobs]
-  
-  tblastn_wnode:
-    run: tblastn_wnode.cwl
-    in:
-      asn_cache: asn_cache
-      CladeMarkers_asn_cache: CladeMarkers_asn_cache
-      input_jobs: gpx_qsubmit/jobs
-      blastdb_dir: blastdb_dir
-    out: [outdir]
-
-  gpx_make_outputs:
-    run: gpx_make_outputs.cwl
-    in:
-      input_path: tblastn_wnode/outdir
-    out: [blast_align]
-
-

From f5d66effb5c3d9871b61d21868ae87e2de088332 Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Wed, 24 Jan 2024 06:12:44 -0500
Subject: [PATCH 08/16] step renaming that does not involve file renaming;
 JIRA: PGAPX-1206

---
 .../test_preserve_annot_markup/test.cwl       | 10 ++---
 .../unit_tests/test_univ_prot_stats/test.cwl  |  4 +-
 protein_alignment/cat.cwl                     |  4 +-
 protein_alignment/wf_align_filter.cwl         |  4 +-
 protein_alignment/wf_protein_alignment.cwl    |  4 +-
 wf_common.cwl                                 | 42 +++++++++----------
 6 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/progs/unit_tests/test_preserve_annot_markup/test.cwl b/progs/unit_tests/test_preserve_annot_markup/test.cwl
index d8ea225..b4ed8d1 100755
--- a/progs/unit_tests/test_preserve_annot_markup/test.cwl
+++ b/progs/unit_tests/test_preserve_annot_markup/test.cwl
@@ -44,7 +44,7 @@ steps:
   Final_Bacterial_Package_asn_cleanup: # TESTED as part of "last couple of nodes" test
     run: progs/asn_cleanup.cwl
     in:
-      # inp_annotation: bacterial_annot_4/out_annotation 
+      # inp_annotation: bacterial_annot_2nd_pass/out_annotation 
       # inp_annotation: bacterial_annot_4_out_annotation_bypass # , this bypass does not work: SQD-4522
       # using oroginal input from official buildrun template (that is from fam_report output)
       inp_annotation: fam_report_bypass
@@ -177,10 +177,10 @@ steps:
     in:
       annot_request_id: 
         default: -1 # this is dummy annot_request_id
-      # hmm_search: bacterial_annot_3/Search_Naming_HMMs_hmm_hits # Search Naming HMMs bacterial_annot 3       
-      hmm_search: bacterial_annot_3_Search_Naming_HMMs_hmm_hits_bypass # for bacterial_annot_3/Search_Naming_HMMs_hmm_hits # Search Naming HMMs bacterial_annot 3       
-      # hmm_search_proteins: bacterial_annot_3/Run_GeneMark_Post_models # genemark models
-      hmm_search_proteins: bacterial_annot_3_Run_GeneMark_Post_models_bypass # for bacterial_annot_3/Run_GeneMark_Post_models # genemark models
+      # hmm_search: bacterial_annot_misc/Search_Naming_HMMs_hmm_hits # Search Naming HMMs bacterial_annot 3       
+      hmm_search: bacterial_annot_3_Search_Naming_HMMs_hmm_hits_bypass # for bacterial_annot_misc/Search_Naming_HMMs_hmm_hits # Search Naming HMMs bacterial_annot 3       
+      # hmm_search_proteins: bacterial_annot_misc/Run_GeneMark_Post_models # genemark models
+      hmm_search_proteins: bacterial_annot_3_Run_GeneMark_Post_models_bypass # for bacterial_annot_misc/Run_GeneMark_Post_models # genemark models
       input:  Final_Bacterial_Package_final_bact_asn/outfull
       univ_prot_xml:  univ_prot_xml # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/third-party/data/BacterialPipeline/uniColl/ver-3.2/universal.xml 
       val_res_den_xml:  val_res_den_xml # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/etc/validation-results.xml
diff --git a/progs/unit_tests/test_univ_prot_stats/test.cwl b/progs/unit_tests/test_univ_prot_stats/test.cwl
index c9c2154..e51e711 100755
--- a/progs/unit_tests/test_univ_prot_stats/test.cwl
+++ b/progs/unit_tests/test_univ_prot_stats/test.cwl
@@ -19,8 +19,8 @@ steps:
     in:
       annot_request_id: 
         default: -1 # this is dummy annot_request_id
-      hmm_search: hmm_hits # bacterial_annot_3_Search_Naming_HMMs_hmm_hits_bypass # for bacterial_annot_3/Search_Naming_HMMs_hmm_hits # Search Naming HMMs bacterial_annot 3       
-      hmm_search_proteins: hmm_search_proteins #  bacterial_annot_3_Run_GeneMark_Post_models_bypass # for bacterial_annot_3/Run_GeneMark_Post_models # genemark models
+      hmm_search: hmm_hits # bacterial_annot_3_Search_Naming_HMMs_hmm_hits_bypass # for bacterial_annot_misc/Search_Naming_HMMs_hmm_hits # Search Naming HMMs bacterial_annot 3       
+      hmm_search_proteins: hmm_search_proteins #  bacterial_annot_3_Run_GeneMark_Post_models_bypass # for bacterial_annot_misc/Run_GeneMark_Post_models # genemark models
       input:  outfull # Final_Bacterial_Package_final_bact_asn/outfull
       univ_prot_xml:  univ_prot_xml # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/third-party/data/BacterialPipeline/uniColl/ver-3.2/universal.xml 
       val_res_den_xml:  val_res_den_xml # /panfs/pan1.be-md.ncbi.nlm.nih.gov/gpipe/home/badrazat/local-install/2018-05-17/etc/validation-results.xml
diff --git a/protein_alignment/cat.cwl b/protein_alignment/cat.cwl
index 9587ef4..c8a4bca 100644
--- a/protein_alignment/cat.cwl
+++ b/protein_alignment/cat.cwl
@@ -9,11 +9,11 @@ baseCommand: cat
 stdout: out.asn
 
 inputs:
-  file_in_1:
+  file_in_1st:
     type: File
     inputBinding:
       position: 1
-  file_in_2:
+  file_in_2nd:
     type: File
     inputBinding:
       position: 2
diff --git a/protein_alignment/wf_align_filter.cwl b/protein_alignment/wf_align_filter.cwl
index 3acbe75..14753b9 100644
--- a/protein_alignment/wf_align_filter.cwl
+++ b/protein_alignment/wf_align_filter.cwl
@@ -23,8 +23,8 @@ steps:
   cat:
     run: cat.cwl
     in:
-      file_in_1: blast_full
-      file_in_2: prosplign
+      file_in_1st: blast_full
+      file_in_2nd: prosplign
     out: [ file_out ]
      
   align_filter:
diff --git a/protein_alignment/wf_protein_alignment.cwl b/protein_alignment/wf_protein_alignment.cwl
index 35dc9f9..6cb6922 100755
--- a/protein_alignment/wf_protein_alignment.cwl
+++ b/protein_alignment/wf_protein_alignment.cwl
@@ -63,8 +63,8 @@ steps:
   cat:
     run: cat.cwl
     in:
-      file_in_1: Seed_Protein_Alignments/blast_align
-      file_in_2: Seed_Search_Compartments/blast_align
+      file_in_1st: Seed_Protein_Alignments/blast_align
+      file_in_2nd: Seed_Search_Compartments/blast_align
     out: [ file_out ]
 
   Sort_Seed_Hits:
diff --git a/wf_common.cwl b/wf_common.cwl
index 598d5a7..3c2d628 100644
--- a/wf_common.cwl
+++ b/wf_common.cwl
@@ -402,7 +402,7 @@ steps:
       thresholds: passdata/thresholds
     out: [lds2,seqids,proteins, aligns, annotation, out_hmm_params, outseqs, prot_ids, models1]
 
-  spurious_annot_1: # PLANE
+  spurious_annot_prelim: # PLANE
     run: spurious_annot/wf_spurious_annot_pass1.cwl
     in:
       Extract_ORF_Proteins_proteins: bacterial_annot/proteins
@@ -413,14 +413,14 @@ steps:
       scatter_gather_nchunks: scatter_gather_nchunks
     out: [AntiFam_tainted_proteins_I___oseqids]
 
-  bacterial_annot_2: # PLANE
+  bacterial_annot_1st_pass: # PLANE
     run: bacterial_annot/wf_bacterial_annot_pass2.cwl
     in:
         lds2: bacterial_annot/lds2
         proteins: bacterial_annot/proteins
         prot_ids_A: bacterial_annot/seqids
         prot_ids_B1: bacterial_annot/prot_ids
-        prot_ids_B2: spurious_annot_1/AntiFam_tainted_proteins_I___oseqids
+        prot_ids_B2: spurious_annot_prelim/AntiFam_tainted_proteins_I___oseqids
         identification_db_dir: passdata/identification_db_dir
         blastdb: Get_Proteins/selected_blastdb
         annotation: bacterial_annot/outseqs
@@ -447,11 +447,11 @@ steps:
       taxid: taxid
       tax_sql_file: passdata/taxon_db
       gc_assembly: genomic_source/gencoll_asn
-      compartments: bacterial_annot_2/aligns
+      compartments: bacterial_annot_1st_pass/aligns
       all_prots: Get_Proteins/all_prots
     out: [align, align_non_match]
 
-  bacterial_annot_3:
+  bacterial_annot_misc:
     run: bacterial_annot/wf_bacterial_annot_pass3.cwl
     in:
         AntiFamLib: passdata/AntiFamLib
@@ -484,35 +484,35 @@ steps:
         - id: Name_by_WPs_names
         - id: PGAP_plus_ab_initio_annotation
 
-  spurious_annot_2:
+  spurious_annot_final:
     run: spurious_annot/wf_spurious_annot_pass2.cwl
     in:
-      Extract_Model_Proteins_proteins: bacterial_annot_3/Extract_Model_Proteins_proteins
-      Extract_Model_Proteins_seqids: bacterial_annot_3/Extract_Model_Proteins_seqids
-      Extract_Model_Proteins_lds2: bacterial_annot_3/Extract_Model_Proteins_lds2
+      Extract_Model_Proteins_proteins: bacterial_annot_misc/Extract_Model_Proteins_proteins
+      Extract_Model_Proteins_seqids: bacterial_annot_misc/Extract_Model_Proteins_seqids
+      Extract_Model_Proteins_lds2: bacterial_annot_misc/Extract_Model_Proteins_lds2
       AntiFamLib: passdata/AntiFamLib
       sequence_cache: genomic_source/asncache
       scatter_gather_nchunks: scatter_gather_nchunks
-      input_models: bacterial_annot_3/PGAP_plus_ab_initio_annotation
+      input_models: bacterial_annot_misc/PGAP_plus_ab_initio_annotation
     out:
       - AntiFam_tainted_proteins___oseqids
       - Good_AntiFam_filtered_annotations_out
       - Good_AntiFam_filtered_proteins_output
 
-  bacterial_annot_4:
+  bacterial_annot_2nd_pass:
     run: bacterial_annot/wf_bacterial_annot_pass4.cwl
     in:
-        lds2: bacterial_annot_3/Extract_Model_Proteins_lds2
-        proteins: bacterial_annot_3/Extract_Model_Proteins_proteins
-        annotation: spurious_annot_2/Good_AntiFam_filtered_annotations_out
-        Good_AntiFam_filtered_proteins_gilist: spurious_annot_2/Good_AntiFam_filtered_proteins_output
+        lds2: bacterial_annot_misc/Extract_Model_Proteins_lds2
+        proteins: bacterial_annot_misc/Extract_Model_Proteins_proteins
+        annotation: spurious_annot_final/Good_AntiFam_filtered_annotations_out
+        Good_AntiFam_filtered_proteins_gilist: spurious_annot_final/Good_AntiFam_filtered_proteins_output
         sequence_cache: genomic_source/asncache
         uniColl_cache: passdata/uniColl_cache
         identification_db_dir: passdata/identification_db_dir
         naming_sqlite: passdata/naming_sqlite
-        hmm_assignments:  bacterial_annot_3/Assign_Naming_HMM_to_Proteins_assignments
-        wp_assignments:  bacterial_annot_3/Name_by_WPs_names
-        Extract_Model_Proteins_prot_ids: bacterial_annot_3/Extract_Model_Proteins_seqids
+        hmm_assignments:  bacterial_annot_misc/Assign_Naming_HMM_to_Proteins_assignments
+        wp_assignments:  bacterial_annot_misc/Name_by_WPs_names
+        Extract_Model_Proteins_prot_ids: bacterial_annot_misc/Extract_Model_Proteins_seqids
         CDDdata: passdata/CDDdata
         CDDdata2: passdata/CDDdata2
         thresholds: passdata/thresholds
@@ -584,7 +584,7 @@ steps:
   Add_Locus_Tags:
     run: progs/add_locus_tags.cwl
     in:
-        input: bacterial_annot_4/out_annotation
+        input: bacterial_annot_2nd_pass/out_annotation
         locus_tag_prefix: locus_tag_prefix
         dbname: dbname
     out: [output]
@@ -866,8 +866,8 @@ steps:
     in:
       annot_request_id:
         default: -1 # this is dummy annot_request_id
-      hmm_search: bacterial_annot_3/Search_Naming_HMMs_hmm_hits
-      hmm_search_proteins: bacterial_annot_3/PGAP_plus_ab_initio_annotation
+      hmm_search: bacterial_annot_misc/Search_Naming_HMMs_hmm_hits
+      hmm_search_proteins: bacterial_annot_misc/PGAP_plus_ab_initio_annotation
       input:  Final_Bacterial_Package_final_bact_asn/outfull
       univ_prot_xml:  passdata/univ_prot_xml
       val_res_den_xml:  passdata/val_res_den_xml

From 85e9bef0fe674bd68ddd883fdec17d1f9735bb85 Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Thu, 25 Jan 2024 11:21:38 -0500
Subject: [PATCH 09/16] rename files and eponymous steps and references; JIRA:
 PGAPX-1206

---
 .../{gpx_qsubmit_1.cwl => gpx_qsubmit_seqids.cwl}             | 0
 .../{tblastn_wnode_1.cwl => tblastn_wnode_db.cwl}             | 0
 protein_alignment/wf_protein_alignment.cwl                    | 2 +-
 protein_alignment/{wf_seed_1.cwl => wf_seed_seqids.cwl}       | 4 ++--
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename protein_alignment/{gpx_qsubmit_1.cwl => gpx_qsubmit_seqids.cwl} (100%)
 rename protein_alignment/{tblastn_wnode_1.cwl => tblastn_wnode_db.cwl} (100%)
 rename protein_alignment/{wf_seed_1.cwl => wf_seed_seqids.cwl} (92%)

diff --git a/protein_alignment/gpx_qsubmit_1.cwl b/protein_alignment/gpx_qsubmit_seqids.cwl
similarity index 100%
rename from protein_alignment/gpx_qsubmit_1.cwl
rename to protein_alignment/gpx_qsubmit_seqids.cwl
diff --git a/protein_alignment/tblastn_wnode_1.cwl b/protein_alignment/tblastn_wnode_db.cwl
similarity index 100%
rename from protein_alignment/tblastn_wnode_1.cwl
rename to protein_alignment/tblastn_wnode_db.cwl
diff --git a/protein_alignment/wf_protein_alignment.cwl b/protein_alignment/wf_protein_alignment.cwl
index 6cb6922..506fd00 100755
--- a/protein_alignment/wf_protein_alignment.cwl
+++ b/protein_alignment/wf_protein_alignment.cwl
@@ -51,7 +51,7 @@ steps:
     out: [ blast_align ]
 
   Seed_Protein_Alignments:
-    run: wf_seed_1.cwl
+    run: wf_seed_seqids.cwl
     in:
       db_gencode: Compute_Gencode_int/value
       asn_cache: asn_cache
diff --git a/protein_alignment/wf_seed_1.cwl b/protein_alignment/wf_seed_seqids.cwl
similarity index 92%
rename from protein_alignment/wf_seed_1.cwl
rename to protein_alignment/wf_seed_seqids.cwl
index 1655ac4..9e257d9 100644
--- a/protein_alignment/wf_seed_1.cwl
+++ b/protein_alignment/wf_seed_seqids.cwl
@@ -19,7 +19,7 @@ outputs:
 
 steps:
   gpx_qsubmit:
-    run: gpx_qsubmit_1.cwl
+    run: gpx_qsubmit_seqids.cwl
     in:
       asn_cache: asn_cache
       uniColl_asn_cache: uniColl_asn_cache
@@ -28,7 +28,7 @@ steps:
     out: [jobs]
   
   tblastn_wnode:
-    run: tblastn_wnode_1.cwl
+    run: tblastn_wnode_db.cwl
     in:
       db_gencode: db_gencode
       asn_cache: asn_cache

From 11bf2dc3db609d00941571dbc56f6da0653dd53b Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Mon, 29 Jan 2024 10:49:05 -0500
Subject: [PATCH 10/16]  (1) hook up pgap.cwl with 4 diagnostic exception
 queries that we had in wf_common.cwl workflow (2) add modified default
 exception XML queries to input.yaml file generation for the case when user
 defaulted to command line specs for metadata. Modification includes addition
 of exempting GENERIC_BadSubmissionAuthorName; JIRA: PGAPX-1246

---
 pgap.cwl        |  9 +++++++++
 scripts/pgap.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/pgap.cwl b/pgap.cwl
index 82bc3fb..4272564 100755
--- a/pgap.cwl
+++ b/pgap.cwl
@@ -36,6 +36,15 @@ inputs:
     default: true
   uuid_in:
     type: File?
+  xpath_fail_initial_asndisc: 
+      type: string?
+  xpath_fail_initial_asnvalidate: 
+      type: string?
+  xpath_fail_final_asndisc: 
+      type: string?
+  xpath_fail_final_asnvalidate: 
+      type: string?
+    
 outputs:
   calls:
     outputSource: vecscreen/calls
diff --git a/scripts/pgap.py b/scripts/pgap.py
index 662fdd7..0590910 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -183,7 +183,11 @@ def __init__(self, params, local_input, pipeline):
             self.submol = self.create_submolfile(submol, params.ani_output, params.ani_hr_output, params.args.auto_correct_tax)
         else:
             self.submol = None
-        self.yaml = self.create_inputfile(local_input)
+        add_std_validation_exemptions = False
+        args = self.params.args
+        if not args.input and args.genome and args.organism:
+            add_std_validation_exemptions = True
+        self.yaml = self.create_inputfile(local_input, add_std_validation_exemptions)
         if self.params.docker_type in ['singularity', 'apptainer']:
             self.make_singularity_cmd()
         elif self.params.docker_type == 'podman':
@@ -368,7 +372,7 @@ def create_submolfile(self, local_submol, ani_output, ani_hr_output, auto_correc
         return yaml
             
         
-    def create_inputfile(self, local_input):
+    def create_inputfile(self, local_input, add_std_validation_exemptions):
         with tempfile.NamedTemporaryFile(mode='w',
                                          suffix=".yaml",
                                          prefix="pgap_input_",
@@ -398,6 +402,46 @@ def create_inputfile(self, local_input):
             if os.path.exists(uuidfile) and os.stat(uuidfile).st_size != 0:
                 fOut.write(u'make_uuid: false\n')
                 fOut.write(u'uuid_in: { class: File, location: /pgap/output/uuid.txt }\n')
+            if add_std_validation_exemptions:
+                fOut.write(f"""
+xpath_fail_initial_asnvalidate: >
+        //*[
+            ( @severity="ERROR" or @severity="REJECT" )
+            and not(contains(@code, "GENERIC_MissingPubRequirement")) 
+            and not(contains(@code, "GENERIC_BadSubmissionAuthorName")) 
+            and not(contains(@code, "SEQ_DESCR_ChromosomeLocation")) 
+            and not(contains(@code, "SEQ_DESCR_MissingLineage")) 
+            and not(contains(@code, "SEQ_DESCR_NoTaxonID")) 
+            and not(contains(@code, "SEQ_DESCR_OrganismIsUndefinedSpecies"))
+            and not(contains(@code, "SEQ_DESCR_StrainWithEnvironSample"))
+            and not(contains(@code, "SEQ_DESCR_BacteriaMissingSourceQualifier"))
+            and not(contains(@code, "SEQ_DESCR_UnwantedCompleteFlag")) 
+            and not(contains(@code, "SEQ_FEAT_BadCharInAuthorLastName")) 
+            and not(contains(@code, "SEQ_FEAT_ShortIntron")) 
+            and not(contains(@code, "SEQ_INST_InternalNsInSeqRaw")) 
+            and not(contains(@code, "SEQ_INST_ProteinsHaveGeneralID")) 
+            and not(contains(@code, "SEQ_PKG_NucProtProblem")) 
+            and not(contains(@code, "SEQ_PKG_ComponentMissingTitle")) 
+        ]
+xpath_fail_final_asnvalidate: >
+        //*[( @severity="ERROR" or @severity="REJECT" )
+            and not(contains(@code, "GENERIC_MissingPubRequirement")) 
+            and not(contains(@code, "GENERIC_BadSubmissionAuthorName")) 
+            and not(contains(@code, "SEQ_DESCR_ChromosomeLocation")) 
+            and not(contains(@code, "SEQ_DESCR_MissingLineage")) 
+            and not(contains(@code, "SEQ_DESCR_NoTaxonID")) 
+            and not(contains(@code, "SEQ_DESCR_OrganismIsUndefinedSpecies"))
+            and not(contains(@code, "SEQ_DESCR_StrainWithEnvironSample"))
+            and not(contains(@code, "SEQ_DESCR_BacteriaMissingSourceQualifier"))
+            and not(contains(@code, "SEQ_DESCR_UnwantedCompleteFlag")) 
+            and not(contains(@code, "SEQ_FEAT_BadCharInAuthorLastName")) 
+            and not(contains(@code, "SEQ_FEAT_ShortIntron")) 
+            and not(contains(@code, "SEQ_INST_InternalNsInSeqRaw")) 
+            and not(contains(@code, "SEQ_INST_ProteinsHaveGeneralID")) 
+            and not(contains(@code, "SEQ_PKG_ComponentMissingTitle")) 
+            and not(contains(@code, "SEQ_PKG_NucProtProblem")) 
+        ]
+""")
             fOut.flush()
         return yaml
         

From c431db4e6a1860d6b55be2dc86aae02c77efe089 Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Mon, 29 Jan 2024 11:56:38 -0500
Subject: [PATCH 11/16]  added a comment in Python script to explain that now
 we are committed to last name 'Lastname' and, similarly, to first name setting
 as default: they are coded in asn_validate tool; JIRA: PGAPX-1246

---
 scripts/pgap.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/pgap.py b/scripts/pgap.py
index 0590910..6a5b323 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -354,6 +354,10 @@ def create_submolfile(self, local_submol, ani_output, ani_hr_output, auto_correc
             if  has_authors == False:
                 fOut.write(u'authors:\n')
                 fOut.write(u'    - author:\n')
+                #
+                # NOTE: do not change these default names. They are now hard-coded
+                # in the standard asnvalidate diagnostics tool; that is how GenBank detects that users did not provide correct names.
+                #
                 fOut.write(u"        first_name: 'Firstname'\n")
                 fOut.write(u"        last_name: 'Lastname'\n")
             if  has_contact_info == False:

From cfd5aa88a03ec84835db9903fabe32da384df66e Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Mon, 29 Jan 2024 12:37:53 -0500
Subject: [PATCH 12/16]  pass the 4 validation exemption strings from pgap
 input to wf_common input; JIRA: PGAPX-1246

---
 pgap.cwl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pgap.cwl b/pgap.cwl
index 4272564..963194c 100755
--- a/pgap.cwl
+++ b/pgap.cwl
@@ -194,6 +194,10 @@ steps:
       make_uuid: make_uuid
       uuid_in: uuid_in
       blast_hits_cache_data: blast_hits_cache_data
+      xpath_fail_initial_asndisc: xpath_fail_initial_asndisc
+      xpath_fail_initial_asnvalidate: xpath_fail_initial_asnvalidate
+      xpath_fail_final_asndisc: xpath_fail_final_asndisc
+      xpath_fail_final_asnvalidate: xpath_fail_final_asnvalidate
     out: [gbent, gbk, gff, nucleotide_fasta, protein_fasta, cds_nucleotide_fasta, cds_protein_fasta, sqn, initial_asndisc_error_diag, initial_asnval_error_diag, final_asndisc_error_diag, final_asnval_error_diag, checkm_raw, checkm_results]
     run: wf_common.cwl
   Generate_Annotation_Reports_gff_enhanced:

From 304b59e5f9947bfa5d4ca45f154089898cd8d0b2 Mon Sep 17 00:00:00 2001
From: Azat Badretdin <38532187+azat-badretdin@users.noreply.github.com>
Date: Mon, 29 Jan 2024 15:27:59 -0500
Subject: [PATCH 13/16] args.input is created already by this time, do not
 check for it. JIRA: PGAPX-1246

---
 scripts/pgap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/pgap.py b/scripts/pgap.py
index 6a5b323..4493d97 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -185,7 +185,7 @@ def __init__(self, params, local_input, pipeline):
             self.submol = None
         add_std_validation_exemptions = False
         args = self.params.args
-        if not args.input and args.genome and args.organism:
+        if args.genome and args.organism:
             add_std_validation_exemptions = True
         self.yaml = self.create_inputfile(local_input, add_std_validation_exemptions)
         if self.params.docker_type in ['singularity', 'apptainer']:

From 83d7e319d694bc2caa02a604b637a15aae944a0c Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Sun, 31 Mar 2024 07:21:40 -0400
Subject: [PATCH 14/16] do not use blast_hits_cache in orthology graph; JIRA:
 PGAPX-1283

---
 wf_common.cwl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/wf_common.cwl b/wf_common.cwl
index 3c2d628..da3472b 100644
--- a/wf_common.cwl
+++ b/wf_common.cwl
@@ -574,7 +574,6 @@ steps:
       asn_cache: 
         source: [passdata/uniColl_nuc_cache, genomic_source/asncache]
         linkMerge: merge_flattened
-      blast_hits_cache: blast_hits_cache_data_split_dir/blast_hits_cache
       genus_list: genus_list_file2ints/values
       blastdb:
         default: [blastdb]

From 5d73f9d29542235613311ce7d9432bbbaa10d0d5 Mon Sep 17 00:00:00 2001
From: George Coulouris <coulouri@lmem14.be-md.ncbi.nlm.nih.gov>
Date: Mon, 15 Apr 2024 10:10:21 -0400
Subject: [PATCH 15/16] PGAPX-1268 force platform to linux/amd64 for apple
 silicon

---
 scripts/pgap.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/scripts/pgap.py b/scripts/pgap.py
index 4493d97..e82db79 100755
--- a/scripts/pgap.py
+++ b/scripts/pgap.py
@@ -219,6 +219,8 @@ def __init__(self, params, local_input, pipeline):
 
     def make_docker_cmd(self):
         self.cmd = [self.params.docker_cmd, 'run', '-i', '--rm' ]
+        self.cmd.extend(['--platform', 'linux/amd64'])
+
         if self.params.docker_user_remap:
             self.cmd.extend(['--user', str(os.getuid()) + ":" + str(os.getgid())])
         self.cmd.extend([
@@ -531,11 +533,13 @@ def launch(self):
                 for line in fIn:
                     f.write(line)
             f.write("--- End YAML Input ---\n\n")
-            # Show runtime parameters in the log
-            f.write("--- Start Runtime Report ---\n")            
-            self.record_runtime(f)
-            f.write("\n--- End Runtime Report ---\n\n")            
-            f.flush()
+
+            if platform.system() != "Darwin":
+                # Show runtime parameters in the log
+                f.write("--- Start Runtime Report ---\n")
+                self.record_runtime(f)
+                f.write("\n--- End Runtime Report ---\n\n")
+            f.flush()  # flush buffered header text before the child process writes to the same file, so the log stays in order
             try:
                 proc = subprocess.Popen(self.cmd, stdout=f, stderr=subprocess.STDOUT)
                 proc.wait()

From a1851f7b930a08bb100e81329b24d0aaa7a644e8 Mon Sep 17 00:00:00 2001
From: "Badretdin, Azat" <badrazat@ncbi.nlm.nih.gov>
Date: Tue, 16 Apr 2024 18:40:35 -0400
Subject: [PATCH 16/16] switch to ncbi_crisper 1.03; JIRA: PGAPX-1293

---
 bacterial_mobile_elem/ncbi_crisper_wnode.cwl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bacterial_mobile_elem/ncbi_crisper_wnode.cwl b/bacterial_mobile_elem/ncbi_crisper_wnode.cwl
index d8e53f1..c0b4a96 100644
--- a/bacterial_mobile_elem/ncbi_crisper_wnode.cwl
+++ b/bacterial_mobile_elem/ncbi_crisper_wnode.cwl
@@ -16,7 +16,7 @@ inputs:
       prefix: -input-jobs
   crisper_path:
     type: string?
-    default: /opt/crispr/1.02/bin/
+    default: /opt/crispr/1.03/bin/
     inputBinding:
       prefix: -ncbi-crisper-path
   output_dir: