Merge pull request #62 from ENCODE-DCC/hotfix_for_v1.1.6
Hotfix for v1.1.6
leepc12 authored Apr 9, 2019
2 parents 81d11d7 + 81721e3 commit 9563c12
Showing 34 changed files with 200 additions and 144 deletions.
129 changes: 89 additions & 40 deletions backends/backend.conf
@@ -1,7 +1,7 @@
include required(classpath("application"))

backend {
default = "Local"
default = "local"
providers {

pbs {
@@ -20,9 +20,9 @@ backend {
-N ${job_name} \
-o ${out} \
-e ${err} \
${"-lselect=1:ncpus=" + cpu + ":mem=" + memory_mb/1024 + "gb"} \
${"-lwalltime=" + time + ":0:0"} \
${if gpu>1 then "-lngpus=" + gpu else ""} \
${true="-lselect=1:ncpus=" false="" defined(cpu)}${cpu}${true=":mem=" false="" defined(memory_mb)}${memory_mb}${true="mb" false="" defined(memory_mb)} \
${true="-lwalltime=" false="" defined(time)}${time}${true=":0:0" false="" defined(time)} \
${true="-lngpus=" false="" gpu>1}${if gpu>1 then gpu else ""} \
-V \
${script}
"""
@@ -44,18 +44,32 @@ backend {
Int? memory_mb
String singularity_container
String? singularity_bindpath
String? singularity_ld_library_path
"""
submit = """
ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && LD_LIBRARY_PATH=${singularity_ld_library_path}:$LD_LIBRARY_PATH SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \
echo "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" | qsub \
-N ${job_name} \
-o ${out} \
-e ${err} \
${"-lselect=1:ncpus=" + cpu + ":mem=" + memory_mb/1024 + "gb"} \
${"-lwalltime=" + time + ":0:0"} \
${if gpu>1 then "-lngpus=" + gpu else ""} \
-V)
${true="-lselect=1:ncpus=" false="" defined(cpu)}${cpu}${true=":mem=" false="" defined(memory_mb)}${memory_mb}${true="mb" false="" defined(memory_mb)} \
${true="-lwalltime=" false="" defined(time)}${time}${true=":0:0" false="" defined(time)} \
${true="-lngpus=" false="" gpu>1}${if gpu>1 then gpu else ""} \
-V
# If you see an error like "The job was aborted from outside Cromwell",
# check your Singularity settings in the workflow options JSON file
# (e.g. make sure an image file is defined by "singularity_container").
# Also make sure that your input data files (and genome database files)
# are under directories recursively bound by "singularity_bindpath"
# in the workflow options JSON file or by Singularity's built-in
# environment variable SINGULARITY_BINDPATH.
"""
# Cromwell is designed to monitor the rc (return code) file, which is generated/controlled
# in ${script}, so if Singularity fails to run the script because of a problem in its
# internal settings, the rc file is never created.
# This can leave the Cromwell process hanging.
# Setting the parameter below enables monitoring via "check-alive".
# Detecting such a failure takes roughly "exit-code-timeout-seconds" x 3.
exit-code-timeout-seconds = 180

kill = "qdel ${job_id}"
check-alive = "qstat -j ${job_id}"
job-id-regex = "(\\d+)"
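
The comments in the submit block above point to Singularity settings supplied through a workflow options JSON file. A minimal sketch of such a file, passed to Cromwell with `-o`, is shown below; the image path and bind paths are placeholders, not part of this commit:

```bash
# Sketch only: write a workflow options file with Singularity defaults.
cat > workflow_opts/singularity.json << 'EOF'
{
    "default_runtime_attributes": {
        "singularity_container": "/path/to/chip-seq-pipeline-v1.1.6.1.simg",
        "singularity_bindpath": "/your/data,/your/genome_db"
    }
}
EOF
```
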
@@ -65,7 +79,7 @@ backend {
slurm_singularity {
actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
config {
script-epilogue = "sleep 30"
script-epilogue = "sleep 30 && sync"
concurrent-job-limit = 50
runtime-attributes = """
Int cpu = 1
@@ -77,10 +91,9 @@ backend {
String? slurm_extra_param
String singularity_container
String? singularity_bindpath
String? singularity_ld_library_path
"""
submit = """
ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (sbatch \
sbatch \
--export=ALL \
-J ${job_name} \
-D ${cwd} \
@@ -89,16 +102,34 @@ backend {
${"-t " + time*60} \
-n 1 \
--ntasks-per-node=1 \
${"--cpus-per-task=" + cpu} \
${"--mem=" + memory_mb} \
${true="--cpus-per-task=" false="" defined(cpu)}${cpu} \
${true="--mem=" false="" defined(memory_mb)}${memory_mb} \
${"-p " + slurm_partition} \
${"--account " + slurm_account} \
${"--gres gpu:" + gpu} \
${true="--gres gpu:" false="" defined(gpu)}${gpu} \
${slurm_extra_param} \
--wrap "chmod u+x ${script} && LD_LIBRARY_PATH=${singularity_ld_library_path}:$LD_LIBRARY_PATH SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}")
--wrap "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}"
# If you see an error like "The job was aborted from outside Cromwell",
# check your Singularity settings in the workflow options JSON file
# (e.g. make sure an image file is defined by "singularity_container").
# Also make sure that your input data files (and genome database files)
# are under directories recursively bound by "singularity_bindpath"
# in the workflow options JSON file or by Singularity's built-in
# environment variable SINGULARITY_BINDPATH.
"""
kill = "scancel ${job_id}"
check-alive = "squeue -j ${job_id}"
# Cromwell is designed to monitor the rc (return code) file, which is generated/controlled
# in ${script}, so if Singularity fails to run the script because of a problem in its
# internal settings, the rc file is never created.
# This can leave the Cromwell process hanging.
# Setting the parameter below enables monitoring via "check-alive".
# Detecting such a failure takes roughly "exit-code-timeout-seconds" x 3.
exit-code-timeout-seconds = 180

# Cromwell responds only to a non-zero exit code from "check-alive",
# but "squeue -j [JOB_ID]" returns a zero exit code even when the job is not found.
# Workaround: exit with 1 in that case (like SGE's "qstat -j [JOB_ID]" does).
check-alive = "CHK_ALIVE=$(squeue --noheader -j ${job_id}); if [ -z $CHK_ALIVE ]; then /bin/bash -c 'exit 1'; else echo $CHK_ALIVE; fi"
job-id-regex = "Submitted batch job (\\d+).*"
}
}
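
The `sed 's/cromwell-executions/\n/g' | head -n1` fragment in the submit commands above derives the bind path prefix from the task's working directory, so that the whole `cromwell-executions` tree is mounted inside the container. A quick illustration with a made-up working directory (GNU sed):

```bash
# Hypothetical working directory that Cromwell assigns to a task.
cwd=/home/user/chip/cromwell-executions/chip/1234-abcd/call-bwa/execution

# Everything before the first "cromwell-executions" ...
prefix=$(echo $cwd | sed 's/cromwell-executions/\n/g' | head -n1)
echo "$prefix"     # -> /home/user/chip/

# ... plus "cromwell-executions" itself becomes the first entry of SINGULARITY_BINDPATH.
echo "${prefix}cromwell-executions,/your/data,$SINGULARITY_BINDPATH"
```
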
@@ -118,27 +149,41 @@ backend {
String? sge_extra_param
String singularity_container
String? singularity_bindpath
String? singularity_ld_library_path
"""
submit = """
ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && LD_LIBRARY_PATH=${singularity_ld_library_path}:$LD_LIBRARY_PATH SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \
echo "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" | qsub \
-S /bin/sh \
-terse \
-b n \
-N ${job_name} \
-wd ${cwd} \
-o ${out} \
-e ${err} \
${if cpu>1 then "-pe " + sge_pe + " " + cpu else " "} \
${"-l h_vmem=" + memory_mb/cpu + "m"} \
${"-l s_vmem=" + memory_mb/cpu + "m"} \
${"-l h_rt=" + time*3600} \
${"-l s_rt=" + time*3600} \
${if cpu>1 then "-pe " + sge_pe + " " else ""}${if cpu>1 then cpu else ""} \
${true="-l h_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \
${true="-l s_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \
${true="-l h_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\
${true="-l s_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\
${"-q " + sge_queue} \
${"-l gpu=" + gpu} \
${sge_extra_param} \
-V)
-V
# If you see an error like "The job was aborted from outside Cromwell",
# check your Singularity settings in the workflow options JSON file
# (e.g. make sure an image file is defined by "singularity_container").
# Also make sure that your input data files (and genome database files)
# are under directories recursively bound by "singularity_bindpath"
# in the workflow options JSON file or by Singularity's built-in
# environment variable SINGULARITY_BINDPATH.
"""
# Cromwell is designed to monitor the rc (return code) file, which is generated/controlled
# in ${script}, so if Singularity fails to run the script because of a problem in its
# internal settings, the rc file is never created.
# This can leave the Cromwell process hanging.
# Setting the parameter below enables monitoring via "check-alive".
# Detecting such a failure takes roughly "exit-code-timeout-seconds" x 3.
exit-code-timeout-seconds = 180

kill = "qdel ${job_id}"
check-alive = "qstat -j ${job_id}"
job-id-regex = "(\\d+)"
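
In the SGE submit lines above, the requested memory is divided by the number of slots because `h_vmem`/`s_vmem` are commonly configured as per-slot limits; the `$(expr ...)` arithmetic is evaluated by the shell at submit time. For example, a task requesting 16000 MB over 4 cores (values are illustrative):

```bash
memory_mb=16000
cpu=4
# Per-slot memory, as expanded inside the qsub command above.
expr $memory_mb / $cpu     # -> 4000, i.e. qsub receives "-l h_vmem=4000m -l s_vmem=4000m"
```
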
@@ -150,18 +195,15 @@ backend {
config {
script-epilogue = "sleep 5 && sync"
concurrent-job-limit = 10
run-in-background = true
runtime-attributes = """
Int? gpu
String singularity_container
String? singularity_bindpath
String? singularity_ld_library_path
"""
submit = """
ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (chmod u+x ${script} && LD_LIBRARY_PATH=${singularity_ld_library_path}:$LD_LIBRARY_PATH SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script} & echo $! && disown)
SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}
"""
job-id-regex = "(\\d+)"
check-alive = "ps -ef | grep -v grep | grep ${job_id}"
kill = "kill -9 ${job_id}"
}
}

@@ -172,6 +214,13 @@ backend {
}
}

local {
actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
config {
concurrent-job-limit = 10
}
}

sge {
actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
config {
@@ -195,13 +244,13 @@ backend {
-wd ${cwd} \
-o ${out} \
-e ${err} \
${if cpu>1 then "-pe " + sge_pe + " " + cpu else " "} \
${"-l h_vmem=" + memory_mb/cpu + "m"} \
${"-l s_vmem=" + memory_mb/cpu + "m"} \
${"-l h_rt=" + time*3600} \
${"-l s_rt=" + time*3600} \
${if cpu>1 then "-pe " + sge_pe + " " else ""}${if cpu>1 then cpu else ""} \
${true="-l h_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \
${true="-l s_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \
${true="-l h_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\
${true="-l s_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\
${"-q " + sge_queue} \
${"-l gpu=" + gpu} \
${true="-l gpu=" false="" defined(gpu)}${gpu} \
${sge_extra_param} \
-V \
${script}
@@ -236,11 +285,11 @@ backend {
${"-t " + time*60} \
-n 1 \
--ntasks-per-node=1 \
${"--cpus-per-task=" + cpu} \
${"--mem=" + memory_mb} \
${true="--cpus-per-task=" false="" defined(cpu)}${cpu} \
${true="--mem=" false="" defined(memory_mb)}${memory_mb} \
${"-p " + slurm_partition} \
${"--account " + slurm_account} \
${"--gres gpu:" + gpu} \
${true="--gres gpu:" false="" defined(gpu)}${gpu} \
${slurm_extra_param} \
--wrap "/bin/bash ${script}"
"""
2 changes: 1 addition & 1 deletion chip.wdl
@@ -2,7 +2,7 @@
# Author: Jin Lee (leepc12@gmail.com)
workflow chip {
String pipeline_ver = 'v1.1.6'
String pipeline_ver = 'v1.1.6.1'
### sample name, description
String title = 'Untitled'
String description = 'No description'
10 changes: 5 additions & 5 deletions conda/requirements.txt
@@ -12,26 +12,26 @@ ucsc-bedclip
ucsc-bedtobigbed
ucsc-twobittofa
macs2 ==2.1.1.20160309 #2.1.0 (no binaries for OSX)
boost ==1.57.0
boost #==1.57.0
openblas ==0.2.20
numpy ==1.11.3 #1.13.3 #1.10.2 (no binaries for OSX) #1.9.0, 1.8.2 conflicts with ATAQC
matplotlib #==1.5.1
matplotlib ==1.5.3 #==1.5.1
six==1.11.0 # to fix (ImportError: cannot import name _thread)
python-dateutil==2.6.1
libgfortran==3.0
graphviz ==2.38.0
#graphviz ==2.38.0
libtool
ghostscript # pdf2png
pigz
zlib
sambamba ==0.6.6 # to fix seg fault error in 0.6.1
r ==3.2.2
r ==3.3.2 #==3.2.2
r-snow
r-snowfall
r-bitops
r-catools
bioconductor-rsamtools
r-spp ==1.13
r-spp ==1.14 #==1.13
#glibc #segmentation fault in conda with openssl
pyfaidx ==0.4.7.1
cromwell==0.34
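
These pins are normally consumed by the pipeline's installer, but a bare-bones equivalent is sketched below; the environment name and channel list are assumptions, not part of this commit:

```bash
# Sketch only: build a conda environment from the pinned requirements.
conda create -y -n encode-chip-seq-pipeline \
    --file conda/requirements.txt \
    -c defaults -c r -c bioconda
```
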
2 changes: 1 addition & 1 deletion conda/requirements_py3.txt
@@ -8,7 +8,7 @@ bedtools ==2.26.0
java-jdk ==8.0.92
# to resolve 'CXXABI_1.3.9' not found issue
libgcc==5.2.0 # this does not work with MacOS...
matplotlib #==1.5.1
matplotlib ==1.5.3 #==1.5.1
ncurses ==6.1
tabix==0.2.6
readline==6.2
8 changes: 4 additions & 4 deletions docs/dev.md
@@ -2,9 +2,9 @@

## Command line for version change
```bash
PREV_VER=v1.1.5
NEW_VER=v1.1.6
for f in $(grep -rl ${PREV_VER} --include=*.{wdl,md,sh,yml})
PREV_VER=v1.1.6.1
NEW_VER=v1.1.6.1
for f in $(grep -rl ${PREV_VER} --include=*.{wdl,md,sh})
do
sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f}
done
@@ -24,7 +24,7 @@ Run the following command line locally to build out DX workflows for this pipeline

```bash
# version
VER=v1.1.6
VER=v1.1.6.1

# general
java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/general -defaults examples/dx/template_general.json
24 changes: 12 additions & 12 deletions docs/tutorial_dx_web.md
@@ -15,8 +15,8 @@ This document describes instruction for the item 2).

3. Move to one of the following workflow directories according to the platform you have chosen for your project (AWS or Azure). These DX workflows are pre-built with all parameters defined.

* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/test_ENCSR936XTK_subsampled_chr19_only)
* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/test_ENCSR936XTK_subsampled_chr19_only)
* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6.1/test_ENCSR936XTK_subsampled_chr19_only)
* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6.1/test_ENCSR936XTK_subsampled_chr19_only)

4. Copy it to your project by right-clicking on the DX workflow `chip` and choosing "Copy".

@@ -40,16 +40,16 @@ This document describes instruction for the item 2).
1. DNAnexus allows only one copy of a workflow per project. The example workflow in the previous section is pre-built for the subsampled test sample [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/) with all parameters defined already.

2. Copy one of the following workflows according to the platform you have chosen for your project (AWS or Azure).
* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/general) without pre-defined reference genome.
* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/hg38) with pre-defined hg38 reference genome.
* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/hg19) with pre-defined hg19 reference genome.
* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/mm10) with pre-defined mm10 reference genome.
* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/mm9) with pre-defined mm9 reference genome.
* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/general) without pre-defined reference genome.
* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/hg38) with pre-defined hg38 reference genome.
* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/hg19) with pre-defined hg19 reference genome.
* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/mm10) with pre-defined mm10 reference genome.
* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/mm9) with pre-defined mm9 reference genome.
* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6.1/general) without pre-defined reference genome.
* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6.1/hg38) with pre-defined hg38 reference genome.
* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6.1/hg19) with pre-defined hg19 reference genome.
* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6.1/mm10) with pre-defined mm10 reference genome.
* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6.1/mm9) with pre-defined mm9 reference genome.
* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6.1/general) without pre-defined reference genome.
* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6.1/hg38) with pre-defined hg38 reference genome.
* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6.1/hg19) with pre-defined hg19 reference genome.
* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6.1/mm10) with pre-defined mm10 reference genome.
* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6.1/mm9) with pre-defined mm9 reference genome.

3. Click on the DX workflow `chip`.

