Merge pull request #13 from dufeiyu/update

Use intermediate_results_dir for dragen task and Make inputspreadsheet optional
dhslab · Jul 31, 2023 · b1064cf · b1064cf
2 parents b0d80d9 + dee1275
commit b1064cf
Show file tree

Hide file tree

Showing 2 changed files with 111 additions and 120 deletions.
diff --git a/Soma.wdl b/Soma.wdl
@@ -2,19 +2,19 @@ version 1.0
 
 workflow Soma {
     input {
-        File InputSpreadSheet
         File SampleSheet
         # sample sheet has this structure:
         # index  name  RG_ID  RG_FLOWCELL  RG_LANE  RG_LIB  RG_SAMPLE [R1] [R2]
 
+        File? InputSpreadSheet
         File? DemuxSampleSheet
         String? IlluminaDir
+        String? XferLabel
         String? DragenEnv
 
         Boolean DataTransfer
         Boolean RmRunDir
 
-        String XferLabel
         String OutputDir
         String Queue
         String JobGroup
@@ -23,9 +23,10 @@ workflow Soma {
 
         String SomaRepo
         String CoverageBed  = SomaRepo + "/accessory_files/SOMA.all.bed"
-        String CovLevels = "100,500,1000,1500"
         String HaplotectBed = SomaRepo + "/accessory_files/SOMA.haplotect.bed"
         String QC_py        = SomaRepo + "/scripts/QC_metrics.py"
+
+        String CovLevels = "100,500,1000,1500"
     }
 
     String DragenReference = "/storage1/fs1/gtac-mgi/Active/CLE/reference/dragen_hg38"
@@ -40,6 +41,7 @@ workflow Soma {
         call dragen_demux {
             input: Dir=IlluminaDir,
                    OutputDir=OutputDir,
+                   DemuxFastqDir=DemuxFastqDir,
                    SampleSheet=DemuxSampleSheet,
                    DragenEnv=DragenEnv,
                    DragenDockerImage=DragenDockerImage,
@@ -110,28 +112,22 @@ workflow Soma {
     }
 
     if (defined(DemuxSampleSheet)){
-        call move_demux_fastq {
-            input: order_by=gather_files.done,
-                   Batch=basename(OutputDir),
-                   DemuxFastqDir=DemuxFastqDir,
-                   queue=DragenQueue,
-                   jobGroup=JobGroup
-        }
-
         if (DataTransfer) {
             call data_transfer {
-                input: order_by=move_demux_fastq.done,
+                input: QcAll=batch_qc.QC_all,
+                #QcFile= OutputDir + '/' + basename(OutputDir) + '_Genoox.xlsx',
                 QcFile=batch_qc.QC_file,
-                XferLabel=XferLabel,
                 BatchFastqDir= DemuxFastqDir + '/' + basename(OutputDir),
+                InputSpreadSheet=InputSpreadSheet,
+                XferLabel=XferLabel,
                 queue=Queue,
                 jobGroup=JobGroup
             }
         }
 
         if (RmRunDir) {
             call remove_rundir {
-                input: order_by=move_demux_fastq.done,
+                input: order_by=gather_files.done,
                        rundir=IlluminaDir,
                        queue=DragenQueue,
                        jobGroup=JobGroup
@@ -144,30 +140,27 @@ workflow Soma {
 task dragen_demux {
      input {
          String? Dir
-         String OutputDir
-         File? SampleSheet
          String? DragenEnv
+         File? SampleSheet
+         String OutputDir
+         String DemuxFastqDir
          String DragenDockerImage
          String jobGroup
          String queue
      }
 
-     String batch = basename(OutputDir)
-     String StagingDir = "/staging/runs/Soma/"
-     String LocalFastqDir = StagingDir + "demux_fastq/" + batch
-     String LocalReportDir = LocalFastqDir + "/Reports"
-     String LocalSampleSheet = StagingDir + "sample_sheet/" + batch + '.csv'
-     String log = StagingDir + "log/" + batch + "_demux.log"
-     String DemuxReportDir = OutputDir + "/dragen_demux_reports"
+     String LocalFastqDir   = "/staging/runs/Soma/demux_fastq/" + basename(OutputDir)
+     String OutputFastqDir  = DemuxFastqDir + "/" + basename(OutputDir)
+     String OutputReportDir = OutputFastqDir + "/Reports"
+     String DemuxReportDir  = OutputDir + "/dragen_demux_reports"
 
      command <<<
-         /bin/cp ~{SampleSheet} ~{LocalSampleSheet} && \
-         /opt/edico/bin/dragen --bcl-conversion-only true --bcl-only-matched-reads true --strict-mode true --sample-sheet ~{LocalSampleSheet} --bcl-input-directory ~{Dir} --output-directory ~{LocalFastqDir} &> ~{log} && \
-         /bin/ls ~{LocalFastqDir}/*_R1_001.fastq.gz > Read1_list.txt && \
-         /bin/ls ~{LocalFastqDir}/*_R2_001.fastq.gz > Read2_list.txt && \
-         /bin/mv ~{log} ./ && \
-         /bin/rm -f ~{LocalSampleSheet} && \
-         /bin/cp -r ~{LocalReportDir} ~{DemuxReportDir}
+         /bin/mkdir ~{LocalFastqDir} && \
+         /opt/edico/bin/dragen --bcl-conversion-only true --bcl-only-matched-reads true --strict-mode true --sample-sheet ~{SampleSheet} \
+         --bcl-input-directory ~{Dir} --intermediate-results-dir ~{LocalFastqDir} --output-directory ~{OutputFastqDir} && \
+         /bin/ls ~{OutputFastqDir}/*_R1_001.fastq.gz > Read1_list.txt && \
+         /bin/ls ~{OutputFastqDir}/*_R2_001.fastq.gz > Read2_list.txt && \
+         /bin/cp -r ~{OutputReportDir} ~{DemuxReportDir}
      >>>
 
      runtime {
@@ -247,25 +240,20 @@ task dragen_align {
      }
 
      String batch = basename(OutputDir)
-     String StagingDir = "/staging/runs/Soma/"
-     String LocalAlignDir = StagingDir + "align/" + batch
-     String LocalSampleDir = LocalAlignDir + "/" + SubDir
-     String log = StagingDir + "log/" + Name + "_align.log"
-
-     String outdir = OutputDir + "/" + SubDir
-     String dragen_outdir = outdir + "/dragen"
-
+     String LocalAlignDir = "/staging/runs/Soma/align/" + batch + "/" + SubDir
+     String DragenOutdir = OutputDir + "/" + SubDir + "/dragen"
 
      command {
-         if [ ! -d "${LocalAlignDir}" ]; then
-             /bin/mkdir ${LocalAlignDir}
-         fi
-
-         /bin/mkdir ${LocalSampleDir} && \
-         /bin/mkdir ${outdir} && \
-         /opt/edico/bin/dragen -r ${DragenRef} --tumor-fastq1 ${fastq1} --tumor-fastq2 ${fastq2} --RGSM-tumor ${SM} --RGID-tumor ${RG} --RGLB-tumor ${LB} --enable-map-align true --enable-sort true --enable-map-align-output true --enable-variant-caller=true --vc-enable-umi-solid true --vc-combine-phased-variants-distance 3 --vc-enable-orientation-bias-filter true --vc-enable-triallelic-filter false --vc-target-bed ${CoverageBed} --gc-metrics-enable=true --qc-coverage-ignore-overlaps=true --qc-coverage-region-1 ${CoverageBed} --qc-coverage-reports-1 cov_report --qc-coverage-region-1-thresholds ${CovLevels} --umi-enable true --umi-library-type=random-simplex --umi-min-supporting-reads ${readfamilysize} --umi-metrics-interval-file ${CoverageBed} --output-dir ${LocalSampleDir} --output-file-prefix ${Name} --output-format CRAM &> ${log} && \
-         /bin/mv ${log} ./ && \
-         /bin/mv ${LocalSampleDir} ${dragen_outdir}
+         /bin/mkdir -p ${LocalAlignDir} && \
+         /bin/mkdir -p ${DragenOutdir}  && \
+         /opt/edico/bin/dragen -r ${DragenRef} --tumor-fastq1 ${fastq1} --tumor-fastq2 ${fastq2} --RGSM-tumor ${SM} --RGID-tumor ${RG} --RGLB-tumor ${LB} \
+         --umi-enable true --umi-library-type=random-simplex --umi-min-supporting-reads ${readfamilysize} --umi-metrics-interval-file ${CoverageBed} \
+         --enable-map-align true --enable-sort true --enable-map-align-output true --gc-metrics-enable=true \
+         --enable-variant-caller=true --vc-target-bed ${CoverageBed} --vc-enable-umi-solid true --vc-enable-triallelic-filter false \
+         --vc-combine-phased-variants-distance 3 --vc-enable-orientation-bias-filter true \
+         --qc-coverage-ignore-overlaps=true --qc-coverage-region-1 ${CoverageBed} --qc-coverage-reports-1 cov_report \
+         --qc-coverage-region-1-thresholds ${CovLevels} \
+         --intermediate-results-dir ${LocalAlignDir} --output-dir ${DragenOutdir} --output-file-prefix ${Name} --output-format BAM
      }
 
      runtime {
@@ -278,8 +266,8 @@ task dragen_align {
      }
 
      output {
-         File cram = "${dragen_outdir}/${Name}_tumor.cram"
-         File crai = "${dragen_outdir}/${Name}_tumor.cram.crai"
+         File cram = "${DragenOutdir}/${Name}_tumor.cram"
+         File crai = "${DragenOutdir}/${Name}_tumor.cram.crai"
      }
 }
 
@@ -345,20 +333,23 @@ task gather_files {
 task batch_qc {
      input {
          Array[String] order_by
-         String InputSpreadSheet
          String BatchDir
          String QC_py
          String queue
          String jobGroup
+         String? InputSpreadSheet
      }
      String batch = basename(BatchDir)
 
      command {
          if [ -n "$(/bin/ls -d ${BatchDir}/H_*)" ]; then
-             /bin/chmod -R 777 ${BatchDir}/H_*
+             /bin/chmod -R 666 ${BatchDir}/H_*
+         fi
+         if [ -n "${InputSpreadSheet}" ]; then
+             /usr/bin/python3 ${QC_py} -s ${InputSpreadSheet} -d ${BatchDir}
+         else
+             /usr/bin/python3 ${QC_py} -d ${BatchDir}
          fi
-
-         /usr/bin/python3 ${QC_py} ${InputSpreadSheet} ${BatchDir}
      }
      runtime {
          docker_image: "docker1(catgumag/pandas-scibioxl:20220107)"
@@ -367,39 +358,14 @@ task batch_qc {
          job_group: jobGroup
      }
      output {
-         File QC_file = "${BatchDir}/${batch}_Genoox.xlsx"
-     }
-}
-
-task move_demux_fastq {
-     input {
-         Array[String] order_by
-         String Batch
-         String DemuxFastqDir
-         String queue
-         String jobGroup
-     }
-
-     String LocalDemuxFastqDir = "/staging/runs/Soma/demux_fastq/" + Batch
-
-     command {
-         if [ -d "${LocalDemuxFastqDir}" ]; then
-             /bin/mv ${LocalDemuxFastqDir} ${DemuxFastqDir}
-         fi
-     }
-     runtime {
-         docker_image: "docker1(ubuntu:xenial)"
-         queue: queue
-         job_group: jobGroup
-     }
-     output {
-         String done = stdout()
+         File  QC_all  = "${BatchDir}/${batch}_QC.xlsx"
+         File? QC_file = "${BatchDir}/${batch}_Genoox.xlsx"
      }
 }
 
 task remove_rundir {
      input {
-         String order_by
+         Array[String] order_by
          String? rundir
          String queue
          String jobGroup
@@ -421,9 +387,10 @@ task remove_rundir {
 
 task data_transfer {
      input {
-         String order_by
-         String QcFile
-         String XferLabel
+         String? InputSpreadSheet
+         String? XferLabel
+         String? QcFile
+         String QcAll
          String BatchFastqDir
          String queue
          String jobGroup
@@ -432,7 +399,9 @@ task data_transfer {
      command {
          set -eo pipefail && \
          /bin/mkdir xfer_staging && \
-         /bin/cp ${QcFile} xfer_staging && \
+         if [ -n "${InputSpreadSheet}" ]; then
+             /bin/cp ${QcFile} xfer_staging && \
+         fi
          /bin/cp ${BatchFastqDir}/H_*.fastq.gz xfer_staging && \
          /usr/local/bin/aws s3 cp xfer_staging s3://genoox-upload-wustl/gtacmgi/${XferLabel} --recursive && \
          /usr/bin/touch done.txt && \

diff --git a/scripts/QC_metrics.py b/scripts/QC_metrics.py
@@ -1,24 +1,46 @@
 #!/usr/bin/env python
 
 import os
+import re
 import csv
 import sys
 import glob
+import argparse
 import pandas as pd
 
-if len(sys.argv) != 3:
-    print("Provide sample_spreadsheet and workflow_output_dir in order")
-    sys.exit(1)
-
-in_ss      = sys.argv[1]
-out_dir    = os.path.abspath(sys.argv[2])
+def get_lib_list(directory):
+    prefixes = ["H_", "TW"]
+    dir_names = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x)) and
+                 x.startswith(tuple(prefixes)) and 'lib' in x]
+    liblist = []
+    pattern = r'(^(H_|TW)\S+-lib\d+)_[ATCG]'
+    for dir_name in dir_names:
+        match = re.match(pattern, dir_name)
+        if match:
+            liblist.append(match.group(1))
+        else:
+            sys.exit('Fail to extract library name from ' + dir_name)
+    return liblist
+
+parser = argparse.ArgumentParser(description='Make QC Excel Spreadsheet')
+parser.add_argument('-d','--batchdir',required=True,help='workflow batch output dir')
+parser.add_argument('-s','--samplesheet',required=False,help='input sample spreadsheet with 2 tabs')
+
+args = parser.parse_args()
+
+in_ss      = args.samplesheet
+out_dir    = os.path.abspath(args.batchdir)
 batch_name = os.path.basename(out_dir)
 
 if not os.path.isdir(out_dir):
     sys.exit("Workflow output dir does not exist: " + out_dir)
 
-in_df = pd.read_excel(in_ss, sheet_name='QC Metrics')
-lib_list = in_df['SAMPLE ID'].tolist()
+if in_ss:
+    in_df = pd.read_excel(in_ss, sheet_name='QC Metrics')
+    lib_list = in_df['SAMPLE ID'].tolist()
+else:
+    lib_list = get_lib_list(out_dir)
+    in_df = pd.DataFrame({'Samples' : lib_list})
 
 hap_scores      = []
 hap_sites       = []
@@ -131,31 +153,31 @@
     "PCT_TARGET_1500x"          : pct_target_1500
 })
 
+out_file1 = os.path.join(out_dir, batch_name + "_QC.xlsx")
 all_df = pd.concat([in_df, qc_df], axis=1)
-
-sss_df = pd.DataFrame({
-    'Library'          : lib_list, 
-    'Total Bases'      : total_bases,
-    'Percent Q30 (R1)' : pct_q30_1,
-    'Percent Q30 (R2)' : pct_q30_2
-})
-
-sample_list   = [x.split('-lib')[0] for x in lib_list]
-hap_scores_pct = ['{:.2f}%'.format(x * 100) for x in hap_scores]
-
-fcs_df = pd.DataFrame({
-    'Sample'                : sample_list, 
-    'contaminationestimate' : hap_scores_pct
-})
-
-out_file1 = os.path.join(out_dir, batch_name + "_Genoox.xlsx")
-#writer    = pd.ExcelWriter(out_file1, engine='xlsxwriter')
-writer    = pd.ExcelWriter(out_file1)
-
-in_df.to_excel (writer, sheet_name='QC Metrics - qPCR', index=False)
-sss_df.to_excel(writer, sheet_name='Single Sample Stats', index=False, float_format="%.2f")
-fcs_df.to_excel(writer, sheet_name='Final Coverage Stats - TCP', index=False, float_format="%.2f")
-writer.save()
-
-out_file2 = os.path.join(out_dir, batch_name + "_QC.xlsx")
-all_df.to_excel(out_file2, sheet_name='All QC', index=False)
+all_df.to_excel(out_file1, sheet_name='All QC', index=False)
+
+if in_ss:
+    sss_df = pd.DataFrame({
+        'Library'          : lib_list,
+        'Total Bases'      : total_bases,
+        'Percent Q30 (R1)' : pct_q30_1,
+        'Percent Q30 (R2)' : pct_q30_2
+    })
+
+    sample_list = [x.split('-lib')[0] for x in lib_list]
+    hap_scores_pct = ['{:.2f}%'.format(x * 100) for x in hap_scores]
+
+    fcs_df = pd.DataFrame({
+        'Sample'                : sample_list,
+        'contaminationestimate' : hap_scores_pct
+    })
+
+    out_file2 = os.path.join(out_dir, batch_name + "_Genoox.xlsx")
+    writer    = pd.ExcelWriter(out_file2)
+    #writer = pd.ExcelWriter(out_file1, engine='xlsxwriter')
+
+    in_df.to_excel (writer, sheet_name='QC Metrics - qPCR', index=False)
+    sss_df.to_excel(writer, sheet_name='Single Sample Stats', index=False, float_format="%.2f")
+    fcs_df.to_excel(writer, sheet_name='Final Coverage Stats - TCP', index=False, float_format="%.2f")
+    writer.save()