From dee12754b8f736c2fa35f170b918da7873fa4d4c Mon Sep 17 00:00:00 2001 From: Feiyu Du Date: Sun, 30 Jul 2023 20:54:51 -0500 Subject: [PATCH] Use intermediate_results_dir for dragen task and Make workflow to handle the situation without input spreadsheet --- Soma.wdl | 139 ++++++++++++++++-------------------------- scripts/QC_metrics.py | 92 +++++++++++++++++----------- 2 files changed, 111 insertions(+), 120 deletions(-) diff --git a/Soma.wdl b/Soma.wdl index c48373f..f1af01b 100644 --- a/Soma.wdl +++ b/Soma.wdl @@ -2,19 +2,19 @@ version 1.0 workflow Soma { input { - File InputSpreadSheet File SampleSheet # sample sheet has this structure: # index name RG_ID RG_FLOWCELL RG_LANE RG_LIB RG_SAMPLE [R1] [R2] + File? InputSpreadSheet File? DemuxSampleSheet String? IlluminaDir + String? XferLabel String? DragenEnv Boolean DataTransfer Boolean RmRunDir - String XferLabel String OutputDir String Queue String JobGroup @@ -23,9 +23,10 @@ workflow Soma { String SomaRepo String CoverageBed = SomaRepo + "/accessory_files/SOMA.all.bed" - String CovLevels = "100,500,1000,1500" String HaplotectBed = SomaRepo + "/accessory_files/SOMA.haplotect.bed" String QC_py = SomaRepo + "/scripts/QC_metrics.py" + + String CovLevels = "100,500,1000,1500" } String DragenReference = "/storage1/fs1/gtac-mgi/Active/CLE/reference/dragen_hg38" @@ -40,6 +41,7 @@ workflow Soma { call dragen_demux { input: Dir=IlluminaDir, OutputDir=OutputDir, + DemuxFastqDir=DemuxFastqDir, SampleSheet=DemuxSampleSheet, DragenEnv=DragenEnv, DragenDockerImage=DragenDockerImage, @@ -110,20 +112,14 @@ workflow Soma { } if (defined(DemuxSampleSheet)){ - call move_demux_fastq { - input: order_by=gather_files.done, - Batch=basename(OutputDir), - DemuxFastqDir=DemuxFastqDir, - queue=DragenQueue, - jobGroup=JobGroup - } - if (DataTransfer) { call data_transfer { - input: order_by=move_demux_fastq.done, + input: QcAll=batch_qc.QC_all, + #QcFile= OutputDir + '/' + basename(OutputDir) + '_Genoox.xlsx', QcFile=batch_qc.QC_file, - XferLabel=XferLabel, BatchFastqDir= DemuxFastqDir + '/' + basename(OutputDir), + InputSpreadSheet=InputSpreadSheet, + XferLabel=XferLabel, queue=Queue, jobGroup=JobGroup } @@ -131,7 +127,7 @@ workflow Soma { if (RmRunDir) { call remove_rundir { - input: order_by=move_demux_fastq.done, + input: order_by=gather_files.done, rundir=IlluminaDir, queue=DragenQueue, jobGroup=JobGroup @@ -144,30 +140,27 @@ workflow Soma { task dragen_demux { input { String? Dir - String OutputDir - File? SampleSheet String? DragenEnv + File? SampleSheet + String OutputDir + String DemuxFastqDir String DragenDockerImage String jobGroup String queue } - String batch = basename(OutputDir) - String StagingDir = "/staging/runs/Soma/" - String LocalFastqDir = StagingDir + "demux_fastq/" + batch - String LocalReportDir = LocalFastqDir + "/Reports" - String LocalSampleSheet = StagingDir + "sample_sheet/" + batch + '.csv' - String log = StagingDir + "log/" + batch + "_demux.log" - String DemuxReportDir = OutputDir + "/dragen_demux_reports" + String LocalFastqDir = "/staging/runs/Soma/demux_fastq/" + basename(OutputDir) + String OutputFastqDir = DemuxFastqDir + "/" + basename(OutputDir) + String OutputReportDir = OutputFastqDir + "/Reports" + String DemuxReportDir = OutputDir + "/dragen_demux_reports" command <<< - /bin/cp ~{SampleSheet} ~{LocalSampleSheet} && \ - /opt/edico/bin/dragen --bcl-conversion-only true --bcl-only-matched-reads true --strict-mode true --sample-sheet ~{LocalSampleSheet} --bcl-input-directory ~{Dir} --output-directory ~{LocalFastqDir} &> ~{log} && \ - /bin/ls ~{LocalFastqDir}/*_R1_001.fastq.gz > Read1_list.txt && \ - /bin/ls ~{LocalFastqDir}/*_R2_001.fastq.gz > Read2_list.txt && \ - /bin/mv ~{log} ./ && \ - /bin/rm -f ~{LocalSampleSheet} && \ - /bin/cp -r ~{LocalReportDir} ~{DemuxReportDir} + /bin/mkdir ~{LocalFastqDir} && \ + /opt/edico/bin/dragen --bcl-conversion-only true --bcl-only-matched-reads true --strict-mode true --sample-sheet ~{SampleSheet} \ + --bcl-input-directory ~{Dir} --intermediate-results-dir ~{LocalFastqDir} --output-directory ~{OutputFastqDir} && \ + /bin/ls ~{OutputFastqDir}/*_R1_001.fastq.gz > Read1_list.txt && \ + /bin/ls ~{OutputFastqDir}/*_R2_001.fastq.gz > Read2_list.txt && \ + /bin/cp -r ~{OutputReportDir} ~{DemuxReportDir} >>> runtime { @@ -247,25 +240,20 @@ task dragen_align { } String batch = basename(OutputDir) - String StagingDir = "/staging/runs/Soma/" - String LocalAlignDir = StagingDir + "align/" + batch - String LocalSampleDir = LocalAlignDir + "/" + SubDir - String log = StagingDir + "log/" + Name + "_align.log" - - String outdir = OutputDir + "/" + SubDir - String dragen_outdir = outdir + "/dragen" - + String LocalAlignDir = "/staging/runs/Soma/align/" + batch + "/" + SubDir + String DragenOutdir = OutputDir + "/" + SubDir + "/dragen" command { - if [ ! -d "${LocalAlignDir}" ]; then - /bin/mkdir ${LocalAlignDir} - fi - - /bin/mkdir ${LocalSampleDir} && \ - /bin/mkdir ${outdir} && \ - /opt/edico/bin/dragen -r ${DragenRef} --tumor-fastq1 ${fastq1} --tumor-fastq2 ${fastq2} --RGSM-tumor ${SM} --RGID-tumor ${RG} --RGLB-tumor ${LB} --enable-map-align true --enable-sort true --enable-map-align-output true --enable-variant-caller=true --vc-enable-umi-solid true --vc-combine-phased-variants-distance 3 --vc-enable-orientation-bias-filter true --vc-enable-triallelic-filter false --vc-target-bed ${CoverageBed} --gc-metrics-enable=true --qc-coverage-ignore-overlaps=true --qc-coverage-region-1 ${CoverageBed} --qc-coverage-reports-1 cov_report --qc-coverage-region-1-thresholds ${CovLevels} --umi-enable true --umi-library-type=random-simplex --umi-min-supporting-reads ${readfamilysize} --umi-metrics-interval-file ${CoverageBed} --output-dir ${LocalSampleDir} --output-file-prefix ${Name} --output-format CRAM &> ${log} && \ - /bin/mv ${log} ./ && \ - /bin/mv ${LocalSampleDir} ${dragen_outdir} + /bin/mkdir -p ${LocalAlignDir} && \ + /bin/mkdir -p ${DragenOutdir} && \ + /opt/edico/bin/dragen -r ${DragenRef} --tumor-fastq1 ${fastq1} --tumor-fastq2 ${fastq2} --RGSM-tumor ${SM} --RGID-tumor ${RG} --RGLB-tumor ${LB} \ + --umi-enable true --umi-library-type=random-simplex --umi-min-supporting-reads ${readfamilysize} --umi-metrics-interval-file ${CoverageBed} \ + --enable-map-align true --enable-sort true --enable-map-align-output true --gc-metrics-enable=true \ + --enable-variant-caller=true --vc-target-bed ${CoverageBed} --vc-enable-umi-solid true --vc-enable-triallelic-filter false \ + --vc-combine-phased-variants-distance 3 --vc-enable-orientation-bias-filter true \ + --qc-coverage-ignore-overlaps=true --qc-coverage-region-1 ${CoverageBed} --qc-coverage-reports-1 cov_report \ + --qc-coverage-region-1-thresholds ${CovLevels} \ + --intermediate-results-dir ${LocalAlignDir} --output-dir ${DragenOutdir} --output-file-prefix ${Name} --output-format BAM } runtime { @@ -278,8 +266,8 @@ task dragen_align { } output { - File cram = "${dragen_outdir}/${Name}_tumor.cram" - File crai = "${dragen_outdir}/${Name}_tumor.cram.crai" + File cram = "${DragenOutdir}/${Name}_tumor.cram" + File crai = "${DragenOutdir}/${Name}_tumor.cram.crai" } } @@ -345,20 +333,23 @@ task gather_files { task batch_qc { input { Array[String] order_by - String InputSpreadSheet String BatchDir String QC_py String queue String jobGroup + String? InputSpreadSheet } String batch = basename(BatchDir) command { if [ -n "$(/bin/ls -d ${BatchDir}/H_*)" ]; then - /bin/chmod -R 777 ${BatchDir}/H_* + /bin/chmod -R 666 ${BatchDir}/H_* + fi + if [ -n "${InputSpreadSheet}" ]; then + /usr/bin/python3 ${QC_py} -s ${InputSpreadSheet} -d ${BatchDir} + else + /usr/bin/python3 ${QC_py} -d ${BatchDir} fi - - /usr/bin/python3 ${QC_py} ${InputSpreadSheet} ${BatchDir} } runtime { docker_image: "docker1(catgumag/pandas-scibioxl:20220107)" @@ -367,39 +358,14 @@ task batch_qc { job_group: jobGroup } output { - File QC_file = "${BatchDir}/${batch}_Genoox.xlsx" - } -} - -task move_demux_fastq { - input { - Array[String] order_by - String Batch - String DemuxFastqDir - String queue - String jobGroup - } - - String LocalDemuxFastqDir = "/staging/runs/Soma/demux_fastq/" + Batch - - command { - if [ -d "${LocalDemuxFastqDir}" ]; then - /bin/mv ${LocalDemuxFastqDir} ${DemuxFastqDir} - fi - } - runtime { - docker_image: "docker1(ubuntu:xenial)" - queue: queue - job_group: jobGroup - } - output { - String done = stdout() + File QC_all = "${BatchDir}/${batch}_QC.xlsx" + File? QC_file = "${BatchDir}/${batch}_Genoox.xlsx" } } task remove_rundir { input { - String order_by + Array[String] order_by String? rundir String queue String jobGroup @@ -421,9 +387,10 @@ task remove_rundir { task data_transfer { input { - String order_by - String QcFile - String XferLabel + String? InputSpreadSheet + String? XferLabel + String? QcFile + String QcAll String BatchFastqDir String queue String jobGroup @@ -432,7 +399,9 @@ task data_transfer { command { set -eo pipefail && \ /bin/mkdir xfer_staging && \ - /bin/cp ${QcFile} xfer_staging && \ + if [ -n "${InputSpreadSheet}" ]; then + /bin/cp ${QcFile} xfer_staging && \ + fi /bin/cp ${BatchFastqDir}/H_*.fastq.gz xfer_staging && \ /usr/local/bin/aws s3 cp xfer_staging s3://genoox-upload-wustl/gtacmgi/${XferLabel} --recursive && \ /usr/bin/touch done.txt && \ diff --git a/scripts/QC_metrics.py b/scripts/QC_metrics.py index 8175954..9eed09d 100644 --- a/scripts/QC_metrics.py +++ b/scripts/QC_metrics.py @@ -1,24 +1,46 @@ #!/usr/bin/env python import os +import re import csv import sys import glob +import argparse import pandas as pd -if len(sys.argv) != 3: - print("Provide sample_spreadsheet and workflow_output_dir in order") - sys.exit(1) - -in_ss = sys.argv[1] -out_dir = os.path.abspath(sys.argv[2]) +def get_lib_list(directory): + prefixes = ["H_", "TW"] + dir_names = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x)) and + x.startswith(tuple(prefixes)) and 'lib' in x] + liblist = [] + pattern = r'(^(H_|TW)\S+-lib\d+)_[ATCG]' + for dir_name in dir_names: + match = re.match(pattern, dir_name) + if match: + liblist.append(match.group(1)) + else: + sys.exit('Fail to extract library name from ' + dir_name) + return liblist + +parser = argparse.ArgumentParser(description='Make QC Excel Spreadsheet') +parser.add_argument('-d','--batchdir',required=True,help='workflow batch output dir') +parser.add_argument('-s','--samplesheet',required=False,help='input sample spreadsheet with 2 tabs') + +args = parser.parse_args() + +in_ss = args.samplesheet +out_dir = os.path.abspath(args.batchdir) batch_name = os.path.basename(out_dir) if not os.path.isdir(out_dir): sys.exit("Workflow output dir does not exist: " + out_dir) -in_df = pd.read_excel(in_ss, sheet_name='QC Metrics') -lib_list = in_df['SAMPLE ID'].tolist() +if in_ss: + in_df = pd.read_excel(in_ss, sheet_name='QC Metrics') + lib_list = in_df['SAMPLE ID'].tolist() +else: + lib_list = get_lib_list(out_dir) + in_df = pd.DataFrame({'Samples' : lib_list}) hap_scores = [] hap_sites = [] @@ -131,31 +153,31 @@ "PCT_TARGET_1500x" : pct_target_1500 }) +out_file1 = os.path.join(out_dir, batch_name + "_QC.xlsx") all_df = pd.concat([in_df, qc_df], axis=1) - -sss_df = pd.DataFrame({ - 'Library' : lib_list, - 'Total Bases' : total_bases, - 'Percent Q30 (R1)' : pct_q30_1, - 'Percent Q30 (R2)' : pct_q30_2 -}) - -sample_list = [x.split('-lib')[0] for x in lib_list] -hap_scores_pct = ['{:.2f}%'.format(x * 100) for x in hap_scores] - -fcs_df = pd.DataFrame({ - 'Sample' : sample_list, - 'contaminationestimate' : hap_scores_pct -}) - -out_file1 = os.path.join(out_dir, batch_name + "_Genoox.xlsx") -#writer = pd.ExcelWriter(out_file1, engine='xlsxwriter') -writer = pd.ExcelWriter(out_file1) - -in_df.to_excel (writer, sheet_name='QC Metrics - qPCR', index=False) -sss_df.to_excel(writer, sheet_name='Single Sample Stats', index=False, float_format="%.2f") -fcs_df.to_excel(writer, sheet_name='Final Coverage Stats - TCP', index=False, float_format="%.2f") -writer.save() - -out_file2 = os.path.join(out_dir, batch_name + "_QC.xlsx") -all_df.to_excel(out_file2, sheet_name='All QC', index=False) +all_df.to_excel(out_file1, sheet_name='All QC', index=False) + +if in_ss: + sss_df = pd.DataFrame({ + 'Library' : lib_list, + 'Total Bases' : total_bases, + 'Percent Q30 (R1)' : pct_q30_1, + 'Percent Q30 (R2)' : pct_q30_2 + }) + + sample_list = [x.split('-lib')[0] for x in lib_list] + hap_scores_pct = ['{:.2f}%'.format(x * 100) for x in hap_scores] + + fcs_df = pd.DataFrame({ + 'Sample' : sample_list, + 'contaminationestimate' : hap_scores_pct + }) + + out_file2 = os.path.join(out_dir, batch_name + "_Genoox.xlsx") + writer = pd.ExcelWriter(out_file2) + #writer = pd.ExcelWriter(out_file1, engine='xlsxwriter') + + in_df.to_excel (writer, sheet_name='QC Metrics - qPCR', index=False) + sss_df.to_excel(writer, sheet_name='Single Sample Stats', index=False, float_format="%.2f") + fcs_df.to_excel(writer, sheet_name='Final Coverage Stats - TCP', index=False, float_format="%.2f") + writer.save()