Merge pull request #18 from dufeiyu/new_naming

Update with new sample naming
dhslab · Sep 5, 2023 · 22d0474
2 parents a61452a + 27d953c
commit 22d0474
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 27 deletions.
diff --git a/Soma.wdl b/Soma.wdl
@@ -342,8 +342,8 @@ task batch_qc {
      String batch = basename(BatchDir)
 
      command {
-         if [ -n "$(/bin/ls -d ${BatchDir}/H_*)" ]; then
-             /bin/chmod -R 666 ${BatchDir}/H_*
+         if [ -n "$(/bin/ls -d ${BatchDir}/G*)" ]; then
+             /bin/chmod -R 666 ${BatchDir}/G*
          fi
          if [ -n "${InputSpreadSheet}" ]; then
              /usr/bin/python3 ${QC_py} -s ${InputSpreadSheet} -d ${BatchDir}
@@ -402,8 +402,8 @@ task data_transfer {
          if [ -n "${InputSpreadSheet}" ]; then
              /bin/cp ${QcFile} xfer_staging
          fi
-         /bin/cp ${BatchFastqDir}/H_*.fastq.gz xfer_staging && \
-         /usr/local/bin/aws s3 cp xfer_staging s3://genoox-upload-wustl/gtacmgi/${XferLabel} --recursive && \
+         /bin/cp ${BatchFastqDir}/*.fastq.gz xfer_staging && \
+         /usr/local/bin/aws s3 cp xfer_staging s3://genoox-upload-wustl/gtacmgi/${XferLabel} --exclude "Undetermined*" --recursive && \
          /usr/bin/touch done.txt && \
          /usr/local/bin/aws s3 cp done.txt s3://genoox-upload-wustl/gtacmgi/${XferLabel}
      }

diff --git a/scripts/QC_metrics.py b/scripts/QC_metrics.py
@@ -8,19 +8,15 @@
 import argparse
 import pandas as pd
 
-def get_lib_list(directory):
-    prefixes = ["H_", "TW"]
-    dir_names = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x)) and
-                 x.startswith(tuple(prefixes)) and 'lib' in x]
-    liblist = []
-    pattern = r'(^(H_|TW)\S+-lib\d+)_[ATCG]'
+def get_sample_list(directory):
+    dir_names = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x))]
+    sample_names = []
+    pattern = r'(^\S+)_[ATCG]{10}'
     for dir_name in dir_names:
         match = re.match(pattern, dir_name)
         if match:
-            liblist.append(match.group(1))
-        else:
-            sys.exit('Fail to extract library name from ' + dir_name)
-    return liblist
+            sample_names.append(match.group(1))
+    return sample_names
 
 parser = argparse.ArgumentParser(description='Make QC Excel Spreadsheet')
 parser.add_argument('-d','--batchdir',required=True,help='workflow batch output dir')
@@ -37,10 +33,10 @@ def get_lib_list(directory):
 
 if in_ss:
     in_df = pd.read_excel(in_ss, sheet_name='QC Metrics')
-    lib_list = in_df['SAMPLE ID'].tolist()
+    sample_list = in_df['SAMPLE ID'].tolist()
 else:
-    lib_list = get_lib_list(out_dir)
-    in_df = pd.DataFrame({'Samples' : lib_list})
+    sample_list = get_sample_list(out_dir)
+    in_df = pd.DataFrame({'Samples' : sample_list})
 
 hap_scores      = []
 hap_sites       = []
@@ -60,8 +56,8 @@ def get_lib_list(directory):
 pct_target_1500 = []
 total_giga_bases= []
 
-for lib_name in lib_list:
-    search = os.path.join(out_dir, f"{lib_name}*")
+for sample_name in sample_list:
+    search = os.path.join(out_dir, f"{sample_name}*")
     sample_dir = glob.glob(search)[0]
     if not os.path.isdir(sample_dir):
         sys.exit(sample_dir + " is not a valid sample directory")
@@ -70,7 +66,7 @@ def get_lib_list(directory):
     mapping_metrics = glob.glob(os.path.join(dragen_dir, "*.mapping_metrics.csv"))[0]
     target_metrics = glob.glob(os.path.join(dragen_dir, "*.target_bed_coverage_metrics.csv"))[0]
     umi_metrics = glob.glob(os.path.join(dragen_dir, "*.umi_metrics.csv"))[0]
-    haplotect_out = os.path.join(sample_dir, f"{lib_name}.haplotect.txt")
+    haplotect_out = os.path.join(sample_dir, f"{sample_name}.haplotect.txt")
 
     if not (os.path.isfile(mapping_metrics) and os.path.isfile(target_metrics) and os.path.isfile(umi_metrics) and os.path.isfile(haplotect_out)):
         sys.exit(f"No dragen mapping and/or target metrics and/or umi metrics and/or haplotect out for {sample_dir}")
@@ -159,13 +155,12 @@ def get_lib_list(directory):
 
 if in_ss:
     sss_df = pd.DataFrame({
-        'Library'          : lib_list,
+        'Library'          : sample_list,
         'Total Bases'      : total_bases,
         'Percent Q30 (R1)' : pct_q30_1,
         'Percent Q30 (R2)' : pct_q30_2
     })
 
-    sample_list = [x.split('-lib')[0] for x in lib_list]
     hap_scores_pct = ['{:.2f}%'.format(x * 100) for x in hap_scores]
 
     fcs_df = pd.DataFrame({

diff --git a/scripts/launcher.pl b/scripts/launcher.pl
@@ -65,19 +65,20 @@
     unless ($row->[0] =~ /\d+/) {
         die "Lane number is expected, Check sample sheet spreadsheet";
     }
-    my ($lane, $flowcell, $lib, $index, $exception) = @$row;
+    my ($lane, $flowcell, $name, $index, $exception) = @$row;
 
-    $lib =~ s/\s+//g;
-    my ($name) = $lib =~ /^(\S+)\-lib/;
+    $name =~ s/\s+//g;
+    my $lib = $name;
+    $lib .= '-lib1' unless $lib =~ /lib/;
 
     my ($index1, $index2) = $index =~ /([ATGC]{10})\-([ATGC]{10})/;
     my $fix_index2 = rev_comp($index2);
 
     $exception = 'NONE' unless $exception;
 
-    $ds_str .= join ',', $lane, $lib, $lib, '', $index1, $fix_index2;
+    $ds_str .= join ',', $lane, $name, $name, '', $index1, $fix_index2;
     $ds_str .= "\n";
-    $si_str .= join "\t", $index1.'-'.$fix_index2, $lib, $seq_id, $flowcell, $lane, $lib, $name;
+    $si_str .= join "\t", $index1.'-'.$fix_index2, $name, $seq_id, $flowcell, $lane, $lib, $name;
     $si_str .= "\n";
 
     $seq_id++;