Skip to content

Commit

Permalink
Merge pull request #18 from dufeiyu/new_naming
Browse files: browse the repository at this point in the history
Update with new sample naming
  • Loading branch information
dufeiyu authored Sep 5, 2023
2 parents a61452a + 27d953c commit 22d0474
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 27 deletions.
8 changes: 4 additions & 4 deletions Soma.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ task batch_qc {
String batch = basename(BatchDir)

command {
if [ -n "$(/bin/ls -d ${BatchDir}/H_*)" ]; then
/bin/chmod -R 666 ${BatchDir}/H_*
if [ -n "$(/bin/ls -d ${BatchDir}/G*)" ]; then
/bin/chmod -R 666 ${BatchDir}/G*
fi
if [ -n "${InputSpreadSheet}" ]; then
/usr/bin/python3 ${QC_py} -s ${InputSpreadSheet} -d ${BatchDir}
Expand Down Expand Up @@ -402,8 +402,8 @@ task data_transfer {
if [ -n "${InputSpreadSheet}" ]; then
/bin/cp ${QcFile} xfer_staging
fi
/bin/cp ${BatchFastqDir}/H_*.fastq.gz xfer_staging && \
/usr/local/bin/aws s3 cp xfer_staging s3://genoox-upload-wustl/gtacmgi/${XferLabel} --recursive && \
/bin/cp ${BatchFastqDir}/*.fastq.gz xfer_staging && \
/usr/local/bin/aws s3 cp xfer_staging s3://genoox-upload-wustl/gtacmgi/${XferLabel} --exclude "Undetermined*" --recursive && \
/usr/bin/touch done.txt && \
/usr/local/bin/aws s3 cp done.txt s3://genoox-upload-wustl/gtacmgi/${XferLabel}
}
Expand Down
31 changes: 13 additions & 18 deletions scripts/QC_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,15 @@
import argparse
import pandas as pd

def get_lib_list(directory):
prefixes = ["H_", "TW"]
dir_names = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x)) and
x.startswith(tuple(prefixes)) and 'lib' in x]
liblist = []
pattern = r'(^(H_|TW)\S+-lib\d+)_[ATCG]'
def get_sample_list(directory):
dir_names = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x))]
sample_names = []
pattern = r'(^\S+)_[ATCG]{10}'
for dir_name in dir_names:
match = re.match(pattern, dir_name)
if match:
liblist.append(match.group(1))
else:
sys.exit('Fail to extract library name from ' + dir_name)
return liblist
sample_names.append(match.group(1))
return sample_names

parser = argparse.ArgumentParser(description='Make QC Excel Spreadsheet')
parser.add_argument('-d','--batchdir',required=True,help='workflow batch output dir')
Expand All @@ -37,10 +33,10 @@ def get_lib_list(directory):

if in_ss:
in_df = pd.read_excel(in_ss, sheet_name='QC Metrics')
lib_list = in_df['SAMPLE ID'].tolist()
sample_list = in_df['SAMPLE ID'].tolist()
else:
lib_list = get_lib_list(out_dir)
in_df = pd.DataFrame({'Samples' : lib_list})
sample_list = get_sample_list(out_dir)
in_df = pd.DataFrame({'Samples' : sample_list})

hap_scores = []
hap_sites = []
Expand All @@ -60,8 +56,8 @@ def get_lib_list(directory):
pct_target_1500 = []
total_giga_bases= []

for lib_name in lib_list:
search = os.path.join(out_dir, f"{lib_name}*")
for sample_name in sample_list:
search = os.path.join(out_dir, f"{sample_name}*")
sample_dir = glob.glob(search)[0]
if not os.path.isdir(sample_dir):
sys.exit(sample_dir + " is not a valid sample directory")
Expand All @@ -70,7 +66,7 @@ def get_lib_list(directory):
mapping_metrics = glob.glob(os.path.join(dragen_dir, "*.mapping_metrics.csv"))[0]
target_metrics = glob.glob(os.path.join(dragen_dir, "*.target_bed_coverage_metrics.csv"))[0]
umi_metrics = glob.glob(os.path.join(dragen_dir, "*.umi_metrics.csv"))[0]
haplotect_out = os.path.join(sample_dir, f"{lib_name}.haplotect.txt")
haplotect_out = os.path.join(sample_dir, f"{sample_name}.haplotect.txt")

if not (os.path.isfile(mapping_metrics) and os.path.isfile(target_metrics) and os.path.isfile(umi_metrics) and os.path.isfile(haplotect_out)):
sys.exit(f"No dragen mapping and/or target metrics and/or umi metrics and/or haplotect out for {sample_dir}")
Expand Down Expand Up @@ -159,13 +155,12 @@ def get_lib_list(directory):

if in_ss:
sss_df = pd.DataFrame({
'Library' : lib_list,
'Library' : sample_list,
'Total Bases' : total_bases,
'Percent Q30 (R1)' : pct_q30_1,
'Percent Q30 (R2)' : pct_q30_2
})

sample_list = [x.split('-lib')[0] for x in lib_list]
hap_scores_pct = ['{:.2f}%'.format(x * 100) for x in hap_scores]

fcs_df = pd.DataFrame({
Expand Down
11 changes: 6 additions & 5 deletions scripts/launcher.pl
Original file line number Diff line number Diff line change
Expand Up @@ -65,19 +65,20 @@
unless ($row->[0] =~ /\d+/) {
die "Lane number is expected, Check sample sheet spreadsheet";
}
my ($lane, $flowcell, $lib, $index, $exception) = @$row;
my ($lane, $flowcell, $name, $index, $exception) = @$row;

$lib =~ s/\s+//g;
my ($name) = $lib =~ /^(\S+)\-lib/;
$name =~ s/\s+//g;
my $lib = $name;
$lib .= '-lib1' unless $lib =~ /lib/;

my ($index1, $index2) = $index =~ /([ATGC]{10})\-([ATGC]{10})/;
my $fix_index2 = rev_comp($index2);

$exception = 'NONE' unless $exception;

$ds_str .= join ',', $lane, $lib, $lib, '', $index1, $fix_index2;
$ds_str .= join ',', $lane, $name, $name, '', $index1, $fix_index2;
$ds_str .= "\n";
$si_str .= join "\t", $index1.'-'.$fix_index2, $lib, $seq_id, $flowcell, $lane, $lib, $name;
$si_str .= join "\t", $index1.'-'.$fix_index2, $name, $seq_id, $flowcell, $lane, $lib, $name;
$si_str .= "\n";

$seq_id++;
Expand Down

0 comments on commit 22d0474

Please sign in to comment.