-
Notifications
You must be signed in to change notification settings - Fork 0
/
atg_fastq_merger.py
196 lines (153 loc) · 7.55 KB
/
atg_fastq_merger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# ATG_fastq_merger.py
# Version: 20240413
# Author: Amir Taheri-Ghahfarokhi
# Contact:
# Email: Amir.Taheri.Ghahfarokhi@Gmail.com
# Linkedin: https://www.linkedin.com/in/ghahfarokhi/
import argparse
import pandas as pd
import subprocess
import os
import re
import sys
def check_samples_file(samples_file):
"""Check if the input file contains the required columns."""
if not os.path.exists(samples_file):
sys.exit("Error: The provided samples file does not exist.")
df = pd.read_csv(samples_file, sep="\t")
required_columns = ["sample_name", "read1", "read2"]
input_columns = df.columns.tolist()
missing_columns = [col for col in required_columns if col not in input_columns]
if missing_columns:
sys.exit(f"ERROR: Input file is missing the following required columns: {', '.join(missing_columns)}")
else:
return df
def run_flash(sample_name, read1, read2, out_dir):
log_file = os.path.join(out_dir, sample_name + ".flash.log")
command = ["flash", read1, read2, "-o", os.path.join(out_dir, sample_name)]
with open(log_file, 'w') as f:
result = subprocess.run(command, stdout=f, stderr=subprocess.PIPE)
if result.returncode != 0:
error_message = f"Error running flash for sample {sample_name}: {result.stderr.decode()}"
print(error_message)
return None
return log_file
def parse_flash_log_file(log_file):
total_pairs = None
combined_pairs = None
uncombined_pairs = None
percent_combined = None
with open(log_file, "r") as log:
lines = log.readlines()
for line in lines:
total_pairs_match = re.search(r"Total pairs:\s+(\d+)", line)
combined_pairs_match = re.search(r"Combined pairs:\s+(\d+)", line)
uncombined_pairs_match = re.search(r"Uncombined pairs:\s+(\d+)", line)
percent_combined_match = re.search(r"Percent combined:\s+([\d.]+)%", line)
if total_pairs_match:
total_pairs = int(total_pairs_match.group(1))
elif combined_pairs_match:
combined_pairs = int(combined_pairs_match.group(1))
elif uncombined_pairs_match:
uncombined_pairs = int(uncombined_pairs_match.group(1))
elif percent_combined_match:
percent_combined = float(percent_combined_match.group(1))
return total_pairs, combined_pairs, uncombined_pairs, percent_combined
def clean_unwanted_flash_files(sample_name, out_dir):
os.rename(os.path.join(out_dir, sample_name + ".extendedFrags.fastq"), os.path.join(out_dir, sample_name + ".fastq"))
os.remove(os.path.join(out_dir, sample_name + ".flash.log"))
os.remove(os.path.join(out_dir, sample_name + ".hist"))
os.remove(os.path.join(out_dir, sample_name + ".histogram"))
os.remove(os.path.join(out_dir, sample_name + ".notCombined_1.fastq"))
os.remove(os.path.join(out_dir, sample_name + ".notCombined_2.fastq"))
def print_help():
help = """
This script uses FLASH with default parameters to merge paired-end illumina reads.
Author: Amir.Taheri.Ghahfarokhi@Gmail.com
Version: 20240413
usage:
For merging R1 and R2 for a single sample:
python merger.py [--sample-name SAMPLE_NAME] [--read1 READ1] [--read2 READ2] [--out-dir PATH_TO_OUTPUT_DIR]
or
python merger.py [-n SAMPLE_NAME] [-r1 READ1] [-r2 READ2] [-o PATH_TO_OUTPUT_DIR]
For merging R1 and R2 for a list of samples listed in a samples-file:
python merger.py [--samples-file PATH_TO_SAMPLES_TSV] [--out-dir PATH_TO_OUTPUT_DIR]
or
python merger.py [-f PATH_TO_SAMPLES_TSV] [-o PATH_TO_OUTPUT_DIR]
Important Note 1:
SAMPLES_TSV file is expected to be tab-delimited file with the following header:
sample_name read1 read2
Important Note 2:
sample_name must be alphanumeric (i.e., avoid &, $, @, -, %, * and empty space in the file names)
References
FLASH : Magoč T, Salzberg SL. FLASH: fast length adjustment of short reads to improve genome assemblies. Bioinformatics. 2011 Nov 1;27(21):2957-63. doi: 10.1093/bioinformatics/btr507. Epub 2011 Sep 7. PMID: 21903629; PMCID: PMC3198573.
"""
print(help)
def parse_arguments():
if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv:
print_help()
sys.exit(1)
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('-n', "--sample-name")
parser.add_argument('-r2', "--read1")
parser.add_argument('-r1', "--read2")
parser.add_argument('-f', "--samples-file")
parser.add_argument('-o', "--out-dir")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS)
return parser.parse_args()
def main():
args = parse_arguments()
if args.out_dir:
# Check if the output directory exists, if not, create it
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir)
else:
print_help()
sys.exit("ERROR: argument --out-dir is not provided!")
out_dir = args.out_dir
if args.sample_name and args.read1 and args.read2:
sample_name = args.sample_name
read1 = args.read1
read2 = args.read2
print(f"Running FLASH for sample {sample_name}")
log_file = run_flash(sample_name, read1, read2, out_dir)
if log_file:
total_pairs, combined_pairs, uncombined_pairs, percent_combined = parse_flash_log_file(log_file)
if None in (total_pairs, combined_pairs, uncombined_pairs, percent_combined):
print(f"Error: Required information not found in flash log file for sample {sample_name}")
return
else:
print(f"sample_name: {sample_name}, TotalPairs: {total_pairs}, PercentCombined: {percent_combined}")
clean_unwanted_flash_files(sample_name, out_dir)
elif args.samples_file:
samples_file = args.samples_file
df = check_samples_file(samples_file)
# Making column names lower case
df.columns = map(str.lower, df.columns)
for index, row in df.iterrows():
sample_name = row["sample_name"]
read1 = row["read1"]
read2 = row["read2"]
print(f"Running FLASH for sample {sample_name}")
log_file = run_flash(sample_name, read1, read2, out_dir)
if log_file:
total_pairs, combined_pairs, uncombined_pairs, percent_combined = parse_flash_log_file(log_file)
if None in (total_pairs, combined_pairs, uncombined_pairs, percent_combined):
print(f"Error: Required information not found in flash log file for sample {sample_name}")
continue
else:
print(f"sample_name: {sample_name}, TotalPairs: {total_pairs}, PercentCombined: {percent_combined}")
# Add statistics as new columns to the DataFrame
df.loc[index, "TotalPairs"] = total_pairs
df.loc[index, "CombinedPairs"] = combined_pairs
df.loc[index, "UncombinedPairs"] = uncombined_pairs
df.loc[index, "PercentCombined"] = percent_combined
clean_unwanted_flash_files(sample_name, out_dir)
# Save the updated DataFrame back to the TSV file
stat_file = os.path.join(out_dir, "merger_stats.tsv")
df.to_csv(stat_file, sep="\t", index=False)
print(f"Merger is done! flash statistics are saved here: {stat_file}\n")
else:
print_help()
if __name__ == "__main__":
main()