forked from LangilleLab/microbiome_helper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fix_ITS2_spf.py
executable file
·100 lines (72 loc) · 3.21 KB
/
fix_ITS2_spf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
from __future__ import print_function
import argparse
import csv
# Module metadata: authorship, licence, version and release status.
__author__ = "Gavin Douglas"
__credits__ = ["Gavin Douglas"]
__license__ = "GPL"
__version__ = "0.1"
__maintainer__ = "Gavin Douglas"
__email__ = "gavin.douglas@dal.ca"
__status__ = "Development"
# Command-line interface.
#
# RawDescriptionHelpFormatter prints the description and epilog verbatim, so
# the text below carries its own line breaks. The strings are built from
# adjacent literals rather than backslash continuations inside a single
# literal: a continuation inside a literal either fuses the words on both
# sides of the break or embeds the source indentation in the help output.
parser = argparse.ArgumentParser(
    description=(
        "Fix STAMP-formatted OTU table so that all \"unidentified\" taxa are\n"
        "children of unique parents. This is done by appending \"X\" to the\n"
        "name of the closest higher-order taxonomic label that is defined.\n"
        "Additional \"X\" characters will be added to distinguish undefined\n"
        "children at different levels."
    ),
    epilog=(
        "Usage example: fix_ITS2_spf.py -i otu_table.spf "
        "-o otu_table_fixed.spf"
    ),
    formatter_class=argparse.RawDescriptionHelpFormatter,
)

# Path of the STAMP profile to read.
parser.add_argument("-i", "--input", help="Input STAMP file", required=True)

# Path the corrected STAMP profile is written to.
parser.add_argument("-o", "--output", help="Output fixed STAMP file",
                    required=True)

# Number of leading columns on each row that hold taxonomic labels.
parser.add_argument("-c", "--col_count",
                    help="Number of taxonomic columns in file (default=7)",
                    required=False, default=7, type=int)
def _relabel_unidentified(taxa):
    """Return a copy of ``taxa`` with every unidentified label made unique.

    ``taxa`` is a list of taxonomic labels ordered from the highest level to
    the lowest. A label counts as unidentified when it contains the text
    "unidentified" in any letter case. Each such label is replaced by the
    most recently seen defined label, followed by "_" and one "X" for every
    consecutive unidentified level since that label. When no defined label
    has been seen yet, "Unknown" is used as the parent name.
    """
    relabelled = []
    last_defined = "Unknown"
    unidentified_run = 0

    for label in taxa:
        if "unidentified" in label.lower():
            # One more level below the last defined parent: add one more X.
            unidentified_run += 1
            relabelled.append(last_defined + "_" + "X" * unidentified_run)
        else:
            # A defined label becomes the new parent and restarts the count.
            relabelled.append(label)
            last_defined = label
            unidentified_run = 0

    return relabelled


def main(argv=None):
    """Rewrite a STAMP profile so that unidentified taxa get unique names.

    Parses the command line with the module-level ``parser``, reads the input
    file line by line and writes the result to the output file. The first
    line (the header) and every line that does not contain "unidentified"
    are copied through with their line ending normalised to "\\n". On all
    other lines the first ``col_count`` tab-separated fields are relabelled
    by ``_relabel_unidentified``; the remaining fields are left untouched.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default) the arguments are taken from ``sys.argv``, which is the
            behaviour of a plain ``main()`` call.
    """
    args = parser.parse_args(argv)

    # Both files are managed by the same "with" so they are closed even if
    # an error occurs part-way through. The input is opened first so that a
    # missing input file does not create or truncate the output file.
    with open(args.input, "r") as infile, open(args.output, "w") as outfile:
        for line_number, raw_line in enumerate(infile):
            line = raw_line.rstrip("\r\n")

            # Header line, or nothing to fix: copy the line through.
            if line_number == 0 or "unidentified" not in line.lower():
                outfile.write(line + "\n")
                continue

            fields = line.split("\t")
            fields[0:args.col_count] = _relabel_unidentified(
                fields[0:args.col_count])

            # The row was split on raw tabs, so it is joined the same way.
            # A CSV writer could add quoting to fields containing a quote
            # character, making rewritten rows differ from copied ones.
            outfile.write("\t".join(fields) + "\n")


if __name__ == "__main__":
    main()