-
Notifications
You must be signed in to change notification settings - Fork 0
/
blastcblast_stats.py
131 lines (106 loc) · 4.6 KB
/
blastcblast_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""
Original author : Ahmed Elsherbini
Date 20-11-2024
Spyder Editor
"""
###########################################################
import pandas as pd
from Bio import Entrez
import ete3
from ete3 import NCBITaxa, TreeStyle, PieChartFace, faces
import argparse
import warnings
###########################################################
warnings.filterwarnings("ignore")
my_parser = argparse.ArgumentParser(description='Hello!')
my_parser.add_argument('-i', '--input', action='store', metavar='input', type=str, help='the path to your file')
my_parser.add_argument('-og', '--outgroup', action='store', metavar='outgroup', type=str, help='the name to your outgroup species')
args = my_parser.parse_args()
f_name = args.input
og = args.outgroup
#f_name = "y.csv"
#og = "deinococcus_radiodurans"
try:
Entrez.email = 'drahmed@gmail.com'
df = pd.read_csv(f_name, header=None,on_bad_lines='skip')
if df[0].iloc[0] == 'Description':
df.rename(columns=df.iloc[0], inplace = True)
df['Scientific Name'] = df['Scientific Name'].apply(lambda x: ' '.join(str(x).split()[:2]) if isinstance(x, str) else str(x))
df = df['Scientific Name'].value_counts().reset_index()
df.columns = ['Species', 'count']
substring = 'Scientific Name'
filterx = df['Species'].str.contains(substring)
df = df[~filterx]
substring = 'sp.'
filterx = df['Species'].str.contains(substring)
substring = 'nan'
filterx = df['Species'].str.contains(substring)
df = df[~filterx]
assmebly = []
print("I think this is a NCBI blast input")
else:
df = df.iloc[:, 0].to_frame()
df = df[0].str.split().str[:2].str.join(' ').to_frame()
df = df[0].value_counts().to_frame()
df = df.reset_index()
df.columns = ['Species', 'count']
substring = 'sp.'
filterx = df['Species'].str.contains(substring)
substring = 'nan'
filterx = df['Species'].str.contains(substring)
df = df[~filterx]
assmebly = []
print("I think this is a Cblaster input")
print("And this is blastCblast_stats (Ahmed Elsherbini)")
for row in df['Species']:
species_name = str(row)
handle = Entrez.esearch(db="assembly", term=species_name, retmode="xml")
record = Entrez.read(handle)
count = int(record['Count'])
print(f"Number of {species_name} occurrences in NCBI assembly database: {count}")
assmebly.append(count)
df['assembly'] = assmebly
df['%_in_assembly_db'] = df['count'] / df['assembly'] * 100
file_name = 'database_percentage_%s.csv' % (f_name[:-4])
df.to_csv(file_name, index=False)
print("Done for the database file")
# Taxonomy retrieval and tree generation
ncbi = NCBITaxa()
species_taxids = {}
species_names = pd.Series(df["Species"])
# Add outlier if provided
if og:
og = og.replace("_", " ")
species_names.loc[len(species_names)] = og
for species_name in species_names:
taxid = ncbi.get_name_translator([species_name])
species_taxids[species_name] = taxid[species_name][0] if taxid else None
taxa_ids = [taxid for taxid in species_taxids.values() if taxid]
tree = ncbi.get_topology(taxa_ids)
def annotate_tree_with_scientific_names(tree):
for node in tree.traverse():
if node.is_leaf():
taxid = int(node.name)
scientific_name = ncbi.get_taxid_translator([taxid]).get(taxid)
node.name = scientific_name if scientific_name else "Unknown"
annotate_tree_with_scientific_names(tree)
output_file = "%s_tree.nwk" % (f_name[:-4])
tree.write(outfile=output_file)
# Prepare data for pie charts
df1 = df[['Species', '%_in_assembly_db']]
pie_data = df1.set_index('Species')['%_in_assembly_db'].to_dict()
def layout(node):
if node.is_leaf() and node.name in pie_data:
pie_values = [pie_data[node.name], 100 - pie_data[node.name]]
colors = ["green", "lightgray"]
pie_face = PieChartFace(pie_values, colors=colors, width=50, height=50)
faces.add_face_to_node(pie_face, node, column=1)
ts = TreeStyle()
ts.layout_fn = layout
ts.show_leaf_name = True
#ts.title.add_face(faces.TextFace("Phylogenetic Tree with Pie Charts", fsize=12), column=0)
tree.render("%s_tree_with_pies.pdf" % (f_name[:-4]), tree_style=ts)
print("Tree with pie charts rendered successfully!")
except Exception as e:
print("An error occurred:", e)