-
Notifications
You must be signed in to change notification settings - Fork 1
/
s21.py
103 lines (92 loc) · 3.16 KB
/
s21.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
import pandas as pd
def extract_seq(file_name):
'''
Given a text file, return a dictionary of headers and DNA sequences
'''
f = open(file_name,"r")
lines = f.readlines()
seq = []
header = []
a = ""
for line in lines:
if line[0]=='>':
if len(a)!=0:
seq.append(a)
b = ""
for i in range(1,len(line)):
if line[i]!='\n' and line[i]!='\r':
b = b + line[i]
header.append(b)
a=""
if line[0]!='>':
for i in range(len(line)):
if line[i]!='\n' and line[i]!=" " and line[i]!='\r':
a = a + line[i]
seq.append(a)
return seq,header
def get_penalty(file_name):
'''
Retrieves cost matrix as a dictionary from BLOSUM62
'''
df = open('BLOSUM62.txt')
lines = df.readlines()
penalty = {}
seq_list = []
for i in range(1,len(lines[0])):
if lines[0][i]!=' ' and lines[0][i]!="\n":
seq_list.append(lines[0][i])
for i in range(1,len(lines)):
bp1 = lines[i][0]
z = 0
j = 1
while j<len(lines[i]):
num=0
str1 = ""
if (lines[i][j]=='-' or (lines[i][j]!='\n' and lines[i][j]!='\r' and lines[i][j]!=" ")):
while lines[i][j]!='\n' and lines[i][j]!=' ' and lines[i][j]!='\r':
str1 = str1 + lines[i][j]
j=j+1
if j>=len(lines[i]):
break
penalty[bp1+seq_list[z]] = str1
z = z+1
j=j+1
return penalty,seq_list
def find_distance(seq1,seq2,penalty):
'''
Given the penalty matrix and sequences; the function returns their distance
'''
distance = 0
for i in range(len(seq1)):
if seq1[i]=='-' and seq2[i]=='-':
continue
elif seq1[i] == '-' and seq2[i] != '-':
print(i,seq1[i],seq1[i-1])
if seq1[i-1] != '-' or (seq1[i-1]=='-' and seq2[i-1]=='-'):
distance = distance - 11
elif seq1[i-1] == '-' :
distance = distance - 1
elif seq2[i] == '-' and seq1[i] != '-':
if seq2[i-1] != '-' or (seq1[i-1]=='-' and seq2[i-1]=='-'):
distance = distance - 11
elif seq2[i-1] == '-' :
distance = distance - 1
elif seq1[i]!='-' and seq2[i]!='-':
distance = distance + int(penalty[seq2[i] + seq1[i]])
return distance
if __name__ == "__main__":
seq2,header2 = extract_seq('Protein_alignment.txt')
print("Number of headers : " , len(header2))
print("Number of sequences: " , len(seq2))
penalty,seq_list = get_penalty('BLOSUM62.txt')
dist_matrix = np.zeros((len(seq2) , len(seq2)))
for i in range(len(seq2)):
for j in range(len(seq2)):
dist_matrix[i][j] = find_distance(seq2[i],seq2[j],penalty)
data = {}
for i in range(len(seq2)):
data[header2[i]] = dist_matrix[i]
df = pd.DataFrame(data, columns = header2, index=header2)
df.to_csv('Pdistance.csv')
print(df)