-
Notifications
You must be signed in to change notification settings - Fork 1
/
functions.py
126 lines (99 loc) · 4.26 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
from numpy import nan
import numpy as np
from sklearn.metrics import roc_curve
def delta_score(df, name, index):
    '''Calculate the absolute, scaled delta score of one tool for one variant.

    The wild-type score is read from column "<name>_wt" and the variant score
    from column "<name>_var"; their difference is divided by the max value
    defined for the tool and the absolute value is returned.
    @param df: dataframe that contains the scores
    @param name: name of the splice prediction tool; must be one of SSFL, MES,
                 NNS, GS, SpliceRover or DSSP (anything else raises a KeyError)
    @param index: index (row label) of the scores in df
    Returns the absolute value of (wt - var) / max value
    '''
    wild_type = float(df.at[index, name + '_wt'])
    variant = float(df.at[index, name + '_var'])
    # Programs for which a delta score is calculated, with corresponding max value
    max_value = {
        'SSFL': 100,
        'MES': 12,
        'NNS': 1,
        'GS': 15,
        'SpliceRover': 1,
        'DSSP': 1,
    }
    return np.absolute((wild_type - variant) / max_value[name])
def read_scores_from_excel(f, sheetname, fillna = True, diall = False):
    ''' Read splice prediction scores from an excel sheet and return the primary (delta) scores.

    For every row (variant) of the sheet a list of values is built in this order:
      - 1 if '% Mutant RNA' is larger than 20, otherwise 0
      - absolute value of 'CADD'
      - delta scores of DSSP, GS and MES
      - absolute value of 'MMSplice' (only if 'NCSS' is in sheetname or diall is True)
      - delta score of NNS
      - absolute value of 'Spidex' (only if 'NCSS' is in sheetname or diall is True)
      - 'SpliceAI' as stored in the sheet
      - delta scores of SpliceRover and SSFL
    @param f: name of the excel file
    @param sheetname: name of the sheet with the splice prediction scores
    @param fillna: If set to True, missing values are replaced with 0.
    @param diall: If set to True, it includes all tools for DI variants, even the ones that cannot predict scores
    Returns a dataframe with one row per variant (same index as the sheet) and one
    positionally numbered column per value listed above
    '''
    # load the sheet into a dataframe
    scores = pd.read_excel(f, sheetname, engine='openpyxl')
    # replace missing values with 0 (only when the flag compares equal to True)
    if fillna == True:
        scores = scores.replace(nan, 0)

    # one list of values per variant, keyed by the row index of the sheet
    rows = dict()
    for row_label in scores.index:
        # label derived from the % mutant RNA, followed by the absolute CADD score
        row = [1 if scores.at[row_label, '% Mutant RNA'] > 20 else 0]
        row.append(np.absolute(scores.at[row_label, 'CADD']))
        # delta scores of DSSP, GeneSplicer and MaxEntScan
        row.extend(delta_score(scores, tool, row_label) for tool in ('DSSP', 'GS', 'MES'))
        # MMSplice and SPIDEX are only included for NCSS sheets or when diall is set
        include_all_tools = 'NCSS' in sheetname or diall == True
        if include_all_tools:
            row.append(np.absolute(scores.at[row_label, 'MMSplice']))
        # delta score of NNSPLICE
        row.append(delta_score(scores, 'NNS', row_label))
        if include_all_tools:
            row.append(np.absolute(scores.at[row_label, 'Spidex']))
        # the SpliceAI score is taken as is
        row.append(scores.at[row_label, 'SpliceAI'])
        # delta scores of SpliceRover and SSFL
        row.extend(delta_score(scores, tool, row_label) for tool in ('SpliceRover', 'SSFL'))
        rows[row_label] = row

    # the dict yields one column per variant; transpose to get one row per variant
    return pd.DataFrame(rows).transpose()
def reverse_sequence(s):
    ''' Converts a sequence into its reverse complement.

    Every uppercase A, T, G and C is replaced by its complementary base
    (A<->T, G<->C) and the resulting sequence is reversed. All other
    characters (for example N or lowercase letters) are kept unchanged.
    @param s: sequence to convert (a string, or another iterable of strings)
    Returns the reverse complement of s as a string
    '''
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    # Build the complement with a single join instead of growing a string
    # base by base, then reverse it
    return ''.join(complement.get(base, base) for base in s)[::-1]
def Find_Optimal_Cutoff(target, predicted):
    """ Find the cutoff of a classification model at which tpr - (1 - fpr) is closest to zero

    The ROC curve of the predictions is computed with roc_curve; for every
    threshold the value tpr - (1 - fpr) is stored, and the threshold whose
    value has the smallest absolute size is selected.
    @target : Matrix with dependent or target data, where rows are observations
    @predicted : Matrix with predicted data, where rows are observations
    Returns list type, with optimal cutoff value
    adapted from: https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python
    """
    fpr, tpr, thresholds = roc_curve(target, predicted)
    positions = np.arange(len(tpr))
    # one row per threshold returned by roc_curve
    roc_table = pd.DataFrame({
        'tf' : pd.Series(tpr - (1 - fpr), index=positions),
        'threshold' : pd.Series(thresholds, index=positions),
    })
    # position of the row whose tf value is closest to zero
    closest = roc_table['tf'].abs().argsort()[:1]
    best_row = roc_table.iloc[closest]
    return list(best_row['threshold'])