-
Notifications
You must be signed in to change notification settings - Fork 1
/
pairwise_comparison_S16.py
110 lines (89 loc) · 3.21 KB
/
pairwise_comparison_S16.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from __future__ import print_function
import os,sys
import argparse
import re
# import math
# import numpy as np
from collections import defaultdict
from itertools import combinations
# import string
# import fractions
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# from collections import Counter
# import errno
# import shutil
def plot(filename):
palette = {
'human1': 'tab:blue',
'human2': 'tab:green',
'chimp1': 'tab:orange',
'chimp2': 'tab:red',
'bonobo' : 'tab:purple',
'gorilla' : 'tab:grey',
'B.orang' : 'tab:pink',
"S.orang" : 'black'
}
indata = pd.read_csv(filename)
# #for fam in ['BPY2', 'CDY', 'DAZ', 'HSFY', 'PRY', 'RBMY', 'TSPY', 'VCY', 'XKRY']:
for species in ['human1','human2', 'chimp1', 'chimp2','bonobo','gorilla','B.orang','S.orang']:
sns.countplot(data=indata, x="gene_family", hue="species", palette = palette)
plt.xlabel("Gene family")
plt.ylabel("Count")
plt.title("Shared with {0}".format(species))
plt.savefig(filename+"{0}.pdf".format(species))
plt.clf()
def get_shared_between_species(gene_fam_shared_table, s1, s2):
identical = 0
for h in gene_fam_shared_table:
if s1 in gene_fam_shared_table[h] and s2 in gene_fam_shared_table[h]:
identical += gene_fam_shared_table[h][s1]
identical += gene_fam_shared_table[h][s2]
return identical
def get_nr_shared(infile, typ ='hash'):
# initialize counts table
shared_table = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
for line in open(infile, 'r'):
hash_,gene_family,species,transcript_id,seq = line.split(',')
if typ == 'hash':
shared_table[gene_family][hash_][species] += 1
elif typ == 'sequence':
shared_table[gene_family][seq][species] += 1
for s1,s2 in combinations(['human1','human2', 'chimp1', 'chimp2','bonobo','gorilla','B.orang','S.orang'], 2):
print(s1,s2)
l = []
print('BPY2,CDY,DAZ,HSFY,PRY,RBMY,TSPY,VCY,XKRY')
for fam in ['BPY2', 'CDY', 'DAZ', 'HSFY', 'PRY', 'RBMY', 'TSPY', 'VCY', 'XKRY']:
identical = get_shared_between_species(shared_table[fam], s1,s2)
l.append(identical)
print(','.join([str(n) for n in l]))
return shared_table
def main(args):
outfile = open(args.outfile, 'w')
outfile.write('hash,gene_family,species,transcript_id,seq\n')
for line in open(args.infile, 'r'):
#print(line.split())
hash_, acc, seq = line.split()
gene_family = acc.split('_')[0][1:]
species = acc.split('_')[1]
transcript_id = '_'.join(acc.split('_')[2:])
if species == 'h' or species == 'cb':
species = acc.split('_')[2]
transcript_id = '_'.join(acc.split('_')[3:])
outfile.write('{0},{1},{2},{3},{4}\n'.format(hash_, gene_family,species, transcript_id,seq.strip()))
outfile.close()
print('NUMBER IDENTICAL (HASH)')
get_nr_shared(outfile.name, typ ='hash')
print()
print('NUMBER IDENTICAL (SEQUENCE)')
get_nr_shared(outfile.name, typ ='sequence')
# plot(outfile.name)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Evaluate pacbio IsoSeq transcripts.")
parser.add_argument('infile', type=str, help='All transcripts in space separated format from excel')
parser.add_argument('outfile', type=str, help='Path to processed csv')
args = parser.parse_args()
main(args)