-
Notifications
You must be signed in to change notification settings - Fork 0
/
barcode_stats.py
102 lines (88 loc) · 2.92 KB
/
barcode_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pysam
import sys
import os
import portion as P
bamfile = pysam.AlignmentFile(sys.argv[1], "rb")
try:
os.makedirs(sys.argv[2])
except:
pass
iter = bamfile.fetch("chr1", 1000000, 3000000)
max_barcodes = 20000
current_barcodes = 0
superdict = {}
for record in iter:
try:
bx_tag = record.get_tag("BX")
except:
continue
if bx_tag not in superdict and current_barcodes < max_barcodes:
superdict[bx_tag] = []
current_barcodes += 1
if bx_tag not in superdict:
continue
superdict[bx_tag].append((record.reference_start, record.reference_end))
molecule_length = []
molecule_coverage = []
max_diff = 10000
for barcode, vect in superdict.items():
if len(vect) <= 2:
continue
print(barcode)
#print(vect)
aligned = 0
unaligned = 0
start = vect[0][0]
superinterval = P.empty()
for interval in vect:
superinterval = superinterval | P.closed(interval[0], interval[1])
superinterval = list(superinterval)
if len(superinterval) <= 1:
continue
for i in range(len(vect) - 1):
if vect[i + 1][0] - vect[i][1] > max_diff:
aligned += vect[i][1] - vect[i][0]
if vect[i][1] - start > 5000:
molecule_coverage.append(aligned/float(aligned+unaligned))
if vect[i][1] - start > 5000:
molecule_length.append(vect[i][1] - start)
start = vect[i+1][0]
aligned = 0
unaligned = 0
else:
aligned += vect[i][1] - vect[i][0]
unaligned += vect[i+1][0] - vect[i][1]
aligned += vect[-1][1] - vect[-1][0]
if vect[i][1] - start > 5000:
molecule_coverage.append(aligned/float(aligned+unaligned))
if vect[i][1] - start > 5000:
molecule_length.append(vect[-1][1] - start)
from matplotlib import pyplot as plt
import numpy as np
bins1 = np.arange(0, 1, 0.01)
bins2 = np.arange(0, 200000, 1000)
print(molecule_length)
print(molecule_coverage)
with open(sys.argv[2] + "/mol_length.csv", 'w') as molecule_length_csv:
molecule_length_csv.write("length\n")
for item in molecule_length:
molecule_length_csv.write(str(item)+"\n")
with open(sys.argv[2] + "/mol_coverage.csv", 'w') as molecule_length_csv:
molecule_length_csv.write("coverage\n")
for item in molecule_coverage:
molecule_length_csv.write(str(item)+"\n")
plt.hist(molecule_length, bins=bins2, alpha=0.5)
plt.title('Molecule length distribution')
plt.xlabel('Molecule length observed')
plt.ylabel('Count')
plt.savefig(sys.argv[2] + "/mol_length.png", dpi = 150)
plt.close()
plt.hist(molecule_coverage, bins=bins1, alpha=0.5)
plt.title('Molecule coverage distribution')
plt.xlabel('Molecule coverage observed')
plt.ylabel('Count')
plt.savefig(sys.argv[2] + "/mol_coverage.png", dpi = 150)
plt.close()
fig, ax = plt.subplots()
ax.scatter(molecule_length,molecule_coverage)
plt.savefig(sys.argv[2] + "/scatterplot.png", dpi = 150)