-
Notifications
You must be signed in to change notification settings - Fork 0
/
N50_Stats.py
76 lines (54 loc) · 1.94 KB
/
N50_Stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#! /usr/bin python
# need to change the above line to suit the path to python on your system
# N50 code originally from Cara Magnabosco was modified by Kate Wathen-Dunn
# https://caramagnabosco.wordpress.com/
# accessed 2nd July 2016, modified 5th July 2016
# Used to calculate the N50 of assembled contigs in a fasta file
# where N50 is the contig length, so that half the bases in the entire
# assembly are found in contigs of this length or below. (Cara M)
# Code also ouptputs the minimum, maximum, mean and median contig lengths
# as well as the total number of assembled bases and number of contigs
# in the assembly fasta file. (Kate WD)
# Execute by: python N50_Stats.py fastafile
from Bio import SeqIO
import sys
import numpy
input = open(sys.argv[1], 'rU')
seqlength = []
for record in SeqIO.parse(input, 'fasta'):
bp = len(record.seq)
seqlength.append(bp)
seqlength = sorted(seqlength)
num_contig = len(seqlength)
print ("the number of contigs is: %d" % num_contig)
print ("the min sequence length is: %d" % seqlength[0])
print ("the max sequence length is: %d" % seqlength[-1])
total_bp = sum(seqlength)
mean_contig_length = total_bp/len(seqlength)
median_contig_length = numpy.median(seqlength)
print ("the total assembled bases is: %d" % total_bp)
print ("the mean contig length is: %d" % mean_contig_length)
print ("the median contig length is: %d" % median_contig_length)
unique = []
for entry in seqlength:
if not entry in unique:
unique.append(entry)
n50 = []
for entry in unique:
multiplier = seqlength.count(entry) * entry
for i in range(multiplier):
n50.append(entry)
index = len(n50)/2
avg = []
if index % 2 == 0:
index_int = int(index)
first = n50[index_int-1]
second = n50[index_int]
avg.append(first)
avg.append(second)
result = numpy.mean(avg)
print ("the N50 is : %d" % result)
else:
index_int = int(index)
print("the N50 is: %d" % n50[index_int-1])
input.close()