forked from sc-zhang/bioscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_fasta_by_count.py
executable file
·53 lines (46 loc) · 1.19 KB
/
split_fasta_by_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
import sys, os
def _read_fasta_records(in_fa):
    """Return the records of *in_fa* as a list of (header, sequence) tuples.

    The header is the text after '>' with surrounding whitespace stripped.
    The sequence is the raw concatenation of the following lines, newlines
    included, so the original line wrapping is preserved on output.
    Lines that appear before the first header are ignored.
    """
    records = []
    header = None
    chunks = []
    with open(in_fa, 'r') as f_in:
        for line in f_in:
            if line.startswith('>'):
                if header is not None:
                    records.append((header, ''.join(chunks)))
                header = line.strip()[1:]
                chunks = []
            elif header is not None:
                chunks.append(line)
    # Flush the last record; nothing is added for a file with no headers.
    if header is not None:
        records.append((header, ''.join(chunks)))
    return records


def split_fasta_by_count(in_fa, is_seq, cnt, out_dir):
    """Split a FASTA file into several smaller FASTA files.

    Output files are written to *out_dir* (created if missing) and named
    ``<base>_<i>.fa`` where ``<base>`` is the input file name without its
    directory and without a trailing ``.fasta`` / ``.fa`` extension.
    Records are written in input order.

    Args:
        in_fa: Path of the input FASTA file.
        is_seq: If true, *cnt* is the number of sequences per output file;
            otherwise *cnt* is the number of output files.
        cnt: Positive integer, interpreted according to *is_seq*.
        out_dir: Directory that receives the output files.

    Raises:
        ValueError: If *cnt* is not positive.
    """
    if cnt <= 0:
        raise ValueError("count must be a positive integer, got %r" % (cnt,))
    if not os.path.isdir(out_dir):
        # makedirs so that nested output directories work too.
        os.makedirs(out_dir)

    # A list (not a dict) keeps input order and duplicate IDs intact.
    records = _read_fasta_records(in_fa)
    total_seq_cnt = len(records)

    # Exact ceiling division; round(x + 0.5) is off by one for some inputs.
    per_cnt = -(-total_seq_cnt // cnt)
    if is_seq:
        file_cnt, seq_cnt = per_cnt, cnt
    else:
        file_cnt, seq_cnt = cnt, per_cnt

    # Use only the file name, and strip the extension only at the end, so
    # that directory parts of in_fa never leak into the output path.
    fn = os.path.basename(in_fa)
    for suffix in ('.fasta', '.fa'):
        if fn.endswith(suffix):
            fn = fn[:-len(suffix)]
            break

    for i in range(file_cnt):
        out_path = os.path.join(out_dir, "%s_%d.fa" % (fn, i))
        with open(out_path, 'w') as f_out:
            for header, seq in records[i * seq_cnt:(i + 1) * seq_cnt]:
                # The input's final line may lack a newline; terminate it so
                # every record ends cleanly.
                if seq and not seq.endswith('\n'):
                    seq += '\n'
                f_out.write(">%s\n%s" % (header, seq))
if __name__ == "__main__":
    # Four positional arguments are required after the script name:
    # input FASTA, split mode (S or F), count, output directory.
    if len(sys.argv) >= 5:
        in_fa = sys.argv[1]
        # 's' (any case) means count is sequences per file; anything else
        # is treated as a number of files.
        is_seq = sys.argv[2].lower() == 's'
        cnt = int(sys.argv[3])
        out_dir = sys.argv[4]
        split_fasta_by_count(in_fa, is_seq, cnt, out_dir)
    else:
        print("Usage: python "+sys.argv[0]+" <in_fasta> <S/F> <count> <out_dir>")