-
Notifications
You must be signed in to change notification settings - Fork 0
/
synDataGenRun.py
118 lines (100 loc) · 4.18 KB
/
synDataGenRun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# Written by Jeff Robertson (thejar@vt.edu)
# Uses seq_gen.py written by Jake Martinez (jrm98@vt.edu)
# and orange_pipeline.py
from seq_gen import generate
from subprocess import call
import time
import random
import shutil
import os
import sys
import pdb
def main():
timestamp = time.strftime("%y-%m-%d_%H-%M-%S")
random.seed(time.time())
resdir = "datasets_" + timestamp
os.mkdir(resdir)
with open("dataGenRun_" + timestamp + '.log', 'w') as logFile:
# print and log s
def out(s):
print(str(s))
print >>logFile, time.asctime(), str(s)
logFile.flush()
try:
# number of fasta files/datasets to generate and run
dataSetCount = 50
# motif width range (inclusive)
wRange = (7, 13)
# number of sequences
N = 35
# sequence length
n = 1000
# maximum starting location of motif in sequence
maxloc = -1
# minimum starting location of motif in sequence
minloc = 0
# output directory, used when calling pipeline and for saving results
directory = "./"
# max number of motifs in a single sequence
nmotifs = 1
# test with dyad motifs?
dyad = False
out("Starting synDataGenRun with:")
out(timestamp)
out(dataSetCount)
out(wRange)
out(N)
out(n)
out(maxloc)
out(minloc)
out(directory)
out(nmotifs)
for dataSetNum in xrange(dataSetCount):
# output filename, used when calling pipeline and for saving results
output = time.strftime("%y-%m-%d_%H-%M-%S.fasta")
# motif length
w = random.randint(wRange[0], wRange[1])
# max SNPs/motif
e = random.randint(1, w/4)
# prob of a SNP in the motif
ep = random.randint(25, 80) / 100.0
# prob motif is in sequence
P = random.randint(65, 100) / 100.0
out("Generating data set #" + str(dataSetNum) + " with:")
out(output)
out(w)
out(e)
out(ep)
out(P)
# gen positive fasta
generate(w, N, n, e, ep, P, minloc, maxloc, output, directory, False, nmotifs, dyad)
# gen negative fasta
generate(w, N, n, e, ep, P, minloc, maxloc, "neg_"+output, directory, True, nmotifs, dyad)
#get motif, is there a better way?
with open(directory+output, 'r') as fin:
motif = fin.readline().split("'")[1]
out("datasets generated with motif " + motif + ", running pipeline:")
cmdLine = ('./orange_pipeline.py -c synData.cfg -w ' + str(len(motif)) + ' ' + directory + output +
' ' + directory + 'neg_' + output)
out(cmdLine)
with open(resdir+'/'+motif+'.log', 'w') as lout:
call([cmdLine], shell=True, stdout=lout)
out("pipeline finished, copying data and results to "+resdir+'/'+motif)
# file not necessary, 2/3 the space
try:
os.unlink('results/cmfSeeds')
except:
# if the file didn't exist, that means the pipeline didn't run (or at least cmf) and something is wrong
pdb.set_trace()
shutil.copytree('results', resdir+'/'+motif)
shutil.copyfile(directory+output, resdir+'/'+motif+'.fasta')
out("done copying")
# delete fasta files
for f in filter(lambda a: output.split('.')[0] in a and 'fasta' in a,
os.listdir(directory)):
out("deleting " + directory+f)
os.unlink(directory+f)
except:
out(sys.exc_info())
if __name__ == "__main__":
main()