-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_impute.py
103 lines (85 loc) · 2.89 KB
/
split_impute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re
import subprocess
from subprocess import PIPE
import sys
import time
import argparse
import gzip
def get_wc(impute):
    """Count the lines of an impute file with the system ``wc`` utility.

    A path containing ``.gz`` is decompressed with ``zcat`` and piped into
    ``wc -l``; any other path is fed to ``wc -l`` on standard input.

    Args:
        impute: Path of the (optionally gzipped) impute file.

    Returns:
        The output of ``wc -l`` with surrounding whitespace stripped. This is
        the raw command output, so callers convert it with ``int()``.

    Raises:
        IOError: If the command writes to stderr or exits with a non-zero
            status (for example, the file does not exist or ``zcat`` cannot
            decompress it).
    """
    # shlex.quote is the Python 3 name; pipes.quote is the Python 2 spelling.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # The command is run through a shell, so the path must be quoted or a
    # name with spaces/metacharacters is split or interpreted by the shell.
    quoted_path = quote(impute)
    if '.gz' in impute:
        print(
            'DETECTED AN GZIPPED {} , PLEASE WAIT WHILE LINES ARE BEING COUNTED'.format(impute)
        )
        wc_command = 'zcat ' + quoted_path + ' | wc -l'
    else:
        wc_command = 'wc -l < ' + quoted_path

    wc_ = subprocess.Popen(wc_command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = wc_.communicate()

    # In the pipeline form the exit status is that of ``wc``, so a failing
    # ``zcat`` is only visible through stderr; check both signals.
    if stderr or wc_.returncode != 0:
        detail = stderr.decode('utf-8', 'replace').strip()
        raise IOError(
            'ERROR TRY AGAIN NOT A VALID FILE OR CHECK DIRECTORY FOR FILE: %s' % detail
        )
    return stdout.strip()
def make_file(chunkid, chunk_n):
    """Build the output filename for one chunk.

    Only the name is produced; no file is created or opened here.

    Args:
        chunkid: Prefix identifying this set of chunks.
        chunk_n: Number of the chunk; rendered with its string form.

    Returns:
        The filename ``<chunkid>_<chunk_n>.gen``.
    """
    return '{}_{}.gen'.format(chunkid, chunk_n)
def _open_chunk(chunkid, chunk_n, gzipped):
    """Open the output file for chunk number ``chunk_n`` for binary writing."""
    name = make_file(chunkid, chunk_n)
    if gzipped:
        return gzip.open(name + '.gz', 'wb')
    return open(name, 'wb')


def main(argv=None):
    """Split an impute ``.gen`` file into a requested number of line chunks.

    Command-line options:
        -impute   path of the input file (read through gzip when the path
                  contains ``.gz``)
        -chunks   number of chunks to produce
        -chunkid  prefix used for the output filenames
        -gz       ``1`` to gzip the output, ``0`` (the default) for plain text

    Each chunk receives ``line_count // chunks`` lines; lines left over when
    the count is not evenly divisible are appended to the last chunk, so the
    number of output files never exceeds the number requested.

    Args:
        argv: Optional list of argument strings. When ``None`` the arguments
            are taken from ``sys.argv`` by ``argparse``.

    Raises:
        ValueError: If ``-chunks`` is smaller than 1 or larger than the
            number of lines counted in the input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-impute', help='A impute file in the gen format', required=True)
    parser.add_argument('-chunks', type=int,
                        help='how many chunks you want to divide the files into',
                        required=True)
    parser.add_argument('-chunkid', help='A string for chunkid', required=True)
    # The flag is optional, so it needs a default; without one the value is
    # None and cannot be interpreted as a number.
    parser.add_argument('-gz', choices=['1', '0'], default='0',
                        help='Should the output be gzipped? (1 -yes or 0 -no)',
                        required=False)
    args = parser.parse_args(argv)
    impute = args.impute      # the impute file
    chunks = args.chunks      # number of chunks
    chunkid = args.chunkid    # string ID for chunks
    gzipped = args.gz == '1'
    print(args)

    # Validate before any output file is created so bad input leaves no
    # stray files behind.
    if chunks < 1:
        raise ValueError('CHUNKS MUST BE A POSITIVE INTEGER')
    line_num = int(get_wc(impute))
    lines_chunk = line_num // chunks
    print('REQUESTED {} CHUNKS CALCULATED {} AS LINE LOAD PER CHUNK '.format(chunks, lines_chunk))
    # A per-chunk load of zero means there are fewer lines than chunks.
    if lines_chunk < 1:
        raise ValueError('REQUESTED MORE CHUNKS THAN TOTAL LINE_NUM')

    if gzipped:
        print('REQUESTED OUTPUT IN GZIP FORMAT')

    # Both sides are opened in binary mode so lines are copied byte for byte
    # whether or not the input and output are gzipped.
    open_input = gzip.open if '.gz' in impute else open
    file_handle = open_input(impute, 'rb')
    try:
        chunk_track = 1
        lines_for_chunk = 0
        outfile = _open_chunk(chunkid, chunk_track, gzipped)
        try:
            for line_track, line in enumerate(file_handle, 1):
                if line_track % 10000 == 0:
                    percent_processed = '{0:.1f}'.format(
                        float(line_track) / float(line_num) * 100
                    )
                    print(' <<<<< {} PROCESSED LINES {} % percent done from {} >>>>>'.format(
                        line_track, percent_processed, impute))
                # Roll over to the next chunk once this one is full, except
                # on the last chunk, which absorbs any remainder lines.
                if lines_for_chunk == lines_chunk and chunk_track < chunks:
                    outfile.close()
                    chunk_track += 1
                    outfile = _open_chunk(chunkid, chunk_track, gzipped)
                    lines_for_chunk = 0
                outfile.write(line)
                lines_for_chunk += 1
        finally:
            outfile.close()
    finally:
        file_handle.close()
# Run the splitter only when this file is executed as a script.
if __name__ == "__main__":
    main()