-
Notifications
You must be signed in to change notification settings - Fork 0
/
groupTreemix.py
51 lines (42 loc) · 1.47 KB
/
groupTreemix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 10 15:14:08 2019
@author: YudongCai
@Email: yudongcai216@gmail.com
"""
import click
import numpy as np
import pandas as pd
from collections import defaultdict
from collections import OrderedDict
def _read_groups(groupfile):
    """Parse the group file into a mapping of group ID -> list of sample IDs.

    Each non-blank line must hold exactly two whitespace-separated fields:
    ``sampleID groupID``. Blank lines are ignored.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    groups = defaultdict(list)
    with open(groupfile) as f:
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # A blank line (commonly a trailing newline at end of file)
                # carries no assignment; skip it instead of crashing.
                continue
            if len(fields) != 2:
                raise ValueError(
                    '{}: line {}: expected 2 columns (sampleID groupID), '
                    'got {}'.format(groupfile, lineno, len(fields)))
            sample, group = fields
            groups[group].append(sample)
    return groups


def _split_counts(frame, position):
    """Return an int64 frame with the `position`-th comma-separated value of each cell.

    Works column by column through the ``.str`` accessor, so it does not
    depend on ``DataFrame.applymap`` (deprecated in recent pandas).
    """
    return frame.apply(
        lambda column: column.str.split(',').str[position].astype('int64'))


# NOTE: click shows the docstring of ``main`` as the --help text, so it is
# kept to a single line; the details live in these comments instead.
#
# treemix   -- whitespace-separated table with a header row of sample IDs;
#              every cell holds two comma-separated integers ("a,b").
# groupfile -- two whitespace-separated columns: sampleID groupID.
# outfile   -- destination path; written gzip-compressed when the text after
#              the last '.' is 'gz'.
#
# For every group, the first values of its samples are summed row-wise and
# likewise the second values; the output has one column per group with cells
# of the form "<sum of firsts>,<sum of seconds>".
@click.command()
@click.option('--treemix', required=True, help='treemix input file')
@click.option('--groupfile', required=True, help='two cols, sampleID groupID')
@click.option('--outfile', required=True, help='output file name')
def main(treemix, groupfile, outfile):
    """
    merge treemix input file according to their group (--groupfile)
    """
    groups = _read_groups(groupfile)

    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    df = pd.read_csv(treemix, sep=r'\s+', header=0)
    first_counts = _split_counts(df, 0)
    second_counts = _split_counts(df, 1)

    # Sum each half of the "a,b" pairs over the samples of every group.
    fdf = OrderedDict()
    sdf = OrderedDict()
    for group, samples in groups.items():
        fdf[group] = first_counts[samples].sum(axis=1)
        sdf[group] = second_counts[samples].sum(axis=1)
    fdf = pd.DataFrame(fdf)
    sdf = pd.DataFrame(sdf)

    # Re-join the two summed halves into "a,b" cells.
    mdf = fdf.astype('str') + ',' + sdf.astype('str')

    comp = 'gzip' if outfile.split('.')[-1] == 'gz' else None
    mdf.to_csv(outfile, sep=' ', index=False, compression=comp)
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()