-
Notifications
You must be signed in to change notification settings - Fork 1
/
pre_EBSeq.py
56 lines (46 loc) · 1.79 KB
/
pre_EBSeq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
# Author: rachel.legendre@pasteur.fr
from os.path import basename, join
from os import getcwd, system, environ
import argparse
from shutil import copyfile
import tempfile
import csv
from collections import Counter
import pandas as pd
def _build_expression_table(inputs):
    """Merge one count column per (filename, cond) pair into one DataFrame.

    Each file is read as a tab-separated table. Columns 0, 1 and 4 are kept
    (positionally), a column named "expected_count" is renamed to ``cond``,
    and the per-sample frames are inner-joined on gene_id/transcript_id.
    """
    # Local import: reduce is a builtin only on Python 2.
    from functools import reduce

    frames = []
    for filename, cond in inputs:
        # Read the first two columns as str so identifiers keep their text.
        frame = pd.read_csv(filename, sep='\t', converters={0: str, 1: str})
        # Keep only the two identifier columns and the fifth column
        # (presumably RSEM's "expected_count" -- selected by position).
        frame = frame.iloc[:, [0, 1, 4]]
        # Name the count column after the condition so samples don't collide.
        frame = frame.rename(columns={"expected_count": cond})
        frames.append(frame)

    return reduce(
        lambda left, right: pd.merge(left, right,
                                     on=['gene_id', 'transcript_id']),
        frames)


def _write_ig_vector(gene_ids, path):
    """Write one integer per entry of gene_ids: how often that id occurs."""
    gene_order = list(gene_ids)
    counts = Counter(gene_order)
    # Text mode: the lines are str, which a binary handle rejects on Python 3.
    with open(path, 'w') as handle:
        handle.writelines('%d\n' % counts[gene] for gene in gene_order)


def __main__(argv=None):
    """Build the expression table and the Ig vector from count tables.

    Command-line options:
      --inputs FILE COND  a tab-separated count table and the column name its
                          "expected_count" column gets in the merged table;
                          repeat the option once per sample.
      --outtable PATH     output path of the merged expression table
                          (tab-separated, written without an index column).
      --outvector PATH    output path of the Ig vector: a single column of
                          integers, one per row of the expression table,
                          giving the number of rows sharing that row's
                          gene_id.

    ``argv`` is the argument list to parse; None (the default) makes argparse
    read sys.argv[1:]. Invalid or missing options end the program through
    ``parser.error`` (SystemExit with a usage message).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--inputs', action='append', nargs='*')
    parser.add_argument('--outvector')
    parser.add_argument('--outtable')
    args = parser.parse_args(argv)

    if not args.inputs:
        parser.error('at least one "--inputs FILE COND" pair is required')
    for pair in args.inputs:
        if len(pair) != 2:
            parser.error('--inputs expects exactly two values (FILE COND), '
                         'got %d' % len(pair))
    if args.outtable is None or args.outvector is None:
        parser.error('--outtable and --outvector are both required')

    # Build the expression table from the count column of every input.
    expression = _build_expression_table(args.inputs)
    expression.to_csv(args.outtable, index=False, sep="\t")

    # The Ig vector follows the row order of the expression table.
    _write_ig_vector(expression['gene_id'], args.outvector)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()