-
Notifications
You must be signed in to change notification settings - Fork 0
/
combine24.pl
98 lines (77 loc) · 2.64 KB
/
combine24.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
## combine CSV files version 0.24 Jerry Lin 03/11/2014
## Take PubChem CSV files and LINCS CID list and combine into single files
## Usage: perl combine24.pl [index file/CID] [output file]
use warnings;
use strict;
use Text::CSV;
## -----------Read target GI/AID file-----------------
## _load_id_map($path)
##   Read a tab-separated mapping file and return it as a hash (list of
##   key/value pairs): column 1 is the key, column 2 is the value.
##   Every line is upper-cased before it is split, so keys and values are
##   stored in upper case.  Lines whose first column is empty or "0" are
##   skipped.  A line with only one column stores an undefined value.
##   If the file cannot be opened a warning is printed and an empty map is
##   returned, so the script still runs (with blank uniprot columns).
sub _load_id_map {
    my ($path) = @_;
    my %uniprot_for;

    my $fh;
    unless (open $fh, '<', $path) {
        warn "Cannot open mapping file '$path': $!\n";
        return %uniprot_for;
    }

    while (my $line = <$fh>) {
        chomp $line;
        $line = uc $line;
        my @col = split "\t", $line;
        if ($col[0]) { $uniprot_for{$col[0]} = $col[1]; }
    }#while
    close $fh;

    return %uniprot_for;
}

my %gi_uniprot  = _load_id_map('gi2uniprot.tsv');   #gi to uniprot hash
my %aid_uniprot = _load_id_map('aid2uniprot.tsv');  #AID to uniprot hash
##------------------------------------------------------
## Combine the index file with the per-CID PubChem assay CSV files.
##
## Arguments (both optional):
##   $ARGV[0]  tab-separated index file with a title row (default input.tsv)
##   $ARGV[1]  output file                               (default combine.csv)
##
## For every index row the CID is taken from column index 7 and the file
## CID_<cid>_assaydata.csv is looked up in the current directory.
##   * File missing: "<first index column>\t<cid>" is written to misslog.csv.
##   * File present: every data row of the CSV is written to the output as
##     index columns + CSV columns + uniprot ID, all tab-separated.
## CSV column index 8 (target) is reduced to the digits following "gi:" or
## blanked.  The uniprot ID is looked up by that GI number; only when no GI
## number is present is the upper-cased column index 5 (AID) used instead.
my $indexfile = "input.tsv";    #input/index file
if ($ARGV[0]) { $indexfile = $ARGV[0]; }
my $outfile = "combine.csv";    #output/combined file
if ($ARGV[1]) { $outfile = $ARGV[1]; }

# Three-argument open keeps a command-line path from being read as an open
# mode.  The index is opened first so a bad path cannot truncate old outputs.
open my $index_fh, '<', $indexfile
    or die "Cannot open index file '$indexfile': $!\n";
open my $out_fh, '>', $outfile
    or die "Cannot open output file '$outfile': $!\n";
open my $miss_fh, '>', 'misslog.csv'    #log file for missing CIDs
    or die "Cannot open log file 'misslog.csv': $!\n";

scalar <$index_fh>;    ## skip title row of the index file
my $csv = Text::CSV->new({ sep_char => ',' });

while (my $line = <$index_fh>) {
    chomp $line;
    my @field = split "\t", $line;
    my $cid = "";
    if ($field[7]) { $cid = $field[7]; }
    my $filename = 'CID_' . $cid . '_assaydata.csv';

    if (!-e $filename) {
        my $first_col = defined $field[0] ? $field[0] : '';
        print {$miss_fh} "$first_col\t$cid\n";
        next;
    }

    print "\nProcessing CID_$cid";
    my $csv_fh;
    unless (open $csv_fh, '<', $filename) {
        warn "\nCannot open '$filename': $!\n";
        next;
    }
    scalar <$csv_fh>;    # skip title row of csv file

    while (my $row = <$csv_fh>) {
        chomp $row;
        print ".";

        my @field2;
        if ($csv->parse($row)) {
            @field2 = $csv->fields();
        } else {
            # The row is still written (index columns only), as before,
            # but the parse failure is no longer silent.
            warn "\nCannot parse line $. of '$filename'\n";
        }

        # Reduce the target column to its GI number.  A list-context match
        # yields undef on failure, so no stale capture can be picked up.
        if ($field2[8]) {
            my ($gi) = $field2[8] =~ /gi:(\d+)/;
            $field2[8] = $gi ? $gi : "";
        } # endif ($field2[8])

        my $uniprot = "";
        if ($field2[8]) {
            # GI test: a GI number that is not in the map leaves the
            # uniprot column empty; there is no fallback to the AID.
            if ($gi_uniprot{$field2[8]}) {
                $uniprot = $gi_uniprot{$field2[8]};
            }
        } elsif ($field2[5]) {
            # AID test: only reached when no GI number is present
            my $assay = uc $field2[5];
            if ($aid_uniprot{$assay}) { $uniprot = $aid_uniprot{$assay}; }
        }

        print {$out_fh} join("\t", @field, @field2, $uniprot), "\n";
    }#while(<$csv_fh>)
    close $csv_fh;
}# end while(<$index_fh>)

close $index_fh;
# Buffered write errors only surface at close, so check the write handles.
close $out_fh  or die "Error writing '$outfile': $!\n";
close $miss_fh or die "Error writing 'misslog.csv': $!\n";