forked from deweylab/RSEM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Transcripts.h
145 lines (117 loc) · 3.92 KB
/
Transcripts.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*
* transcripts are numbered from 1. 0 is reserved for noise isoform
*/
#ifndef TRANSCRIPTS_H_
#define TRANSCRIPTS_H_
#include<cstdio>
#include<cstdlib>
#include<cassert>
#include<fstream>
#include<vector>
#include<algorithm>
#include<map>
#include<string>
#include "utils.h"
#include "my_assert.h"
#include "Transcript.h"
class Transcripts {
public:
Transcripts(int type = 0) {
M = 0; this->type = type;
transcripts.clear();
transcripts.push_back(Transcript());
e2i.clear(); i2e.clear();
}
int getM() { return M; }
// used in shrinking the transcripts
void setM(int M) { this->M = M; transcripts.resize(M + 1); }
void move(int from, int to) {
assert(from >= to);
if (from > to) transcripts[to] = transcripts[from];
}
int getType() { return type; }
void setType(int type) { this->type = type; }
bool isAlleleSpecific() { return type == 2; }
const Transcript& getTranscriptAt(int pos) {
assert(pos > 0 && pos <= M);
return transcripts[pos];
}
void add(const Transcript& transcript) {
transcripts.push_back(transcript);
++M;
}
void sort() {
std::sort(transcripts.begin(), transcripts.end());
}
void readFrom(const char*);
void writeTo(const char*);
//Eid: external sid
int getInternalSid(int eid) {
assert(eid > 0 && eid <= M);
return e2i[eid];
}
const Transcript& getTranscriptViaEid(int eid) {
return transcripts[getInternalSid(eid)];
}
void buildMappings(int, char**, const char* = NULL);
private:
int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific
std::vector<Transcript> transcripts;
std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
};
void Transcripts::readFrom(const char* inpF) {
std::string line;
std::ifstream fin(inpF);
if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
fin>>M>>type;
getline(fin, line);
transcripts.assign(M + 1, Transcript());
for (int i = 1; i <= M; i++) {
transcripts[i].read(fin);
}
fin.close();
}
void Transcripts::writeTo(const char* outF) {
std::ofstream fout(outF);
fout<<M<<" "<<type<<std::endl;
for (int i = 1; i <= M; i++) {
transcripts[i].write(fout);
}
fout.close();
}
void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
std::map<std::string, int> dict;
std::map<std::string, int>::iterator iter;
std::vector<bool> appeared;
general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
general_assert(n_targets <= M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
if (n_targets < M) fprintf(stderr, "Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)! Please make sure that you aligned your reads against transcript sequences instead of genome.\n", n_targets, M);
dict.clear();
for (int i = 1; i <= M; i++) {
const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
iter = dict.find(tid);
general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
dict[tid] = i;
}
e2i.assign(M + 1, 0);
i2e.assign(M + 1, 0);
appeared.assign(M + 1, false);
for (int i = 0; i < n_targets; i++) {
iter = dict.find(std::string(target_name[i]));
general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
e2i[i + 1] = iter->second;
i2e[iter->second] = i + 1;
iter->second = -1;
appeared[e2i[i + 1]] = true;
}
if (imdName != NULL) {
char omitF[STRLEN];
sprintf(omitF, "%s.omit", imdName);
FILE *fo = fopen(omitF, "w");
for (int i = 1; i <= M; i++)
if (!appeared[i]) fprintf(fo, "%d\n", i);
fclose(fo);
}
}
#endif /* TRANSCRIPTS_H_ */