Skip to content

Commit

Permalink
simplified start/stop codon handling
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Hiller committed Aug 24, 2017
1 parent 49e59eb commit 7e45ba3
Show file tree
Hide file tree
Showing 14 changed files with 668 additions and 537 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ doc:
.PHONY: test
test: ${objects} cesar
tests.sh ./cesar
make -C test


.PHONY: test/valgrind
test/valgrind: ${objects} cesar
Expand Down
3 changes: 0 additions & 3 deletions extra/tables/human/firstCodon_profile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,3 @@ A T C G
0.453 0.063 0.103 0.380
0.281 0.123 0.395 0.201
0.180 0.076 0.465 0.280
0.999 0.000 0.001 0.000
0.000 0.999 0.000 0.000
0.000 0.001 0.000 0.999
27 changes: 22 additions & 5 deletions src/Alignment.c
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,9 @@ struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struc
} else if (!strncmp("split_codon", path[i]->name, 11)) {
self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]) + lower;
self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]) + lower;
} else if (!strncmp("match_codon", path[i]->name, 11)) {
} else if (!strncmp("start_codon", path[i]->name, 11) || !strncmp("stop_codon", path[i]->name, 10) || !strncmp("match_codon", path[i]->name, 11)) {
self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]);
self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]);
} else if (!strncmp("match_split", path[i]->name, 11)) {
self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]) + lower;
self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]) + lower;
} else if (!strncmp("match", path[i]->name, 5)) { // donor, acceptor
self->reference[t+j] = ' ';
self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]) + lower;
Expand Down Expand Up @@ -195,6 +192,25 @@ struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struc
self->query[t+j] = '-';
}
}

// between_acc_split -> between_match_ins
if (!strncmp("between_acc_split", path[i-1]->name, 17) &&
!strncmp("between_match_ins", path[i]->name, 17)) {
for (; j < 3; j++) {
self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]);
self->query[t+j] = '-';
}
}

// end_codon -> between_split_donor
if (!strncmp("end_codon", path[i-1]->name, 9) &&
!strncmp("between_match_donor", path[i]->name, 19)) {
for (; j < 3; j++) {
self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]);
self->query[t+j] = '-';
}
}

if (i > 0 && !strncmp("between_split_donor", path[i]->name, 19) &&
!strncmp("end_codon", path[i-1]->name, 9) && path[i-1]->custom ==
fasta->references[reference_id]->num_codons-1) {
Expand All @@ -217,9 +233,10 @@ struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struc
char bases[4] = "";
Literal__str(path[i]->num_emissions, path[i]->reference, bases);
for (; j < 3*deleted_codons; j++) {
if (r >= fasta->references[reference_id]->length && reference_id < fasta->num_references+1) {
if (r >= fasta->references[reference_id]->length && reference_id <= fasta->num_references) {
r = 0;
reference_id++;
logv(3, "next reference: %u", reference_id)
}
self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r]);
self->query[t+j] = '-';
Expand Down
4 changes: 2 additions & 2 deletions src/Cesar.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ char g_loglevel = LOGLEVEL;
* The main program.
*/
int main(int argc, char* argv[argc]) {
struct EmissionTable emission_tables[6];
struct EmissionTable emission_tables[8];

struct Params parameters;
Params__create(&parameters, emission_tables);
Expand Down Expand Up @@ -112,7 +112,7 @@ int main(int argc, char* argv[argc]) {
logv(1, "Query %u length: %lu", i, fasta.queries[i]->length);
qlength += fasta.queries[i]->length;
}
float mem = 7e-9*(rlength*qlength + 4*rlength); // in GB. Factor of 17 bytes is taken from measurements.
float mem = 7e-9*(rlength*qlength + 4*rlength); // in GB. Factor 7e-9 is taken from measurements.
if (mem > (float)parameters.max_memory) {
die("The memory consumption is limited to %u GB by default. Your attempt requires %u GB. You can change the limit via --max-memory.", parameters.max_memory, (uint8_t)mem);
} else {
Expand Down
25 changes: 25 additions & 0 deletions src/EmissionTable.c
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,31 @@ bool EmissionTable__init(struct EmissionTable* self, EMISSION_ID_T num_emissions
return true;
}

/**
* Set the emission probability to one n-th for each of n codons.
* @param self the EmissionTable.
* @param num_codons the number of codons (n).
* @param codons an array of a series of codons (e.g. [TAATGATAG] for all stop codons).
* @return success boolean.
*/
bool EmissionTable__init_single_codons(struct EmissionTable* self, uint8_t num_codons, Literal codons[num_codons]) {
EmissionTable__init(self, 3, LAMBDA_DISTRIBUTION);
LOGODD_T one_nth = 1.0 - Logodd__log((double) num_codons);

for(uint8_t i = 0; i < num_codons; i++) {
// for each query, set logodd concerning
EMISSION_ID_T row = Literal__uint(self->num_literals, &codons[3*i]);
for (uint8_t j = 0; j < num_codons; j++) {
EMISSION_ID_T column = Literal__uint(self->num_literals, &codons[3*j]);
if (! LogoddMatrix__set(self->values, column, row, one_nth)) {
return false;
}
}
}
return true;
}


/**
* Destroy an emission table's sub structure.
* Notice: This only frees memory that has been allocated during init, not the given pointer itself.
Expand Down
1 change: 1 addition & 0 deletions src/EmissionTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ typedef struct EmissionTable {
bool EmissionTable__read(struct EmissionTable* self, char* filename);
LOGODD_T EmissionTable__by_literals(struct EmissionTable* self, Literal reference[], Literal query[]);
bool EmissionTable__init(struct EmissionTable* self, EMISSION_ID_T num_literals, Distribution distribution);
bool EmissionTable__init_single_codons(struct EmissionTable* self, EMISSION_ID_T num_literals, Literal codons[]);
bool EmissionTable__destroy(struct EmissionTable* self);
bool EmissionTable__set(struct EmissionTable* self, Literal sequence[], LOGODD_T logodd);
bool EmissionTable__forbid(struct EmissionTable* self, Literal sequence[]);
Expand Down
168 changes: 127 additions & 41 deletions src/Model.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,13 @@ bool forward_deletions(struct HMM* hmm, struct Params* params, size_t num_codons
break;
}
if (i == 0) {
State__add_incoming(codons[i], params->multiple_cd_factors[j] + params->cd_acc, codons[i+j+2]);
logv(5, "cd_acc: %e from: %lu to: %lu", Logodd__exp(params->multiple_cd_factors[j]+params->cd_acc), i, i+j+2);
State__add_incoming(codons[i], params->multiple_cd_factors[j] + params->cd_acc, codons[i+j+2]);
} else if (num_codons == i+j+2) {
State__add_incoming(codons[i], params->multiple_cd_factors[j] + params->cd_do, codons[i+j+2]);
logv(5, "cd_do: %e from: %lu to: %lu", Logodd__exp(params->multiple_cd_factors[j]+params->cd_do), i, i+j+2);
State__add_incoming(codons[i], params->multiple_cd_factors[j] + params->cd_do, codons[i+j+2]);
} else {
logv(5, "cd_do: %e from: %lu to: %lu", Logodd__exp(params->multiple_cd_factors[j] * Logodd__exp(params->cd_logodd)), i, i+j+2);
State__add_incoming(codons[i], Logodd__log(params->multiple_cd_factors[j] * Logodd__exp(params->cd_logodd)), codons[i+j+2]);
}
}
Expand All @@ -81,15 +82,15 @@ bool match_codon(struct HMM* hmm, Params* params, size_t index, Literal codon[3]
char codon_str[4] = "";
Literal__str(3, codon, codon_str);

/*
for (uint8_t i=0; i < params->num_stop_codons; i++) {
if (codon[0] == params->stop_codons[i*3] &&
codon[0+1] == params->stop_codons[i*3+1] &&
codon[0+2] == params->stop_codons[i*3+2]) {
die("Reference contains full stop codon %s: Codon %lu", codon_str, index);
if (!last) {
for (uint8_t i=0; i < params->num_stop_codons; i++) {
if (codon[0] == params->stop_codons[i*3] &&
codon[0+1] == params->stop_codons[i*3+1] &&
codon[0+2] == params->stop_codons[i*3+2]) {
die("Reference contains full stop codon %s: Codon %lu", codon_str, index);
}
}
}
*/

// match_curr_codon
struct State* match_codon = HMM__new_state(hmm);
Expand All @@ -100,6 +101,7 @@ bool match_codon(struct HMM* hmm, Params* params, size_t index, Literal codon[3]
struct State* next_cluster = HMM__new_state(hmm);

State__init(match_codon, "match_codon", 3, codon, params->emission_table_64_LAMBDA);
logv(4, "Match: %c%c%c", Literal__char(codon[0]), Literal__char(codon[1]), Literal__char(codon[2]));
match_codon->custom = index;
State__init_uniform(insert_codon_codon, "insert_codon_codon", 3, params->emission_table_61_LAMBDA);
insert_codon_codon->custom = index;
Expand Down Expand Up @@ -194,6 +196,67 @@ struct HMM* multi_exon(struct Params* params, struct Fasta* fasta, struct Profil
logv(1, "There are %i references.", fasta->num_references);
for (uint8_t i=0; i < fasta->num_references; i++) {
struct Sequence* reference = fasta->references[i];
bool reference_has_start = false;
bool reference_has_stop = false;

if (i==0) {
/* Find out whether the reference has a start codon.
* This has several possible cases:
* 1. The reference has a split codon --> no start codon to be expected; warn user if she expects otherwise (-f)
* ~2. The user used -f / --first-exon~
* 3. The user didn't give -f but the reference begins with start codon. --> warn user
*/
reference_has_start = reference->start_split_length == 0;
if (!reference_has_start && params->firstexon) {
warn("Found split codon at acc-site, but you declared -f/--firstexon");
}
//reference_has_start &= params->firstexon;
bool match = true;
if (reference_has_start) {
for(uint8_t j = 0; j < params->num_start_codons; j++) {
match = true;
for(uint8_t k = 0; k < 3; k++) {
match &= reference->sequence[k] == params->start_codons[3*j+k];
}
if (match) {
break;
}
}
if (params->firstexon && !match) {
warn("Couldn't find declared start codon in reference %i. (counting from zero)", i);
}
}
reference_has_start &= match;
}
logv(1, "reference[%i] has start: %s", i, reference_has_start ? "true" : "false");

if (i==fasta->num_references-1) {
reference_has_stop = reference->end_split_length == 0;
if (!reference_has_stop && params->lastexon) {
warn("Found split codon at do-site, but you declared -l/--lastexon");
}
bool match = true;
if (reference_has_stop) {
for(uint8_t j = 0; j < params->num_stop_codons; j++) {
match = true;
for(uint8_t k = 0; k < 3; k++) {
logv(1, "reference[%i]->sequence[%i-3-%i] == params->stop-codons[3*%i+%i]: %c == %c", i,
reference->num_codon_bases, k, j, k,
Literal__char(reference->sequence[reference->start_split_length+reference->end_split_length+reference->num_codon_bases-3+k]),
Literal__char(params->stop_codons[3*j+k]));
match &= reference->sequence[reference->start_split_length+reference->end_split_length+reference->num_codon_bases-3+k] == params->stop_codons[3*j+k];
}
if (match) {
break;
}
}
if (params->lastexon && !match) {
warn("Couldn't find declared stop codon in reference %i. (counting from zero)", i);
}
}
reference_has_stop &= match;
logv(1, "reference[%i] has stop: %s", i, reference_has_stop ? "true" : "false");
}

// intron start
struct State* intron_start;
Expand All @@ -218,20 +281,25 @@ struct HMM* multi_exon(struct Params* params, struct Fasta* fasta, struct Profil
State__add_incoming(intron_start, params->b2_bas, between_acc_split);
State__add_incoming(match_acceptor_end, 0, between_acc_split);

logv(1, "reference->start_split_length: %u", reference->start_split_length);
logv(1, "reference[%i]->start_split_length: %u", i, reference->start_split_length);
struct State* split_codon_acceptor = HMM__new_state(hmm);
switch (reference->start_split_length) {
case 0:
State__init_silent(split_codon_acceptor, "split_codon_acceptor");
break;
case 1:
State__init_uniform(split_codon_acceptor, "split_codon_acceptor", 1, params->emission_table_4_UNIFORM);
break;
case 2:
State__init_uniform(split_codon_acceptor, "split_codon_acceptor", 2, params->emission_table_16_UNIFORM);
break;
default:
die("Invalid number of split codon nucleotides: %u", reference->start_split_length);
if (reference_has_start) {
// start
State__init(split_codon_acceptor, "match_codon", 3, params->start_codons, params->emission_table_start_LAMBDA);
} else {
switch (reference->start_split_length) {
case 0:
State__init_silent(split_codon_acceptor, "split_codon_acceptor");
break;
case 1:
State__init_uniform(split_codon_acceptor, "split_codon_acceptor", 1, params->emission_table_4_UNIFORM);
break;
case 2:
State__init_uniform(split_codon_acceptor, "split_codon_acceptor", 2, params->emission_table_16_UNIFORM);
break;
default:
die("Invalid number of split codon nucleotides: %u", reference->start_split_length);
}
}
State__add_incoming(between_acc_split, Logodd__log(1.0 - Logodd__exp(params->fs_logodd)), split_codon_acceptor);

Expand All @@ -244,13 +312,17 @@ struct HMM* multi_exon(struct Params* params, struct Fasta* fasta, struct Profil
State__add_incoming(insert_codon_acceptor, params->i3_i1_acc, insert_codon_acceptor);

struct State* between_split_ins = HMM__new_state(hmm);
State__init_silent(between_split_ins, "between_split_ins");
if (reference_has_start) {
State__init_silent(between_split_ins, "between_match_ins");
} else {
State__init_silent(between_split_ins, "between_split_ins");
}
State__add_incoming(split_codon_acceptor, 0.0, between_split_ins);
State__add_incoming(between_acc_split, params->fs_logodd, between_split_ins);
State__add_incoming(between_split_ins, params->splice_nti, insert_1nt_acceptor);
State__add_incoming(between_split_ins, params->splice_i1, insert_codon_acceptor);

// codons
// others
struct State* start_first_codon = HMM__new_state(hmm);
State__init_silent(start_first_codon, "start_first_codon");
State__add_incoming(between_split_ins, params->splice_js, start_first_codon);
Expand All @@ -262,35 +334,49 @@ struct HMM* multi_exon(struct Params* params, struct Fasta* fasta, struct Profil
struct State** codons = (struct State**) SAFEMALLOC(sizeof(struct State*) * (2 + reference->num_codons));

// Iterate over Codons
create_codon_chain(hmm, params, &num_codons, codons, reference->num_codon_bases, &reference->sequence[reference->codons_offset], start_first_codon, &end_last_codon);
uint8_t start_offset = 0;
if (reference_has_start) start_offset = 3;
uint8_t stop_offset = 0;
if (reference_has_stop) stop_offset = 3;

create_codon_chain(hmm, params, &num_codons, codons, reference->num_codon_bases-start_offset-stop_offset, &reference->sequence[reference->codons_offset+start_offset], start_first_codon, &end_last_codon);

logv(1, "Codons:\t%lu", num_codons);
assert(num_codons == reference->num_codons);
logv(1, "Non-Start/Stop-Codons:\t%lu", num_codons);
assert(num_codons == reference->num_codons - ((stop_offset+start_offset)/3));

forward_deletions(hmm, params, num_codons, codons);

free(codons);

// donor
logv(1, "reference->end_split_length: %u", reference->end_split_length);
logv(1, "reference[%i]->end_split_length: %u", i, reference->end_split_length);
struct State* split_codon_donor = HMM__new_state(hmm);
switch (reference->end_split_length) {
case 0:
State__init_silent(split_codon_donor, "split_codon_donor");
break;
case 1:
State__init_uniform(split_codon_donor, "split_codon_donor", 1, params->emission_table_4_UNIFORM);
break;
case 2:
State__init_uniform(split_codon_donor, "split_codon_donor", 2, params->emission_table_16_UNIFORM);
break;
default:
die("Invalid number of split codon nucleotides in file %s: %u", params->fasta_file, params->split_emissions_donor);
if (reference_has_stop) {
// match the stop
State__init(split_codon_donor, "match_codon", 3, params->stop_codons, params->emission_table_stop_LAMBDA);
} else {
switch (reference->end_split_length) {
case 0:
State__init_silent(split_codon_donor, "split_codon_donor");
break;
case 1:
State__init_uniform(split_codon_donor, "split_codon_donor", 1, params->emission_table_4_UNIFORM);
break;
case 2:
State__init_uniform(split_codon_donor, "split_codon_donor", 2, params->emission_table_16_UNIFORM);
break;
default:
die("Invalid number of split codon nucleotides in file %s: %u", params->fasta_file, params->split_emissions_donor);
}
}
State__add_incoming(end_last_codon, params->js_scd, split_codon_donor);

struct State* between_split_donor = HMM__new_state(hmm);
State__init_silent(between_split_donor, "between_split_donor");
if (reference_has_stop) {
State__init_silent(between_split_donor, "between_match_donor");
} else {
State__init_silent(between_split_donor, "between_split_donor");
}
split_codons[num_split_codons++] = between_split_donor;

State__add_incoming(end_last_codon, params->fs_logodd, between_split_donor);
Expand Down
Loading

0 comments on commit 7e45ba3

Please sign in to comment.