Skip to content

Commit

Permalink
16 bit integers (#13)
Browse files: browse the repository at this point in the history
* 8-bit -> 16-bit integers

* deleted objects from index, fixed emission table

* added gitignore

* fixed test samples

Co-authored-by: Bogdan Kirilenko <kirilenk@hiller-mac-21.mpi-cbg.de>
  • Loading branch information
kirilenkobm and Bogdan Kirilenko authored Nov 20, 2020
1 parent d3d85f6 commit 4ef55db
Show file tree
Hide file tree
Showing 29 changed files with 111 additions and 108 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.idea/
src/*.o
cesar
2 changes: 1 addition & 1 deletion extra/samples/example1.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
gCCTGGGAACTTCACCTACCACATCCCTGTCAGTAGTGGCACCCCACTGCACCTCAGCCTGACTCTGCAGATGaa
>mouse
ctttcctcatttcctcaggcttcagtatagcatgaggctgaggaggagagagggagaccggcaaagtggccttgcttaggtaccatctttgcccctttaggCTTGGCAACTTCACCTACCACATCCCTGTCAGCAGCAGCACACCACTGCACCTCAGCCTGACCCTGCAGATGaagtgagtgctggtgtgtgggtatgtgtgggggaccatgtggaagccctcagaaaagtgaaagccaagtgcttactaaatttattacgtggagggtccaggc
ctttcctcatttcctcaggcttcagtatagcatgaggctgaggaggagagagggagaccggcaaagtggccttgcttaggtaccatctttgcccctttagGCTTGGCAACTTCACCTACCACATCCCTGTCAGCAGCAGCACACCACTGCACCTCAGCCTGACCCTGCAGATGAAgtgagtgctggtgtgtgggtatgtgtgggggaccatgtggaagccctcagaaaagtgaaagccaagtgcttactaaatttattacgtggagggtccaggc
2 changes: 1 addition & 1 deletion extra/samples/sample0.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
aaCCCAAACCCAAACCCAAACCCAAACCCaa
>query
cttccccatttttatctcatagacCCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa
cttccccatttttatctcatagACCCCAAACCCAAACCCAAACCCAAACCCAAgtaaaaa
2 changes: 1 addition & 1 deletion extra/samples/sample1.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
aaCCCAAACCCAAACCCAAACCCAAACCCaa
>query
cttccccatttttatctcatag--CCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa
cttccccatttttatctcatag--CCCAAACCCAAACCCAAACCCAAACCCAAgtaaaaa
2 changes: 1 addition & 1 deletion extra/samples/sample2.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
aCCCAAACCCAAACCCAAACCCAAACCCaa
>query
cttccccatttttatctcatag-CCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa
cttccccatttttatctcatag-CCCAAACCCAAACCCAAACCCAAACCCAAgtaaaaa
2 changes: 1 addition & 1 deletion extra/samples/sample3.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
aaCCCAAACCCAAACCCAAACCCAAACCCaa
>query
cttccccatttttatctcatagacCCCAAACCCAAACCCAAACCCAAACCC--gtaaaaa
cttccccatttttatctcatagACCCCAAACCCAAACCCAAACCCAAACCC--gtaaaaa
2 changes: 1 addition & 1 deletion extra/samples/sample4.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
aaCCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCC
>query
cttccccatttttatctcatagac------------------------------------------------------------------------CCC------------------------------
cttccccatttttatctcatagAC------------------------------------------------------------------------CCC------------------------------
2 changes: 1 addition & 1 deletion extra/samples/sample5.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
aaCCTAAACCCAAACCCAAACCC
>query
cttccccatttttatctcatagacCCT------------------
cttccccatttttatctcatagACCCT------------------
2 changes: 1 addition & 1 deletion extra/samples/sample6.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>referenceExon
aaCCCAAACCCAAACCCAAACCCAAACCCaa
>query
cttccccatttttatctcatagacCCCAAA-CCAAACCCAAACCCAAACCCaagtaaaaa
cttccccatttttatctcatagACCCCAAA-CCAAACCCAAACCCAAACCCAAgtaaaaa
20 changes: 10 additions & 10 deletions src/Alignment.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
bool find_best_deletion(struct State* deletion, struct EmissionTable* emission_table, Literal* reference, Literal* query, char* result) {

double prob=LOGODD_NEGINF, highest_prob=LOGODD_NEGINF;
uint_fast8_t best_pos=0, pos=0;
uint_fast16_t best_pos=0, pos=0;
Literal lookup_query[3];

if (deletion->num_emissions == 2) {
for(; pos < 3; pos++) {
for (uint_fast8_t i=0; i < 3; i++) {
for (uint_fast16_t i=0; i < 3; i++) {
if (pos == i) {
lookup_query[i] = LITERAL_N;
continue;
Expand All @@ -47,7 +47,7 @@ bool find_best_deletion(struct State* deletion, struct EmissionTable* emission_t
prob = EmissionTable__by_literals(emission_table, reference, lookup_query);
if (prob >= highest_prob) {
highest_prob = prob;
for (uint_fast8_t j=0; j < 3; j++) {
for (uint_fast16_t j=0; j < 3; j++) {
result[j] = Literal__char(lookup_query[j]);
}
result[pos] = '-';
Expand All @@ -63,7 +63,7 @@ bool find_best_deletion(struct State* deletion, struct EmissionTable* emission_t
} else if (deletion->num_emissions == 1) {
strncpy(result, "---", 3);
for(; pos < 3; pos++) {
for (uint_fast8_t i=0; i < 3; i++) {
for (uint_fast16_t i=0; i < 3; i++) {
if (pos == i) {
lookup_query[i] = query[0];
} else {
Expand Down Expand Up @@ -97,14 +97,14 @@ bool find_best_deletion(struct State* deletion, struct EmissionTable* emission_t
* @param path the Viterbi path.
* @return the alignment.
*/
struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struct Params* params, size_t path_length, struct State** path) {
uint8_t reference_id = 0;
struct Alignment* Alignment__create(struct Fasta* fasta, uint16_t query_id, struct Params* params, size_t path_length, struct State** path) {
uint16_t reference_id = 0;
const char lower = 'a' - 'A';
int numAlignedRefChars = 0; /* for sanity check that the entire reference seq is contained in the alignment */

struct Alignment* self = (struct Alignment*) SAFEMALLOC(sizeof(struct Alignment));
size_t length = fasta->queries[query_id]->length;
for (uint8_t ref_id=0; ref_id < fasta->num_references; ref_id++) {
for (uint16_t ref_id=0; ref_id < fasta->num_references; ref_id++) {
length += fasta->references[ref_id]->length;
}
self->reference = (char*) SAFECALLOC(sizeof(char), length+1+20*fasta->num_references);
Expand All @@ -114,10 +114,10 @@ struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struc
char* deletion;
char bases[4] = "";

uint_fast8_t pending_deletion=0;
uint_fast16_t pending_deletion=0;
size_t q = 0, r = 0, t = 0;
for (size_t i=1; i < path_length; i++) {
uint8_t j=0, emissions = path[i]->num_emissions;
uint16_t j=0, emissions = path[i]->num_emissions;

// deleting 1nt and 2nt will always emit 3 bases/dashes to maintain reading frame intact.
if (!strncmp("delete_1nt", path[i]->name, 10) || !strncmp("delete_2nt", path[i]->name, 10)) {
Expand Down Expand Up @@ -281,7 +281,7 @@ struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struc

/* sanity check that the entire reference seq is contained in the alignment */
int totalRefLen = 0;
for (uint8_t i=0; i < fasta->num_references; i++) {
for (uint16_t i=0; i < fasta->num_references; i++) {
totalRefLen += fasta->references[i]->length;
}
if (numAlignedRefChars != totalRefLen) {
Expand Down
2 changes: 1 addition & 1 deletion src/Alignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ typedef struct Alignment {
char* query;
} Alignment;

struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struct Params* params, size_t path_length, struct State** sequence);
struct Alignment* Alignment__create(struct Fasta* fasta, uint16_t query_id, struct Params* params, size_t path_length, struct State** sequence);
bool Alignment__destroy(struct Alignment* self);

#endif // ALIGNMENT_H_
2 changes: 1 addition & 1 deletion src/Arguments.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ bool Arguments__read(int argc, char** argv, struct Params* parameters) {
die("Insufficient number of arguments. Please provide at least an input file.");
}

uint8_t num_input_files = 0;
uint16_t num_input_files = 0;
int i = 1; // [0] is the cesar binary itself.
while (argv[i] && i < argc) {
char* argument = argv[i];
Expand Down
16 changes: 8 additions & 8 deletions src/Cesar.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ int main(int argc, char* argv[argc]) {
logv(1, "prefix %s\n", prefix);

/* load profiles for each reference exon */
for (uint8_t i=0; i < fasta.num_references; i++) {
for (uint16_t i=0; i < fasta.num_references; i++) {
struct Sequence* reference = fasta.references[i];

/* load acceptor or first codon profile */
Expand Down Expand Up @@ -161,15 +161,15 @@ int main(int argc, char* argv[argc]) {
size_t rlength = 0;
size_t qlength = 0;
size_t qlength_max = 0;
for (uint8_t i=0; i < fasta.num_references; i++) {
for (uint16_t i=0; i < fasta.num_references; i++) {
struct Sequence* reference = fasta.references[i];
num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6; /* 22 and 6 for acc and donor states */

logv(1, "Reference %u length: %lu", i, fasta.references[i]->length);
logv(1, "Reference %u split codon lengths: %u %u", i, fasta.references[i]->start_split_length, fasta.references[i]->end_split_length);
rlength += fasta.references[i]->length;
}
for (uint8_t i=0; i < fasta.num_queries; i++) {
for (uint16_t i=0; i < fasta.num_queries; i++) {
logv(1, "Query %u length: %lu", i, fasta.queries[i]->length);
qlength += fasta.queries[i]->length;
if (fasta.queries[i]->length > qlength_max) {
Expand Down Expand Up @@ -197,14 +197,14 @@ int main(int argc, char* argv[argc]) {

if (g_loglevel >= 7) {
char* tmp;
for (uint8_t i=0; i < fasta.num_references; i++) {
for (uint16_t i=0; i < fasta.num_references; i++) {
tmp = SAFECALLOC(sizeof(char), SEQUENCENAMELENGTH + fasta.references[i]->length);
Literal__str(fasta.references[i]->length, fasta.references[i]->sequence, tmp);
logv(1, ">original %s\n%s", fasta.references[i]->name, tmp);
free(tmp);
}

for (uint8_t i=0; i < fasta.num_queries; i++) {
for (uint16_t i=0; i < fasta.num_queries; i++) {
tmp = SAFECALLOC(sizeof(char), SEQUENCENAMELENGTH + fasta.queries[i]->length);
Literal__str(fasta.queries[i]->length, fasta.queries[i]->sequence, tmp);
logv(1, ">original %s\n%s", fasta.queries[i]->name, tmp);
Expand All @@ -222,10 +222,10 @@ int main(int argc, char* argv[argc]) {
fclose(dotfile);
}

for (uint8_t q=0; q < fasta.num_queries; q++) {
for (uint16_t q=0; q < fasta.num_queries; q++) {

size_t length = 2*fasta.queries[q]->length;
for (uint8_t i=0; i < fasta.num_references; i++) {
for (uint16_t i=0; i < fasta.num_references; i++) {
length += fasta.references[i]->length;
}

Expand All @@ -246,7 +246,7 @@ int main(int argc, char* argv[argc]) {
}

HMM__destroy(hmm);
for (uint8_t i=0; i<fasta.num_references; i++) {
for (uint16_t i=0; i<fasta.num_references; i++) {
if(acceptors[i] != NULL) {
Profile__destroy(acceptors[i]);
}
Expand Down
24 changes: 12 additions & 12 deletions src/EmissionTable.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ bool EmissionTable__read(struct EmissionTable* self, char* filename) {
// corresponds to the line number.
if (i==0) {
Literal* codon = (Literal*) SAFEMALLOC(sizeof(Literal) * self->num_literals);
for (uint8_t j=0; j < self->num_literals; j++) {
for (uint16_t j=0; j < self->num_literals; j++) {
codon[j] = Literal__from_char(token[j]);
}

Expand Down Expand Up @@ -121,7 +121,7 @@ bool EmissionTable__set(struct EmissionTable* self, Literal sequence[], LOGODD_T
bool result = true;
// for each query, set logodd concerning
EMISSION_ID_T row = Literal__uint(self->num_literals, sequence);
for (uint8_t column=0; column < pow(4, self->num_literals); column++) {
for (uint16_t column=0; column < pow(4, self->num_literals); column++) {
result &= LogoddMatrix__set(self->values, column, row, logodd);
if (!result) {
break;
Expand All @@ -143,7 +143,7 @@ bool EmissionTable__forbid(struct EmissionTable* self, Literal sequence[]) {
/**
* Recursively check for Ns in query and replace them by A,C,T,G while recording those in visited.
*/
void EmissionTable__variants(uint8_t num_literals, Literal query[num_literals], uint8_t position, bool visited[]) {
void EmissionTable__variants(uint16_t num_literals, Literal query[num_literals], uint16_t position, bool visited[]) {
if (g_loglevel >= 6) {
char tmp[4] = "";
Literal__str(num_literals, query, tmp);
Expand All @@ -160,7 +160,7 @@ void EmissionTable__variants(uint8_t num_literals, Literal query[num_literals],
}

Literal copy[4];
for (uint8_t i=0; i < num_literals; i++) {
for (uint16_t i=0; i < num_literals; i++) {
copy[i] = query[i];
}

Expand All @@ -178,8 +178,8 @@ void EmissionTable__variants(uint8_t num_literals, Literal query[num_literals],
* @return the probability to emit query.
*/
LOGODD_T EmissionTable__by_literals(struct EmissionTable* self, Literal reference[], Literal query[]) {
uint8_t reference_Ns = Literal__Ns(self->num_literals, reference);
uint8_t query_Ns = Literal__Ns(self->num_literals, query);
uint16_t reference_Ns = Literal__Ns(self->num_literals, reference);
uint16_t query_Ns = Literal__Ns(self->num_literals, query);

if (query_Ns == 0 && reference_Ns == 0) {
EMISSION_ID_T row = Literal__uint(self->num_literals, reference);
Expand All @@ -199,7 +199,7 @@ LOGODD_T EmissionTable__by_literals(struct EmissionTable* self, Literal referenc
bool* visited_reference = (bool*) SAFECALLOC(sizeof(bool), self->values->num_columns);
EmissionTable__variants(self->num_literals, reference, 0, visited_reference);

uint8_t reference_visits = 0;
uint16_t reference_visits = 0;
double total_sum = 0;
for (EMISSION_ID_T row = 0; row < self->values->num_rows; row++) {

Expand All @@ -211,7 +211,7 @@ LOGODD_T EmissionTable__by_literals(struct EmissionTable* self, Literal referenc
bool* visited_query = (bool*) SAFECALLOC(sizeof(bool), self->values->num_columns);
EmissionTable__variants(self->num_literals, query, 0, visited_query);

uint8_t query_visits = 0;
uint16_t query_visits = 0;
double sum = 0;
for (EMISSION_ID_T column = 0; column < self->values->num_columns; column++) {
if (!visited_query[column]) {
Expand Down Expand Up @@ -271,14 +271,14 @@ bool EmissionTable__init(struct EmissionTable* self, EMISSION_ID_T num_emissions
* @param codons an array of a series of codons (e.g. [TAATGATAG] for all stop codons).
* @return success boolean.
*/
bool EmissionTable__init_single_codons(struct EmissionTable* self, uint8_t num_codons, Literal codons[num_codons]) {
bool EmissionTable__init_single_codons(struct EmissionTable* self, EMISSION_ID_T num_codons, Literal codons[num_codons]) {
EmissionTable__init(self, 3, LAMBDA_DISTRIBUTION);
LOGODD_T one_nth = 1.0 - Logodd__log((double) num_codons);

for(uint8_t i = 0; i < num_codons; i++) {
for(uint16_t i = 0; i < num_codons; i++) {
// for each query, set logodd concerning
EMISSION_ID_T row = Literal__uint(self->num_literals, &codons[3*i]);
for (uint8_t j = 0; j < num_codons; j++) {
for (uint16_t j = 0; j < num_codons; j++) {
EMISSION_ID_T column = Literal__uint(self->num_literals, &codons[3*j]);
if (! LogoddMatrix__set(self->values, column, row, one_nth)) {
return false;
Expand Down Expand Up @@ -324,7 +324,7 @@ bool EmissionTable__str(struct EmissionTable* self, char buffer[]) {

bool EmissionTable__emittable(struct EmissionTable* self, EMISSION_ID_T row) {
bool emittable = false;
uint8_t column = 0;
uint16_t column = 0;
while (column < self->values->num_columns && !emittable) {
emittable = EmissionTable__get(self, column++, row) != LOGODD_NEGINF;
}
Expand Down
4 changes: 2 additions & 2 deletions src/EmissionTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
typedef struct EmissionTable {
LogoddMatrix* values;
Distribution distribution;
uint8_t num_literals;
uint16_t num_literals;
} EmissionTable;


Expand All @@ -27,7 +27,7 @@ bool EmissionTable__init_single_codons(struct EmissionTable* self, EMISSION_ID_T
bool EmissionTable__destroy(struct EmissionTable* self);
bool EmissionTable__set(struct EmissionTable* self, Literal sequence[], LOGODD_T logodd);
bool EmissionTable__forbid(struct EmissionTable* self, Literal sequence[]);
LOGODD_T EmissionTable__get(struct EmissionTable* self, uint_fast8_t column, uint_fast8_t row);
LOGODD_T EmissionTable__get(struct EmissionTable* self, uint_fast16_t column, uint_fast16_t row);
bool EmissionTable__str(struct EmissionTable* self, char buffer[]);
bool EmissionTable__emittable(struct EmissionTable* self, EMISSION_ID_T reference);

Expand Down
12 changes: 6 additions & 6 deletions src/Fasta.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ bool Fasta__init(struct Fasta* self) {
* @return success boolean.
*/
bool Fasta__destroy(struct Fasta* self) {
for(uint8_t i = 0; i < self->num_references; i++) {
for(uint16_t i = 0; i < self->num_references; i++) {
Sequence__destroy(self->references[i]);
}
free(self->references);

for(uint8_t i = 0; i < self->num_queries; i++) {
for(uint16_t i = 0; i < self->num_queries; i++) {
Sequence__destroy(self->queries[i]);
}
free(self->queries);
Expand Down Expand Up @@ -103,10 +103,10 @@ bool Fasta__read(struct Fasta* self, char* filename) {
struct Sequence* sequence = SAFECALLOC(sizeof(Sequence), 1);
Sequence__init(sequence);

uint8_t name_length = 0;
uint8_t acc_length = 0;
uint8_t do_length = 0;
uint8_t state = 0;
uint16_t name_length = 0;
uint16_t acc_length = 0;
uint16_t do_length = 0;
uint16_t state = 0;
size_t lineno = 0;
bool reached_codons = false;
bool reached_queries = false;
Expand Down
4 changes: 2 additions & 2 deletions src/Fasta.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
#include "Sequence.h"

typedef struct Fasta {
uint8_t num_references;
uint8_t num_queries;
uint16_t num_references;
uint16_t num_queries;
struct Sequence** references;
struct Sequence** queries;
} Fasta;
Expand Down
Loading

0 comments on commit 4ef55db

Please sign in to comment.