This repository has been archived by the owner on Dec 24, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dna.cpp
184 lines (151 loc) · 4.97 KB
/
Dna.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
//
// Created by arrouan on 01/10/18.
//
#include "Dna.h"
#include <cassert>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <cstring>
Dna::Dna(int dna_length, Threefry::Gen &&rng) : n_bytes_(ceil((dna_length + PROM_SIZE) / 8.0)), length_(dna_length)
{
seq_ = std::vector<uint8_t>(n_bytes_);
// Generate a random genome
size_t byte_idx;
size_t bit_idx;
// Fill DNA with random data
for (int i = 0; i < dna_length; ++i)
{
unsigned int x = rng.random(NB_BASE);
byte_idx = i / 8;
bit_idx = i % 8;
seq_[byte_idx] |= x << bit_idx;
// Mirror bit
if (i < PROM_SIZE)
{
byte_idx = (i + dna_length) / 8;
bit_idx = (i + dna_length) % 8;
seq_[byte_idx] |= x << bit_idx;
}
}
}
void Dna::print() const
{
for (int i = 0; i < length_; ++i)
{
if (i % 8 == 0 && i != 0)
{
std::cout << " ";
}
std::cout << (bool)(seq_[i / 8] >> (i % 8) & uint8_t(1));
}
for (int i = length_; i < length_ + PROM_SIZE; i++)
{
if (i % 8 == 0 && i != 0)
{
std::cout << " ";
}
std::cout << (bool)(seq_[i / 8] >> (i % 8) & uint8_t(1));
}
std::cout << std::endl;
}
void Dna::save(gzFile backup_file)
{
size_t n_bytes = n_bytes_;
size_t length = length_;
gzwrite(backup_file, &n_bytes, sizeof(n_bytes));
gzwrite(backup_file, &length, sizeof(length_));
gzwrite(backup_file, seq_.data(), n_bytes * sizeof(seq_[0]));
}
void Dna::load(gzFile backup_file)
{
size_t n_bytes;
size_t length;
gzread(backup_file, &n_bytes, sizeof(n_bytes));
gzread(backup_file, &length, sizeof(length));
uint8_t tmp_seq[n_bytes];
gzread(backup_file, tmp_seq, n_bytes * sizeof(tmp_seq[0]));
length_ = length;
n_bytes_ = n_bytes;
seq_ = std::vector<uint8_t>(tmp_seq, tmp_seq + n_bytes);
}
void Dna::set(int pos, bool val)
{
size_t byte_idx = pos / 8;
size_t bit_idx = pos % 8;
// Clear bit, then set its value
seq_[byte_idx] &= ~(1 << bit_idx);
seq_[byte_idx] |= val << bit_idx;
// Mirror bit in the last region of the DNA (22 lsb)
if (pos < PROM_SIZE)
{
byte_idx = (pos + length_) / 8;
bit_idx = (pos + length_) % 8;
seq_[byte_idx] &= ~(1 << bit_idx);
seq_[byte_idx] |= val << bit_idx;
}
}
void Dna::do_switch(int pos)
{
size_t byte_idx = pos / 8;
size_t bit_idx = pos % 8;
seq_[byte_idx] ^= 1 << bit_idx;
// Mirror bit in the last region of the DNA (22 lsb)
if (pos < PROM_SIZE)
{
byte_idx = (pos + length_) / 8;
bit_idx = (pos + length_) % 8;
seq_[byte_idx] ^= 1 << bit_idx;
}
}
int Dna::promoter_at(int pos)
{
size_t byte_idx = pos / 8;
size_t bit_idx = pos % 8;
// Retrieve 4 bytes of DNA including the 22 bits that interests us
uint32_t dna_partial = *(uint32_t *)(seq_.data() + byte_idx);
// Chop chop 🔪 the DNA to 22 bits and calculate its Hamming distance to the promoter sequence:
// - First we aligning the bit that interests us (the bit at position `pos` in the DNA) on the right hand side of our 4 bytes.
// - Then we only keeping the 22 least significant bits so that we can easily compute the Hamming distance to the promoter sequence.
uint32_t dna_to_cmp = (dna_partial >> bit_idx) & PROM_MASK;
return __builtin_popcount(dna_to_cmp ^ PROM_SEQ);
}
// Given a, b, c, d boolean variable and X random boolean variable,
// a terminator look like : a b c d X X X !d !c !b !a
int Dna::terminator_at(int pos)
{
size_t byte_idx = pos / 8;
size_t bit_idx = pos % 8;
// Retrieve 4 bytes of DNA including the 11 bits that interests us
// We are obliged to get 4 bytes because worst case scenario is the position being on the right
uint32_t dna_partial = *(uint32_t *)(seq_.data() + byte_idx);
uint16_t dna_to_cmp = (dna_partial >> bit_idx) & TERM_MASK;
return TERM_DIST_LOOKUP[dna_to_cmp];
}
bool Dna::shine_dal_start(int pos)
{
size_t byte_idx = pos / 8;
size_t bit_idx = pos % 8;
uint32_t dna_partial = *(uint32_t *)(seq_.data() + byte_idx);
uint16_t dna_to_cmp = (dna_partial >> bit_idx) & SHINE_DAL_SEQ_MASK;
uint8_t dist = __builtin_popcount(dna_to_cmp ^ SHINE_DAL_SEQ);
// The sequence is only considered a start if it is strictly equal to the Shine-Dalgarno sequence 011011****000
return dist == 0;
}
bool Dna::protein_stop(int pos)
{
size_t byte_idx = pos / 8;
size_t bit_idx = pos % 8;
uint16_t dna_partial = *(uint16_t *)(seq_.data() + byte_idx);
uint16_t dna_to_cmp = (dna_partial >> bit_idx) & PROTEIN_END_MASK;
uint8_t dist = __builtin_popcount(dna_to_cmp ^ PROTEIN_END);
return dist == 0;
}
int Dna::codon_at(int pos)
{
size_t byte_idx = pos / 8;
size_t bit_idx = pos % 8;
uint16_t dna_partial = *(uint16_t *)(seq_.data() + byte_idx);
uint8_t codon = (dna_partial >> bit_idx) & CODON_MASK;
return CODON_VALUE_LOOKUP[codon];
}