Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Experiment compacting counttables into a nodetable given a max abundance criterion #1874

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions include/oxli/hashtable.hh
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,24 @@ class Nodetable : public oxli::MurmurHashtable
public:
// Construct a Nodetable for k-mers of length `ksize`, handing the
// MurmurHashtable base a newly allocated BitStorage built from `sizes`.
explicit Nodetable(WordLength ksize, std::vector<uint64_t> sizes)
: MurmurHashtable(ksize, new BitStorage(sizes)) { } ;

// Seed this table from the counts held in `other`.
//
// The raw tables of `other` and the threshold `max` are forwarded to the
// underlying store's compose_init(). When the two tables do not report the
// same table sizes, a message is written to std::cerr and nothing is changed.
void compose_init(Counttable& other, BoundedCounterType max)
{
    const auto own_sizes = get_tablesizes();
    const auto other_sizes = other.get_tablesizes();
    const bool compatible = !(own_sizes != other_sizes);

    if (compatible) {
        store->compose_init(other.get_raw_tables(), max);
    } else {
        std::cerr << "Table size mismatch\n";
    }
}

// Refine this table with the counts held in `other`.
//
// The raw tables of `other` and the threshold `max` are forwarded to the
// underlying store's compose_update(). When the two tables do not report the
// same table sizes, a message is written to std::cerr and nothing is changed.
void compose_update(Counttable& other, BoundedCounterType max)
{
    const auto own_sizes = get_tablesizes();
    const auto other_sizes = other.get_tablesizes();
    const bool compatible = !(own_sizes != other_sizes);

    if (compatible) {
        store->compose_update(other.get_raw_tables(), max);
    } else {
        std::cerr << "Table size mismatch\n";
    }
}
};

}
Expand Down
41 changes: 37 additions & 4 deletions include/oxli/storage.hh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class Storage
protected:
bool _supports_bigcount;
bool _use_bigcount;
Byte ** _counts;

public:
// Default-construct with bigcount support and use both disabled.
// `_counts` starts out null so that a storage which never attaches count
// tables holds a well-defined pointer rather than an indeterminate one.
// NOTE(review): presumably each concrete storage assigns `_counts` when it
// allocates its tables — confirm for every subclass.
Storage() : _supports_bigcount(false), _use_bigcount(false), _counts(nullptr) { } ;
Expand All @@ -75,6 +76,42 @@ public:

void set_use_bigcount(bool b);
bool get_use_bigcount();

// Initialise this table's presence bits from a table of per-bin counts.
//
// For every row and bin reported by n_tables()/get_tablesizes(), the bit for
// that bin is set when the matching count in `othertable` is <= `max`.
// No bit is ever cleared by this call.
//
// othertable: one array per row, read as one count per bin
//             (othertable[row][bin]); must cover the same table sizes.
// max:        largest count for which the bin's bit is set.
//
// NOTE(review): assumes `_counts` packs eight bins per byte (bit `bin % 8` of
// byte `bin / 8`) — confirm against the concrete storage class.
// NOTE(review): any occupancy bookkeeping kept by the concrete storage is not
// touched here — confirm whether that matters to callers.
inline void compose_init(Byte **othertable, BoundedCounterType max)
{
    const size_t num_rows = n_tables();
    const std::vector<uint64_t> row_sizes = get_tablesizes();

    for (size_t row = 0; row < num_rows; row++) {
        for (uint64_t bin = 0; bin < row_sizes[row]; bin++) {
            const BoundedCounterType count = othertable[row][bin];
            if (count <= max) {
                // Address the byte that holds this bin. Indexing `_counts`
                // by `bin` itself would flip a bit in the wrong byte and run
                // past the end of a bit-packed row.
                const uint64_t byte = bin / 8;
                const Byte mask = static_cast<Byte>(1U << (bin % 8));
                _counts[row][byte] |= mask;
            }
        }
    }
}

// Narrow this table's presence bits using another table of per-bin counts.
//
// For every row and bin reported by n_tables()/get_tablesizes(), the bin's
// bit stays set only if it is already set and the matching count in
// `othertable` is <= `max`; otherwise it ends up cleared. No bit that is
// currently clear is ever set by this call.
//
// othertable: one array per row, read as one count per bin
//             (othertable[row][bin]); must cover the same table sizes.
// max:        largest count for which an already-set bit is kept.
//
// NOTE(review): assumes `_counts` packs eight bins per byte (bit `bin % 8` of
// byte `bin / 8`) — confirm against the concrete storage class.
// NOTE(review): any occupancy bookkeeping kept by the concrete storage is not
// touched here — confirm whether that matters to callers.
inline void compose_update(Byte **othertable, BoundedCounterType max)
{
    const size_t num_rows = n_tables();
    const std::vector<uint64_t> row_sizes = get_tablesizes();

    for (size_t row = 0; row < num_rows; row++) {
        for (uint64_t bin = 0; bin < row_sizes[row]; bin++) {
            const BoundedCounterType count = othertable[row][bin];
            if (count > max) {
                // Address the byte that holds this bin, not byte number
                // `bin`, which would corrupt unrelated bins and overrun a
                // bit-packed row.
                const uint64_t byte = bin / 8;
                const Byte mask = static_cast<Byte>(1U << (bin % 8));
                _counts[row][byte] &= static_cast<Byte>(~mask);
            }
            // count <= max: a set bit stays set and a clear bit stays clear,
            // so there is nothing to write.
        }
    }
}
};


Expand All @@ -99,7 +136,6 @@ protected:
size_t _n_tables;
uint64_t _occupied_bins;
uint64_t _n_unique_kmers;
Byte ** _counts;

public:
BitStorage(std::vector<uint64_t>& tablesizes) :
Expand Down Expand Up @@ -252,7 +288,6 @@ protected:
uint64_t _n_unique_kmers;
std::array<std::mutex, 32> mutexes;
static constexpr uint8_t _max_count{15};
Byte ** _counts;

// Compute index into the table, this retrieves the correct byte
// which you then need to select the correct nibble from
Expand Down Expand Up @@ -496,8 +531,6 @@ protected:
uint64_t _n_unique_kmers;
uint64_t _occupied_bins;

Byte ** _counts;

// initialize counts with empty hashtables.
void _allocate_counters()
{
Expand Down
2 changes: 2 additions & 0 deletions khmer/_oxli/graphs.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli" nogil:

# Cython view of the C++ class oxli::Nodetable (derived from
# CpMurmurHashtable): its constructor plus the two compose methods, each
# taking a CpCounttable reference and a BoundedCounterType threshold.
# NOTE(review): the compose methods are declared without an `except` clause,
# so a C++ exception thrown from them would not be translated — confirm they
# cannot throw.
cdef cppclass CpNodetable "oxli::Nodetable" (CpMurmurHashtable):
CpNodetable(WordLength, vector[uint64_t])
void compose_init(CpCounttable&, BoundedCounterType)
void compose_update(CpCounttable&, BoundedCounterType)

cdef cppclass CpQFCounttable "oxli::QFCounttable" (CpHashtable):
CpQFCounttable(WordLength, uint64_t) except +oxli_raise_py_error
Expand Down
6 changes: 6 additions & 0 deletions khmer/_oxli/graphs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,12 @@ cdef class Nodetable(Hashtable):
self._nt_this = make_shared[CpNodetable](k, _primes)
self._ht_this = <shared_ptr[CpHashtable]>self._nt_this

# Python entry point for the C++ Nodetable compose_init(): dereferences this
# object's wrapped Nodetable and the Counttable held by `ct`, and passes `max`
# through as the threshold.
# NOTE(review): `max` is typed `int` here but BoundedCounterType in the extern
# declaration — behaviour for negative or very large values is not checked
# in this wrapper; confirm.
def compose_init(self, Counttable ct, int max):
deref(self._nt_this).compose_init(deref(ct._ct_this), max)

# Python entry point for the C++ Nodetable compose_update(): dereferences this
# object's wrapped Nodetable and the Counttable held by `ct`, and passes `max`
# through as the threshold.
# NOTE(review): `max` is typed `int` here but BoundedCounterType in the extern
# declaration — behaviour for negative or very large values is not checked
# in this wrapper; confirm.
def compose_update(self, Counttable ct, int max):
deref(self._nt_this).compose_update(deref(ct._ct_this), max)


cdef class Hashgraph(Hashtable):

Expand Down
30 changes: 30 additions & 0 deletions tests/test_counttable.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,33 @@ def test_init_with_primes(sketchtype):
primes = khmer.get_n_primes_near_x(4, random.randint(1000, 2000))
sketch = sketchtype(31, 1, 1, primes=primes)
assert sketch.hashsizes() == primes


# Checks Nodetable.compose_init / compose_update: two Counttables are filled
# with different abundances, composed into one Nodetable with a threshold of
# 3, and the Nodetable is then queried for each k-mer.
def test_compose():
# Two count tables and (below) one node table, all built with the same
# arguments (k=21, 1e4, 4).
counts1 = khmer.Counttable(21, 1e4, 4)
counts2 = khmer.Counttable(21, 1e4, 4)

# Added once to each table.
counts1.add('GATTACAGATTACAGATTACA')
counts2.add('GATTACAGATTACAGATTACA')

# Added once to counts1 and under a range(5) loop to counts2.
counts1.add('TAGATCTGCTTGAAACAAGTG')
for _ in range(5):
counts2.add('TAGATCTGCTTGAAACAAGTG')

# NOTE(review): indentation is not recoverable in this view, so it is unclear
# whether the counts2.add calls in the next two groups sit inside or after
# the range(5) loops — confirm against the real file.
for _ in range(5):
counts1.add('AAGTGGATTTGAGAAAAAAGT')
counts2.add('AAGTGGATTTGAGAAAAAAGT')

for _ in range(5):
counts1.add('GGGGGGGGGGGGGGGGGGGGG')
counts2.add('GGGGGGGGGGGGGGGGGGGGG')

# Compose: initialise from counts1, then update with counts2, both with max=3.
kmers = khmer.Nodetable(21, 1e4, 4)
kmers.compose_init(counts1, 3)
kmers.compose_update(counts2, 3)

# Only the k-mer added once to both tables is expected to be present among
# the four that were added.
assert kmers.get('GATTACAGATTACAGATTACA') == 1
assert kmers.get('TAGATCTGCTTGAAACAAGTG') == 0
assert kmers.get('AAGTGGATTTGAGAAAAAAGT') == 0
assert kmers.get('GGGGGGGGGGGGGGGGGGGGG') == 0
# This k-mer was never added to either Counttable, yet the test expects it to
# be reported as present.
assert kmers.get('AAAAAAAAAAAAAAAAAAAAA') == 1