Skip to content

Commit

Permalink
19853: Fixes bug with null nominals, improves bit vector heuristic threshold for performance (#110)
Browse files Browse the repository at this point in the history
  • Loading branch information
howsohazard authored Apr 5, 2024
1 parent 0c12c69 commit 112ccab
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 15 deletions.
19 changes: 14 additions & 5 deletions src/Amalgam/IntegerSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -426,10 +426,11 @@ class BitArrayIntegerSet
size_t num_indices = size();
size_t end_index = std::min(up_to_index, end_integer);

//if dense, loop over, assuming likely to hit
//writing out this code yields notably better performance than
//using ContainsWithoutMaximumIndexCheck and attempting to let the compiler optimize
if(num_indices / num_buckets > 20)
//there are three loops optimized for different densities, high, medium high, and sparse
//the heuristics have been tuned by performance testing across a couple of CPU architectures
//and different data sets
size_t indices_per_bucket = num_indices / num_buckets;
if(indices_per_bucket >= 48)
{
for(size_t bucket = 0, index = 0;
bucket < num_buckets; bucket++, index++)
Expand All @@ -443,6 +444,14 @@ class BitArrayIntegerSet
}
}
}
else if(indices_per_bucket >= 32)
{
for(size_t index = 0; index < end_index; index++)
{
if(ContainsWithoutMaximumIndexCheck(index))
func(index);
}
}
else //use the iterator, which is more efficient when sparse
{
auto iter = begin();
Expand Down Expand Up @@ -483,7 +492,7 @@ class BitArrayIntegerSet
//sets bucket and bit to the values pointing to the next id in the hash
// assumes that bucket and bit point to a valid index
//if there are no more ids, then it will return bit 0 of the lowest bucket that is not populated
void FindNext(size_t &bucket, size_t &bit)
inline void FindNext(size_t &bucket, size_t &bit)
{
bit++;

Expand Down
22 changes: 12 additions & 10 deletions src/Amalgam/SeparableBoxFilterDataStore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
{
AccumulatePartialSums(column->nullIndices, query_feature_index, unknown_unknown_term);
AccumulatePartialSums(column->nanIndices, query_feature_index, unknown_unknown_term);
auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
if(nas_iter != end(column->stringIdValueToIndices))
AccumulatePartialSums(*nas_iter->second, query_feature_index, unknown_unknown_term);

//TODO 17631: accumulate distance terms smaller than these, which might not be these two, return the next largest
return std::min(known_unknown_term, unknown_unknown_term);
Expand All @@ -961,6 +964,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term);
AccumulatePartialSums(column->nanIndices, query_feature_index, known_unknown_term);
auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
if(nas_iter != end(column->stringIdValueToIndices))
AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term);
accumulated_known_to_unknown = true;
}

Expand Down Expand Up @@ -1002,22 +1008,18 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
//else value_type == ENIVT_NULL and already covered above

//if known to unknown is less than a symmetric nominal nonmatch, then need to accumulate those too
double nonmatch_dist_term = feature_attribs.nominalSymmetricNonMatchDistanceTerm.GetValue(high_accuracy);
double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
if(!accumulated_known_to_unknown && known_unknown_term < nonmatch_dist_term)
if(!accumulated_known_to_unknown)
{
BitArrayIntegerSet &known_unknown_indices = parametersAndBuffers.potentialMatchesSet;
known_unknown_indices = enabled_indices;
column->nullIndices.EraseTo(known_unknown_indices);
column->nanIndices.EraseTo(known_unknown_indices);
//find nas values
double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term);
AccumulatePartialSums(column->nanIndices, query_feature_index, known_unknown_term);
auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
if(nas_iter != end(column->stringIdValueToIndices))
known_unknown_indices.erase(*nas_iter->second);
AccumulatePartialSums(known_unknown_indices, query_feature_index, known_unknown_term);
AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term);
}

//return the value that the remainder of the entities have
double nonmatch_dist_term = feature_attribs.nominalSymmetricNonMatchDistanceTerm.GetValue(high_accuracy);
feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(nonmatch_dist_term);
return nonmatch_dist_term;
}
Expand Down

0 comments on commit 112ccab

Please sign in to comment.