19853: Fixes bug with null nominals, improves bit vector heuristic threshold for performance (#110)
howsoai · Apr 5, 2024 · 112ccab · 112ccab
1 parent 0c12c69
commit 112ccab
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 15 deletions.
diff --git a/src/Amalgam/IntegerSet.h b/src/Amalgam/IntegerSet.h
@@ -426,10 +426,11 @@ class BitArrayIntegerSet
 		size_t num_indices = size();
 		size_t end_index = std::min(up_to_index, end_integer);
 
-		//if dense, loop over, assuming likely to hit
-		//writing out this code yields notably better performance than
-		//using ContainsWithoutMaximumIndexCheck and attempting to let the compiler optimize
-		if(num_indices / num_buckets > 20)
+		//there are three loops optimized for different densities, high, medium high, and sparse
+		//the heuristics have been tuned by performance testing across a couple of CPU architectures
+		//and different data sets
+		size_t indices_per_bucket = num_indices / num_buckets;
+		if(indices_per_bucket >= 48)
 		{
 			for(size_t bucket = 0, index = 0;
 				bucket < num_buckets; bucket++, index++)
@@ -443,6 +444,14 @@ class BitArrayIntegerSet
 				}
 			}
 		}
+		else if(indices_per_bucket >= 32)
+		{
+			for(size_t index = 0; index < end_index; index++)
+			{
+				if(ContainsWithoutMaximumIndexCheck(index))
+					func(index);
+			}
+		}
 		else //use the iterator, which is more efficient when sparse
 		{
 			auto iter = begin();
@@ -483,7 +492,7 @@ class BitArrayIntegerSet
 	//sets bucket and bit to the values pointing to the next id in the hash
 	// assumes that bucket and bit point to a valid index
 	//if there are no more ids, then it will return bit 0 of the lowest bucket that is not populated
-	void FindNext(size_t &bucket, size_t &bit)
+	inline void FindNext(size_t &bucket, size_t &bit)
 	{
 		bit++;
 

diff --git a/src/Amalgam/SeparableBoxFilterDataStore.cpp b/src/Amalgam/SeparableBoxFilterDataStore.cpp
@@ -947,6 +947,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 		{
 			AccumulatePartialSums(column->nullIndices, query_feature_index, unknown_unknown_term);
 			AccumulatePartialSums(column->nanIndices, query_feature_index, unknown_unknown_term);
+			auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
+			if(nas_iter != end(column->stringIdValueToIndices))
+				AccumulatePartialSums(*nas_iter->second, query_feature_index, unknown_unknown_term);
 
 			//TODO 17631: accumulate distance terms smaller than these, which might not be these two, return the next largest
 			return std::min(known_unknown_term, unknown_unknown_term);
@@ -961,6 +964,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 		double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
 		AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term);
 		AccumulatePartialSums(column->nanIndices, query_feature_index, known_unknown_term);
+		auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
+		if(nas_iter != end(column->stringIdValueToIndices))
+			AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term);
 		accumulated_known_to_unknown = true;
 	}
 
@@ -1002,22 +1008,18 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 		//else value_type == ENIVT_NULL and already covered above
 
 		//if known to unknown is less than a symmetric nominal nonmatch, then need to accumulate those too
-		double nonmatch_dist_term = feature_attribs.nominalSymmetricNonMatchDistanceTerm.GetValue(high_accuracy);
-		double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
-		if(!accumulated_known_to_unknown && known_unknown_term < nonmatch_dist_term)
+		if(!accumulated_known_to_unknown)
 		{
-			BitArrayIntegerSet &known_unknown_indices = parametersAndBuffers.potentialMatchesSet;
-			known_unknown_indices = enabled_indices;
-			column->nullIndices.EraseTo(known_unknown_indices);
-			column->nanIndices.EraseTo(known_unknown_indices);
-			//find nas values
+			double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
+			AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term);
+			AccumulatePartialSums(column->nanIndices, query_feature_index, known_unknown_term);
 			auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
 			if(nas_iter != end(column->stringIdValueToIndices))
-				known_unknown_indices.erase(*nas_iter->second);
-			AccumulatePartialSums(known_unknown_indices, query_feature_index, known_unknown_term);
+				AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term);
 		}
 
 		//return the value that the remainder of the entities have
+		double nonmatch_dist_term = feature_attribs.nominalSymmetricNonMatchDistanceTerm.GetValue(high_accuracy);
 		feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(nonmatch_dist_term);
 		return nonmatch_dist_term;
 	}