From 9d24ae31c3f80e358dfcc931057adb6f7d263806 Mon Sep 17 00:00:00 2001
From: howsohazard <143410553+howsohazard@users.noreply.github.com>
Date: Thu, 18 Apr 2024 16:48:14 -0400
Subject: [PATCH] 20025: Implements more sparse deviation logic, fixes rare
 edge cases with null string values and code/string distances (#121)

---
 src/Amalgam/GeneralizedDistance.h           | 412 +++++++++++++++-----
 src/Amalgam/HashMaps.h                      |  35 ++
 src/Amalgam/SeparableBoxFilterDataStore.cpp | 258 +++++++-----
 src/Amalgam/SeparableBoxFilterDataStore.h   |  54 ++-
 src/Amalgam/evaluablenode/EvaluableNode.h   |  11 +
 5 files changed, 548 insertions(+), 222 deletions(-)
diff --git a/src/Amalgam/GeneralizedDistance.h b/src/Amalgam/GeneralizedDistance.h
index ae2de42b..9a270646 100644
--- a/src/Amalgam/GeneralizedDistance.h
+++ b/src/Amalgam/GeneralizedDistance.h
@@ -100,6 +100,40 @@ class GeneralizedDistanceEvaluator
 			typeAttributes.maxCyclicDifference = std::numeric_limits<double>::quiet_NaN();
 		}
 
+		//returns true if the feature is nominal
+		__forceinline bool IsFeatureNominal()
+		{
+			return (featureType <= GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE);
+		}
+
+		//returns true if the feature is continuous
+		__forceinline bool IsFeatureContinuous()
+		{
+			return (featureType >= GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC);
+		}
+
+		//returns true if the feature is cyclic
+		__forceinline bool IsFeatureCyclic()
+		{
+			return (featureType == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC);
+		}
+
+		//returns true if the feature has a deviation
+		__forceinline bool DoesFeatureHaveDeviation()
+		{
+			return (deviation > 0);
+		}
+
+		//returns true if the feature is a nominal that only has one difference value for match and one for nonmatch
+		__forceinline bool IsFeatureSymmetricNominal()
+		{
+			if(!IsFeatureNominal())
+				return false;
+
+			return (nominalNumberSparseDeviationMatrix.size() == 0
+				&& nominalStringSparseDeviationMatrix.size() == 0);
+		}
+
 		//the type of comparison for each feature
 		// this type is 32-bit aligned to make sure the whole structure is aligned
 		FeatureDifferenceType featureType;
@@ -131,70 +165,29 @@ class GeneralizedDistanceEvaluator
 		double deviationReciprocal;
 
 		//contains the deviations for a given nominal value for each other nominal value
+		//if the nominal value is not found, then the attribute defaultDeviation should be used
 		template<typename NominalValueType, typename EqualComparison = std::equal_to<NominalValueType>>
-		class SparseNominalDeviationValues
+		class SparseNominalDeviationValues : public SmallMap<NominalValueType, double, EqualComparison>
 		{
 		public:
 			inline SparseNominalDeviationValues()
 				: defaultDeviation(0.0)
 			{	}
 
-			using value_type = NominalValueType;
-
-			//returns an iterator to deviations that matches the nominal key
-			inline auto FindDeviationIterator(NominalValueType key)
-			{
-				return std::find_if(begin(deviations), end(deviations),
-					[key](auto i)
-					{	return EqualComparison{}(i.first, key);	}
-				);
-			}
-
-			//deviations for each value; unknown should be stored as special nonvalue (e.g., NaN, NaS)
-			//store as a vector of pairs instead of a map because either only one value will be looked up once,
-			//in which case there's no advantage to having a map, or many distance term values will be looked up
-			//repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map
-			std::vector<std::pair<NominalValueType, double>> deviations;
 			double defaultDeviation;
 		};
 
-		//contains the deviations for a given nominal value for each other nominal value
-		template<typename NominalValueType, typename EqualComparison = std::equal_to<NominalValueType>>
-		class SparseNominalDeviationMatrix
-		{
-		public:
-			inline SparseNominalDeviationMatrix()
-			{	}
-
-			using value_type = NominalValueType;
-
-			//returns an iterator to deviation values that matches the nominal key
-			inline auto FindDeviationValuesIterator(NominalValueType key)
-			{
-				return std::find_if(begin(deviationValues), end(deviationValues),
-					[key](auto i)
-					{	return EqualComparison{}(i.first, key);	}
-				);
-			}
-
-			//deviation values for each value; unknown should be stored as special nonvalue (e.g., NaN, NaS)
-			//store as a vector of pairs instead of a map because either only one value will be looked up once,
-			//in which case there's no advantage to having a map, or many distance term values will be looked up
-			//repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map
-			std::vector<std::pair<NominalValueType, SparseNominalDeviationValues<NominalValueType, EqualComparison>>> deviationValues;
-		};
-
 		//sparse deviation matrix if the nominal is a string
 		//store as a vector of pairs instead of a map because either only one value will be looked up once,
 		//in which case there's no advantage to having a map, or many distance term values will be looked up
 		//repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map
-		SparseNominalDeviationMatrix<StringInternPool::StringID> nominalStringSparseDeviationMatrix;
+		SmallMap<StringInternPool::StringID, SparseNominalDeviationValues<StringInternPool::StringID>> nominalStringSparseDeviationMatrix;
 
 		//sparse deviation matrix if the nominal is a number
 		//store as a vector of pairs instead of a map because either only one value will be looked up once,
 		//in which case there's no advantage to having a map, or many distance term values will be looked up
 		//repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map
-		SparseNominalDeviationMatrix<double, DoubleNanHashComparator> nominalNumberSparseDeviationMatrix;
+		SmallMap<double, SparseNominalDeviationValues<double, DoubleNanHashComparator>, DoubleNanHashComparator> nominalNumberSparseDeviationMatrix;
 
 		//distance term to use if both values being compared are unknown
 		//the difference will be NaN if unknown
@@ -310,36 +303,31 @@ class GeneralizedDistanceEvaluator
 	//returns true if the feature is nominal
 	__forceinline bool IsFeatureNominal(size_t feature_index)
 	{
-		return (featureAttribs[feature_index].featureType <= GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE);
+		return featureAttribs[feature_index].IsFeatureNominal();
 	}
 
 	//returns true if the feature is nominal
 	__forceinline bool IsFeatureContinuous(size_t feature_index)
 	{
-		return (featureAttribs[feature_index].featureType >= GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC);
+		return featureAttribs[feature_index].IsFeatureContinuous();
 	}
 
 	//returns true if the feature is cyclic
 	__forceinline bool IsFeatureCyclic(size_t feature_index)
 	{
-		return (featureAttribs[feature_index].featureType == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC);
+		return featureAttribs[feature_index].IsFeatureCyclic();
 	}
 
 	//returns true if the feature has a deviation
 	__forceinline bool DoesFeatureHaveDeviation(size_t feature_index)
 	{
-		return (featureAttribs[feature_index].deviation > 0);
+		return featureAttribs[feature_index].DoesFeatureHaveDeviation();
 	}
 
 	//returns true if the feature is a nominal that only has one difference value for match and one for nonmatch
 	__forceinline bool IsFeatureSymmetricNominal(size_t feature_index)
 	{
-		if(!IsFeatureNominal(feature_index))
-			return false;
-
-		auto &feature_attribs = featureAttribs[feature_index];
-		return (feature_attribs.nominalNumberSparseDeviationMatrix.deviationValues.size() == 0
-			&& feature_attribs.nominalStringSparseDeviationMatrix.deviationValues.size() == 0);
+		return featureAttribs[feature_index].IsFeatureSymmetricNominal();
 	}
 
 	//returns true if a known to unknown distance term would be less than or same as an exact match
@@ -403,15 +391,15 @@ class GeneralizedDistanceEvaluator
 		}
 
 		double deviation = std::numeric_limits<double>::quiet_NaN();
-		if(a_type == ENIVT_NUMBER && feature_attribs.nominalNumberSparseDeviationMatrix.deviationValues.size() > 0)
+		if(a_type == ENIVT_NUMBER && feature_attribs.nominalNumberSparseDeviationMatrix.size() > 0)
 		{
-			auto outer_it = feature_attribs.nominalNumberSparseDeviationMatrix.FindDeviationValuesIterator(a.number);
-			if(outer_it != std::end(feature_attribs.nominalNumberSparseDeviationMatrix.deviationValues))
+			auto outer_it = feature_attribs.nominalNumberSparseDeviationMatrix.find(a.number);
+			if(outer_it != std::end(feature_attribs.nominalNumberSparseDeviationMatrix))
 			{
 				auto &ndd = outer_it->second;
-				auto inner_it = ndd.FindDeviationIterator(b.number);
+				auto inner_it = ndd.find(b.number);
 
-				if(inner_it == end(ndd.deviations))
+				if(inner_it == end(ndd))
 					deviation = ndd.defaultDeviation;
 				else
 					deviation = inner_it->second;
@@ -424,15 +412,15 @@ class GeneralizedDistanceEvaluator
 					deviation = feature_attribs.knownToUnknownDistanceTerm.deviation;
 			}
 		}
-		else if(a_type == ENIVT_STRING_ID && feature_attribs.nominalStringSparseDeviationMatrix.deviationValues.size() > 0)
+		else if(a_type == ENIVT_STRING_ID && feature_attribs.nominalStringSparseDeviationMatrix.size() > 0)
 		{
-			auto outer_it = feature_attribs.nominalStringSparseDeviationMatrix.FindDeviationValuesIterator(a.stringID);
-			if(outer_it != std::end(feature_attribs.nominalStringSparseDeviationMatrix.deviationValues))
+			auto outer_it = feature_attribs.nominalStringSparseDeviationMatrix.find(a.stringID);
+			if(outer_it != std::end(feature_attribs.nominalStringSparseDeviationMatrix))
 			{
 				auto &ndd = outer_it->second;
-				auto inner_it = ndd.FindDeviationIterator(b.stringID);
+				auto inner_it = ndd.find(b.stringID);
 
-				if(inner_it == end(ndd.deviations))
+				if(inner_it == end(ndd))
 					deviation = ndd.defaultDeviation;
 				else
 					deviation = inner_it->second;
@@ -505,8 +493,9 @@ class GeneralizedDistanceEvaluator
 			return -std::numeric_limits<double>::infinity();
 	}
 
+	//TODO 17631: remove this?
 	//computes the base of the difference between two nominal values that exactly match without exponentiation
-	__forceinline double ComputeDistanceTermNominalBaseExactMatchFromDeviation(size_t index, double deviation, bool high_accuracy)
+	__forceinline double ComputeDistanceTermBaseNominalExactMatchFromDeviation(size_t index, double deviation, bool high_accuracy)
 	{
 		if(!DoesFeatureHaveDeviation(index) || computeSurprisal)
 			return 0.0;
@@ -516,15 +505,16 @@ class GeneralizedDistanceEvaluator
 
 	//TODO 17631: genericize this for use in ComputeDistanceTermNominal -- may need to take in two deviations,
 	// exact match deviation and nonmatch deviation?  Or just change the calls to pass in 1-deviation?
-	//make sure lines up with ComputeDistanceTermNominalBaseExactMatchFromDeviation for exact match, and maybe remove ComputeDistanceTermNominalBaseExactMatchFromDeviation
+	//make sure lines up with ComputeDistanceTermBaseNominalExactMatchFromDeviation for exact match, and maybe remove ComputeDistanceTermBaseNominalExactMatchFromDeviation
 
 	//computes the base of the difference between two nominal values that do not match without exponentiation
-	__forceinline double ComputeDistanceTermNominalBaseNonMatchFromDeviation(size_t index, double deviation, bool high_accuracy)
+	__forceinline double ComputeDistanceTermBaseNominalNonMatchFromDeviation(size_t index, double deviation, bool high_accuracy)
 	{
 		if(computeSurprisal)
 		{
 			//need to have at least two classes in existence
 			double nominal_count = std::max(featureAttribs[index].typeAttributes.nominalCount, 2.0);
+			//TODO 17631: change to be weighted average: prob of nominal * deviation of random guessing
 			double prob_max_entropy_match = 1 / nominal_count;
 
 			//find probability that the correct class was selected
@@ -533,10 +523,10 @@ class GeneralizedDistanceEvaluator
 
 			//find the probability that any other class besides the correct class was selected
 			//divide the probability among the other classes
-			double prop_class_given_nonmatch = (1 - prob_class_given_match) / (nominal_count - 1);
+			double prob_class_given_nonmatch = (1 - prob_class_given_match) / (nominal_count - 1);
 
 			double surprisal_class_given_match = -std::log(prob_class_given_match);
-			double surprisal_class_given_nonmatch = -std::log(prop_class_given_nonmatch);
+			double surprisal_class_given_nonmatch = -std::log(prob_class_given_nonmatch);
 
 			//the surprisal of the class matching on a different value is the difference between
 			//how surprised it would be given a nonmatch but without the surprisal given a match
@@ -565,17 +555,86 @@ class GeneralizedDistanceEvaluator
 		}
 	}
 
+	//TODO 17631: finish this and integrate it
+	//returns the base of the distance term for nominal comparisons for a match
+	//given the probability of the class being observed given that it is a match
+	__forceinline double ComputeDistanceTermBaseNominalMatchFromMatchProbabilities(size_t index,
+		double prob_class_given_match, bool high_accuracy)
+	{
+		if(!DoesFeatureHaveDeviation(index) || computeSurprisal)
+			return 0.0;
+
+		return 1 - prob_class_given_match;
+	}
+
+	//TODO 17631: finish this and integrate it
+	//returns the base of the distance term for nominal comparisons for a nonmatch given prob_class_given_match,
+	// the probability that the classes compared should have been a match, and prob_class_given_nonmatch, the probability that the particular comparison class does not match
+	__forceinline double ComputeDistanceTermBaseNominalNonmatchFromMatchProbabilities(size_t index,
+		double prob_class_given_match, double prob_class_given_nonmatch, bool high_accuracy)
+	{
+		if(computeSurprisal)
+		{
+			double surprisal_class_given_match = -std::log(prob_class_given_match);
+			double surprisal_class_given_nonmatch = -std::log(prob_class_given_nonmatch);
+
+			//the surprisal of the class matching on a different value is the difference between
+			//how surprised it would be given a nonmatch but without the surprisal given a match
+			double dist_term = surprisal_class_given_nonmatch - surprisal_class_given_match;
+
+			//ensure it doesn't go below zero in case of numerical precision issues
+			return std::max(dist_term, 0.0);
+		}
+		else if(DoesFeatureHaveDeviation(index))
+		{
+			//add together uncertainties from a nonmatch,
+			// plus a nonmatch of a nonmatch to get a match
+			double dist_term = (1 - prob_class_given_match) + (1 - prob_class_given_nonmatch);
+			return dist_term;
+		}
+		else
+		{
+			return 1.0;
+		}
+	}
+
+	//TODO 17631: finish this and integrate it
+	//for inputs to this method, if not using SDM, b_deviation = (1 - a_deviation) / (nominal_count - 1)
+	__forceinline double ComputeDistanceTermNominalBaseFromDeviations(size_t index, bool match,
+		double match_deviation, double nonmatch_deviation, bool high_accuracy)
+	{
+		//need to have at least two classes in existence
+		double nominal_count = std::max(featureAttribs[index].typeAttributes.nominalCount, 2.0);
+		//TODO 17631: change to be weighted average: prob of nominal * deviation of random guessing
+		double prob_max_entropy_match = 1 / nominal_count;
+
+		//find probability that the correct class was selected
+		//can't go below base probability of guessing
+		double prob_class_given_match = std::max(1 - match_deviation, prob_max_entropy_match);
+
+		//find the probability that any other class besides the correct class was selected,
+		//but cannot exceed the probability of a match
+		double prob_class_given_nonmatch = std::min(1 - nonmatch_deviation, prob_class_given_match);
+
+		if(match)
+			return ComputeDistanceTermBaseNominalMatchFromMatchProbabilities(index,
+				prob_class_given_match, high_accuracy);
+		else
+			return ComputeDistanceTermBaseNominalNonmatchFromMatchProbabilities(index,
+				prob_class_given_match, prob_class_given_nonmatch, high_accuracy);
+	}
+
 	//computes the distance term for a nominal when two universally symmetric nominals are equal
 	__forceinline double ComputeDistanceTermNominalUniversallySymmetricExactMatch(size_t index, bool high_accuracy)
 	{
-		double dist_term = ComputeDistanceTermNominalBaseExactMatchFromDeviation(index, featureAttribs[index].deviation, high_accuracy);
+		double dist_term = ComputeDistanceTermBaseNominalExactMatchFromDeviation(index, featureAttribs[index].deviation, high_accuracy);
 		return ContextuallyExponentiateAndWeightDifferenceTerm(dist_term, index, high_accuracy);
 	}
 
 	//computes the distance term for a nominal when two universally symmetric nominals are not equal
 	__forceinline double ComputeDistanceTermNominalUniversallySymmetricNonMatch(size_t index, bool high_accuracy)
 	{
-		double dist_term = ComputeDistanceTermNominalBaseNonMatchFromDeviation(index, featureAttribs[index].deviation, high_accuracy);
+		double dist_term = ComputeDistanceTermBaseNominalNonMatchFromDeviation(index, featureAttribs[index].deviation, high_accuracy);
 		return ContextuallyExponentiateAndWeightDifferenceTerm(dist_term, index, high_accuracy);
 	}
 
@@ -722,6 +781,7 @@ class GeneralizedDistanceEvaluator
 			{
 				//need to have at least two classes in existence
 				double nominal_count = std::max(featureAttribs[index].typeAttributes.nominalCount, 2.0);
+				//TODO 17631: change to be weighted average: prob of nominal * deviation of random guessing
 				double prob_max_entropy_match = 1 / nominal_count;
 
 				//find probability that the correct class was selected
@@ -926,7 +986,7 @@ class GeneralizedDistanceEvaluator
 		for(size_t i = 0; i < featureAttribs.size(); i++)
 		{
 			auto &feature_attribs = featureAttribs[i];
-			if(IsFeatureNominal(i))
+			if(feature_attribs.IsFeatureNominal())
 			{
 				//ensure if a feature has deviations they're not too small to underflow
 				if(DoesFeatureHaveDeviation(i))
@@ -1055,8 +1115,64 @@ class RepeatedGeneralizedDistanceEvaluator
 		: distEvaluator(dist_evaluator)
 	{	}
 
+	//for the feature index, computes and stores the distance terms for nominal values
+	inline void ComputeAndStoreNominalDistanceTerms(size_t index)
+	{
+		bool compute_approximate = distEvaluator->NeedToPrecomputeApproximate();
+
+		//make sure there's room for the interned index
+		if(featureData.size() <= index)
+			featureData.resize(index + 1);
+
+		auto &feature_attributes = distEvaluator->featureAttribs[index];
+		auto &feature_data = featureData[index];
+
+		//since most of the computations will be using approximate if it is needed,
+		//only set to high accuracy if not using approximate
+		feature_data.precomputedNominalDistanceTermsHighAccuracy = (!compute_approximate);
+
+		if(feature_data.targetValue.nodeType == ENIVT_NUMBER)
+		{
+			auto &sdm = feature_attributes.nominalNumberSparseDeviationMatrix;
+			auto target_value = feature_data.targetValue.nodeValue.number;
+
+			auto deviations_for_value = sdm.find(target_value);
+			if(deviations_for_value != end(sdm))
+			{
+				auto &deviations = deviations_for_value->second;
+				for(auto &[value, deviation] : deviations)
+				{
+					//TODO 17631: finish this
+					double dist_term = deviation;// = distEvaluator->ComputeDistanceTermNominalBaseFromDeviations(index, value == sid, )
+					feature_data.nominalNumberDistanceTerms.emplace(value, dist_term);
+				}
+
+				//TODO: deviations.defaultDeviation
+			}
+		}
+		else if(feature_data.targetValue.nodeType == ENIVT_STRING_ID)
+		{
+			auto &sdm = feature_attributes.nominalStringSparseDeviationMatrix;
+			auto target_sid = feature_data.targetValue.nodeValue.stringID;
+
+			auto deviations_for_value = sdm.find(target_sid);
+			if(deviations_for_value != end(sdm))
+			{
+				auto &deviations = deviations_for_value->second;
+				for(auto &[value, deviation] : deviations)
+				{
+					//TODO 17631: finish this
+					double dist_term = deviation;// distEvaluator->ComputeDistanceTermNominalBaseFromDeviations(index, value == sid, )
+					feature_data.nominalStringDistanceTerms.emplace(value, dist_term);
+				}
+
+				//TODO: deviations.defaultDeviation
+			}
+		}
+	}
+
 	//for the feature index, computes and stores the distance terms as measured from value to each interned value
-	inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(double value, size_t index, std::vector<double> *interned_values)
+	inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(size_t index, std::vector<double> *interned_values)
 	{
 		bool compute_accurate = distEvaluator->NeedToPrecomputeAccurate();
 		bool compute_approximate = distEvaluator->NeedToPrecomputeApproximate();
@@ -1065,39 +1181,41 @@ class RepeatedGeneralizedDistanceEvaluator
 		if(featureData.size() <= index)
 			featureData.resize(index + 1);
 
-		auto &feature_interns = featureData[index];
-		feature_interns.internedNumberIndexToNumberValue = interned_values;
+		auto &feature_data = featureData[index];
+		feature_data.internedNumberIndexToNumberValue = interned_values;
 
 		if(interned_values == nullptr)
 		{
-			feature_interns.internedDistanceTerms.clear();
+			feature_data.internedDistanceTerms.clear();
 			return;
 		}
 
-		feature_interns.internedDistanceTerms.resize(interned_values->size());
+		feature_data.internedDistanceTerms.resize(interned_values->size());
 
 		auto &feature_attribs = distEvaluator->featureAttribs[index];
+
+		double value = feature_data.targetValue.GetValueAsNumber();
 		if(FastIsNaN(value))
 		{
 			//first entry is unknown-unknown distance
-			feature_interns.internedDistanceTerms[0] = feature_attribs.unknownToUnknownDistanceTerm;
+			feature_data.internedDistanceTerms[0] = feature_attribs.unknownToUnknownDistanceTerm;
 			
 			auto k_to_unk = feature_attribs.knownToUnknownDistanceTerm;
-			for(size_t i = 1; i < feature_interns.internedDistanceTerms.size(); i++)
-				feature_interns.internedDistanceTerms[i] = k_to_unk;
+			for(size_t i = 1; i < feature_data.internedDistanceTerms.size(); i++)
+				feature_data.internedDistanceTerms[i] = k_to_unk;
 		}
 		else
 		{
 			//first entry is known-unknown distance
-			feature_interns.internedDistanceTerms[0] = feature_attribs.knownToUnknownDistanceTerm;
+			feature_data.internedDistanceTerms[0] = feature_attribs.knownToUnknownDistanceTerm;
 
-			for(size_t i = 1; i < feature_interns.internedDistanceTerms.size(); i++)
+			for(size_t i = 1; i < feature_data.internedDistanceTerms.size(); i++)
 			{
 				double difference = value - (*interned_values)[i];
 				if(compute_accurate)
-					feature_interns.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, true), true);
+					feature_data.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, true), true);
 				if(compute_approximate)
-					feature_interns.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, false), false);
+					feature_data.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, false), false);
 			}
 		}
 	}
@@ -1127,42 +1245,125 @@ class RepeatedGeneralizedDistanceEvaluator
 
 	//returns the inner term of the Minkowski norm summation given that the feature is nominal
 	//and the data type being compared from is numeric
-	//if types_match is false, then the value is ignored
-	__forceinline double ComputeDistanceTermNominalNumeric(double value,
-		size_t index, bool types_match, bool high_accuracy)
+	//if value_type_numeric is false, then the value is ignored
+	__forceinline double ComputeDistanceTermNominalNumeric(double value, bool value_type_numeric,
+		size_t index, bool high_accuracy)
 	{
-		//TODO 17631: implement this
-		return 0.0;
+		auto &feature_data = featureData[index];
+		if(feature_data.nominalNumberDistanceTerms.size() > 0)
+		{
+			//TODO 17631: implement this
+		}
+
+		if(value_type_numeric && value == feature_data.targetValue.GetValueAsNumber())
+			return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);
+		else
+			return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
 	}
 
 	//returns the inner term of the Minkowski norm summation given that the feature is nominal
 	//and the data type being compared from is string
-	//if types_match is false, then the value is ignored
-	__forceinline double ComputeDistanceTermNominalString(StringInternPool::StringID value,
-		size_t index, bool types_match, bool high_accuracy)
+	//if value_type_string is false, then the value is ignored
+	__forceinline double ComputeDistanceTermNominalString(StringInternPool::StringID value, bool value_type_string,
+		size_t index, bool high_accuracy)
 	{
-		//TODO 17631: implement this
-		return 0.0;
+		auto &feature_data = featureData[index];
+		if(feature_data.nominalStringDistanceTerms.size() > 0)
+		{
+			//TODO 17631: implement this
+		}
+
+		if(value_type_string && value == feature_data.targetValue.GetValueAsStringIDIfExists())
+			return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);
+		else
+			return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
+	}
+
+	//returns the distance term given that it is nominal
+	__forceinline double ComputeDistanceTermNominal(EvaluableNodeImmediateValue other_value,
+		EvaluableNodeImmediateValueType other_type, size_t index, bool high_accuracy)
+	{
+		//TODO 17631: make this more efficient, placeholder for now
+		auto &feature_data = featureData[index];
+		return distEvaluator->ComputeDistanceTermNominal(feature_data.targetValue.nodeValue, other_value,
+			feature_data.targetValue.nodeType, other_type, index, high_accuracy);
+	}
+
+	//returns the smallest nonmatching distance term for the nominal given other_value
+	__forceinline double ComputeDistanceTermNominalSmallestNonmatch(double other_value,
+		size_t index, bool high_accuracy)
+	{
+		//TODO 17631: implement this, placeholder for now
+		return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
+	}
+
+	//returns the smallest nonmatching distance term for the nominal given other_value
+	__forceinline double ComputeDistanceTermNominalSmallestNonmatch(StringInternPool::StringID other_value,
+		size_t index, bool high_accuracy)
+	{
+		//TODO 17631: implement this, placeholder for now
+		return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
+	}
+
+	//returns the smallest nonmatching distance term regardless of value
+	__forceinline double ComputeDistanceTermNominalSmallestNonmatch(size_t index, bool high_accuracy)
+	{
+		//TODO 17631: implement this, placeholder for now
+		return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
+	}
+
+	//for all nominal distance term values that are less than or equal to dist_term for the given high_accuracy,
+	//it will call func passing in the numeric value
+	template<typename Func>
+	__forceinline void IterateOverNominalValuesWithLessOrEqualDistanceTermsNumeric(double dist_term, size_t index, bool high_accuracy,
+		Func func)
+	{
+		auto &feature_data = featureData[index];
+		for(auto &entry : feature_data.nominalNumberDistanceTerms)
+		{
+			if(entry.second <= dist_term)
+				func(entry.first);
+		}
+	}
+
+	//for all nominal distance term values that are less than or equal to dist_term for the given high_accuracy,
+	//it will call func passing in the string id value
+	template<typename Func>
+	__forceinline void IterateOverNominalValuesWithLessOrEqualDistanceTermsString(double dist_term, size_t index, bool high_accuracy,
+		Func func)
+	{
+		auto &feature_data = featureData[index];
+		for(auto &entry : feature_data.nominalStringDistanceTerms)
+		{
+			if(entry.second <= dist_term)
+				func(entry.first);
+		}
+	}
+
+	//returns the smallest distance term larger than dist_term
+	__forceinline double ComputeDistanceTermNominalNextSmallest(double dist_term, size_t index, bool high_accuracy)
+	{
+		//TODO 17631: implement this, placeholder for now
+		return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
 	}
 
 	//computes the inner term of the Minkowski norm summation
 	__forceinline double ComputeDistanceTerm(EvaluableNodeImmediateValue other_value,
 		EvaluableNodeImmediateValueType other_type, size_t index, bool high_accuracy)
 	{
-		//TODO 17631: improve the logic and efficiency
 		auto &feature_data = featureData[index];
 
 		//if nominal, don't need to compute absolute value of diff because just need to compare to 0
 		if(distEvaluator->IsFeatureNominal(index))
-			return distEvaluator->ComputeDistanceTermNominal(feature_data.targetValue, other_value,
-				feature_data.targetValueType, other_type, index, high_accuracy);
+			return ComputeDistanceTermNominal(other_value, other_type, index, high_accuracy);
 
-		double diff = distEvaluator->ComputeDifference(feature_data.targetValue, other_value,
-			feature_data.targetValueType, other_type, distEvaluator->featureAttribs[index].featureType);
+		//TODO 17631: improve the logic and efficiency here down
+		double diff = distEvaluator->ComputeDifference(feature_data.targetValue.nodeValue, other_value,
+			feature_data.targetValue.nodeType, other_type, distEvaluator->featureAttribs[index].featureType);
 
 		if(FastIsNaN(diff))
-			return distEvaluator->LookupNullDistanceTerm(feature_data.targetValue, other_value,
-				feature_data.targetValueType, other_type, index, high_accuracy);
+			return distEvaluator->LookupNullDistanceTerm(feature_data.targetValue.nodeValue, other_value,
+				feature_data.targetValue.nodeType, other_type, index, high_accuracy);
 
 		return distEvaluator->ComputeDistanceTermContinuousNonNullRegular(diff, index, high_accuracy);
 	}
@@ -1192,8 +1393,7 @@ class RepeatedGeneralizedDistanceEvaluator
 		EffectiveFeatureDifferenceType effectiveFeatureType;
 
 		//target that the distance will be computed to
-		EvaluableNodeImmediateValueType targetValueType;
-		EvaluableNodeImmediateValue targetValue;
+		EvaluableNodeImmediateValueWithType targetValue;
 
 		//the distance term for EFDT_REMAINING_IDENTICAL_PRECOMPUTED
 		double precomputedRemainingIdenticalDistanceTerm;
@@ -1201,10 +1401,12 @@ class RepeatedGeneralizedDistanceEvaluator
 		std::vector<double> *internedNumberIndexToNumberValue;
 		std::vector<GeneralizedDistanceEvaluator::DistanceTerms> internedDistanceTerms;
 
-		//TODO 17631: genericize ComputeAndStoreInternedNumberValuesAndDistanceTerms to precompute these when appropriate
 		//used to store distance terms for the respective targetValue for the sparse deviation matrix
 		FastHashMap<StringInternPool::StringID, double> nominalStringDistanceTerms;
 		FastHashMap<double, double> nominalNumberDistanceTerms;
+
+		//if true, then nominalStringDistanceTerms and nominalNumberDistanceTerms are high accuracy, otherwise approximate
+		bool precomputedNominalDistanceTermsHighAccuracy;
 	};
 
 	//for each feature, precomputed distance terms for each interned value looked up by intern index
diff --git a/src/Amalgam/HashMaps.h b/src/Amalgam/HashMaps.h
index 3a7ace2f..2e1897de 100644
--- a/src/Amalgam/HashMaps.h
+++ b/src/Amalgam/HashMaps.h
@@ -44,3 +44,38 @@ template<typename K, typename V, typename H = std::hash<K>, typename E = std::eq
 using CompactHashMap = ska::bytell_hash_map<K, V, H, E, A>;
 
 #endif
+
+//implements a map via a vector, where entries are looked up sequentially by brute force
+//useful for very small maps (generally less than 30-40 entries) and for maps
+//where entries are only found once; emplace appends without checking for an existing key
+//note that, like other fast maps, iterators may be invalidated when the map is altered
+template<typename K, typename V, typename E = std::equal_to<K>>
+class SmallMap : public std::vector<std::pair<K, V>>
+{
+public:
+
+	using key_type = K;
+	using mapped_type = V;
+	using value_type = std::pair<K, V>;
+
+	//returns an iterator to the first entry whose key equals key according to E, or end() if none
+	inline auto find(const K &key)
+	{
+		return std::find_if(
+			std::begin(*this),
+			std::end(*this),
+			[&key](const auto &entry)	//by reference so that entries are not copied per comparison
+			{	return E{}(entry.first, key);	}
+		);
+	}
+
+	//appends an entry for key, using V{} for the value when no args are given; returns the emplace_back result by value
+	template <class... Args>
+	inline auto emplace(K key, Args&&... args)
+	{
+		if constexpr(sizeof...(Args) == 0)
+			return this->emplace_back(key, V{});
+		else
+			return this->emplace_back(key, std::forward<Args>(args)...);
+	}
+};
diff --git a/src/Amalgam/SeparableBoxFilterDataStore.cpp b/src/Amalgam/SeparableBoxFilterDataStore.cpp
index 5c10e2c7..fc13dc50 100644
--- a/src/Amalgam/SeparableBoxFilterDataStore.cpp
+++ b/src/Amalgam/SeparableBoxFilterDataStore.cpp
@@ -331,12 +331,10 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance
 	for(size_t query_feature_index = 0; query_feature_index < dist_eval.featureAttribs.size(); query_feature_index++)
 	{
 		size_t absolute_feature_index = dist_eval.featureAttribs[query_feature_index].featureIndex;
-		auto target_value = r_dist_eval.featureData[query_feature_index].targetValue;
-		auto target_value_type = r_dist_eval.featureData[query_feature_index].targetValueType;
-
 		auto &column_data = columnData[absolute_feature_index];
+		auto &target_value = r_dist_eval.featureData[query_feature_index].targetValue;
 
-		if(target_value_type == ENIVT_NULL || (target_value_type == ENIVT_NUMBER && FastIsNaN(target_value.number)) )
+		if(target_value.IsNullEquivalent())
 		{
 			//add the appropriate unknown distance to each element
 			double unknown_unknown_term = dist_eval.ComputeDistanceTermUnknownToUnknown(query_feature_index, high_accuracy);
@@ -359,7 +357,7 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance
 			continue;
 		}
 
-		if(target_value_type == ENIVT_NUMBER)
+		if(target_value.nodeType == ENIVT_NUMBER)
 		{
 			//below we branch to optimize the number of distance terms that need to be computed to solve minimum distance problem
 			//if there are fewer enabled_indices than the number of unique values for this feature, plus one for unknown values
@@ -735,8 +733,7 @@ void SeparableBoxFilterDataStore::FindNearestEntities(GeneralizedDistanceEvaluat
 			//if the target_value is a null/nan, unknown-unknown differences have already been accounted for
 			//since they are partial matches
 			auto &feature_data = r_dist_eval.featureData[i];
-			if(feature_data.targetValueType == ENIVT_NULL
-					|| (feature_data.targetValueType == ENIVT_NUMBER && FastIsNaN(feature_data.targetValue.number)))
+			if(feature_data.targetValue.IsNullEquivalent())
 				continue;
 			
 			if(dist_eval.ComputeDistanceTermKnownToUnknown(i, high_accuracy) > worst_candidate_distance)
@@ -892,42 +889,38 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 	size_t absolute_feature_index = feature_attribs.featureIndex;
 	auto &column = columnData[absolute_feature_index];
 	auto effective_feature_type = feature_data.effectiveFeatureType;
-	EvaluableNodeImmediateValue value = feature_data.targetValue;
-	EvaluableNodeImmediateValueType value_type = feature_data.targetValueType;
+	auto &value = feature_data.targetValue;
 
-	bool value_is_null = EvaluableNodeImmediateValue::IsNullEquivalent(value_type, value);
 	//need to accumulate values for nulls if the value is a null
-	if(value_is_null)
+	if(value.IsNullEquivalent())
 	{
 		double unknown_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermUnknownToUnknown(query_feature_index, high_accuracy);
-		double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
 
 		//if it's either a symmetric nominal or continuous, or if sparse deviation matrix but no null value,
 		// then there are only two values, unknown to known or known
-		if(r_dist_eval.distEvaluator->IsFeatureSymmetricNominal(query_feature_index)
-			|| r_dist_eval.distEvaluator->IsFeatureContinuous(query_feature_index)
-			|| (r_dist_eval.distEvaluator->IsFeatureNominal(query_feature_index) &&
+		if(feature_attribs.IsFeatureSymmetricNominal()
+			|| feature_attribs.IsFeatureContinuous()
+			|| (feature_attribs.IsFeatureNominal() &&
 				!r_dist_eval.HasNominalSpecificKnownToUnknownDistanceTerm(query_feature_index)))
 		{
-			//if all cases are equidistant, then don't compute anything
-			if(unknown_unknown_term == known_unknown_term)
+			double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
+
+			//if all cases are equidistant and nonzero, then don't compute anything
+			if(unknown_unknown_term == known_unknown_term && unknown_unknown_term > 0)
 				return unknown_unknown_term;
 
 			//find nas values -- common to both branches below
 			auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
 
-			if(unknown_unknown_term < known_unknown_term)
+			if(unknown_unknown_term < known_unknown_term || known_unknown_term == 0.0)
 			{
 				AccumulatePartialSums(column->nullIndices, query_feature_index, unknown_unknown_term);
 				AccumulatePartialSums(column->nanIndices, query_feature_index, unknown_unknown_term);
 				if(nas_iter != end(column->stringIdValueToIndices))
 					AccumulatePartialSums(*nas_iter->second, query_feature_index, unknown_unknown_term);
-
-				//return the larger value that the remainder of the entities have
-				feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(known_unknown_term);
-				return known_unknown_term;
 			}
-			else //known_unknown_term < unknown_unknown_term
+
+			if(known_unknown_term < unknown_unknown_term || unknown_unknown_term == 0.0)
 			{
 				BitArrayIntegerSet &known_unknown_indices = parametersAndBuffers.potentialMatchesSet;
 				known_unknown_indices = enabled_indices;
@@ -936,12 +929,18 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 				if(nas_iter != end(column->stringIdValueToIndices))
 					known_unknown_indices.erase(*nas_iter->second);
 				AccumulatePartialSums(known_unknown_indices, query_feature_index, known_unknown_term);
-
-				//return the larger value that the remainder of the entities have
-				feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(unknown_unknown_term);
-				return unknown_unknown_term;
 			}
 
+			double largest_term_not_computed = std::max(known_unknown_term, unknown_unknown_term);
+			//if the largest term not computed is zero, then have computed everything,
+			// so set the remaining value to infinity to push this term off sorting of uncomputed distances
+			// and make search more efficient
+			if(largest_term_not_computed == 0.0)
+				largest_term_not_computed = std::numeric_limits<double>::infinity();
+
+			//store the term shared by all remaining entities so it does not need to be recomputed per entity
+			feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(largest_term_not_computed);
+			return largest_term_not_computed;
 		}
 		else //nonsymmetric nominal -- need to compute
 		{
@@ -951,15 +950,35 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 			if(nas_iter != end(column->stringIdValueToIndices))
 				AccumulatePartialSums(*nas_iter->second, query_feature_index, unknown_unknown_term);
 
-			//TODO 17631: accumulate distance terms smaller than these, which might not be these two, return the next largest
-			return std::min(known_unknown_term, unknown_unknown_term);
+			double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, high_accuracy);
+			//if the next closest match is larger, no need to compute any more values
+			if(nonmatch_dist_term > unknown_unknown_term)
+				return nonmatch_dist_term;
+
+			//accumulate every nominal number value whose distance term is less than or equal to unknown_unknown_term
+			r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsNumeric(unknown_unknown_term, query_feature_index, high_accuracy,
+				[this, &r_dist_eval, &column, query_feature_index, high_accuracy](double number_value)
+				{
+					AccumulatePartialSumsForNominalNumberValueIfExists(r_dist_eval, number_value, query_feature_index, *column, high_accuracy);
+				});
+
+			r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsString(unknown_unknown_term, query_feature_index, high_accuracy,
+				[this, &r_dist_eval, &column, query_feature_index, high_accuracy](StringInternPool::StringID sid)
+				{
+					AccumulatePartialSumsForNominalStringIdValueIfExists(r_dist_eval, sid, query_feature_index, *column, high_accuracy);
+				});
+
+			return r_dist_eval.ComputeDistanceTermNominalNextSmallest(unknown_unknown_term, query_feature_index, high_accuracy);
 		}
 	}
 
-	//need to accumulate nulls if they're closer than an exact match
-	//but if made it here, then the value itself isn't null
-	bool accumulated_known_to_unknown = false;
-	if(r_dist_eval.distEvaluator->IsKnownToUnknownDistanceLessThanOrEqualToExactMatch(query_feature_index))
+	bool is_feature_symmetric_nominal = r_dist_eval.distEvaluator->IsFeatureSymmetricNominal(query_feature_index);
+
+	//if made it here, then the value itself is not a null, so only need to consider unknown to known distances
+	//need to accumulate nulls if it's a symmetric nominal feature, because then there's only one value left,
+	//or if the nulls are closer than an exact match
+	if(is_feature_symmetric_nominal
+		|| r_dist_eval.distEvaluator->IsKnownToUnknownDistanceLessThanOrEqualToExactMatch(query_feature_index))
 	{
 		double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
 		AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term);
@@ -967,36 +986,23 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 		auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
 		if(nas_iter != end(column->stringIdValueToIndices))
 			AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term);
-		accumulated_known_to_unknown = true;
 	}
 
 	//if nominal, only need to compute the exact match
-	if(r_dist_eval.distEvaluator->IsFeatureSymmetricNominal(query_feature_index))
+	if(is_feature_symmetric_nominal)
 	{
-		if(value_type == ENIVT_NUMBER)
+		if(value.nodeType == ENIVT_NUMBER)
 		{
-			auto [value_index, exact_index_found] = column->FindExactIndexForValue(value.number);
-			if(exact_index_found)
-			{
-				double term = feature_attribs.nominalSymmetricMatchDistanceTerm.GetValue(high_accuracy);
-				AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term);
-			}
+			AccumulatePartialSumsForNominalNumberValueIfExists(r_dist_eval, value.nodeValue.number, query_feature_index, *column, high_accuracy);
 		}
-		else if(value_type == ENIVT_STRING_ID)
+		else if(value.nodeType == ENIVT_STRING_ID)
 		{
-			auto value_found = column->stringIdValueToIndices.find(value.stringID);
-			if(value_found != end(column->stringIdValueToIndices))
-			{
-				double term = feature_attribs.nominalSymmetricMatchDistanceTerm.GetValue(high_accuracy);
-				AccumulatePartialSums(*(value_found->second), query_feature_index, term);
-			}
+			AccumulatePartialSumsForNominalStringIdValueIfExists(r_dist_eval, value.nodeValue.stringID, query_feature_index, *column, high_accuracy);
 		}
-		else if(value_type == ENIVT_CODE)
+		else if(value.nodeType == ENIVT_CODE)
 		{
 			//compute partial sums for all code of matching size
-			size_t code_size = 1;
-			if(value_type == ENIVT_CODE)
-				code_size = EvaluableNode::GetDeepSize(value.code);
+			size_t code_size = EvaluableNode::GetDeepSize(value.nodeValue.code);
 
 			auto value_found = column->valueCodeSizeToIndices.find(code_size);
 			if(value_found != end(column->valueCodeSizeToIndices))
@@ -1007,43 +1013,70 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 		}
 		//else value_type == ENIVT_NULL and already covered above
 
-		//if known to unknown is less than a symmetric nominal nonmatch, then need to accumulate those too
-		if(!accumulated_known_to_unknown)
-		{
-			double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
-			AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term);
-			AccumulatePartialSums(column->nanIndices, query_feature_index, known_unknown_term);
-			auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID);
-			if(nas_iter != end(column->stringIdValueToIndices))
-				AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term);
-		}
-
 		//return the value that the remainder of the entities have
 		double nonmatch_dist_term = feature_attribs.nominalSymmetricNonMatchDistanceTerm.GetValue(high_accuracy);
 		feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(nonmatch_dist_term);
 		return nonmatch_dist_term;
 	}
-	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_STRING)
+	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_STRING)
 	{
-		if(value_type == ENIVT_STRING_ID)
-		{
-			auto value_found = column->stringIdValueToIndices.find(value.stringID);
-			if(value_found != end(column->stringIdValueToIndices))
+		//initialize to zero, because if don't find an exact match, but there are distance terms of
+		//0, then need to accumulate those later
+		double accumulated_term = 0.0;
+		if(value.nodeType == ENIVT_STRING_ID)
+			accumulated_term = AccumulatePartialSumsForNominalStringIdValueIfExists(
+				r_dist_eval, value.nodeValue.stringID, query_feature_index, *column, high_accuracy);
+
+		double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, high_accuracy);
+		//if the next closest match is larger, no need to compute any more values
+		if(nonmatch_dist_term > accumulated_term)
+			return nonmatch_dist_term;
+
+		//need to accumulate every other string value whose distance term is less than or equal to accumulated_term
+		r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsString(accumulated_term, query_feature_index, high_accuracy,
+			[this, &value, &r_dist_eval, &column, query_feature_index, high_accuracy](StringInternPool::StringID sid)
 			{
-				double term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy);
-				AccumulatePartialSums(*(value_found->second), query_feature_index, term);
-			}
-		}
+				//the target's own value was already accumulated above, so only accumulate the other iterated values
+				if(sid != value.nodeValue.stringID)
+					AccumulatePartialSumsForNominalStringIdValueIfExists(
+						r_dist_eval, sid, query_feature_index, *column, high_accuracy);
+			});
 
-		//the next closest string will have an edit distance of 1
-		return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy);
+		return r_dist_eval.ComputeDistanceTermNominalNextSmallest(nonmatch_dist_term, query_feature_index, high_accuracy);
+	}
+	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC)
+	{
+		//initialize to zero, because if don't find an exact match, but there are distance terms of
+		//0, then need to accumulate those later
+		double accumulated_term = 0.0;
+		if(value.nodeType == ENIVT_NUMBER)
+			accumulated_term = AccumulatePartialSumsForNominalNumberValueIfExists(
+				r_dist_eval, value.nodeValue.number, query_feature_index, *column, high_accuracy);
+
+		double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, high_accuracy);
+		//if the next closest match is larger, no need to compute any more values
+		if(nonmatch_dist_term > accumulated_term)
+			return nonmatch_dist_term;
+
+		//need to accumulate every other number value whose distance term is less than or equal to accumulated_term
+		r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsNumeric(accumulated_term, query_feature_index, high_accuracy,
+			[this, &value, &r_dist_eval, &column, query_feature_index, high_accuracy](double number_value)
+			{
+				//the target's own value was already accumulated above, so only accumulate the other iterated values
+				if(!EqualIncludingNaN(number_value, value.nodeValue.number))
+					AccumulatePartialSumsForNominalNumberValueIfExists(
+						r_dist_eval, number_value, query_feature_index, *column, high_accuracy);
+			});
+
+		return r_dist_eval.ComputeDistanceTermNominalNextSmallest(nonmatch_dist_term, query_feature_index, high_accuracy);
+	}
-	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE)
+	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_CODE
+		|| effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE)
 	{
 		//compute partial sums for all code of matching size
 		size_t code_size = 1;
-		if(value_type == ENIVT_CODE)
-			code_size = EvaluableNode::GetDeepSize(value.code);
+		if(value.nodeType == ENIVT_CODE)
+			code_size = EvaluableNode::GetDeepSize(value.nodeValue.code);
 
 		auto value_found = column->valueCodeSizeToIndices.find(code_size);
 		if(value_found != end(column->valueCodeSizeToIndices))
@@ -1052,25 +1085,36 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 			ComputeAndAccumulatePartialSums(r_dist_eval, entity_indices, query_feature_index, absolute_feature_index, high_accuracy);
 		}
 
-		//next most similar code must be at least a distance of 1 edit away
-		return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy);
-	}
-	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_STRING)
-	{
-		//TODO 17631: finish this; need to account for all values that are smaller than the current
-	}
-	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC)
-	{
-		//TODO 17631: finish this; need to account for all values that are smaller than the current
+		if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_CODE)
+		{
+			double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, high_accuracy);
+			return nonmatch_dist_term;
+		}
+		else //RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE
+		{
+			//next most similar code must be at least a distance of 1 edit away
+			return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy);
+		}
 	}
-	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_CODE)
+	else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_STRING)
 	{
-		//TODO 17631: finish this; need to account for all values that are smaller than the current
+		if(value.nodeType == ENIVT_STRING_ID)
+		{
+			auto value_found = column->stringIdValueToIndices.find(value.nodeValue.stringID);
+			if(value_found != end(column->stringIdValueToIndices))
+			{
+				double term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy);
+				AccumulatePartialSums(*(value_found->second), query_feature_index, term);
+			}
+		}
+
+		//the next closest string will have an edit distance of 1
+		return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy);
 	}
 	//else feature_type == FDT_CONTINUOUS_NUMERIC or FDT_CONTINUOUS_UNIVERSALLY_NUMERIC
 
 	//if not a number or no numbers available, then no size
-	if(value_type != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0)
+	if(value.nodeType != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0)
 		return GetMaxDistanceTermForContinuousFeature(r_dist_eval, query_feature_index, absolute_feature_index, high_accuracy);
 
 	bool cyclic_feature = r_dist_eval.distEvaluator->IsFeatureCyclic(query_feature_index);
@@ -1078,14 +1122,14 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 	if(cyclic_feature)
 		cycle_length = feature_attribs.typeAttributes.maxCyclicDifference;
 
-	auto [value_index, exact_index_found] = column->FindClosestValueIndexForValue(value.number, cycle_length);
+	auto [value_index, exact_index_found] = column->FindClosestValueIndexForValue(value.nodeValue.number, cycle_length);
 
 	double term = 0.0;
 	if(exact_index_found)
 		term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy);
 	else
 		term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonNullRegular(
-			value.number - column->sortedNumberValueEntries[value_index]->value.number, query_feature_index, high_accuracy);
+			value.nodeValue.number - column->sortedNumberValueEntries[value_index]->value.number, query_feature_index, high_accuracy);
 
 	size_t num_entities_computed = AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term);
 
@@ -1123,7 +1167,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 			if(lower_value_index > 0)
 			{
 				next_lower_index = lower_value_index - 1;
-				lower_diff = std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number);
+				lower_diff = std::abs(value.nodeValue.number - column->sortedNumberValueEntries[next_lower_index]->value.number);
 				compute_lower = true;
 			}
 		}
@@ -1140,7 +1184,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 				break;
 
 			next_lower_index = next_index;
-			lower_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number), cycle_length);
+			lower_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference(
+				std::abs(value.nodeValue.number - column->sortedNumberValueEntries[next_lower_index]->value.number),
+				cycle_length);
 			compute_lower = true;
 		}
 
@@ -1153,7 +1199,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 			if(upper_value_index + 1 < num_unique_number_values)
 			{
 				next_upper_index = upper_value_index + 1;
-				upper_diff = std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number);
+				upper_diff = std::abs(value.nodeValue.number - column->sortedNumberValueEntries[next_upper_index]->value.number);
 				compute_upper = true;
 			}
 		}
@@ -1170,7 +1216,8 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 				break;
 
 			next_upper_index = next_index;
-			upper_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number), cycle_length);
+			upper_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference(
+				std::abs(value.nodeValue.number - column->sortedNumberValueEntries[next_upper_index]->value.number), cycle_length);
 			compute_upper = true;
 		}
 
@@ -1253,7 +1300,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R
 void SeparableBoxFilterDataStore::PopulateInitialPartialSums(RepeatedGeneralizedDistanceEvaluator &r_dist_eval,
 	size_t top_k, size_t radius_column_index, bool high_accuracy,
 	BitArrayIntegerSet &enabled_indices, std::vector<double> &min_unpopulated_distances, std::vector<double> &min_distance_by_unpopulated_count)
-{;
+{
 	if(radius_column_index < columnData.size())
 	{
 		auto &partial_sums = parametersAndBuffers.partialSums;
@@ -1396,14 +1443,11 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener
 	feature_data.internedNumberIndexToNumberValue = nullptr;
 	feature_data.internedDistanceTerms.clear();
 
-	if(feature_type == GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC
-		|| feature_type == GeneralizedDistanceEvaluator::FDT_NOMINAL_STRING
-		|| feature_type == GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE
+	if(feature_attribs.IsFeatureNominal()
 		|| feature_type == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_STRING
 		|| feature_type == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE)
 	{
-		feature_data.targetValue = position_value;
-		feature_data.targetValueType = position_value_type;
+		feature_data.targetValue = EvaluableNodeImmediateValueWithType(position_value, position_value_type);
 
 		if(feature_type == GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC)
 			effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC;
@@ -1415,6 +1459,9 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener
 			effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_STRING;
 		else if(feature_type == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE)
 			effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE;
+
+		if(feature_attribs.IsFeatureNominal())
+			r_dist_eval.ComputeAndStoreNominalDistanceTerms(query_feature_index);
 	}
 	else // feature_type is some form of continuous numeric
 	{
@@ -1422,8 +1469,7 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener
 		double position_value_numeric = (position_value_type == ENIVT_NUMBER
 			? position_value.number : std::numeric_limits<double>::quiet_NaN());
 
-		feature_data.targetValue = position_value_numeric;
-		feature_data.targetValueType = ENIVT_NUMBER;
+		feature_data.targetValue = EvaluableNodeImmediateValueWithType(position_value_numeric);
 
 		//set up effective_feature_type
 		auto &column_data = columnData[feature_attribs.featureIndex];
@@ -1439,7 +1485,7 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener
 			else
 				effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED;
 
-			r_dist_eval.ComputeAndStoreInternedNumberValuesAndDistanceTerms(position_value_numeric, query_feature_index, &column_data->internedNumberIndexToNumberValue);
+			r_dist_eval.ComputeAndStoreInternedNumberValuesAndDistanceTerms(query_feature_index, &column_data->internedNumberIndexToNumberValue);
 		}
 		else
 		{
diff --git a/src/Amalgam/SeparableBoxFilterDataStore.h b/src/Amalgam/SeparableBoxFilterDataStore.h
index 1a5b70e7..d00ece43 100644
--- a/src/Amalgam/SeparableBoxFilterDataStore.h
+++ b/src/Amalgam/SeparableBoxFilterDataStore.h
@@ -652,6 +652,38 @@ class SeparableBoxFilterDataStore
 			return AccumulatePartialSums(entity_indices.GetBaisContainer(), query_feature_index, term);
 	}
 
+	//accumulates the partial sums for the entities in column whose number value exactly matches value
+	// returns the distance term that was accumulated, or 0.0 without accumulating if value was not found
+	inline double AccumulatePartialSumsForNominalNumberValueIfExists(RepeatedGeneralizedDistanceEvaluator &r_dist_eval,
+		double value, size_t query_feature_index, SBFDSColumnData &column, bool high_accuracy)
+	{
+		auto [entry_index, value_exists] = column.FindExactIndexForValue(value);
+		if(!value_exists)
+			return 0.0;
+
+		//value is present, so compute its distance term once and apply it to all entities that have it
+		auto &entities_with_value = column.sortedNumberValueEntries[entry_index]->indicesWithValue;
+		double dist_term = r_dist_eval.ComputeDistanceTermNominalNumeric(value, true, query_feature_index, high_accuracy);
+		AccumulatePartialSums(entities_with_value, query_feature_index, dist_term);
+		return dist_term;
+	}
+
+	//accumulates the partial sums for the entities in column whose string id value matches value
+	// returns the distance term that was accumulated, or 0.0 without accumulating if value was not found
+	inline double AccumulatePartialSumsForNominalStringIdValueIfExists(RepeatedGeneralizedDistanceEvaluator &r_dist_eval,
+		StringInternPool::StringID value, size_t query_feature_index, SBFDSColumnData &column, bool high_accuracy)
+	{
+		auto sid_entry = column.stringIdValueToIndices.find(value);
+		if(sid_entry == end(column.stringIdValueToIndices))
+			return 0.0;
+
+		//value is present, so compute its distance term once and apply it to all entities that have it
+		auto &entities_with_value = *(sid_entry->second);
+		double dist_term = r_dist_eval.ComputeDistanceTermNominalString(value, true, query_feature_index, high_accuracy);
+		AccumulatePartialSums(entities_with_value, query_feature_index, dist_term);
+		return dist_term;
+	}
+
 	//search a projection width in terms of bucket count or number of collected entities
 	//accumulates partial sums
 	//searches until num_entities_to_populate are popluated or other heuristics have been reached
@@ -725,7 +757,7 @@ class SeparableBoxFilterDataStore
 		{
 			auto &feature_attribs = r_dist_eval.distEvaluator->featureAttribs[query_feature_index];
 			return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicOneNonNullRegular(
-				feature_data.targetValue.number - GetValue(entity_index, feature_attribs.featureIndex).number,
+				feature_data.targetValue.nodeValue.number - GetValue(entity_index, feature_attribs.featureIndex).number,
 				query_feature_index, high_accuracy);
 		}
 
@@ -742,7 +774,7 @@ class SeparableBoxFilterDataStore
 			auto &column_data = columnData[feature_attribs.featureIndex];
 			if(column_data->numberIndices.contains(entity_index))
 				return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicOneNonNullRegular(
-					feature_data.targetValue.number - GetValue(entity_index, feature_attribs.featureIndex).number,
+					feature_data.targetValue.nodeValue.number - GetValue(entity_index, feature_attribs.featureIndex).number,
 					query_feature_index, high_accuracy);
 			else
 				return r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
@@ -754,7 +786,7 @@ class SeparableBoxFilterDataStore
 			auto &column_data = columnData[feature_attribs.featureIndex];
 			if(column_data->numberIndices.contains(entity_index))
 				return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousOneNonNullRegular(
-					feature_data.targetValue.number - GetValue(entity_index, feature_attribs.featureIndex).number,
+					feature_data.targetValue.nodeValue.number - GetValue(entity_index, feature_attribs.featureIndex).number,
 					query_feature_index, high_accuracy);
 			else
 				return r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy);
@@ -777,11 +809,11 @@ class SeparableBoxFilterDataStore
 			auto &column_data = columnData[feature_attribs.featureIndex];
 			if(column_data->stringIdIndices.contains(entity_index))
 				return r_dist_eval.ComputeDistanceTermNominalString(
-					GetValue(entity_index, feature_attribs.featureIndex).stringID,
-					query_feature_index, true, high_accuracy);
+					GetValue(entity_index, feature_attribs.featureIndex).stringID, true,
+					query_feature_index, high_accuracy);
 			else
-				return r_dist_eval.ComputeDistanceTermNominalString(string_intern_pool.EMPTY_STRING_ID,
-					query_feature_index, false, high_accuracy);
+				return r_dist_eval.ComputeDistanceTermNominalString(string_intern_pool.EMPTY_STRING_ID, false,
+					query_feature_index, high_accuracy);
 		}
 
 		case RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC:
@@ -790,11 +822,11 @@ class SeparableBoxFilterDataStore
 			auto &column_data = columnData[feature_attribs.featureIndex];
 			if(column_data->numberIndices.contains(entity_index))
 				return r_dist_eval.ComputeDistanceTermNominalNumeric(
-					GetValue(entity_index, feature_attribs.featureIndex).number,
-					query_feature_index, true, high_accuracy);
+					GetValue(entity_index, feature_attribs.featureIndex).number, true,
+					query_feature_index, high_accuracy);
 			else
-				return r_dist_eval.ComputeDistanceTermNominalNumeric(0.0,
-					query_feature_index, false, high_accuracy);
+				return r_dist_eval.ComputeDistanceTermNominalNumeric(0.0, false,
+					query_feature_index, high_accuracy);
 		}
 
 		default:
diff --git a/src/Amalgam/evaluablenode/EvaluableNode.h b/src/Amalgam/evaluablenode/EvaluableNode.h
index e0d82641..0d62dbed 100644
--- a/src/Amalgam/evaluablenode/EvaluableNode.h
+++ b/src/Amalgam/evaluablenode/EvaluableNode.h
@@ -1062,6 +1062,11 @@ class EvaluableNodeImmediateValueWithType
 		: nodeType(ENIVT_NULL)
 	{	}
 
+	constexpr EvaluableNodeImmediateValueWithType(EvaluableNodeImmediateValue node_value,
+		EvaluableNodeImmediateValueType node_type)
+		: nodeType(node_type), nodeValue(node_value)
+	{	}
+
 	__forceinline EvaluableNodeImmediateValueWithType(bool value)
 	{
 		nodeType = ENIVT_NUMBER;
@@ -1254,6 +1259,12 @@ class EvaluableNodeImmediateValueWithType
 		return EvaluableNodeImmediateValue::AreEqual(a.nodeType, a.nodeValue, b.nodeType, b.nodeValue);
 	}
 
+	//returns true if this value is a null or null equivalent, as decided by EvaluableNodeImmediateValue::IsNullEquivalent for nodeType and nodeValue
+	constexpr bool IsNullEquivalent()
+	{
+		return EvaluableNodeImmediateValue::IsNullEquivalent(nodeType, nodeValue);
+	}
+
 	EvaluableNodeImmediateValueType nodeType;
 	EvaluableNodeImmediateValue nodeValue;
 };