From 9d24ae31c3f80e358dfcc931057adb6f7d263806 Mon Sep 17 00:00:00 2001 From: howsohazard <143410553+howsohazard@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:48:14 -0400 Subject: [PATCH] 20025: Implements more sparse deviation logic, fixes rare edge cases with null string values and code/string distances (#121) --- src/Amalgam/GeneralizedDistance.h | 412 +++++++++++++++----- src/Amalgam/HashMaps.h | 35 ++ src/Amalgam/SeparableBoxFilterDataStore.cpp | 258 +++++++----- src/Amalgam/SeparableBoxFilterDataStore.h | 54 ++- src/Amalgam/evaluablenode/EvaluableNode.h | 11 + 5 files changed, 548 insertions(+), 222 deletions(-) diff --git a/src/Amalgam/GeneralizedDistance.h b/src/Amalgam/GeneralizedDistance.h index ae2de42b..9a270646 100644 --- a/src/Amalgam/GeneralizedDistance.h +++ b/src/Amalgam/GeneralizedDistance.h @@ -100,6 +100,40 @@ class GeneralizedDistanceEvaluator typeAttributes.maxCyclicDifference = std::numeric_limits::quiet_NaN(); } + //returns true if the feature is nominal + __forceinline bool IsFeatureNominal() + { + return (featureType <= GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE); + } + + //returns true if the feature is nominal + __forceinline bool IsFeatureContinuous() + { + return (featureType >= GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC); + } + + //returns true if the feature is cyclic + __forceinline bool IsFeatureCyclic() + { + return (featureType == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC); + } + + //returns true if the feature has a deviation + __forceinline bool DoesFeatureHaveDeviation() + { + return (deviation > 0); + } + + //returns true if the feature is a nominal that only has one difference value for match and one for nonmatch + __forceinline bool IsFeatureSymmetricNominal() + { + if(!IsFeatureNominal()) + return false; + + return (nominalNumberSparseDeviationMatrix.size() == 0 + && nominalStringSparseDeviationMatrix.size() == 0); + } + //the type of comparison for each feature // this type is 32-bit 
aligned to make sure the whole structure is aligned FeatureDifferenceType featureType; @@ -131,70 +165,29 @@ class GeneralizedDistanceEvaluator double deviationReciprocal; //contains the deviations for a given nominal value for each other nominal value + //if the nominal value is not found, then the attribute defaultDeviation should be used template> - class SparseNominalDeviationValues + class SparseNominalDeviationValues : public SmallMap { public: inline SparseNominalDeviationValues() : defaultDeviation(0.0) { } - using value_type = NominalValueType; - - //returns an iterator to deviations that matches the nominal key - inline auto FindDeviationIterator(NominalValueType key) - { - return std::find_if(begin(deviations), end(deviations), - [key](auto i) - { return EqualComparison{}(i.first, key); } - ); - } - - //deviations for each value; unknown should be stored as special nonvalue (e.g., NaN, NaS) - //store as a vector of pairs instead of a map because either only one value will be looked up once, - //in which case there's no advantage to having a map, or many distance term values will be looked up - //repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map - std::vector> deviations; double defaultDeviation; }; - //contains the deviations for a given nominal value for each other nominal value - template> - class SparseNominalDeviationMatrix - { - public: - inline SparseNominalDeviationMatrix() - { } - - using value_type = NominalValueType; - - //returns an iterator to deviation values that matches the nominal key - inline auto FindDeviationValuesIterator(NominalValueType key) - { - return std::find_if(begin(deviationValues), end(deviationValues), - [key](auto i) - { return EqualComparison{}(i.first, key); } - ); - } - - //deviation values for each value; unknown should be stored as special nonvalue (e.g., NaN, NaS) - //store as a vector of pairs instead of a map because either only one value will be looked up once, - //in which 
case there's no advantage to having a map, or many distance term values will be looked up - //repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map - std::vector>> deviationValues; - }; - //sparse deviation matrix if the nominal is a string //store as a vector of pairs instead of a map because either only one value will be looked up once, //in which case there's no advantage to having a map, or many distance term values will be looked up //repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map - SparseNominalDeviationMatrix nominalStringSparseDeviationMatrix; + SmallMap> nominalStringSparseDeviationMatrix; //sparse deviation matrix if the nominal is a number //store as a vector of pairs instead of a map because either only one value will be looked up once, //in which case there's no advantage to having a map, or many distance term values will be looked up //repeatedly, which is handled by a RepeatedGeneralizedDistanceEvaluator, which uses a map - SparseNominalDeviationMatrix nominalNumberSparseDeviationMatrix; + SmallMap, DoubleNanHashComparator> nominalNumberSparseDeviationMatrix; //distance term to use if both values being compared are unknown //the difference will be NaN if unknown @@ -310,36 +303,31 @@ class GeneralizedDistanceEvaluator //returns true if the feature is nominal __forceinline bool IsFeatureNominal(size_t feature_index) { - return (featureAttribs[feature_index].featureType <= GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE); + return featureAttribs[feature_index].IsFeatureNominal(); } //returns true if the feature is nominal __forceinline bool IsFeatureContinuous(size_t feature_index) { - return (featureAttribs[feature_index].featureType >= GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC); + return featureAttribs[feature_index].IsFeatureContinuous(); } //returns true if the feature is cyclic __forceinline bool IsFeatureCyclic(size_t feature_index) { - return 
(featureAttribs[feature_index].featureType == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC); + return featureAttribs[feature_index].IsFeatureCyclic(); } //returns true if the feature has a deviation __forceinline bool DoesFeatureHaveDeviation(size_t feature_index) { - return (featureAttribs[feature_index].deviation > 0); + return featureAttribs[feature_index].DoesFeatureHaveDeviation(); } //returns true if the feature is a nominal that only has one difference value for match and one for nonmatch __forceinline bool IsFeatureSymmetricNominal(size_t feature_index) { - if(!IsFeatureNominal(feature_index)) - return false; - - auto &feature_attribs = featureAttribs[feature_index]; - return (feature_attribs.nominalNumberSparseDeviationMatrix.deviationValues.size() == 0 - && feature_attribs.nominalStringSparseDeviationMatrix.deviationValues.size() == 0); + return featureAttribs[feature_index].IsFeatureSymmetricNominal(); } //returns true if a known to unknown distance term would be less than or same as an exact match @@ -403,15 +391,15 @@ class GeneralizedDistanceEvaluator } double deviation = std::numeric_limits::quiet_NaN(); - if(a_type == ENIVT_NUMBER && feature_attribs.nominalNumberSparseDeviationMatrix.deviationValues.size() > 0) + if(a_type == ENIVT_NUMBER && feature_attribs.nominalNumberSparseDeviationMatrix.size() > 0) { - auto outer_it = feature_attribs.nominalNumberSparseDeviationMatrix.FindDeviationValuesIterator(a.number); - if(outer_it != std::end(feature_attribs.nominalNumberSparseDeviationMatrix.deviationValues)) + auto outer_it = feature_attribs.nominalNumberSparseDeviationMatrix.find(a.number); + if(outer_it != std::end(feature_attribs.nominalNumberSparseDeviationMatrix)) { auto &ndd = outer_it->second; - auto inner_it = ndd.FindDeviationIterator(b.number); + auto inner_it = ndd.find(b.number); - if(inner_it == end(ndd.deviations)) + if(inner_it == end(ndd)) deviation = ndd.defaultDeviation; else deviation = inner_it->second; @@ -424,15 
+412,15 @@ class GeneralizedDistanceEvaluator deviation = feature_attribs.knownToUnknownDistanceTerm.deviation; } } - else if(a_type == ENIVT_STRING_ID && feature_attribs.nominalStringSparseDeviationMatrix.deviationValues.size() > 0) + else if(a_type == ENIVT_STRING_ID && feature_attribs.nominalStringSparseDeviationMatrix.size() > 0) { - auto outer_it = feature_attribs.nominalStringSparseDeviationMatrix.FindDeviationValuesIterator(a.stringID); - if(outer_it != std::end(feature_attribs.nominalStringSparseDeviationMatrix.deviationValues)) + auto outer_it = feature_attribs.nominalStringSparseDeviationMatrix.find(a.stringID); + if(outer_it != std::end(feature_attribs.nominalStringSparseDeviationMatrix)) { auto &ndd = outer_it->second; - auto inner_it = ndd.FindDeviationIterator(b.stringID); + auto inner_it = ndd.find(b.stringID); - if(inner_it == end(ndd.deviations)) + if(inner_it == end(ndd)) deviation = ndd.defaultDeviation; else deviation = inner_it->second; @@ -505,8 +493,9 @@ class GeneralizedDistanceEvaluator return -std::numeric_limits::infinity(); } + //TODO 17631: remove this? //computes the base of the difference between two nominal values that exactly match without exponentiation - __forceinline double ComputeDistanceTermNominalBaseExactMatchFromDeviation(size_t index, double deviation, bool high_accuracy) + __forceinline double ComputeDistanceTermBaseNominalExactMatchFromDeviation(size_t index, double deviation, bool high_accuracy) { if(!DoesFeatureHaveDeviation(index) || computeSurprisal) return 0.0; @@ -516,15 +505,16 @@ class GeneralizedDistanceEvaluator //TODO 17631: genericize this for use in ComputeDistanceTermNominal -- may need to take in two deviations, // exact match deviation and nonmatch deviation? Or just change the calls to pass in 1-deviation? 
- //make sure lines up with ComputeDistanceTermNominalBaseExactMatchFromDeviation for exact match, and maybe remove ComputeDistanceTermNominalBaseExactMatchFromDeviation + //make sure lines up with ComputeDistanceTermBaseNominalExactMatchFromDeviation for exact match, and maybe remove ComputeDistanceTermBaseNominalExactMatchFromDeviation //computes the base of the difference between two nominal values that do not match without exponentiation - __forceinline double ComputeDistanceTermNominalBaseNonMatchFromDeviation(size_t index, double deviation, bool high_accuracy) + __forceinline double ComputeDistanceTermBaseNominalNonMatchFromDeviation(size_t index, double deviation, bool high_accuracy) { if(computeSurprisal) { //need to have at least two classes in existence double nominal_count = std::max(featureAttribs[index].typeAttributes.nominalCount, 2.0); + //TODO 17631: change to be weighted average: prob of nominal * deviation of random guessing double prob_max_entropy_match = 1 / nominal_count; //find probability that the correct class was selected @@ -533,10 +523,10 @@ class GeneralizedDistanceEvaluator //find the probability that any other class besides the correct class was selected //divide the probability among the other classes - double prop_class_given_nonmatch = (1 - prob_class_given_match) / (nominal_count - 1); + double prob_class_given_nonmatch = (1 - prob_class_given_match) / (nominal_count - 1); double surprisal_class_given_match = -std::log(prob_class_given_match); - double surprisal_class_given_nonmatch = -std::log(prop_class_given_nonmatch); + double surprisal_class_given_nonmatch = -std::log(prob_class_given_nonmatch); //the surprisal of the class matching on a different value is the difference between //how surprised it would be given a nonmatch but without the surprisal given a match @@ -565,17 +555,86 @@ class GeneralizedDistanceEvaluator } } + //TODO 17631: finish this and integrate it + //returns the base of the distance term for nominal 
comparisons for a match + //given the probablility of the class being observed given that it is a match + __forceinline double ComputeDistanceTermBaseNominalMatchFromMatchProbabilities(size_t index, + double prob_class_given_match, bool high_accuracy) + { + if(!DoesFeatureHaveDeviation(index) || computeSurprisal) + return 0.0; + + return 1 - prob_class_given_match; + } + + //TODO 17631: finish this and integrate it + // for a given prob_class_given_match, which is the probability that the classes compared should have been a match, + // and prob_class_given_nonmatch, the probability that the particular comparison class does not match + __forceinline double ComputeDistanceTermBaseNominalNonmatchFromMatchProbabilities(size_t index, + double prob_class_given_match, double prob_class_given_nonmatch, bool high_accuracy) + { + if(computeSurprisal) + { + double surprisal_class_given_match = -std::log(prob_class_given_match); + double surprisal_class_given_nonmatch = -std::log(prob_class_given_nonmatch); + + //the surprisal of the class matching on a different value is the difference between + //how surprised it would be given a nonmatch but without the surprisal given a match + double dist_term = surprisal_class_given_nonmatch - surprisal_class_given_match; + + //ensure it doesn't go below zero in case of numerical precision issues + return std::max(dist_term, 0.0); + } + else if(DoesFeatureHaveDeviation(index)) + { + //add together uncertainties from a nonmatch, + // plus a nonmatch of a nonmatch to get a match + double dist_term = (1 - prob_class_given_match) + (1 - prob_class_given_nonmatch); + return dist_term; + } + else + { + return 1.0; + } + } + + //TODO 17631: finish this and integrate it + //for inputs to this method, if not using SDM, b_deviation = (1 - a_deviation) / (nominal_count - 1) + __forceinline double ComputeDistanceTermNominalBaseFromDeviations(size_t index, bool match, + double match_deviation, double nonmatch_deviation, bool high_accuracy) + { + 
//need to have at least two classes in existence + double nominal_count = std::max(featureAttribs[index].typeAttributes.nominalCount, 2.0); + //TODO 17631: change to be weighted average: prob of nominal * deviation of random guessing + double prob_max_entropy_match = 1 / nominal_count; + + //find probability that the correct class was selected + //can't go below base probability of guessing + double prob_class_given_match = std::max(1 - match_deviation, prob_max_entropy_match); + + //find the probability that any other class besides the correct class was selected, + //but cannot exceed the probability of a match + double prob_class_given_nonmatch = std::min(1 - nonmatch_deviation, prob_class_given_match); + + if(match) + return ComputeDistanceTermBaseNominalMatchFromMatchProbabilities(index, + prob_class_given_match, high_accuracy); + else + return ComputeDistanceTermBaseNominalNonmatchFromMatchProbabilities(index, + prob_class_given_match, prob_class_given_nonmatch, high_accuracy); + } + //computes the distance term for a nominal when two universally symmetric nominals are equal __forceinline double ComputeDistanceTermNominalUniversallySymmetricExactMatch(size_t index, bool high_accuracy) { - double dist_term = ComputeDistanceTermNominalBaseExactMatchFromDeviation(index, featureAttribs[index].deviation, high_accuracy); + double dist_term = ComputeDistanceTermBaseNominalExactMatchFromDeviation(index, featureAttribs[index].deviation, high_accuracy); return ContextuallyExponentiateAndWeightDifferenceTerm(dist_term, index, high_accuracy); } //computes the distance term for a nominal when two universally symmetric nominals are not equal __forceinline double ComputeDistanceTermNominalUniversallySymmetricNonMatch(size_t index, bool high_accuracy) { - double dist_term = ComputeDistanceTermNominalBaseNonMatchFromDeviation(index, featureAttribs[index].deviation, high_accuracy); + double dist_term = ComputeDistanceTermBaseNominalNonMatchFromDeviation(index, 
featureAttribs[index].deviation, high_accuracy); return ContextuallyExponentiateAndWeightDifferenceTerm(dist_term, index, high_accuracy); } @@ -722,6 +781,7 @@ class GeneralizedDistanceEvaluator { //need to have at least two classes in existence double nominal_count = std::max(featureAttribs[index].typeAttributes.nominalCount, 2.0); + //TODO 17631: change to be weighted average: prob of nominal * deviation of random guessing double prob_max_entropy_match = 1 / nominal_count; //find probability that the correct class was selected @@ -926,7 +986,7 @@ class GeneralizedDistanceEvaluator for(size_t i = 0; i < featureAttribs.size(); i++) { auto &feature_attribs = featureAttribs[i]; - if(IsFeatureNominal(i)) + if(feature_attribs.IsFeatureNominal()) { //ensure if a feature has deviations they're not too small to underflow if(DoesFeatureHaveDeviation(i)) @@ -1055,8 +1115,64 @@ class RepeatedGeneralizedDistanceEvaluator : distEvaluator(dist_evaluator) { } + //for the feature index, computes and stores the distance terms for nominal values + inline void ComputeAndStoreNominalDistanceTerms(size_t index) + { + bool compute_approximate = distEvaluator->NeedToPrecomputeApproximate(); + + //make sure there's room for the interned index + if(featureData.size() <= index) + featureData.resize(index + 1); + + auto &feature_attributes = distEvaluator->featureAttribs[index]; + auto &feature_data = featureData[index]; + + //since most of the computations will be using approximate if it is needed, + //only set to high accuracy if not using approximate + feature_data.precomputedNominalDistanceTermsHighAccuracy = (!compute_approximate); + + if(feature_data.targetValue.nodeType == ENIVT_NUMBER) + { + auto &sdm = feature_attributes.nominalNumberSparseDeviationMatrix; + auto target_value = feature_data.targetValue.nodeValue.number; + + auto deviations_for_value = sdm.find(target_value); + if(deviations_for_value != end(sdm)) + { + auto &deviations = deviations_for_value->second; + for(auto 
&[value, deviation] : deviations) + { + //TODO 17631: finish this + double dist_term = deviation;// = distEvaluator->ComputeDistanceTermNominalBaseFromDeviations(index, value == sid, ) + feature_data.nominalNumberDistanceTerms.emplace(value, dist_term); + } + + //TODO: deviations.defaultDeviation + } + } + else if(feature_data.targetValue.nodeType == ENIVT_STRING_ID) + { + auto &sdm = feature_attributes.nominalStringSparseDeviationMatrix; + auto target_sid = feature_data.targetValue.nodeValue.stringID; + + auto deviations_for_value = sdm.find(target_sid); + if(deviations_for_value != end(sdm)) + { + auto &deviations = deviations_for_value->second; + for(auto &[value, deviation] : deviations) + { + //TODO 17631: finish this + double dist_term = deviation;// distEvaluator->ComputeDistanceTermNominalBaseFromDeviations(index, value == sid, ) + feature_data.nominalStringDistanceTerms.emplace(value, dist_term); + } + + //TODO: deviations.defaultDeviation + } + } + } + //for the feature index, computes and stores the distance terms as measured from value to each interned value - inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(double value, size_t index, std::vector *interned_values) + inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(size_t index, std::vector *interned_values) { bool compute_accurate = distEvaluator->NeedToPrecomputeAccurate(); bool compute_approximate = distEvaluator->NeedToPrecomputeApproximate(); @@ -1065,39 +1181,41 @@ class RepeatedGeneralizedDistanceEvaluator if(featureData.size() <= index) featureData.resize(index + 1); - auto &feature_interns = featureData[index]; - feature_interns.internedNumberIndexToNumberValue = interned_values; + auto &feature_data = featureData[index]; + feature_data.internedNumberIndexToNumberValue = interned_values; if(interned_values == nullptr) { - feature_interns.internedDistanceTerms.clear(); + feature_data.internedDistanceTerms.clear(); return; } - 
feature_interns.internedDistanceTerms.resize(interned_values->size()); + feature_data.internedDistanceTerms.resize(interned_values->size()); auto &feature_attribs = distEvaluator->featureAttribs[index]; + + double value = feature_data.targetValue.GetValueAsNumber(); if(FastIsNaN(value)) { //first entry is unknown-unknown distance - feature_interns.internedDistanceTerms[0] = feature_attribs.unknownToUnknownDistanceTerm; + feature_data.internedDistanceTerms[0] = feature_attribs.unknownToUnknownDistanceTerm; auto k_to_unk = feature_attribs.knownToUnknownDistanceTerm; - for(size_t i = 1; i < feature_interns.internedDistanceTerms.size(); i++) - feature_interns.internedDistanceTerms[i] = k_to_unk; + for(size_t i = 1; i < feature_data.internedDistanceTerms.size(); i++) + feature_data.internedDistanceTerms[i] = k_to_unk; } else { //first entry is known-unknown distance - feature_interns.internedDistanceTerms[0] = feature_attribs.knownToUnknownDistanceTerm; + feature_data.internedDistanceTerms[0] = feature_attribs.knownToUnknownDistanceTerm; - for(size_t i = 1; i < feature_interns.internedDistanceTerms.size(); i++) + for(size_t i = 1; i < feature_data.internedDistanceTerms.size(); i++) { double difference = value - (*interned_values)[i]; if(compute_accurate) - feature_interns.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, true), true); + feature_data.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, true), true); if(compute_approximate) - feature_interns.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, false), false); + feature_data.internedDistanceTerms[i].SetValue(distEvaluator->ComputeDistanceTermContinuousNonNullRegular(difference, index, false), false); } } } @@ -1127,42 +1245,125 @@ class RepeatedGeneralizedDistanceEvaluator //returns the inner term of the Minkowski norm summation 
given that the feature is nominal //and the data type being compared from is numeric - //if types_match is false, then the value is ignored - __forceinline double ComputeDistanceTermNominalNumeric(double value, - size_t index, bool types_match, bool high_accuracy) + //if value_type_numeric is false, then the value is ignored + __forceinline double ComputeDistanceTermNominalNumeric(double value, bool value_type_numeric, + size_t index, bool high_accuracy) { - //TODO 17631: implement this - return 0.0; + auto &feature_data = featureData[index]; + if(feature_data.nominalNumberDistanceTerms.size() > 0) + { + //TODO 17631: implement this + } + + if(value_type_numeric && value == feature_data.targetValue.GetValueAsNumber()) + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + else + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); } //returns the inner term of the Minkowski norm summation given that the feature is nominal //and the data type being compared from is string - //if types_match is false, then the value is ignored - __forceinline double ComputeDistanceTermNominalString(StringInternPool::StringID value, - size_t index, bool types_match, bool high_accuracy) + //if value_type_string is false, then the value is ignored + __forceinline double ComputeDistanceTermNominalString(StringInternPool::StringID value, bool value_type_string, + size_t index, bool high_accuracy) { - //TODO 17631: implement this - return 0.0; + auto &feature_data = featureData[index]; + if(feature_data.nominalStringDistanceTerms.size() > 0) + { + //TODO 17631: implement this + } + + if(value_type_string && value == feature_data.targetValue.GetValueAsStringIDIfExists()) + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + else + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); + } + + 
//returns the distance term given that it is nominal + __forceinline double ComputeDistanceTermNominal(EvaluableNodeImmediateValue other_value, + EvaluableNodeImmediateValueType other_type, size_t index, bool high_accuracy) + { + //TODO 17631: make this more efficient, placeholder for now + auto &feature_data = featureData[index]; + return distEvaluator->ComputeDistanceTermNominal(feature_data.targetValue.nodeValue, other_value, + feature_data.targetValue.nodeType, other_type, index, high_accuracy); + } + + //returns the smallest nonmatching distance term for the nominal given other_value + __forceinline double ComputeDistanceTermNominalSmallestNonmatch(double other_value, + size_t index, bool high_accuracy) + { + //TODO 17631: implement this, placeholder for now + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); + } + + //returns the smallest nonmatching distance term for the nominal given other_value + __forceinline double ComputeDistanceTermNominalSmallestNonmatch(StringInternPool::StringID other_value, + size_t index, bool high_accuracy) + { + //TODO 17631: implement this, placeholder for now + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); + } + + //returns the smallest nonmatching distance term regardless of value + __forceinline double ComputeDistanceTermNominalSmallestNonmatch(size_t index, bool high_accuracy) + { + //TODO 17631: implement this, placeholder for now + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); + } + + //for all nominal distance term values that equal dist_term for the given high_accuracy, + //it will call func passing in the numeric value + template + __forceinline void IterateOverNominalValuesWithLessOrEqualDistanceTermsNumeric(double dist_term, size_t index, bool high_accuracy, + Func func) + { + auto &feature_data = featureData[index]; + for(auto &entry : 
feature_data.nominalNumberDistanceTerms) + { + if(entry.second <= dist_term) + func(entry.first); + } + } + + //for all nominal distance term values that equal dist_term for the given high_accuracy, + //it will call func passing in the string id value + template + __forceinline void IterateOverNominalValuesWithLessOrEqualDistanceTermsString(double dist_term, size_t index, bool high_accuracy, + Func func) + { + auto &feature_data = featureData[index]; + for(auto &entry : feature_data.nominalStringDistanceTerms) + { + if(entry.second <= dist_term) + func(entry.first); + } + } + + //returns the smallest distance term larger than dist_term + __forceinline double ComputeDistanceTermNominalNextSmallest(double dist_term, size_t index, bool high_accuracy) + { + //TODO 17631: implement this, placeholder for now + return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); } //computes the inner term of the Minkowski norm summation __forceinline double ComputeDistanceTerm(EvaluableNodeImmediateValue other_value, EvaluableNodeImmediateValueType other_type, size_t index, bool high_accuracy) { - //TODO 17631: improve the logic and efficiency auto &feature_data = featureData[index]; //if nominal, don't need to compute absolute value of diff because just need to compare to 0 if(distEvaluator->IsFeatureNominal(index)) - return distEvaluator->ComputeDistanceTermNominal(feature_data.targetValue, other_value, - feature_data.targetValueType, other_type, index, high_accuracy); + return ComputeDistanceTermNominal(other_value, other_type, index, high_accuracy); - double diff = distEvaluator->ComputeDifference(feature_data.targetValue, other_value, - feature_data.targetValueType, other_type, distEvaluator->featureAttribs[index].featureType); + //TODO 17631: improve the logic and efficiency here down + double diff = distEvaluator->ComputeDifference(feature_data.targetValue.nodeValue, other_value, + feature_data.targetValue.nodeType, other_type, 
distEvaluator->featureAttribs[index].featureType); if(FastIsNaN(diff)) - return distEvaluator->LookupNullDistanceTerm(feature_data.targetValue, other_value, - feature_data.targetValueType, other_type, index, high_accuracy); + return distEvaluator->LookupNullDistanceTerm(feature_data.targetValue.nodeValue, other_value, + feature_data.targetValue.nodeType, other_type, index, high_accuracy); return distEvaluator->ComputeDistanceTermContinuousNonNullRegular(diff, index, high_accuracy); } @@ -1192,8 +1393,7 @@ class RepeatedGeneralizedDistanceEvaluator EffectiveFeatureDifferenceType effectiveFeatureType; //target that the distance will be computed to - EvaluableNodeImmediateValueType targetValueType; - EvaluableNodeImmediateValue targetValue; + EvaluableNodeImmediateValueWithType targetValue; //the distance term for EFDT_REMAINING_IDENTICAL_PRECOMPUTED double precomputedRemainingIdenticalDistanceTerm; @@ -1201,10 +1401,12 @@ class RepeatedGeneralizedDistanceEvaluator std::vector *internedNumberIndexToNumberValue; std::vector internedDistanceTerms; - //TODO 17631: genericize ComputeAndStoreInternedNumberValuesAndDistanceTerms to precompute these when appropriate //used to store distance terms for the respective targetValue for the sparse deviation matrix FastHashMap nominalStringDistanceTerms; FastHashMap nominalNumberDistanceTerms; + + //if true, then nominalStringDistanceTerms and nominalNumberDistanceTerms are high accuracy, otherwise approximate + bool precomputedNominalDistanceTermsHighAccuracy; }; //for each feature, precomputed distance terms for each interned value looked up by intern index diff --git a/src/Amalgam/HashMaps.h b/src/Amalgam/HashMaps.h index 3a7ace2f..2e1897de 100644 --- a/src/Amalgam/HashMaps.h +++ b/src/Amalgam/HashMaps.h @@ -44,3 +44,38 @@ template, typename E = std::eq using CompactHashMap = ska::bytell_hash_map; #endif + +//implements a map via a vector, where entries are looked up sequentially for brute force +//useful for very small hash 
maps (generally less than 30-40 entries) and for hash maps +//where entries are only found once +//note that, like other fast maps, iterators may be invalidated when the map is altered +template> +class SmallMap : public std::vector> +{ +public: + + using key_type = K; + using mapped_type = V; + using value_type = std::pair; + + //returns an iterator to deviation values that matches the nominal key + inline auto find(K key) + { + return std::find_if( + std::begin(*this), + std::end(*this), + [key](auto i) + { return E{}(i.first, key); } + ); + } + + //implement the map version of emplace, but allow for use of default constructor for the value + template + inline auto emplace(K key, Args&&... args) + { + if constexpr(sizeof...(Args) == 0) + return this->emplace_back(key, V{}); + else + return this->emplace_back(key, std::forward(args)...); + } +}; diff --git a/src/Amalgam/SeparableBoxFilterDataStore.cpp b/src/Amalgam/SeparableBoxFilterDataStore.cpp index 5c10e2c7..fc13dc50 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.cpp +++ b/src/Amalgam/SeparableBoxFilterDataStore.cpp @@ -331,12 +331,10 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance for(size_t query_feature_index = 0; query_feature_index < dist_eval.featureAttribs.size(); query_feature_index++) { size_t absolute_feature_index = dist_eval.featureAttribs[query_feature_index].featureIndex; - auto target_value = r_dist_eval.featureData[query_feature_index].targetValue; - auto target_value_type = r_dist_eval.featureData[query_feature_index].targetValueType; - auto &column_data = columnData[absolute_feature_index]; + auto &target_value = r_dist_eval.featureData[query_feature_index].targetValue; - if(target_value_type == ENIVT_NULL || (target_value_type == ENIVT_NUMBER && FastIsNaN(target_value.number)) ) + if(target_value.IsNullEquivalent()) { //add the appropriate unknown distance to each element double unknown_unknown_term = 
dist_eval.ComputeDistanceTermUnknownToUnknown(query_feature_index, high_accuracy); @@ -359,7 +357,7 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance continue; } - if(target_value_type == ENIVT_NUMBER) + if(target_value.nodeType == ENIVT_NUMBER) { //below we branch to optimize the number of distance terms that need to be computed to solve minimum distance problem //if there are fewer enabled_indices than the number of unique values for this feature, plus one for unknown values @@ -735,8 +733,7 @@ void SeparableBoxFilterDataStore::FindNearestEntities(GeneralizedDistanceEvaluat //if the target_value is a null/nan, unknown-unknown differences have already been accounted for //since they are partial matches auto &feature_data = r_dist_eval.featureData[i]; - if(feature_data.targetValueType == ENIVT_NULL - || (feature_data.targetValueType == ENIVT_NUMBER && FastIsNaN(feature_data.targetValue.number))) + if(feature_data.targetValue.IsNullEquivalent()) continue; if(dist_eval.ComputeDistanceTermKnownToUnknown(i, high_accuracy) > worst_candidate_distance) @@ -892,42 +889,38 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R size_t absolute_feature_index = feature_attribs.featureIndex; auto &column = columnData[absolute_feature_index]; auto effective_feature_type = feature_data.effectiveFeatureType; - EvaluableNodeImmediateValue value = feature_data.targetValue; - EvaluableNodeImmediateValueType value_type = feature_data.targetValueType; + auto &value = feature_data.targetValue; - bool value_is_null = EvaluableNodeImmediateValue::IsNullEquivalent(value_type, value); //need to accumulate values for nulls if the value is a null - if(value_is_null) + if(value.IsNullEquivalent()) { double unknown_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermUnknownToUnknown(query_feature_index, high_accuracy); - double known_unknown_term = 
r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy); //if it's either a symmetric nominal or continuous, or if sparse deviation matrix but no null value, // then there are only two values, unknown to known or known - if(r_dist_eval.distEvaluator->IsFeatureSymmetricNominal(query_feature_index) - || r_dist_eval.distEvaluator->IsFeatureContinuous(query_feature_index) - || (r_dist_eval.distEvaluator->IsFeatureNominal(query_feature_index) && + if(feature_attribs.IsFeatureSymmetricNominal() + || feature_attribs.IsFeatureContinuous() + || (feature_attribs.IsFeatureNominal() && !r_dist_eval.HasNominalSpecificKnownToUnknownDistanceTerm(query_feature_index))) { - //if all cases are equidistant, then don't compute anything - if(unknown_unknown_term == known_unknown_term) + double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy); + + //if all cases are equidistant and nonzero, then don't compute anything + if(unknown_unknown_term == known_unknown_term && unknown_unknown_term > 0) return unknown_unknown_term; //find nas values -- common to both branches below auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID); - if(unknown_unknown_term < known_unknown_term) + if(unknown_unknown_term < known_unknown_term || known_unknown_term == 0.0) { AccumulatePartialSums(column->nullIndices, query_feature_index, unknown_unknown_term); AccumulatePartialSums(column->nanIndices, query_feature_index, unknown_unknown_term); if(nas_iter != end(column->stringIdValueToIndices)) AccumulatePartialSums(*nas_iter->second, query_feature_index, unknown_unknown_term); - - //return the larger value that the remainder of the entities have - feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(known_unknown_term); - return known_unknown_term; } - else //known_unknown_term < unknown_unknown_term + + if(known_unknown_term < unknown_unknown_term || 
unknown_unknown_term == 0.0) { BitArrayIntegerSet &known_unknown_indices = parametersAndBuffers.potentialMatchesSet; known_unknown_indices = enabled_indices; @@ -936,12 +929,18 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R if(nas_iter != end(column->stringIdValueToIndices)) known_unknown_indices.erase(*nas_iter->second); AccumulatePartialSums(known_unknown_indices, query_feature_index, known_unknown_term); - - //return the larger value that the remainder of the entities have - feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(unknown_unknown_term); - return unknown_unknown_term; } + double larget_term_not_computed = std::max(known_unknown_term, unknown_unknown_term); + //if the largest term not computed is zero, then have computed everything, + // so set the remaining value to infinity to push this term off sorting of uncomputed distances + // and make search more efficient + if(larget_term_not_computed == 0.0) + larget_term_not_computed = std::numeric_limits::infinity(); + + //make computing the rest more efficient + feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(larget_term_not_computed); + return larget_term_not_computed; } else //nonsymmetric nominal -- need to compute { @@ -951,15 +950,35 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R if(nas_iter != end(column->stringIdValueToIndices)) AccumulatePartialSums(*nas_iter->second, query_feature_index, unknown_unknown_term); - //TODO 17631: accumulate distance terms smaller than these, which might not be these two, return the next largest - return std::min(known_unknown_term, unknown_unknown_term); + double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, high_accuracy); + //if the next closest match is larger, no need to compute any more values + if(nonmatch_dist_term > unknown_unknown_term) + return nonmatch_dist_term; + + //if there are terms smaller than 
unknown_unknown_term, then need to compute any other nominal values + r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsNumeric(unknown_unknown_term, query_feature_index, high_accuracy, + [this, &r_dist_eval, &column, query_feature_index, high_accuracy](double number_value) + { + AccumulatePartialSumsForNominalNumberValueIfExists(r_dist_eval, number_value, query_feature_index, *column, high_accuracy); + }); + + r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsString(unknown_unknown_term, query_feature_index, high_accuracy, + [this, &r_dist_eval, &column, query_feature_index, high_accuracy](StringInternPool::StringID sid) + { + AccumulatePartialSumsForNominalStringIdValueIfExists(r_dist_eval, sid, query_feature_index, *column, high_accuracy); + }); + + return r_dist_eval.ComputeDistanceTermNominalNextSmallest(unknown_unknown_term, query_feature_index, high_accuracy);; } } - //need to accumulate nulls if they're closer than an exact match - //but if made it here, then the value itself isn't null - bool accumulated_known_to_unknown = false; - if(r_dist_eval.distEvaluator->IsKnownToUnknownDistanceLessThanOrEqualToExactMatch(query_feature_index)) + bool is_feature_symmetric_nominal = r_dist_eval.distEvaluator->IsFeatureSymmetricNominal(query_feature_index); + + //if made it here, then the value itself is not a null, so only need to consider unknown to known distances + //need to accumulate nulls if it's a symmetric nominal feature, because then there's only one value left, + //or if the nulls are closer than an exact match + if(is_feature_symmetric_nominal + || r_dist_eval.distEvaluator->IsKnownToUnknownDistanceLessThanOrEqualToExactMatch(query_feature_index)) { double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy); AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term); @@ -967,36 +986,23 @@ double 
SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID); if(nas_iter != end(column->stringIdValueToIndices)) AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term); - accumulated_known_to_unknown = true; } //if nominal, only need to compute the exact match - if(r_dist_eval.distEvaluator->IsFeatureSymmetricNominal(query_feature_index)) + if(is_feature_symmetric_nominal) { - if(value_type == ENIVT_NUMBER) + if(value.nodeType == ENIVT_NUMBER) { - auto [value_index, exact_index_found] = column->FindExactIndexForValue(value.number); - if(exact_index_found) - { - double term = feature_attribs.nominalSymmetricMatchDistanceTerm.GetValue(high_accuracy); - AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term); - } + AccumulatePartialSumsForNominalNumberValueIfExists(r_dist_eval, value.nodeValue.number, query_feature_index, *column, high_accuracy); } - else if(value_type == ENIVT_STRING_ID) + else if(value.nodeType == ENIVT_STRING_ID) { - auto value_found = column->stringIdValueToIndices.find(value.stringID); - if(value_found != end(column->stringIdValueToIndices)) - { - double term = feature_attribs.nominalSymmetricMatchDistanceTerm.GetValue(high_accuracy); - AccumulatePartialSums(*(value_found->second), query_feature_index, term); - } + AccumulatePartialSumsForNominalStringIdValueIfExists(r_dist_eval, value.nodeValue.stringID, query_feature_index, *column, high_accuracy); } - else if(value_type == ENIVT_CODE) + else if(value.nodeType == ENIVT_CODE) { //compute partial sums for all code of matching size - size_t code_size = 1; - if(value_type == ENIVT_CODE) - code_size = EvaluableNode::GetDeepSize(value.code); + size_t code_size = EvaluableNode::GetDeepSize(value.nodeValue.code); auto value_found = column->valueCodeSizeToIndices.find(code_size); if(value_found != 
end(column->valueCodeSizeToIndices)) @@ -1007,43 +1013,70 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R } //else value_type == ENIVT_NULL and already covered above - //if known to unknown is less than a symmetric nominal nonmatch, then need to accumulate those too - if(!accumulated_known_to_unknown) - { - double known_unknown_term = r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy); - AccumulatePartialSums(column->nullIndices, query_feature_index, known_unknown_term); - AccumulatePartialSums(column->nanIndices, query_feature_index, known_unknown_term); - auto nas_iter = column->stringIdValueToIndices.find(string_intern_pool.NOT_A_STRING_ID); - if(nas_iter != end(column->stringIdValueToIndices)) - AccumulatePartialSums(*nas_iter->second, query_feature_index, known_unknown_term); - } - //return the value that the remainder of the entities have double nonmatch_dist_term = feature_attribs.nominalSymmetricNonMatchDistanceTerm.GetValue(high_accuracy); feature_data.SetPrecomputedRemainingIdenticalDistanceTerm(nonmatch_dist_term); return nonmatch_dist_term; } - else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_STRING) + else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_STRING) { - if(value_type == ENIVT_STRING_ID) - { - auto value_found = column->stringIdValueToIndices.find(value.stringID); - if(value_found != end(column->stringIdValueToIndices)) + //initialize to zero, because if don't find an exact match, but there are distance terms of + //0, then need to accumulate those later + double accumulated_term = 0.0; + if(value.nodeType == ENIVT_STRING_ID) + accumulated_term = AccumulatePartialSumsForNominalStringIdValueIfExists( + r_dist_eval, value.nodeValue.stringID, query_feature_index, *column, high_accuracy); + + double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, 
high_accuracy); + //if the next closest match is larger, no need to compute any more values + if(nonmatch_dist_term > accumulated_term) + return nonmatch_dist_term; + + //need to iterate over everything with the same distance term + r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsString(accumulated_term, query_feature_index, high_accuracy, + [this, &value, &r_dist_eval, &column, query_feature_index, high_accuracy](StringInternPool::StringID sid) { - double term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy); - AccumulatePartialSums(*(value_found->second), query_feature_index, term); - } - } + //the target value was already accumulated above, so skip it and accumulate every other visited value by its own id + if(sid != value.nodeValue.stringID) + AccumulatePartialSumsForNominalStringIdValueIfExists( + r_dist_eval, sid, query_feature_index, *column, high_accuracy); + }); - //the next closest string will have an edit distance of 1 - return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); + return r_dist_eval.ComputeDistanceTermNominalNextSmallest(nonmatch_dist_term, query_feature_index, high_accuracy); + } + else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC) + { + //initialize to zero, because if don't find an exact match, but there are distance terms of + //0, then need to accumulate those later + double accumulated_term = 0.0; + if(value.nodeType == ENIVT_NUMBER) + accumulated_term = AccumulatePartialSumsForNominalNumberValueIfExists( + r_dist_eval, value.nodeValue.number, query_feature_index, *column, high_accuracy); + + double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, high_accuracy); + //if the next closest match is larger, no need to compute any more values + if(nonmatch_dist_term > accumulated_term) + return nonmatch_dist_term; + + //need to iterate over everything 
with the same distance term + r_dist_eval.IterateOverNominalValuesWithLessOrEqualDistanceTermsNumeric(accumulated_term, query_feature_index, high_accuracy, + [this, &value, &r_dist_eval, &column, query_feature_index, high_accuracy](double number_value) + { + //the target value was already accumulated above, so skip it and accumulate every other visited value by its own number + if(!EqualIncludingNaN(number_value, value.nodeValue.number)) + AccumulatePartialSumsForNominalNumberValueIfExists( + r_dist_eval, number_value, query_feature_index, *column, high_accuracy); + }); + + return r_dist_eval.ComputeDistanceTermNominalNextSmallest(nonmatch_dist_term, query_feature_index, high_accuracy); } - else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE) + else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_CODE + || effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE) { //compute partial sums for all code of matching size size_t code_size = 1; - if(value_type == ENIVT_CODE) - code_size = EvaluableNode::GetDeepSize(value.code); + if(value.nodeType == ENIVT_CODE) + code_size = EvaluableNode::GetDeepSize(value.nodeValue.code); auto value_found = column->valueCodeSizeToIndices.find(code_size); if(value_found != end(column->valueCodeSizeToIndices)) @@ -1052,25 +1085,36 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R ComputeAndAccumulatePartialSums(r_dist_eval, entity_indices, query_feature_index, absolute_feature_index, high_accuracy); } - //next most similar code must be at least a distance of 1 edit away - return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); - } - else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_STRING) - { - //TODO 17631: finish this; need to account for all values that are smaller than the current - } - else if(effective_feature_type == 
RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC) - { - //TODO 17631: finish this; need to account for all values that are smaller than the current + if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_CODE) + { + double nonmatch_dist_term = r_dist_eval.ComputeDistanceTermNominalSmallestNonmatch(query_feature_index, high_accuracy); + return nonmatch_dist_term; + } + else //RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE + { + //next most similar code must be at least a distance of 1 edit away + return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); + } } - else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_CODE) + else if(effective_feature_type == RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_STRING) { - //TODO 17631: finish this; need to account for all values that are smaller than the current + if(value.nodeType == ENIVT_STRING_ID) + { + auto value_found = column->stringIdValueToIndices.find(value.nodeValue.stringID); + if(value_found != end(column->stringIdValueToIndices)) + { + double term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy); + AccumulatePartialSums(*(value_found->second), query_feature_index, term); + } + } + + //the next closest string will have an edit distance of 1 + return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); } //else feature_type == FDT_CONTINUOUS_NUMERIC or FDT_CONTINUOUS_UNIVERSALLY_NUMERIC //if not a number or no numbers available, then no size - if(value_type != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0) + if(value.nodeType != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0) return GetMaxDistanceTermForContinuousFeature(r_dist_eval, query_feature_index, absolute_feature_index, high_accuracy); bool 
cyclic_feature = r_dist_eval.distEvaluator->IsFeatureCyclic(query_feature_index); @@ -1078,14 +1122,14 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R if(cyclic_feature) cycle_length = feature_attribs.typeAttributes.maxCyclicDifference; - auto [value_index, exact_index_found] = column->FindClosestValueIndexForValue(value.number, cycle_length); + auto [value_index, exact_index_found] = column->FindClosestValueIndexForValue(value.nodeValue.number, cycle_length); double term = 0.0; if(exact_index_found) term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy); else term = r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonNullRegular( - value.number - column->sortedNumberValueEntries[value_index]->value.number, query_feature_index, high_accuracy); + value.nodeValue.number - column->sortedNumberValueEntries[value_index]->value.number, query_feature_index, high_accuracy); size_t num_entities_computed = AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term); @@ -1123,7 +1167,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R if(lower_value_index > 0) { next_lower_index = lower_value_index - 1; - lower_diff = std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number); + lower_diff = std::abs(value.nodeValue.number - column->sortedNumberValueEntries[next_lower_index]->value.number); compute_lower = true; } } @@ -1140,7 +1184,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R break; next_lower_index = next_index; - lower_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number), cycle_length); + lower_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference( + std::abs(value.nodeValue.number - 
column->sortedNumberValueEntries[next_lower_index]->value.number), + cycle_length); compute_lower = true; } @@ -1153,7 +1199,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R if(upper_value_index + 1 < num_unique_number_values) { next_upper_index = upper_value_index + 1; - upper_diff = std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number); + upper_diff = std::abs(value.nodeValue.number - column->sortedNumberValueEntries[next_upper_index]->value.number); compute_upper = true; } } @@ -1170,7 +1216,8 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R break; next_upper_index = next_index; - upper_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number), cycle_length); + upper_diff = GeneralizedDistanceEvaluator::ConstrainDifferenceToCyclicDifference( + std::abs(value.nodeValue.number - column->sortedNumberValueEntries[next_upper_index]->value.number), cycle_length); compute_upper = true; } @@ -1253,7 +1300,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(R void SeparableBoxFilterDataStore::PopulateInitialPartialSums(RepeatedGeneralizedDistanceEvaluator &r_dist_eval, size_t top_k, size_t radius_column_index, bool high_accuracy, BitArrayIntegerSet &enabled_indices, std::vector &min_unpopulated_distances, std::vector &min_distance_by_unpopulated_count) -{; +{ if(radius_column_index < columnData.size()) { auto &partial_sums = parametersAndBuffers.partialSums; @@ -1396,14 +1443,11 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener feature_data.internedNumberIndexToNumberValue = nullptr; feature_data.internedDistanceTerms.clear(); - if(feature_type == GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC - || feature_type == GeneralizedDistanceEvaluator::FDT_NOMINAL_STRING - || feature_type == 
GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE + if(feature_attribs.IsFeatureNominal() || feature_type == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_STRING || feature_type == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE) { - feature_data.targetValue = position_value; - feature_data.targetValueType = position_value_type; + feature_data.targetValue = EvaluableNodeImmediateValueWithType(position_value, position_value_type); if(feature_type == GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC) effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC; @@ -1415,6 +1459,9 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_STRING; else if(feature_type == GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE) effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_CODE; + + if(feature_attribs.IsFeatureNominal()) + r_dist_eval.ComputeAndStoreNominalDistanceTerms(query_feature_index); } else // feature_type is some form of continuous numeric { @@ -1422,8 +1469,7 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener double position_value_numeric = (position_value_type == ENIVT_NUMBER ? 
position_value.number : std::numeric_limits::quiet_NaN()); - feature_data.targetValue = position_value_numeric; - feature_data.targetValueType = ENIVT_NUMBER; + feature_data.targetValue = EvaluableNodeImmediateValueWithType(position_value_numeric); //set up effective_feature_type auto &column_data = columnData[feature_attribs.featureIndex]; @@ -1439,7 +1485,7 @@ void SeparableBoxFilterDataStore::PopulateTargetValueAndLabelIndex(RepeatedGener else effective_feature_type = RepeatedGeneralizedDistanceEvaluator::EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED; - r_dist_eval.ComputeAndStoreInternedNumberValuesAndDistanceTerms(position_value_numeric, query_feature_index, &column_data->internedNumberIndexToNumberValue); + r_dist_eval.ComputeAndStoreInternedNumberValuesAndDistanceTerms(query_feature_index, &column_data->internedNumberIndexToNumberValue); } else { diff --git a/src/Amalgam/SeparableBoxFilterDataStore.h b/src/Amalgam/SeparableBoxFilterDataStore.h index 1a5b70e7..d00ece43 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.h +++ b/src/Amalgam/SeparableBoxFilterDataStore.h @@ -652,6 +652,38 @@ class SeparableBoxFilterDataStore return AccumulatePartialSums(entity_indices.GetBaisContainer(), query_feature_index, term); } + //accumulates the partial sums for the specified value + // returns the distance term evaluated, or 0.0 if value was not found + inline double AccumulatePartialSumsForNominalNumberValueIfExists(RepeatedGeneralizedDistanceEvaluator &r_dist_eval, + double value, size_t query_feature_index, SBFDSColumnData &column, bool high_accuracy) + { + auto [value_index, exact_index_found] = column.FindExactIndexForValue(value); + if(exact_index_found) + { + double term = r_dist_eval.ComputeDistanceTermNominalNumeric(value, true, query_feature_index, high_accuracy); + AccumulatePartialSums(column.sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term); + return term; + } + + return 0.0; + } + + //accumulates the partial sums for the specified 
value + // returns the distance term evaluated, or 0.0 if value was not found + inline double AccumulatePartialSumsForNominalStringIdValueIfExists(RepeatedGeneralizedDistanceEvaluator &r_dist_eval, + StringInternPool::StringID value, size_t query_feature_index, SBFDSColumnData &column, bool high_accuracy) + { + auto value_found = column.stringIdValueToIndices.find(value); + if(value_found != end(column.stringIdValueToIndices)) + { + double term = r_dist_eval.ComputeDistanceTermNominalString(value, true, query_feature_index, high_accuracy); + AccumulatePartialSums(*(value_found->second), query_feature_index, term); + return term; + } + + return 0.0; + } + //search a projection width in terms of bucket count or number of collected entities //accumulates partial sums //searches until num_entities_to_populate are popluated or other heuristics have been reached @@ -725,7 +757,7 @@ class SeparableBoxFilterDataStore { auto &feature_attribs = r_dist_eval.distEvaluator->featureAttribs[query_feature_index]; return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicOneNonNullRegular( - feature_data.targetValue.number - GetValue(entity_index, feature_attribs.featureIndex).number, + feature_data.targetValue.nodeValue.number - GetValue(entity_index, feature_attribs.featureIndex).number, query_feature_index, high_accuracy); } @@ -742,7 +774,7 @@ class SeparableBoxFilterDataStore auto &column_data = columnData[feature_attribs.featureIndex]; if(column_data->numberIndices.contains(entity_index)) return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonCyclicOneNonNullRegular( - feature_data.targetValue.number - GetValue(entity_index, feature_attribs.featureIndex).number, + feature_data.targetValue.nodeValue.number - GetValue(entity_index, feature_attribs.featureIndex).number, query_feature_index, high_accuracy); else return r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy); @@ -754,7 +786,7 @@ class 
SeparableBoxFilterDataStore auto &column_data = columnData[feature_attribs.featureIndex]; if(column_data->numberIndices.contains(entity_index)) return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousOneNonNullRegular( - feature_data.targetValue.number - GetValue(entity_index, feature_attribs.featureIndex).number, + feature_data.targetValue.nodeValue.number - GetValue(entity_index, feature_attribs.featureIndex).number, query_feature_index, high_accuracy); else return r_dist_eval.distEvaluator->ComputeDistanceTermKnownToUnknown(query_feature_index, high_accuracy); @@ -777,11 +809,11 @@ class SeparableBoxFilterDataStore auto &column_data = columnData[feature_attribs.featureIndex]; if(column_data->stringIdIndices.contains(entity_index)) return r_dist_eval.ComputeDistanceTermNominalString( - GetValue(entity_index, feature_attribs.featureIndex).stringID, - query_feature_index, true, high_accuracy); + GetValue(entity_index, feature_attribs.featureIndex).stringID, true, + query_feature_index, high_accuracy); else - return r_dist_eval.ComputeDistanceTermNominalString(string_intern_pool.EMPTY_STRING_ID, - query_feature_index, false, high_accuracy); + return r_dist_eval.ComputeDistanceTermNominalString(string_intern_pool.EMPTY_STRING_ID, false, + query_feature_index, high_accuracy); } case RepeatedGeneralizedDistanceEvaluator::EFDT_NOMINAL_NUMERIC: @@ -790,11 +822,11 @@ class SeparableBoxFilterDataStore auto &column_data = columnData[feature_attribs.featureIndex]; if(column_data->numberIndices.contains(entity_index)) return r_dist_eval.ComputeDistanceTermNominalNumeric( - GetValue(entity_index, feature_attribs.featureIndex).number, - query_feature_index, true, high_accuracy); + GetValue(entity_index, feature_attribs.featureIndex).number, true, + query_feature_index, high_accuracy); else - return r_dist_eval.ComputeDistanceTermNominalNumeric(0.0, - query_feature_index, false, high_accuracy); + return r_dist_eval.ComputeDistanceTermNominalNumeric(0.0, false, + 
query_feature_index, high_accuracy); } default: diff --git a/src/Amalgam/evaluablenode/EvaluableNode.h b/src/Amalgam/evaluablenode/EvaluableNode.h index e0d82641..0d62dbed 100644 --- a/src/Amalgam/evaluablenode/EvaluableNode.h +++ b/src/Amalgam/evaluablenode/EvaluableNode.h @@ -1062,6 +1062,11 @@ class EvaluableNodeImmediateValueWithType : nodeType(ENIVT_NULL) { } + constexpr EvaluableNodeImmediateValueWithType(EvaluableNodeImmediateValue node_value, + EvaluableNodeImmediateValueType node_type) + : nodeType(node_type), nodeValue(node_value) + { } + __forceinline EvaluableNodeImmediateValueWithType(bool value) { nodeType = ENIVT_NUMBER; @@ -1254,6 +1259,12 @@ class EvaluableNodeImmediateValueWithType return EvaluableNodeImmediateValue::AreEqual(a.nodeType, a.nodeValue, b.nodeType, b.nodeValue); } + //returns true if it is a null or null equivalent; does not modify the value, so it is const and callable on const objects + constexpr bool IsNullEquivalent() const + { + return EvaluableNodeImmediateValue::IsNullEquivalent(nodeType, nodeValue); + } + EvaluableNodeImmediateValueType nodeType; EvaluableNodeImmediateValue nodeValue; };