Skip to content

Commit

Permalink
18683: Changes difference types in distance computations to be explic…
Browse files Browse the repository at this point in the history
…it and moves deviation parameters to be more consistent. MAJOR (#41)
  • Loading branch information
howsohazard authored Dec 19, 2023
1 parent 969e423 commit 2eeda97
Show file tree
Hide file tree
Showing 13 changed files with 361 additions and 379 deletions.
14 changes: 7 additions & 7 deletions docs/language.js

Large diffs are not rendered by default.

138 changes: 58 additions & 80 deletions src/Amalgam/GeneralizedDistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@ class GeneralizedDistance
// align at 32-bits in order to play nice with data alignment where it is used
enum FeatureDifferenceType : uint32_t
{
FDT_NOMINAL,
//nominal based on numeric equivalence
FDT_NOMINAL_NUMERIC,
//nominal based on string equivalence
FDT_NOMINAL_STRING,
//nominal based on code equivalence
FDT_NOMINAL_CODE,
//continuous without cycles, may contain nonnumeric data
FDT_CONTINUOUS_NUMERIC,
//like FDT_CONTINUOUS_NUMERIC, but has cycles
Expand Down Expand Up @@ -82,17 +87,17 @@ class GeneralizedDistance
if(compute_accurate)
{
feature_params.unknownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.unknownToUnknownDifference, index, true), true);
ComputeDistanceTermNonNull(feature_params.unknownToUnknownDistanceTerm.difference, index, true), true);
}

if(compute_approximate)
{
feature_params.unknownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.unknownToUnknownDifference, index, false), false);
ComputeDistanceTermNonNull(feature_params.unknownToUnknownDistanceTerm.difference, index, false), false);
}

//if the known-to-unknown difference is the same as the unknown-to-unknown difference, can copy the distance term instead of recomputing
if(feature_params.knownToUnknownDifference == feature_params.unknownToUnknownDifference)
if(feature_params.knownToUnknownDistanceTerm.difference == feature_params.unknownToUnknownDistanceTerm.difference)
{
feature_params.knownToUnknownDistanceTerm = feature_params.unknownToUnknownDistanceTerm;
}
Expand All @@ -102,13 +107,13 @@ class GeneralizedDistance
if(compute_accurate)
{
feature_params.knownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, index, true), true);
ComputeDistanceTermNonNull(feature_params.knownToUnknownDistanceTerm.difference, index, true), true);
}

if(compute_approximate)
{
feature_params.knownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, index, false), false);
ComputeDistanceTermNonNull(feature_params.knownToUnknownDistanceTerm.difference, index, false), false);
}
}

Expand Down Expand Up @@ -230,42 +235,57 @@ class GeneralizedDistance
return (highAccuracy || recomputeAccurateDistances);
}

//stores a pair of exact and approximate values
//stores the computed exact and approximate distance terms
// which can be referenced by getting the value at the corresponding offset
//the values default to 0.0 on initialization
class ExactApproxValuePair
class DistanceTerms
{
public:
//offset for each precision level
static constexpr int APPROX = 0;
static constexpr int EXACT = 1;

__forceinline ExactApproxValuePair(double initial_value = 0.0)
__forceinline DistanceTerms(double initial_value = 0.0)
{
exactApproxPair = { initial_value, initial_value };
distanceTerm = { initial_value, initial_value };
}

constexpr double GetValue(bool high_accuracy)
{
return exactApproxPair[high_accuracy ? EXACT : APPROX];
return distanceTerm[high_accuracy ? EXACT : APPROX];
}

constexpr double GetValue(int offset)
{
return exactApproxPair[offset];
return distanceTerm[offset];
}

__forceinline void SetValue(double value, int offset)
{
exactApproxPair[offset] = value;
distanceTerm[offset] = value;
}

__forceinline void SetValue(double value, bool high_accuracy)
{
exactApproxPair[high_accuracy ? EXACT : APPROX] = value;
distanceTerm[high_accuracy ? EXACT : APPROX] = value;
}

std::array<double, 2> exactApproxPair;
std::array<double, 2> distanceTerm;
};

//bundles the exact and approximate distance terms inherited from DistanceTerms
// with the difference value they are stored alongside
//a single initial value seeds both the inherited distance terms and the difference; it defaults to 0.0
class DistanceTermsWithDifference
	: public DistanceTerms
{
public:
	__forceinline DistanceTermsWithDifference(double initial_value = 0.0)
		: DistanceTerms(initial_value), difference(initial_value)
	{	}

	//difference value kept next to the distance terms
	double difference;
};

//update cached nominal deltas based on highAccuracy and recomputeAccurateDistances, caching what is needed given those flags
Expand All @@ -276,9 +296,10 @@ class GeneralizedDistance

for(size_t i = 0; i < featureParams.size(); i++)
{
auto &feat_params = featureParams[i];
if(feat_params.featureType == FDT_NOMINAL)
if(IsFeatureNominal(i))
{
auto &feat_params = featureParams[i];

//ensure if a feature has deviations they're not too small to underflow
if(DoesFeatureHaveDeviation(i))
{
Expand Down Expand Up @@ -315,7 +336,7 @@ class GeneralizedDistance
//returns true if the feature is nominal
__forceinline bool IsFeatureNominal(size_t feature_index)
{
return (featureParams[feature_index].featureType == FDT_NOMINAL);
return (featureParams[feature_index].featureType <= FDT_NOMINAL_CODE);
}

//returns true if the feature is cyclic
Expand All @@ -335,7 +356,7 @@ class GeneralizedDistance
__forceinline bool IsKnownToUnknownDistanceLessThanOrEqualToExactMatch(size_t feature_index)
{
auto &feature_params = featureParams[feature_index];
return (feature_params.knownToUnknownDifference <= feature_params.deviation);
return (feature_params.knownToUnknownDistanceTerm.difference <= feature_params.deviation);
}

//computes the exponentiation of d to 1/p
Expand All @@ -353,7 +374,7 @@ class GeneralizedDistance
return fastPowInverseP.FastPow(d);
}

//computes the exponentiation of d to p given precision being from ExactApproxValuePair
//computes the exponentiation of d to p given precision being from DistanceTerms
__forceinline double ExponentiateDifferenceTerm(double d, bool high_accuracy)
{
if(pValue == 1)
Expand All @@ -371,21 +392,16 @@ class GeneralizedDistance
//returns the maximum difference
inline double GetMaximumDifference(size_t index)
{
auto &feature_params = featureParams[index];
switch(feature_params.featureType)
{
case FDT_NOMINAL:
if(IsFeatureNominal(index))
return 1.0;

case FDT_CONTINUOUS_NUMERIC_CYCLIC:
return feature_params.typeAttributes.maxCyclicDifference / 2;
if(IsFeatureCyclic(index))
return featureParams[index].typeAttributes.maxCyclicDifference / 2;

default:
if(feature_params.weight > 0)
return std::numeric_limits<double>::infinity();
else
return -std::numeric_limits<double>::infinity();
}
if(featureParams[index].weight > 0)
return std::numeric_limits<double>::infinity();
else
return -std::numeric_limits<double>::infinity();
}

//computes the distance term for a nominal when two universally symmetric nominals are equal
Expand Down Expand Up @@ -665,21 +681,6 @@ class GeneralizedDistance
return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight;
}

//computes the inner term of the Minkowski norm summation for a single index for p non-zero and non-infinite
//where at least one of the values is non-null
// diff is the difference between the two values; a NaN diff yields the known-to-unknown distance term
// index is the feature index, and high_accuracy is forwarded to whichever distance term computation is used
__forceinline double ComputeDistanceTermRegularOneNonNull(double diff, size_t index, bool high_accuracy)
{
	if(FastIsNaN(diff))
		return ComputeDistanceTermKnownToUnknown(index, high_accuracy);

	if(!IsFeatureNominal(index))
		return ComputeDistanceTermNonNominalNonNullRegular(diff, index, high_accuracy);

	//nominal: only whether diff is zero matters, so no absolute value of diff is needed
	if(diff == 0.0)
		return ComputeDistanceTermNominalUniversallySymmetricExactMatchPrecomputed(index, high_accuracy);

	return ComputeDistanceTermNominalUniversallySymmetricNonMatchPrecomputed(index, high_accuracy);
}

//computes the inner term of the Minkowski norm summation for a single index for p non-zero and non-infinite
__forceinline double ComputeDistanceTermRegular(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b,
EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, size_t index, bool high_accuracy)
Expand All @@ -696,24 +697,6 @@ class GeneralizedDistance
return ComputeDistanceTermNonNominalNonNullRegular(diff, index, high_accuracy);
}

//computes the inner term of the Minkowski norm summation for a single index that isn't null,
//using only the difference itself (the feature measurement type is not taken into account)
// diff is the difference, index selects which feature's weight to apply,
// and high_accuracy chooses between the std::pow / exact path and the FastPow / approximate path
__forceinline double ComputeDistanceTermFromNonNullDifferenceOnly(double diff, size_t index, bool high_accuracy)
{
	const double feature_weight = featureParams[index].weight;

	//p of zero: the weight is used as the exponent of the difference
	if(pValue == 0.0)
	{
		if(!high_accuracy)
			return FastPow(diff, feature_weight);

		return std::pow(diff, feature_weight);
	}

	//p of positive or negative infinity: the difference is only scaled by the weight
	constexpr double positive_infinity = std::numeric_limits<double>::infinity();
	if(pValue == positive_infinity || pValue == -positive_infinity)
		return diff * feature_weight;

	//any other p: exponentiate the difference, then scale by the weight
	return ExponentiateDifferenceTerm(diff, high_accuracy) * feature_weight;
}

//returns the distance term for either one or two unknown values
__forceinline double LookupNullDistanceTerm(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b,
EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, size_t index, bool high_accuracy)
Expand Down Expand Up @@ -748,7 +731,8 @@ class GeneralizedDistance
if(a_type == ENIVT_NULL || b_type == ENIVT_NULL)
return std::numeric_limits<double>::quiet_NaN();

if(feature_type == FDT_NOMINAL)
if(feature_type == FDT_NOMINAL_NUMERIC
|| feature_type == FDT_NOMINAL_STRING || feature_type == FDT_NOMINAL_CODE)
{
if(a_type == ENIVT_NUMBER && b_type == ENIVT_NUMBER)
return (a.number == b.number ? 0.0 : 1.0);
Expand Down Expand Up @@ -868,9 +852,7 @@ class GeneralizedDistance
weight(1.0),
internedNumberIndexToNumberValue(nullptr), deviation(0.0),
unknownToUnknownDistanceTerm(std::numeric_limits<double>::quiet_NaN()),
knownToUnknownDistanceTerm(std::numeric_limits<double>::quiet_NaN()),
unknownToUnknownDifference(std::numeric_limits<double>::quiet_NaN()),
knownToUnknownDifference(std::numeric_limits<double>::quiet_NaN())
knownToUnknownDistanceTerm(std::numeric_limits<double>::quiet_NaN())
{
typeAttributes.maxCyclicDifference = std::numeric_limits<double>::quiet_NaN();
}
Expand All @@ -887,14 +869,14 @@ class GeneralizedDistance
double weight;

//distance terms for nominals
ExactApproxValuePair nominalMatchDistanceTerm;
ExactApproxValuePair nominalNonMatchDistanceTerm;
DistanceTerms nominalMatchDistanceTerm;
DistanceTerms nominalNonMatchDistanceTerm;

//pointer to a lookup table of indices to values if the feature is an interned number
std::vector<double> *internedNumberIndexToNumberValue;

//precomputed distance terms for each interned value looked up by intern index
std::vector<ExactApproxValuePair> internDistanceTerms;
std::vector<DistanceTerms> internDistanceTerms;

//type attributes dependent on featureType
union
Expand All @@ -911,16 +893,12 @@ class GeneralizedDistance
double deviation;

//distance term to use if both values being compared are unknown
ExactApproxValuePair unknownToUnknownDistanceTerm;
//the difference will be NaN if unknown
DistanceTermsWithDifference unknownToUnknownDistanceTerm;

//distance term to use if one value is known and the other is unknown
ExactApproxValuePair knownToUnknownDistanceTerm;

//difference between two values if both are unknown (NaN if unknown)
double unknownToUnknownDifference;

//difference between two values if one is known and the other is unknown (NaN if unknown)
double knownToUnknownDifference;
//the difference will be NaN if unknown
DistanceTermsWithDifference knownToUnknownDistanceTerm;
};

std::vector<FeatureParams> featureParams;
Expand Down
13 changes: 8 additions & 5 deletions src/Amalgam/Opcodes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,13 @@ void StringInternPool::InitializeStaticStrings()
EmplaceStaticString(ENBISI_accessing_entity, "accessing_entity");

//distance types
EmplaceStaticString(ENBISI_nominal, "nominal");
EmplaceStaticString(ENBISI_continuous, "continuous");
EmplaceStaticString(ENBISI_cyclic, "cyclic");
//string already an opcode
EmplaceStaticString(ENBISI_code, "code");
EmplaceStaticString(ENBISI_nominal_numeric, "nominal_numeric");
EmplaceStaticString(ENBISI_nominal_string, "nominal_string");
EmplaceStaticString(ENBISI_nominal_code, "nominal_code");
EmplaceStaticString(ENBISI_continuous_numeric, "continuous_numeric");
EmplaceStaticString(ENBISI_continuous_numeric_cyclic, "continuous_numeric_cyclic");
EmplaceStaticString(ENBISI_continuous_string, "continuous_string");
EmplaceStaticString(ENBISI_continuous_code, "continuous_code");

//distance parameter values
EmplaceStaticString(ENBISI_surprisal_to_prob, "surprisal_to_prob");
Expand All @@ -331,6 +333,7 @@ void StringInternPool::InitializeStaticStrings()
EmplaceStaticString(ENBISI_recompute_precise, "recompute_precise");

//format opcode types
EmplaceStaticString(ENBISI_code, "code");
EmplaceStaticString(ENBISI_Base16, "Base16");
EmplaceStaticString(ENBISI_Base64, "Base64");
EmplaceStaticString(ENBISI_int8, "int8");
Expand Down
13 changes: 8 additions & 5 deletions src/Amalgam/Opcodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,11 +535,13 @@ enum EvaluableNodeBuiltInStringId
ENBISI_accessing_entity,

//distance types
ENBISI_nominal,
ENBISI_continuous,
ENBISI_cyclic,
//ENBISI_string, //string is already covered
ENBISI_code,
ENBISI_nominal_numeric,
ENBISI_nominal_string,
ENBISI_nominal_code,
ENBISI_continuous_numeric,
ENBISI_continuous_numeric_cyclic,
ENBISI_continuous_string,
ENBISI_continuous_code,

//distance parameter values
ENBISI_surprisal_to_prob,
Expand All @@ -550,6 +552,7 @@ enum EvaluableNodeBuiltInStringId
ENBISI_recompute_precise,

//format opcode types
ENBISI_code,
ENBISI_Base16,
ENBISI_Base64,
ENBISI_int8,
Expand Down
4 changes: 3 additions & 1 deletion src/Amalgam/SBFDSColumnData.h
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,9 @@ class SBFDSColumnData
{
switch(feature_params.featureType)
{
case GeneralizedDistance::FDT_NOMINAL:
case GeneralizedDistance::FDT_NOMINAL_NUMERIC:
case GeneralizedDistance::FDT_NOMINAL_STRING:
case GeneralizedDistance::FDT_NOMINAL_CODE:
return 1.0;

case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC:
Expand Down
6 changes: 3 additions & 3 deletions src/Amalgam/SeparableBoxFilterDataStore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,8 +347,8 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance
for(auto &value_entry : column_data->sortedNumberValueEntries)
{
//get distance term that is applicable to each entity in this bucket
double distance_term = dist_params.ComputeDistanceTermRegularOneNonNull(
target_value.number - value_entry->value.number, query_feature_index, high_accuracy);
double distance_term = dist_params.ComputeDistanceTermRegular(
target_value.number, value_entry->value.number, ENIVT_NUMBER, ENIVT_NUMBER, query_feature_index, high_accuracy);

//for each bucket, add term to their sums
for(auto entity_index : value_entry->indicesWithValue)
Expand Down Expand Up @@ -990,7 +990,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G

//if not a number or no numbers available, then no size
if(value_type != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0)
return GetMaxDistanceTermFromValue(dist_params, value, value_type, query_feature_index, absolute_feature_index, high_accuracy);
return GetMaxDistanceTermFromContinuousValue(dist_params, value, value_type, query_feature_index, absolute_feature_index, high_accuracy);

bool cyclic_feature = dist_params.IsFeatureCyclic(query_feature_index);
double cycle_length = std::numeric_limits<double>::infinity();
Expand Down
Loading

0 comments on commit 2eeda97

Please sign in to comment.