From 2eeda97f47159ad6c2b72f15007d3576fe8def40 Mon Sep 17 00:00:00 2001 From: howsohazard <143410553+howsohazard@users.noreply.github.com> Date: Tue, 19 Dec 2023 14:17:11 -0500 Subject: [PATCH] 18683: Changes difference types in distance computations to be explicit and moves deviation parameters to be more consistent. MAJOR (#41) --- docs/language.js | 14 +- src/Amalgam/GeneralizedDistance.h | 138 ++++----- src/Amalgam/Opcodes.cpp | 13 +- src/Amalgam/Opcodes.h | 13 +- src/Amalgam/SBFDSColumnData.h | 4 +- src/Amalgam/SeparableBoxFilterDataStore.cpp | 6 +- src/Amalgam/SeparableBoxFilterDataStore.h | 29 +- src/Amalgam/amlg_code/full_test.amlg | 100 +++--- src/Amalgam/amlg_code/module_test.json | 2 +- src/Amalgam/amlg_code/module_test2.amlg | 2 +- src/Amalgam/entity/EntityQueryBuilder.h | 122 ++++---- .../interpreter/InterpreterOpcodesMath.cpp | 12 +- src/Amalgam/out.txt | 285 +++++++++--------- 13 files changed, 361 insertions(+), 379 deletions(-) diff --git a/docs/language.js b/docs/language.js index 837e4529..e7a9d38a 100644 --- a/docs/language.js +++ b/docs/language.js @@ -401,7 +401,7 @@ var data = [ { "parameter" : "generalized_distance list|assoc|number weights list|assoc distance_types list|assoc attributes list|assoc|number deviations number p_value list|assoc|* vector1 [list|assoc|* vector2] [list value_names]", "output" : "number", - "description" : "Computes the generalized norm between vector1 and vector2 (or an equivalent zero vector if unspecified) with parameter specified by the p_value (2 being Euclidian distance), using the numerical distance or edit distance as appropriate. The parameter value_names, if specified as a list of the names of the values, will transform via unzipping any assoc into a list for the respective parameter in the order of the value_names, or if a number will use the number repeatedly for every element. weights is a list of dimension weights to use for the query, each value mapping to its respective element in the vectors. 
If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal\" (checks for exact matches), \"continuous\" (takes the numeric difference between two values), \"cyclic\" (takes the numeric difference where the min and max wrap around), \"string\" (computes the edit distance between strings), and \"code\" (computes the edit distance between trees or graphs of code). \nFor attributes, the particular distance_types specifies what particular attributes are expected. In all cases, there is the option to specify a list of values, where the second last value is the difference to use when one of the values being compared is null, and the last value is the difference to use when both of the values are null. If the last value is omitted, it will use the second last value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations is a list of numbers that are used during distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. 
If any vector value is null or evaluates to nan, or any of the differences between vector1 and vector2 evaluate to null or nan, then it will compute a corresponding maximum distance value based on the properties of the feature.", + "description" : "Computes the generalized norm between vector1 and vector2 (or an equivalent zero vector if unspecified) with parameter specified by the p_value (2 being Euclidian distance), using the numerical distance or edit distance as appropriate. The parameter value_names, if specified as a list of the names of the values, will transform via unzipping any assoc into a list for the respective parameter in the order of the value_names, or if a number will use the number repeatedly for every element. weights is a list of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. 
Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. 
If any vector value is null or evaluates to nan, or any of the differences between vector1 and vector2 evaluate to null or nan, then it will compute a corresponding maximum distance value based on the properties of the feature.", "example" : "(print (generalized_distance 0.01 (null) (null) (list null (list 0 360)) (list 0.5 0.0) (list 0 2 3) (list 1 2 3)))\n(print (generalized_distance 0.01 (list 0.25 0.25 0.5) (null) (null) (null) (list 1 2 3) (list 0 2 3) ))\n(generalized_distance 1 (list 0.3333 0.3333 0.3333) (list 5 0) (null) (null) (list 1 2 3) (list 10 2 10) )" }, @@ -1599,7 +1599,7 @@ var data = [ "parameter" : "query_within_generalized_distance number max_distance list axis_labels list axis_values list|assoc|number weights list|assoc distance_types list|assoc attributes list|assoc|number deviations [number p_value] [string|number distance_transform] [string entity_weight_label_name] [number random_seed] [string radius_label] [string numerical_precision] [* output_sorted_list]", "output" : "query", "new value" : "new", - "description" : "When used as a query argument, selects entities which represent a point within a certain generalized norm to a given point. axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. 
Allowed values are \"nominal\" (checks for exact matches), \"continuous\" (takes the numeric difference between two values), \"cyclic\" (takes the numeric difference where the min and max wrap around), \"string\" (computes the edit distance between strings), and \"code\" (computes the edit distance between trees or graphs of code). For attributes, the particular distance_types specifies what is expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values available. For continuous, a null means unbounded where distance for a null will be computed automatically from the relevant data; a single number indicates the difference between a value and a null, a specified uncertainty. Cyclic requires either a single value or a list of two values; a list of two values indicates that the first value, the lower bound, will wrap around to the upper bound, the second value specified; if only a single number is provided instead of a list, then it will assume that number for the upper bound and 0 for the lower bound. For the string distance type, the value specified can be a number indicating the maximum possible string length, inferred if null is provided. For code, the value specified can be a number indicating the maximum number of nodes in the code (including labels), inferred if null is provided. Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. max_distance is the maximum distance allowed. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). 
The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. 
If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, selects entities which represent a point within a certain generalized norm to a given point. axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. For attributes, the particular distance_types specifies what is expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values available. For continuous, a null means unbounded where distance for a null will be computed automatically from the relevant data; a single number indicates the difference between a value and a null, a specified uncertainty. 
Cyclic requires either a single value or a list of two values; a list of two values indicates that the first value, the lower bound, will wrap around to the upper bound, the second value specified; if only a single number is provided instead of a list, then it will assume that number for the upper bound and 0 for the lower bound. For the string distance type, the value specified can be a number indicating the maximum possible string length, inferred if null is provided. For code, the value specified can be a number indicating the maximum number of nodes in the code (including labels), inferred if null is provided. Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. max_distance is the maximum distance allowed. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent. 
If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(contained_entities \"TestContainerExec\" (list\n (query_within_generalized_distance 60 (list \"x\" \"y\") (list 0.0 0.0) (null) (null) (null) (null) 0.5 1 (null) \"random seed 1234\" \"radius\")\n))" }, @@ -1607,7 +1607,7 @@ var data = [ "parameter" : "query_nearest_generalized_distance number entities_returned list axis_labels list axis_values list|assoc weights list|assoc distance_types list|assoc attributes list|assoc deviations [number p_value] [string|number distance_transform] [string entity_weight_label_name] [number random_seed] [string radius_label] [string numerical_precision] [* output_sorted_list]", "output" : "query", "new value" : "new", - "description" : "When used as a query argument, selects the closest entities which represent a point within a certain generalized norm to a given point. axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. 
p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal\" (checks for exact matches), \"continuous\" (takes the numeric difference between two values), \"cyclic\" (takes the numeric difference where the min and max wrap around), \"string\" (computes the edit distance between strings), and \"code\" (computes the edit distance between trees or graphs of code). \nFor attributes, the particular distance_types specifies what particular attributes are expected. In all cases, there is the option to specify a list of values, where the second last value is the difference to use when one of the values being compared is null, and the last value is the difference to use when both of the values are null. If the last value is omitted, it will use the second last value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. entities_returned specifies the number of entities to return. 
The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. 
If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, selects the closest entities which represent a point within a certain generalized norm to a given point. axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. 
Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. 
If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. 
If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(contained_entities \"TestContainerExec\" (list\n (query_nearest_generalized_distance (list \"x\" \"y\") (list 0.0 0.0) 0.5 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(contained_entities \"TestContainerExec\" (list\n (query_nearest_generalized_distance (list \"x\" \"y\") (list 0.0 0.0) 0.5 (null) (null) 10 \"radius\")\n))" }, @@ -1616,7 +1616,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal\" (checks for exact matches), \"continuous\" (takes the numeric difference between two values), \"cyclic\" (takes the numeric difference where the min and max wrap around), \"string\" (computes the edit distance between strings), and \"code\" (computes the edit distance between trees or graphs of code). \nFor attributes, the particular distance_types specifies what particular attributes are expected. 
In all cases, there is the option to specify a list of values, where the second last value is the difference to use when one of the values being compared is null, and the last value is the difference to use when both of the values are null. If the last value is omitted, it will use the second last value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. 
If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. 
If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list of strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. 
If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included. 
If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_convictions (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_convictions (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, @@ -1625,7 +1625,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case kl divergence for every case given in case_ids_to_compute as a group with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. 
The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal\" (checks for exact matches), \"continuous\" (takes the numeric difference between two values), \"cyclic\" (takes the numeric difference where the min and max wrap around), \"string\" (computes the edit distance between strings), and \"code\" (computes the edit distance between trees or graphs of code). \nFor attributes, the particular distance_types specifies what particular attributes are expected. In all cases, there is the option to specify a list of values, where the second last value is the difference to use when one of the values being compared is null, and the last value is the difference to use when both of the values are null. If the last value is omitted, it will use the second last value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). 
The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included.", + "description" : "When used as a query argument, computes the case kl divergence for every case given in case_ids_to_compute as a group with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. 
If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list of strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. 
If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. 
If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_group_kl_divergence (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_group_kl_divergence (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, @@ -1634,7 +1634,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal\" (checks for exact matches), \"continuous\" (takes the numeric difference between two values), \"cyclic\" (takes the numeric difference where the min and max wrap around), \"string\" (computes the edit distance between strings), and \"code\" (computes the edit distance between trees or graphs of code). \nFor attributes, the particular distance_types specifies what particular attributes are expected. 
In all cases, there is the option to specify a list of values, where the second last value is the difference to use when one of the values being compared is null, and the last value is the difference to use when both of the values are null. If the last value is omitted, it will use the second last value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. 
If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. 
Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). 
The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. 
If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_distance_contributions (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_distance_contributions (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, @@ -1643,7 +1643,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal\" (checks for exact matches), \"continuous\" (takes the numeric difference between two values), \"cyclic\" (takes the numeric difference where the min and max wrap around), \"string\" (computes the edit distance between strings), and \"code\" (computes the edit distance between trees or graphs of code). \nFor attributes, the particular distance_types specifies what particular attributes are expected. 
In all cases, there is the option to specify a list of values, where the second last value is the difference to use when one of the values being compared is null, and the last value is the difference to use when both of the values are null. If the last value is omitted, it will use the second last value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. 
If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. 
If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list of strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. 
If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included. 
If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_kl_divergences (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_kl_divergences (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, diff --git a/src/Amalgam/GeneralizedDistance.h b/src/Amalgam/GeneralizedDistance.h index 9a64d1ed..3e32eee0 100644 --- a/src/Amalgam/GeneralizedDistance.h +++ b/src/Amalgam/GeneralizedDistance.h @@ -22,7 +22,12 @@ class GeneralizedDistance // align at 32-bits in order to play nice with data alignment where it is used enum FeatureDifferenceType : uint32_t { - FDT_NOMINAL, + //nominal based on numeric equivalence + FDT_NOMINAL_NUMERIC, + //nominal based on string equivalence + FDT_NOMINAL_STRING, + //nominal based on code equivalence + FDT_NOMINAL_CODE, //continuous without cycles, may contain nonnumeric data FDT_CONTINUOUS_NUMERIC, //like FDT_CONTINUOUS_NUMERIC, but has cycles @@ -82,17 +87,17 @@ class GeneralizedDistance if(compute_accurate) { 
feature_params.unknownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.unknownToUnknownDifference, index, true), true); + ComputeDistanceTermNonNull(feature_params.unknownToUnknownDistanceTerm.difference, index, true), true); } if(compute_approximate) { feature_params.unknownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.unknownToUnknownDifference, index, false), false); + ComputeDistanceTermNonNull(feature_params.unknownToUnknownDistanceTerm.difference, index, false), false); } //if knownToUnknownDifference is same as unknownToUnknownDifference, can copy distance term instead of recomputing - if(feature_params.knownToUnknownDifference == feature_params.unknownToUnknownDifference) + if(feature_params.knownToUnknownDistanceTerm.difference == feature_params.unknownToUnknownDistanceTerm.difference) { feature_params.knownToUnknownDistanceTerm = feature_params.unknownToUnknownDistanceTerm; } @@ -102,13 +107,13 @@ class GeneralizedDistance if(compute_accurate) { feature_params.knownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, index, true), true); + ComputeDistanceTermNonNull(feature_params.knownToUnknownDistanceTerm.difference, index, true), true); } if(compute_approximate) { feature_params.knownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, index, false), false); + ComputeDistanceTermNonNull(feature_params.knownToUnknownDistanceTerm.difference, index, false), false); } } @@ -230,42 +235,57 @@ class GeneralizedDistance return (highAccuracy || recomputeAccurateDistances); } - //stores a pair of exact and approximate values + //stores the computed exact and approximate distance terms // which can be referenced by getting the value at the corresponding offset //the values default to 0.0 on initialization - class ExactApproxValuePair + class DistanceTerms { public: //offset for each precision level static 
constexpr int APPROX = 0; static constexpr int EXACT = 1; - __forceinline ExactApproxValuePair(double initial_value = 0.0) + __forceinline DistanceTerms(double initial_value = 0.0) { - exactApproxPair = { initial_value, initial_value }; + distanceTerm = { initial_value, initial_value }; } constexpr double GetValue(bool high_accuracy) { - return exactApproxPair[high_accuracy ? EXACT : APPROX]; + return distanceTerm[high_accuracy ? EXACT : APPROX]; } constexpr double GetValue(int offset) { - return exactApproxPair[offset]; + return distanceTerm[offset]; } __forceinline void SetValue(double value, int offset) { - exactApproxPair[offset] = value; + distanceTerm[offset] = value; } __forceinline void SetValue(double value, bool high_accuracy) { - exactApproxPair[high_accuracy ? EXACT : APPROX] = value; + distanceTerm[high_accuracy ? EXACT : APPROX] = value; } - std::array exactApproxPair; + std::array distanceTerm; + }; + + //stores the computed exact and approximate distance terms, as well as the difference + //the values default to 0.0 on initialization + class DistanceTermsWithDifference + : public DistanceTerms + { + public: + __forceinline DistanceTermsWithDifference(double initial_value = 0.0) + : DistanceTerms(initial_value) + { + difference = initial_value; + } + + double difference; }; //update cached nominal deltas based on highAccuracy and recomputeAccurateDistances, caching what is needed given those flags @@ -276,9 +296,10 @@ class GeneralizedDistance for(size_t i = 0; i < featureParams.size(); i++) { - auto &feat_params = featureParams[i]; - if(feat_params.featureType == FDT_NOMINAL) + if(IsFeatureNominal(i)) { + auto &feat_params = featureParams[i]; + //ensure if a feature has deviations they're not too small to underflow if(DoesFeatureHaveDeviation(i)) { @@ -315,7 +336,7 @@ class GeneralizedDistance //returns true if the feature is nominal __forceinline bool IsFeatureNominal(size_t feature_index) { - return (featureParams[feature_index].featureType == 
FDT_NOMINAL); + return (featureParams[feature_index].featureType <= FDT_NOMINAL_CODE); } //returns true if the feature is cyclic @@ -335,7 +356,7 @@ class GeneralizedDistance __forceinline bool IsKnownToUnknownDistanceLessThanOrEqualToExactMatch(size_t feature_index) { auto &feature_params = featureParams[feature_index]; - return (feature_params.knownToUnknownDifference <= feature_params.deviation); + return (feature_params.knownToUnknownDistanceTerm.difference <= feature_params.deviation); } //computes the exponentiation of d to 1/p @@ -353,7 +374,7 @@ class GeneralizedDistance return fastPowInverseP.FastPow(d); } - //computes the exponentiation of d to p given precision being from ExactApproxValuePair + //computes the exponentiation of d to p given precision being from DistanceTerms __forceinline double ExponentiateDifferenceTerm(double d, bool high_accuracy) { if(pValue == 1) @@ -371,21 +392,16 @@ class GeneralizedDistance //returns the maximum difference inline double GetMaximumDifference(size_t index) { - auto &feature_params = featureParams[index]; - switch(feature_params.featureType) - { - case FDT_NOMINAL: + if(IsFeatureNominal(index)) return 1.0; - case FDT_CONTINUOUS_NUMERIC_CYCLIC: - return feature_params.typeAttributes.maxCyclicDifference / 2; + if(IsFeatureCyclic(index)) + return featureParams[index].typeAttributes.maxCyclicDifference / 2; - default: - if(feature_params.weight > 0) - return std::numeric_limits::infinity(); - else - return -std::numeric_limits::infinity(); - } + if(featureParams[index].weight > 0) + return std::numeric_limits::infinity(); + else + return -std::numeric_limits::infinity(); } //computes the distance term for a nominal when two universally symmetric nominals are equal @@ -665,21 +681,6 @@ class GeneralizedDistance return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight; } - //computes the inner term of the Minkowski norm summation for a single index for p non-zero and non-infinite - //where at 
least one of the values is non-null - __forceinline double ComputeDistanceTermRegularOneNonNull(double diff, size_t index, bool high_accuracy) - { - if(FastIsNaN(diff)) - return ComputeDistanceTermKnownToUnknown(index, high_accuracy); - - //if nominal, don't need to compute absolute value of diff because just need to compare to 0 - if(IsFeatureNominal(index)) - return (diff == 0.0) ? ComputeDistanceTermNominalUniversallySymmetricExactMatchPrecomputed(index, high_accuracy) - : ComputeDistanceTermNominalUniversallySymmetricNonMatchPrecomputed(index, high_accuracy); - - return ComputeDistanceTermNonNominalNonNullRegular(diff, index, high_accuracy); - } - //computes the inner term of the Minkowski norm summation for a single index for p non-zero and non-infinite __forceinline double ComputeDistanceTermRegular(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b, EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, size_t index, bool high_accuracy) @@ -696,24 +697,6 @@ class GeneralizedDistance return ComputeDistanceTermNonNominalNonNullRegular(diff, index, high_accuracy); } - //computes the inner term of the Minkowski norm summation for a single index that isn't null, - //but computes only from the distance (does not take into account feature measurement type) - __forceinline double ComputeDistanceTermFromNonNullDifferenceOnly(double diff, size_t index, bool high_accuracy) - { - if(pValue == 0.0) - { - if(high_accuracy) - return std::pow(diff, featureParams[index].weight); - else - return FastPow(diff, featureParams[index].weight); - } - else if(pValue == std::numeric_limits::infinity() - || pValue == -std::numeric_limits::infinity()) - return diff * featureParams[index].weight; - else - return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight; - } - //returns the distance term for the either one or two unknown values __forceinline double LookupNullDistanceTerm(EvaluableNodeImmediateValue a, 
EvaluableNodeImmediateValue b, EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, size_t index, bool high_accuracy) @@ -748,7 +731,8 @@ class GeneralizedDistance if(a_type == ENIVT_NULL || b_type == ENIVT_NULL) return std::numeric_limits::quiet_NaN(); - if(feature_type == FDT_NOMINAL) + if(feature_type == FDT_NOMINAL_NUMERIC + || feature_type == FDT_NOMINAL_STRING || feature_type == FDT_NOMINAL_CODE) { if(a_type == ENIVT_NUMBER && b_type == ENIVT_NUMBER) return (a.number == b.number ? 0.0 : 1.0); @@ -868,9 +852,7 @@ class GeneralizedDistance weight(1.0), internedNumberIndexToNumberValue(nullptr), deviation(0.0), unknownToUnknownDistanceTerm(std::numeric_limits::quiet_NaN()), - knownToUnknownDistanceTerm(std::numeric_limits::quiet_NaN()), - unknownToUnknownDifference(std::numeric_limits::quiet_NaN()), - knownToUnknownDifference(std::numeric_limits::quiet_NaN()) + knownToUnknownDistanceTerm(std::numeric_limits::quiet_NaN()) { typeAttributes.maxCyclicDifference = std::numeric_limits::quiet_NaN(); } @@ -887,14 +869,14 @@ class GeneralizedDistance double weight; //distance terms for nominals - ExactApproxValuePair nominalMatchDistanceTerm; - ExactApproxValuePair nominalNonMatchDistanceTerm; + DistanceTerms nominalMatchDistanceTerm; + DistanceTerms nominalNonMatchDistanceTerm; //pointer to a lookup table of indices to values if the feature is an interned number std::vector *internedNumberIndexToNumberValue; //precomputed distance terms for each interned value looked up by intern index - std::vector internDistanceTerms; + std::vector internDistanceTerms; //type attributes dependent on featureType union @@ -911,16 +893,12 @@ class GeneralizedDistance double deviation; //distance term to use if both values being compared are unknown - ExactApproxValuePair unknownToUnknownDistanceTerm; + //the difference will be NaN if unknown + DistanceTermsWithDifference unknownToUnknownDistanceTerm; //distance term to use if one value is known and the other is 
unknown - ExactApproxValuePair knownToUnknownDistanceTerm; - - //difference between two values if both are unknown (NaN if unknown) - double unknownToUnknownDifference; - - //difference between two values if one is known and the other is unknown (NaN if unknown) - double knownToUnknownDifference; + //the difference will be NaN if unknown + DistanceTermsWithDifference knownToUnknownDistanceTerm; }; std::vector featureParams; diff --git a/src/Amalgam/Opcodes.cpp b/src/Amalgam/Opcodes.cpp index 42b01695..0243143a 100644 --- a/src/Amalgam/Opcodes.cpp +++ b/src/Amalgam/Opcodes.cpp @@ -316,11 +316,13 @@ void StringInternPool::InitializeStaticStrings() EmplaceStaticString(ENBISI_accessing_entity, "accessing_entity"); //distance types - EmplaceStaticString(ENBISI_nominal, "nominal"); - EmplaceStaticString(ENBISI_continuous, "continuous"); - EmplaceStaticString(ENBISI_cyclic, "cyclic"); - //string already an opcode - EmplaceStaticString(ENBISI_code, "code"); + EmplaceStaticString(ENBISI_nominal_numeric, "nominal_numeric"); + EmplaceStaticString(ENBISI_nominal_string, "nominal_string"); + EmplaceStaticString(ENBISI_nominal_code, "nominal_code"); + EmplaceStaticString(ENBISI_continuous_numeric, "continuous_numeric"); + EmplaceStaticString(ENBISI_continuous_numeric_cyclic, "continuous_numeric_cyclic"); + EmplaceStaticString(ENBISI_continuous_string, "continuous_string"); + EmplaceStaticString(ENBISI_continuous_code, "continuous_code"); //distance parameter values EmplaceStaticString(ENBISI_surprisal_to_prob, "surprisal_to_prob"); @@ -331,6 +333,7 @@ void StringInternPool::InitializeStaticStrings() EmplaceStaticString(ENBISI_recompute_precise, "recompute_precise"); //format opcode types + EmplaceStaticString(ENBISI_code, "code"); EmplaceStaticString(ENBISI_Base16, "Base16"); EmplaceStaticString(ENBISI_Base64, "Base64"); EmplaceStaticString(ENBISI_int8, "int8"); diff --git a/src/Amalgam/Opcodes.h b/src/Amalgam/Opcodes.h index e89b2f09..dc1c1c24 100644 --- a/src/Amalgam/Opcodes.h 
+++ b/src/Amalgam/Opcodes.h @@ -535,11 +535,13 @@ enum EvaluableNodeBuiltInStringId ENBISI_accessing_entity, //distance types - ENBISI_nominal, - ENBISI_continuous, - ENBISI_cyclic, - //ENBISI_string, //string is already covered - ENBISI_code, + ENBISI_nominal_numeric, + ENBISI_nominal_string, + ENBISI_nominal_code, + ENBISI_continuous_numeric, + ENBISI_continuous_numeric_cyclic, + ENBISI_continuous_string, + ENBISI_continuous_code, //distance parameter values ENBISI_surprisal_to_prob, @@ -550,6 +552,7 @@ enum EvaluableNodeBuiltInStringId ENBISI_recompute_precise, //format opcode types + ENBISI_code, ENBISI_Base16, ENBISI_Base64, ENBISI_int8, diff --git a/src/Amalgam/SBFDSColumnData.h b/src/Amalgam/SBFDSColumnData.h index f88a9adf..21f0fec9 100644 --- a/src/Amalgam/SBFDSColumnData.h +++ b/src/Amalgam/SBFDSColumnData.h @@ -684,7 +684,9 @@ class SBFDSColumnData { switch(feature_params.featureType) { - case GeneralizedDistance::FDT_NOMINAL: + case GeneralizedDistance::FDT_NOMINAL_NUMERIC: + case GeneralizedDistance::FDT_NOMINAL_STRING: + case GeneralizedDistance::FDT_NOMINAL_CODE: return 1.0; case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC: diff --git a/src/Amalgam/SeparableBoxFilterDataStore.cpp b/src/Amalgam/SeparableBoxFilterDataStore.cpp index ec33e1fe..948ea641 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.cpp +++ b/src/Amalgam/SeparableBoxFilterDataStore.cpp @@ -347,8 +347,8 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance for(auto &value_entry : column_data->sortedNumberValueEntries) { //get distance term that is applicable to each entity in this bucket - double distance_term = dist_params.ComputeDistanceTermRegularOneNonNull( - target_value.number - value_entry->value.number, query_feature_index, high_accuracy); + double distance_term = dist_params.ComputeDistanceTermRegular( + target_value.number, value_entry->value.number, ENIVT_NUMBER, ENIVT_NUMBER, query_feature_index, high_accuracy); //for each bucket, add term 
to their sums for(auto entity_index : value_entry->indicesWithValue) @@ -990,7 +990,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G //if not a number or no numbers available, then no size if(value_type != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0) - return GetMaxDistanceTermFromValue(dist_params, value, value_type, query_feature_index, absolute_feature_index, high_accuracy); + return GetMaxDistanceTermFromContinuousValue(dist_params, value, value_type, query_feature_index, absolute_feature_index, high_accuracy); bool cyclic_feature = dist_params.IsFeatureCyclic(query_feature_index); double cycle_length = std::numeric_limits::infinity(); diff --git a/src/Amalgam/SeparableBoxFilterDataStore.h b/src/Amalgam/SeparableBoxFilterDataStore.h index c8f7b9bb..4dd38726 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.h +++ b/src/Amalgam/SeparableBoxFilterDataStore.h @@ -71,16 +71,13 @@ class SeparableBoxFilterDataStore numEntities = 0; } - //Gets the maximum possible distance term from value + //Gets the maximum possible distance term from value assuming the feature is continuous // absolute_feature_index is the offset to access the feature relative to the entire data store // query_feature_index is relative to dist_params - inline double GetMaxDistanceTermFromValue(GeneralizedDistance &dist_params, + inline double GetMaxDistanceTermFromContinuousValue(GeneralizedDistance &dist_params, EvaluableNodeImmediateValue &value, EvaluableNodeImmediateValueType value_type, size_t query_feature_index, size_t absolute_feature_index, bool high_accuracy) { - if(dist_params.IsFeatureNominal(query_feature_index)) - return dist_params.ComputeDistanceTermNominalUniversallySymmetricNonMatchPrecomputed(query_feature_index, high_accuracy); - double max_diff = columnData[absolute_feature_index]->GetMaxDifferenceTermFromValue( dist_params.featureParams[query_feature_index], value_type, value); return 
dist_params.ComputeDistanceTermNonNominalNonNullRegular(max_diff, query_feature_index, high_accuracy); @@ -870,21 +867,25 @@ class SeparableBoxFilterDataStore auto &feature_type = dist_params.featureParams[query_feature_index].featureType; auto &effective_feature_type = dist_params.featureParams[query_feature_index].effectiveFeatureType; - if(feature_type == GeneralizedDistance::FDT_NOMINAL + if(feature_type == GeneralizedDistance::FDT_NOMINAL_NUMERIC + || feature_type == GeneralizedDistance::FDT_NOMINAL_STRING + || feature_type == GeneralizedDistance::FDT_NOMINAL_CODE || feature_type == GeneralizedDistance::FDT_CONTINUOUS_STRING || feature_type == GeneralizedDistance::FDT_CONTINUOUS_CODE) { target_values.push_back(position_value); target_value_types.push_back(position_value_type); - if(feature_type == GeneralizedDistance::FDT_NOMINAL) + if(feature_type == GeneralizedDistance::FDT_NOMINAL_NUMERIC + || feature_type == GeneralizedDistance::FDT_NOMINAL_STRING + || feature_type == GeneralizedDistance::FDT_NOMINAL_CODE) effective_feature_type = GeneralizedDistance::EFDT_NOMINAL_UNIVERSALLY_SYMMETRIC_PRECOMPUTED; else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_STRING) effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_STRING; else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_CODE) effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_CODE; } - else // feature_type is some form of numeric + else // feature_type is some form of continuous numeric { //looking for continuous; if not a number, so just put as nan double position_value_numeric = (position_value_type == ENIVT_NUMBER ? 
position_value.number : std::numeric_limits::quiet_NaN()); @@ -965,16 +966,16 @@ class SeparableBoxFilterDataStore //if either known or unknown to unknown is missing, need to compute difference // and store it where it is needed double unknown_distance_term = 0.0; - if(FastIsNaN(feature_params.knownToUnknownDifference) - || FastIsNaN(feature_params.unknownToUnknownDifference)) + if(FastIsNaN(feature_params.knownToUnknownDistanceTerm.difference) + || FastIsNaN(feature_params.unknownToUnknownDistanceTerm.difference)) { unknown_distance_term = columnData[column_index]->GetMaxDifferenceTermFromValue( feature_params, target_value_types[i], target_values[i]); - if(FastIsNaN(feature_params.knownToUnknownDifference)) - feature_params.knownToUnknownDifference = unknown_distance_term; - if(FastIsNaN(feature_params.unknownToUnknownDifference)) - feature_params.unknownToUnknownDifference = unknown_distance_term; + if(FastIsNaN(feature_params.knownToUnknownDistanceTerm.difference)) + feature_params.knownToUnknownDistanceTerm.difference = unknown_distance_term; + if(FastIsNaN(feature_params.unknownToUnknownDistanceTerm.difference)) + feature_params.unknownToUnknownDistanceTerm.difference = unknown_distance_term; } dist_params.ComputeAndStoreUncertaintyDistanceTerms(i, diff --git a/src/Amalgam/amlg_code/full_test.amlg b/src/Amalgam/amlg_code/full_test.amlg index ea21c682..2179c4bf 100644 --- a/src/Amalgam/amlg_code/full_test.amlg +++ b/src/Amalgam/amlg_code/full_test.amlg @@ -379,21 +379,21 @@ (print "14 " (generalized_distance (list 1 1) (null) (null) (null) 0 (list .nan 4) ) "\n") (print "15 " (generalized_distance (list 0.5 0.5) (null) (null) (null) 2 (list .nan 4) ) "\n") (print "16 " (generalized_distance (list 0.5 0.5) (null) (null) (null) 0 (list .nan 4) ) "\n") - (print "17 " (generalized_distance (null) (list "nominal") (list 1) (null) 1 (list 1 2 3) (list 10 2 4) ) "\n") - (print "18 " (generalized_distance (null) (list "nominal") (list 1) (null) 1 (list 1 2 3) (list 10 
2 10) ) "\n") - (print "19 " (generalized_distance (null) (list "nominal") (list 1) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") - (print "20 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal") (list 1) (null) 1 (list 1 2 3) (list 10 2 4) ) "\n") - (print "21 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal") (list 1) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") - (print "22 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal") (list 1) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") - (print "23 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal" "cyclic" "cyclic") (list 1 360 12) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") - (print "24 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal") (list 1) (list 0.25 180 -12) 1 (list 1 2 3) (list 10 2 10) ) "\n") - (print "25 " (generalized_distance (list 1 0 1 ) (list "continuous" "nominal" "nominal") (list (null) 5 5) (list .1 .1 .1 ) 2 (list 4 4 (null) ) (list 2 (null) (null) ) ) "\n" ) - (print "26 " (generalized_distance (list 1 0 1 ) (list "continuous" "nominal" "nominal") (list (null) 5 5) (null) 2 (list 4 4 (null) ) (list 2 (null) (null) ) ) "\n") - (print "27 " (generalized_distance (list 1 0 1 1 ) (list "continuous" "nominal" "nominal") (list (null) 5 5) (list .1 .1 .1 .1) 2 (list 4 4 (null) 4) (list 2 (null) (null) 2) ) "\n" ) - (print "28 " (generalized_distance (list 1 0 1 1 ) (list "continuous" "nominal" "nominal") (list (null) 5 5) (null) 2 (list 4 4 (null) 4) (list 2 (null) (null) 2) ) "\n" ) + (print "17 " (generalized_distance (null) (list "nominal_numeric") (list 1) (null) 1 (list 1 2 3) (list 10 2 4) ) "\n") + (print "18 " (generalized_distance (null) (list "nominal_numeric") (list 1) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") + (print "19 " (generalized_distance (null) (list "nominal_numeric") (list 1) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") + (print "20 " (generalized_distance (list 0.3333 0.3333 0.3333) (list 
"nominal_numeric") (list 1) (null) 1 (list 1 2 3) (list 10 2 4) ) "\n") + (print "21 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal_numeric") (list 1) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") + (print "22 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal_numeric") (list 1) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") + (print "23 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal_numeric" "continuous_numeric_cyclic" "continuous_numeric_cyclic") (list 1 360 12) (null) 1 (list 1 2 3) (list 10 2 10) ) "\n") + (print "24 " (generalized_distance (list 0.3333 0.3333 0.3333) (list "nominal_numeric") (list 1) (list 0.25 180 -12) 1 (list 1 2 3) (list 10 2 10) ) "\n") + (print "25 " (generalized_distance (list 1 0 1 ) (list "continuous_numeric" "nominal_numeric" "nominal_numeric") (list (null) 5 5) (list .1 .1 .1 ) 2 (list 4 4 (null) ) (list 2 (null) (null) ) ) "\n" ) + (print "26 " (generalized_distance (list 1 0 1 ) (list "continuous_numeric" "nominal_numeric" "nominal_numeric") (list (null) 5 5) (null) 2 (list 4 4 (null) ) (list 2 (null) (null) ) ) "\n") + (print "27 " (generalized_distance (list 1 0 1 1 ) (list "continuous_numeric" "nominal_numeric" "nominal_numeric") (list (null) 5 5) (list .1 .1 .1 .1) 2 (list 4 4 (null) 4) (list 2 (null) (null) 2) ) "\n" ) + (print "28 " (generalized_distance (list 1 0 1 1 ) (list "continuous_numeric" "nominal_numeric" "nominal_numeric") (list (null) 5 5) (null) 2 (list 4 4 (null) 4) (list 2 (null) (null) 2) ) "\n" ) (print "29 " (generalized_distance (list 1 0 1 1 1) (null) (null) (null) 1 (list 4 4 4 4 4) (list 2 (null) 2 2 2) ) "\n" ) (print "30 " (generalized_distance (assoc x 1 y 1 z 1) - (assoc y "continuous" x "nominal" z "continuous") + (assoc y "continuous_numeric" x "nominal_numeric" z "continuous_numeric") (assoc z 5) (null) 1 @@ -402,18 +402,20 @@ (null) (list "x" "y" "z") ) "\n" ) ;should print 4 - (print "31 " (generalized_distance (list 1 1 1) (list "continuous" 
"nominal" "nominal") (list (null) 5 5) (null) 1 (list 4 4 (null)) (list 2 2 (null))) "\n") + (print "31 " (generalized_distance (list 1 1 1) (list "continuous_numeric" "nominal_numeric" "nominal_numeric") (list (null) 5 5) (null) 1 (list 4 4 (null)) (list 2 2 (null))) "\n") ;should print 4 - (print "32 " (generalized_distance (list 1 1 1 1) (list "continuous" "nominal" "nominal" "continuous") (list 2 5 5 2) (null) 0 (list 4 4 4 4) (list 2 2 2 (null))) "\n") + (print "32 " (generalized_distance (list 1 1 1 1) (list "continuous_numeric" "nominal_numeric" "nominal_numeric" "continuous_numeric") + (list (null) 5 5 (null)) (list (list 0 2) (null) (null) (list 0 2)) 0 (list 4 4 4 4) (list 2 2 2 (null))) "\n") ;should print 4 - (print "33 " (generalized_distance (list 1 1 1 1) (list "continuous" "nominal" "nominal" "continuous") (list 1 5 5 1) (null) 1 (list 4 "s" "s" 4) (list 2 "s" 2 (null))) "\n") + (print "33 " (generalized_distance (list 1 1 1 1) (list "continuous_numeric" "nominal_string" "nominal_string" "continuous_numeric") + (list (null) 5 5 (null)) (list (list 0 1) (null) (null) (list 0 1)) 1 (list 4 "s" "s" 4) (list 2 "s" 2 (null))) "\n") ;should print 2 - (print "34 " (generalized_distance (list 1 1) (list "code" "nominal") (list 0 5) (null) 1 (list (list 1 2 3 4 5) "s") (list (list 1 2 3) "s") ) "\n") + (print "34 " (generalized_distance (list 1 1) (list "continuous_code" "nominal_string") (list 0 5) (null) 1 (list (list 1 2 3 4 5) "s") (list (list 1 2 3) "s") ) "\n") ;should print 3ish - (print "35 " (generalized_distance (list 1 1) (list "code" "nominal") (list 0 5) (null) 1 (list (list 1.5 2 3 4 5) "s") (list (list 1 2 3) "s") ) "\n") + (print "35 " (generalized_distance (list 1 1) (list "continuous_code" "nominal_string") (list 0 5) (null) 1 (list (list 1.5 2 3 4 5) "s") (list (list 1 2 3) "s") ) "\n") (print "--entropy--\n") (print (entropy (list 0.5 0.5)) "\n") @@ -2566,31 +2568,31 @@ (print "1: " (contained_entities "TestContainerSimilarCode" (list - 
(query_nearest_generalized_distance 2 (list "y" ) (list 1) (list 1) (list "code") (null) (null) 1 1) + (query_nearest_generalized_distance 2 (list "y" ) (list 1) (list 1) (list "continuous_code") (null) (null) 1 1) )) ) (print "2: " (contained_entities "TestContainerSimilarCode" (list - (query_nearest_generalized_distance 4 (list "y" ) (list (list 1 2)) (list 1) (list "code") (null) (null) 1 1) + (query_nearest_generalized_distance 4 (list "y" ) (list (list 1 2)) (list 1) (list "continuous_code") (null) (null) 1 1) )) ) (print "3: " (contained_entities "TestContainerSimilarCode" (list - (query_nearest_generalized_distance 4 (list "y" ) (list (list (list 1 2))) (list 1) (list "code") (null) (null) 1 1) + (query_nearest_generalized_distance 4 (list "y" ) (list (list (list 1 2))) (list 1) (list "continuous_code") (null) (null) 1 1) )) ) (print "4: " (contained_entities "TestContainerSimilarCode" (list - (query_nearest_generalized_distance 4 (list "x" "y" ) (list 1 (list (list 1 2))) (list 1 1) (list "continuous" "code") (null) (null) 1 1) + (query_nearest_generalized_distance 4 (list "x" "y" ) (list 1 (list (list 1 2))) (list 1 1) (list "continuous_numeric" "continuous_code") (null) (null) 1 1) )) ) (print "5: " (contained_entities "TestContainerSimilarCode" (list - (query_nearest_generalized_distance 3 (list "s" ) (list "s0") (list 1) (list "string") (null) (null) 1 1) + (query_nearest_generalized_distance 3 (list "s" ) (list "s0") (list 1) (list "continuous_string") (null) (null) 1 1) )) ) @@ -2803,17 +2805,17 @@ (print (contained_entities "TestContainerExec" (list (query_exists "x") - (query_nearest_generalized_distance 2 (list "x" "y") (list 0.0 0.0) (list 2 1) (list "nominal" "cyclic") (list 1 360) (null) 0.01 1 (null) "random seed 1234" "radius") + (query_nearest_generalized_distance 2 (list "x" "y") (list 0.0 0.0) (list 2 1) (list "nominal_numeric" "continuous_numeric_cyclic") (list 1 360) (null) 0.01 1 (null) "random seed 1234" "radius") ))) (print 
"assoc-based: " (contained_entities "TestContainerExec" (list (query_exists "x") - (query_nearest_generalized_distance 2 (list "x" "y") (list 0.0 0.0) (associate "x" 2 "y" 1) (associate "x" "nominal" "y" "cyclic") (list 1 360) (null) 0.01 1 (null) "random seed 1234" "radius") + (query_nearest_generalized_distance 2 (list "x" "y") (list 0.0 0.0) (associate "x" 2 "y" 1) (associate "x" "nominal_numeric" "y" "continuous_numeric_cyclic") (list 1 360) (null) 0.01 1 (null) "random seed 1234" "radius") ))) (print (contained_entities "TestContainerExec" (list (query_exists "x") - (query_nearest_generalized_distance 2 (list "x" "y") (list 0.0 0.0) (list 2 1) (list "nominal" "continuous") (list 1) (list 0.1 -0.2) 0.01 1 (null) "random seed 1234" "radius") + (query_nearest_generalized_distance 2 (list "x" "y") (list 0.0 0.0) (list 2 1) (list "nominal_numeric" "continuous_numeric") (list 1) (list 0.1 -0.2) 0.01 1 (null) "random seed 1234" "radius") ))) (print "--contained_entities caching and permissions--\n") @@ -3141,7 +3143,7 @@ (print (compute_on_contained_entities (list (query_exists "is_cyclic") - (query_nearest_generalized_distance 2 (list "rank_c" "degree_c") (list 4 0) (null) (list "continuous" "cyclic") (list (null) 360) (null) 1 1 (null) "random seed 1234" (null)) + (query_nearest_generalized_distance 2 (list "rank_c" "degree_c") (list 4 0) (null) (list "continuous_numeric" "continuous_numeric_cyclic") (list (null) 360) (null) 1 1 (null) "random seed 1234" (null)) ))) (create_entities "CyclicTestEntity" (null)) @@ -3175,7 +3177,7 @@ (list "deg") (list 350) (null) ; weights - (list "cyclic") ; types + (list "continuous_numeric_cyclic") ; types (list 360) ; attributes (null); deviations 1 ; p @@ -3481,9 +3483,9 @@ (list "A" "B") ;labels (list 9 2) ;values (null) ;weights - (list "continuous" "continuous") ;distance types - (list (list 5 6) (list 5 5)) ;attributes - (null) + (list "continuous_numeric" "continuous_numeric") ;distance types + (null) ;attributes + (list 
(list 0 5 6) (list 0 5 5)) ;deviations 2 ;p-value ) )) @@ -3499,9 +3501,9 @@ (list "A" "B") ;labels (list (null) (null)) ;values (null) ;weights - (list "continuous" "continuous") ;distance types - (list (list 1 0) (list 1 0)) ;attributes - (null) + (list "continuous_numeric" "continuous_numeric") ;distance types + (null) ;attributes + (list (list 0 1 0) (list 0 1 0)) ;deviations 2 ;p-value ) )) @@ -3512,9 +3514,9 @@ ;weights (list 2.5 10 5 0.033333333 3.333333333 20) ;types - (list "nominal" "nominal" "nominal" "continuous" "nominal" "continuous") + (list "nominal_numeric" "nominal_numeric" "nominal_numeric" "continuous_numeric" "nominal_numeric" "continuous_numeric") ;attributes - (list 3 100 7 0 10 0) + (list 3 100 7 (null) 10 (null)) ;deviations (list 0.4 0.1 0.2 30 0.3 0.05) ;p @@ -3529,9 +3531,9 @@ ;weights (list 2.5 10 5 0.033333333 3.333333333 20) ;types - (list "nominal" "nominal" "nominal" "continuous" "nominal" "continuous") + (list "nominal_numeric" "nominal_numeric" "nominal_numeric" "continuous_numeric" "nominal_numeric" "continuous_numeric") ;attributes - (list 3 100 7 0 10 0) + (list 3 100 7 (null) 10 (null)) ;deviations (list 0.4 0.1 0.2 30 0.3 0.05) ;p @@ -3548,9 +3550,9 @@ ;weights (list 2.5 10 5 0.033333333 3.333333333 20) ;types - (list "nominal" "nominal" "nominal" "continuous" "nominal" "continuous") + (list "nominal_numeric" "nominal_numeric" "nominal_numeric" "continuous_numeric" "nominal_numeric" "continuous_numeric") ;attributes - (list 3 100 7 0 10 0) + (list 3 100 7 (null) 10 (null)) ;deviations (list 0.4 0.1 0.2 30 0.3 0.05) ;p @@ -3655,9 +3657,9 @@ ;weights (list 2.5 10 5 0.033333333 3.333333333 20) ;types - (list "nominal" "nominal" "nominal" "continuous" "nominal" "continuous") + (list "nominal_numeric" "nominal_numeric" "nominal_numeric" "continuous_numeric" "nominal_numeric" "continuous_numeric") ;attributes - (list 3 100 7 0 10 0) + (list 3 100 7 (null) 10 (null)) ;deviations (list 0.4 0.1 0.2 30 0.3 0.05) ;p @@ -3720,7 +3722,7 
@@ (list "A" "B") (list 4 9) (null) ; context_weights - (list "nominal" "nominal") ; types + (list "nominal_numeric" "nominal_numeric") ; types (list 1 1) ; attributes (null) ; context_deviations 0.1 ; p_parameter @@ -3738,7 +3740,7 @@ (list "B" "A") (list 9 4) (null) ; context_weights - (list "nominal" "nominal") ; types + (list "nominal_numeric" "nominal_numeric") ; types (list 1 1) ; attributes (null) ; context_deviations 0.1 ; p_parameter @@ -3854,15 +3856,15 @@ )) "\n") (print "cyclic KL: " (compute_on_contained_entities "BoxConvictionTestContainer" (list - (compute_entity_kl_divergences 1 (list "x" "y") (null) (null) (list "cyclic" "continuous") (list 3.5 (null)) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise") + (compute_entity_kl_divergences 1 (list "x" "y") (null) (null) (list "continuous_numeric_cyclic" "continuous_numeric") (list 3.5 (null)) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise") ))) (print "cyclic conviction: " (compute_on_contained_entities "BoxConvictionTestContainer" (list - (compute_entity_convictions 1 (list "x" "y") (null) (null) (list "cyclic" "continuous") (list 3.5 (null)) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise") + (compute_entity_convictions 1 (list "x" "y") (null) (null) (list "continuous_numeric_cyclic" "continuous_numeric") (list 3.5 (null)) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise") ))) (print "cyclic group kl divergence: " (compute_on_contained_entities "BoxConvictionTestContainer" (list - (compute_entity_group_kl_divergence 1 (list "x" "y") (list "vert4") (null) (list "cyclic" "continuous") (list 3.5 (null)) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise") + (compute_entity_group_kl_divergence 1 (list "x" "y") (list "vert4") (null) (list "continuous_numeric_cyclic" "continuous_numeric") (list 3.5 (null)) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise") )) "\n") (print "surprisal transforms\n") @@ -3895,7 +3897,7 @@ (list "x") (list 0) 
(null) ; context_weights - (list "continuous") ; types + (list "continuous_numeric") ; types (null) ; attributes (null) ; context_deviations 1 ; p_parameter @@ -3919,7 +3921,7 @@ (list "x") (list 0) (null) ; context_weights - (list "continuous") ; types + (list "continuous_numeric") ; types (null) ; attributes (null) ; context_deviations 1 ; p_parameter diff --git a/src/Amalgam/amlg_code/module_test.json b/src/Amalgam/amlg_code/module_test.json index 5ebc1778..385b451e 100644 --- a/src/Amalgam/amlg_code/module_test.json +++ b/src/Amalgam/amlg_code/module_test.json @@ -1 +1 @@ -[{"a":3,"b":4},{"d":null,"c":"c"}] \ No newline at end of file +[{"b":4,"a":3},{"d":null,"c":"c"}] \ No newline at end of file diff --git a/src/Amalgam/amlg_code/module_test2.amlg b/src/Amalgam/amlg_code/module_test2.amlg index 3f90ac17..91ec7e06 100644 --- a/src/Amalgam/amlg_code/module_test2.amlg +++ b/src/Amalgam/amlg_code/module_test2.amlg @@ -1,8 +1,8 @@ (assoc + b #b (true) a #a 1 hello #hello (print "hello\n") d #d 100000000 c #c 0.1 - b #b (true) ) diff --git a/src/Amalgam/entity/EntityQueryBuilder.h b/src/Amalgam/entity/EntityQueryBuilder.h index 89eb7923..da408c5b 100644 --- a/src/Amalgam/entity/EntityQueryBuilder.h +++ b/src/Amalgam/entity/EntityQueryBuilder.h @@ -39,6 +39,18 @@ namespace EntityQueryBuilder || type == ENT_COMPUTE_ENTITY_KL_DIVERGENCES); } + //populates deviation data for feature_params from deviation_node + inline void PopulateFeatureDeviationData(GeneralizedDistance::FeatureParams &feature_params, EvaluableNode *deviation_node) + { + if(deviation_node == nullptr) + { + feature_params.deviation = 0.0; + return; + } + + feature_params.deviation = EvaluableNode::ToNumber(deviation_node, 0.0); + } + //populates the features of dist_params based on either num_elements or element_names for each of the // four different attribute parameters based on its type (using num_elements if list or immediate, element_names if assoc) inline void 
PopulateDistanceFeatureParameters(GeneralizedDistance &dist_params, @@ -70,12 +82,15 @@ namespace EntityQueryBuilder StringInternPool::StringID feature_type_id = EvaluableNode::ToStringIDIfExists(en); switch(feature_type_id) { - case ENBISI_nominal: feature_type = GeneralizedDistance::FDT_NOMINAL; break; - case ENBISI_continuous: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; break; - case ENBISI_cyclic: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC; break; - case GetStringIdFromNodeTypeFromString(ENT_STRING): feature_type = GeneralizedDistance::FDT_CONTINUOUS_STRING; break; - case ENBISI_code: feature_type = GeneralizedDistance::FDT_CONTINUOUS_CODE; break; - default: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; break; + case ENBISI_nominal_numeric: feature_type = GeneralizedDistance::FDT_NOMINAL_NUMERIC; break; + case ENBISI_nominal_string: feature_type = GeneralizedDistance::FDT_NOMINAL_STRING; break; + case ENBISI_nominal_code: feature_type = GeneralizedDistance::FDT_NOMINAL_CODE; break; + case ENBISI_continuous_numeric: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; break; + case ENBISI_continuous_numeric_cyclic: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC; break; + case ENBISI_continuous_string: feature_type = GeneralizedDistance::FDT_CONTINUOUS_STRING; break; + case ENBISI_continuous_code: feature_type = GeneralizedDistance::FDT_CONTINUOUS_CODE; break; + + default: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; break; } } dist_params.featureParams[i].featureType = feature_type; @@ -87,65 +102,68 @@ namespace EntityQueryBuilder [&dist_params](size_t i, bool found, EvaluableNode *en) { if(i < dist_params.featureParams.size()) { - dist_params.featureParams[i].unknownToUnknownDifference = std::numeric_limits::quiet_NaN(); - dist_params.featureParams[i].knownToUnknownDifference = std::numeric_limits::quiet_NaN(); - //get attributes based on feature type 
switch(dist_params.featureParams[i].featureType) { - case GeneralizedDistance::FDT_NOMINAL: + case GeneralizedDistance::FDT_NOMINAL_NUMERIC: + case GeneralizedDistance::FDT_NOMINAL_STRING: + case GeneralizedDistance::FDT_NOMINAL_CODE: if(found && !EvaluableNode::IsNull(en)) - { - if(en->EvaluableNode::IsOrderedArray()) - { - auto &ocn = en->GetOrderedChildNodesReference(); - size_t ocn_size = ocn.size(); - if(ocn_size > 0) - dist_params.featureParams[i].typeAttributes.nominalCount = EvaluableNode::ToNumber(ocn[0]); - if(ocn_size > 1) - dist_params.featureParams[i].knownToUnknownDifference = EvaluableNode::ToNumber(ocn[1]); - if(ocn_size > 2) - dist_params.featureParams[i].unknownToUnknownDifference = EvaluableNode::ToNumber(ocn[2]); - } - else //treat as singular value - { - dist_params.featureParams[i].typeAttributes.nominalCount = EvaluableNode::ToNumber(en); - } - } - else - { - dist_params.featureParams[i].typeAttributes.nominalCount = 0.0; - } + dist_params.featureParams[i].typeAttributes.nominalCount = EvaluableNode::ToNumber(en, 1); break; case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC: + if(found && !EvaluableNode::IsNull(en)) + dist_params.featureParams[i].typeAttributes.maxCyclicDifference = EvaluableNode::ToNumber(en); + else //can't be cyclic without a range + dist_params.featureParams[i].featureType = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; + break; + + default: + break; + } + } + }); + + //get deviations + EvaluableNode::ConvertChildNodesAndStoreValue(deviations_node, element_names, num_elements, + [&dist_params](size_t i, bool found, EvaluableNode *en) { + if(i < dist_params.featureParams.size()) + { + dist_params.featureParams[i].deviation = 0.0; + dist_params.featureParams[i].unknownToUnknownDistanceTerm.difference = std::numeric_limits::quiet_NaN(); + dist_params.featureParams[i].knownToUnknownDistanceTerm.difference = std::numeric_limits::quiet_NaN(); + + //get deviations based on feature type + 
switch(dist_params.featureParams[i].featureType) + { + case GeneralizedDistance::FDT_NOMINAL_NUMERIC: + case GeneralizedDistance::FDT_NOMINAL_STRING: + case GeneralizedDistance::FDT_NOMINAL_CODE: if(found && !EvaluableNode::IsNull(en)) { if(en->EvaluableNode::IsOrderedArray()) { auto &ocn = en->GetOrderedChildNodesReference(); size_t ocn_size = ocn.size(); + if(ocn_size > 0) - dist_params.featureParams[i].typeAttributes.maxCyclicDifference = EvaluableNode::ToNumber(ocn[0]); + PopulateFeatureDeviationData(dist_params.featureParams[i], ocn[0]); + if(ocn_size > 1) - dist_params.featureParams[i].knownToUnknownDifference = EvaluableNode::ToNumber(ocn[1]); + dist_params.featureParams[i].knownToUnknownDistanceTerm.difference = EvaluableNode::ToNumber(ocn[1]); + if(ocn_size > 2) - dist_params.featureParams[i].unknownToUnknownDifference = EvaluableNode::ToNumber(ocn[2]); + dist_params.featureParams[i].unknownToUnknownDistanceTerm.difference = EvaluableNode::ToNumber(ocn[2]); } else //treat as singular value { - dist_params.featureParams[i].typeAttributes.maxCyclicDifference = EvaluableNode::ToNumber(en); + PopulateFeatureDeviationData(dist_params.featureParams[i], en); } } - else //can't be cyclic without a range - { - dist_params.featureParams[i].featureType = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; - } break; - case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC: - case GeneralizedDistance::FDT_CONTINUOUS_STRING: - case GeneralizedDistance::FDT_CONTINUOUS_CODE: + default: if(found && !EvaluableNode::IsNull(en)) { if(en->EvaluableNode::IsOrderedArray()) @@ -153,31 +171,21 @@ namespace EntityQueryBuilder auto &ocn = en->GetOrderedChildNodesReference(); size_t ocn_size = ocn.size(); if(ocn_size > 0) - dist_params.featureParams[i].knownToUnknownDifference = EvaluableNode::ToNumber(ocn[0]); + dist_params.featureParams[i].deviation = EvaluableNode::ToNumber(ocn[0]); if(ocn_size > 1) - dist_params.featureParams[i].unknownToUnknownDifference = EvaluableNode::ToNumber(ocn[1]); + 
dist_params.featureParams[i].knownToUnknownDistanceTerm.difference = EvaluableNode::ToNumber(ocn[1]); + if(ocn_size > 2) + dist_params.featureParams[i].unknownToUnknownDistanceTerm.difference = EvaluableNode::ToNumber(ocn[2]); } else //treat as singular value { - dist_params.featureParams[i].knownToUnknownDifference = EvaluableNode::ToNumber(en); + dist_params.featureParams[i].deviation = EvaluableNode::ToNumber(en); } } break; } } }); - - //get deviations - EvaluableNode::ConvertChildNodesAndStoreValue(deviations_node, element_names, num_elements, - [&dist_params](size_t i, bool found, EvaluableNode *en) { - if(i < dist_params.featureParams.size()) - { - if(found) - dist_params.featureParams[i].deviation = EvaluableNode::ToNumber(en); - else - dist_params.featureParams[i].deviation = 0.0; - } - }); } diff --git a/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp b/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp index 3cd56157..72255edf 100644 --- a/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp +++ b/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp @@ -1110,16 +1110,16 @@ EvaluableNodeReference Interpreter::InterpretNode_ENT_GENERALIZED_DISTANCE(Evalu auto &feature_params = dist_params.featureParams[i]; //if one is nan and the other is not, the use the non-nan one for both - if(FastIsNaN(feature_params.unknownToUnknownDifference)) + if(FastIsNaN(feature_params.unknownToUnknownDistanceTerm.difference)) { - if(!FastIsNaN(feature_params.knownToUnknownDifference)) - feature_params.unknownToUnknownDifference = feature_params.knownToUnknownDifference; + if(!FastIsNaN(feature_params.knownToUnknownDistanceTerm.difference)) + feature_params.unknownToUnknownDistanceTerm.difference = feature_params.knownToUnknownDistanceTerm.difference; else - feature_params.unknownToUnknownDifference = dist_params.GetMaximumDifference(i); + feature_params.unknownToUnknownDistanceTerm.difference = dist_params.GetMaximumDifference(i); } - 
if(FastIsNaN(feature_params.knownToUnknownDifference)) - feature_params.knownToUnknownDifference = feature_params.unknownToUnknownDifference; + if(FastIsNaN(feature_params.knownToUnknownDistanceTerm.difference)) + feature_params.knownToUnknownDistanceTerm.difference = feature_params.unknownToUnknownDistanceTerm.difference; dist_params.ComputeAndStoreUncertaintyDistanceTerms(i); } diff --git a/src/Amalgam/out.txt b/src/Amalgam/out.txt index 543b4150..2035856e 100644 --- a/src/Amalgam/out.txt +++ b/src/Amalgam/out.txt @@ -233,11 +233,11 @@ notakeyword (print "hello") (list .nan .nan .infinity -.infinity) -(assoc a 1 c (list "alpha" "beta" "gamma") b 2) +(assoc b 2 a 1 c (list "alpha" "beta" "gamma")) (assoc + b 2 a 1 c (list "alpha" "beta" "gamma") - b 2 ) --if-- @@ -587,7 +587,7 @@ abcdef 0.14384103622589045 --first-- 4 -1 +2 1 0 a @@ -602,18 +602,18 @@ a (list 1 2 3 4 5 6) (list) (assoc - b 2 + a 1 c 3 d 4 e 5 f 6 ) -(assoc b 2 f 6) +(assoc c 3 d 4) (assoc - b 2 + a 1 c 3 d 4 - f 6 + e 5 ) (assoc a 1 @@ -636,7 +636,7 @@ abcdef .nas --last-- this -1 +2 1 0 c @@ -651,18 +651,18 @@ c (list 1 2 3 4 5 6) (list) (assoc - b 2 + a 1 c 3 d 4 e 5 f 6 ) -(assoc b 2 f 6) +(assoc c 3 d 4) (assoc - b 2 + a 1 c 3 d 4 - f 6 + e 5 ) (assoc a 1 @@ -1008,7 +1008,7 @@ abcdef "2020-06-08 lunes 11.33.48" ) --indices-- -(list "a" "c" "b" "4") +(list "b" "a" "4" "c") (list 0 1 @@ -1020,7 +1020,7 @@ abcdef 7 ) --values-- -(list 1 3 2 "d") +(list 2 1 "d" 3) (list "a" 1 @@ -1041,7 +1041,7 @@ abcdef 4 "d" ) -(list 1 0 3 2 "d") +(list 2 1 0 "d" 3) (list 1 2 @@ -1252,7 +1252,7 @@ current_index: 2 interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1701560610.772288 + start_time 1702689639.314631 www 1 x 12 zz 10 @@ -1295,7 +1295,7 @@ current_index: 2 interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1701560610.772288 + start_time 
1702689639.314631 www 1 x 12 zz 10 @@ -1337,7 +1337,7 @@ current_index: 2 interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1701560610.772288 + start_time 1702689639.314631 www 1 x 12 zz 10 @@ -1470,22 +1470,22 @@ true --weighted_rand-- b -(list "b" "b" "a" "b") +(list "b" "b" "b" "b") b (list "a" "b" @(get (target 0) 0) @(get (target 0) 1)) (list "a" @(get (target 0) 0) "b" @(get (target 0) 2)) -infinity test c or d: (list "d" "d" "d" "c") +infinity test c or d: (list "c" "c" "c" "d") infinity test c or d: (list "c" @(get (target 0) 0) "d" @(get (target 0) 0)) -(assoc a 25 b 45 c 30) +(assoc a 30 b 46 c 24) (assoc a 29 b 44 c 27) -(list "7" "3" "1") +(list "7" "2" "6") --get_rand_seed-- °³È¼¿\¨KOaVÆT zÿ @@ -1532,8 +1532,8 @@ infinity test c or d: (list "c" @(get (target 0) 0) "d" @(get (target 0) 0)) string --set_type-- (- 3 4) -(list "a" 4 "b" 3) -(list "a" 4 "b" 3) +(list "b" 3 "a" 4) +(list "b" 3 "a" 4) (assoc a 4 b 3) 8.7 (parallel @@ -1582,9 +1582,10 @@ string (assoc a 3 b 4) (assoc c "c") ) -21: [{"a":3,"b":4},{"d":null,"c":"c"}] +21: [{"b":4,"a":3},{"d":null,"c":"c"}] 22: [{"a":3,"b":4},{"c":"c","d":null}] -23: a: 1 +23: b: 2 +a: 1 e: - a - b @@ -1592,7 +1593,6 @@ e: - .inf d: 4 c: 3 -b: 2 24: a: 1 b: 2 @@ -1605,7 +1605,7 @@ e: - .inf 25: (assoc a 1) -current date-time in epoch: 2023-12-02-18.43.31.0723590 +current date-time in epoch: 2023-12-15-20.20.39.6032800 2020-06-07 00:22:59 1391230800 1391230800 @@ -1655,7 +1655,7 @@ domingo, jun. 07, 2020 ) ) (assoc - labelA #labelQ #labelA + labelA #labelA #labelQ (lambda #labelB (true) ) @@ -1672,7 +1672,7 @@ domingo, jun. 
07, 2020 ) ) (assoc - labelA #labelQ #labelA + labelA #labelA #labelQ (lambda #labelB (true) ) @@ -1894,22 +1894,27 @@ decrypted: hello 12 (values) (append) - (associate b 1 (mutate) 2) + (associate + #a "a" + (and) + "b" + 2 + ) ) (list 1 2 - (-) + 3 4 (associate "alpha" 5 "beta" 6) (associate "nest" (associate - (-) - (list 7 8 9) + "count" + (list (+) 8 9) ) "end" - (list 10 11 12) + (list 10 11 (+)) ) ) --commonality-- @@ -2325,12 +2330,14 @@ decrypted: hello --mix-- (list 1 - 3.5 + 2 + 4 + 3 5.5 - 8 + 7.5 9.5 11.5 - 13 + 13.5 ) (list @@ -2340,10 +2347,10 @@ decrypted: hello ;comment 4 1 3.5 - 6 + 5.5 7.5 - 10 - 11.5 + 9 + 12 13.5 ) (list @@ -2353,20 +2360,16 @@ decrypted: hello (associate "a" 3 "b" 4) (lambda (if - true + false 1 (parallel (get_entity_comments) 1 - (lambda - (print - (list 9) - ) - ) + (lambda (null)) ) ) ) - (list 6) + (list 5 6) ) (list 1 @@ -2384,14 +2387,13 @@ decrypted: hello (list 2 9) ) ) - 1 ) ) ) - (list 5 6) ) (list (true) + 2 3.5 5.5 7.5 @@ -2400,21 +2402,22 @@ decrypted: hello 13.5 ) (list + (true) 2 3 + 5 8 - 7 9 + 11 14 - 13 ) -1 -1 +4 +4 2.5 2.5 -abcdeomxyz abcdexyz -abcdoxyz +abcdomxyz +abcdeoxyz --mix_labels-- (list 1 @@ -2626,57 +2629,39 @@ flatten restore with parallel --mutate_entity-- (list 1 - (get) + a 3 4 - 5 - (floor) - (assoc) (set_type) (call) - 10 - 11 - b - 13 - 14 - (associate "a" (contains_index) b 2) -) - -(list - 1 - (dot_product) - 3 - (null) - 5 - 6 - (query_not_exists) + 7 8 - 9 + b 10 - XXWA6ez + 11 12 - (call_entity) - 14 - (associate b 1 "b" b) + 13 + (contains_index) + (associate) ) +8 (list 1 (+) 3 - 4 (+) + (*) 6 7 8 - (-) - 10 - 11 (+) (*) + 11 + (+) + 13 14 - (associate "a" 1 "b" (+)) - (*) + (associate) ) --commonality_entities-- @@ -2689,10 +2674,10 @@ MergeEntityChild1 (associate "x" 3 "y" 4) MergeEntityChild2 (associate "p" 3 "q" 4) -_2169689611 -(associate "e" 3 "f" 4) _2280722175 (associate "E" 3 "F" 4) +_2169689611 +(associate "e" 3 "f" 4) --union_entities-- (associate "b" 4 "a" 3 "c" 3) 
MergeEntityChild1 @@ -2710,17 +2695,6 @@ MergeEntityChild2 "w" 7 ) -_2169689611 -(associate - "e" - 3 - "f" - 4 - "g" - 5 - "h" - 6 -) _2280722175 (associate "E" @@ -2732,11 +2706,7 @@ _2280722175 "H" 6 ) -(parallel - ##p - (list "_3990396532" "_3330773578" "_3990396532" "_3330773578") -) -_3330773578 +_2169689611 (associate "e" 3 @@ -2747,6 +2717,10 @@ _3330773578 "h" 6 ) +(parallel + ##p + (list "_3990396532" "_3330773578" "_3990396532" "_3330773578") +) _3990396532 (associate "E" @@ -2758,6 +2732,17 @@ _3990396532 "H" 6 ) +_3330773578 +(associate + "e" + 3 + "f" + 4 + "g" + 5 + "h" + 6 +) --difference_entities-- (declare (assoc _ (null)) @@ -3376,22 +3361,22 @@ _3128546630 --mix_entities-- (associate "b" 4 "a" 3) MergeEntityChild1 -(associate "x" 3 "y" 4) +(associate "x" 3 "y" 4 "z" 5) MergeEntityChild2 (associate "p" 3 "q" 4 - "v" - 6 + "u" + 5 "w" 7 ) -_2169689611 -(associate "e" 3 "f" 4) _2280722175 -(associate "E" 3 "F" 4 "G" 5) +(associate "E" 3 "F" 4) +_2169689611 +(associate "e" 3 "f" 4 "g" 5) --get_entity_comments-- Full test This is a suite of unit tests. 
@@ -3446,7 +3431,7 @@ deep sets --set_entity_root_permission-- RootTest -1701560611.471558 +1702689639.806468 (true) RootTest @@ -3670,7 +3655,7 @@ hello ) ) ) - (set_entity_rand_seed new_entity "<¢õ®¦gR0Ñ”-I´»ÿ") + (set_entity_rand_seed new_entity "û;\t·‰ŸuÞ0Ñ”-I´»ÿ") (set_entity_rand_seed (first (create_entities @@ -3680,7 +3665,7 @@ hello ) ) ) - "’‰<ö°ÿ°öXe/·äï$ÿ" + "ò~¸·\tªÛk¿ÑÆ_WϤÿ" ) (set_entity_rand_seed (first @@ -3713,7 +3698,7 @@ hello ) ) ) - (set_entity_rand_seed new_entity "<¢õ®¦gR0Ñ”-I´»ÿ") + (set_entity_rand_seed new_entity "û;\t·‰ŸuÞ0Ñ”-I´»ÿ") (set_entity_rand_seed (first (create_entities @@ -3818,74 +3803,74 @@ store to .json normally (list "Child2" "Child7") (list "Child1" "Child5") (list "Child3" "Child4") -(list "Child1" "Child4" "Child6" "Child7") -(list "Child2" "Child3" "Child6" "Child7") +(list "Child1" "Child3" "Child5" "Child7") +(list "Child2" "Child4" "Child6" "Child7") (list "Child4" "Child6") --query_sample-- -(list "Child3") -(list "Child5" "Child2") -(list "Child6") +(list "Child4") +(list "Child7" "Child1") (list "Child6") +(list "Child2") --query_weighted_sample-- -(list "Child1") -(list "Child1") +(list "Child2") +(list "Child2") (list - "Child1" - "Child6" - "Child1" - "Child2" "Child2" "Child2" "Child2" "Child1" - "Child6" "Child2" "Child2" "Child2" "Child1" + "Child4" "Child1" "Child2" "Child2" - "Child4" + "Child6" + "Child2" + "Child1" + "Child1" + "Child2" "Child1" "Child1" "Child2" ) (list - "Child7" - "Child5" "Child1" - "Child2" - "Child2" - "Child4" "Child1" "Child1" "Child2" "Child2" - "Child2" + "Child1" + "Child1" "Child1" "Child2" - "Child4" + "Child1" "Child1" "Child2" "Child2" + "Child1" + "Child1" + "Child1" + "Child1" "Child2" - "Child4" "Child1" + "Child2" ) (list + "Child6" "Child2" + "Child5" "Child2" + "Child6" "Child2" "Child2" "Child2" - "Child5" - "Child2" - "Child5" "Child2" "Child2" ) -(list "Child2") +(list "Child2" "Child3") --query_in_entity_list-- (list "Child6" "Child7") 
--query_not_in_entity_list-- @@ -4254,15 +4239,15 @@ case conviction:(assoc ) cyclic feature nearest neighbors: (assoc cyclic1 1 cyclic5 0.5) cyclic test expected: 155, 200, 190 ... deg values of 0 8 and 12: -200: 0.05555555555555555 (null - ##deg 8 -) 190: 0.045454545454545456 (null ##deg 12 ) 155: 0.1 (null ##deg 0 ) +200: 0.05555555555555555 (null + ##deg 8 +) --contains_label-- (true) @@ -4439,13 +4424,13 @@ distance symmetry tests (list (list "B" - "C" - "D" - "A" "I" + "A" + "C" "F" + "D" "J" - "E" + "G" ) (list 0 @@ -4462,12 +4447,12 @@ distance symmetry tests (list "B" "A" - "C" "F" + "C" "D" "I" - "H" "J" + "G" ) (list 0 @@ -4660,4 +4645,4 @@ Expecting 1000: 1000 concurrent entity writes successful: (true) --total execution time-- -1.2860238552093506 +1.1452429294586182