diff --git a/docs/language.js b/docs/language.js index e7a9d38a..bd9e1dda 100644 --- a/docs/language.js +++ b/docs/language.js @@ -399,9 +399,9 @@ var data = [ }, { - "parameter" : "generalized_distance list|assoc|number weights list|assoc distance_types list|assoc attributes list|assoc|number deviations number p_value list|assoc|* vector1 [list|assoc|* vector2] [list value_names]", + "parameter" : "generalized_distance list|assoc|number weights list|assoc distance_types list|assoc attributes list|assoc|number deviations number p_value list|assoc|* vector1 [list|assoc|* vector2] [list value_names] [bool surprisal_space]", "output" : "number", - "description" : "Computes the generalized norm between vector1 and vector2 (or an equivalent zero vector if unspecified) with parameter specified by the p_value (2 being Euclidian distance), using the numerical distance or edit distance as appropriate. The parameter value_names, if specified as a list of the names of the values, will transform via unzipping any assoc into a list for the respective parameter in the order of the value_names, or if a number will use the number repeatedly for every element. weights is a list of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. 
The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. If any vector value is null or evaluates to nan, or any of the differences between vector1 and vector2 evaluate to null or nan, then it will compute a corresponding maximum distance value based on the properties of the feature.", + "description" : "Computes the generalized norm between vector1 and vector2 (or an equivalent zero vector if unspecified) with parameter specified by the p_value (2 being Euclidian distance), using the numerical distance or edit distance as appropriate. 
The parameter value_names, if specified as a list of the names of the values, will transform via unzipping any assoc into a list for the respective parameter in the order of the value_names, or if a number will use the number repeatedly for every element. weights is a list of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list of strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list.
If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. If any vector value is null or evaluates to nan, or any of the differences between vector1 and vector2 evaluate to null or nan, then it will compute a corresponding maximum distance value based on the properties of the feature. If surprisal_space is true, which defaults to false, it will perform all computations in surprisal space.", "example" : "(print (generalized_distance 0.01 (null) (null) (list null (list 0 360)) (list 0.5 0.0) (list 0 2 3) (list 1 2 3)))\n(print (generalized_distance 0.01 (list 0.25 0.25 0.5) (null) (null) (null) (list 1 2 3) (list 0 2 3) ))\n(generalized_distance 1 (list 0.3333 0.3333 0.3333) (list 5 0) (null) (null) (list 1 2 3) (list 10 2 10) )" }, @@ -1599,7 +1599,7 @@ var data = [ "parameter" : "query_within_generalized_distance number max_distance list axis_labels list axis_values list|assoc|number weights list|assoc distance_types list|assoc attributes list|assoc|number deviations [number p_value] [string|number distance_transform] [string entity_weight_label_name] [number random_seed] [string radius_label] [string numerical_precision] [* output_sorted_list]", "output" : "query", "new value" : "new", - "description" : "When used as a query argument, selects entities which represent a point within a certain generalized norm to a given point.
axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. For attributes, the particular distance_types specifies what is expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values available. For continuous, a null means unbounded where distance for a null will be computed automatically from the relevant data; a single number indicates the difference between a value and a null, a specified uncertainty. Cyclic requires either a single value or a list of two values; a list of two values indicates that the first value, the lower bound, will wrap around to the upper bound, the second value specified; if only a single number is provided instead of a list, then it will assume that number for the upper bound and 0 for the lower bound. For the string distance type, the value specified can be a number indicating the maximum possible string length, inferred if null is provided. 
For code, the value specified can be a number indicating the maximum number of nodes in the code (including labels), inferred if null is provided. Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. max_distance is the maximum distance allowed. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. 
If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, selects entities which represent a point within a certain generalized norm to a given point. axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. 
The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. For attributes, the particular distance_types specifies what is expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values available. For continuous, a null means unbounded where distance for a null will be computed automatically from the relevant data; a single number indicates the difference between a value and a null, a specified uncertainty. Cyclic requires either a single value or a list of two values; a list of two values indicates that the first value, the lower bound, will wrap around to the upper bound, the second value specified; if only a single number is provided instead of a list, then it will assume that number for the upper bound and 0 for the lower bound. For the string distance type, the value specified can be a number indicating the maximum possible string length, inferred if null is provided. For code, the value specified can be a number indicating the maximum number of nodes in the code (including labels), inferred if null is provided. Deviations contains numbers that are used during the distance calculation, per-element, prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. max_distance is the maximum distance allowed. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision.
If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. 
If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(contained_entities \"TestContainerExec\" (list\n (query_within_generalized_distance 60 (list \"x\" \"y\") (list 0.0 0.0) (null) (null) (null) (null) 0.5 1 (null) \"random seed 1234\" \"radius\")\n))" }, @@ -1607,7 +1607,7 @@ var data = [ "parameter" : "query_nearest_generalized_distance number entities_returned list axis_labels list axis_values list|assoc weights list|assoc distance_types list|assoc attributes list|assoc deviations [number p_value] [string|number distance_transform] [string entity_weight_label_name] [number random_seed] [string radius_label] [string numerical_precision] [* output_sorted_list]", "output" : "query", "new value" : "new", - "description" : "When used as a query argument, selects the closest entities which represent a point within a certain generalized norm to a given point. axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. 
The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). 
The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. 
If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, selects the closest entities which represent a point within a certain generalized norm to a given point. axis_labels specifies the names of the coordinate axes (as labels on the target entity), and axis_values the specifies the corresponding values for the point to test from. p_value is the generalized norm parameter. weights is a list or assoc of dimension weights to use for the query, each value mapping to its respective element in the vectors. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. 
Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their distances. If these distances are returned, then a transform may be applied to them based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities before being returned. If distance_transform is a number or omitted, which will default to 1.0, then it will be treated as a distance weight exponent, and will be applied to each distance as distance^distance_weight_exponent.
If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(contained_entities \"TestContainerExec\" (list\n (query_nearest_generalized_distance (list \"x\" \"y\") (list 0.0 0.0) 0.5 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(contained_entities \"TestContainerExec\" (list\n (query_nearest_generalized_distance (list \"x\" \"y\") (list 0.0 0.0) 0.5 (null) (null) 10 \"radius\")\n))" }, @@ -1616,7 +1616,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. 
feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically computed from the data based on the maximum difference. 
If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. 
If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". 
Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). 
The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then it will compute the conviction as if those entities were added or included. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). 
If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_convictions (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_convictions (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, @@ -1625,7 +1625,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case kl divergence for every case given in case_ids_to_compute as a group with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. 
\nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. 
If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included.", + "description" : "When used as a query argument, computes the case kl divergence for every case given in case_ids_to_compute as a group with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". 
Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). 
The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. 
If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_group_kl_divergence (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_group_kl_divergence (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, @@ -1634,7 +1634,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. 
\nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. 
If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. 
If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list of strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. 
If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). 
If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_distance_contributions (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_distance_contributions (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, @@ -1643,7 +1643,7 @@ var data = [ "output" : "query", "new value" : "new", "concurrency" : true, - "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. 
The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). 
The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precison, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal_to_prob\" then distances will be assumed to be surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). 
If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "description" : "When used as a query argument, computes the case conviction for every case given in case_ids_to_compute with respect to *all* cases in the contained entities set input during a query. If case_ids_to_compute is null/emptylist, case conviction is computed for all cases. feature_labels specifies the names of the features to consider the during computation. p_value is the generalized norm parameter. If weights is null, then it will assume that the weights are 1 and additionally will ignore null values for the vectors instead of treating them as unknown differences. The parameter distance_types is either a list strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_numeric\", \"nominal_string\", \"nominal_code\", \"continuous_numeric\", \"continuous_numeric_cyclic\", \"continuous_string\", and \"continuous_code\". Nominals evaluate whether the two values are the same and continuous evaluates the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, the particular distance_types specifies what particular attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. 
Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as deviations is equivalent to setting each deviation to 0. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. entities_returned specifies the number of entities to return. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their convictions. A transform will be applied to these distances based on distance_transform. 
If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If conviction_of_removal is true, then it will compute the conviction as if the entities specified by entity_ids_to_compute were removed; if false (the default), then will compute the conviction as if those entities were added or included. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. 
If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_kl_divergences (list \"feature_1\" \"feature_2\") (list entity_id_1 entity_id_2 entity_id 3) 1.0 (list 0.25 0.75) (list 5 0) (list null (list 0 360)) (list 0.5 0.0) 10 \"radius\")\n))\n(compute_on_contained_entities \"TestContainerExec\" (list\n (compute_entity_kl_divergences (list \"x\" \"y\") (null) 2.0 (null) (null) 10 \"radius\")\n))" }, diff --git a/src/Amalgam/GeneralizedDistance.h b/src/Amalgam/GeneralizedDistance.h index 3e32eee0..cc8e0643 100644 --- a/src/Amalgam/GeneralizedDistance.h +++ b/src/Amalgam/GeneralizedDistance.h @@ -87,13 +87,13 @@ class GeneralizedDistance if(compute_accurate) { feature_params.unknownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.unknownToUnknownDistanceTerm.difference, index, true), true); + ComputeDistanceTermMatchOnNull(index, feature_params.unknownToUnknownDistanceTerm.difference, true), true); } if(compute_approximate) { feature_params.unknownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.unknownToUnknownDistanceTerm.difference, index, false), false); + ComputeDistanceTermMatchOnNull(index, feature_params.unknownToUnknownDistanceTerm.difference, false), false); } //if knownToUnknownDifference is same as unknownToUnknownDifference, can copy distance term instead of recomputing @@ -107,13 +107,13 @@ class GeneralizedDistance if(compute_accurate) { feature_params.knownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.knownToUnknownDistanceTerm.difference, index, true), true); + ComputeDistanceTermMatchOnNull(index, feature_params.knownToUnknownDistanceTerm.difference, true), true); } if(compute_approximate) { feature_params.knownToUnknownDistanceTerm.SetValue( - 
ComputeDistanceTermNonNull(feature_params.knownToUnknownDistanceTerm.difference, index, false), false); + ComputeDistanceTermMatchOnNull(index, feature_params.knownToUnknownDistanceTerm.difference, false), false); } } @@ -136,7 +136,7 @@ class GeneralizedDistance } //for the feature index, computes and stores the distance terms as measured from value to each interned value - inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(size_t index, double value, std::vector *interned_values) + inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(double value, size_t index, std::vector *interned_values) { bool compute_accurate = NeedToPrecomputeAccurate(); bool compute_approximate = NeedToPrecomputeApproximate(); @@ -160,9 +160,9 @@ class GeneralizedDistance { double difference = value - interned_values->at(i); if(compute_accurate) - feature_params.internDistanceTerms[i].SetValue(ComputeDistanceTermNonNominalNonNullRegular(difference, index, true), true); + feature_params.internDistanceTerms[i].SetValue(ComputeDistanceTermContinuousNonNullRegular(difference, index, true), true); if(compute_approximate) - feature_params.internDistanceTerms[i].SetValue(ComputeDistanceTermNonNominalNonNullRegular(difference, index, false), false); + feature_params.internDistanceTerms[i].SetValue(ComputeDistanceTermContinuousNonNullRegular(difference, index, false), false); } } @@ -194,6 +194,17 @@ class GeneralizedDistance return s_two_over_sqrt_pi * deviation * FastExp(-term * term) - diff * std::erfc(term); //2*sigma*(e^(-1*(diff^2)/((2*simga)^2)))/sqrt(pi) - diff*erfc(diff/(2*sigma)) } + //surprisal in nats of each of the different distributions given the appropriate uncertainty + //this is equal to the nats of entropy of the distribution plus the entropy of the uncertainty + //in the case of Laplace, the Laplace distribution is one nat, and the mean absolute deviation is half of that, + //therefore the value is 1.5 + //these values can be computed via 
ComputeDeviationPartLaplace(0.0, 1) for each of the corresponding methods + //deviations other than 1 can be used, but then the result should be divided by that deviation, yielding the same value + static constexpr double s_surprisal_of_laplace = 1.5; + static constexpr double s_surprisal_of_laplace_approx = 1.500314205; + static constexpr double s_surprisal_of_gaussian = 1.1283791670955126; + static constexpr double s_surprisal_of_gaussian_approx = 1.128615528679644; + //computes the Lukaszyk–Karmowski metric deviation component for the minkowski distance equation given the feature difference and feature deviation //assumes deviation is nonnegative __forceinline double ComputeDeviationPart(const double diff, const double deviation, bool high_accuracy) @@ -212,6 +223,20 @@ class GeneralizedDistance #endif } + //converts a difference with deviation to surprisal, and removes the appropriate assumption of uncertainty + //for Laplace, the Laplace distribution has 1 nat worth of information, but additionally, there is a 50/50 chance that the + //difference is within the mean absolute error, yielding an overcounting of an additional 1/2 nat. So the total reduction is 1.5 nats + __forceinline double ComputeSurprisalFromDifferenceWithDeviation(const double difference_with_deviation, const double deviation, bool high_accuracy) + { + #ifdef DISTANCE_USE_LAPLACE_LK_METRIC + double base_surprisal = (high_accuracy ? s_surprisal_of_laplace : s_surprisal_of_laplace_approx); + #else + double base_surprisal = (high_accuracy ? 
s_surprisal_of_gaussian : s_surprisal_of_gaussian_approx); + #endif + + return (difference_with_deviation / deviation) - base_surprisal; + } + //constrains the difference to the cycle length for cyclic distances __forceinline static double ConstrainDifferenceToCyclicDifference(double difference, double cycle_length) { @@ -374,7 +399,7 @@ class GeneralizedDistance return fastPowInverseP.FastPow(d); } - //computes the exponentiation of d to p given precision being from DistanceTerms + //computes the exponentiation of d to p __forceinline double ExponentiateDifferenceTerm(double d, bool high_accuracy) { if(pValue == 1) @@ -389,6 +414,34 @@ class GeneralizedDistance return fastPowP.FastPow(d); } + //exponentiats and weights the difference term contextually based on pValue + //note that it has extra logic to account for extreme values like infinity, negative infinity, and 0 + __forceinline double ContextuallyExponentiateAndWeightDifferenceTerm(double dist_term, size_t index, bool high_accuracy) + { + if(dist_term == 0.0) + return 0.0; + + double weight = featureParams[index].weight; + if(pValue == 0) + { + if(high_accuracy) + return std::pow(dist_term, weight); + else + return FastPow(dist_term, weight); + } + else if(pValue == std::numeric_limits::infinity() + || pValue == -std::numeric_limits::infinity()) + { + //infinite pValues are treated the same as 1 for distance terms, + //and are the same value regardless of high_accuracy + return dist_term * weight; + } + else + { + return ExponentiateDifferenceTerm(dist_term, high_accuracy) * weight; + } + } + //returns the maximum difference inline double GetMaximumDifference(size_t index) { @@ -404,102 +457,76 @@ class GeneralizedDistance return -std::numeric_limits::infinity(); } - //computes the distance term for a nominal when two universally symmetric nominals are equal - __forceinline double ComputeDistanceTermNominalUniversallySymmetricExactMatch(size_t index, bool high_accuracy) + //computes the base of the 
difference between two nominal values that exactly match without exponentiation + __forceinline double ComputeDistanceTermNominalBaseExactMatchFromDeviation(size_t index, double deviation, bool high_accuracy) { - if(!DoesFeatureHaveDeviation(index)) + if(!DoesFeatureHaveDeviation(index) || computeSurprisal) return 0.0; - double weight = featureParams[index].weight; - double deviation = featureParams[index].deviation; - - //infinite pValues are treated the same as 1 for distance terms, - //and are the same value regardless of high_accuracy - if(pValue == 1 || pValue == std::numeric_limits::infinity() - || pValue == -std::numeric_limits::infinity()) - return deviation * weight; - - if(pValue == 0) - { - if(high_accuracy) - return std::pow(deviation, weight); - else - return FastPow(deviation, weight); - } - else - { - if(high_accuracy) - return std::pow(deviation, pValue) * weight; - else - return FastPow(deviation, pValue) * weight; - } + return deviation; } - //computes the distance term for a nominal when two universally symmetric nominals are not equal - __forceinline double ComputeDistanceTermNominalUniversallySymmetricNonMatch(size_t index, bool high_accuracy) + //computes the base of the difference between two nominal values that do not match without exponentiation + __forceinline double ComputeDistanceTermNominalBaseNonMatchFromDeviation(size_t index, double deviation, bool high_accuracy) { - double weight = featureParams[index].weight; - if(DoesFeatureHaveDeviation(index)) + if(computeSurprisal) + { + //need to have at least two classes in existence + double nominal_count = std::max(featureParams[index].typeAttributes.nominalCount, 2.0); + double prob_max_entropy_match = 1 / nominal_count; + + //find probability that the correct class was selected + //can't go below base probability of guessing + double prob_class_given_match = std::max(1 - deviation, prob_max_entropy_match); + + //find the probability that any other class besides the correct class was 
selected + //divide the probability among the other classes + double prop_class_given_nonmatch = (1 - prob_class_given_match) / (nominal_count - 1); + + double surprisal_class_given_match = -std::log(prob_class_given_match); + double surprisal_class_given_nonmatch = -std::log(prop_class_given_nonmatch); + + //the surprisal of the class matching on a different value is the difference between + //how surprised it would be given a nonmatch but without the surprisal given a match + double dist_term = surprisal_class_given_nonmatch - surprisal_class_given_match; + return dist_term; + } + else if(DoesFeatureHaveDeviation(index)) { - double deviation = featureParams[index].deviation; double nominal_count = featureParams[index].typeAttributes.nominalCount; // n = number of nominal classes // match: deviation ^ p * weight // non match: (deviation + (1 - deviation) / (n - 1)) ^ p * weight //if there is only one nominal class, the smallest delta value it could be is the specified smallest delta, otherwise it's 1.0 - double mismatch_deviation = 1.0; + double dist_term = 0; if(nominal_count > 1) - mismatch_deviation = (deviation + (1 - deviation) / (nominal_count - 1)); - - //infinite pValues are treated the same as 1 for distance terms, - //and are the same value regardless of high_accuracy - if(pValue == 1 || pValue == std::numeric_limits::infinity() - || pValue == -std::numeric_limits::infinity()) - return mismatch_deviation * weight; - - if(pValue == 0) - { - if(high_accuracy) - return std::pow(mismatch_deviation, weight); - else - return FastPow(mismatch_deviation, weight); - } + dist_term = (deviation + (1 - deviation) / (nominal_count - 1)); else - { - if(high_accuracy) - return std::pow(mismatch_deviation, pValue) * weight; - else - return FastPow(mismatch_deviation, pValue) * weight; - } + dist_term = 1; + + return dist_term; } else { - if(high_accuracy) - { - if(pValue != 0.0) - return weight; - else - return 1.0; - } - else - { - if(pValue != 0.0) - { - //special 
handling for infinities - if(pValue == std::numeric_limits::infinity() || pValue == -std::numeric_limits::infinity()) - return weight; - else //since FastPow isn't exact for 1.0, need to compute the value - return weight * FastPowNonZeroExp(1.0, pValue); - } - else //pValue == 0.0 - { - return FastPow(1.0, weight); - } - } + return 1.0; } } + //computes the distance term for a nominal when two universally symmetric nominals are equal + __forceinline double ComputeDistanceTermNominalUniversallySymmetricExactMatch(size_t index, bool high_accuracy) + { + double dist_term = ComputeDistanceTermNominalBaseExactMatchFromDeviation(index, featureParams[index].deviation, high_accuracy); + return ContextuallyExponentiateAndWeightDifferenceTerm(dist_term, index, high_accuracy); + } + + //computes the distance term for a nominal when two universally symmetric nominals are not equal + __forceinline double ComputeDistanceTermNominalUniversallySymmetricNonMatch(size_t index, bool high_accuracy) + { + double dist_term = ComputeDistanceTermNominalBaseNonMatchFromDeviation(index, featureParams[index].deviation, high_accuracy); + return ContextuallyExponentiateAndWeightDifferenceTerm(dist_term, index, high_accuracy); + } + //returns the precomputed distance term for a nominal when two universally symmetric nominals are equal __forceinline double ComputeDistanceTermNominalUniversallySymmetricExactMatchPrecomputed(size_t index, bool high_accuracy) { @@ -537,20 +564,20 @@ class GeneralizedDistance } //computes the inner term for a non-nominal with an exact match of values - __forceinline double ComputeDistanceTermNonNominalExactMatch(size_t index, bool high_accuracy) + __forceinline double ComputeDistanceTermContinuousExactMatch(size_t index, bool high_accuracy) { - if(!DoesFeatureHaveDeviation(index)) + if(!DoesFeatureHaveDeviation(index) || computeSurprisal) return 0.0; - //apply deviations + //apply deviations -- if computeSurprisal, will be caught above and always return 0.0 double 
diff = ComputeDeviationPart(0.0, featureParams[index].deviation, high_accuracy); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight; } - //computes the base of the difference between two values non-nominal (e.g., continuous) - __forceinline double ComputeDifferenceTermBaseNonNominal(double diff, size_t index, bool high_accuracy) + //computes the base of the difference between two continuous values without exponentiation + __forceinline double ComputeDifferenceTermBaseContinuous(double diff, size_t index, bool high_accuracy) { //compute absolute value diff = std::abs(diff); @@ -561,29 +588,37 @@ class GeneralizedDistance //apply deviations if(DoesFeatureHaveDeviation(index)) + { diff += ComputeDeviationPart(diff, featureParams[index].deviation, high_accuracy); + if(computeSurprisal) + diff = ComputeSurprisalFromDifferenceWithDeviation(diff, featureParams[index].deviation, high_accuracy); + } return diff; } //computes the base of the difference between two values non-nominal (e.g., continuous) that isn't cyclic - __forceinline double ComputeDifferenceTermBaseNonNominalNonCyclic(double diff, size_t index, bool high_accuracy) + __forceinline double ComputeDifferenceTermBaseContinuousNonCyclic(double diff, size_t index, bool high_accuracy) { //compute absolute value diff = std::abs(diff); //apply deviations if(DoesFeatureHaveDeviation(index)) + { diff += ComputeDeviationPart(diff, featureParams[index].deviation, high_accuracy); + if(computeSurprisal) + diff = ComputeSurprisalFromDifferenceWithDeviation(diff, featureParams[index].deviation, high_accuracy); + } return diff; } //computes the distance term for a non-nominal (e.g., continuous) for p non-zero and non-infinite with no nulls // diff can be negative - __forceinline double ComputeDistanceTermNonNominalNonNullRegular(double diff, size_t index, bool high_accuracy) + __forceinline double ComputeDistanceTermContinuousNonNullRegular(double diff, 
size_t index, bool high_accuracy) { - diff = ComputeDifferenceTermBaseNonNominal(diff, index, high_accuracy); + diff = ComputeDifferenceTermBaseContinuous(diff, index, high_accuracy); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight; @@ -591,9 +626,9 @@ class GeneralizedDistance //computes the distance term for a non-nominal (e.g., continuous) for p non-zero and non-infinite with max of one null // diff can be negative - __forceinline double ComputeDistanceTermNonNominalOneNonNullRegular(double diff, size_t index, bool high_accuracy) + __forceinline double ComputeDistanceTermContinuousOneNonNullRegular(double diff, size_t index, bool high_accuracy) { - diff = ComputeDifferenceTermBaseNonNominal(diff, index, high_accuracy); + diff = ComputeDifferenceTermBaseContinuous(diff, index, high_accuracy); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight; @@ -601,9 +636,9 @@ class GeneralizedDistance //computes the distance term for a non-nominal (e.g., continuous) for p non-zero and non-infinite that isn't cyclic with no nulls // diff can be negative - __forceinline double ComputeDistanceTermNonNominalNonCyclicNonNullRegular(double diff, size_t index, bool high_accuracy) + __forceinline double ComputeDistanceTermContinuousNonCyclicNonNullRegular(double diff, size_t index, bool high_accuracy) { - diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index, high_accuracy); + diff = ComputeDifferenceTermBaseContinuousNonCyclic(diff, index, high_accuracy); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight; @@ -611,12 +646,12 @@ class GeneralizedDistance //computes the distance term for a non-nominal (e.g., continuous) for p non-zero and non-infinite that isn't cyclic with max of one null // diff can be negative - __forceinline double 
ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular(double diff, size_t index, bool high_accuracy) + __forceinline double ComputeDistanceTermContinuousNonCyclicOneNonNullRegular(double diff, size_t index, bool high_accuracy) { if(FastIsNaN(diff)) return ComputeDistanceTermKnownToUnknown(index, high_accuracy); - diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index, high_accuracy); + diff = ComputeDifferenceTermBaseContinuousNonCyclic(diff, index, high_accuracy); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, high_accuracy) * featureParams[index].weight; @@ -635,12 +670,9 @@ class GeneralizedDistance return (diff == 0.0) ? ComputeDistanceTermNominalUniversallySymmetricExactMatchPrecomputed(index, high_accuracy) : ComputeDistanceTermNominalUniversallySymmetricNonMatchPrecomputed(index, high_accuracy); - diff = ComputeDifferenceTermBaseNonNominal(diff, index, high_accuracy); + diff = ComputeDifferenceTermBaseContinuous(diff, index, high_accuracy); - if(high_accuracy) - return std::pow(diff, featureParams[index].weight); - else - return FastPow(diff, featureParams[index].weight); + return ContextuallyExponentiateAndWeightDifferenceTerm(diff, index, high_accuracy); } //computes the inner term of the Minkowski norm summation for a single index for p=infinity or -infinity @@ -656,29 +688,41 @@ class GeneralizedDistance return (diff == 0.0) ? 
ComputeDistanceTermNominalUniversallySymmetricExactMatchPrecomputed(index, high_accuracy) : ComputeDistanceTermNominalUniversallySymmetricNonMatchPrecomputed(index, high_accuracy); - diff = ComputeDifferenceTermBaseNonNominal(diff, index, high_accuracy); + diff = ComputeDifferenceTermBaseContinuous(diff, index, high_accuracy); - return diff * featureParams[index].weight; + return ContextuallyExponentiateAndWeightDifferenceTerm(diff, index, high_accuracy); } - //computes the inner term of the Minkowski norm summation for a single index regardless of pValue - __forceinline double ComputeDistanceTermNonNull(double diff, size_t index, bool high_accuracy) + //computes the inner term of the Minkowski norm when a term matches a null value + //for a given deviation with regard to the null + __forceinline double ComputeDistanceTermMatchOnNull(size_t index, double deviation, bool high_accuracy) { - if(!IsFeatureNominal(index)) - diff = ComputeDifferenceTermBaseNonNominal(diff, index, high_accuracy); - - if(pValue == 0.0) + double diff = 0; + if(IsFeatureNominal(index)) { - if(high_accuracy) - return std::pow(diff, featureParams[index].weight); - else - return FastPow(diff, featureParams[index].weight); + if(computeSurprisal) + { + //need to have at least two classes in existence + double nominal_count = std::max(featureParams[index].typeAttributes.nominalCount, 2.0); + double prob_max_entropy_match = 1 / nominal_count; + + //find probability that the correct class was selected + //can't go below base probability of guessing + double prob_class_given_match = std::max(1 - deviation, prob_max_entropy_match); + + diff = -std::log(prob_class_given_match); + } + else //nonsurprisal nominals just use the deviation as provided + { + diff = deviation; + } } - else if(pValue == std::numeric_limits::infinity() - || pValue == -std::numeric_limits::infinity()) - return diff * featureParams[index].weight; else - return ExponentiateDifferenceTerm(diff, high_accuracy) * 
featureParams[index].weight; + { + diff = ComputeDifferenceTermBaseContinuous(deviation, index, high_accuracy); + } + + return ContextuallyExponentiateAndWeightDifferenceTerm(diff, index, high_accuracy); } //computes the inner term of the Minkowski norm summation for a single index for p non-zero and non-infinite @@ -694,7 +738,7 @@ class GeneralizedDistance return (diff == 0.0) ? ComputeDistanceTermNominalUniversallySymmetricExactMatchPrecomputed(index, high_accuracy) : ComputeDistanceTermNominalUniversallySymmetricNonMatchPrecomputed(index, high_accuracy); - return ComputeDistanceTermNonNominalNonNullRegular(diff, index, high_accuracy); + return ComputeDistanceTermContinuousNonNullRegular(diff, index, high_accuracy); } //returns the distance term for the either one or two unknown values @@ -912,6 +956,10 @@ class GeneralizedDistance //computed inverse of pValue double inversePValue; + //if true, it will perform computations resulting in surprisal before + //the exponentiation + bool computeSurprisal; + //if true, then all computations should be performed with high accuracy bool highAccuracy; //if true, then estimates should be computed with low accuracy, but final results with high accuracy diff --git a/src/Amalgam/SeparableBoxFilterDataStore.cpp b/src/Amalgam/SeparableBoxFilterDataStore.cpp index 68add3d3..c84c40f1 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.cpp +++ b/src/Amalgam/SeparableBoxFilterDataStore.cpp @@ -1011,13 +1011,13 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G auto value_found = column->stringIdValueToIndices.find(value.stringID); if(value_found != end(column->stringIdValueToIndices)) { - double term = dist_params.ComputeDistanceTermNonNominalExactMatch(query_feature_index, high_accuracy); + double term = dist_params.ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy); AccumulatePartialSums(*(value_found->second), query_feature_index, term); } } //the next closest string will 
have an edit distance of 1 - return dist_params.ComputeDistanceTermNonNominalNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); + return dist_params.ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); } else if(effective_feature_type == GeneralizedDistance::EFDT_CONTINUOUS_CODE) { @@ -1035,7 +1035,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G } //next most similar code must be at least a distance of 1 edit away - return dist_params.ComputeDistanceTermNonNominalNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); + return dist_params.ComputeDistanceTermContinuousNonCyclicNonNullRegular(1.0, query_feature_index, high_accuracy); } //else feature_type == FDT_CONTINUOUS_NUMERIC or FDT_CONTINUOUS_UNIVERSALLY_NUMERIC @@ -1052,9 +1052,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G double term = 0.0; if(exact_index_found) - term = dist_params.ComputeDistanceTermNonNominalExactMatch(query_feature_index, high_accuracy); + term = dist_params.ComputeDistanceTermContinuousExactMatch(query_feature_index, high_accuracy); else - term = dist_params.ComputeDistanceTermNonNominalNonNullRegular( + term = dist_params.ComputeDistanceTermContinuousNonNullRegular( value.number - column->sortedNumberValueEntries[value_index]->value.number, query_feature_index, high_accuracy); size_t num_entities_computed = AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term); @@ -1203,7 +1203,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G break; } - term = dist_params.ComputeDistanceTermNonNominalNonNullRegular(next_closest_diff, query_feature_index, high_accuracy); + term = dist_params.ComputeDistanceTermContinuousNonNullRegular(next_closest_diff, query_feature_index, high_accuracy); num_entities_computed += AccumulatePartialSums( 
column->sortedNumberValueEntries[next_closest_index]->indicesWithValue, query_feature_index, term); diff --git a/src/Amalgam/SeparableBoxFilterDataStore.h b/src/Amalgam/SeparableBoxFilterDataStore.h index dfeb84ae..8da33ec5 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.h +++ b/src/Amalgam/SeparableBoxFilterDataStore.h @@ -80,7 +80,7 @@ class SeparableBoxFilterDataStore { double max_diff = columnData[absolute_feature_index]->GetMaxDifferenceTermFromValue( dist_params.featureParams[query_feature_index], value_type, value); - return dist_params.ComputeDistanceTermNonNominalNonNullRegular(max_diff, query_feature_index, high_accuracy); + return dist_params.ComputeDistanceTermContinuousNonNullRegular(max_diff, query_feature_index, high_accuracy); } //gets the matrix cell index for the specified index @@ -737,7 +737,7 @@ class SeparableBoxFilterDataStore case GeneralizedDistance::EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC: { const size_t column_index = target_label_indices[query_feature_index]; - return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular( + return dist_params.ComputeDistanceTermContinuousNonCyclicOneNonNullRegular( target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index, high_accuracy); } @@ -754,7 +754,7 @@ class SeparableBoxFilterDataStore const size_t column_index = target_label_indices[query_feature_index]; auto &column_data = columnData[column_index]; if(column_data->numberIndices.contains(entity_index)) - return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular( + return dist_params.ComputeDistanceTermContinuousNonCyclicOneNonNullRegular( target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index, high_accuracy); else @@ -766,7 +766,7 @@ class SeparableBoxFilterDataStore const size_t column_index = target_label_indices[query_feature_index]; auto &column_data = columnData[column_index]; 
if(column_data->numberIndices.contains(entity_index)) - return dist_params.ComputeDistanceTermNonNominalOneNonNullRegular( + return dist_params.ComputeDistanceTermContinuousOneNonNullRegular( target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index, high_accuracy); else @@ -922,7 +922,7 @@ class SeparableBoxFilterDataStore else effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED; - dist_params.ComputeAndStoreInternedNumberValuesAndDistanceTerms(query_feature_index, position_value_numeric, &column_data->internedNumberIndexToNumberValue); + dist_params.ComputeAndStoreInternedNumberValuesAndDistanceTerms(position_value_numeric, query_feature_index, &column_data->internedNumberIndexToNumberValue); } else { diff --git a/src/Amalgam/amlg_code/full_test.amlg b/src/Amalgam/amlg_code/full_test.amlg index 6c3eb390..b27e0b35 100644 --- a/src/Amalgam/amlg_code/full_test.amlg +++ b/src/Amalgam/amlg_code/full_test.amlg @@ -417,6 +417,15 @@ ;should print 3ish (print "35 " (generalized_distance (list 1 1) (list "continuous_code" "nominal_string") (list 0 5) (null) 1 (list (list 1.5 2 3 4 5) "s") (list (list 1 2 3) "s") ) "\n") + ;surprisal + ;should both be 0 + (print "36 " (generalized_distance (list 1 1) (list "continuous_numeric" "continuous_numeric") (null) (list 0.5 0.5) 1 (list 1 1) (list 1 1) (null) (true) ) "\n" ) + (print "37 " (generalized_distance (list 1 1) (list "nominal_numeric" "nominal_numeric") (null) (list 0.5 0.5) 1 (list 1 1) (list 1 1) (null) (true) ) "\n" ) + + ;surprisal + (print "38 " (generalized_distance (list 1 1) (list "continuous_numeric" "continuous_numeric") (null) (list 0.5 0.5) 1 (list 1 1) (list 2 2) (null) (true) ) "\n" ) + (print "39 " (generalized_distance (list 1 1) (list "nominal_numeric" "nominal_numeric") (list 2 2) (list 0.25 0.25) 1 (list 1 1) (list 2 2) (null) (true) ) "\n" ) + (print "--entropy--\n") (print (entropy (list 0.5 0.5)) "\n") (print (entropy 
(list 0.5 0.5) (list 0.25 0.75) -1 1) "\n") @@ -3889,7 +3898,6 @@ ;should be: ;(list "vert0" "vert1" "vert2" "vert3") - ;(list 0.049787068367863944 0.049787068367863944 0.01831563888873418 0.006737946999085467) (print "probabilities: " (compute_on_contained_entities "SurprisalTransformContainer" (list (query_nearest_generalized_distance @@ -3899,9 +3907,9 @@ (null) ; context_weights (list "continuous_numeric") ; types (null) ; attributes - (null) ; context_deviations + (list 0.25) ; context_deviations 1 ; p_parameter - "surprisal_to_prob" ; dwe = 1 means return computed distance to each case + "surprisal_to_prob" ; distance transform (null) ; weight (rand) (null) @@ -3913,7 +3921,6 @@ ;should be ;(list "vert0" "vert2" "vert3" "vert1") - ;(list 0.09709538455906153 0.01831563888873418 0.006737946999085467 0) (print "weighted probabilities: " (compute_on_contained_entities "SurprisalTransformContainer" (list (query_nearest_generalized_distance @@ -3923,9 +3930,9 @@ (null) ; context_weights (list "continuous_numeric") ; types (null) ; attributes - (null) ; context_deviations + (list 0.25) ; context_deviations 1 ; p_parameter - "surprisal_to_prob" ; dwe = 1 means return computed distance to each case + "surprisal_to_prob" ; distance transform "weight" ; weight (rand) (null) @@ -3941,12 +3948,12 @@ ;should be approx 2.123 (print "surprisal contribution: " (compute_on_contained_entities "SurprisalTransformContainer" (list - (compute_entity_distance_contributions 4 (list "x") (list "testvert") (null) (null) (null) (null) 1 "surprisal_to_prob" (null) "fixed_seed" (null) "precise") + (compute_entity_distance_contributions 4 (list "x") (list "testvert") (null) (null) (null) (list 0.25) 1 "surprisal_to_prob" (null) "fixed_seed" (null) "precise") ))) ;should be approx 2.123 (print "weighted surprisal contribution: " (compute_on_contained_entities "SurprisalTransformContainer" (list - (compute_entity_distance_contributions 4 (list "x") (list "testvert") (null) (null) (null) 
(null) 1 "surprisal_to_prob" "weight" "fixed_seed" (null) "precise") + (compute_entity_distance_contributions 4 (list "x") (list "testvert") (null) (null) (null) (list 0.25) 1 "surprisal_to_prob" "weight" "fixed_seed" (null) "precise") ))) (print "--concurrency tests--\n") diff --git a/src/Amalgam/amlg_code/test.amlg b/src/Amalgam/amlg_code/test.amlg index 55668ac8..772c7805 100644 --- a/src/Amalgam/amlg_code/test.amlg +++ b/src/Amalgam/amlg_code/test.amlg @@ -1,28 +1,4 @@ (seq - (create_entities "BoxConvictionTestContainer" (null) ) + (print "17 " (generalized_distance (null) (list "nominal_numeric") (list 1) (null) 1 (list 1 2 3) (list 10 2 4) ) "\n") - (create_entities (list "BoxConvictionTestContainer" "vert0") (lambda - (null ##x 0 ##y 0 ##weight 2) - ) ) - - (create_entities (list "BoxConvictionTestContainer" "vert1") (lambda - (null ##x 0 ##y 1 ##weight 1) - ) ) - - (create_entities (list "BoxConvictionTestContainer" "vert2") (lambda - (null ##x 1 ##y 0 ##weight 1) - ) ) - - (create_entities (list "BoxConvictionTestContainer" "vert3") (lambda - (null ##x 2 ##y 1 ##weight 1) - ) ) - - ;should print: - ;dc: (list - ;(list "vert0" "vert1" "vert2" "vert3") - ;(list 1 1 1 1.4142135623730951) - ;) - (print "dc: " (compute_on_contained_entities "BoxConvictionTestContainer" (list - (compute_entity_distance_contributions 1 (list "x" "y") (list "vert3") (null) (null) (null) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise" (true)) - ))) ) \ No newline at end of file diff --git a/src/Amalgam/entity/EntityQueries.cpp b/src/Amalgam/entity/EntityQueries.cpp index defb2696..ca349630 100644 --- a/src/Amalgam/entity/EntityQueries.cpp +++ b/src/Amalgam/entity/EntityQueries.cpp @@ -738,7 +738,7 @@ EvaluableNodeReference EntityQueryCondition::GetMatchingEntities(Entity *contain } //transform distances as appropriate - EntityQueriesStatistics::DistanceTransform distance_transform(transformSuprisalToProb, + EntityQueriesStatistics::DistanceTransform 
distance_transform(distParams.computeSurprisal, distanceWeightExponent, weightLabel != StringInternPool::NOT_A_STRING_ID, [this](Entity *e, double &weight_value) { return e->GetValueAtLabelAsNumber(weightLabel, weight_value); }); @@ -775,7 +775,7 @@ EvaluableNodeReference EntityQueryCondition::GetMatchingEntities(Entity *contain entity_values.push_back(DistanceReferencePair(GetConditionDistanceMeasure(matching_entities[i], high_accuracy), matching_entities[i])); //transform distances as appropriate - EntityQueriesStatistics::DistanceTransform distance_transform(transformSuprisalToProb, + EntityQueriesStatistics::DistanceTransform distance_transform(distParams.computeSurprisal, distanceWeightExponent, weightLabel != StringInternPool::NOT_A_STRING_ID, [this](Entity *e, double &weight_value) { return e->GetValueAtLabelAsNumber(weightLabel, weight_value); }); diff --git a/src/Amalgam/entity/EntityQueries.h b/src/Amalgam/entity/EntityQueries.h index 7e44bffb..0849da7c 100644 --- a/src/Amalgam/entity/EntityQueries.h +++ b/src/Amalgam/entity/EntityQueries.h @@ -82,9 +82,6 @@ class EntityQueryCondition //only applicable when transformSuprisalToProb is false double distanceWeightExponent; - //if true, the values will be transformed from surprisal to probability; if false, will perform a distance transform - bool transformSuprisalToProb; - //if ENT_QUERY_SELECT has a start offset bool hasStartOffset; diff --git a/src/Amalgam/entity/EntityQueryBuilder.h b/src/Amalgam/entity/EntityQueryBuilder.h index f6129bcc..de81f1bb 100644 --- a/src/Amalgam/entity/EntityQueryBuilder.h +++ b/src/Amalgam/entity/EntityQueryBuilder.h @@ -330,15 +330,15 @@ namespace EntityQueryBuilder cur_condition->distParams.pValue = p_value; //value transforms for whatever is measured as "distance" - cur_condition->transformSuprisalToProb = false; cur_condition->distanceWeightExponent = 1.0; + cur_condition->distParams.computeSurprisal = false; if(ocn.size() > DISTANCE_VALUE_TRANSFORM) { EvaluableNode 
*dwe_param = ocn[DISTANCE_VALUE_TRANSFORM]; if(!EvaluableNode::IsNull(dwe_param)) { if(dwe_param->GetType() == ENT_STRING && dwe_param->GetStringIDReference() == ENBISI_surprisal_to_prob) - cur_condition->transformSuprisalToProb = true; + cur_condition->distParams.computeSurprisal = true; else //try to convert to number cur_condition->distanceWeightExponent = EvaluableNode::ToNumber(dwe_param, 1.0); } diff --git a/src/Amalgam/entity/EntityQueryCaches.cpp b/src/Amalgam/entity/EntityQueryCaches.cpp index 339aba00..21f37ff4 100644 --- a/src/Amalgam/entity/EntityQueryCaches.cpp +++ b/src/Amalgam/entity/EntityQueryCaches.cpp @@ -249,7 +249,7 @@ void EntityQueryCaches::GetMatchingEntities(EntityQueryCondition *cond, BitArray weight_column = sbfds.GetColumnIndexFromLabelId(cond->weightLabel); auto get_weight = sbfds.GetNumberValueFromEntityIndexFunction(weight_column); - EntityQueriesStatistics::DistanceTransform distance_transform(cond->transformSuprisalToProb, + EntityQueriesStatistics::DistanceTransform distance_transform(cond->distParams.computeSurprisal, cond->distanceWeightExponent, use_entity_weights, get_weight); //if first, need to populate with all entities diff --git a/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp b/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp index 2c8740af..76b616da 100644 --- a/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp +++ b/src/Amalgam/interpreter/InterpreterOpcodesMath.cpp @@ -1015,9 +1015,9 @@ EvaluableNodeReference Interpreter::InterpretNode_ENT_GENERALIZED_DISTANCE(Evalu //get value_names if applicable std::vector value_names; - if(ocn.size() > 8) + if(ocn.size() > 7) { - EvaluableNodeReference value_names_node = InterpretNodeForImmediateUse(ocn[8]); + EvaluableNodeReference value_names_node = InterpretNodeForImmediateUse(ocn[7]); if(!EvaluableNode::IsNull(value_names_node)) { //extract the names for each value into value_names @@ -1034,6 +1034,10 @@ EvaluableNodeReference 
Interpreter::InterpretNode_ENT_GENERALIZED_DISTANCE(Evalu evaluableNodeManager->FreeNodeTreeIfPossible(value_names_node); } + dist_params.computeSurprisal = false; + if(ocn.size() > 8) + dist_params.computeSurprisal = InterpretNodeIntoBoolValue(ocn[8], false); + //get the origin and destination std::vector location; std::vector location_types; diff --git a/src/Amalgam/out.txt b/src/Amalgam/out.txt index c82b36d2..cdd57973 100644 --- a/src/Amalgam/out.txt +++ b/src/Amalgam/out.txt @@ -572,12 +572,16 @@ abcdef 27 3.0000000031604355 28 3 29 .nan -30 5 +30 6 31 4 32 4 33 4 34 2 35 2.6009928340740736 +36 0 +37 0 +38 1.6766764161830636 +39 2.197224577336219 --entropy-- 0.6931471805599453 0.14384103622589045 @@ -1041,7 +1045,7 @@ abcdef 4 "d" ) -(list 2 1 0 "d" 3) +(list 2 1 "d" 0 3) (list 1 2 @@ -1236,7 +1240,7 @@ current_index: 2 8 ) accum_string "abcdef" - argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1249,10 +1253,10 @@ current_index: 2 A (assoc B 2) B 2 ) - interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1705540501.507736 + start_time 1705544023.896367 www 1 x 12 zz 10 @@ -1279,7 +1283,7 @@ current_index: 2 8 ) accum_string "abcdef" - argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1292,10 +1296,10 @@ current_index: 2 A (assoc B 2) B 2 ) - interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + 
interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1705540501.507736 + start_time 1705544023.896367 www 1 x 12 zz 10 @@ -1321,7 +1325,7 @@ current_index: 2 8 ) accum_string "abcdef" - argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1334,10 +1338,10 @@ current_index: 2 A (assoc B 2) B 2 ) - interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1705540501.507736 + start_time 1705544023.896367 www 1 x 12 zz 10 @@ -1485,7 +1489,7 @@ infinity test c or d: (list "c" @(get (target 0) 0) "d" @(get (target 0) 0)) (assoc a 29 b 44 c 27) -(list "7" "2" "6") +(list "7" "2" "1") --get_rand_seed-- °³È¼¿\¨KOaVÆT zÿ @@ -1605,7 +1609,7 @@ e: - .inf 25: (assoc a 1) -current date-time in epoch: 2024-01-17-20.15.01.7597390 +current date-time in epoch: 2024-01-17-21.13.43.9895940 2020-06-07 00:22:59 1391230800 1391230800 @@ -2132,6 +2136,13 @@ decrypted: hello (assoc _ (null)) (replace _ + (list) + (lambda + (assoc + a 2 + g (list 1 4) + ) + ) (list "g") (lambda (list @@ -2139,14 +2150,22 @@ decrypted: hello (current_value 1) 0 ) - 4 - ) - ) - (list) - (lambda - (assoc - a 2 - g (list 1 @(get (get (get (get (target 4) 1) 2) 0) 1)) + @(get + (get + (get + (get + (get + (target 3) + 1 + ) + 2 + ) + 0 + ) + "g" + ) + 1 + ) ) ) ) @@ -3423,7 +3442,7 @@ deep sets --set_entity_root_permission-- RootTest -1705540501.997628 +1705544024.28218 (true) RootTest @@ -3647,7 +3666,7 @@ hello ) ) ) - (set_entity_rand_seed new_entity "3÷U±\\H0Ñ”-I´»ÿ") + (set_entity_rand_seed new_entity "»½»í4±àø0Ñ”-I´»ÿ") 
(set_entity_rand_seed (first (create_entities @@ -3657,7 +3676,7 @@ hello ) ) ) - "òσÉÀ…÷A“Àƒ ’¿èÿ" + "kOY™î-:ëû” /! ÿ" ) (set_entity_rand_seed (first @@ -3690,7 +3709,7 @@ hello ) ) ) - (set_entity_rand_seed new_entity "3÷U±\\H0Ñ”-I´»ÿ") + (set_entity_rand_seed new_entity "»½»í4±àø0Ñ”-I´»ÿ") (set_entity_rand_seed (first (create_entities @@ -4236,12 +4255,12 @@ cyclic test expected: 155, 200, 190 ... deg values of 0 8 and 12: 190: 0.045454545454545456 (null ##deg 12 ) -155: 0.1 (null - ##deg 0 -) 200: 0.05555555555555555 (null ##deg 8 ) +155: 0.1 (null + ##deg 0 +) --contains_label-- (true) @@ -4563,16 +4582,16 @@ cyclic group kl divergence: 0.05872535496117583 surprisal transforms probabilities: (list (list "vert0" "vert1" "vert2" "vert3") - (list 0.049787068367863944 0.049787068367863944 0.01831563888873418 0.006737946999085467) + (list 2.7535180455541748e-05 2.7535180455541748e-05 5.043471233780792e-07 9.237449443012879e-09) ) weighted probabilities: (list (list "vert0" "vert2" "vert3" "vert1") - (list 0.09709538455906153 0.01831563888873418 0.006737946999085467 0) + (list 5.506960272483319e-05 5.043471233780792e-07 9.237449443012879e-09 0) ) -surprisal contribution: (assoc testvert 2.1235610034676564) -weighted surprisal contribution: (assoc testvert 2.1235610034676564) +surprisal contribution: (assoc testvert 9.797630298091489) +weighted surprisal contribution: (assoc testvert 9.797630298091489) --concurrency tests-- hello hello @@ -4639,4 +4658,4 @@ Expecting 1000: 1000 concurrent entity writes successful: (true) --total execution time-- -1.1092939376831055 +1.1927149295806885