From 7f33d68988b861b2215bf1f5575ce2ea2ee7fade Mon Sep 17 00:00:00 2001 From: jue-yuan Date: Wed, 27 Nov 2024 23:25:21 +0000 Subject: [PATCH 1/5] [GLE-8861] feat(vector): built-in TG function for pairwise vector embedding; --- gds/vector/cosine_distance.gsql | 56 ++++++++++++++++++++ gds/vector/dimension_count.gsql | 37 +++++++++++++ gds/vector/distance.gsql | 85 ++++++++++++++++++++++++++++++ gds/vector/elements_sum.gsql | 41 ++++++++++++++ gds/vector/euclidean_distance.gsql | 62 ++++++++++++++++++++++ gds/vector/ip_distance.gsql | 58 ++++++++++++++++++++ gds/vector/kth_element.gsql | 54 +++++++++++++++++++ gds/vector/norm.gsql | 79 +++++++++++++++++++++++++++ 8 files changed, 472 insertions(+) create mode 100644 gds/vector/cosine_distance.gsql create mode 100644 gds/vector/dimension_count.gsql create mode 100644 gds/vector/distance.gsql create mode 100644 gds/vector/elements_sum.gsql create mode 100644 gds/vector/euclidean_distance.gsql create mode 100644 gds/vector/ip_distance.gsql create mode 100644 gds/vector/kth_element.gsql create mode 100644 gds/vector/norm.gsql diff --git a/gds/vector/cosine_distance.gsql b/gds/vector/cosine_distance.gsql new file mode 100644 index 00000000..28664c85 --- /dev/null +++ b/gds/vector/cosine_distance.gsql @@ -0,0 +1,56 @@ +CREATE FUNCTION gds.vector.cosine_distance(list list1, list list2) RETURNS(float) { + + /* + First Author: Jue Yuan + First Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Calculates the cosine distance between two vectors represented as lists of doubles. + The cosine distance is derived from the cosine similarity and provides a measure of the angle + between two non-zero vectors in a multi-dimensional space. A distance of 0 indicates vectors pointing + in the same direction, 1 indicates orthogonal vectors, and 2 indicates opposite (maximally dissimilar) vectors. + + Parameters: + list list1: + The first vector as a list of double values. 
+ list list2: + The second vector as a list of double values. + + Returns: + float: + The cosine distance between the two input vectors. + Exceptions: + list_size_mismatch (90000): + Raised when the input lists are not of equal size. + + Logic Overview: + Validates that both input vectors have the same length. + Computes the inner (dot) product of the two vectors. + Calculates the magnitudes (Euclidean norms) of both vectors. + Returns the cosine distance as 1 - (inner product) / (product of magnitudes). + + Use Case: + This function is commonly used in machine learning, natural language processing, + and information retrieval tasks to quantify the similarity between vector representations, + such as word embeddings or document feature vectors. + */ + + EXCEPTION list_size_mismatch (90000); + ListAccum @@myList1 = list1; + ListAccum @@myList2 = list2; + + IF (@@myList1.size() != @@myList2.size()) THEN + RAISE list_size_mismatch ("Two lists provided for gds.vector.cosine_distance have different sizes."); + END; + + double innerP = inner_product(@@myList1, @@myList2); + double v1_magn = sqrt(inner_product(@@myList1, @@myList1)); + double v2_magn = sqrt(inner_product(@@myList2, @@myList2)); + RETURN (1 - innerP / (v1_magn * v2_magn)); +} \ No newline at end of file diff --git a/gds/vector/dimension_count.gsql b/gds/vector/dimension_count.gsql new file mode 100644 index 00000000..0476e3b0 --- /dev/null +++ b/gds/vector/dimension_count.gsql @@ -0,0 +1,37 @@ +CREATE FUNCTION gds.vector.dimension_count(list list1) RETURNS(int) { + + /* + First Author: Jue Yuan + First Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Returns the number of dimensions (elements) in a given vector, represented as a list of double values. + This function is useful for determining the size or dimensionality of input vectors in mathematical + and data processing operations. 
+ + Parameters: + list list1: + The input vector as a list of double values. + + Returns: + int: + The number of elements (dimensions) in the input vector. + + Logic Overview: + Accepts a list of double values as input. + Calculates the size of the list, which corresponds to the number of dimensions. + Returns the size as an integer. + Use Case: + This function is valuable in vector-based computations, such as machine learning or data analysis tasks, + where understanding the dimensionality of vectors is crucial for validation, preprocessing, or compatibility checks. + */ + + ListAccum @@myList1 = list1; + RETURN @@myList1.size(); +} \ No newline at end of file diff --git a/gds/vector/distance.gsql b/gds/vector/distance.gsql new file mode 100644 index 00000000..d35ce32b --- /dev/null +++ b/gds/vector/distance.gsql @@ -0,0 +1,85 @@ +CREATE FUNCTION gds.vector.distance(list list1, list list2, string metric) RETURNS(float) { + + /* + First Author: Jue Yuan + First Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Calculates the distance between two vectors represented as lists of double values, + based on a specified distance metric. This function supports multiple metrics, + allowing for flexible similarity or dissimilarity measurements in various computational tasks. + + Parameters: + list list1: + The first vector as a list of double values. + list list2: + The second vector as a list of double values. + string metric: + The distance metric to use. Supported metrics are: + "cosine": Cosine distance + "euclidean": Euclidean distance + "ip": Inner product (dot product) + Returns: + float: + The computed distance between the two input vectors based on the specified metric. + + Exceptions: + list_size_mismatch (90000): + Raised when the input vectors are not of equal size. + invalid_metric_type (90001): + Raised when an unsupported distance metric is provided. 
+ + Logic Overview: + Input Validation: + Ensures both vectors have the same size. + Metric Handling: + Cosine Distance: + Calculated as 1 - (inner product of vectors) / (product of magnitudes). + Euclidean Distance: + Computes the square root of the sum of squared differences between corresponding elements. + Inner Product: + Directly computes the dot product of the two vectors. + + Error Handling: + Raises an exception if the provided metric is invalid. + + Use Case: + This function is essential for machine learning, data science, and information retrieval applications, + where distance or similarity calculations between vector representations (such as embeddings or feature vectors) are required. + */ + + EXCEPTION list_size_mismatch (90000); + EXCEPTION invalid_metric_type (90001); + ListAccum @@myList1 = list1; + ListAccum @@myList2 = list2; + + IF (@@myList1.size() != @@myList2.size()) THEN + RAISE list_size_mismatch ("Two lists provided for gds.vector.distance have different sizes."); + END; + + SumAccum @@myResult; + SumAccum @@sqrSum; + + CASE lower(metric) + WHEN "cosine" THEN + @@myResult = 1 - inner_product(@@myList1, @@myList2) / (sqrt(inner_product(@@myList1, @@myList1)) * sqrt(inner_product(@@myList2, @@myList2))); + WHEN "euclidean" THEN + FOREACH i IN [0, @@myList1.size() - 1 ] DO + @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); + END; + @@myResult = sqrt(@@sqrSum); + WHEN "ip" THEN + @@myResult = inner_product(@@myList1, @@myList2); + ELSE + RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: cosine, euclidean and ip."); + END + ; + + RETURN @@myResult; +} \ No newline at end of file diff --git a/gds/vector/elements_sum.gsql b/gds/vector/elements_sum.gsql new file mode 100644 index 00000000..30626704 --- /dev/null +++ b/gds/vector/elements_sum.gsql @@ -0,0 +1,41 @@ +CREATE FUNCTION gds.vector.elements_sum(list list1) RETURNS(float) { + + /* + First Author: Jue Yuan + First 
Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Calculates the sum of all elements in a vector, represented as a list of double values. + This function is useful for aggregating vector components in mathematical and statistical operations. + + Parameters: + list list1: + The input vector as a list of double values. + + Returns: + float: + The sum of all elements in the input vector. + + Logic Overview: + Iterates through each element in the input list. + Accumulates the sum of all elements. + Returns the final sum as a floating-point value. + + Use Case: + This function is valuable in various data processing tasks, such as computing vector norms, + validating data integrity, or performing aggregations in machine learning and statistical analysis. + */ + + SumAccum @@mySum; + + FOREACH i IN list1 DO + @@mySum += i; + END; + RETURN @@mySum; +} \ No newline at end of file diff --git a/gds/vector/euclidean_distance.gsql b/gds/vector/euclidean_distance.gsql new file mode 100644 index 00000000..167a5637 --- /dev/null +++ b/gds/vector/euclidean_distance.gsql @@ -0,0 +1,62 @@ +CREATE FUNCTION gds.vector.euclidean_distance(list list1, list list2) RETURNS(float) { + + /* + First Author: Jue Yuan + First Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Calculates the Euclidean distance between two vectors represented as lists of double values. + Euclidean distance measures the straight-line distance between two points in multi-dimensional space, + making it a fundamental metric in various computational and analytical applications. + + Parameters: + list list1: + The first vector as a list of double values. + list list2: + The second vector as a list of double values. + + Returns: + float: + The Euclidean distance between the two input vectors. 
+ + Exceptions: + list_size_mismatch (90000): Raised when the input vectors are not of equal size. + + Logic Overview: + Input Validation: + Ensures both vectors have the same length. + Distance Calculation: + Iterates through corresponding elements of both vectors. + Computes the sum of the squared differences between each pair of elements. + Returns the square root of the accumulated sum, representing the Euclidean distance. + + Formula: + Distance = sqrt((x1 - y1)^2 + (x2 - y2)^2 + ... + (xn - yn)^2) + Where xi and yi are elements of list1 and list2, respectively. + + Use Case: + This function is widely used in machine learning (e.g., k-nearest neighbors), data science, + and pattern recognition tasks to measure the similarity or dissimilarity between data points. + */ + + EXCEPTION list_size_mismatch (90000); + ListAccum @@myList1 = list1; + ListAccum @@myList2 = list2; + + IF (@@myList1.size() != @@myList2.size()) THEN + RAISE list_size_mismatch ("Two lists provided for gds.vector.euclidean_distance have different sizes."); + END; + + SumAccum @@sqrSum; + FOREACH i IN [0, @@myList1.size() - 1 ] DO + @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); + END; + + RETURN sqrt(@@sqrSum); +} \ No newline at end of file diff --git a/gds/vector/ip_distance.gsql b/gds/vector/ip_distance.gsql new file mode 100644 index 00000000..840253f5 --- /dev/null +++ b/gds/vector/ip_distance.gsql @@ -0,0 +1,58 @@ +CREATE FUNCTION gds.vector.ip_distance(list list1, list list2) RETURNS(float) { + + /* + First Author: Jue Yuan + First Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Calculates the inner product (dot product) between two vectors represented as lists of double values. + The inner product is a key measure in linear algebra, indicating the magnitude of the projection of one vector onto another. 
+ This function provides a similarity measure commonly used in machine learning and data analysis. + + Parameters: + list list1: + The first vector as a list of double values. + list list2: + The second vector as a list of double values. + + Returns: + float: + The inner product (dot product) of the two input vectors. + + Exceptions: + list_size_mismatch (90000): + Raised when the input vectors are not of equal size. + + Logic Overview: + Input Validation: + Ensures both vectors have the same length. + Inner Product Calculation: + Computes the sum of the element-wise products of the two vectors. + + Formula: + Inner Product = (x1 x y1) + (x2 x y2) + ... + (xn x yn) + Where xi and yi are elements of list1 and list2, respectively. + + Use Case: + This function is widely used in: + Calculating similarity in machine learning models (e.g., recommendation systems). + Performing vector projections in linear algebra. + Evaluating similarity between embeddings in natural language processing (NLP). + */ + + EXCEPTION list_size_mismatch (90000); + ListAccum @@myList1 = list1; + ListAccum @@myList2 = list2; + + IF (@@myList1.size() != @@myList2.size()) THEN + RAISE list_size_mismatch ("Two lists provided for gds.vector.euclidean_distance have different sizes."); + END; + + RETURN inner_product(@@myList1, @@myList2); +} \ No newline at end of file diff --git a/gds/vector/kth_element.gsql b/gds/vector/kth_element.gsql new file mode 100644 index 00000000..b832cbc8 --- /dev/null +++ b/gds/vector/kth_element.gsql @@ -0,0 +1,54 @@ +CREATE FUNCTION gds.vector.kth_element(list list1, int kth_index) RETURNS(float) { + + /* + First Author: Jue Yuan + First Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Retrieves the k-th element from a vector, represented as a list of double values. + This function ensures safe access by validating the index against the vector's size, + preventing out-of-range errors. 
+ + Parameters: + list list1: + The input vector as a list of double values. + int kth_index: + The zero-based index of the element to retrieve. + + Returns: + float: + The value of the element at the specified k-th index in the input vector. + + Exceptions: + out_of_range (90000): + Raised when the specified index is either negative or exceeds the size of the input vector. + + Logic Overview: + Input Validation: + Checks if the provided index is within the valid range (0 to list size - 1). + Raises an exception if the index is out of range. + Element Retrieval: + Returns the element at the specified index. + + Use Case: + This function is useful in scenarios where specific elements of a vector need to be accessed programmatically, such as: + Extracting features from a dataset. + Implementing custom vector operations in data processing pipelines. + Accessing indexed components in mathematical computations. + */ + + EXCEPTION out_of_range (90000); + + ListAccum @@myList1 = list1; + IF (kth_index >= @@myList1.size() OR kth_index < 0) THEN + RAISE out_of_range("Kth index provided for gds.vector.kth_element is out of the range of this list."); + END; + + RETURN @@myList1.get(kth_index); +} \ No newline at end of file diff --git a/gds/vector/norm.gsql b/gds/vector/norm.gsql new file mode 100644 index 00000000..a51de149 --- /dev/null +++ b/gds/vector/norm.gsql @@ -0,0 +1,79 @@ +CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float) { + + /* + First Author: Jue Yuan + First Commit Date: Nov 27, 2024 + + Recent Author: Jue Yuan + Recent Commit Date: Nov 27, 2024 + + Maturity: + alpha + + Description: + Computes the norm (magnitude) of a vector based on a specified metric. + The norm is a measure of a vector's length in a mathematical space and is commonly used + in data normalization and vector calculations. + + Parameters: + list list1: + The input vector as a list of double values. + string metric: + The norm metric to apply. 
Supported metrics are: + "euclidean": Euclidean norm (L2 norm). + "ip": Inner product norm (dot product with the zero vector). + + Returns: + float: + The computed norm of the vector based on the selected metric. + + Exceptions: + invalid_metric_type (90001): + Raised when the specified metric is not supported. + + Logic Overview: + Metric Handling: + Euclidean Norm (L2 norm): + Creates a zero vector of the same size as the input. + Computes the square root of the sum of squared differences between the input vector and the zero vector. + Formula: + ||v||_2 = sqrt(x1^2 + x2^2 + ... + xn^2) + Inner Product Norm (IP norm): + Computes the inner product of the input vector with the zero vector, resulting in a value of 0 + (the inner product of any vector with the zero vector is trivially 0). + Error Handling: + Raises an exception if an unsupported metric is provided. + + Use Case: + This function is helpful in scenarios such as: + Data preprocessing (e.g., normalizing vector magnitudes). + Calculating distances and similarities in machine learning algorithms. + Evaluating vector lengths for mathematical or physics-related computations. 
+ */ + + EXCEPTION invalid_metric_type (90001); + ListAccum @@myList1 = list1; + ListAccum @@myList2; + + FOREACH i IN [0, @@myList1.size() - 1] DO + @@myList2 += 0; + end; + + SumAccum @@myResult; + SumAccum @@sqrSum; + + CASE lower(metric) + WHEN "euclidean" THEN + FOREACH i IN [0, @@myList1.size() - 1 ] DO + @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); + END; + @@myResult = sqrt(@@sqrSum); + WHEN "ip" THEN + @@myResult = inner_product(@@myList1, @@myList2); + ELSE + RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: euclidean and ip."); + END + ; + + RETURN @@myResult; +} \ No newline at end of file From 870042fa7cbeca9bab02f49855a5760c39d36f2f Mon Sep 17 00:00:00 2001 From: jue-yuan Date: Tue, 3 Dec 2024 00:53:32 +0000 Subject: [PATCH 2/5] [GLE-8861] change euclidean to l2; --- gds/vector/distance.gsql | 8 ++++---- gds/vector/ip_distance.gsql | 2 +- gds/vector/{euclidean_distance.gsql => l2_distance.gsql} | 4 ++-- gds/vector/norm.gsql | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) rename gds/vector/{euclidean_distance.gsql => l2_distance.gsql} (93%) diff --git a/gds/vector/distance.gsql b/gds/vector/distance.gsql index d35ce32b..64042f68 100644 --- a/gds/vector/distance.gsql +++ b/gds/vector/distance.gsql @@ -23,7 +23,7 @@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri string metric: The distance metric to use. Supported metrics are: "cosine": Cosine distance - "euclidean": Euclidean distance + "l2": Euclidean distance "ip": Inner product (dot product) Returns: float: @@ -41,7 +41,7 @@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri Metric Handling: Cosine Distance: Calculated as 1 - (inner product of vectors) / (product of magnitudes). - Euclidean Distance: + L2 Distance: Computes the square root of the sum of squared differences between corresponding elements. Inner Product: Directly computes the dot product of the two vectors. 
@@ -69,7 +69,7 @@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri CASE lower(metric) WHEN "cosine" THEN @@myResult = 1 - inner_product(@@myList1, @@myList2) / (sqrt(inner_product(@@myList1, @@myList1)) * sqrt(inner_product(@@myList2, @@myList2))); - WHEN "euclidean" THEN + WHEN "l2" THEN FOREACH i IN [0, @@myList1.size() - 1 ] DO @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); END; @@ -77,7 +77,7 @@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri WHEN "ip" THEN @@myResult = inner_product(@@myList1, @@myList2); ELSE - RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: cosine, euclidean and ip."); + RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: cosine, l2 and ip."); END ; diff --git a/gds/vector/ip_distance.gsql b/gds/vector/ip_distance.gsql index 840253f5..0dcfa49f 100644 --- a/gds/vector/ip_distance.gsql +++ b/gds/vector/ip_distance.gsql @@ -51,7 +51,7 @@ CREATE FUNCTION gds.vector.ip_distance(list list1, list list2) R ListAccum @@myList2 = list2; IF (@@myList1.size() != @@myList2.size()) THEN - RAISE list_size_mismatch ("Two lists provided for gds.vector.euclidean_distance have different sizes."); + RAISE list_size_mismatch ("Two lists provided for gds.vector.ip_distance have different sizes."); END; RETURN inner_product(@@myList1, @@myList2); diff --git a/gds/vector/euclidean_distance.gsql b/gds/vector/l2_distance.gsql similarity index 93% rename from gds/vector/euclidean_distance.gsql rename to gds/vector/l2_distance.gsql index 167a5637..ef718a2c 100644 --- a/gds/vector/euclidean_distance.gsql +++ b/gds/vector/l2_distance.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.euclidean_distance(list list1, list list2) RETURNS(float) { +CREATE FUNCTION gds.vector.l2_distance(list list1, list list2) RETURNS(float) { /* First Author: Jue Yuan @@ -50,7 +50,7 @@ CREATE FUNCTION gds.vector.euclidean_distance(list list1, list l 
ListAccum @@myList2 = list2; IF (@@myList1.size() != @@myList2.size()) THEN - RAISE list_size_mismatch ("Two lists provided for gds.vector.euclidean_distance have different sizes."); + RAISE list_size_mismatch ("Two lists provided for gds.vector.l2_distance have different sizes."); END; SumAccum @@sqrSum; diff --git a/gds/vector/norm.gsql b/gds/vector/norm.gsql index a51de149..fdf664fb 100644 --- a/gds/vector/norm.gsql +++ b/gds/vector/norm.gsql @@ -20,7 +20,7 @@ CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float The input vector as a list of double values. string metric: The norm metric to apply. Supported metrics are: - "euclidean": Euclidean norm (L2 norm). + "l2": Euclidean norm (L2 norm). "ip": Inner product norm (dot product with the zero vector). Returns: @@ -63,7 +63,7 @@ CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float SumAccum @@sqrSum; CASE lower(metric) - WHEN "euclidean" THEN + WHEN "l2" THEN FOREACH i IN [0, @@myList1.size() - 1 ] DO @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); END; @@ -71,7 +71,7 @@ CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float WHEN "ip" THEN @@myResult = inner_product(@@myList1, @@myList2); ELSE - RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: euclidean and ip."); + RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: l2 and ip."); END ; From 51605229438ee97871e50e5932efe393c79d26c4 Mon Sep 17 00:00:00 2001 From: jue-yuan Date: Tue, 3 Dec 2024 18:38:38 +0000 Subject: [PATCH 3/5] [GLE-8861] add missing range for foreach statements; --- gds/vector/distance.gsql | 2 +- gds/vector/l2_distance.gsql | 2 +- gds/vector/norm.gsql | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gds/vector/distance.gsql b/gds/vector/distance.gsql index 64042f68..06d4df37 100644 --- a/gds/vector/distance.gsql +++ b/gds/vector/distance.gsql @@ -70,7 +70,7 
@@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri WHEN "cosine" THEN @@myResult = 1 - inner_product(@@myList1, @@myList2) / (sqrt(inner_product(@@myList1, @@myList1)) * sqrt(inner_product(@@myList2, @@myList2))); WHEN "l2" THEN - FOREACH i IN [0, @@myList1.size() - 1 ] DO + FOREACH i IN RANGE [0, @@myList1.size() - 1 ] DO @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); END; @@myResult = sqrt(@@sqrSum); diff --git a/gds/vector/l2_distance.gsql b/gds/vector/l2_distance.gsql index ef718a2c..1c53a2aa 100644 --- a/gds/vector/l2_distance.gsql +++ b/gds/vector/l2_distance.gsql @@ -54,7 +54,7 @@ CREATE FUNCTION gds.vector.l2_distance(list list1, list list2) R END; SumAccum @@sqrSum; - FOREACH i IN [0, @@myList1.size() - 1 ] DO + FOREACH i IN RANGE [0, @@myList1.size() - 1 ] DO @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); END; diff --git a/gds/vector/norm.gsql b/gds/vector/norm.gsql index fdf664fb..49e60bb2 100644 --- a/gds/vector/norm.gsql +++ b/gds/vector/norm.gsql @@ -55,7 +55,7 @@ CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float ListAccum @@myList1 = list1; ListAccum @@myList2; - FOREACH i IN [0, @@myList1.size() - 1] DO + FOREACH i IN RANGE [0, @@myList1.size() - 1] DO @@myList2 += 0; end; @@ -64,7 +64,7 @@ CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float CASE lower(metric) WHEN "l2" THEN - FOREACH i IN [0, @@myList1.size() - 1 ] DO + FOREACH i IN RANGE [0, @@myList1.size() - 1 ] DO @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); END; @@myResult = sqrt(@@sqrSum); From 3d49227b7c75db24cb1f7e40cdf5309065e75eb3 Mon Sep 17 00:00:00 2001 From: jue-yuan Date: Thu, 5 Dec 2024 19:56:12 +0000 Subject: [PATCH 4/5] [GLE-8861] address comments; --- gds/vector/cosine_distance.gsql | 15 +++++++++++++-- gds/vector/distance.gsql | 20 +++++++++++++++++--- gds/vector/norm.gsql | 13 
+++---------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/gds/vector/cosine_distance.gsql b/gds/vector/cosine_distance.gsql index 28664c85..342ed1c0 100644 --- a/gds/vector/cosine_distance.gsql +++ b/gds/vector/cosine_distance.gsql @@ -28,6 +28,8 @@ CREATE FUNCTION gds.vector.cosine_distance(list list1, list list Exceptions: list_size_mismatch (90000): Raised when the input lists are not of equal size. + zero_divisor (90001): + Raised when either input vector has (near-)zero magnitude, which would make the divisor zero. Logic Overview: Validates that both input vectors have the same length. @@ -42,6 +44,7 @@ CREATE FUNCTION gds.vector.cosine_distance(list list1, list list */ EXCEPTION list_size_mismatch (90000); + EXCEPTION zero_divisor(90001); ListAccum @@myList1 = list1; ListAccum @@myList2 = list2; @@ -49,8 +52,16 @@ CREATE FUNCTION gds.vector.cosine_distance(list list1, list list RAISE list_size_mismatch ("Two lists provided for gds.vector.cosine_distance have different sizes."); END; - double innerP = inner_product(@@myList1, @@myList2); + double inner_p = inner_product(@@myList1, @@myList2); double v1_magn = sqrt(inner_product(@@myList1, @@myList1)); double v2_magn = sqrt(inner_product(@@myList2, @@myList2)); - RETURN (1 - innerP / (v1_magn * v2_magn)); + IF (abs(v1_magn) < 0.0000001) THEN + // compare against a small epsilon rather than exactly 0 to tolerate floating-point error + RAISE zero_divisor ("The elements in the first list are all zero. It will introduce a zero divisor."); + END; + IF (abs(v2_magn) < 0.0000001) THEN + // same epsilon guard for the second vector's magnitude (the other factor of the divisor) + RAISE zero_divisor ("The elements in the second list are all zero. 
It will introduce a zero divisor."); + END; + RETURN (1 - inner_p / (v1_magn * v2_magn)); } \ No newline at end of file diff --git a/gds/vector/distance.gsql b/gds/vector/distance.gsql index 06d4df37..b17af378 100644 --- a/gds/vector/distance.gsql +++ b/gds/vector/distance.gsql @@ -32,7 +32,9 @@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri Exceptions: list_size_mismatch (90000): Raised when the input vectors are not of equal size. - invalid_metric_type (90001): + zero_divisor (90001): + Raised for the "cosine" metric when either input vector has (near-)zero magnitude, which would make the divisor zero. + invalid_metric_type (90002): Raised when an unsupported distance metric is provided. Logic Overview: @@ -55,7 +57,8 @@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri */ EXCEPTION list_size_mismatch (90000); - EXCEPTION invalid_metric_type (90001); + EXCEPTION zero_divisor(90001); + EXCEPTION invalid_metric_type (90002); ListAccum @@myList1 = list1; ListAccum @@myList2 = list2; @@ -68,7 +71,18 @@ CREATE FUNCTION gds.vector.distance(list list1, list list2, stri CASE lower(metric) WHEN "cosine" THEN - @@myResult = 1 - inner_product(@@myList1, @@myList2) / (sqrt(inner_product(@@myList1, @@myList1)) * sqrt(inner_product(@@myList2, @@myList2))); + double inner_p = inner_product(@@myList1, @@myList2); + double v1_magn = sqrt(inner_product(@@myList1, @@myList1)); + double v2_magn = sqrt(inner_product(@@myList2, @@myList2)); + IF (abs(v1_magn) < 0.0000001) THEN + // compare against a small epsilon rather than exactly 0 to tolerate floating-point error + RAISE zero_divisor ("The elements in the first list are all zero. It will introduce a zero divisor."); + END; + IF (abs(v2_magn) < 0.0000001) THEN + // same epsilon guard for the second vector's magnitude (the other factor of the divisor) + RAISE zero_divisor ("The elements in the second list are all zero. 
It will introduce a zero divisor."); + END; + @@myResult = 1 - inner_p / (v1_magn * v2_magn); WHEN "l2" THEN FOREACH i IN RANGE [0, @@myList1.size() - 1 ] DO @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); diff --git a/gds/vector/norm.gsql b/gds/vector/norm.gsql index 49e60bb2..feee0e1b 100644 --- a/gds/vector/norm.gsql +++ b/gds/vector/norm.gsql @@ -53,23 +53,16 @@ CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float EXCEPTION invalid_metric_type (90001); ListAccum @@myList1 = list1; - ListAccum @@myList2; - - FOREACH i IN RANGE [0, @@myList1.size() - 1] DO - @@myList2 += 0; - end; SumAccum @@myResult; SumAccum @@sqrSum; CASE lower(metric) WHEN "l2" THEN - FOREACH i IN RANGE [0, @@myList1.size() - 1 ] DO - @@sqrSum += (@@myList1.get(i) - @@myList2.get(i)) * (@@myList1.get(i) - @@myList2.get(i)); - END; - @@myResult = sqrt(@@sqrSum); + @@myResult = sqrt(inner_product(@@myList1, @@myList1)); WHEN "ip" THEN - @@myResult = inner_product(@@myList1, @@myList2); + // the result of inner product between any vector and all-zero vector should always be 0 + @@myResult = 0; ELSE RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: l2 and ip."); END From bef452dafad796eced970a0f65e105c371bc3d20 Mon Sep 17 00:00:00 2001 From: jue-yuan Date: Mon, 9 Dec 2024 20:12:40 +0000 Subject: [PATCH 5/5] [GLE-8861] add OR REPLACE for each GSQL function; --- gds/vector/cosine_distance.gsql | 2 +- gds/vector/dimension_count.gsql | 2 +- gds/vector/distance.gsql | 2 +- gds/vector/elements_sum.gsql | 2 +- gds/vector/ip_distance.gsql | 2 +- gds/vector/kth_element.gsql | 2 +- gds/vector/l2_distance.gsql | 2 +- gds/vector/norm.gsql | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gds/vector/cosine_distance.gsql b/gds/vector/cosine_distance.gsql index 342ed1c0..01323dce 100644 --- a/gds/vector/cosine_distance.gsql +++ b/gds/vector/cosine_distance.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION 
gds.vector.cosine_distance(list list1, list list2) RETURNS(float) { +CREATE OR REPLACE FUNCTION gds.vector.cosine_distance(list list1, list list2) RETURNS(float) { /* First Author: Jue Yuan diff --git a/gds/vector/dimension_count.gsql b/gds/vector/dimension_count.gsql index 0476e3b0..ac3903f7 100644 --- a/gds/vector/dimension_count.gsql +++ b/gds/vector/dimension_count.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.dimension_count(list list1) RETURNS(int) { +CREATE OR REPLACE FUNCTION gds.vector.dimension_count(list list1) RETURNS(int) { /* First Author: Jue Yuan diff --git a/gds/vector/distance.gsql b/gds/vector/distance.gsql index b17af378..d56e9893 100644 --- a/gds/vector/distance.gsql +++ b/gds/vector/distance.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.distance(list list1, list list2, string metric) RETURNS(float) { +CREATE OR REPLACE FUNCTION gds.vector.distance(list list1, list list2, string metric) RETURNS(float) { /* First Author: Jue Yuan diff --git a/gds/vector/elements_sum.gsql b/gds/vector/elements_sum.gsql index 30626704..e8af16f1 100644 --- a/gds/vector/elements_sum.gsql +++ b/gds/vector/elements_sum.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.elements_sum(list list1) RETURNS(float) { +CREATE OR REPLACE FUNCTION gds.vector.elements_sum(list list1) RETURNS(float) { /* First Author: Jue Yuan diff --git a/gds/vector/ip_distance.gsql b/gds/vector/ip_distance.gsql index 0dcfa49f..a33f490e 100644 --- a/gds/vector/ip_distance.gsql +++ b/gds/vector/ip_distance.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.ip_distance(list list1, list list2) RETURNS(float) { +CREATE OR REPLACE FUNCTION gds.vector.ip_distance(list list1, list list2) RETURNS(float) { /* First Author: Jue Yuan diff --git a/gds/vector/kth_element.gsql b/gds/vector/kth_element.gsql index b832cbc8..3fab788b 100644 --- a/gds/vector/kth_element.gsql +++ b/gds/vector/kth_element.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.kth_element(list list1, int kth_index) RETURNS(float) { 
+CREATE OR REPLACE FUNCTION gds.vector.kth_element(list list1, int kth_index) RETURNS(float) { /* First Author: Jue Yuan diff --git a/gds/vector/l2_distance.gsql b/gds/vector/l2_distance.gsql index 1c53a2aa..9c386b11 100644 --- a/gds/vector/l2_distance.gsql +++ b/gds/vector/l2_distance.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.l2_distance(list list1, list list2) RETURNS(float) { +CREATE OR REPLACE FUNCTION gds.vector.l2_distance(list list1, list list2) RETURNS(float) { /* First Author: Jue Yuan diff --git a/gds/vector/norm.gsql b/gds/vector/norm.gsql index feee0e1b..436273c0 100644 --- a/gds/vector/norm.gsql +++ b/gds/vector/norm.gsql @@ -1,4 +1,4 @@ -CREATE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float) { +CREATE OR REPLACE FUNCTION gds.vector.norm(list list1, string metric) RETURNS(float) { /* First Author: Jue Yuan