mongodb · blink1073 · Oct 1, 2024 · Aug 22, 2024 · Aug 23, 2024 · Aug 23, 2024
diff --git a/.evergreen/resync-specs.sh b/.evergreen/resync-specs.sh
@@ -76,6 +76,9 @@ do
     atlas-data-lake-testing|data_lake)
       cpjson atlas-data-lake-testing/tests/ data_lake
       ;;
+    bson-binary-vector|bson_binary_vector)
+      cpjson bson-binary-vector/tests/ bson_binary_vector
+      ;;
     bson-corpus|bson_corpus)
       cpjson bson-corpus/tests/ bson_corpus
       ;;

diff --git a/bson/binary.py b/bson/binary.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Tuple, Type, Union
+import struct
+from dataclasses import dataclass
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union
 from uuid import UUID
 
 """Tools for representing BSON binary data.
@@ -191,21 +194,76 @@ class UuidRepresentation:
 """
 
 
+VECTOR_SUBTYPE = 9
+"""BSON binary subtype for densely packed vector data.
+
+.. versionadded:: 4.9
+"""
+
+
 USER_DEFINED_SUBTYPE = 128
 """BSON binary subtype for any user defined structure.
 """
 
 
+class BinaryVectorDtype(Enum):
+    """Element data types supported by the BSON binary vector subtype.
+
+    :param INT8: (0x03) Elements are :class:`int` in [-128, 127], packed as int8
+    :param FLOAT32: (0x27) Elements are :class:`float`, packed as float32
+    :param PACKED_BIT: (0x10) Elements are :class:`int` in [0, 255], packed as uint8
+
+    `PACKED_BIT` is a special case: the underlying vector elements are single
+    bits (0 or 1), stored eight to a byte. In Python each such byte is shown
+    as an int in the range [0, 255].
+
+    Every member's value is a :class:`bytes` object of length one.
+
+    .. versionadded:: 4.9
+    """
+
+    INT8 = b"\x03"
+    FLOAT32 = b"\x27"
+    PACKED_BIT = b"\x10"
+
+
+# Decoding lookup table: one-byte dtype marker -> BinaryVectorDtype member.
+DTYPE_FROM_HEX = {member.value: member for member in BinaryVectorDtype}
+
+
+@dataclass
+class BinaryVector:
+    """Vector of numbers along with metadata for binary interoperability.
+
+    :param data: Sequence of numbers representing the mathematical vector.
+    :param dtype: The data type stored in binary.
+    :param padding: The number of bits in the final byte that are to be ignored
+      when a vector element's size is less than a byte
+      and the length of the vector is not a multiple of 8.
+
+    .. versionadded:: 4.9
+    """
+
+    data: Sequence[float | int]
+    dtype: BinaryVectorDtype
+    padding: Optional[int] = 0
+
+
 class Binary(bytes):
     """Representation of BSON binary data.
 
-    This is necessary because we want to represent Python strings as
-    the BSON string type. We need to wrap binary data so we can tell
+    We want to represent Python strings as the BSON string type.
+    We need to wrap binary data so that we can tell
     the difference between what should be considered binary data and
     what should be considered a string when we encode to BSON.
 
-    Raises TypeError if `data` is not an instance of :class:`bytes`
-    or `subtype` is not an instance of :class:`int`.
+    Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
+    Its data is prepended with two bytes of metadata.
+    The first (dtype) describes its data type, such as float32 or int8.
+    The second (padding) prescribes the number of bits to ignore in the final byte.
+    This is relevant when the element size of the dtype is not a multiple of 8.
+
+    Raises TypeError if `subtype` is not an instance of :class:`int`.
     Raises ValueError if `subtype` is not in [0, 256).
 
     .. note::
@@ -218,7 +276,10 @@ class Binary(bytes):
         to use
 
     .. versionchanged:: 3.9
-      Support any bytes-like type that implements the buffer protocol.
+       Support any bytes-like type that implements the buffer protocol.
+
+    .. versionchanged:: 4.9
+       Addition of vector subtype.
     """
 
     _type_marker = 5
@@ -337,6 +398,90 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
             f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
         )
 
+    @classmethod
+    def from_vector(
+        cls: Type[Binary],
+        vector: Sequence[Union[int, float]],
+        dtype: BinaryVectorDtype,
+        padding: Optional[int] = 0,
+    ) -> Binary:
+        """Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.
+
+        See :class:`~bson.binary.BinaryVectorDtype` for the available data types.
+        The dtype and padding are prepended to the packed values as two metadata bytes.
+
+        :param vector: List of values
+        :param dtype: Data type of the values
+        :param padding: For fractional bytes, number of bits to ignore at end of vector.
+          ``None`` is treated as 0.
+        :return: Binary packed data identified by dtype and padding.
+        :raises ValueError: If padding is nonzero for INT8 or FLOAT32.
+
+        .. versionadded:: 4.9
+        """
+        padding = padding or 0
+        if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
+            format_str = "b"
+        elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
+            format_str = "B"
+        elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
+            format_str = "f"
+        else:
+            raise NotImplementedError("%s not yet supported" % dtype)
+        # Raise instead of assert: assertions are stripped when Python runs with -O.
+        if padding and dtype != BinaryVectorDtype.PACKED_BIT:
+            raise ValueError(f"padding does not apply to {dtype=}")
+        data = struct.pack(f"<{len(vector)}{format_str}", *vector)  # "<": always little-endian
+        return cls(struct.pack("<sB", dtype.value, padding) + data, subtype=VECTOR_SUBTYPE)
+
+    def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector:
+        """From the Binary, create a list of numbers, along with dtype and padding.
+
+        :param uncompressed: If true, return the true mathematical vector.
+            This is only necessary for datatypes where padding is applicable.
+            For example, setting this to True for a PACKED_BIT vector will result
+            in a List[int] of zeros and ones.
+        :return: List of numbers, along with dtype and padding.
+        :raises ValueError: If the subtype is not the vector subtype, the dtype byte
+            is unknown, or FLOAT32 data is not a whole number of 4-byte values.
+
+        .. versionadded:: 4.9
+        """
+        if self.subtype != VECTOR_SUBTYPE:
+            raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")
+
+        # The first two bytes are metadata: a one-byte dtype marker, then padding.
+        dtype, padding = struct.unpack_from("<sB", self, 0)
+        position = 2
+        dtype = BinaryVectorDtype(dtype)
+        n_bytes = len(self) - position
+
+        if dtype == BinaryVectorDtype.INT8:
+            vector = list(struct.unpack_from(f"<{n_bytes}b", self, position))
+            return BinaryVector(vector, dtype, padding)
+
+        if dtype == BinaryVectorDtype.FLOAT32:
+            # Raise instead of assert: assertions are stripped when Python runs with -O.
+            if n_bytes % 4:
+                raise ValueError(
+                    f"FLOAT32 vector data must be a multiple of 4 bytes, got {n_bytes}"
+                )
+            # "<" pins little-endian so decoding does not depend on the host byte order.
+            vector = list(struct.unpack_from(f"<{n_bytes // 4}f", self, position))
+            return BinaryVector(vector, dtype, padding)
+
+        if dtype == BinaryVectorDtype.PACKED_BIT:
+            # data packed as uint8
+            unpacked_uint8s = list(struct.unpack_from(f"<{n_bytes}B", self, position))
+            if not uncompressed:
+                return BinaryVector(unpacked_uint8s, dtype, padding)
+            # Expand each byte into its 8 bits, most significant first.
+            bits = [int(bit) for uint8 in unpacked_uint8s for bit in f"{uint8:08b}"]
+            # Trailing `padding` bits of the last byte are not part of the vector.
+            return BinaryVector(bits[:-padding] if padding else bits, dtype, padding)
+
+        raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
+
     @property
     def subtype(self) -> int:
         """Subtype of this binary data."""

diff --git a/doc/api/bson/binary.rst b/doc/api/bson/binary.rst
@@ -21,6 +21,14 @@
    .. autoclass:: UuidRepresentation
       :members:
 
+   .. autoclass:: BinaryVectorDtype
+      :members:
+      :show-inheritance:
+
+   .. autoclass:: BinaryVector
+      :members:
+
+
    .. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
       :members:
       :show-inheritance:
diff --git a/test/bson_binary_vector/float32.json b/test/bson_binary_vector/float32.json
@@ -0,0 +1,45 @@
+{
+  "description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32",
+  "test_key": "vector",
+  "tests": [
+    {
+      "description": "Simple Vector FLOAT32",
+      "valid": true,
+      "vector": [127.0, 7.0],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 0,
+      "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000", 
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "Empty Vector FLOAT32",
+      "valid": true,
+      "vector": [],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 0,
+      "canonical_bson": "1400000005766563746F72000200000009270000",
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "Infinity Vector FLOAT32",
+      "valid": true,
+      "vector": ["-inf", 0.0, "inf"],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 0,
+      "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00",
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAID/AAAAAAAAgH8=\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "FLOAT32 with padding",
+      "valid": false,
+      "vector": [127.0, 7.0],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 3
+    }
+  ]
+}
+
diff --git a/test/bson_binary_vector/int8.json b/test/bson_binary_vector/int8.json
@@ -0,0 +1,59 @@
+{
+  "description": "Tests of Binary subtype 9, Vectors, with dtype INT8",
+  "test_key": "vector",
+  "tests": [
+    {
+      "description": "Simple Vector INT8",
+      "valid": true,
+      "vector": [127, 7],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0,
+      "canonical_bson": "1600000005766563746F7200040000000903007F0700",
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "Empty Vector INT8",
+      "valid": true,
+      "vector": [],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0,
+      "canonical_bson": "1400000005766563746F72000200000009030000",
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "Overflow Vector INT8",
+      "valid": false,
+      "vector": [128],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0
+    },
+    {
+      "description": "Underflow Vector INT8",
+      "valid": false,
+      "vector": [-129],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0
+    },
+    {
+      "description": "INT8 with padding",
+      "valid": false,
+      "vector": [127, 7],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 3
+    },
+    {
+      "description": "INT8 with float inputs",
+      "valid": false,
+      "vector": [127.77, 7.77],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0
+    }
+  ]
+}
+
diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json
@@ -0,0 +1,53 @@
+{
+  "description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT",
+  "test_key": "vector",
+  "tests": [
+    {
+      "description": "Simple Vector PACKED_BIT",
+      "valid": true,
+      "vector": [127, 7],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0,
+      "canonical_bson": "1600000005766563746F7200040000000910007F0700",
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "Empty Vector PACKED_BIT",
+      "valid": true,
+      "vector": [],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0,
+      "canonical_bson": "1400000005766563746F72000200000009100000",
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "PACKED_BIT with padding",
+      "valid": true,
+      "vector": [127, 7],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 3,
+      "canonical_bson": "1600000005766563746F7200040000000910037F0700",
+      "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAN/Bw==\", \"subType\": \"09\"}}}"
+    },
+    {
+      "description": "Overflow Vector PACKED_BIT",
+      "valid": false,
+      "vector": [256],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0
+    },
+    {
+      "description": "Underflow Vector PACKED_BIT",
+      "valid": false,
+      "vector": [-1],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0
+    }
+  ]
+}
+