diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9e3394fe74..c07c5b5b90f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,11 +54,12 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.7 - - name: Prepare enviroment + - name: Prepare environment run: | python -m pip install --upgrade pip python -m pip install poetry poetry install --without dev + poetry run pip install tensorflow==2.11.0 - name: Test basic import run: poetry run python -c 'from docarray import DocumentArray, BaseDocument' @@ -111,11 +112,12 @@ jobs: python -m pip install --upgrade pip python -m pip install poetry poetry install --all-extras + poetry run pip install tensorflow==2.11.0 - name: Test id: test run: | - poetry run pytest ${{ matrix.test-path }} + poetry run pytest -m "not tensorflow" ${{ matrix.test-path }} timeout-minutes: 30 # env: # JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" @@ -159,7 +161,7 @@ jobs: - name: Test id: test run: | - poetry run pytest ${{ matrix.test-path }} + poetry run pytest -m "not tensorflow" ${{ matrix.test-path }} timeout-minutes: 30 @@ -181,12 +183,40 @@ jobs: python -m pip install --upgrade pip python -m pip install poetry poetry install --all-extras - pip install protobuf==3.19.0 # we check that we support 3.19 + poetry run pip install protobuf==3.19.0 # we check that we support 3.19 + + - name: Test + id: test + run: | + poetry run pytest -m 'proto' tests + timeout-minutes: 30 + + + docarray-test-tensorflow: + needs: [lint-ruff, check-black, import-test] + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2.5.0 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Prepare environment + run: | + python -m pip install --upgrade pip + python -m pip install poetry + poetry install --all-extras + poetry run pip install protobuf==3.19.0 + poetry run pip install tensorflow==2.11.0 - name: Test id: test run: | - poetry run pytest -k 'proto' tests + poetry run pytest -m 'tensorflow' tests timeout-minutes: 30 diff --git a/README.md b/README.md index 8c9039e8391..90034909ba6 100644 --- a/README.md +++ b/README.md @@ -209,22 +209,22 @@ class MyMultiModalModel(nn.Module): self.text_encoder = TextEncoder() def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2): - emnedding_text_1 = self.text_encoder(text_1) - emnedding_text_2 = self.text_encoder(text_2) + embedding_text_1 = self.text_encoder(text_1) + embedding_text_2 = self.text_encoder(text_2) - emnedding_image_1 = self.image_encoder(image_1) - emnedding_image_2 = self.image_encoder(image_2) + embedding_image_1 = self.image_encoder(image_1) + embedding_image_2 = self.image_encoder(image_2) - emnedding_audio_1 = self.image_encoder(audio_1) - emnedding_audio_2 = self.image_encoder(audio_2) + embedding_audio_1 = self.image_encoder(audio_1) + embedding_audio_2 = self.image_encoder(audio_2) return ( - emnedding_text_1, - emnedding_text_2, - emnedding_image_1, - emnedding_image_2, - emnedding_audio_1, - emnedding_audio_2, + embedding_text_1, + embedding_text_2, + embedding_image_1, + embedding_image_2, + embedding_audio_1, + embedding_audio_2, ) ``` @@ -258,14 +258,14 @@ class MyPodcastModel(nn.Module): self.image_encoder = ImageEncoder() self.text_encoder = TextEncoder() - def forward_podcast(da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: + def forward_podcast(self, da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: da.audio.embedding = self.audio_encoder(da.audio.tensor) da.text.embedding = self.text_encoder(da.text.tensor) da.image.embedding = self.image_encoder(da.image.tensor) return da - def forward(da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: + def forward(self, da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: da.left = self.forward_podcast(da.left) da.right = self.forward_podcast(da.right) @@ -277,6 +277,49 @@ You instantly win in code readability and maintainability. And for the same pric schema definition (see below). Everything handles in a pythonic manner by relying on type hints. +## Coming from TensorFlow + +Similar to the PyTorch approach, you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. + +First off, to use DocArray with TensorFlow we first need to install it as follows: +``` +pip install tensorflow==2.11.0 +pip install protobuf==3.19.0 +``` + +Compared to using DocArray with PyTorch, there is one main difference when using it with TensorFlow:\ +While DocArray's `TorchTensor` is a subclass of `torch.Tensor`, this is not the case for the `TensorFlowTensor`: Due to technical limitations on `tf.Tensor`, docarray's `TensorFlowTensor` is not a subclass of `tf.Tensor` but instead stores a `tf.Tensor` in its `.tensor` attribute. + +How does this effect you? Whenever you want to access the tensor data to e.g. do operations with it or hand it to your ML model, instead of handing over your `TensorFlowTensor` instance, you need to access its `.tensor` attribute. + +This would look like the following: + +```python +from typing import Optional + +from docarray import DocumentArray, BaseDocument + +import tensorflow as tf + + +class Podcast(BaseDocument): + audio_tensor: Optional[AudioTensorFlowTensor] + embedding: Optional[AudioTensorFlowTensor] + + +class MyPodcastModel(tf.keras.Model): + def __init__(self): + super().__init__() + self.audio_encoder = AudioEncoder() + + def call(self, inputs: DocumentArray[Podcast]) -> DocumentArray[Podcast]: + inputs.audio_tensor.embedding = self.audio_encoder( + inputs.audio_tensor.tensor + ) # access audio_tensor's .tensor attribute + return inputs +``` + + ## Coming from FastAPI diff --git a/docarray/array/array_stacked.py b/docarray/array/array_stacked.py index 1a4f3bfbb40..d9f6ed83ebb 100644 --- a/docarray/array/array_stacked.py +++ b/docarray/array/array_stacked.py @@ -27,14 +27,22 @@ from pydantic.fields import ModelField from docarray.proto import DocumentArrayStackedProto - from docarray.typing import TorchTensor - from docarray.typing.tensor.abstract_tensor import AbstractTensor try: from docarray.typing import TorchTensor except ImportError: TorchTensor = None # type: ignore +try: + import tensorflow as tf # type: ignore + + from docarray.typing import TensorFlowTensor + + tf_available = True +except (ImportError, TypeError): + TensorFlowTensor = None # type: ignore + tf_available = False + T = TypeVar('T', bound='DocumentArrayStacked') IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] @@ -163,7 +171,26 @@ def _create_columns( tensor_columns: Dict[str, AbstractTensor] = dict() for field, type_ in column_schema.items(): - if issubclass(type_, AbstractTensor): + if tf_available and isinstance(getattr(docs[0], field), TensorFlowTensor): + # tf.Tensor does not allow item assignment, therefore the optimized way + # of initializing an empty array and assigning values to it iteratively + # does not work here, therefore handle separately. + tf_stack = [] + for i, doc in enumerate(docs): + val = getattr(doc, field) + if val is None: + val = tensor_type.get_comp_backend().none_value() + tf_stack.append(val.tensor) + del val.tensor + + stacked: tf.Tensor = tf.stack(tf_stack) + tensor_columns[field] = TensorFlowTensor(stacked) + for i, doc in enumerate(docs): + val = getattr(doc, field) + x = tensor_columns[field][i].tensor + val.tensor = x + + elif issubclass(type_, AbstractTensor): tensor = getattr(docs[0], field) column_shape = ( (len(docs), *tensor.shape) if tensor is not None else (len(docs),) @@ -190,7 +217,8 @@ def _create_columns( # We thus chose to convert the individual rank 0 tensors to rank 1 # This does mean that stacking rank 0 tensors will transform them # to rank 1 - if tensor_columns[field].ndim == 1: + tensor = tensor_columns[field] + if tensor.get_comp_backend().n_dim(tensor) == 1: setattr(doc, field, tensor_columns[field][i : i + 1]) else: setattr(doc, field, tensor_columns[field][i]) diff --git a/docarray/computation/abstract_comp_backend.py b/docarray/computation/abstract_comp_backend.py index d29af1c2cd4..b351bedcfcb 100644 --- a/docarray/computation/abstract_comp_backend.py +++ b/docarray/computation/abstract_comp_backend.py @@ -19,76 +19,77 @@ class AbstractComputationalBackend(ABC, typing.Generic[TTensor]): That way, DocArray can leverage native implementations from all frameworks. """ - @staticmethod + @classmethod @abstractmethod def stack( - tensors: Union[List['TTensor'], Tuple['TTensor']], dim: int = 0 + cls, tensors: Union[List['TTensor'], Tuple['TTensor']], dim: int = 0 ) -> 'TTensor': """ Stack a list of tensors along a new axis. """ ... - @staticmethod + @classmethod @abstractmethod - def n_dim(array: 'TTensor') -> int: + def n_dim(cls, array: 'TTensor') -> int: """ Get the number of the array dimensions. """ ... - @staticmethod + @classmethod @abstractmethod - def squeeze(tensor: 'TTensor') -> 'TTensor': + def squeeze(cls, tensor: 'TTensor') -> 'TTensor': """ Returns a tensor with all the dimensions of tensor of size 1 removed. """ ... - @staticmethod + @classmethod @abstractmethod - def to_numpy(array: 'TTensor') -> 'np.ndarray': + def to_numpy(cls, array: 'TTensor') -> 'np.ndarray': """ Convert array to np.ndarray. """ ... - @staticmethod + @classmethod @abstractmethod def empty( + cls, shape: Tuple[int, ...], dtype: Optional[Any] = None, device: Optional[Any] = None, ) -> 'TTensor': ... - @staticmethod + @classmethod @abstractmethod - def none_value() -> typing.Any: + def none_value(cls) -> typing.Any: """Provide a compatible value that represents None in the Tensor Backend.""" ... - @staticmethod + @classmethod @abstractmethod - def to_device(tensor: 'TTensor', device: str) -> 'TTensor': + def to_device(cls, tensor: 'TTensor', device: str) -> 'TTensor': """Move the tensor to the specified device.""" ... - @staticmethod + @classmethod @abstractmethod - def device(tensor: 'TTensor') -> Optional[str]: + def device(cls, tensor: 'TTensor') -> Optional[str]: """Return device on which the tensor is allocated.""" ... - @staticmethod + @classmethod @abstractmethod - def shape(tensor: 'TTensor') -> Tuple[int, ...]: + def shape(cls, tensor: 'TTensor') -> Tuple[int, ...]: """Get shape of tensor""" ... - @staticmethod + @classmethod @abstractmethod - def reshape(tensor: 'TTensor', shape: Tuple[int, ...]) -> 'TTensor': + def reshape(cls, tensor: 'TTensor', shape: Tuple[int, ...]) -> 'TTensor': """ Gives a new shape to tensor without changing its data. @@ -99,9 +100,9 @@ def reshape(tensor: 'TTensor', shape: Tuple[int, ...]) -> 'TTensor': """ ... - @staticmethod + @classmethod @abstractmethod - def detach(tensor: 'TTensor') -> 'TTensor': + def detach(cls, tensor: 'TTensor') -> 'TTensor': """ Returns the tensor detached from its current graph. @@ -110,21 +111,22 @@ def detach(tensor: 'TTensor') -> 'TTensor': """ ... - @staticmethod + @classmethod @abstractmethod - def dtype(tensor: 'TTensor') -> Any: + def dtype(cls, tensor: 'TTensor') -> Any: """Get the data type of the tensor.""" ... - @staticmethod + @classmethod @abstractmethod - def isnan(tensor: 'TTensor') -> 'TTensor': + def isnan(cls, tensor: 'TTensor') -> 'TTensor': """Check element-wise for nan and return result as a boolean array""" ... - @staticmethod + @classmethod @abstractmethod def minmax_normalize( + cls, tensor: 'TTensor', t_range: Tuple = (0, 1), x_range: Optional[Tuple] = None, diff --git a/docarray/computation/abstract_numpy_based_backend.py b/docarray/computation/abstract_numpy_based_backend.py new file mode 100644 index 00000000000..85320b28938 --- /dev/null +++ b/docarray/computation/abstract_numpy_based_backend.py @@ -0,0 +1,81 @@ +import types +from abc import ABC +from typing import Any, Callable, List, Optional, Tuple, TypeVar, Union + +import numpy as np + +from docarray.computation import AbstractComputationalBackend + +T = TypeVar('T') + + +class AbstractNumpyBasedBackend(AbstractComputationalBackend[T], ABC): + """ + Abstract base class for computational backends that are based on numpy. + This includes numpy (np) itself and tensorflow.experimental.numpy (tnp). + The overlap of those two is gathered in this abstract backend. Other functions + should be defined in corresponding subclasses. + """ + + _module: types.ModuleType + # The method _get_tensor() transforms the input of the backends methods to a + # handleable type that the backends _module can work with, whereas _cast_output() + # casts the output of a methods back to the original input type. This is especially + # relevant w.r.t. the TensorFlowTensor class: + # If a TensorFlowTensor instance is input to a function, we first want to transform + # it to a tf.Tensor, since the tf.Tensor is what the TensorFlowBackend's _module + # (tnp) works on. If the function returns a tf.Tensor, we want to cast it back to a + # TensorFlowTensor. + _cast_output: Callable + _get_tensor: Callable + + @classmethod + def stack(cls, tensors: Union[List[T], Tuple[T]], dim: int = 0) -> T: + """Stack a list of tensors along a new axis.""" + t = [cls._get_tensor(t) for t in tensors] + return cls._cast_output(cls._module.stack(t, axis=dim)) + + @classmethod + def n_dim(cls, array: T) -> int: + """Get the number of the array dimensions.""" + return cls._module.ndim(cls._get_tensor(array)) + + @classmethod + def squeeze(cls, tensor: T) -> T: + """ + Returns a tensor with all the dimensions of tensor of size 1 removed. + """ + return cls._cast_output(cls._module.squeeze(cls._get_tensor(tensor))) + + @classmethod + def empty( + cls, + shape: Tuple[int, ...], + dtype: Optional[Any] = None, + device: Optional[Any] = None, + ) -> T: + if cls._module is np and device is not None: + raise NotImplementedError('Numpy does not support devices (GPU).') + return cls._cast_output(cls._module.empty(shape, dtype=dtype)) + + @classmethod + def shape(cls, array: T) -> Tuple[int, ...]: + """Get shape of array""" + return tuple(cls._module.shape(cls._get_tensor(array))) + + @classmethod + def reshape(cls, array: T, shape: Tuple[int, ...]) -> T: + """ + Gives a new shape to array without changing its data. + + :param array: array to be reshaped + :param shape: the new shape + :return: a array with the same data and number of elements as array + but with the specified shape. + """ + return cls._cast_output(cls._module.reshape(cls._get_tensor(array), shape)) + + @classmethod + def isnan(cls, tensor: T) -> T: + """Check element-wise for nan and return result as a boolean array""" + return cls._cast_output(cls._module.isnan(cls._get_tensor(tensor))) diff --git a/docarray/computation/numpy_backend.py b/docarray/computation/numpy_backend.py index afa2733c074..45b43d763d4 100644 --- a/docarray/computation/numpy_backend.py +++ b/docarray/computation/numpy_backend.py @@ -1,9 +1,10 @@ import warnings -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple import numpy as np from docarray.computation import AbstractComputationalBackend +from docarray.computation.abstract_numpy_based_backend import AbstractNumpyBasedBackend def _expand_if_single_axis(*matrices: np.ndarray) -> List[np.ndarray]: @@ -29,76 +30,40 @@ def _expand_if_scalar(arr: np.ndarray) -> np.ndarray: return arr -class NumpyCompBackend(AbstractComputationalBackend[np.ndarray]): +def identity(array: np.ndarray) -> np.ndarray: + return array + + +class NumpyCompBackend(AbstractNumpyBasedBackend): """ Computational backend for Numpy. """ - @staticmethod - def stack( - tensors: Union[List['np.ndarray'], Tuple['np.ndarray']], dim: int = 0 - ) -> 'np.ndarray': - return np.stack(tensors, axis=dim) + _module = np + _cast_output = identity + _get_tensor = identity - @staticmethod - def to_device(tensor: 'np.ndarray', device: str) -> 'np.ndarray': + @classmethod + def to_device(cls, tensor: 'np.ndarray', device: str) -> 'np.ndarray': """Move the tensor to the specified device.""" raise NotImplementedError('Numpy does not support devices (GPU).') - @staticmethod - def device(tensor: 'np.ndarray') -> Optional[str]: + @classmethod + def device(cls, tensor: 'np.ndarray') -> Optional[str]: """Return device on which the tensor is allocated.""" return None - @staticmethod - def n_dim(array: 'np.ndarray') -> int: - return array.ndim - - @staticmethod - def squeeze(tensor: 'np.ndarray') -> 'np.ndarray': - """ - Returns a tensor with all the dimensions of tensor of size 1 removed. - """ - return tensor.squeeze() - - @staticmethod - def to_numpy(array: 'np.ndarray') -> 'np.ndarray': + @classmethod + def to_numpy(cls, array: 'np.ndarray') -> 'np.ndarray': return array - @staticmethod - def empty( - shape: Tuple[int, ...], - dtype: Optional[Any] = None, - device: Optional[Any] = None, - ) -> 'np.ndarray': - if device is not None: - raise NotImplementedError('Numpy does not support devices (GPU).') - return np.empty(shape, dtype=dtype) - - @staticmethod - def none_value() -> Any: + @classmethod + def none_value(cls) -> Any: """Provide a compatible value that represents None in numpy.""" return None - @staticmethod - def shape(array: 'np.ndarray') -> Tuple[int, ...]: - """Get shape of array""" - return array.shape - - @staticmethod - def reshape(array: 'np.ndarray', shape: Tuple[int, ...]) -> 'np.ndarray': - """ - Gives a new shape to array without changing its data. - - :param array: array to be reshaped - :param shape: the new shape - :return: a array with the same data and number of elements as array - but with the specified shape. - """ - return array.reshape(shape) - - @staticmethod - def detach(tensor: 'np.ndarray') -> 'np.ndarray': + @classmethod + def detach(cls, tensor: 'np.ndarray') -> 'np.ndarray': """ Returns the tensor detached from its current graph. @@ -107,18 +72,14 @@ def detach(tensor: 'np.ndarray') -> 'np.ndarray': """ return tensor - @staticmethod - def dtype(tensor: 'np.ndarray') -> np.dtype: + @classmethod + def dtype(cls, tensor: 'np.ndarray') -> np.dtype: """Get the data type of the tensor.""" return tensor.dtype - @staticmethod - def isnan(tensor: 'np.ndarray') -> 'np.ndarray': - """Check element-wise for nan and return result as a boolean array""" - return np.isnan(tensor) - - @staticmethod + @classmethod def minmax_normalize( + cls, tensor: 'np.ndarray', t_range: Tuple = (0, 1), x_range: Optional[Tuple] = None, diff --git a/docarray/computation/tensorflow_backend.py b/docarray/computation/tensorflow_backend.py new file mode 100644 index 00000000000..afee8668542 --- /dev/null +++ b/docarray/computation/tensorflow_backend.py @@ -0,0 +1,280 @@ +import typing +from typing import Callable, List, Optional, Tuple + +import numpy as np +import tensorflow as tf # type: ignore +import tensorflow._api.v2.experimental.numpy as tnp # type: ignore + +from docarray.computation import AbstractComputationalBackend +from docarray.computation.abstract_numpy_based_backend import AbstractNumpyBasedBackend +from docarray.typing import TensorFlowTensor + + +def _unsqueeze_if_single_axis(*matrices: tf.Tensor) -> List[tf.Tensor]: + """ + Unsqueezes tensors that only have one axis, at dim 0. + This ensures that all outputs can be treated as matrices, not vectors. + + :param matrices: Matrices to be unsqueezed + :return: List of the input matrices, + where single axis matrices are unsqueezed at dim 0. + """ + unsqueezed = [] + for m in matrices: + if len(m.shape) == 1: + unsqueezed.append(tf.expand_dims(m, axis=0)) + else: + unsqueezed.append(m) + return unsqueezed + + +def _unsqueeze_if_scalar(t: tf.Tensor) -> tf.Tensor: + """ + Unsqueezes tensor of a scalar, from shape () to shape (1,). + + :param t: tensor to unsqueeze. + :return: unsqueezed tf.Tensor + """ + if len(t.shape) == 0: # avoid scalar output + t = tf.expand_dims(t, 0) + return t + + +def norm_left(t: tf.Tensor) -> TensorFlowTensor: + return TensorFlowTensor(tensor=t) + + +def norm_right(t: TensorFlowTensor) -> tf.Tensor: + return t.tensor + + +class TensorFlowCompBackend(AbstractNumpyBasedBackend[TensorFlowTensor]): + """ + Computational backend for TensorFlow. + """ + + _module = tnp + _cast_output: Callable = norm_left + _get_tensor: Callable = norm_right + + @classmethod + def to_numpy(cls, array: 'TensorFlowTensor') -> 'np.ndarray': + return cls._get_tensor(array).numpy() + + @classmethod + def none_value(cls) -> typing.Any: + """Provide a compatible value that represents None in numpy.""" + return tf.constant(float('nan')) + + @classmethod + def to_device(cls, tensor: 'TensorFlowTensor', device: str) -> 'TensorFlowTensor': + """Move the tensor to the specified device.""" + if cls.device(tensor) == device: + return tensor + else: + with tf.device(device): + return cls._cast_output(tf.identity(cls._get_tensor(tensor))) + + @classmethod + def device(cls, tensor: 'TensorFlowTensor') -> Optional[str]: + """Return device on which the tensor is allocated.""" + return cls._get_tensor(tensor).device + + @classmethod + def detach(cls, tensor: 'TensorFlowTensor') -> 'TensorFlowTensor': + """ + Returns the tensor detached from its current graph. + + :param tensor: tensor to be detached + :return: a detached tensor with the same data. + """ + return cls._cast_output(tf.stop_gradient(cls._get_tensor(tensor))) + + @classmethod + def dtype(cls, tensor: 'TensorFlowTensor') -> tf.dtypes: + """Get the data type of the tensor.""" + return cls._get_tensor(tensor).dtype + + @classmethod + def minmax_normalize( + cls, + tensor: 'TensorFlowTensor', + t_range: Tuple = (0.0, 1.0), + x_range: Optional[Tuple] = None, + eps: float = 1e-7, + ) -> 'TensorFlowTensor': + a, b = t_range + + t = tf.cast(cls._get_tensor(tensor), tf.float32) + min_d = x_range[0] if x_range else tnp.min(t, axis=-1, keepdims=True) + max_d = x_range[1] if x_range else tnp.max(t, axis=-1, keepdims=True) + + i = (b - a) * (t - min_d) / (max_d - min_d + tf.constant(eps) + a) + + normalized = tnp.clip(i, *((a, b) if a < b else (b, a))) + return cls._cast_output(tf.cast(normalized, tensor.tensor.dtype)) + + class Retrieval(AbstractComputationalBackend.Retrieval[TensorFlowTensor]): + """ + Abstract class for retrieval and ranking functionalities + """ + + @staticmethod + def top_k( + values: 'TensorFlowTensor', + k: int, + descending: bool = False, + device: Optional[str] = None, + ) -> Tuple['TensorFlowTensor', 'TensorFlowTensor']: + """ + Retrieves the top k smallest values in `values`, + and returns them alongside their indices in the input `values`. + Can also be used to retrieve the top k largest values, + by setting the `descending` flag. + + :param values: TensorFlowTensor of values to rank. + Should be of shape (n_queries, n_values_per_query). + Inputs of shape (n_values_per_query,) will be expanded + to (1, n_values_per_query). + :param k: number of values to retrieve + :param descending: retrieve largest values instead of smallest values + :param device: the computational device to use. + :return: Tuple of TensorFlowTensors containing the retrieved values, and + their indices. Both are of shape (n_queries, k) + """ + comp_be = TensorFlowCompBackend + if device is not None: + values = comp_be.to_device(values, device) + + tf_values: tf.Tensor = comp_be._get_tensor(values) + if len(tf_values.shape) <= 1: + tf_values = tf.expand_dims(tf_values, axis=0) + + len_tf_values = ( + tf_values.shape[-1] if len(tf_values.shape) > 1 else len(tf_values) + ) + k = min(k, len_tf_values) + + if not descending: + tf_values = -tf_values + + result = tf.math.top_k(input=tf_values, k=k, sorted=True) + res_values = result.values + res_indices = result.indices + + if not descending: + res_values = -result.values + + return comp_be._cast_output(res_values), comp_be._cast_output(res_indices) + + class Metrics(AbstractComputationalBackend.Metrics[TensorFlowTensor]): + """ + Abstract base class for metrics (distances and similarities). + """ + + @staticmethod + def cosine_sim( + x_mat: 'TensorFlowTensor', + y_mat: 'TensorFlowTensor', + eps: float = 1e-7, + device: Optional[str] = None, + ) -> 'TensorFlowTensor': + """Pairwise cosine similarities between all vectors in x_mat and y_mat. + + :param x_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the + number of vectors and n_dim is the number of dimensions of each example. + :param y_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the + number of vectors and n_dim is the number of dimensions of each example. + :param eps: a small jitter to avoid divde by zero + :param device: the device to use for computations. + If not provided, the devices of x_mat and y_mat are used. + :return: Tensor of shape (n_vectors, n_vectors) containing all pairwise + cosine distances. + The index [i_x, i_y] contains the cosine distance between + x_mat[i_x] and y_mat[i_y]. + """ + comp_be = TensorFlowCompBackend + x_mat_tf: tf.Tensor = comp_be._get_tensor(x_mat) + y_mat_tf: tf.Tensor = comp_be._get_tensor(y_mat) + + with tf.device(device): + x_mat_tf = tf.identity(x_mat_tf) + y_mat_tf = tf.identity(y_mat_tf) + + x_mat_tf, y_mat_tf = _unsqueeze_if_single_axis(x_mat_tf, y_mat_tf) + + a_n = tf.linalg.normalize(x_mat_tf, axis=1)[1] + b_n = tf.linalg.normalize(y_mat_tf, axis=1)[1] + a_norm = x_mat_tf / tf.clip_by_value( + a_n, clip_value_min=eps, clip_value_max=tf.float32.max + ) + b_norm = y_mat_tf / tf.clip_by_value( + b_n, clip_value_min=eps, clip_value_max=tf.float32.max + ) + sims = tf.squeeze(tf.linalg.matmul(a_norm, tf.transpose(b_norm))) + sims = _unsqueeze_if_scalar(sims) + + return comp_be._cast_output(sims) + + @staticmethod + def euclidean_dist( + x_mat: 'TensorFlowTensor', + y_mat: 'TensorFlowTensor', + device: Optional[str] = None, + ) -> 'TensorFlowTensor': + """Pairwise Euclidian distances between all vectors in x_mat and y_mat. + + :param x_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the + number of vectors and n_dim is the number of dimensions of each example. + :param y_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the + number of vectors and n_dim is the number of dimensions of each example. + :param device: the device to use for pytorch computations. + If not provided, the devices of x_mat and y_mat are used. + :return: Tensor of shape (n_vectors, n_vectors) containing all pairwise + euclidian distances. + The index [i_x, i_y] contains the euclidian distance between + x_mat[i_x] and y_mat[i_y]. + """ + comp_be = TensorFlowCompBackend + x_mat_tf: tf.Tensor = comp_be._get_tensor(x_mat) + y_mat_tf: tf.Tensor = comp_be._get_tensor(y_mat) + + with tf.device(device): + x_mat_tf = tf.identity(x_mat_tf) + y_mat_tf = tf.identity(y_mat_tf) + + x_mat_tf, y_mat_tf = _unsqueeze_if_single_axis(x_mat_tf, y_mat_tf) + + dists = tf.squeeze(tf.norm(tf.subtract(x_mat_tf, y_mat_tf), axis=-1)) + dists = _unsqueeze_if_scalar(dists) + + return comp_be._cast_output(dists) + + @staticmethod + def sqeuclidean_dist( + x_mat: 'TensorFlowTensor', + y_mat: 'TensorFlowTensor', + device: Optional[str] = None, + ) -> 'TensorFlowTensor': + """Pairwise Squared Euclidian distances between all vectors + in x_mat and y_mat. + + :param x_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the + number of vectors and n_dim is the number of dimensions of each + example. + :param y_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the + number of vectors and n_dim is the number of dimensions of each + example. + :param device: the device to use for pytorch computations. + If not provided, the devices of x_mat and y_mat are used. + :return: Tensor of shape (n_vectors, n_vectors) containing all pairwise + euclidian distances. + The index [i_x, i_y] contains the euclidian distance between + x_mat[i_x] and y_mat[i_y]. + """ + dists = TensorFlowCompBackend.Metrics.euclidean_dist(x_mat, y_mat) + squared: tf.Tensor = tf.math.square( + TensorFlowCompBackend._get_tensor(dists) + ) + + return TensorFlowCompBackend._cast_output(squared) diff --git a/docarray/computation/torch_backend.py b/docarray/computation/torch_backend.py index c05f9fb4c29..936b16f3188 100644 --- a/docarray/computation/torch_backend.py +++ b/docarray/computation/torch_backend.py @@ -23,7 +23,7 @@ def _unsqueeze_if_single_axis(*matrices: torch.Tensor) -> List[torch.Tensor]: return unsqueezed -def _usqueeze_if_scalar(t: torch.Tensor): +def _unsqueeze_if_scalar(t: torch.Tensor): if len(t.shape) == 0: # avoid scalar output t = t.unsqueeze(0) return t @@ -34,24 +34,25 @@ class TorchCompBackend(AbstractComputationalBackend[torch.Tensor]): Computational backend for PyTorch. """ - @staticmethod + @classmethod def stack( - tensors: Union[List['torch.Tensor'], Tuple['torch.Tensor']], dim: int = 0 + cls, tensors: Union[List['torch.Tensor'], Tuple['torch.Tensor']], dim: int = 0 ) -> 'torch.Tensor': return torch.stack(tensors, dim=dim) - @staticmethod - def to_device(tensor: 'torch.Tensor', device: str) -> 'torch.Tensor': + @classmethod + def to_device(cls, tensor: 'torch.Tensor', device: str) -> 'torch.Tensor': """Move the tensor to the specified device.""" return tensor.to(device) - @staticmethod - def device(tensor: 'torch.Tensor') -> Optional[str]: + @classmethod + def device(cls, tensor: 'torch.Tensor') -> Optional[str]: """Return device on which the tensor is allocated.""" return str(tensor.device) - @staticmethod + @classmethod def empty( + cls, shape: Tuple[int, ...], dtype: Optional[Any] = None, device: Optional[Any] = None, @@ -64,32 +65,34 @@ def empty( return torch.empty(shape, **extra_param) - @staticmethod - def n_dim(array: 'torch.Tensor') -> int: + @classmethod + def n_dim(cls, array: 'torch.Tensor') -> int: return array.ndim - @staticmethod - def squeeze(tensor: 'torch.Tensor') -> 'torch.Tensor': + @classmethod + def squeeze(cls, tensor: 'torch.Tensor') -> 'torch.Tensor': """ Returns a tensor with all the dimensions of tensor of size 1 removed. """ return torch.squeeze(tensor) - @staticmethod - def to_numpy(array: 'torch.Tensor') -> 'np.ndarray': + @classmethod + def to_numpy(cls, array: 'torch.Tensor') -> 'np.ndarray': return array.cpu().detach().numpy() - @staticmethod - def none_value() -> Any: + @classmethod + def none_value( + cls, + ) -> Any: """Provide a compatible value that represents None in torch.""" return torch.tensor(float('nan')) - @staticmethod - def shape(tensor: 'torch.Tensor') -> Tuple[int, ...]: + @classmethod + def shape(cls, tensor: 'torch.Tensor') -> Tuple[int, ...]: return tuple(tensor.shape) - @staticmethod - def reshape(tensor: 'torch.Tensor', shape: Tuple[int, ...]) -> 'torch.Tensor': + @classmethod + def reshape(cls, tensor: 'torch.Tensor', shape: Tuple[int, ...]) -> 'torch.Tensor': """ Gives a new shape to tensor without changing its data. @@ -101,8 +104,8 @@ def reshape(tensor: 'torch.Tensor', shape: Tuple[int, ...]) -> 'torch.Tensor': """ return tensor.reshape(shape) - @staticmethod - def detach(tensor: 'torch.Tensor') -> 'torch.Tensor': + @classmethod + def detach(cls, tensor: 'torch.Tensor') -> 'torch.Tensor': """ Returns the tensor detached from its current graph. @@ -111,18 +114,19 @@ def detach(tensor: 'torch.Tensor') -> 'torch.Tensor': """ return tensor.detach() - @staticmethod - def dtype(tensor: 'torch.Tensor') -> torch.dtype: + @classmethod + def dtype(cls, tensor: 'torch.Tensor') -> torch.dtype: """Get the data type of the tensor.""" return tensor.dtype - @staticmethod - def isnan(tensor: 'torch.Tensor') -> 'torch.Tensor': + @classmethod + def isnan(cls, tensor: 'torch.Tensor') -> 'torch.Tensor': """Check element-wise for nan and return result as a boolean array""" return torch.isnan(tensor) - @staticmethod + @classmethod def minmax_normalize( + cls, tensor: 'torch.Tensor', t_range: Tuple = (0, 1), x_range: Optional[Tuple] = None, @@ -234,7 +238,7 @@ def cosine_sim( a_norm = x_mat / torch.clamp(a_n, min=eps) b_norm = y_mat / torch.clamp(b_n, min=eps) sims = torch.mm(a_norm, b_norm.transpose(0, 1)).squeeze() - return _usqueeze_if_scalar(sims) + return _unsqueeze_if_scalar(sims) @staticmethod def euclidean_dist( @@ -261,7 +265,7 @@ def euclidean_dist( x_mat, y_mat = _unsqueeze_if_single_axis(x_mat, y_mat) dists = torch.cdist(x_mat, y_mat).squeeze() - return _usqueeze_if_scalar(dists) + return _unsqueeze_if_scalar(dists) @staticmethod def sqeuclidean_dist( @@ -289,4 +293,4 @@ def sqeuclidean_dist( x_mat, y_mat = _unsqueeze_if_single_axis(x_mat, y_mat) - return _usqueeze_if_scalar((torch.cdist(x_mat, y_mat) ** 2).squeeze()) + return _unsqueeze_if_scalar((torch.cdist(x_mat, y_mat) ** 2).squeeze()) diff --git a/docarray/typing/__init__.py b/docarray/typing/__init__.py index c1aac2b1f1f..29292965509 100644 --- a/docarray/typing/__init__.py +++ b/docarray/typing/__init__.py @@ -31,6 +31,7 @@ 'AnyUrl', 'ID', 'AnyTensor', + 'TensorFlowTensor', 'NdArrayEmbedding', 'ImageBytes', 'ImageTensor', @@ -56,3 +57,12 @@ 'ImageTorchTensor', ] ) + +try: + import tensorflow as tf # type: ignore # noqa: F401 +except (ImportError, TypeError): + pass +else: + from docarray.typing.tensor import TensorFlowTensor # noqa: F401 + + __all__.extend(['TensorFlowTensor']) diff --git a/docarray/typing/proto_register.py b/docarray/typing/proto_register.py index 4a1fe77dad9..ff7dd1038dd 100644 --- a/docarray/typing/proto_register.py +++ b/docarray/typing/proto_register.py @@ -1,13 +1,15 @@ -from typing import Callable, Dict, Type +from typing import Callable, Dict, Type, TypeVar from docarray.typing.abstract_type import AbstractType _PROTO_TYPE_NAME_TO_CLASS: Dict[str, Type[AbstractType]] = {} +T = TypeVar('T', bound='AbstractType') + def _register_proto( proto_type_name: str, -) -> Callable[[Type[AbstractType]], Type[AbstractType]]: +) -> Callable[[Type[T]], Type[T]]: """Register a new type to be used in the protobuf serialization. This will add the type key to the global registry of types key used in the proto @@ -34,7 +36,7 @@ class MyType(AbstractType): f'the key {proto_type_name} is already registered in the global registry' ) - def _register(cls: Type['AbstractType']) -> Type['AbstractType']: + def _register(cls: Type[T]) -> Type[T]: cls._proto_type_name = proto_type_name _PROTO_TYPE_NAME_TO_CLASS[proto_type_name] = cls diff --git a/docarray/typing/tensor/__init__.py b/docarray/typing/tensor/__init__.py index 98b99eff92b..fbd4e0b24ba 100644 --- a/docarray/typing/tensor/__init__.py +++ b/docarray/typing/tensor/__init__.py @@ -10,6 +10,7 @@ 'NdArrayEmbedding', 'ImageNdArray', 'ImageTensor', + 'TensorFlowTensor', ] try: @@ -22,3 +23,12 @@ from docarray.typing.tensor.torch_tensor import TorchTensor # noqa: F401 __all__.extend(['TorchEmbedding', 'TorchTensor', 'ImageTorchTensor']) + +try: + import tensorflow as tf # type: ignore # noqa: F401 +except (ImportError, TypeError): + pass +else: + from docarray.typing.tensor.tensorflow_tensor import TensorFlowTensor # noqa: F401 + + __all__.extend(['TensorFlowTensor']) diff --git a/docarray/typing/tensor/abstract_tensor.py b/docarray/typing/tensor/abstract_tensor.py index 822a580c580..c2451fa3272 100644 --- a/docarray/typing/tensor/abstract_tensor.py +++ b/docarray/typing/tensor/abstract_tensor.py @@ -69,7 +69,7 @@ def __instancecheck__(cls, instance): _cls.__unparametrizedcls__ ): # This is not None if the tensor is parametrized if ( - _cls.get_comp_backend().shape(instance) + instance.get_comp_backend().shape(instance) != _cls.__docarray_target_shape__ ): return False @@ -265,9 +265,3 @@ def _docarray_to_json_compatible(self): :return: a representation of the tensor compatible with orjson """ ... - - @property - @abc.abstractmethod - def ndim(self) -> int: - """The number of dimensions / rank of this tensor.""" - ... diff --git a/docarray/typing/tensor/ndarray.py b/docarray/typing/tensor/ndarray.py index 8e75ea72732..7c2ef023764 100644 --- a/docarray/typing/tensor/ndarray.py +++ b/docarray/typing/tensor/ndarray.py @@ -151,7 +151,7 @@ def unwrap(self) -> np.ndarray: import numpy as np t1 = NdArray.validate(np.zeros((3, 224, 224)), None, None) - # here t is a docarray TenNdArray + # here t1 is a docarray NdArray t2 = t.unwrap() # here t2 is a pure np.ndarray but t1 is still a Docarray NdArray # But both share the same underlying memory diff --git a/docarray/typing/tensor/tensor.py b/docarray/typing/tensor/tensor.py index 565cb26e5a6..9d46e08a408 100644 --- a/docarray/typing/tensor/tensor.py +++ b/docarray/typing/tensor/tensor.py @@ -4,10 +4,28 @@ try: import torch # noqa: F401 -except ImportError: - AnyTensor = Union[NdArray] # type: ignore -else: from docarray.typing.tensor.torch_tensor import TorchTensor # noqa: F401 + is_torch_available = True +except ImportError: + is_torch_available = False + +try: + import tensorflow as tf # type: ignore # noqa: F401 + + from docarray.typing.tensor.tensorflow_tensor import TensorFlowTensor # noqa: F401 + + is_tf_available = True +except (ImportError, TypeError): + is_tf_available = False + + +if is_torch_available and is_tf_available: + AnyTensor = Union[NdArray, TorchTensor, TensorFlowTensor] +elif is_torch_available: AnyTensor = Union[NdArray, TorchTensor] # type: ignore +elif is_tf_available: + AnyTensor = Union[NdArray, TensorFlowTensor] # type: ignore +else: + AnyTensor = Union[NdArray] # type: ignore diff --git a/docarray/typing/tensor/tensorflow_tensor.py b/docarray/typing/tensor/tensorflow_tensor.py new file mode 100644 index 00000000000..d004650b630 --- /dev/null +++ b/docarray/typing/tensor/tensorflow_tensor.py @@ -0,0 +1,292 @@ +from typing import TYPE_CHECKING, Any, Dict, Generic, Type, TypeVar, Union, cast + +import numpy as np +import tensorflow as tf # type: ignore + +from docarray.typing.proto_register import _register_proto +from docarray.typing.tensor.abstract_tensor import AbstractTensor + +if TYPE_CHECKING: + from pydantic.fields import ModelField + from pydantic import BaseConfig + from docarray.proto import NdArrayProto + from docarray.computation.tensorflow_backend import TensorFlowCompBackend + +from docarray.base_document.base_node import BaseNode + +T = TypeVar('T', bound='TensorFlowTensor') +ShapeT = TypeVar('ShapeT') + +tf_base: type = type(tf.Tensor) +node_base: type = type(BaseNode) + + +# the mypy error suppression below should not be necessary anymore once the following +# is released in mypy: https://github.com/python/mypy/pull/14135 +class metaTensorFlow( + AbstractTensor.__parametrized_meta__, # type: ignore + node_base, # type: ignore + tf_base, # type: ignore +): # type: ignore + pass + + +@_register_proto(proto_type_name='tensorflow_tensor') +class TensorFlowTensor(AbstractTensor, Generic[ShapeT], metaclass=metaTensorFlow): + """ + TensorFlowTensor class with a :attr:`~docarray.typing.TensorFlowTensor.tensor` + attribute of type :class:`tf.Tensor`, intended for use in a Document. + + This enables (de)serialization from/to protobuf and json, data validation, + and coersion from compatible types like numpy.ndarray. + + This type can also be used in a parametrized way, specifying the shape of the + tensor. + + In comparison to :class:`~docarray.typing.TorchTensor` and + :class:`~docarray.typing.NdArray`, :class:`~docarray.typing.TensorFlowTensor` is not + a subclass of :class:`tf.Tensor` (or :class:`torch.Tensor`, :class:`np.ndarray` + respectively). + Instead, the :class:`tf.Tensor` is stored in + :attr:`~docarray.typing.TensorFlowTensor.tensor`. + Therefore, to do operations on the actual tensor data you have to always access the + :attr:`~docarray.typing.TensorFlowTensor.tensor` attribute. + + EXAMPLE USAGE + + .. code-block:: python + + import tensorflow as tf + from docarray.typing import TensorFlowTensor + + + t = TensorFlowTensor(tensor=tf.zeros((224, 224))) + + # tensorflow functions + broadcasted = tf.broadcast_to(t.tensor, (3, 224, 224)) + broadcasted = tf.broadcast_to(t.unwrap(), (3, 224, 224)) + broadcasted = tf.broadcast_to(t, (3, 224, 224)) # this will fail + + # tensorflow.Tensor methods: + arr = t.tensor.numpy() + arr = t.unwrap().numpy() + arr = t.numpy() # this will fail + + The :class:`~docarray.computation.tensorflow_backend.TensorFlowBackend` however, + operates on our :class:`~docarray.typing.TensorFlowTensor` instances. + Here, you do not have to access the :attr:`~docarray.typing.TensorFlowTensor.tensor` + but can instead just hand over your :class:`~docarray.typing.TensorFlowTensor` + instance. + + .. code-block:: python + + import tensorflow as tf + from docarray.typing import TensorFlowTensor + + + zeros = TensorFlowTensor(tensor=tf.zeros((3, 224, 224))) + + comp_be = zeros.get_comp_backend() + reshaped = comp_be.reshape(zeros, (224, 224, 3)) + assert comp_be.shape(reshaped) == (224, 224, 3) + + You can use :class:`~docarray.typing.TensorFlowTensor` in a Document as follows: + + .. code-block:: python + + from docarray import BaseDocument + from docarray.typing import TensorFlowTensor + import tensorflow as tf + + + class MyDoc(BaseDocument): + tensor: TensorFlowTensor + image_tensor: TensorFlowTensor[3, 224, 224] + square_crop: TensorFlowTensor[3, 'x', 'x'] + + + # create a document with tensors + doc = MyDoc( + tensor=tf.zeros((128,)), + image_tensor=tf.zeros((3, 224, 224)), + square_crop=tf.zeros((3, 64, 64)), + ) + + # automatic shape conversion + doc = MyDoc( + tensor=tf.zeros((128,)), + image_tensor=tf.zeros((224, 224, 3)), # will reshape to (3, 224, 224) + square_crop=tf.zeros((3, 128, 128)), + ) + + # !! The following will raise an error due to shape mismatch !! + doc = MyDoc( + tensor=tf.zeros((128,)), + image_tensor=tf.zeros((224, 224)), # this will fail validation + square_crop=tf.zeros((3, 128, 64)), # this will also fail validation + ) + + """ + + __parametrized_meta__ = metaTensorFlow + + def __init__(self, tensor: tf.Tensor): + super().__init__() + self.tensor = tensor + + def __getitem__(self, item): + from docarray.computation.tensorflow_backend import TensorFlowCompBackend + + tensor = self.unwrap() + if tensor is not None: + tensor = tensor[item] + return TensorFlowCompBackend._cast_output(t=tensor) + + def __setitem__(self, index, value): + """Set a slice of this tensor's tf.Tensor""" + t = self.unwrap() + value = tf.cast(value, dtype=t.dtype) + var = tf.Variable(t) + var[index].assign(value) + self.tensor = tf.constant(var) + + def __iter__(self): + """Iterate over the elements of this tensor's tf.Tensor.""" + tensor = self.unwrap() + for i in range(len(tensor)): + yield tensor[i] + + @classmethod + def __get_validators__(cls): + # one or more validators may be yielded which will be called in the + # order to validate the input, each validator will receive as an input + # the value returned from the previous validator + yield cls.validate + + @classmethod + def validate( + cls: Type[T], + value: Union[T, np.ndarray, Any], + field: 'ModelField', + config: 'BaseConfig', + ) -> T: + if isinstance(value, TensorFlowTensor): + return cast(T, value) + elif isinstance(value, tf.Tensor): + return cls._docarray_from_native(value) + else: + try: + arr: tf.Tensor = tf.constant(value) + return cls(tensor=arr) + except Exception: + pass # handled below + raise ValueError( + f'Expected a tensorflow.Tensor compatible type, got {type(value)}' + ) + + @classmethod + def _docarray_from_native(cls: Type[T], value: Union[tf.Tensor, T]) -> T: + """ + Create a TensorFlowTensor from a tensorflow.Tensor or TensorFlowTensor + instance. + + :param value: instance of tf.Tensor or TensorFlowTensor + :return: a TensorFlowTensor + """ + if isinstance(value, TensorFlowTensor): + if cls.__unparametrizedcls__: # None if the tensor is parametrized + value.__class__ = cls.__unparametrizedcls__ + else: + value.__class__ = cls + return cast(T, value) + else: + if cls.__unparametrizedcls__: # None if the tensor is parametrized + cls_param = cls.__unparametrizedcls__ + else: + cls_param = cls + return cls_param(tensor=value) + + @staticmethod + def get_comp_backend() -> 'TensorFlowCompBackend': + """Return the computational backend of the tensor""" + from docarray.computation.tensorflow_backend import TensorFlowCompBackend + + return TensorFlowCompBackend() + + @classmethod + def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None: + # this is needed to dump to json + field_schema.update(type='string', format='tensor') + + def _docarray_to_json_compatible(self) -> np.ndarray: + """ + Convert TensorFlowTensor into a json compatible object + :return: a representation of the tensor compatible with orjson + """ + return self.unwrap().numpy() + + def to_protobuf(self) -> 'NdArrayProto': + """ + Transform self into an NdArrayProto protobuf message. + """ + from docarray.proto import NdArrayProto + + nd_proto = NdArrayProto() + + value_np = self.tensor.numpy() + nd_proto.dense.buffer = value_np.tobytes() + nd_proto.dense.ClearField('shape') + nd_proto.dense.shape.extend(list(value_np.shape)) + nd_proto.dense.dtype = value_np.dtype.str + + return nd_proto + + @classmethod + def from_protobuf(cls: Type[T], pb_msg: 'NdArrayProto') -> 'T': + """ + Read ndarray from a proto msg. + :param pb_msg: + :return: a TensorFlowTensor + """ + source = pb_msg.dense + if source.buffer: + x = np.frombuffer(bytearray(source.buffer), dtype=source.dtype) + return cls.from_ndarray(x.reshape(source.shape)) + elif len(source.shape) > 0: + return cls.from_ndarray(np.zeros(source.shape)) + else: + raise ValueError( + f'Proto message {pb_msg} cannot be cast to a TensorFlowTensor.' + ) + + @classmethod + def from_ndarray(cls: Type[T], value: np.ndarray) -> T: + """Create a TensorFlowTensor from a numpy array. + + :param value: the numpy array + :return: a TensorFlowTensor + """ + return cls._docarray_from_native(tf.convert_to_tensor(value)) + + def unwrap(self) -> tf.Tensor: + """ + Return the original tensorflow.Tensor without any memory copy. + + The original view rest intact and is still a Document TensorFlowTensor + but the return object is a pure tf.Tensor but both object share + the same memory layout. + + EXAMPLE USAGE + .. code-block:: python + from docarray.typing import TensorFlowTensor + import tensorflow as tf + + t1 = TensorFlowTensor.validate(tf.zeros((3, 224, 224)), None, None) + # here t1 is a docarray TensorFlowTensor + t2 = t.unwrap() + # here t2 is a pure tf.Tensor but t1 is still a Docarray TensorFlowTensor + + + :return: a tf.Tensor + """ + return self.tensor diff --git a/docarray/typing/tensor/torch_tensor.py b/docarray/typing/tensor/torch_tensor.py index d33bdd9d3fe..53bfb50a87c 100644 --- a/docarray/typing/tensor/torch_tensor.py +++ b/docarray/typing/tensor/torch_tensor.py @@ -10,7 +10,6 @@ if TYPE_CHECKING: from pydantic.fields import ModelField from pydantic import BaseConfig - import numpy as np from docarray.proto import NdArrayProto from docarray.computation.torch_backend import TorchCompBackend @@ -123,7 +122,7 @@ def _docarray_to_json_compatible(self) -> np.ndarray: Convert torchTensor into a json compatible object :return: a representation of the tensor compatible with orjson """ - return self.numpy() ## might need to check device later + return self.numpy() ## might need to check device later def unwrap(self) -> torch.Tensor: """ diff --git a/pyproject.toml b/pyproject.toml index f41df6db23a..2c2c2a48abe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,5 +81,6 @@ markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "internet: marks tests as requiring internet (deselect with '-m \"not internet\"')", "asyncio: marks that run async tests", - "proto: mark tests that run with proto" + "proto: mark tests that run with proto", + "tensorflow: marks test using tensorflow and proto 3" ] diff --git a/tests/integrations/typing/test_tensor.py b/tests/integrations/typing/test_tensor.py index 397bce00387..afd0095bd87 100644 --- a/tests/integrations/typing/test_tensor.py +++ b/tests/integrations/typing/test_tensor.py @@ -1,9 +1,18 @@ import numpy as np +import pytest import torch from docarray import BaseDocument from docarray.typing import AnyTensor, NdArray, TorchTensor +try: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp # type: ignore + + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + TensorFlowTensor = None + def test_set_tensor(): class MyDocument(BaseDocument): @@ -20,3 +29,15 @@ class MyDocument(BaseDocument): assert isinstance(d.tensor, TorchTensor) assert isinstance(d.tensor, torch.Tensor) assert (d.tensor == torch.zeros((3, 224, 224))).all() + + +@pytest.mark.tensorflow +def test_set_tensor(): + class MyDocument(BaseDocument): + tensor: AnyTensor + + d = MyDocument(tensor=tf.zeros((3, 224, 224))) + + assert isinstance(d.tensor, TensorFlowTensor) + assert isinstance(d.tensor.tensor, tf.Tensor) + assert tnp.allclose(d.tensor.tensor, tf.zeros((3, 224, 224))) diff --git a/tests/integrations/typing/test_tensorflow_tensor.py b/tests/integrations/typing/test_tensorflow_tensor.py new file mode 100644 index 00000000000..c82e9f63394 --- /dev/null +++ b/tests/integrations/typing/test_tensorflow_tensor.py @@ -0,0 +1,23 @@ +import pytest + +from docarray import BaseDocument + +try: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp # type: ignore + + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + + +@pytest.mark.tensorflow +def test_set_tensorflow_tensor(): + class MyDocument(BaseDocument): + t: TensorFlowTensor + + doc = MyDocument(t=tf.zeros((3, 224, 224))) + + assert isinstance(doc.t, TensorFlowTensor) + assert isinstance(doc.t.tensor, tf.Tensor) + assert tnp.allclose(doc.t.tensor, tf.zeros((3, 224, 224))) diff --git a/tests/integrations/typing/test_typing_proto.py b/tests/integrations/typing/test_typing_proto.py index 4aa88020c3f..9d5b8040ee3 100644 --- a/tests/integrations/typing/test_typing_proto.py +++ b/tests/integrations/typing/test_typing_proto.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import torch from docarray import BaseDocument @@ -15,6 +16,7 @@ ) +@pytest.mark.proto def test_proto_all_types(): class Mymmdoc(BaseDocument): tensor: NdArray @@ -45,3 +47,42 @@ class Mymmdoc(BaseDocument): assert isinstance(value, np.ndarray) or isinstance(value, torch.Tensor) else: assert isinstance(value, doc._get_field_type(field)) + + +@pytest.mark.tensorflow +def test_proto_all_types_proto3(): + import tensorflow as tf + + from docarray.typing import TensorFlowTensor + + class Mymmdoc(BaseDocument): + tensor: NdArray + torch_tensor: TorchTensor + tf_tensor: TensorFlowTensor + embedding: AnyEmbedding + any_url: AnyUrl + image_url: ImageUrl + text_url: TextUrl + mesh_url: Mesh3DUrl + point_cloud_url: PointCloud3DUrl + + doc = Mymmdoc( + tensor=np.zeros((3, 224, 224)), + torch_tensor=torch.zeros((3, 224, 224)), + tf_tensor=tf.zeros((3, 224, 224)), + embedding=np.zeros((100, 1)), + any_url='http://jina.ai', + image_url='http://jina.ai/bla.jpg', + text_url='http://jina.ai', + mesh_url='http://jina.ai/mesh.obj', + point_cloud_url='http://jina.ai/mesh.obj', + ) + + new_doc = AnyDocument.from_protobuf(doc.to_protobuf()) + + for field, value in new_doc: + if field == 'embedding': + # embedding is a Union type, not supported by isinstance + assert isinstance(value, np.ndarray) or isinstance(value, torch.Tensor) + else: + assert isinstance(value, doc._get_field_type(field)) diff --git a/tests/units/array/test_array.py b/tests/units/array/test_array.py index 49279cbd818..69b6231140b 100644 --- a/tests/units/array/test_array.py +++ b/tests/units/array/test_array.py @@ -7,6 +7,11 @@ from docarray import BaseDocument, DocumentArray from docarray.typing import NdArray, TorchTensor +try: + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + @pytest.fixture() def da(): @@ -208,12 +213,15 @@ class Mmdoc(BaseDocument): assert text == f'hello{i}' +@pytest.mark.tensorflow def test_get_bulk_attributes_union_type_nested(): class MyDoc(BaseDocument): embedding: Union[Optional[TorchTensor], Optional[NdArray]] - embedding2: Optional[Union[TorchTensor, NdArray]] + embedding2: Optional[Union[TorchTensor, NdArray, TensorFlowTensor]] embedding3: Optional[Optional[TorchTensor]] - embedding4: Union[Optional[Union[TorchTensor, NdArray]], TorchTensor] + embedding4: Union[ + Optional[Union[TorchTensor, NdArray, TensorFlowTensor]], TorchTensor + ] da = DocumentArray[MyDoc]( [ diff --git a/tests/units/array/test_array_stacked_tf.py b/tests/units/array/test_array_stacked_tf.py new file mode 100644 index 00000000000..b7c4570d31c --- /dev/null +++ b/tests/units/array/test_array_stacked_tf.py @@ -0,0 +1,276 @@ +from typing import Optional, Union + +import pytest + +from docarray import BaseDocument, DocumentArray +from docarray.array import DocumentArrayStacked +from docarray.typing import AnyTensor, NdArray + +try: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp + + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + + +@pytest.fixture() +def batch(): + class Image(BaseDocument): + tensor: TensorFlowTensor[3, 224, 224] + + import tensorflow as tf + + batch = DocumentArray[Image]( + [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] + ) + + return batch.stack() + + +@pytest.mark.tensorflow +def test_len(batch): + assert len(batch) == 10 + + +@pytest.mark.tensorflow +def test_getitem(batch): + for i in range(len(batch)): + item = batch[i] + assert isinstance(item.tensor, TensorFlowTensor) + assert tnp.allclose(item.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_get_slice(batch): + sliced = batch[0:2] + assert isinstance(sliced, DocumentArrayStacked) + assert len(sliced) == 2 + + +@pytest.mark.tensorflow +def test_iterator(batch): + for doc in batch: + assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_stack_setter(batch): + + batch.tensor = tf.ones((10, 3, 224, 224)) + + assert tnp.allclose(batch.tensor, tf.ones((10, 3, 224, 224))) + + +@pytest.mark.tensorflow +def test_set_after_stacking(batch): + class Image(BaseDocument): + tensor: TensorFlowTensor[3, 224, 224] + + batch = DocumentArray[Image]( + [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] + ) + + batch = batch.stack() + batch.tensor.tensor = tf.ones((10, 3, 224, 224)) + for i, doc in enumerate(batch): + assert tnp.allclose(doc.tensor.tensor, batch.tensor.tensor[i]) + + +@pytest.mark.tensorflow +def test_stack_optional(batch): + + assert tnp.allclose( + batch._tensor_columns['tensor'].tensor, tf.zeros((10, 3, 224, 224)) + ) + assert tnp.allclose(batch.tensor.tensor, tf.zeros((10, 3, 224, 224))) + + +@pytest.mark.tensorflow +def test_stack_mod_nested_document(): + class Image(BaseDocument): + tensor: TensorFlowTensor[3, 224, 224] + + class MMdoc(BaseDocument): + img: Image + + batch = DocumentArray[MMdoc]( + [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] + ) + + batch = batch.stack() + + assert tnp.allclose( + batch._doc_columns['img']._tensor_columns['tensor'].tensor, + tf.zeros((10, 3, 224, 224)), + ) + + assert tnp.allclose(batch.img.tensor.tensor, tf.zeros((10, 3, 224, 224))) + + +@pytest.mark.tensorflow +def test_convert_to_da(batch): + da = batch.unstack() + + for doc in da: + assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_unstack_nested_document(): + class Image(BaseDocument): + tensor: TensorFlowTensor[3, 224, 224] + + class MMdoc(BaseDocument): + img: Image + + batch = DocumentArray[MMdoc]( + [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] + ) + + batch = batch.stack() + da = batch.unstack() + + for doc in da: + assert tnp.allclose(doc.img.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_stack_call(): + class Image(BaseDocument): + tensor: TensorFlowTensor[3, 224, 224] + + da = DocumentArray[Image]( + [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] + ) + + da = da.stack() + + assert len(da) == 10 + + assert da.tensor.tensor.shape == (10, 3, 224, 224) + + +@pytest.mark.tensorflow +def test_context_manager(): + class Image(BaseDocument): + tensor: TensorFlowTensor[3, 224, 224] + + da = DocumentArray[Image]( + [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] + ) + + with da.stacked_mode() as da: + assert len(da) == 10 + + assert da.tensor.tensor.shape == ((10, 3, 224, 224)) + + da.tensor = tf.ones((10, 3, 224, 224)) + + tensor = da.tensor + + assert isinstance(tensor, list) + for doc in da: + assert tnp.allclose(doc.tensor.tensor, tf.ones((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_stack_union(): + class Image(BaseDocument): + tensor: Union[NdArray[3, 224, 224], TensorFlowTensor[3, 224, 224]] + + batch = DocumentArray[Image]( + [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] + ) + batch[3].tensor = tf.zeros((3, 224, 224)) + + # union fields aren't actually stacked + # just checking that there is no error + batch.stack() + + +@pytest.mark.tensorflow +def test_any_tensor_with_tf(): + tensor = tf.zeros((3, 224, 224)) + + class Image(BaseDocument): + tensor: AnyTensor + + da = DocumentArray[Image]( + [Image(tensor=tensor) for _ in range(10)], + tensor_type=TensorFlowTensor, + ).stack() + + for i in range(len(da)): + assert tnp.allclose(da[i].tensor.tensor, tensor) + + assert 'tensor' in da._tensor_columns.keys() + assert isinstance(da._tensor_columns['tensor'], TensorFlowTensor) + + +@pytest.mark.tensorflow +def test_any_tensor_with_optional(): + tensor = tf.zeros((3, 224, 224)) + + class Image(BaseDocument): + tensor: Optional[AnyTensor] + + class TopDoc(BaseDocument): + img: Image + + da = DocumentArray[TopDoc]( + [TopDoc(img=Image(tensor=tensor)) for _ in range(10)], + tensor_type=TensorFlowTensor, + ).stack() + + for i in range(len(da)): + assert tnp.allclose(da.img[i].tensor.tensor, tensor) + + assert 'tensor' in da.img._tensor_columns.keys() + assert isinstance(da.img._tensor_columns['tensor'], TensorFlowTensor) + assert isinstance(da.img._tensor_columns['tensor'].tensor, tf.Tensor) + + +@pytest.mark.tensorflow +def test_get_from_slice_stacked(): + class Doc(BaseDocument): + text: str + tensor: TensorFlowTensor + + da = DocumentArray[Doc]( + [Doc(text=f'hello{i}', tensor=tf.zeros((3, 224, 224))) for i in range(10)] + ).stack() + + da_sliced = da[0:10:2] + assert isinstance(da_sliced, DocumentArrayStacked) + + tensors = da_sliced.tensor.tensor + assert tensors.shape == (5, 3, 224, 224) + + +@pytest.mark.tensorflow +def test_stack_none(): + class MyDoc(BaseDocument): + tensor: Optional[AnyTensor] + + da = DocumentArray[MyDoc]( + [MyDoc(tensor=None) for _ in range(10)], tensor_type=TensorFlowTensor + ).stack() + + assert 'tensor' in da._tensor_columns.keys() + + +@pytest.mark.tensorflow +def test_keep_dtype_tf(): + class MyDoc(BaseDocument): + tensor: TensorFlowTensor + + da = DocumentArray[MyDoc]( + [MyDoc(tensor=tf.zeros([2, 4], dtype=tf.int32)) for _ in range(3)] + ) + assert da[0].tensor.tensor.dtype == tf.int32 + + da = da.stack() + assert da[0].tensor.tensor.dtype == tf.int32 + assert da.tensor.tensor.dtype == tf.int32 diff --git a/tests/units/computation_backends/numpy_backend/test_basics.py b/tests/units/computation_backends/numpy_backend/test_basics.py index 29cebb0d22b..fde525a459f 100644 --- a/tests/units/computation_backends/numpy_backend/test_basics.py +++ b/tests/units/computation_backends/numpy_backend/test_basics.py @@ -1,7 +1,9 @@ import numpy as np import pytest +from pydantic import parse_obj_as from docarray.computation.numpy_backend import NumpyCompBackend +from docarray.typing import NdArray def test_to_device(): @@ -87,3 +89,16 @@ def test_minmax_normalize(array, t_range, x_range, result): tensor=array, t_range=t_range, x_range=x_range ) assert np.allclose(output, result) + + +def test_stack(): + t0 = parse_obj_as(NdArray, np.zeros((3, 224, 224))) + t1 = parse_obj_as(NdArray, np.ones((3, 224, 224))) + + stacked1 = NumpyCompBackend.stack([t0, t1], dim=0) + assert isinstance(stacked1, np.ndarray) + assert stacked1.shape == (2, 3, 224, 224) + + stacked2 = NumpyCompBackend.stack([t0, t1], dim=-1) + assert isinstance(stacked2, np.ndarray) + assert stacked2.shape == (3, 224, 224, 2) diff --git a/tests/units/computation_backends/tensorflow_backend/__init__.py b/tests/units/computation_backends/tensorflow_backend/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/units/computation_backends/tensorflow_backend/test_basics.py b/tests/units/computation_backends/tensorflow_backend/test_basics.py new file mode 100644 index 00000000000..7c896749647 --- /dev/null +++ b/tests/units/computation_backends/tensorflow_backend/test_basics.py @@ -0,0 +1,135 @@ +import numpy as np +import pytest + +try: + import tensorflow as tf + + from docarray.computation.tensorflow_backend import TensorFlowCompBackend + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + + +@pytest.mark.tensorflow +@pytest.mark.parametrize( + 'shape,result', + [ + ((5), 1), + ((1, 5), 2), + ((5, 5), 2), + ((), 0), + ], +) +def test_n_dim(shape, result): + array = TensorFlowTensor(tf.zeros(shape)) + assert TensorFlowCompBackend.n_dim(array) == result + + +@pytest.mark.tensorflow +@pytest.mark.parametrize( + 'shape,result', + [ + ((10,), (10,)), + ((5, 5), (5, 5)), + ((), ()), + ], +) +def test_shape(shape, result): + array = TensorFlowTensor(tf.zeros(shape)) + shape = TensorFlowCompBackend.shape(array) + assert shape == result + assert type(shape) == tuple + + +@pytest.mark.tensorflow +def test_to_device(): + array = TensorFlowTensor(tf.constant([1, 2, 3])) + array = TensorFlowCompBackend.to_device(array, 'CPU:0') + assert array.tensor.device.endswith('CPU:0') + + +@pytest.mark.tensorflow +@pytest.mark.parametrize('dtype', ['int64', 'float64', 'int8', 'double']) +def test_dtype(dtype): + array = TensorFlowTensor(tf.constant([1, 2, 3], dtype=getattr(tf, dtype))) + assert TensorFlowCompBackend.dtype(array) == dtype + + +@pytest.mark.tensorflow +def test_empty(): + array = TensorFlowCompBackend.empty((10, 3)) + assert array.tensor.shape == (10, 3) + + +@pytest.mark.tensorflow +def test_empty_dtype(): + tf_tensor = TensorFlowCompBackend.empty((10, 3), dtype=tf.int32) + assert tf_tensor.tensor.shape == (10, 3) + assert tf_tensor.tensor.dtype == tf.int32 + + +@pytest.mark.tensorflow +def test_empty_device(): + tensor = TensorFlowCompBackend.empty((10, 3), device='CPU:0') + assert tensor.tensor.shape == (10, 3) + assert tensor.tensor.device.endswith('CPU:0') + + +@pytest.mark.tensorflow +def test_squeeze(): + tensor = TensorFlowTensor(tf.zeros(shape=(1, 1, 3, 1))) + squeezed = TensorFlowCompBackend.squeeze(tensor) + assert squeezed.tensor.shape == (3,) + + +@pytest.mark.tensorflow +@pytest.mark.parametrize( + 'data_input,t_range,x_range,data_result', + [ + ( + [0, 1, 2, 3, 4, 5], + (0, 10), + None, + [0, 2, 4, 6, 8, 10], + ), + ( + [0, 1, 2, 3, 4, 5], + (0, 10), + (0, 10), + [0, 1, 2, 3, 4, 5], + ), + ( + [[0.0, 1.0], [0.0, 1.0]], + (0, 10), + None, + [[0.0, 10.0], [0.0, 10.0]], + ), + ], +) +def test_minmax_normalize(data_input, t_range, x_range, data_result): + array = TensorFlowTensor(tf.constant(data_input)) + output = TensorFlowCompBackend.minmax_normalize( + tensor=array, t_range=t_range, x_range=x_range + ) + assert np.allclose(output.tensor, tf.constant(data_result)) + + +@pytest.mark.tensorflow +def test_reshape(): + tensor = TensorFlowTensor(tf.zeros((3, 224, 224))) + reshaped = TensorFlowCompBackend.reshape(tensor, (224, 224, 3)) + assert reshaped.tensor.shape == (224, 224, 3) + + +@pytest.mark.tensorflow +def test_stack(): + t0 = TensorFlowTensor(tf.zeros((3, 224, 224))) + t1 = TensorFlowTensor(tf.ones((3, 224, 224))) + + stacked1 = TensorFlowCompBackend.stack([t0, t1], dim=0) + assert isinstance(stacked1, TensorFlowTensor) + assert stacked1.tensor.shape == (2, 3, 224, 224) + + stacked2 = TensorFlowCompBackend.stack([t0, t1], dim=-1) + assert isinstance(stacked2, TensorFlowTensor) + assert stacked2.tensor.shape == (3, 224, 224, 2) diff --git a/tests/units/computation_backends/tensorflow_backend/test_metrics.py b/tests/units/computation_backends/tensorflow_backend/test_metrics.py new file mode 100644 index 00000000000..354b61612e5 --- /dev/null +++ b/tests/units/computation_backends/tensorflow_backend/test_metrics.py @@ -0,0 +1,79 @@ +import pytest + +try: + import tensorflow as tf + + from docarray.computation.tensorflow_backend import TensorFlowCompBackend + from docarray.typing import TensorFlowTensor + + metrics = TensorFlowCompBackend.Metrics +except (ImportError, TypeError): + metrics = None + + +@pytest.mark.tensorflow +def test_cosine_sim_tf(): + a = TensorFlowTensor(tf.random.normal((128,))) + b = TensorFlowTensor(tf.random.normal((128,))) + assert metrics.cosine_sim(a, b).tensor.shape == (1,) + assert metrics.cosine_sim(a, b).tensor == metrics.cosine_sim(b, a).tensor + tf.experimental.numpy.allclose(metrics.cosine_sim(a, a).tensor, tf.ones(1)) + + a = TensorFlowTensor(tf.random.normal((10, 3))) + b = TensorFlowTensor(tf.random.normal((5, 3))) + assert metrics.cosine_sim(a, b).tensor.shape == (10, 5) + assert metrics.cosine_sim(b, a).tensor.shape == (5, 10) + diag_dists = tf.linalg.diag(metrics.cosine_sim(b, b).tensor) # self-comparisons + tf.experimental.numpy.allclose(diag_dists, tf.ones(5)) + + +@pytest.mark.tensorflow +def test_euclidean_dist_tf(): + a = TensorFlowTensor(tf.random.normal((128,))) + b = TensorFlowTensor(tf.random.normal((128,))) + assert metrics.euclidean_dist(a, b).tensor.shape == (1,) + assert metrics.euclidean_dist(a, b).tensor == metrics.euclidean_dist(b, a).tensor + tf.experimental.numpy.allclose(metrics.euclidean_dist(a, a).tensor, tf.zeros(1)) + + a = TensorFlowTensor(tf.zeros((1, 1))) + b = TensorFlowTensor(tf.ones((4, 1))) + assert metrics.euclidean_dist(a, b).tensor.shape == (4,) + tf.experimental.numpy.allclose( + metrics.euclidean_dist(a, b).tensor, metrics.euclidean_dist(b, a).tensor + ) + tf.experimental.numpy.allclose(metrics.euclidean_dist(a, a).tensor, tf.zeros(1)) + + a = TensorFlowTensor(tf.constant([0.0, 2.0, 0.0])) + b = TensorFlowTensor(tf.constant([0.0, 0.0, 2.0])) + desired_output_singleton: tf.Tensor = tf.math.sqrt( + tf.constant([2.0**2.0 + 2.0**2.0]) + ) + tf.experimental.numpy.allclose( + metrics.euclidean_dist(a, b).tensor, desired_output_singleton + ) + + a = TensorFlowTensor(tf.constant([[0.0, 2.0, 0.0], [0.0, 0.0, 2.0]])) + b = TensorFlowTensor(tf.constant([[0.0, 0.0, 2.0], [0.0, 2.0, 0.0]])) + desired_output_singleton = tf.constant([[2.828427, 0.0], [0.0, 2.828427]]) + tf.experimental.numpy.allclose( + metrics.euclidean_dist(a, b).tensor, desired_output_singleton + ) + + +@pytest.mark.tensorflow +def test_sqeuclidean_dist_torch(): + a = TensorFlowTensor(tf.random.normal((128,))) + b = TensorFlowTensor(tf.random.normal((128,))) + assert metrics.sqeuclidean_dist(a, b).tensor.shape == (1,) + tf.experimental.numpy.allclose( + metrics.sqeuclidean_dist(a, b).tensor, + metrics.euclidean_dist(a, b).tensor ** 2, + ) + + a = TensorFlowTensor(tf.random.normal((1, 1))) + b = TensorFlowTensor(tf.random.normal((4, 1))) + assert metrics.sqeuclidean_dist(b, a).tensor.shape == (4,) + tf.experimental.numpy.allclose( + metrics.sqeuclidean_dist(a, b).tensor, + metrics.euclidean_dist(a, b).tensor ** 2, + ) diff --git a/tests/units/computation_backends/tensorflow_backend/test_retrieval.py b/tests/units/computation_backends/tensorflow_backend/test_retrieval.py new file mode 100644 index 00000000000..0eb789b9e1d --- /dev/null +++ b/tests/units/computation_backends/tensorflow_backend/test_retrieval.py @@ -0,0 +1,62 @@ +import pytest + +try: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp + + from docarray.computation.tensorflow_backend import TensorFlowCompBackend + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + + +@pytest.mark.tensorflow +def test_top_k_descending_false(): + top_k = TensorFlowCompBackend.Retrieval.top_k + + a = TensorFlowTensor(tf.constant([1, 4, 2, 7, 4, 9, 2])) + vals, indices = top_k(a, 3, descending=False) + + assert vals.tensor.shape == (1, 3) + assert indices.tensor.shape == (1, 3) + assert tnp.allclose(tnp.squeeze(vals.tensor), tf.constant([1, 2, 2])) + assert tnp.allclose(tnp.squeeze(indices.tensor), tf.constant([0, 2, 6])) or ( + tnp.allclose(tnp.squeeze.indices.tensor), + tf.constant([0, 6, 2]), + ) + + a = TensorFlowTensor(tf.constant([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]])) + vals, indices = top_k(a, 3, descending=False) + assert vals.tensor.shape == (2, 3) + assert indices.tensor.shape == (2, 3) + assert tnp.allclose(vals.tensor[0], tf.constant([1, 2, 2])) + assert tnp.allclose(indices.tensor[0], tf.constant([0, 2, 6])) or tnp.allclose( + indices.tensor[0], tf.constant([0, 6, 2]) + ) + assert tnp.allclose(vals.tensor[1], tf.constant([2, 3, 4])) + assert tnp.allclose(indices.tensor[1], tf.constant([2, 4, 6])) + + +@pytest.mark.tensorflow +def test_top_k_descending_true(): + top_k = TensorFlowCompBackend.Retrieval.top_k + + a = TensorFlowTensor(tf.constant([1, 4, 2, 7, 4, 9, 2])) + vals, indices = top_k(a, 3, descending=True) + + assert vals.tensor.shape == (1, 3) + assert indices.tensor.shape == (1, 3) + assert tnp.allclose(tnp.squeeze(vals.tensor), tf.constant([9, 7, 4])) + assert tnp.allclose(tnp.squeeze(indices.tensor), tf.constant([5, 3, 1])) + + a = TensorFlowTensor(tf.constant([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]])) + vals, indices = top_k(a, 3, descending=True) + + assert vals.tensor.shape == (2, 3) + assert indices.tensor.shape == (2, 3) + + assert tnp.allclose(vals.tensor[0], tf.constant([9, 7, 4])) + assert tnp.allclose(indices.tensor[0], tf.constant([5, 3, 1])) + + assert tnp.allclose(vals.tensor[1], tf.constant([11, 10, 7])) + assert tnp.allclose(indices.tensor[1], tf.constant([0, 5, 3])) diff --git a/tests/units/document/proto/test_proto_based_object.py b/tests/units/document/proto/test_proto_based_object.py index 6d2b7a79b7b..ecec88fb6e6 100644 --- a/tests/units/document/proto/test_proto_based_object.py +++ b/tests/units/document/proto/test_proto_based_object.py @@ -1,9 +1,11 @@ import numpy as np +import pytest from docarray.proto import DocumentProto, NodeProto from docarray.typing import NdArray +@pytest.mark.proto def test_ndarray(): original_ndarray = np.zeros((3, 224, 224)) @@ -15,6 +17,7 @@ def test_ndarray(): assert (tensor == original_ndarray).all() +@pytest.mark.proto def test_document_proto_set(): data = {} diff --git a/tests/units/typing/tensor/test_audio_tensor.py b/tests/units/typing/tensor/test_audio_tensor.py index 9a2bf64fa69..341c226b35d 100644 --- a/tests/units/typing/tensor/test_audio_tensor.py +++ b/tests/units/typing/tensor/test_audio_tensor.py @@ -54,6 +54,7 @@ def test_illegal_validation(cls_tensor, tensor): parse_obj_as(cls_tensor, tensor) +@pytest.mark.proto @pytest.mark.parametrize( 'cls_tensor,tensor,proto_key', [ diff --git a/tests/units/typing/tensor/test_cross_backend.py b/tests/units/typing/tensor/test_cross_backend.py index b2e0ba6ec53..702cd678d6f 100644 --- a/tests/units/typing/tensor/test_cross_backend.py +++ b/tests/units/typing/tensor/test_cross_backend.py @@ -1,14 +1,29 @@ import numpy as np +import pytest from pydantic import parse_obj_as from docarray.typing import NdArray, TorchTensor +try: + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + +@pytest.mark.tensorflow def test_coercion_behavior(): t_np = parse_obj_as(NdArray[128], np.zeros(128)) t_th = parse_obj_as(TorchTensor[128], np.zeros(128)) + t_tf = parse_obj_as(TensorFlowTensor[128], np.zeros(128)) assert isinstance(t_np, NdArray[128]) + assert not isinstance(t_np, TensorFlowTensor[128]) assert not isinstance(t_np, TorchTensor[128]) + assert isinstance(t_th, TorchTensor[128]) assert not isinstance(t_th, NdArray[128]) + assert not isinstance(t_th, TensorFlowTensor[128]) + + assert isinstance(t_tf, TensorFlowTensor[128]) + assert not isinstance(t_tf, TorchTensor[128]) + assert not isinstance(t_tf, NdArray[128]) diff --git a/tests/units/typing/tensor/test_embedding.py b/tests/units/typing/tensor/test_embedding.py index 447351c543c..f7eebcf4caf 100644 --- a/tests/units/typing/tensor/test_embedding.py +++ b/tests/units/typing/tensor/test_embedding.py @@ -1,10 +1,12 @@ import numpy as np +import pytest from pydantic.tools import parse_obj_as, schema_json_of from docarray.base_document.io.json import orjson_dumps from docarray.typing import AnyEmbedding +@pytest.mark.proto def test_proto_embedding(): embedding = parse_obj_as(AnyEmbedding, np.zeros((3, 224, 224))) diff --git a/tests/units/typing/tensor/test_tensor.py b/tests/units/typing/tensor/test_tensor.py index c7b72d77303..1d36a4c355e 100644 --- a/tests/units/typing/tensor/test_tensor.py +++ b/tests/units/typing/tensor/test_tensor.py @@ -8,6 +8,7 @@ from docarray.typing.tensor import NdArrayEmbedding +@pytest.mark.proto def test_proto_tensor(): tensor = parse_obj_as(NdArray, np.zeros((3, 224, 224))) diff --git a/tests/units/typing/tensor/test_tensor_flow_tensor.py b/tests/units/typing/tensor/test_tensor_flow_tensor.py new file mode 100644 index 00000000000..e1727acabb9 --- /dev/null +++ b/tests/units/typing/tensor/test_tensor_flow_tensor.py @@ -0,0 +1,178 @@ +import numpy as np +import pytest +from pydantic import schema_json_of +from pydantic.tools import parse_obj_as + +from docarray.base_document.io.json import orjson_dumps + +try: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp # type: ignore + from tensorflow.python.framework.errors_impl import InvalidArgumentError + + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + + +@pytest.mark.tensorflow +def test_proto_tensor(): + from docarray.proto.pb2.docarray_pb2 import NdArrayProto + + tensor = parse_obj_as(TensorFlowTensor, tf.zeros((3, 224, 224))) + proto = tensor.to_protobuf() + assert isinstance(proto, NdArrayProto) + + from_proto = TensorFlowTensor.from_protobuf(proto) + assert isinstance(from_proto, TensorFlowTensor) + assert tnp.allclose(tensor.tensor, from_proto.tensor) + + +@pytest.mark.tensorflow +def test_json_schema(): + schema_json_of(TensorFlowTensor) + + +@pytest.mark.tensorflow +def test_dump_json(): + tensor = parse_obj_as(TensorFlowTensor, tf.zeros((3, 224, 224))) + orjson_dumps(tensor) + + +@pytest.mark.tensorflow +def test_unwrap(): + tf_tensor = parse_obj_as(TensorFlowTensor, tf.zeros((3, 224, 224))) + unwrapped = tf_tensor.unwrap() + + assert not isinstance(unwrapped, TensorFlowTensor) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(unwrapped, tf.Tensor) + + assert np.allclose(unwrapped, np.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_from_ndarray(): + nd = np.array([1, 2, 3]) + tensor = TensorFlowTensor.from_ndarray(nd) + assert isinstance(tensor, TensorFlowTensor) + assert isinstance(tensor.tensor, tf.Tensor) + + +@pytest.mark.tensorflow +def test_parametrized(): + # correct shape, single axis + tf_tensor = parse_obj_as(TensorFlowTensor[128], tf.zeros(128)) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (128,) + + # correct shape, multiple axis + tf_tensor = parse_obj_as(TensorFlowTensor[3, 224, 224], tf.zeros((3, 224, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + # wrong but reshapable shape + tf_tensor = parse_obj_as(TensorFlowTensor[3, 224, 224], tf.zeros((224, 3, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + # wrong and not reshapable shape + with pytest.raises(InvalidArgumentError): + parse_obj_as(TensorFlowTensor[3, 224, 224], tf.zeros((224, 224))) + + +@pytest.mark.tensorflow +def test_parametrized_with_str(): + # test independent variable dimensions + tf_tensor = parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((3, 224, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + tf_tensor = parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((3, 60, 128))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 60, 128) + + with pytest.raises(ValueError): + parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((4, 224, 224))) + + with pytest.raises(ValueError): + parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((100, 1))) + + # test dependent variable dimensions + tf_tensor = parse_obj_as(TensorFlowTensor[3, 'x', 'x'], tf.zeros((3, 224, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + with pytest.raises(ValueError): + _ = parse_obj_as(TensorFlowTensor[3, 'x', 'x'], tf.zeros((3, 60, 128))) + + with pytest.raises(ValueError): + _ = parse_obj_as(TensorFlowTensor[3, 'x', 'x'], tf.zeros((3, 60))) + + +@pytest.mark.tensorflow +@pytest.mark.parametrize('shape', [(3, 224, 224), (224, 224, 3)]) +def test_parameterized_tensor_class_name(shape): + MyTFT = TensorFlowTensor[3, 224, 224] + tensor = parse_obj_as(MyTFT, tf.zeros(shape)) + + assert MyTFT.__name__ == 'TensorFlowTensor[3, 224, 224]' + assert MyTFT.__qualname__ == 'TensorFlowTensor[3, 224, 224]' + + assert tensor.__class__.__name__ == 'TensorFlowTensor' + assert tensor.__class__.__qualname__ == 'TensorFlowTensor' + assert f'{tensor.tensor[0][0][0]}' == '0.0' + + +@pytest.mark.tensorflow +def test_parametrized_subclass(): + c1 = TensorFlowTensor[128] + c2 = TensorFlowTensor[128] + assert issubclass(c1, c2) + assert issubclass(c1, TensorFlowTensor) + + assert not issubclass(c1, TensorFlowTensor[256]) + + +@pytest.mark.tensorflow +def test_parametrized_instance(): + t = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + assert isinstance(t, TensorFlowTensor[128]) + assert isinstance(t, TensorFlowTensor) + # assert isinstance(t, tf.Tensor) + + assert not isinstance(t, TensorFlowTensor[256]) + assert not isinstance(t, TensorFlowTensor[2, 128]) + assert not isinstance(t, TensorFlowTensor[2, 2, 64]) + + +@pytest.mark.tensorflow +def test_parametrized_equality(): + t1 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + t2 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + assert tf.experimental.numpy.allclose(t1.tensor, t2.tensor) + + +@pytest.mark.tensorflow +def test_parametrized_operations(): + t1 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + t2 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + t_result = t1.tensor + t2.tensor + assert isinstance(t_result, tf.Tensor) + assert not isinstance(t_result, TensorFlowTensor) + assert not isinstance(t_result, TensorFlowTensor[128]) + + +@pytest.mark.tensorflow +def test_set_item(): + t = TensorFlowTensor(tensor=tf.zeros((3, 224, 224))) + t[0] = tf.ones((1, 224, 224)) + assert tnp.allclose(t.tensor[0], tf.ones((1, 224, 224))) + assert tnp.allclose(t.tensor[1], tf.zeros((1, 224, 224))) + assert tnp.allclose(t.tensor[2], tf.zeros((1, 224, 224))) diff --git a/tests/units/typing/tensor/test_torch_tensor.py b/tests/units/typing/tensor/test_torch_tensor.py index b2b836a5a05..e56adef11ca 100644 --- a/tests/units/typing/tensor/test_torch_tensor.py +++ b/tests/units/typing/tensor/test_torch_tensor.py @@ -6,6 +6,7 @@ from docarray.typing import TorchEmbedding, TorchTensor +@pytest.mark.proto def test_proto_tensor(): tensor = parse_obj_as(TorchTensor, torch.zeros(3, 224, 224)) diff --git a/tests/units/typing/url/test_image_url.py b/tests/units/typing/url/test_image_url.py index 63ade56c3a4..cc95a074c3c 100644 --- a/tests/units/typing/url/test_image_url.py +++ b/tests/units/typing/url/test_image_url.py @@ -32,6 +32,7 @@ def test_image_url(): assert isinstance(tensor, np.ndarray) +@pytest.mark.proto def test_proto_image_url(): uri = parse_obj_as(ImageUrl, REMOTE_JPG) diff --git a/tests/units/typing/url/test_mesh_url.py b/tests/units/typing/url/test_mesh_url.py index 534a052929a..83297cde56d 100644 --- a/tests/units/typing/url/test_mesh_url.py +++ b/tests/units/typing/url/test_mesh_url.py @@ -85,6 +85,7 @@ def test_validation(file_format, path_to_file): assert isinstance(url, str) +@pytest.mark.proto def test_proto_mesh_url(): uri = parse_obj_as(Mesh3DUrl, REMOTE_OBJ_FILE) uri._to_node_protobuf() diff --git a/tests/units/typing/url/test_point_cloud_url.py b/tests/units/typing/url/test_point_cloud_url.py index 5d36686901a..f209a62afb9 100644 --- a/tests/units/typing/url/test_point_cloud_url.py +++ b/tests/units/typing/url/test_point_cloud_url.py @@ -88,6 +88,7 @@ def test_validation(file_format, path_to_file): assert isinstance(url, str) +@pytest.mark.proto def test_proto_point_cloud_url(): uri = parse_obj_as(PointCloud3DUrl, REMOTE_OBJ_FILE) uri._to_node_protobuf() diff --git a/tests/units/util/test_typing.py b/tests/units/util/test_typing.py index bcbed3fd9b1..66eb135d7ff 100644 --- a/tests/units/util/test_typing.py +++ b/tests/units/util/test_typing.py @@ -6,6 +6,11 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils._typing import is_tensor_union, is_type_tensor +try: + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + TensorFlowTensor = None + @pytest.mark.parametrize( 'type_, is_tensor', @@ -24,6 +29,18 @@ def test_is_type_tensor(type_, is_tensor): assert is_type_tensor(type_) == is_tensor +@pytest.mark.tensorflow +@pytest.mark.parametrize( + 'type_, is_tensor', + [ + (TensorFlowTensor, True), + (Optional[TensorFlowTensor], False), + ], +) +def test_is_type_tensor_with_tf(type_, is_tensor): + assert is_type_tensor(type_) == is_tensor + + @pytest.mark.parametrize( 'type_, is_union_tensor', [ @@ -40,3 +57,17 @@ def test_is_type_tensor(type_, is_tensor): ) def test_is_union_type_tensor(type_, is_union_tensor): assert is_tensor_union(type_) == is_union_tensor + + +@pytest.mark.tensorflow +@pytest.mark.parametrize( + 'type_, is_union_tensor', + [ + (TensorFlowTensor, False), + (Optional[TensorFlowTensor], True), + (Union[NdArray, TorchTensor, TensorFlowTensor], True), + (Union[NdArray, TorchTensor, Optional[TensorFlowTensor]], True), + ], +) +def test_is_union_type_tensor_with_tf(type_, is_union_tensor): + assert is_tensor_union(type_) == is_union_tensor