Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,14 @@ This follow [Pydantic Model](https://pydantic-docs.helpmanual.io/usage/models/)

It is similar to the dataclass from the (old) docarray


```python
from docarray.typing import Tensor
from docarray.typing import NdArray
Comment thread
JohannesMessner marked this conversation as resolved.
import numpy as np


class Banner(Document):
text: str
image: Tensor
image: NdArray


banner = Banner(text='DocArray is amazing', image=np.zeros((3, 224, 224)))
Expand Down
4 changes: 2 additions & 2 deletions docarray/document/mixins/proto.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
AnyUrl,
Embedding,
ImageUrl,
Tensor,
NdArray,
TextUrl,
TorchTensor,
)
Expand All @@ -31,7 +31,7 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocumentProto') -> T:
# this if/else statement needs to be refactored: it is too long
# the check should be delegated to the type level
content_type_dict = dict(
tensor=Tensor,
ndarray=NdArray,
torch_tensor=TorchTensor,
embedding=Embedding,
any_url=AnyUrl,
Expand Down
2 changes: 1 addition & 1 deletion docarray/proto/docarray.proto
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ message NodeProto {
bytes blob = 1;

// the ndarray of the image/audio/video document
NdArrayProto tensor = 2;
NdArrayProto ndarray = 2;

// a text
string text = 3;
Expand Down
16 changes: 8 additions & 8 deletions docarray/proto/pb2/docarray_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions docarray/typing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from docarray.typing.id import ID
from docarray.typing.tensor import Tensor, TorchTensor
from docarray.typing.tensor import NdArray, Tensor, TorchTensor
from docarray.typing.tensor.embedding import Embedding
from docarray.typing.url import AnyUrl, ImageUrl, TextUrl

__all__ = [
'TorchTensor',
'Tensor',
'NdArray',
'Embedding',
'ImageUrl',
'TextUrl',
'AnyUrl',
'ID',
'Tensor',
]
3 changes: 2 additions & 1 deletion docarray/typing/tensor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from docarray.typing.tensor.ndarray import NdArray
from docarray.typing.tensor.tensor import Tensor
from docarray.typing.tensor.torch_tensor import TorchTensor

__all__ = ['Tensor', 'TorchTensor']
__all__ = ['NdArray', 'TorchTensor', 'Tensor']
4 changes: 2 additions & 2 deletions docarray/typing/tensor/embedding.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from typing import TypeVar

from docarray.proto import NodeProto
from docarray.typing.tensor import Tensor
from docarray.typing.tensor import NdArray

T = TypeVar('T', bound='Embedding')


class Embedding(Tensor):
class Embedding(NdArray):
def _to_node_protobuf(self: T, field: str = 'tensor') -> NodeProto:
"""Convert Document into a NodeProto protobuf message. This function should
be called when the Document is nested into another Document that need to be
Expand Down
121 changes: 121 additions & 0 deletions docarray/typing/tensor/ndarray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union, cast

import numpy as np

from docarray.typing.abstract_type import AbstractType

if TYPE_CHECKING:
from pydantic.fields import ModelField
from pydantic import BaseConfig

from docarray.proto import NdArrayProto, NodeProto

T = TypeVar('T', bound='NdArray')


class NdArray(np.ndarray, AbstractType):
    """A ``numpy.ndarray`` subclass that can be used as a pydantic field type.

    Instances are plain views over numpy data (see :meth:`from_ndarray`); the
    class adds the pydantic validation hooks and the protobuf conversion
    helpers defined below.
    """

    @classmethod
    def __get_validators__(cls):
        # one or more validators may be yielded which will be called in the
        # order to validate the input, each validator will receive as an input
        # the value returned from the previous validator
        yield cls.validate

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[T, np.ndarray, List[Any], Tuple[Any], Any],
        field: 'ModelField',
        config: 'BaseConfig',
    ) -> T:
        """Coerce ``value`` into an instance of ``cls``.

        :param value: an instance of ``cls``, a ``np.ndarray``, a list/tuple
            convertible with ``np.asarray``, or any other object accepted by
            the ``np.ndarray`` constructor
        :param field: pydantic field being validated (not used here)
        :param config: pydantic model config (not used here)
        :return: ``value`` itself when it already is an instance of ``cls``,
            otherwise a view of the resulting array typed as ``cls``
        :raises ValueError: if ``value`` cannot be turned into an array
        """
        # This check has to come first: every NdArray is also an np.ndarray,
        # so testing for np.ndarray first would make this branch unreachable.
        if isinstance(value, cls):
            return cast(T, value)
        elif isinstance(value, np.ndarray):
            return cls.from_ndarray(value)
        elif isinstance(value, (list, tuple)):
            try:
                arr_from_list: np.ndarray = np.asarray(value)
                return cls.from_ndarray(arr_from_list)
            except Exception:
                pass  # handled below
        else:
            try:
                # NOTE(review): the np.ndarray constructor interprets its first
                # argument as a *shape* and returns an uninitialized array, so
                # e.g. an int input yields an array of that length rather than
                # a wrapped scalar. Kept as is because switching to np.asarray
                # would accept almost any object -- confirm the intent.
                arr: np.ndarray = np.ndarray(value)
                return cls.from_ndarray(arr)
            except Exception:
                pass  # handled below
        raise ValueError(f'Expected a numpy.ndarray compatible type, got {type(value)}')

    @classmethod
    def from_ndarray(cls: Type[T], value: np.ndarray) -> T:
        """Return ``value`` viewed as ``cls``.

        :param value: the numpy array to wrap
        :return: a view of ``value`` whose type is ``cls``
        """
        return value.view(cls)

    @classmethod
    def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
        # this is needed to dump to json
        field_schema.update(type='string', format='tensor')

    def _to_json_compatible(self) -> np.ndarray:
        """
        Convert tensor into a json compatible object
        :return: the tensor as a plain np.ndarray (see :meth:`unwrap`)
        """
        return self.unwrap()

    def unwrap(self) -> np.ndarray:
        """
        Return the original ndarray without any memory copy.

        The original view stays intact and is still a Document NdArray,
        while the returned object is a pure np.ndarray. Both objects share
        the same underlying memory.

        EXAMPLE USAGE

        .. code-block:: python

            from docarray.typing import NdArray
            import numpy as np

            t1 = NdArray.validate(np.zeros((3, 224, 224)), None, None)
            # here t1 is a docarray NdArray
            t2 = t1.unwrap()
            # here t2 is a pure np.ndarray but t1 is still a Docarray NdArray
            # But both share the same underlying memory


        :return: a numpy ndarray
        """
        return self.view(np.ndarray)

    def _to_node_protobuf(self: T, field: str = 'ndarray') -> NodeProto:
        """Convert itself into a NodeProto protobuf message. This function should
        be called when the Document is nested into another Document that need to be
        converted into a protobuf
        :param field: field in which to store the content in the node proto
        :return: the nested item protobuf message
        """
        nd_proto = NdArrayProto()
        self._flush_tensor_to_proto(nd_proto, value=self)
        return NodeProto(**{field: nd_proto})

    @classmethod
    def from_protobuf(cls: Type[T], pb_msg: 'NdArrayProto') -> 'T':
        """
        read ndarray from a proto msg
        :param pb_msg: the message whose ``dense`` part holds buffer, shape and dtype
        :return: a numpy array
        :raises ValueError: if the message carries neither a buffer nor a shape
        """
        source = pb_msg.dense
        if source.buffer:
            x = np.frombuffer(source.buffer, dtype=source.dtype)
            return cls.from_ndarray(x.reshape(source.shape))
        elif len(source.shape) > 0:
            # No bytes but a shape (e.g. a zero-size array): rebuild it with the
            # serialized dtype when there is one, instead of always float64.
            return cls.from_ndarray(
                np.zeros(source.shape, dtype=source.dtype or None)
            )
        else:
            raise ValueError(f'proto message {pb_msg} cannot be cast to a NdArray')

    @staticmethod
    def _flush_tensor_to_proto(pb_msg: 'NdArrayProto', value: 'NdArray'):
        """Write the bytes, shape and dtype string of ``value`` into ``pb_msg.dense``.

        :param pb_msg: the message to fill in place
        :param value: the array to serialize
        """
        pb_msg.dense.buffer = value.tobytes()
        # drop any previously stored shape before writing the new one
        pb_msg.dense.ClearField('shape')
        pb_msg.dense.shape.extend(list(value.shape))
        pb_msg.dense.dtype = value.dtype.str
123 changes: 4 additions & 119 deletions docarray/typing/tensor/tensor.py
Original file line number Diff line number Diff line change
@@ -1,121 +1,6 @@
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union, cast
from typing import Union

import numpy as np
from docarray.typing.tensor.ndarray import NdArray
from docarray.typing.tensor.torch_tensor import TorchTensor

from docarray.typing.abstract_type import AbstractType

if TYPE_CHECKING:
from pydantic.fields import ModelField
from pydantic import BaseConfig

from docarray.proto import NdArrayProto, NodeProto

T = TypeVar('T', bound='Tensor')


class Tensor(np.ndarray, AbstractType):
    """A ``numpy.ndarray`` subclass that can be used as a pydantic field type.

    Instances are plain views over numpy data (see :meth:`from_ndarray`); the
    class adds the pydantic validation hooks and the protobuf conversion
    helpers defined below.
    """

    @classmethod
    def __get_validators__(cls):
        # one or more validators may be yielded which will be called in the
        # order to validate the input, each validator will receive as an input
        # the value returned from the previous validator
        yield cls.validate

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[T, np.ndarray, List[Any], Tuple[Any], Any],
        field: 'ModelField',
        config: 'BaseConfig',
    ) -> T:
        """Coerce ``value`` into an instance of ``cls``.

        :param value: an instance of ``cls``, a ``np.ndarray``, a list/tuple
            convertible with ``np.asarray``, or any other object accepted by
            the ``np.ndarray`` constructor
        :param field: pydantic field being validated (not used here)
        :param config: pydantic model config (not used here)
        :return: ``value`` itself when it already is an instance of ``cls``,
            otherwise a view of the resulting array typed as ``cls``
        :raises ValueError: if ``value`` cannot be turned into an array
        """
        # This check has to come first: every Tensor is also an np.ndarray,
        # so testing for np.ndarray first would make this branch unreachable.
        if isinstance(value, cls):
            return cast(T, value)
        elif isinstance(value, np.ndarray):
            return cls.from_ndarray(value)
        elif isinstance(value, (list, tuple)):
            try:
                arr_from_list: np.ndarray = np.asarray(value)
                return cls.from_ndarray(arr_from_list)
            except Exception:
                pass  # handled below
        else:
            try:
                # NOTE(review): the np.ndarray constructor interprets its first
                # argument as a *shape* and returns an uninitialized array, so
                # e.g. an int input yields an array of that length rather than
                # a wrapped scalar. Kept as is because switching to np.asarray
                # would accept almost any object -- confirm the intent.
                arr: np.ndarray = np.ndarray(value)
                return cls.from_ndarray(arr)
            except Exception:
                pass  # handled below
        raise ValueError(f'Expected a numpy.ndarray compatible type, got {type(value)}')

    @classmethod
    def from_ndarray(cls: Type[T], value: np.ndarray) -> T:
        """Return ``value`` viewed as ``cls``.

        :param value: the numpy array to wrap
        :return: a view of ``value`` whose type is ``cls``
        """
        return value.view(cls)

    @classmethod
    def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
        # this is needed to dump to json
        field_schema.update(type='string', format='tensor')

    def _to_json_compatible(self) -> np.ndarray:
        """
        Convert tensor into a json compatible object
        :return: the tensor as a plain np.ndarray (see :meth:`unwrap`)
        """
        return self.unwrap()

    def unwrap(self) -> np.ndarray:
        """
        Return the original ndarray without any memory copy.

        The original view stays intact and is still a Document Tensor,
        while the returned object is a pure np.ndarray. Both objects share
        the same underlying memory.

        EXAMPLE USAGE

        .. code-block:: python

            from docarray.typing import Tensor
            import numpy as np

            t1 = Tensor.validate(np.zeros((3, 224, 224)), None, None)
            # here t1 is a docarray Tensor
            t2 = t1.unwrap()
            # here t2 is a pure np.ndarray but t1 is still a Docarray Tensor
            # But both share the same underlying memory


        :return: a numpy ndarray
        """
        return self.view(np.ndarray)

    def _to_node_protobuf(self: T, field: str = 'tensor') -> NodeProto:
        """Convert itself into a NodeProto protobuf message. This function should
        be called when the Document is nested into another Document that need to be
        converted into a protobuf
        :param field: field in which to store the content in the node proto
        :return: the nested item protobuf message
        """
        nd_proto = NdArrayProto()
        self._flush_tensor_to_proto(nd_proto, value=self)
        return NodeProto(**{field: nd_proto})

    @classmethod
    def from_protobuf(cls: Type[T], pb_msg: 'NdArrayProto') -> 'T':
        """
        read ndarray from a proto msg
        :param pb_msg: the message whose ``dense`` part holds buffer, shape and dtype
        :return: a numpy array
        :raises ValueError: if the message carries neither a buffer nor a shape
        """
        source = pb_msg.dense
        if source.buffer:
            x = np.frombuffer(source.buffer, dtype=source.dtype)
            return cls.from_ndarray(x.reshape(source.shape))
        elif len(source.shape) > 0:
            # No bytes but a shape (e.g. a zero-size array): rebuild it with the
            # serialized dtype when there is one, instead of always float64.
            return cls.from_ndarray(
                np.zeros(source.shape, dtype=source.dtype or None)
            )
        else:
            raise ValueError(f'proto message {pb_msg} cannot be cast to a Tensor')

    @staticmethod
    def _flush_tensor_to_proto(pb_msg: 'NdArrayProto', value: 'Tensor'):
        """Write the bytes, shape and dtype string of ``value`` into ``pb_msg.dense``.

        :param pb_msg: the message to fill in place
        :param value: the array to serialize
        """
        pb_msg.dense.buffer = value.tobytes()
        # drop any previously stored shape before writing the new one
        pb_msg.dense.ClearField('shape')
        pb_msg.dense.shape.extend(list(value.shape))
        pb_msg.dense.dtype = value.dtype.str
Tensor = Union[NdArray, TorchTensor]
6 changes: 3 additions & 3 deletions tests/integrations/array/test_array_proto.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import numpy as np

from docarray import DocumentArray, Document, Image, Text
from docarray.typing import Tensor
from docarray import Document, DocumentArray, Image, Text
from docarray.typing import NdArray


def test_simple_proto():
class CustomDoc(Document):
text: str
tensor: Tensor
tensor: NdArray
Comment thread
JohannesMessner marked this conversation as resolved.

da = DocumentArray(
[CustomDoc(text='hello', tensor=np.zeros((3, 224, 224))) for _ in range(10)]
Expand Down
Loading