Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docarray/array/documentarray.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import Iterable, Type

from docarray.document import AnyDocument, BaseDocument
from docarray.document import AnyDocument, BaseDocument, BaseNode
from docarray.document.abstract_document import AbstractDocument
from docarray.typing import BaseNode

from .abstract_array import AbstractDocumentArray
from .mixins import ProtoArrayMixin
Expand Down
3 changes: 2 additions & 1 deletion docarray/document/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from docarray.document.any_document import AnyDocument
from docarray.document.base_node import BaseNode
from docarray.document.document import BaseDocument

__all__ = ['AnyDocument', 'BaseDocument']
__all__ = ['AnyDocument', 'BaseDocument', 'BaseNode']
11 changes: 10 additions & 1 deletion docarray/document/abstract_document.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
from typing import Dict, Iterable
from abc import abstractmethod
from typing import TYPE_CHECKING, Dict, Iterable, Type

from pydantic.fields import ModelField

if TYPE_CHECKING:
from docarray.document.mixins.proto import ProtoMixin


class AbstractDocument(Iterable):
__fields__: Dict[str, ModelField]

@classmethod
@abstractmethod
Comment thread
JohannesMessner marked this conversation as resolved.
def _get_nested_document_class(cls, field: str) -> Type['ProtoMixin']:
Comment thread
JohannesMessner marked this conversation as resolved.
...
16 changes: 13 additions & 3 deletions docarray/document/document.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import os
from typing import Union
from uuid import UUID
from typing import Type

from pydantic import BaseModel, Field

from docarray.document.abstract_document import AbstractDocument
from docarray.document.base_node import BaseNode
from docarray.typing import ID

from .mixins import ProtoMixin

Expand All @@ -15,4 +15,14 @@ class BaseDocument(BaseModel, ProtoMixin, AbstractDocument, BaseNode):
The base class for Document
"""

id: Union[int, str, UUID] = Field(default_factory=lambda: os.urandom(16).hex())
id: ID = Field(default_factory=lambda: ID.validate(os.urandom(16).hex()))

@classmethod
def _get_nested_document_class(cls, field: str) -> Type['BaseDocument']:
    """
    Look up the Python class declared in the schema for a given field.

    This can be useful when a Document has to be rebuilt during
    serialization/deserialization.

    :param field: name of the field
    :return: the ``type_`` recorded for ``field`` in ``cls.__fields__``
    """
    model_field = cls.__fields__[field]
    return model_field.type_
33 changes: 17 additions & 16 deletions docarray/document/mixins/proto.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
from typing import Any, Dict, Type
from typing import Any, Dict

from docarray.proto import DocumentProto, NodeProto
from docarray.typing import Tensor
from pydantic.tools import parse_obj_as

from ..abstract_document import AbstractDocument
from ..base_node import BaseNode
from docarray.document.abstract_document import AbstractDocument
from docarray.document.base_node import BaseNode
from docarray.proto import DocumentProto, NodeProto
from docarray.typing import ID, AnyUrl, Embedding, ImageUrl, Tensor


class ProtoMixin(AbstractDocument, BaseNode):
@classmethod
def _get_nested_document_class(cls, field: str) -> Type['ProtoMixin']:
    """
    Look up the Python class declared in the schema for a given field.

    This can be useful when a Document has to be rebuilt during
    serialization/deserialization.

    :param field: name of the field
    :return: the ``type_`` recorded for ``field`` in ``cls.__fields__``
    """
    model_field = cls.__fields__[field]
    return model_field.type_

@classmethod
def from_protobuf(cls, pb_msg: 'DocumentProto') -> 'ProtoMixin':
"""create a Document from a protobuf message"""
Expand All @@ -30,8 +21,18 @@ def from_protobuf(cls, pb_msg: 'DocumentProto') -> 'ProtoMixin':

content_type = value.WhichOneof('content')

# this if/else chain needs to be refactored: it is too long
# the check should be delegated to the type level
if content_type == 'tensor':
fields[field] = Tensor.read_ndarray(value.tensor)
fields[field] = Tensor._read_from_proto(value.tensor)
elif content_type == 'embedding':
fields[field] = Embedding._read_from_proto(value.embedding)
elif content_type == 'any_url':
fields[field] = parse_obj_as(AnyUrl, value.any_url)
elif content_type == 'image_url':
fields[field] = parse_obj_as(ImageUrl, value.image_url)
elif content_type == 'id':
fields[field] = parse_obj_as(ID, value.id)
elif content_type == 'text':
fields[field] = value.text
elif content_type == 'nested':
Expand Down
2 changes: 1 addition & 1 deletion docarray/predefined_document/text.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Optional

from docarray.document import BaseDocument
from docarray.typing.ndarray import Embedding, Tensor
from docarray.typing.embedding import Embedding, Tensor


class Text(BaseDocument):
Expand Down
9 changes: 9 additions & 0 deletions docarray/proto/docarray.proto
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,15 @@ message NodeProto {

// a sub DocumentArray
DocumentArrayProto chunks = 5;

NdArrayProto embedding = 6;

string any_url = 7;
Comment thread
samsja marked this conversation as resolved.

string image_url = 8;

string id = 9;
Comment thread
samsja marked this conversation as resolved.

}

}
Expand Down
38 changes: 20 additions & 18 deletions docarray/proto/pb2/docarray_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions docarray/typing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from docarray.document.base_node import BaseNode
from docarray.typing.embedding import Embedding
from docarray.typing.id import ID
from docarray.typing.tensor import Tensor
from docarray.typing.url import AnyUrl, ImageUrl

from docarray.typing.ndarray import Embedding, Tensor
from docarray.typing.url import ImageUrl

__all__ = ['Tensor', 'Embedding', 'BaseNode', 'ImageUrl']
__all__ = ['Tensor', 'Embedding', 'ImageUrl', 'AnyUrl', 'ID']
18 changes: 18 additions & 0 deletions docarray/typing/embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from typing import TypeVar

from docarray.proto import NodeProto
from docarray.typing.tensor import Tensor

T = TypeVar('T', bound='Embedding')


class Embedding(Tensor):
    """Tensor subtype whose protobuf content is stored in the ``embedding`` field."""

    def _to_node_protobuf(self: T, field: str = 'tensor') -> NodeProto:
        """Convert this Embedding into a NodeProto protobuf message.

        This function should be called when the Embedding is nested into a
        Document that needs to be converted into a protobuf.

        :param field: accepted so the signature matches the parent method, but
            ignored: the content is always stored under ``'embedding'``
        :return: the nested item protobuf message
        """
        # The caller-supplied ``field`` is deliberately not forwarded.
        target_field = 'embedding'
        return super()._to_node_protobuf(field=target_field)
45 changes: 45 additions & 0 deletions docarray/typing/id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from typing import TYPE_CHECKING, Optional, Type, TypeVar, Union
from uuid import UUID

from docarray.document.base_node import BaseNode
from docarray.proto import NodeProto

if TYPE_CHECKING:
from pydantic import BaseConfig
from pydantic.fields import ModelField


T = TypeVar('T', bound='ID')


class ID(str, BaseNode):
    """
    Represent a unique ID
    """

    @classmethod
    def __get_validators__(cls):
        """Yield the validator callables for this type (pydantic hook)."""
        yield cls.validate

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[str, int, UUID],
        field: Optional['ModelField'] = None,
        config: Optional['BaseConfig'] = None,
    ) -> T:
        """Build an ID from ``value`` by converting it to a string.

        :param value: the raw identifier; it is passed through ``str()``
        :param field: not used by this validator
        :param config: not used by this validator
        :return: a new instance of ``cls`` holding ``str(value)``
        :raises ValueError: if the string conversion or construction fails;
            the original exception is attached as ``__cause__``
        """
        # NOTE(review): any object with a working ``__str__`` is accepted here,
        # not only str/int/UUID -- confirm whether stricter checking is wanted.
        try:
            return cls(str(value))
        except Exception as err:
            raise ValueError(
                f'Expected a str, int or UUID, got {type(value)}'
            ) from err

    def _to_node_protobuf(self) -> NodeProto:
        """Convert this ID into a NodeProto message.

        This function should be called when the ID is nested into a Document
        that needs to be converted into a protobuf.

        :return: the nested item protobuf message
        """
        return NodeProto(id=self)
3 changes: 0 additions & 3 deletions docarray/typing/ndarray.py

This file was deleted.

13 changes: 6 additions & 7 deletions docarray/typing/tensor/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,19 @@ def validate(
def from_ndarray(cls: Type[T], value: np.ndarray) -> T:
return value.view(cls)

def _to_node_protobuf(self: T) -> NodeProto:
def _to_node_protobuf(self: T, field: str = 'tensor') -> NodeProto:
"""Convert this Tensor into a NodeProto protobuf message. This function should
be called when the Tensor is nested into a Document that needs to be
converted into a protobuf

:param field: field in which to store the content in the node proto
:return: the nested item protobuf message
"""
nd_proto = NdArrayProto()
self.flush_ndarray(nd_proto, value=self)
NodeProto(tensor=nd_proto)
return NodeProto(tensor=nd_proto)
self._flush_tensor_to_proto(nd_proto, value=self)
return NodeProto(**{field: nd_proto})

@classmethod
def read_ndarray(cls: Type[T], pb_msg: 'NdArrayProto') -> 'T':
def _read_from_proto(cls: Type[T], pb_msg: 'NdArrayProto') -> 'T':
"""
read ndarray from a proto msg
:param pb_msg:
Expand All @@ -69,7 +68,7 @@ def read_ndarray(cls: Type[T], pb_msg: 'NdArrayProto') -> 'T':
raise ValueError(f'proto message {pb_msg} cannot be cast to a Tensor')

@staticmethod
def flush_ndarray(pb_msg: 'NdArrayProto', value: 'Tensor'):
def _flush_tensor_to_proto(pb_msg: 'NdArrayProto', value: 'Tensor'):
pb_msg.dense.buffer = value.tobytes()
pb_msg.dense.ClearField('shape')
pb_msg.dense.shape.extend(list(value.shape))
Expand Down
5 changes: 3 additions & 2 deletions docarray/typing/url/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .image_url import ImageUrl
from docarray.typing.url.any_url import AnyUrl
from docarray.typing.url.image_url import ImageUrl

__all__ = ['ImageUrl']
__all__ = ['ImageUrl', 'AnyUrl']
2 changes: 1 addition & 1 deletion docarray/typing/url/any_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ def _to_node_protobuf(self) -> NodeProto:

:return: the nested item protobuf message
"""
return NodeProto(text=str(self))
return NodeProto(any_url=str(self))
12 changes: 11 additions & 1 deletion docarray/typing/url/image_url.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
import numpy as np

from .any_url import AnyUrl
from docarray.proto import NodeProto
from docarray.typing.url.any_url import AnyUrl


class ImageUrl(AnyUrl):
def _to_node_protobuf(self) -> NodeProto:
    """Convert this ImageUrl into a NodeProto protobuf message.

    This function should be called when the ImageUrl is nested into a
    Document that needs to be converted into a protobuf.

    :return: the nested item protobuf message, with the URL string stored
        in its ``image_url`` field
    """
    url_text = str(self)
    return NodeProto(image_url=url_text)

def load(self) -> np.ndarray:
"""
transform the url in a image Tensor
Expand Down
25 changes: 25 additions & 0 deletions tests/integrations/typing/test_typing_proto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np

from docarray import Document
from docarray.document import AnyDocument
from docarray.typing import AnyUrl, Embedding, ImageUrl, Tensor


def test_proto_all_types():
    """Round-trip a document with Tensor, Embedding, AnyUrl and ImageUrl
    fields through protobuf and check the type of every restored value."""

    class Mymmdoc(Document):
        tensor: Tensor
        embedding: Embedding
        any_url: AnyUrl
        image_url: ImageUrl

    field_values = {
        'tensor': np.zeros((3, 224, 224)),
        'embedding': np.zeros((100, 1)),
        'any_url': 'http://jina.ai',
        'image_url': 'http://jina.ai',
    }
    doc = Mymmdoc(**field_values)

    # Serialize with the typed schema, then rebuild without it.
    proto = doc.to_protobuf()
    new_doc = AnyDocument.from_protobuf(proto)

    for field, value in new_doc:
        expected_cls = doc._get_nested_document_class(field)
        assert isinstance(value, expected_cls)
6 changes: 3 additions & 3 deletions tests/units/document/proto/test_proto_based_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ def test_nested_optional_item_proto():
def test_ndarray():
nd_proto = NdArrayProto()
original_tensor = np.zeros((3, 224, 224))
Tensor.flush_ndarray(nd_proto, value=original_tensor)
Tensor._flush_tensor_to_proto(nd_proto, value=original_tensor)
nested_item = NodeProto(tensor=nd_proto)
tensor = Tensor.read_ndarray(nested_item.tensor)
tensor = Tensor._read_from_proto(nested_item.tensor)

assert (tensor == original_tensor).all()

Expand All @@ -31,7 +31,7 @@ def test_document_proto_set():

nd_proto = NdArrayProto()
original_tensor = np.zeros((3, 224, 224))
Tensor.flush_ndarray(nd_proto, value=original_tensor)
Tensor._flush_tensor_to_proto(nd_proto, value=original_tensor)

nested_item2 = NodeProto(tensor=nd_proto)

Expand Down
Empty file added tests/units/typing/__init__.py
Empty file.
Loading