diff --git a/docarray/array/chunk.py b/docarray/array/chunk.py index 7a3edf3b5dc..ea445702f62 100644 --- a/docarray/array/chunk.py +++ b/docarray/array/chunk.py @@ -1,4 +1,10 @@ -from typing import TYPE_CHECKING +import itertools +from typing import ( + TYPE_CHECKING, + Generator, + Iterator, + Sequence, +) from .document import DocumentArray @@ -24,6 +30,15 @@ def __init__(self, docs, reference_doc: 'Document'): """ self._ref_doc = reference_doc super().__init__(docs) + if ( + isinstance( + docs, (DocumentArray, Sequence, Generator, Iterator, itertools.chain) + ) + and self._ref_doc is not None + ): + for d in docs: + d.parent_id = self._ref_doc.id + d.granularity = self._ref_doc.granularity + 1 def append(self, document: 'Document'): """Add a sub-document (i.e chunk) to the current Document. diff --git a/docarray/array/document.py b/docarray/array/document.py index da8a29facb2..42efeded634 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -242,7 +242,7 @@ def __bool__(self): return len(self) > 0 def __repr__(self): - return f'<{typename(self)} (length={len(self)}) at {id(self)}>' + return f'<{self.__class__.__name__} (length={len(self)}) at {id(self)}>' def __add__(self, other: 'Document'): v = type(self)() diff --git a/docarray/array/match.py b/docarray/array/match.py index 1f5c171acd9..2b33828f3fb 100644 --- a/docarray/array/match.py +++ b/docarray/array/match.py @@ -1,4 +1,10 @@ -from typing import TYPE_CHECKING +import itertools +from typing import ( + TYPE_CHECKING, + Generator, + Iterator, + Sequence, +) from .. 
import DocumentArray @@ -18,13 +24,20 @@ class MatchArray(DocumentArray): def __init__(self, docs, reference_doc: 'Document'): self._ref_doc = reference_doc super().__init__(docs) + if ( + isinstance( + docs, (DocumentArray, Sequence, Generator, Iterator, itertools.chain) + ) + and self._ref_doc is not None + ): + for d in docs: + d.adjacency = self._ref_doc.adjacency + 1 def append(self, document: 'Document'): """Add a matched document to the current Document. :param document: Sub-document to be added """ - document.granularity = self._ref_doc.granularity document.adjacency = self._ref_doc.adjacency + 1 super().append(document) diff --git a/docarray/array/mixins/io/dataframe.py b/docarray/array/mixins/io/dataframe.py index 61a899c0409..65d2fd7637e 100644 --- a/docarray/array/mixins/io/dataframe.py +++ b/docarray/array/mixins/io/dataframe.py @@ -21,7 +21,7 @@ def to_dataframe(self, **kwargs) -> 'DataFrame': """ from pandas import DataFrame - return DataFrame.from_dict(self.to_list_safe(), **kwargs) + return DataFrame.from_dict(self.to_list(), **kwargs) @classmethod def from_dataframe(cls: Type['T'], df: 'DataFrame') -> 'T': diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index adbe16d1d6b..4b962632cde 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -55,12 +55,12 @@ def from_json(cls: Type['T'], file: Union[str, TextIO]) -> 'T': return cls.load_json(file) @classmethod - def from_list_safe(cls: Type['T'], values: List) -> 'T': + def from_list(cls: Type['T'], values: List) -> 'T': from .... import Document return cls(Document.from_dict(v) for v in values) - def to_list_safe(self) -> List: + def to_list(self, strict: bool = True) -> List: """Convert the object into a Python list. .. 
note:: @@ -68,11 +68,11 @@ def to_list_safe(self) -> List: :return: a Python list """ - return [d.to_dict() for d in self] + return [d.to_dict(strict=strict) for d in self] def to_json(self) -> str: """Convert the object into a JSON string. Can be loaded via :meth:`.load_json`. :return: a Python list """ - return json.dumps(self.to_list_safe()) + return json.dumps(self.to_list()) diff --git a/docarray/base.py b/docarray/base.py index 99151c13701..e6a05dce29f 100644 --- a/docarray/base.py +++ b/docarray/base.py @@ -16,6 +16,7 @@ def __init__( _obj: Optional['T'] = None, copy: bool = False, field_resolver: Optional[Dict[str, str]] = None, + unknown_fields_handler: str = 'catch', **kwargs, ): self._data = None @@ -32,23 +33,26 @@ def __init__( kwargs = {field_resolver.get(k, k): v for k, v in kwargs.items()} _unknown_kwargs = None - if hasattr(self, '_unresolved_fields_dest'): - _unresolved = set(kwargs.keys()).difference( - {f.name for f in fields(self._data_class)} - ) - if _unresolved: - _unknown_kwargs = {k: kwargs[k] for k in _unresolved} - for k in _unresolved: - kwargs.pop(k) + _unresolved = set(kwargs.keys()).difference( + {f.name for f in fields(self._data_class)} + ) + + if _unresolved: + if unknown_fields_handler == 'raise': + raise AttributeError(f'unknown attributes: {_unresolved}') + + _unknown_kwargs = {k: kwargs[k] for k in _unresolved} + for k in _unresolved: + kwargs.pop(k) self._data = self._data_class(self) for k, v in kwargs.items(): setattr(self._data, k, v) - if _unknown_kwargs: + if _unknown_kwargs and unknown_fields_handler == 'catch': getattr(self, self._unresolved_fields_dest).update(_unknown_kwargs) - if _obj is None and not kwargs: + if _obj is None and not kwargs and self._data is None: self._data = self._data_class(self) if self._data is None: @@ -100,8 +104,8 @@ def __hash__(self): def __repr__(self): content = str(self.non_empty_fields) - content += f' at {id(self)}' - return f'<{typename(self)} {content.strip()}>' + content += f' at 
{getattr(self, "id", id(self))}' + return f'<{self.__class__.__name__} {content.strip()}>' def __bytes__(self): return self.to_bytes() diff --git a/docarray/document/data.py b/docarray/document/data.py index e8f341d0966..e0a42dcfa0b 100644 --- a/docarray/document/data.py +++ b/docarray/document/data.py @@ -84,6 +84,7 @@ def __setattr__(self, key, value): self.text = value else: self.blob = value + value = None elif key == 'chunks': from ..array.chunk import ChunkArray diff --git a/docarray/document/mixins/plot.py b/docarray/document/mixins/plot.py index 3c2d72c4ccd..1fa72a5fd6f 100644 --- a/docarray/document/mixins/plot.py +++ b/docarray/document/mixins/plot.py @@ -7,66 +7,6 @@ class PlotMixin: """Provide helper functions for :class:`Document` to plot and visualize itself. """ - @property - def _mermaid_id(self): - if not hasattr(self, '__mermaid_id'): - self.__mermaid_id = random_identity() - return self.__mermaid_id - - def __mermaid_str__(self): - results = [] - _id = f'{self._mermaid_id[:3]}~Document~' - - for idx, c in enumerate(self.chunks): - results.append( - f'{_id} --> "{idx + 1}/{len(self.chunks)}" {c._mermaid_id[:3]}~Document~: chunks' - ) - results.append(c.__mermaid_str__()) - - for idx, c in enumerate(self.matches): - results.append( - f'{_id} ..> "{idx + 1}/{len(self.matches)}" {c._mermaid_id[:3]}~Document~: matches' - ) - results.append(c.__mermaid_str__()) - - content = self.to_dict() - if 'chunks' in content: - content.pop('chunks') - if 'matches' in content: - content.pop('matches') - if content: - results.append(f'class {_id}{{') - for k, v in content.items(): - if isinstance(v, (str, int, float, bytes)): - results.append(f'+{k} {str(v)[:10]}') - else: - results.append(f'+{k}({type(getattr(self, k, v))})') - results.append('}') - - return '\n'.join(results) - - def _mermaid_to_url(self, img_type: str) -> str: - """ - Rendering the current flow as a url points to a SVG, it needs internet connection - - :param img_type: the type of image to be 
generated - :return: the url pointing to a SVG - """ - mermaid_str = ( - """ - %%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#FFC666'}}}%% - classDiagram - - """ - + self.__mermaid_str__() - ) - - encoded_str = base64.b64encode(bytes(mermaid_str.strip(), 'utf-8')).decode( - 'utf-8' - ) - - return f'https://mermaid.ink/{img_type}/{encoded_str}' - def _ipython_display_(self): """Displays the object in IPython as a side effect""" self.summary() @@ -92,7 +32,7 @@ def _plot_recursion(self, _str_list, indent, box_char='├─'): _str_list, indent=len(prefix) + 4, box_char='└─' ) - def plot_image(self): + def plot(self): """ Plot image data from :attr:`.blob` or :attr:`.uri`. """ from IPython.display import Image, display @@ -104,38 +44,3 @@ def plot_image(self): display(Image(self.uri)) else: raise ValueError('`uri` and `blob` is empty') - - def plot(self, output: Optional[str] = None, inline_display: bool = False) -> None: - """ - Visualize the Document recursively. - - :param output: a filename specifying the name of the image to be created, - the suffix svg/jpg determines the file type of the output image - :param inline_display: show image directly inside the Jupyter Notebook - """ - image_type = 'svg' - if ( - not output.endswith('.svg') - and not output.endswith('.jpg') - and not output.endswith('.jpeg') - ): - raise ValueError('`output` can be only SVG/JPG format') - elif output.endswith('.jpg') or output.endswith('.jpeg'): - image_type = 'img' - - url = self._mermaid_to_url(image_type) - showed = False - if inline_display: - try: - from IPython.display import Image, display - - display(Image(url=url)) - showed = True - except: - # no need to panic users - pass - - if output: - download_mermaid_url(url, output) - elif not showed: - print(f'Document visualization: {url}') diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py index c704acebbbf..dc01b880e4e 100644 --- a/docarray/document/mixins/porting.py +++ 
b/docarray/document/mixins/porting.py @@ -1,5 +1,6 @@ +import dataclasses import pickle -from typing import Optional, TYPE_CHECKING, Type, Dict +from typing import Optional, TYPE_CHECKING, Type, Dict, Any from ...helper import compress_bytes, decompress_bytes @@ -26,13 +27,16 @@ def from_json(cls: Type['T'], obj: str) -> 'T': json_format.Parse(obj, pb_msg) return cls.from_protobuf(pb_msg) - def to_dict(self): - from google.protobuf.json_format import MessageToDict + def to_dict(self, strict: bool = True) -> Dict[str, Any]: + if strict: + from google.protobuf.json_format import MessageToDict - return MessageToDict( - self.to_protobuf(), - preserving_proto_field_name=True, - ) + return MessageToDict( + self.to_protobuf(), + preserving_proto_field_name=True, + ) + else: + return dataclasses.asdict(self._data) def to_bytes( self, protocol: str = 'pickle', compress: Optional[str] = None @@ -54,6 +58,13 @@ def from_bytes( protocol: str = 'pickle', compress: Optional[str] = None, ) -> 'T': + """Build Document object from binary bytes + + :param data: binary bytes + :param protocol: protocol to use + :param compress: compress method to use + :return: a Document object + """ bstr = decompress_bytes(data, algorithm=compress) if protocol == 'pickle': d = pickle.loads(bstr) diff --git a/docarray/document/mixins/property.py b/docarray/document/mixins/property.py index 6e57241d81b..8330b8ce60f 100644 --- a/docarray/document/mixins/property.py +++ b/docarray/document/mixins/property.py @@ -11,6 +11,7 @@ class PropertyMixin(_PropertyMixin): def _clear_content(self): + self._data.content = None self._data.text = None self._data.blob = None self._data.buffer = None diff --git a/docarray/proto/io/__init__.py b/docarray/proto/io/__init__.py index 29e5a387f6b..4bf679fde2c 100644 --- a/docarray/proto/io/__init__.py +++ b/docarray/proto/io/__init__.py @@ -13,6 +13,7 @@ def parse_proto(pb_msg: 'DocumentProto') -> 'Document': from ... 
import Document from ...score import NamedScore + fields = {} for (field, value) in pb_msg.ListFields(): f_name = field.name @@ -27,7 +28,9 @@ def parse_proto(pb_msg: 'DocumentProto') -> 'Document': elif f_name == 'scores' or f_name == 'evaluations': fields[f_name] = {} for k, v in value.items(): - fields[f_name][k] = NamedScore({ff.name: vv for (ff, vv) in v.ListFields()}) + fields[f_name][k] = NamedScore( + {ff.name: vv for (ff, vv) in v.ListFields()} + ) else: fields[f_name] = value return Document(**fields) @@ -53,13 +56,17 @@ def flush_proto(doc: 'Document') -> 'DocumentProto': setattr(getattr(pb_msg, key)[kk], ff, getattr(vv, ff)) elif key == 'location': pb_msg.location.extend(value) + elif key == 'content': + pass # intentionally ignore `content` field as it is just a proxy else: # other simple fields setattr(pb_msg, key, value) except RecursionError as ex: if len(ex.args) >= 1: - ex.args = (f'Field `{key}` contains cyclic reference in memory. ' - f'Could it be your Document is referring to itself?',) + ex.args = ( + f'Field `{key}` contains cyclic reference in memory. 
' + f'Could it be your Document is referring to itself?', + ) raise except Exception as ex: if len(ex.args) >= 1: diff --git a/docs/fundamentals/document/attribute.md b/docs/fundamentals/document/attribute.md index ea9f5b35e0e..0f91f8ee300 100644 --- a/docs/fundamentals/document/attribute.md +++ b/docs/fundamentals/document/attribute.md @@ -1,59 +1,185 @@ -# Set Attributes +# Access Attributes -Set an attribute as you would with any Python object: +Use `.` expression to get/set the value of an attribute as you would with any Python object: ```python from docarray import Document d = Document() d.text = 'hello world' + +print(d.text) ``` ```text - +hello world ``` - To unset attribute, simply assign it to `None`: ```python d.text = None ``` -or use {meth}`~docarray.Document.pop`: +or use {meth}`~docarray.base.BaseDCType.pop`: ```python d.pop('text') ``` -```text - + +One can unset multiple attributes `.pop()`: + +```python +d.pop('text', 'id', 'mime_type') ``` +You can check which attributes are set by `.non_empty_fields`. + + +## Content attributes + +Among all attributes, content attributes, namely `.text`, `.blob`, and `.buffer` are super important as they contain the actual content. + +They correspond to string-like data (e.g. for natural language), `ndarray`-like data (e.g. for image/audio/video data), and binary data for general purpose, respectively. 
+ + +| Attribute | Accept type | Use case | +| --- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| --- | +| `doc.text` | Python string | Contain text | +| `doc.blob` | A Python (nested) list/tuple of numbers, Numpy `ndarray`, SciPy sparse matrix (`spmatrix`), TensorFlow dense & sparse tensor, PyTorch dense & sparse tensor, PaddlePaddle dense tensor | Contain image/video/audio | +| `doc.buffer` | Binary string | Contain intermediate IO buffer | + + +Each Document can contain only one type of content. That means these three attributes are mutually exclusive. Let's see an example: -One can unset multiple attributes with {meth}`~docarray.Document.pop`: ```python -d.pop('text', 'id', 'mime_type') +import numpy as np +from docarray import Document + +d = Document(text='hello') +d.blob = np.array([1, 2, 3]) + +print(d) ``` ```text - + ``` -## Tags +As one can see `text` field is reset to empty. + +But what if you want to represent more than one kind of information? Say, to fully represent a PDF page you need to store both image and text. In this case, you can use {ref}`nested Document`s by putting image into one sub-Document, and text into another sub-Document. + +```python +from docarray import Document + +d = Document(chunks=[Document(blob=...), Document(text=...)]) +``` + + +The principle is each Document contains only one modality of information. In practice, this principle makes your full solution more clear and easier to maintain. -`Document` contains the {attr}`~docarray.Document.tags` attribute that can hold a map-like structure that can map arbitrary values. -In practice, you can store meta information in `tags`. +There is also a `.content` sugar getter/setter of the content fields. The content will be automatically grabbed or assigned to either `text`, `buffer`, or `blob` field based on the given type. 
```python -from jina import Document +from docarray import Document +d = Document(content='hello') +print(d) +``` -doc = Document(tags={'dimensions': {'height': 5.0, 'weight': 10.0, 'last_modified': 'Monday'}}) +```text + +``` -doc.tags['dimensions'] +```python +d.content = [1, 2, 3] +print(d) ``` ```text -{'weight': 10.0, 'height': 5.0, 'last_modified': 'Monday'} + +``` + +You can also check which content field is set by `.content_type`. + +## Load content from URI + +A quite common pattern is loading content from a URI instead of assigning them directly in the code. + +This can be easily done with `.uri` attribute. The value of `.uri` can point to either local URI, remote URI or [data URI](https://en.wikipedia.org/wiki/Data_URI_scheme). + +````{tab} Local image URI + + +```python +from docarray import Document + +d1 = Document(uri='apple.png').load_uri_to_image_blob() +print(d1.content_type, d1.content) +``` + +```console +blob [[[255 255 255] + [255 255 255] + [255 255 255] + ... +``` +```` + + +````{tab} Remote text URI + +```python +from docarray import Document + +d1 = Document(uri='https://www.gutenberg.org/files/1342/1342-0.txt').load_uri_to_text() + +print(d1.content_type, d1.content) +``` + + +```console +text The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen + +This eBook is for the use of anyone anywhere in the United States and +most other parts of the wor +``` +```` + +````{tab} Inline data URI + +```python +from docarray import Document + +d1 = Document(uri='''data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA +AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO +9TXL0Y4OHwAAAABJRU5ErkJggg== +''').load_uri_to_image_blob() + +print(d1.content_type, d1.content) +``` +```console +blob [[[255 255 255] + [255 0 0] + [255 0 0] + [255 0 0] + [255 255 255]] + ... 
+``` + +```` + +There are more `.load_uri_to_*` functions that allow you to read {ref}`text`, {ref}`image`, {ref}`video`, {ref}`3D mesh`, {ref}`audio` and {ref}`tabular` data. + +```{figure} images/doc-load-autocomplete.png +:width: 60% +``` + +```{admonition} Convert content to data URI +:class: tip +Inline data URI is helpful when you need a quick visualization in HTML, as it embeds all resources directly into that HTML. + +You can convert a URI to a data URI using `doc.load_uri_to_datauri()`. This will fetch the resource and make it inline. ``` \ No newline at end of file diff --git a/docs/fundamentals/document/construct.md b/docs/fundamentals/document/construct.md index 580693f014d..59abf4916eb 100644 --- a/docs/fundamentals/document/construct.md +++ b/docs/fundamentals/document/construct.md @@ -1,104 +1,200 @@ # Construct -````{tab} Empty document +Initializing a Document object is super easy. This chapter introduces the ways of constructing empty Document, filled Document. One can also construct Document from bytes, JSON, Protobuf message as introduced {ref}`in the next chapter`. + +## Construct an empty Document ```python -from jina import Document +from docarray import Document d = Document() ``` -```` +```text + +``` -````{tab} From attributes +Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray`. You can override this `id` or assign your own `id` during construction, as demonstrated below. + +## Construct with attributes + +This is the most common usage of the constructor: initializing a Document object with given attributes. 
```python -from jina import Document +from docarray import Document import numpy +d0 = Document(id='my_id') d1 = Document(text='hello') d2 = Document(buffer=b'\f1') d3 = Document(blob=numpy.array([1, 2, 3])) d4 = Document(uri='https://jina.ai', - mime_type='text/plain', - granularity=1, - adjacency=3, - tags={'foo': 'bar'}) + mime_type='text/plain', + granularity=1, + adjacency=3, + tags={'foo': 'bar'}) ``` - -```console - - - - +```text + + + + + ``` -```` +````{tip} +When you `print()` a Document, you get a string representation such as ``. It shows the non-empty attributes of that Document as well as its `id`, which helps you understand the content of that Document. +```text + + ^^^^^^^^^^^^^^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + | | + | | + non-empty fields | + Document.id +``` +```` -````{tab} From another Document +One can also wrap the keyword arguments into `dict`. The following ways of initialization have the same effect: ```python -from jina import Document +d1 = Document(uri='https://jina.ai', + mime_type='text/plain', + granularity=1, + adjacency=3) + +d2 = Document(dict(uri='https://jina.ai', + mime_type='text/plain', + granularity=1, + adjacency=3)) + +d3 = Document({'uri': 'https://jina.ai', + 'mime_type': 'text/plain', + 'granularity': 1, + 'adjacency': 3}) +``` -d = Document(content='hello, world!') -d1 = d +### Nested Document -assert id(d) == id(d1) # True +```{seealso} +To learn more about nested Document, please read {ref}`recursive-nested-document`. ``` -To make a deep copy, use `copy=True`: +Document can be nested inside `.chunks` and `.matches`. The nested structure can be specified directly during construction: ```python -d1 = Document(d, copy=True) +from docarray import Document + +d = Document( + id='d0', + chunks=[Document(id='d1', chunks=Document(id='d2'))], + matches=[Document(id='d3')], +) -assert id(d) == id(d1) # False +print(d) ``` -```` +```text + +``` + +For a nested Document, print its root does not give you much information. 
You can use {meth}`~docarray.document.mixins.plot.PlotMixin.summary`. For example, `d.summary()` gives you a more intuitive overview of the structure. + +```text + + └─ matches + └─ + └─ chunks + └─ + └─ chunks + └─ +``` + +When used in a Jupyter notebook or Google Colab, a Document is automatically prettified. + +```{figure} images/doc-in-jupyter.png +``` -`````{tab} From dict or JSON string + +### Unknown attributes handling + +If you give an unknown attribute (i.e. not one of the built-in Document attributes), it will be automatically "caught" into the `.tags` attribute. For example, ```python -from jina import Document -import json +from docarray import Document + +d = Document(hello='world') -d = {'id': 'hello123', 'content': 'world'} -d1 = Document(d) +print(d, d.tags) +``` -d = json.dumps({'id': 'hello123', 'content': 'world'}) -d2 = Document(d) +```text + +{'hello': 'world'} ``` -````{admonition} Parsing unrecognized fields -:class: tip +You can change this "`catch`" behavior to `drop` (silently drop unknown attributes) or `raise` (raise an `AttributeError`) by specifying `unknown_fields_handler`. + +### Resolve unknown attributes with rules -Unrecognized fields in a `dict`/JSON string are automatically put into the Document's `.tags` field: +One can resolve external fields into built-in attributes by specifying a mapping in `field_resolver`. For example, to resolve the field `hello` as the `id` attribute: ```python -from jina import Document +from docarray import Document -d1 = Document({'id': 'hello123', 'foo': 'bar'}) +d = Document(hello='world', field_resolver={'hello': 'id'}) + +print(d) ``` ```text - + ``` -You can use `field_resolver` to map external field names to `Document` attributes: +One can see that the `id` of the Document object is set to `world`. 
+ + +## Copy from another Document + +To make a deep copy of a Document, use `copy=True`: ```python -from jina import Document +from docarray import Document + +d = Document(text='hello') +d1 = Document(d, copy=True) -d1 = Document({'id': 'hello123', 'foo': 'bar'}, field_resolver={'foo': 'content'}) +print(d==d1, id(d)==id(d1)) ``` ```text - +True False ``` -```` +That indicates `d` and `d1` have identical content, but they are different objects in memory. + + +If you want to keep the memory address of a Document object while only copying the content from another Document, you can use {meth}`~docarray.base.BaseDCType.copy_from`. + +```python +from docarray import Document + +d1 = Document(text='hello') +d2 = Document(text='world') + +print(id(d1)) +d1.copy_from(d2) +print(d1.text) +print(id(d1)) +``` + +```text +4479829968 +world +4479829968 +``` +## What's next? -````` +One can also construct Document from bytes, JSON, Protobuf message. These methods are introduced {ref}`in the next chapter`. diff --git a/docs/fundamentals/document/content.md b/docs/fundamentals/document/content.md deleted file mode 100644 index 5e47b17fc0b..00000000000 --- a/docs/fundamentals/document/content.md +++ /dev/null @@ -1,122 +0,0 @@ -# Content - -{attr}`~docarray.Document.text`, {attr}`~docarray.Document.blob`, and {attr}`~docarray.Document.buffer` are the three content attributes of a Document. They correspond to string-like data (e.g. for natural language), `ndarray`-like data (e.g. for image/audio/video data), and binary data for general purpose, respectively. Each Document can contain only one type of content. 
- -| Attribute | Accept type | Use case | -| --- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| --- | -| `doc.text` | Python string | Contain text | -| `doc.blob` | A Python (nested) list/tuple of numbers, Numpy `ndarray`, SciPy sparse matrix (`spmatrix`), TensorFlow dense & sparse tensor, PyTorch dense & sparse tensor, PaddlePaddle dense tensor | Contain image/video/audio | -| `doc.buffer` | Binary string | Contain intermediate IO buffer | - -````{admonition} Exclusivity of the content -:class: important - -Note that one `Document` can only contain one type of `content`: either `text`, `buffer`, or `blob`. If you set one, the others will be cleared. - -```python -import numpy as np - -d = Document(text='hello') -d.blob = np.array([1]) - -d.text # <- now it's empty -``` - -```` - -````{admonition} Why a Document contains only data type -:class: question - -What if you want to represent more than one kind of information? Say, to fully represent a PDF page you need to store both image and text. In this case, you can use {ref}`nested Document`s by putting image into one sub-Document, and text into another. - -```python -d = Document(chunks=[Document(blob=...), Document(text=...)]) -``` - - -The principle is each Document contains only one modality. This makes the whole logic clearer. -```` - -```{tip} -There is also a `doc.content` sugar getter/setter of the above non-empty field. The content will be automatically grabbed or assigned to either `text`, `buffer`, or `blob` field based on the given type. -``` - - - -## Load content from URI - -Often, you need to load data from a URI instead of assigning them directly in your code, {attr}`~docarray.Document.uri` is the attribute you must learn. - -After setting `.uri`, you can load data into `.text`/`.buffer`/`.blob` as follows. 
- -The value of `.uri` can point to either local URI, remote URI or [data URI](https://en.wikipedia.org/wiki/Data_URI_scheme). - -````{tab} Local image URI - - -```python -from jina import Document - -d1 = Document(uri='apple.png').load_uri_to_image_blob() -print(d1.content_type, d1.content) -``` - -```console -blob [[[255 255 255] - [255 255 255] - [255 255 255] - ... -``` -```` - - -````{tab} Remote text URI - -```python -from jina import Document - -d1 = Document(uri='https://www.gutenberg.org/files/1342/1342-0.txt').load_uri_to_text() - -print(d1.content_type, d1.content) -``` - - -```console -text The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen - -This eBook is for the use of anyone anywhere in the United States and -most other parts of the wor -``` -```` - -````{tab} Inline data URI - -```python -from jina import Document - -d1 = Document(uri='''data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA -AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO -9TXL0Y4OHwAAAABJRU5ErkJggg== -''').load_uri_to_image_blob() - -print(d1.content_type, d1.content) -``` -```console -blob [[[255 255 255] - [255 0 0] - [255 0 0] - [255 0 0] - [255 255 255]] - ... -``` - -```` - -There are more `.load_uri_to_*` functions that allow you to read {ref}`text`, {ref}`image`, {ref}`video`, {ref}`3D mesh`, {ref}`audio` and {ref}`tabular` data into Jina. - -```{admonition} Write to data URI -:class: tip -Inline data URI is helpful when you need a quick visualization in HTML, as it embeds all resources directly into that HTML. - -You can convert a URI to a data URI using `doc.load_uri_to_datauri()`. This will fetch the resource and make it inline. 
-``` diff --git a/docs/fundamentals/document/embedding.md b/docs/fundamentals/document/embedding.md index 8b637a024e8..eda073e20bc 100644 --- a/docs/fundamentals/document/embedding.md +++ b/docs/fundamentals/document/embedding.md @@ -1,8 +1,6 @@ # Embedding -Embedding is a multi-dimensional representation of a `Document` (often a `[1, D]` vector). It serves as a very important piece in the neural search. - -Document has an attribute {attr}`~docarray.Document.embedding` to contain the embedding information. +Embedding is a multi-dimensional representation of a Document (often a `[1, D]` vector). It serves as a very important piece in machine learning. The attribute {attr}`~docarray.Document.embedding` is designed to contain the embedding information of a Document. Like `.blob`, you can assign it with a Python (nested) List/Tuple, Numpy `ndarray`, SciPy sparse matrix (`spmatrix`), TensorFlow dense and sparse tensor, PyTorch dense and sparse tensor, or PaddlePaddle dense tensor. @@ -11,7 +9,8 @@ import numpy as np import scipy.sparse as sp import torch import tensorflow as tf -from jina import Document + +from docarray import Document d0 = Document(embedding=[1, 2, 3]) d1 = Document(embedding=np.array([1, 2, 3])) @@ -21,15 +20,23 @@ d4 = Document(embedding=torch.tensor([1, 2, 3])) d5 = Document(embedding=tf.sparse.from_dense(np.array([[1, 2, 3], [4, 5, 6]]))) ``` -## Fill embedding from DNN model +## Fill embedding via a DNN model + +Usually you don't want to assign embedding manually, but instead doing something like: + +```text +d.blob \ +d.text ---> some DNN model ---> d.embedding +d.buffer / +``` ```{admonition} On multiple Documents :class: tip -This is a syntax sugar on single Document, which leverages {meth}`~jina.types.arrays.mixins.embed.EmbedMixin.embed` underneath. To embed multiple Documents, do not use this feature in a for-loop. Instead, read more details in {ref}`embed-via-model`. +To embed multiple Documents, do not use this feature in a for-loop. 
Instead, put all Documents in a DocumentArray and call `.embed()`. You can find out more in {ref}`embed-via-model`. ``` -Once a `Document` has `.blob` set, you can use a deep neural network to {meth}`~jina.types.arrays.mixins.embed.EmbedMixin.embed` it, which means filling `Document.embedding`. For example, our `Document` looks like the following: +Once a `Document` has a content field set, you can use a deep neural network to {meth}`~docarray.document.mixins.sugar.SingletonSugarMixin.embed` it, which means filling `Document.embedding`. For example, our `Document` looks like the following: ```python q = (Document(uri='/Users/hanxiao/Downloads/left/00003.jpg') @@ -38,7 +45,7 @@ q = (Document(uri='/Users/hanxiao/Downloads/left/00003.jpg') .set_image_blob_channel_axis(-1, 0)) ``` -Let's embed it into vector via ResNet: +Let's embed it into a vector via ResNet50: ```python import torchvision @@ -51,13 +58,13 @@ q.embed(model) ```{admonition} On multiple Documents :class: tip -This is a syntax sugar on single Document, which leverages {meth}`~jina.types.arrays.mixins.match.MatchMixin.match` underneath. To match multiple Documents, do not use this feature in a for-loop. Instead, find out more in {ref}`match-documentarray`. +To match multiple Documents, do not use this feature in a for-loop. Instead, find out more in {ref}`match-documentarray`. ``` -Once a Document has `.embedding` filled, it can be "matched". In this example, we build ten Documents and put them into a {ref}`DocumentArray`, and then use another Document to search against them. +Documents that have `.embedding` set can be "matched" against each other. In this example, we build ten Documents and put them into a {ref}`DocumentArray`, and then use another Document to search against them. 
```python -from jina import DocumentArray, Document +from docarray import DocumentArray, Document import numpy as np da = DocumentArray.empty(10) @@ -66,11 +73,22 @@ da.embeddings = np.random.random([10, 256]) q = Document(embedding=np.random.random([256])) q.match(da) -print(q.matches[0]) +q.summary() ``` -```console - +```text + + └─ matches + ├─ + ├─ + ├─ + ├─ + ├─ + ├─ + ├─ + ├─ + ├─ + └─ ``` diff --git a/docs/fundamentals/document/fluent-interface.md b/docs/fundamentals/document/fluent-interface.md index ac31b032fdc..93ecac6ad86 100644 --- a/docs/fundamentals/document/fluent-interface.md +++ b/docs/fundamentals/document/fluent-interface.md @@ -1,9 +1,9 @@ # Fluent Interface -Jina provides a simple fluent interface for `Document` that allows one to process (often preprocess) a Document object by chaining methods. For example to read an image file as `numpy.ndarray`, resize it, normalize it and then store it to another file; one can simply do: +Document provides a simple fluent interface that allows one to process (often preprocess) a Document object by chaining methods. For example to read an image file as `numpy.ndarray`, resize it, normalize it and then store it to another file; one can simply do: ```python -from jina import Document +from docarray import Document d = ( Document(uri='apple.png') @@ -14,68 +14,32 @@ d = ( ) ``` -```{figure} apple.png +```{figure} images/apple.png :scale: 20% Original `apple.png` ``` -```{figure} apple1.png +```{figure} images/apple1.png :scale: 50% Processed `apple1.png` ``` -````{important} + Note that, chaining methods always modify the original Document in-place. 
That means the above example is equivalent to: ```python -from jina import Document +from docarray import Document d = Document(uri='apple.png') (d.load_uri_to_image_blob() .set_image_blob_shape((64, 64)) .set_image_blob_normalization() - .dump_image_blob_to_file('apple1.png')) + .save_image_blob_to_file('apple1.png')) ``` -```` - -## Parallelization - -Fluent interface is super useful when processing a large {class}`~docarray.DocumentArray` or {class}`~docarray.DocumentArrayMemmap`. One can leverage {meth}`~jina.types.arrays.mixins.parallel.ParallelMixin.map` to speed up things quite a lot. - -The following example shows the time difference on preprocessing ~6000 image Documents. - -```python -from jina import DocumentArray -from jina.logging.profile import TimeContext - -docs = DocumentArray.from_files('*.jpg') -def foo(d): - return (d.load_uri_to_image_blob() - .set_image_blob_normalization() - .set_image_blob_channel_axis(-1, 0)) - -with TimeContext('map-process'): - for d in docs.map(foo, backend='process'): - pass - -with TimeContext('map-thread'): - for d in docs.map(foo, backend='thread'): - pass - -with TimeContext('for-loop'): - for d in docs: - foo(d) -``` - -```text -map-process ... map-process takes 5 seconds (5.55s) -map-thread ... map-thread takes 10 seconds (10.28s) -for-loop ... for-loop takes 18 seconds (18.52s) -``` ## Methods @@ -95,57 +59,58 @@ and {attr}`.buffer`. Provide helper functions for {class}`Document` to support text data. - {meth}`~docarray.document.mixins.text.TextDataMixin.convert_blob_to_text` - {meth}`~docarray.document.mixins.text.TextDataMixin.convert_text_to_blob` -- {meth}`~docarray.document.mixins.text.TextDataMixin.dump_text_to_datauri` +- {meth}`~docarray.document.mixins.text.TextDataMixin.convert_text_to_datauri` - {meth}`~docarray.document.mixins.text.TextDataMixin.load_uri_to_text` +### BufferData +Provide helper functions for {class}`Document` to handle binary data. 
+- {meth}`~docarray.document.mixins.buffer.BufferDataMixin.convert_buffer_to_datauri` +- {meth}`~docarray.document.mixins.buffer.BufferDataMixin.load_uri_to_buffer` +- {meth}`~docarray.document.mixins.buffer.BufferDataMixin.save_buffer_to_file` + + ### ImageData Provide helper functions for {class}`Document` to support image data. - {meth}`~docarray.document.mixins.image.ImageDataMixin.convert_buffer_to_image_blob` - {meth}`~docarray.document.mixins.image.ImageDataMixin.convert_image_blob_to_buffer` - {meth}`~docarray.document.mixins.image.ImageDataMixin.convert_image_blob_to_sliding_windows` - {meth}`~docarray.document.mixins.image.ImageDataMixin.convert_image_blob_to_uri` -- {meth}`~docarray.document.mixins.image.ImageDataMixin.dump_image_blob_to_file` - {meth}`~docarray.document.mixins.image.ImageDataMixin.load_uri_to_image_blob` +- {meth}`~docarray.document.mixins.image.ImageDataMixin.save_image_blob_to_file` - {meth}`~docarray.document.mixins.image.ImageDataMixin.set_image_blob_channel_axis` - {meth}`~docarray.document.mixins.image.ImageDataMixin.set_image_blob_inv_normalization` - {meth}`~docarray.document.mixins.image.ImageDataMixin.set_image_blob_normalization` - {meth}`~docarray.document.mixins.image.ImageDataMixin.set_image_blob_shape` -### AudioData -Provide helper functions for {class}`Document` to support audio data. -- {meth}`~docarray.document.mixins.audio.AudioDataMixin.dump_audio_blob_to_file` -- {meth}`~docarray.document.mixins.audio.AudioDataMixin.load_uri_to_audio_blob` +### ContentProperty +Provide helper functions for {class}`Document` to allow universal content property access. +- {meth}`~docarray.document.mixins.content.ContentPropertyMixin.convert_content_to_datauri` -### BufferData -Provide helper functions for {class}`Document` to handle binary data. 
-- {meth}`~docarray.document.mixins.buffer.BufferDataMixin.dump_buffer_to_datauri` -- {meth}`~docarray.document.mixins.buffer.BufferDataMixin.load_uri_to_buffer` +### SingletonSugar +Provide sugary syntax for {class}`Document` by inheriting methods from {class}`DocumentArray` +- {meth}`~docarray.document.mixins.sugar.SingletonSugarMixin.embed` +- {meth}`~docarray.document.mixins.sugar.SingletonSugarMixin.match` -### DumpFile -Provide helper functions for {class}`Document` to dump content to a file. -- {meth}`~docarray.document.mixins.dump.UriFileMixin.dump_buffer_to_file` -- {meth}`~docarray.document.mixins.dump.UriFileMixin.dump_uri_to_file` +### Porting +- {meth}`~docarray.document.mixins.porting.PortingMixin.from_bytes` +- {meth}`~docarray.document.mixins.porting.PortingMixin.from_dict` +- {meth}`~docarray.document.mixins.porting.PortingMixin.from_json` -### ContentProperty -Provide helper functions for {class}`Document` to allow universal content property access. -- {meth}`~docarray.document.mixins.content.ContentPropertyMixin.dump_content_to_datauri` +### Protobuf -### VideoData -Provide helper functions for {class}`Document` to support video data. -- {meth}`~docarray.document.mixins.video.VideoDataMixin.dump_video_blob_to_file` -- {meth}`~docarray.document.mixins.video.VideoDataMixin.load_uri_to_video_blob` +- {meth}`~docarray.document.mixins.protobuf.ProtobufMixin.from_protobuf` -### SingletonSugar -Provide sugary syntax for {class}`Document` by inheriting methods from {class}`DocumentArray` -- {meth}`~docarray.document.mixins.sugar.SingletonSugarMixin.embed` -- {meth}`~docarray.document.mixins.sugar.SingletonSugarMixin.match` +### AudioData +Provide helper functions for {class}`Document` to support audio data. 
+- {meth}`~docarray.document.mixins.audio.AudioDataMixin.load_uri_to_audio_blob` +- {meth}`~docarray.document.mixins.audio.AudioDataMixin.save_audio_blob_to_file` ### MeshData @@ -153,4 +118,15 @@ Provide helper functions for {class}`Document` to support 3D mesh data and point - {meth}`~docarray.document.mixins.mesh.MeshDataMixin.load_uri_to_point_cloud_blob` +### VideoData +Provide helper functions for {class}`Document` to support video data. +- {meth}`~docarray.document.mixins.video.VideoDataMixin.load_uri_to_video_blob` +- {meth}`~docarray.document.mixins.video.VideoDataMixin.save_video_blob_to_file` + + +### UriFile +Provide helper functions for {class}`Document` to dump content to a file. +- {meth}`~docarray.document.mixins.dump.UriFileMixin.save_uri_to_file` + + diff --git a/docs/fundamentals/document/images/apple.png b/docs/fundamentals/document/images/apple.png new file mode 100644 index 00000000000..aa0fa74f78b Binary files /dev/null and b/docs/fundamentals/document/images/apple.png differ diff --git a/docs/fundamentals/document/images/apple1.png b/docs/fundamentals/document/images/apple1.png new file mode 100644 index 00000000000..2b35109c4bf Binary files /dev/null and b/docs/fundamentals/document/images/apple1.png differ diff --git a/docs/fundamentals/document/images/doc-auto-summary.png b/docs/fundamentals/document/images/doc-auto-summary.png new file mode 100644 index 00000000000..4ab881ec473 Binary files /dev/null and b/docs/fundamentals/document/images/doc-auto-summary.png differ diff --git a/docs/fundamentals/document/images/doc-in-jupyter.png b/docs/fundamentals/document/images/doc-in-jupyter.png new file mode 100644 index 00000000000..f58777ccc3b Binary files /dev/null and b/docs/fundamentals/document/images/doc-in-jupyter.png differ diff --git a/docs/fundamentals/document/images/doc-load-autocomplete.png b/docs/fundamentals/document/images/doc-load-autocomplete.png new file mode 100644 index 00000000000..1ff23c9c414 Binary files /dev/null and 
b/docs/fundamentals/document/images/doc-load-autocomplete.png differ diff --git a/docs/fundamentals/document/images/doc-plot-in-jupyter.png b/docs/fundamentals/document/images/doc-plot-in-jupyter.png new file mode 100644 index 00000000000..2815f183e6c Binary files /dev/null and b/docs/fundamentals/document/images/doc-plot-in-jupyter.png differ diff --git a/docs/fundamentals/document/images/document-attributes.svg b/docs/fundamentals/document/images/document-attributes.svg new file mode 100644 index 00000000000..80f20e08163 --- /dev/null +++ b/docs/fundamentals/document/images/document-attributes.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/fundamentals/document/index.md b/docs/fundamentals/document/index.md index 080650c69f0..9eb8459f533 100644 --- a/docs/fundamentals/document/index.md +++ b/docs/fundamentals/document/index.md @@ -1,6 +1,48 @@ # Document -{class}`~docarray.document.Document` is the basic data type in Jina. Whether you're working with text, image, video, audio, or 3D meshes, they can be all represent as `Document`. +{class}`~docarray.document.Document` is the basic data type in DocArray. Whether you're working with text, image, video, audio, 3D meshes or the nested or the combined of them, you can always represent them as Document. + +A Document object has a predefined data structure as below, each of the attributes can be set/get with the dot expression as you would do with any Python object. 
+ + +| Attribute | Type | Description | +|-------------|-----------------| ----------- | +| id | string | A hexdigest that represents a unique document ID | +| buffer | bytes | the raw binary content of this document, which often represents the original document when comes into jina | +| blob | `ndarray`-like | the ndarray of the image/audio/video document | +| text | string | a text document | +| granularity | int | the depth of the recursive chunk structure | +| adjacency | int | the width of the recursive match structure | +| parent_id | string | the parent id from the previous granularity | +| weight | float | The weight of this document | +| uri | string | a uri of the document could be: a local file path, a remote url starts with http or https or data URI scheme | +| modality | string | modality, an identifier to the modality this document belongs to. In the scope of multi/cross modal search | +| mime_type | string | mime type of this document, for buffer content, this is required; for other contents, this can be guessed | +| offset | float | the offset of the doc | +| location | float | the position of the doc, could be start and end index of a string; could be x,y (top, left) coordinate of an image crop; could be timestamp of an audio clip | +| chunks | `DocumentArray` | list of the sub-documents of this document (recursive structure) | +| matches | `DocumentArray` | the matched documents on the same level (recursive structure) | +| embedding | `ndarray`-like | the embedding of this document | +| tags | dict | a structured data value, consisting of field which map to dynamically typed values. | +| scores | `NamedScore` | Scores performed on the document, each element corresponds to a metric | +| evaluations | `NamedScore` | Evaluations performed on the document, each element corresponds to a metric | + +The data structure of the Document is comprehensive and well-organized. 
One can categorize those attributes into the following groups: + +- Content related: `uri`, `text`, `blob`, `buffer`; +- Nest structure related: `chunks`, `matches`, `granularity`, `adjacency`, `parent_id`; +- Common side information or metadata: `id`, `modality`, `mime_type`, `offset`, `location`, `weight`; + - Further information: `tags`; +- Computational related: `scores`, `evaluations`, `embedding`. + +This picture depicts how you may want to construct or comprehend a Document object. + + + +```{figure} images/document-attributes.svg +``` + + ```{toctree} :hidden: @@ -8,7 +50,6 @@ construct serialization attribute -content embedding nested visualization diff --git a/docs/fundamentals/document/nested.md b/docs/fundamentals/document/nested.md index baac456b1fc..db99662be84 100644 --- a/docs/fundamentals/document/nested.md +++ b/docs/fundamentals/document/nested.md @@ -1,17 +1,16 @@ (recursive-nested-document)= -## Nested Structure +# Nested Structure -`Document` can be nested both horizontally and vertically. The following graphic illustrates the recursive `Document` structure. Each `Document` can have multiple "chunks" -and "matches", which are `Document` as well. +Document can be nested both horizontally and vertically via `.matches` and `.chunks`. The picture below illustrates the recursive Document structure. -| Attribute | Description | -| --- | --- | -| `doc.chunks` | The list of sub-Documents of this Document. They have `granularity + 1` but same `adjacency` | +| Attribute | Description | +| --- |-------------------------------------------------------------------------------------------------| +| `doc.chunks` | The list of sub-Documents of this Document. They have `granularity + 1` but same `adjacency` | | `doc.matches` | The list of matched Documents of this Document. 
They have `adjacency + 1` but same `granularity` | -| `doc.granularity` | The recursion "depth" of the recursive chunks structure | -| `doc.adjacency` | The recursion "width" of the recursive match structure | +| `doc.granularity` | The "depth" of the nested chunks structure | +| `doc.adjacency` | The "width" of the nested match structure | You can add **chunks** (sub-Document) and **matches** (neighbour-Document) to a `Document`: @@ -37,45 +36,28 @@ You can add **chunks** (sub-Document) and **matches** (neighbour-Document) to a d.matches.append(Document()) ``` -````{admonition} Note -:class: note -Both `doc.chunks` and `doc.matches` return `ChunkArray` and `MatchArray`, which are sub-classes -of {ref}`DocumentArray`. We will introduce `DocumentArray` later. -```` +Both `doc.chunks` and `doc.matches` return {ref}`DocumentArray`. -`````{admonition} Caveat: order matters -:class: alert +To get a clear picture of a nested Document, use {meth}`~docarray.document.mixins.plot.PlotMixin.summary`, e.g.: - -When adding sub-Documents to `Document.chunks`, avoid creating them in one line, otherwise the recursive Document structure will not be correct. This is because `chunks` use `ref_doc` to control their `granularity`. At `chunk` creation time the `chunk` doesn't know anything about its parent, and will get a wrong `granularity` value. - -````{tab} ✅ Do ```python -from jina import Document - -root_document = Document(text='i am root') -# add one chunk to root -root_document.chunks.append(Document(text='i am chunk 1')) -root_document.chunks.extend([ - Document(text='i am chunk 2'), - Document(text='i am chunk 3'), -]) # add multiple chunks to root +d.summary() ``` -```` -````{tab} 😔 Don't -```python -from jina import Document - -root_document = Document( - text='i am root', - chunks=[ - Document(text='i am chunk 2'), - Document(text='i am chunk 3'), - ] -) +```text + + └─ matches + ├─ + └─ + └─ chunks + ├─ + └─ ``` -```` + +## What's next? 
+ +When you have multiple Documents with nested structures, traversing over certain chunks and matches can be crucial. Fortunately, this is extremely simple thanks to DocumentArray as shown in {ref}`access-elements`. + +Note that some methods fill these two attributes, whereas other methods require them to be filled in advance. For example, {meth}`~docarray.array.mixins.match.MatchMixin.match` will fill `.matches`, whereas {meth}`~docarray.array.mixins.evaluation.EvaluationMixin.evaluate` requires `.matches` to be filled. -````` diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md index 847c98921b8..7982da094b6 100644 --- a/docs/fundamentals/document/serialization.md +++ b/docs/fundamentals/document/serialization.md @@ -1,48 +1,162 @@ +(serialize)= # Serialization -You can serialize a `Document` into JSON string via {meth}`~jina.types.mixin.ProtoTypeMixin.to_json` or Python dict via {meth}`~jina.types.mixin.ProtoTypeMixin.to_dict` or binary string via {meth}`bytes`: -````{tab} JSON +DocArray is designed to be "ready-to-wire": it assumes you always want to send/receive Documents over the network across microservices. Hence, serialization of a Document is important. This chapter introduces multiple serialization methods of a single Document. + +```{tip} +One should use DocumentArray for serializing multiple Documents, instead of looping over Documents one by one. The former is much faster and yields a more compact serialization. +``` + + +## From/to JSON + +```{important} +This feature requires the `protobuf` dependency. You can do `pip install docarray[full]` to install it. +``` + +You can serialize a Document as a JSON string via {meth}`~docarray.document.mixins.porting.PortingMixin.to_json`, and then read from it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_json`. 
+ ```python -from jina import Document +from docarray import Document +import numpy as np + +d_as_json = Document(text='hello, world', embedding=np.array([1, 2, 3])).to_json() -Document(content='hello, world', embedding=[1, 2, 3]).to_json() +d = Document.from_json(d_as_json) + +print(d_as_json, d) ``` -```json +```text { - "embedding": [ - 1, - 2, - 3 - ], - "id": "9e36927e576b11ec81971e008a366d48", + "embedding": { + "cls_name": "numpy", + "dense": { + "buffer": "AQAAAAAAAAACAAAAAAAAAAMAAAAAAAAA", + "dtype": " +``` + + +## From/to dict + +```{important} +This feature requires `protobuf` dependency. You can do `pip install docarray[full]` to install it. +``` + +You can serialize a Document as a Python `dict` via {meth}`~docarray.document.mixins.porting.PortingMixin.to_dict`, and then read from it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_dict`. + +```python +from docarray import Document +import numpy as np + +d_as_dict = Document(text='hello, world', embedding=np.array([1, 2, 3])).to_dict() + +d = Document.from_dict(d_as_dict) + +print(d_as_dict, d) +``` + +```text +{'id': 'b29d39066d5611ec87661e008a366d49', 'text': 'hello, world', 'mime_type': 'text/plain', 'embedding': {'dense': {'buffer': 'AQAAAAAAAAACAAAAAAAAAAMAAAAAAAAA', 'shape': [3], 'dtype': ' +``` + +```{note} +Note that the result dict is very "stricted" in the sense that all fields and values boil down to very basic data type such as `int`, `float`, `string`. This behavior is designed due to the "serialization to `dict`" is often an intermediate step of serializing into JSON/YAML. Hence all values in `dict` must be schema-friendly. After all, a Python `dict` object means nothing if you are not working in Python. + +You can use `to_dict(strict=False)` to override this behavior. This will preserve the original Python data type of every value, which may not be JSON-friendly. But hey, you want it. 
+``` + +## From/to bytes +```{important} +Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install docarray[full]` to install it. ``` -```` -````{tab} Binary + +Bytes or binary or buffer, how ever you want to call it, it probably the most common & compact wire format. DocArray provides {meth}`~docarray.document.mixins.porting.PortingMixin.to_bytes` and {meth}`~docarray.document.mixins.porting.PortingMixin.from_bytes` to serialize Document object into bytes. + ```python -from jina import Document +from docarray import Document +import numpy as np + +d = Document(text='hello, world', embedding=np.array([1, 2, 3])) +d_bytes = d.to_bytes() + +d_r = Document.from_bytes(d_bytes) + +print(d_bytes, d_r) +``` + +```text +b'\x80\x03cdocarray.document\nDocument\nq\x00)\x81q\x01}q\x02X\x05\x00\x00\x00_dataq\x03cdocarray.document.data\nDocumentData\nq\x04)\x81q\x05}q\x06(X\x0e\x00\x00\x00_reference_docq\x07h\x01X\x02\x00\x00\x00idq\x08X \x00\x00\x005d29a9f26d5911ec88d51e008a366d49q\tX\t\x00\x00\x00parent_... -bytes(Document(content='hello, world', embedding=[1, 2, 3])) + ``` +Default serialization protocol is `pickle`, you can change it to `protobuf` by specifying `.to_bytes(protocol='protobuf')`. You can also add compression to it and make the result bytes smaller. For example, + +```python +d = Document(text='hello, world', embedding=np.array([1, 2, 3])) +print(len(d.to_bytes(protocol='protobuf', compress='gzip'))) ``` -b'\n aad94436576b11ec81551e008a366d48R\ntext/plainj\x0chello, world\x9a\x01+\n"\n\x18\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01\x03\x1a\x03 + +id: "d66463b46d6a11ecbf891e008a366d49" +uri: "apple.jpg" +mime_type: "image/jpeg" + + ``` -```` + +One can refer to the [Protobuf specification of `Document`](../../proto/index.md) for details. + + +## What's next? 
+ +Serializing a single Document can be useful but often we want to do things in bulk, say hundreds or one million Documents at once. In that case, looping over each Document and serializing one by one is inefficient. In DocumentArray, we will introduce similar interfaces {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_json`, and {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_list` that allow one to serialize multiple Documents much faster and more compactly. \ No newline at end of file diff --git a/docs/fundamentals/document/visualization.md b/docs/fundamentals/document/visualization.md index 61030cd6680..710a34a9684 100644 --- a/docs/fundamentals/document/visualization.md +++ b/docs/fundamentals/document/visualization.md @@ -1,12 +1,16 @@ # Visualization -To better see the Document's nested structure, you can use {meth}`~jina.types.document.mixins.plot.PlotMixin.plot` function. If you are using JupyterLab/Notebook, -all `Document` objects will be auto-rendered: +If you have an image Document (with possible image data in `.uri`/`.blob`), you can directly visualize it via {meth}`~docarray.document.mixins.plot.PlotMixin.plot`. +```{figure} images/doc-in-jupyter.png +``` + + +To better see the Document's nested structure, you can use {meth}`~docarray.document.mixins.plot.PlotMixin.summary`. ```{code-block} python --- -emphasize-lines: 13 +emphasize-lines: 13,14 --- import numpy as np from docarray import Document @@ -23,7 +27,17 @@ d0.matches.append(d3) d0.summary() ``` +```text + + └─ matches + └─ + └─ chunks + └─ + └─ chunks + └─ +``` + +When using Notebook/Colab, this is auto-rendered. 
-```{figure} ../../../.github/images/four-symbol-docs.svg -:align: center +```{figure} images/doc-auto-summary.png ``` diff --git a/docs/fundamentals/documentarray/access-elements.md b/docs/fundamentals/documentarray/access-elements.md index ecc06ba1987..8b9f4744ab8 100644 --- a/docs/fundamentals/documentarray/access-elements.md +++ b/docs/fundamentals/documentarray/access-elements.md @@ -1,3 +1,4 @@ +(access-elements)= # Access Elements Like a `List` *and* a `Dict`, elements in `DocumentArray` can be accessed via integer index, string `id` or `slice` indices: diff --git a/docs/fundamentals/documentarray/index.md b/docs/fundamentals/documentarray/index.md index 90b55bb94be..6cdf889804f 100644 --- a/docs/fundamentals/documentarray/index.md +++ b/docs/fundamentals/documentarray/index.md @@ -20,6 +20,8 @@ access-elements access-attributes embedding matching +evaluation parallelization visualization +list-like ``` \ No newline at end of file diff --git a/docs/fundamentals/documentarray-api.md b/docs/fundamentals/documentarray/list-like.md similarity index 99% rename from docs/fundamentals/documentarray-api.md rename to docs/fundamentals/documentarray/list-like.md index 86813b3fdc2..483b957b658 100644 --- a/docs/fundamentals/documentarray-api.md +++ b/docs/fundamentals/documentarray/list-like.md @@ -1,4 +1,4 @@ -# List-like Interface +# Other List-like Features One can see `DocumentArray` as a Python list. Hence, many Python high-level iterator functions/tools can be used on `DocumentArray` as well. diff --git a/docs/index.md b/docs/index.md index ffbbd5474a3..9b8a396fa99 100644 --- a/docs/index.md +++ b/docs/index.md @@ -65,7 +65,7 @@ not installing `docarray` correctly. 
You are probably still using an old `docarr ``` ```{toctree} -:caption: Fundamentals +:caption: User Guides :hidden: fundamentals/document/index.md @@ -74,7 +74,7 @@ fundamentals/documentarray/index.md ```{toctree} -:caption: API Reference +:caption: Developer References :hidden: :maxdepth: 1 diff --git a/scripts/update-fluent-interface.py b/scripts/update-fluent-interface.py new file mode 100644 index 00000000000..24519297f39 --- /dev/null +++ b/scripts/update-fluent-interface.py @@ -0,0 +1,58 @@ +import inspect +import re +import sys +from collections import defaultdict + +from docarray import Document + +all_meth = defaultdict(list) +for f in inspect.getmembers(Document): + if ( + callable(f[1]) + and not f[1].__name__.startswith('_') + and not f[0].startswith('_') + ): + + if 'return' in inspect.getfullargspec(f[1]).annotations and str( + inspect.getfullargspec(f[1]).annotations['return'] + ) in ('~T', 'T'): + module_name = f[1].__qualname__.split('.')[0].replace('Mixin', '') + desc = ( + inspect.getdoc( + vars(sys.modules[f[1].__module__])[f[1].__qualname__.split('.')[0]] + ) + or '' + ) + all_meth[ + ( + module_name, + desc.strip() + .replace(':class:', '{class}') + .replace(':attr:', '{attr}'), + ) + ].append(f'{{meth}}`~{f[1].__module__}.{f[1].__qualname__}`') + +all_s = [] +for k, v in all_meth.items(): + all_s.append(f'### {k[0].strip()}') + all_s.append(f'{k[1].strip()}') + for vv in v: + all_s.append(f'- {vv}') + + all_s.append('\n') + + +doc_md = '../docs/fundamentals/document/fluent-interface.md' +text = '\n'.join(all_s) + +with open(doc_md) as fp: + _old = fp.read() + _new = re.sub( + r'(\s*?\n).*(\n\s*?)', + rf'\g<1>{text}\g<2>', + _old, + flags=re.DOTALL, + ) + +with open(doc_md, 'w') as fp: + fp.write(_new) diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index 610869b0b5d..b1599cc3a45 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -77,4 +77,4 @@ def 
test_from_to_protobuf(target_da): @pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) def test_from_to_safe_list(target_da): - DocumentArray.from_list_safe(target_da.to_list_safe()) + DocumentArray.from_list(target_da.to_list()) diff --git a/tests/unit/document/test_docdata.py b/tests/unit/document/test_docdata.py index 78ddff5a4ee..05d2c35bdcc 100644 --- a/tests/unit/document/test_docdata.py +++ b/tests/unit/document/test_docdata.py @@ -217,3 +217,51 @@ def test_doc_content(): np.testing.assert_equal(d.content, c) d.buffer = b'123' assert d.buffer == b'123' + + +def test_dict_constructor(): + + d1 = Document( + uri='https://jina.ai', mime_type='text/plain', granularity=1, adjacency=3 + ) + + d2 = Document( + dict(uri='https://jina.ai', mime_type='text/plain', granularity=1, adjacency=3) + ) + + d3 = Document( + { + 'uri': 'https://jina.ai', + 'mime_type': 'text/plain', + 'granularity': 1, + 'adjacency': 3, + } + ) + + assert d1 != d2 + d1.id = None + d2.id = None + d3.id = None + assert d1 == d2 == d3 + + +def test_unknown_fields_behavior(): + d = Document(hello='world') + assert d.tags == {'hello': 'world'} + + d = Document(hello='world', unknown_fields_handler='drop') + assert d.tags == {} + + with pytest.raises(AttributeError): + d = Document(hello='world', unknown_fields_handler='raise') + + +def test_content_setter_as_proxy(): + d = Document(content='hello') + assert d.content == 'hello' + + assert 'content' not in d.non_empty_fields + assert 'text' in d.non_empty_fields + d.content = [1, 2, 3] + assert 'blob' in d.non_empty_fields + assert 'text' not in d.non_empty_fields diff --git a/tests/unit/document/test_pickle.py b/tests/unit/document/test_pickle.py index 92cbda334e4..10df0ad29e8 100644 --- a/tests/unit/document/test_pickle.py +++ b/tests/unit/document/test_pickle.py @@ -2,6 +2,8 @@ import pytest +from docarray import Document +from docarray.document.data import DocumentData from docarray.base import BaseDCType from 
tests import random_docs @@ -18,3 +20,16 @@ def test_pickle_dump_load_real_doc(): assert dr == d assert dr.embedding is not None assert len(dr.chunks) == len(d.chunks) + + +def test_pickle_rely_on_data_class_and_document_class(): + # TODO (Han): This is not really a designed behavior, but atm I see no harm + # of having it, and no real usecases that against it. + + d = Document() + d.id = 'hello' + setattr(d, 'foo', 'bar') + assert getattr(d, 'foo') == 'bar' + r_d = Document.from_bytes(d.to_bytes(protocol='pickle')) + assert r_d.id == d.id + assert getattr(r_d, 'foo') == 'bar' diff --git a/tests/unit/document/test_summary.py b/tests/unit/document/test_summary.py index 285f2d1309c..6516dbc50c2 100644 --- a/tests/unit/document/test_summary.py +++ b/tests/unit/document/test_summary.py @@ -21,9 +21,9 @@ def test_single_doc_summary(): def test_plot_image(): d = Document(uri=os.path.join(cur_dir, 'toydata/test.png')) - d.plot_image() + d.plot() d.load_uri_to_image_blob() d.uri = None - d.plot_image() + d.plot()