diff --git a/.github/requirements-cicd.txt b/.github/requirements-cicd.txt index 49704affe6b..63670f3d87b 100644 --- a/.github/requirements-cicd.txt +++ b/.github/requirements-cicd.txt @@ -12,4 +12,5 @@ matplotlib rich Pillow lz4 -fastapi \ No newline at end of file +fastapi +jupyterlab \ No newline at end of file diff --git a/README.md b/README.md index f3ac4fa9de5..a1cdd2908d8 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,13 @@ -DocArray is a library for nested, unstructured data such as text, image, audio, video, 3D mesh. Its Pythonic interface allows deep learning engineers to easily preprocess, embed, search, recommend and transfer the data. +DocArray is a library for nested, unstructured data such as text, image, audio, video, 3D mesh. It allows deep learning engineers to easily preprocess, embed, search, recommend and transfer the data. 🌌 **All data types**: super-expressive data structure for representing complicated/mixed/nested text, image, video, audio, 3D mesh data. -🧑‍🔬 **Data science powerhouse**: easy-to-use functions for facilitating data scientists work on embedding, matching, visualizing, evaluating via Torch/Tensorflow/ONNX/PaddlePaddle. +🐍 **Pythonic API**: easy-to-use idioms and interfaces just like the native Python list. If you know how to Python, you know how to DocArray. + +🧑‍🔬 **Data science powerhouse**: greatly facilitates data scientists' work on embedding, matching, visualizing, evaluating via Torch/Tensorflow/ONNX/PaddlePaddle. 🚡 **Portable**: ready to wire at anytime with efficient and compact serialization from/to Protobuf, binary, JSON, CSV, dataframe. @@ -196,6 +198,31 @@ recall@5 0.0573470744680851 More metrics can be used such as `precision_at_k`, `ndcg_at_k`, `hit_at_k`. + + +### Save results + +You can save a DocumentArray to binary, JSON, dict, dataframe, CSV or Protobuf message. In its simplest form, + +```python +left_da.save('left_da.bin') +``` + +To reuse it, do `left_da = DocumentArray.load('left_da.bin')`.
+ +If you want to transfer a DocumentArray from one machine to another or share it with your colleagues, you can do: + +```python +left_da.push(token='my_shared_da') +``` + +```python +left_da = DocumentArray.pull(token='my_shared_da') +``` + +Anyone who knows the token `my_shared_da` can pull and work on it. + + Intrigued? That's only scratching the surface of what DocArray is capable of. [Read our docs to learn more](https://docarray.jina.ai). diff --git a/docarray/array/mixins/io/common.py b/docarray/array/mixins/io/common.py index 5e821b11e7d..17b04e6e80d 100644 --- a/docarray/array/mixins/io/common.py +++ b/docarray/array/mixins/io/common.py @@ -8,7 +8,7 @@ class CommonIOMixin: """The common IO helper function for arrays. """ def save( - self, file: Union[str, TextIO, BinaryIO], file_format: str = 'json' + self, file: Union[str, TextIO, BinaryIO], file_format: str = 'binary' ) -> None: """Save array elements into a JSON, a binary file or a CSV file. @@ -28,7 +28,7 @@ def save( @classmethod def load( - cls: Type['T'], file: Union[str, TextIO, BinaryIO], file_format: str = 'json' + cls: Type['T'], file: Union[str, TextIO, BinaryIO], file_format: str = 'binary' ) -> 'T': """Load array elements from a JSON or a binary file, or a CSV file. diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 2767f482f61..2cb97ae3606 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -1,6 +1,6 @@ import io from contextlib import nullcontext -from typing import Type, TYPE_CHECKING +from typing import Type, TYPE_CHECKING, Optional from ....helper import get_request_header @@ -13,7 +13,9 @@ class PushPullMixin: _service_url = 'https://apihubble.jina.ai/v2/rpc/da.'
- def push(self, token: str, show_progress: bool = False) -> None: + def push( + self, token: str, show_progress: bool = False, compress: Optional[str] = None + ) -> None: """Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.push` .. note:: @@ -53,7 +55,7 @@ def read(self, n=-1): dict_data = { 'file': ( 'DocumentArray', - self.to_bytes(protocol='protobuf', compress='gzip'), + self.to_bytes(protocol='protobuf', compress=compress), ), 'token': token, } @@ -69,7 +71,12 @@ def read(self, n=-1): requests.post(self._service_url + 'push', data=body, headers=headers) @classmethod - def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T': + def pull( + cls: Type['T'], + token: str, + show_progress: bool = False, + compress: Optional[str] = None, + ) -> 'T': """Pulling a :class:`DocumentArray` from Jina Cloud Service to local. :param token: the upload token set during :meth:`.push` @@ -103,7 +110,9 @@ def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T': if show_progress: progress.update(task_id, advance=len(chunk)) - return cls.from_bytes(f.getvalue(), protocol='protobuf', compress='lz4') + return cls.from_bytes( + f.getvalue(), protocol='protobuf', compress=compress + ) def _get_progressbar(show_progress): diff --git a/docarray/document/mixins/plot.py b/docarray/document/mixins/plot.py index 5473ab79edb..3c2d72c4ccd 100644 --- a/docarray/document/mixins/plot.py +++ b/docarray/document/mixins/plot.py @@ -54,10 +54,10 @@ def _mermaid_to_url(self, img_type: str) -> str: """ mermaid_str = ( """ - %%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#FFC666'}}}%% - classDiagram - - """ + %%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#FFC666'}}}%% + classDiagram + + """ + self.__mermaid_str__() ) @@ -69,7 +69,41 @@ def _mermaid_to_url(self, img_type: str) -> str: def _ipython_display_(self): """Displays the object in IPython as a side effect""" - self.plot(inline_display=True) + 
self.summary() + + def summary(self) -> None: + """ Print non-empty fields and nested structure of this Document object.""" + _str_list = [] + self._plot_recursion(_str_list, indent=0) + print('\n'.join(_str_list)) + + def _plot_recursion(self, _str_list, indent, box_char='├─'): + prefix = (' ' * indent + box_char) if indent else '' + _str_list.append(f'{prefix} {self}') + + for a in ('matches', 'chunks'): + if getattr(self, a): + prefix = ' ' * (indent + 4) + '└─' + _str_list.append(f'{prefix} {a}') + + for d in getattr(self, a)[:-1]: + d._plot_recursion(_str_list, indent=len(prefix) + 4) + getattr(self, a)[-1]._plot_recursion( + _str_list, indent=len(prefix) + 4, box_char='└─' + ) + + def plot_image(self): + """ Plot image data from :attr:`.blob` or :attr:`.uri`. """ + from IPython.display import Image, display + + if self.blob is not None: + import PIL.Image + + display(PIL.Image.fromarray(self.blob)) + elif self.uri: + display(Image(self.uri)) + else: + raise ValueError('`uri` and `blob` is empty') def plot(self, output: Optional[str] = None, inline_display: bool = False) -> None: """ diff --git a/docs/index.md b/docs/index.md index c37dfa6eb32..bc591616cd9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,6 +7,10 @@ ## Install +```{tip} +Jina 3.x users do not need to install `docarray` separately; it is shipped with Jina. To check your Jina version, type `jina -vf` in the console. +``` + Make sure you have Python 3.7+ and `numpy` installed on Linux/Mac/Windows: ````{tab} Basic install @@ -41,6 +45,17 @@ The following dependencies will be installed to enable additional features: Alternatively, you can first do basic installation and then install missing dependencies on-demand. ```` +```pycon +>>> import docarray +>>> docarray.__version__ +'0.1.0' +``` + +```{attention} +If the printed version is lower than `0.1.0`, say `0.0.x`, then `docarray` is +not installed correctly. You are probably still using an old `docarray` shipped with Jina 2.x.
+``` + diff --git a/tests/unit/document/test_summary.py b/tests/unit/document/test_summary.py new file mode 100644 index 00000000000..285f2d1309c --- /dev/null +++ b/tests/unit/document/test_summary.py @@ -0,0 +1,29 @@ +import os + +from docarray import Document + +cur_dir = os.path.dirname(os.path.abspath(__file__)) + + +def test_single_doc_summary(): + # empty doc + Document().summary() + # nested doc + Document( + chunks=[ + Document(), + Document(chunks=[Document()]), + Document(), + ], + matches=[Document(), Document()], + ).summary() + + +def test_plot_image(): + d = Document(uri=os.path.join(cur_dir, 'toydata/test.png')) + d.plot_image() + + d.load_uri_to_image_blob() + d.uri = None + + d.plot_image()