diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index 316e66fabdf..8cf760de65a 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -18,7 +18,7 @@ class BinaryIOMixin: def load_binary( cls: Type['T'], file: Union[str, BinaryIO, bytes], - protocol: str = 'pickle-once', + protocol: str = 'pickle-array', compress: Optional[str] = None, ) -> 'T': """Load array elements from a LZ4-compressed binary file. @@ -45,14 +45,14 @@ def load_binary( d = decompress_bytes(d, algorithm=compress) compress = None - if protocol == 'protobuf-once': + if protocol == 'protobuf-array': from ....proto.docarray_pb2 import DocumentArrayProto dap = DocumentArrayProto() dap.ParseFromString(d) return cls.from_protobuf(dap) - elif protocol == 'pickle-once': + elif protocol == 'pickle-array': return pickle.loads(d) else: _len = len(random_uuid().bytes) @@ -66,7 +66,7 @@ def load_binary( def from_bytes( cls: Type['T'], data: bytes, - protocol: str = 'pickle-once', + protocol: str = 'pickle-array', compress: Optional[str] = None, ) -> 'T': return cls.load_binary(data, protocol=protocol, compress=compress) @@ -74,7 +74,7 @@ def from_bytes( def save_binary( self, file: Union[str, BinaryIO], - protocol: str = 'pickle-once', + protocol: str = 'pickle-array', compress: Optional[str] = None, ) -> None: """Save array elements into a LZ4 compressed binary file. 
@@ -98,7 +98,7 @@ def save_binary( def to_bytes( self, - protocol: str = 'pickle-once', + protocol: str = 'pickle-array', compress: Optional[str] = None, _file_ctx: Optional[BinaryIO] = None, ) -> bytes: @@ -121,9 +121,9 @@ def to_bytes( fc = f compress = None with fc: - if protocol == 'protobuf-once': + if protocol == 'protobuf-array': f.write(self.to_protobuf().SerializePartialToString()) - elif protocol == 'pickle-once': + elif protocol == 'pickle-array': f.write(pickle.dumps(self)) else: for d in self: diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md index 7982da094b6..16ee8c38513 100644 --- a/docs/fundamentals/document/serialization.md +++ b/docs/fundamentals/document/serialization.md @@ -4,7 +4,7 @@ DocArray is designed to be "ready-to-wire": it assumes you always want to send/receive Document over network across microservices. Hence, serialization of Document is important. This chapter introduces multiple serialization methods of a single Document. ```{tip} -One should use DocumentArray for serializing multiple Documents, instead of looping over Documents one by one. The former is much faster and yield more compact serialization. +One should use {ref}`DocumentArray for serializing multiple Documents<docarray-serialization>`, instead of looping over Documents one by one. The former is much faster and yields a more compact serialization. ``` @@ -47,38 +47,7 @@ print(d_as_json, d) ``` - -## From/to dict - -```{important} -This feature requires `protobuf` dependency. You can do `pip install docarray[full]` to install it. -``` - -You can serialize a Document as a Python `dict` via {meth}`~docarray.document.mixins.porting.PortingMixin.to_dict`, and then read from it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_dict`. 
- ```python -from docarray import Document -import numpy as np - -d_as_dict = Document(text='hello, world', embedding=np.array([1, 2, 3])).to_dict() - -d = Document.from_dict(d_as_dict) - -print(d_as_dict, d) -``` - -```text -{'id': 'b29d39066d5611ec87661e008a366d49', 'text': 'hello, world', 'mime_type': 'text/plain', 'embedding': {'dense': {'buffer': 'AQAAAAAAAAACAAAAAAAAAAMAAAAAAAAA', 'shape': [3], 'dtype': ' -``` - -```{note} -Note that the result dict is very "stricted" in the sense that all fields and values boil down to very basic data type such as `int`, `float`, `string`. This behavior is designed due to the "serialization to `dict`" is often an intermediate step of serializing into JSON/YAML. Hence all values in `dict` must be schema-friendly. After all, a Python `dict` object means nothing if you are not working in Python. - -You can use `to_dict(strict=False)` to override this behavior. This will preserve the original Python data type of every value, which may not be JSON-friendly. But hey, you want it. -``` - +(doc-in-bytes)= ## From/to bytes ```{important} @@ -127,6 +96,43 @@ Note that when deserializing from a non-default binary serialization, you need t d = Document.from_bytes(d_bytes, protocol='protobuf', compress='gzip') ``` +```{tip} +If you go with default `protocol` and `compress` settings, you can simply use `bytes(d)`, which is more Pythonic. +``` + + +## From/to dict + +```{important} +This feature requires `protobuf` dependency. You can do `pip install docarray[full]` to install it. +``` + +You can serialize a Document as a Python `dict` via {meth}`~docarray.document.mixins.porting.PortingMixin.to_dict`, and then read from it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_dict`. 
+ +```python +from docarray import Document +import numpy as np + +d_as_dict = Document(text='hello, world', embedding=np.array([1, 2, 3])).to_dict() + +d = Document.from_dict(d_as_dict) + +print(d_as_dict, d) +``` + +```text +{'id': 'b29d39066d5611ec87661e008a366d49', 'text': 'hello, world', 'mime_type': 'text/plain', 'embedding': {'dense': {'buffer': 'AQAAAAAAAAACAAAAAAAAAAMAAAAAAAAA', 'shape': [3], 'dtype': ' +``` + +(strict-arg-explain)= +```{note} +Note that the result dict is very "stricted" in the sense that all fields and values boil down to very basic data type such as `int`, `float`, `string`. This behavior is designed due to the "serialization to `dict`" is often an intermediate step of serializing into JSON/YAML. Hence all values in `dict` must be schema-friendly. After all, a Python `dict` object means nothing if you are not working in Python. + +You can use `to_dict(strict=False)` to override this behavior. This will preserve the original Python data type of every value, which may not be JSON-friendly. But hey, you want it. +``` + ## From/to Protobuf ```{important} diff --git a/docs/fundamentals/documentarray/construct.md b/docs/fundamentals/documentarray/construct.md index f14156584c1..dfca91b7e90 100644 --- a/docs/fundamentals/documentarray/construct.md +++ b/docs/fundamentals/documentarray/construct.md @@ -6,11 +6,62 @@ ```python from docarray import DocumentArray +da = DocumentArray() +``` + +```text + +``` + +Now you can use list-like interfaces such as `.append()` and `.extend()` as you would add elements to a Python List. + +```python +da.append(Document(text='hello world!')) +da.extend([Document(text='hello'), Document(text='world!')]) +``` + +```text + +``` + +Directly printing a DocumentArray does not show you too much useful information, you can use {meth}`~docarray.array.mixins.plot.PlotMixin.summary`. + +```{important} +This feature requires `rich` dependency. You can do `pip install docarray[full]` to install it. 
+``` + +```python +da.summary() +``` + +```text + Documents Summary + + Length 3 + Homogenous Documents True + Common Attributes ('id', 'mime_type', 'text') + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ────────────────────────────────────────────────────────── + id ('str',) 3 False + mime_type ('str',) 1 False + text ('str',) 3 False +``` + +## Construct with empty Documents + +Like `numpy.zeros()`, you can quickly build a DocumentArray with only empty Documents: + +```python +from docarray import DocumentArray + da = DocumentArray.empty(10) ``` ```text - + ``` ## Construct from list-like objects @@ -41,6 +92,7 @@ da = DocumentArray((Document() for _ in range(10))) ``` ```` + As DocumentArray itself is also a "list-like object that yields `Document`", you can also construct DocumentArray from another DocumentArray: ```python diff --git a/docs/fundamentals/documentarray/images/benchmark-size.svg b/docs/fundamentals/documentarray/images/benchmark-size.svg new file mode 100644 index 00000000000..4031944ff9d --- /dev/null +++ b/docs/fundamentals/documentarray/images/benchmark-size.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/fundamentals/documentarray/images/benchmark-time.svg b/docs/fundamentals/documentarray/images/benchmark-time.svg new file mode 100644 index 00000000000..129b4a982bf --- /dev/null +++ b/docs/fundamentals/documentarray/images/benchmark-time.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/fundamentals/documentarray/images/da-push.png b/docs/fundamentals/documentarray/images/da-push.png new file mode 100644 index 00000000000..82e564faa7a Binary files /dev/null and b/docs/fundamentals/documentarray/images/da-push.png differ diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index dce862a5b81..0ac446c6208 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md 
@@ -1,13 +1,247 @@ +(docarray-serialization)= # Serialization DocArray is designed to be "ready-to-wire" at anytime. Serialization is important. DocumentArray provides multiple serialization methods that allows one transfer DocumentArray object over network and across different microservices. +- JSON string: `.from_json()`/`.to_json()` +- Bytes (compressed): `.from_bytes()`/`.to_bytes()` +- Protobuf Message: `.from_protobuf()`/`.to_protobuf()` +- Python List: `.from_list()`/`.to_list()` +- Pandas Dataframe: `.from_dataframe()`/`.to_dataframe()` +- Cloud: `.push()`/`.pull()` + ## From/to JSON +```{important} +This feature requires `protobuf` dependency. You can do `pip install docarray[full]` to install it. +``` + +```python +from docarray import DocumentArray, Document + +da = DocumentArray([Document(text='hello'), Document(text='world')]) +da.to_json() +``` + +```text +[{"id": "72db9a7e6e3211ec97f51e008a366d49", "text": "hello", "mime_type": "text/plain"}, {"id": "72db9cb86e3211ec97f51e008a366d49", "text": "world", "mime_type": "text/plain"}] +``` + + +```python +da_r = DocumentArray.from_json(da.to_json()) + +da_r.summary() +``` + +```text + Documents Summary + + Length 2 + Homogenous Documents True + Common Attributes ('id', 'mime_type', 'text') + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ────────────────────────────────────────────────────────── + id ('str',) 2 False + mime_type ('str',) 1 False + text ('str',) 2 False + +``` + + ## From/to bytes +```{important} +Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install docarray[full]` to install it. +``` + +Serialization into bytes often yields a more compact representation than JSON. Similar to {ref}`the Document serialization<doc-in-bytes>`, DocumentArray can be serialized with different `protocol` and `compress` combinations. 
In its most simple form, + +```python +from docarray import DocumentArray, Document + +da = DocumentArray([Document(text='hello'), Document(text='world')]) +da.to_bytes() +``` + +```text +b'\x80\x03cdocarray.array.document\nDocumentArray\nq\x00)\x81q\x01}q\x02(X\x05\x00\x00\x00_dataq\x03]q\x04(cdocarray.document\nDocument\nq\x05) ... +``` + +```python +da_r = DocumentArray.from_bytes(da.to_bytes()) + +da_r.summary() +``` + +```text + Documents Summary + + Length 2 + Homogenous Documents True + Common Attributes ('id', 'mime_type', 'text') + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ────────────────────────────────────────────────────────── + id ('str',) 2 False + mime_type ('str',) 1 False + text ('str',) 2 False +``` + +```{tip} +If you go with default `protocol` and `compress` settings, you can simply use `bytes(da)`, which is more Pythonic. +``` + +The table below summarizes the supported serialization protocols and compressions: + +| `protocol=...` | Description | Remarks | +|--------------------------|-----------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------| +| `pickle-array` (default) | Serialize the whole array in one-shot using Python `pickle` | Often fastest. Not portable to other languages. Insecure in production. | +| `protobuf-array` | Serialize the whole array using [`DocumentArrayProto`](../../../proto/#docarray.DocumentArrayProto). | Portable to other languages if they implement `DocumentArrayProto`. 2GB max-size (pre-compression) restriction by Protobuf. | +| `pickle` | Serialize elements one-by-one using Python `pickle`. | Allow streaming. Not portable to other languages. Insecure in production. | +| `protobuf` | Serialize elements one-by-one using [`DocumentProto`](../../../proto/#docarray.DocumentProto). | Allow streaming. 
Portable to other languages if they implement `DocumentProto`. No max-size restriction | + +For compressions, the following algorithms are supported: `lz4`, `bz2`, `lzma`, `zlib`, `gzip`. The most frequently used ones are `lz4` (fastest) and `gzip` (most widely used). + +If you specified non-default `protocol` and `compress` in {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, you will need to specify the same in {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_bytes`. + +Depending on the use cases, you can choose the one works best for you. Here is a benchmark on serializing a DocumentArray with one million near-empty Documents (i.e. init with `DocumentArray.empty(...)` where each Document has only `id`). + +```{figure} images/benchmark-size.svg +``` + +```{figure} images/benchmark-time.svg +``` + +The benchmark was conducted [on the codebase of Jan. 5, 2022](https://github.com/jina-ai/docarray/tree/a56067e486d2318e05bcf6088bd1436040107ad2). + +Depending on how you want to interpret the results, the figures above can be an over-estimation/under-estimation of the serialization latency: one may argue that near-empty Documents are not realistic, but serializing a DocumentArray with one million Documents is also unreal. In practice, DocumentArray passing across microservices are relatively small, say at thousands, for better overlapping the network latency and computational overhead. + + +### Wire format of `pickle` and `protobuf` + +When set `protocol=pickle` or `protobuf`, the result binary string looks like the following: + +```text +----------------------------------------------------------------------------------- +| Delimiter | doc1.to_bytes() | Delimiter | doc2.to_bytes() | Delimiter | ... 
+----------------------------------------------------------------------------------- + | | + | | + | | + Fixed-length | + | + Variable-length +``` + +Here `Delimiter` is a 16-byte separator such as `b'g\x81\xcc\x1c\x0f\x93L\xed\xa2\xb0s)\x9c\xf9\xf6\xf2'` used for setting the boundary of each Document's serialization. Given a `to_bytes(protocol='pickle/protobuf')` binary string, once we know the first 16 bytes, the boundary is clear. Consequently, one can leverage this format to stream Documents, drop, skip, or early-stop, etc. + ## From/to Protobuf +Serializing to Protobuf Message is less frequently used, unless you are using Python Protobuf API. Nonetheless, you can use {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_protobuf` and {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_protobuf` to get a Protobuf Message object in Python. + +```python +from docarray import DocumentArray, Document + +da = DocumentArray([Document(text='hello'), Document(text='world')]) +da.to_protobuf() +``` + +```text +docs { + id: "2571b8b66e4d11ec9f271e008a366d49" + text: "hello" + mime_type: "text/plain" +} +docs { + id: "2571ba466e4d11ec9f271e008a366d49" + text: "world" + mime_type: "text/plain" +} +``` + ## From/to list -## From/to dataframe \ No newline at end of file +Serializing to/from Python list is less frequently used for the same reason as `Document.to_dict()`: it is often an intermediate step of serializing to JSON. You can do: + +```python +from docarray import DocumentArray, Document + +da = DocumentArray([Document(text='hello'), Document(text='world')]) +da.to_list() +``` + +```text +[{'id': 'ae55782a6e4d11ec803c1e008a366d49', 'text': 'hello', 'mime_type': 'text/plain'}, {'id': 'ae557a146e4d11ec803c1e008a366d49', 'text': 'world', 'mime_type': 'text/plain'}] +``` + +The `strict` argument shares {ref}`the same semantics<strict-arg-explain>` as in `Document.to_dict()`. + +## From/to dataframe + +```{important} +This feature requires `pandas` dependency. 
You can do `pip install docarray[full]` to install it. +``` + +One can convert between a DocumentArray object and a `pandas.DataFrame` object. + +```python +from docarray import DocumentArray, Document + +da = DocumentArray([Document(text='hello'), Document(text='world')]) +da.to_dataframe() +``` + +```text + id text mime_type +0 43cb93b26e4e11ec8b731e008a366d49 hello text/plain +1 43cb95746e4e11ec8b731e008a366d49 world text/plain +``` + +To build a DocumentArray from dataframe, + +```python +df = ... +da = DocumentArray.from_dataframe(df) +``` + +## From/to cloud + +```{important} +This feature requires `rich` and `requests` dependency. You can do `pip install docarray[full]` to install it. +``` + +{meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.push` and {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.pull` allow you to share a DocumentArray object across machines. + +Considering you are working on a GPU machine via Google Colab/Jupyter. After preprocessing and embedding, you got everything you need in a DocumentArray. You can easily transfer it to the local laptop via: + +```python +from docarray import DocumentArray + +da = DocumentArray(...) # heavylifting, processing, GPU task, ... +da.push(token='myda123') +``` + +```{figure} images/da-push.png +``` + +Then on your local laptop, simply + +```python +from docarray import DocumentArray + +da = DocumentArray.pull(token='myda123') +``` + +Now you can continue the work at local, analyzing `da` or visualizing it. Your friends & colleagues who know the token `myda123` can also pull that DocumentArray. It's useful when you want to quickly share the results with your colleagues & friends. + +For more information of this feature, please refer to {class}`~docarray.array.mixins.io.pushpull.PushPullMixin`. + +```{danger} +The lifetime of the storage is not promised at the moment: could be a day, could be a week. Do not use it for persistence in production. 
Only consider this as temporary transmission or a clipboard. +``` \ No newline at end of file diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index b1599cc3a45..f64eec6f30a 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -28,7 +28,7 @@ def get_ndarrays_for_ravel(): @pytest.mark.parametrize('ndarray_val, is_sparse', get_ndarrays_for_ravel()) @pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) @pytest.mark.parametrize( - 'protocol', ['protobuf', 'protobuf-once', 'pickle', 'pickle-once'] + 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] ) @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) def test_to_from_bytes(target_da, protocol, compress, ndarray_val, is_sparse): @@ -54,7 +54,7 @@ def test_to_from_bytes(target_da, protocol, compress, ndarray_val, is_sparse): @pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) @pytest.mark.parametrize( - 'protocol', ['protobuf', 'protobuf-once', 'pickle', 'pickle-once'] + 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] ) @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) def test_save_bytes(target_da, protocol, compress, tmpfile):