-
+
-The data structure for unstructured data
+The data structure for multimodal data
| left/00018.jpg | -right/00018.jpg | -left/00131.jpg | -right/00131.jpg | -
|---|---|---|---|
![]() |
- ![]() |
- ![]() |
- ![]() |
-
Oops, we couldn't find that page.
-You can try "asking our docs" on the right corner of the page to find answer.
-Otherwise, please create a Github issue and one of our team will respond.
- -''', -} -notfound_no_urls_prefix = True - -apidoc_module_dir = repo_dir -apidoc_output_dir = 'api' -apidoc_excluded_paths = ['tests', 'legacy', 'hub', 'toy*', 'setup.py'] -apidoc_separate_modules = True -apidoc_extra_args = ['-t', 'template/'] -autodoc_member_order = 'bysource' -autodoc_mock_imports = ['argparse', 'numpy', 'np', 'tensorflow', 'torch', 'scipy'] -autoclass_content = 'both' -set_type_checking_flag = False -html_last_updated_fmt = '' -nitpicky = True -nitpick_ignore = [('py:class', 'type')] -linkcheck_ignore = [ - # Avoid link check on local uri - 'http://0.0.0.0:*', - 'pods/encode.yml', - 'https://github.com/jina-ai/docarray/commit/*', - '.github/*', - 'extra-requirements.txt', - 'fastentrypoints.py' '../../101', - '../../102', - 'http://www.twinsun.com/tz/tz-link.htm', # Broken link from pytz library - 'https://urllib3.readthedocs.io/en/latest/contrib.html#google-app-engine', # Broken link from urllib3 library - 'https://linuxize.com/post/how-to-add-swap-space-on-ubuntu-20-04/', - # This link works but gets 403 error on linkcheck -] -linkcheck_timeout = 20 -linkcheck_retries = 2 -linkcheck_anchors = False - -ogp_site_url = 'https://docarray.jina.ai/' -ogp_image = 'https://docarray.jina.ai/_static/banner.png' -ogp_use_first_image = True -ogp_description_length = 300 -ogp_type = 'website' -ogp_site_name = f'DocArray {os.environ.get("SPHINX_MULTIVERSION_VERSION", version)} Documentation' - -ogp_custom_meta_tags = [ - '', - '', - '', - '', - '', - ''' - - - - - - ''', -] - - -def add_server_address(app): - # This makes variable `server_address` available to docbot.js - server_address = app.config['server_address'] - js_text = "var server_address = '%s';" % server_address - app.add_js_file(None, body=js_text) - - -def setup(app): - from sphinx.domains.python import PyField - from sphinx.util.docfields import Field - from sphinx.locale import _ - - app.add_object_type( - 'confval', - 'confval', - objname='configuration value', - indextemplate='pair: %s; 
configuration value', - doc_field_types=[ - PyField( - 'type', - label=_('Type'), - has_arg=False, - names=('type',), - bodyrolename='class', - ), - Field( - 'default', - label=_('Default'), - has_arg=False, - names=('default',), - ), - ], - ) - # app.add_config_value( - # name='server_address', - # default=os.getenv('JINA_DOCSBOT_SERVER', 'https://docsbot.jina.ai'), - # rebuild='', - # ) - # app.connect('builder-inited', add_server_address) \ No newline at end of file diff --git a/docs/data_types/3d_mesh/3d_mesh.md b/docs/data_types/3d_mesh/3d_mesh.md new file mode 100644 index 00000000000..4727f12cb78 --- /dev/null +++ b/docs/data_types/3d_mesh/3d_mesh.md @@ -0,0 +1,2697 @@ +# 🧬 3D Mesh + +DocArray supports many different modalities including `3D Mesh`. +This section will show you how to load and handle 3D data using DocArray. + +A 3D mesh is the structural build of a 3D model consisting of polygons. Most 3D meshes are created via professional software packages, such as commercial suites like [Unity](https://unity.com/), or the open-source [Blender](https://www.blender.org/). + +!!! note + This feature requires `trimesh`. You can install all necessary dependencies via: + + ```cmd + pip install "docarray[mesh]" + ``` + +## Vertices and Faces representation + +A 3D mesh can be represented by its vertices and faces: + +- **Vertices** are points in a 3D space, represented as a tensor of shape `(n_points, 3)`. +- **Faces** are triangular surfaces that are defined by three points in 3D space, corresponding to the three vertices of a triangle. They can be represented as a tensor of shape `(n_faces, 3)`. Each number in that tensor refers to an index of a vertex in the tensor of vertices. + +### Load vertices and faces + +First, let's define our class `MyMesh3D`, which extends [`BaseDoc`][docarray.base_doc.doc.BaseDoc] and provides attributes to store our 3D data: + +- The `mesh_url` attribute of type [`Mesh3DUrl`][docarray.typing.url.url_3d.mesh_url.Mesh3DUrl]. 
+- The optional `tensors` attribute, of type [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces] + - The `VerticesAndFaces` class has the attributes `vertices` and `faces`, both of type [`AnyTensor`](../../../../api_references/typing/tensor/tensor). This especially comes in handy later when we want to display our 3D mesh. + +!!! tip + Check out our predefined [`Mesh3D`](#getting-started-predefined-docs) to get started and play around with our 3D features. + +But for now, let's create a `MyMesh3D` instance with a URL to a remote `.obj` file: + +```python +from typing import Optional + +from docarray import BaseDoc +from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces +from docarray.typing import Mesh3DUrl + + +class MyMesh3D(BaseDoc): + mesh_url: Mesh3DUrl + tensors: Optional[VerticesAndFaces] = None + + +doc = MyMesh3D(mesh_url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") +``` + +To load the vertices and faces information, you can call [`.load()`][docarray.typing.url.url_3d.mesh_url.Mesh3DUrl.load] on the [`Mesh3DUrl`][docarray.typing.url.url_3d.mesh_url.Mesh3DUrl] instance. This will return a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces] object: + +```python +doc.tensors = doc.mesh_url.load() +doc.summary() +``` + +| + |
| + |
| hello.wav | -olleh.wav | -
|---|---|
| - | - |
+ Hi! +
+ + diff --git a/tests/toydata/test.log b/tests/toydata/test.log new file mode 100644 index 00000000000..c0a0e904924 --- /dev/null +++ b/tests/toydata/test.log @@ -0,0 +1,6 @@ +2022-11-25 12:34:56 INFO: Program started +2022-11-25 12:35:01 ERROR: Unable to open file 'input.txt' +2022-11-25 12:35:10 INFO: 42 records processed successfully +2022-11-25 12:35:15 WARNING: Possible data corruption detected +2022-11-25 12:35:22 ERROR: Out of memory +2022-11-25 12:35:30 INFO: Program completed diff --git a/tests/toydata/test.md b/tests/toydata/test.md new file mode 100644 index 00000000000..320ec9ccc5f --- /dev/null +++ b/tests/toydata/test.md @@ -0,0 +1,4 @@ +# Hello +## This is a markdown file +### For testing stuff +Thank you! diff --git a/tests/unit/document/toydata/test.png b/tests/toydata/test.png similarity index 100% rename from tests/unit/document/toydata/test.png rename to tests/toydata/test.png diff --git a/tests/toydata/tetrahedron.mtl b/tests/toydata/tetrahedron.mtl new file mode 100644 index 00000000000..1bccd4474e4 --- /dev/null +++ b/tests/toydata/tetrahedron.mtl @@ -0,0 +1,22 @@ + +newmtl red +Ka 0.4449 0.0000 0.0000 +Kd 0.7714 0.0000 0.0000 +Ks 0.8857 0.0000 0.0000 +illum 2 +Ns 136.4300 + +newmtl lime +Ka 0.0000 0.5000 0.0000 +Kd 0.0000 1.0000 0.0000 +Ks 0.0000 0.5000 0.0000 +illum 2 +Ns 65.8900 + +newmtl gold +Ka 0.5265 0.2735 0.0122 +Kd 1.0000 0.5184 0.0286 +Ks 0.3000 0.3000 0.3000 +illum 2 +Ns 123.2600 + diff --git a/tests/toydata/tetrahedron.obj b/tests/toydata/tetrahedron.obj new file mode 100644 index 00000000000..40347bad7b7 --- /dev/null +++ b/tests/toydata/tetrahedron.obj @@ -0,0 +1,20 @@ +# tetrahedron.obj +# + +mtllib tetrahedron.mtl + +g tetrahedron + +v 1.00 1.00 1.00 +v 2.00 1.00 1.00 +v 1.00 2.00 1.00 +v 1.00 1.00 2.00 + +usemtl lime +f 1 3 2 +usemtl gold +f 1 4 3 +usemtl lime +f 1 2 4 +usemtl red +f 2 3 4 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git 
a/tests/unit/array/__init__.py b/tests/unit/array/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/unit/array/mixins/__init__.py b/tests/unit/array/mixins/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/unit/array/mixins/test-net.onnx b/tests/unit/array/mixins/test-net.onnx deleted file mode 100644 index f354c2d39f4..00000000000 Binary files a/tests/unit/array/mixins/test-net.onnx and /dev/null differ diff --git a/tests/unit/array/mixins/test_content.py b/tests/unit/array/mixins/test_content.py deleted file mode 100644 index df4bba9add8..00000000000 --- a/tests/unit/array/mixins/test_content.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as np -import pytest - -from docarray import DocumentArray - - -@pytest.mark.parametrize('cls', [DocumentArray]) -@pytest.mark.parametrize( - 'content_attr', ['texts', 'embeddings', 'blobs', 'buffers', 'contents'] -) -def test_content_empty_getter_return_none(cls, content_attr): - da = cls() - assert getattr(da, content_attr) is None - - -@pytest.mark.parametrize('cls', [DocumentArray]) -@pytest.mark.parametrize( - 'content_attr', - [ - ('texts', ''), - ('embeddings', np.array([])), - ('blobs', np.array([])), - ('buffers', []), - ('contents', []), - ], -) -def test_content_empty_setter(cls, content_attr): - da = cls() - setattr(da, content_attr[0], content_attr[1]) - assert getattr(da, content_attr[0]) is None - - -@pytest.mark.parametrize('cls', [DocumentArray]) -@pytest.mark.parametrize( - 'content_attr', - [ - ('texts', ['s'] * 10), - ('blobs', np.random.random([10, 2])), - ('buffers', [b's'] * 10), - ], -) -def test_content_getter_setter(cls, content_attr): - da = cls.empty(10) - setattr(da, content_attr[0], content_attr[1]) - np.testing.assert_equal(da.contents, content_attr[1]) - da.contents = content_attr[1] - np.testing.assert_equal(da.contents, content_attr[1]) - np.testing.assert_equal(getattr(da, content_attr[0]), content_attr[1]) - da.contents 
= None - assert da.contents is None - - -@pytest.mark.parametrize('da_len', [0, 1, 2]) -def test_content_empty(da_len): - da = DocumentArray.empty(da_len) - assert not da.texts - assert not da.contents - assert not da.blobs - assert not da.buffers - - da.texts = ['hello'] * da_len - if da_len == 0: - assert not da.contents - else: - assert da.contents == ['hello'] * da_len - assert da.texts == ['hello'] * da_len - assert not da.blobs - assert not da.buffers diff --git a/tests/unit/array/mixins/test_embed.py b/tests/unit/array/mixins/test_embed.py deleted file mode 100644 index 1f25749fa50..00000000000 --- a/tests/unit/array/mixins/test_embed.py +++ /dev/null @@ -1,71 +0,0 @@ -import os - -import numpy as np -import onnxruntime -import paddle -import pytest -import tensorflow as tf -import torch - -from docarray import DocumentArray - -random_embed_models = { - 'keras': lambda: tf.keras.Sequential( - [tf.keras.layers.Dropout(0.5), tf.keras.layers.BatchNormalization()] - ), - 'pytorch': lambda: torch.nn.Sequential( - torch.nn.Dropout(0.5), torch.nn.BatchNorm1d(128) - ), - 'paddle': lambda: paddle.nn.Sequential( - paddle.nn.Dropout(0.5), paddle.nn.BatchNorm1D(128) - ), -} -cur_dir = os.path.dirname(os.path.abspath(__file__)) -torch.onnx.export( - random_embed_models['pytorch'](), - torch.rand(1, 128), - os.path.join(cur_dir, 'test-net.onnx'), - do_constant_folding=True, # whether to execute constant folding for optimization - input_names=['input'], # the model's input names - output_names=['output'], # the model's output names - dynamic_axes={ - 'input': {0: 'batch_size'}, # variable length axes - 'output': {0: 'batch_size'}, - }, -) - -random_embed_models['onnx'] = lambda: onnxruntime.InferenceSession( - os.path.join(cur_dir, 'test-net.onnx') -) - - -@pytest.mark.parametrize('framework', ['onnx', 'keras', 'pytorch', 'paddle']) -@pytest.mark.parametrize('da', [DocumentArray]) -@pytest.mark.parametrize('N', [2, 1000]) -@pytest.mark.parametrize('batch_size', [1, 256]) 
-@pytest.mark.parametrize('to_numpy', [True, False]) -def test_embedding_on_random_network(framework, da, N, batch_size, to_numpy): - docs = da.empty(N) - docs.blobs = np.random.random([N, 128]).astype(np.float32) - embed_model = random_embed_models[framework]() - docs.embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - - r = docs.embeddings - if hasattr(r, 'numpy'): - r = r.numpy() - embed1 = r.copy() - - # reset - docs.embeddings = np.random.random([N, 128]).astype(np.float32) - - # try it again, it should yield the same result - docs.embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - np.testing.assert_array_almost_equal(docs.embeddings, embed1) - - # reset - docs.embeddings = np.random.random([N, 128]).astype(np.float32) - - # now do this one by one - docs[: int(N / 2)].embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - docs[-int(N / 2) :].embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - np.testing.assert_array_almost_equal(docs.embeddings, embed1) diff --git a/tests/unit/array/mixins/test_empty.py b/tests/unit/array/mixins/test_empty.py deleted file mode 100644 index 480eec09f65..00000000000 --- a/tests/unit/array/mixins/test_empty.py +++ /dev/null @@ -1,8 +0,0 @@ -from docarray import DocumentArray - - -def test_empty_non_zero(): - da = DocumentArray.empty(10) - assert len(da) == 10 - da = DocumentArray.empty() - assert len(da) == 0 diff --git a/tests/unit/array/mixins/test_eval_class.py b/tests/unit/array/mixins/test_eval_class.py deleted file mode 100644 index 0c33d8309e4..00000000000 --- a/tests/unit/array/mixins/test_eval_class.py +++ /dev/null @@ -1,107 +0,0 @@ -import copy - -import numpy as np -import pytest - -from docarray import DocumentArray, Document - - -@pytest.mark.parametrize( - 'metric_fn, kwargs', - [ - ('r_precision', {}), - ('precision_at_k', {}), - ('hit_at_k', {}), - ('average_precision', {}), - ('reciprocal_rank', {}), - ('recall_at_k', {'max_rel': 9}), - ('f1_score_at_k', {'max_rel': 9}), 
- ('ndcg_at_k', {}), - ], -) -def test_eval_mixin_perfect_match(metric_fn, kwargs): - da = DocumentArray.empty(10) - da.embeddings = np.random.random([10, 256]) - da.match(da, exclude_self=True) - r = da.evaluate(da, metric=metric_fn, **kwargs) - assert isinstance(r, float) - assert r == 1.0 - for d in da: - assert d.evaluations[metric_fn].value == 1.0 - - -@pytest.mark.parametrize( - 'metric_fn, kwargs', - [ - ('r_precision', {}), - ('precision_at_k', {}), - ('hit_at_k', {}), - ('average_precision', {}), - ('reciprocal_rank', {}), - ('recall_at_k', {'max_rel': 9}), - ('f1_score_at_k', {'max_rel': 9}), - ('ndcg_at_k', {}), - ], -) -def test_eval_mixin_zero_match(metric_fn, kwargs): - da1 = DocumentArray.empty(10) - da1.embeddings = np.random.random([10, 256]) - da1.match(da1, exclude_self=True) - - da2 = copy.deepcopy(da1) - da2.embeddings = np.random.random([10, 256]) - da2.match(da2, exclude_self=True) - - r = da1.evaluate(da2, metric=metric_fn, **kwargs) - assert isinstance(r, float) - assert r == 1.0 - for d in da1: - d: Document - assert d.evaluations[metric_fn].value == 1.0 - - -def test_diff_len_should_raise(): - da1 = DocumentArray.empty(10) - da2 = DocumentArray.empty(5) - with pytest.raises(ValueError): - da1.evaluate(da2, metric='precision_at_k') - - -def test_diff_hash_fun_should_raise(): - da1 = DocumentArray.empty(10) - da2 = DocumentArray.empty(10) - with pytest.raises(ValueError): - da1.evaluate(da2, metric='precision_at_k') - - -def test_same_hash_same_len_fun_should_work(): - da1 = DocumentArray.empty(10) - da1.embeddings = np.random.random([10, 3]) - da1.match(da1) - da2 = DocumentArray.empty(10) - da2.embeddings = np.random.random([10, 3]) - da2.match(da2) - with pytest.raises(ValueError): - da1.evaluate(da2, metric='precision_at_k') - for d1, d2 in zip(da1, da2): - d1.id = d2.id - - da1.evaluate(da2, metric='precision_at_k') - - -def test_adding_noise(): - da = DocumentArray.empty(10) - - da.embeddings = np.random.random([10, 3]) - da.match(da, 
exclude_self=True) - - da2 = copy.deepcopy(da) - - for d in da2: - d.matches.extend(DocumentArray.empty(10)) - d.matches = d.matches.shuffle() - - assert da2.evaluate(da, metric='precision_at_k', k=10) < 1.0 - - for d in da2: - assert 0.0 < d.evaluations['precision_at_k'].value < 1.0 diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py deleted file mode 100644 index 1a747f041c5..00000000000 --- a/tests/unit/array/mixins/test_getset.py +++ /dev/null @@ -1,156 +0,0 @@ -import numpy as np -import pytest -import scipy.sparse -import tensorflow as tf -import torch -from scipy.sparse import csr_matrix - -from docarray import DocumentArray, Document -from tests import random_docs - -rand_array = np.random.random([10, 3]) - - -def da_and_dam(): - rand_docs = random_docs(100) - da = DocumentArray() - da.extend(rand_docs) - return (da,) - - -@pytest.mark.parametrize( - 'array', - [ - rand_array, - torch.Tensor(rand_array), - tf.constant(rand_array), - csr_matrix(rand_array), - ], -) -def test_set_embeddings_multi_kind(array): - da = DocumentArray([Document() for _ in range(10)]) - da.embeddings = array - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_da_get_embeddings(da): - np.testing.assert_almost_equal(da._get_attributes('embedding'), da.embeddings) - np.testing.assert_almost_equal(da[:, 'embedding'], da.embeddings) - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_embeddings_setter_da(da): - emb = np.random.random((100, 128)) - da.embeddings = emb - np.testing.assert_almost_equal(da.embeddings, emb) - - for x, doc in zip(emb, da): - np.testing.assert_almost_equal(x, doc.embedding) - - da.embeddings = None - if hasattr(da, 'flush'): - da.flush() - assert not da.embeddings - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_embeddings_wrong_len(da): - embeddings = np.ones((2, 10)) - - with pytest.raises(ValueError): - da.embeddings = embeddings - - -@pytest.mark.parametrize('da', da_and_dam()) -def 
test_blobs_getter_da(da): - blobs = np.random.random((100, 10, 10)) - da.blobs = blobs - assert len(da) == 100 - np.testing.assert_almost_equal(da.blobs, blobs) - - da.blobs = None - if hasattr(da, 'flush'): - da.flush() - assert not da.blobs - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_texts_getter_da(da): - assert len(da.texts) == 100 - assert da.texts == da[:, 'text'] - texts = ['text' for _ in range(100)] - da.texts = texts - assert da.texts == texts - - for x, doc in zip(texts, da): - assert x == doc.text - - da.texts = None - if hasattr(da, 'flush'): - da.flush() - - # unfortunately protobuf does not distinguish None and '' on string - # so non-set str field in Pb is '' - assert not da.texts - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_texts_wrong_len(da): - texts = ['hello'] - - with pytest.raises(ValueError): - da.texts = texts - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_blobs_wrong_len(da): - blobs = np.ones((2, 10, 10)) - - with pytest.raises(ValueError): - da.blobs = blobs - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_buffers_getter_setter(da): - with pytest.raises(ValueError): - da.buffers = [b'cc', b'bb', b'aa', b'dd'] - - da.buffers = [b'aa'] * len(da) - assert da.buffers == [b'aa'] * len(da) - - da.buffers = None - if hasattr(da, 'flush'): - da.flush() - - # unfortunately protobuf does not distinguish None and '' on string - # so non-set str field in Pb is '' - assert not da.buffers - - -def test_zero_embeddings(): - a = np.zeros([10, 6]) - da = DocumentArray.empty(10) - - # all zero, dense - da.embeddings = a - np.testing.assert_almost_equal(da.embeddings, a) - for d in da: - assert d.embedding.shape == (6,) - - # all zero, sparse - sp_a = scipy.sparse.coo_matrix(a) - da.embeddings = sp_a - np.testing.assert_almost_equal(da.embeddings.todense(), sp_a.todense()) - for d in da: - # scipy sparse row-vector can only be a (1, m) not squeezible - assert d.embedding.shape == (1, 6) - - # near 
zero, sparse - a = np.random.random([10, 6]) - a[a > 0.1] = 0 - sp_a = scipy.sparse.coo_matrix(a) - da.embeddings = sp_a - np.testing.assert_almost_equal(da.embeddings.todense(), sp_a.todense()) - for d in da: - # scipy sparse row-vector can only be a (1, m) not squeezible - assert d.embedding.shape == (1, 6) diff --git a/tests/unit/array/mixins/test_group.py b/tests/unit/array/mixins/test_group.py deleted file mode 100644 index 90316f3dfe4..00000000000 --- a/tests/unit/array/mixins/test_group.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest - -from docarray import DocumentArray, Document - - -def da_for_batching(): - da = DocumentArray.empty(100) - return (da,) - - -def docarray_for_split(): - da = DocumentArray() - da.append(Document(tags={'category': 'c'})) - da.append(Document(tags={'category': 'c'})) - da.append(Document(tags={'category': 'b'})) - da.append(Document(tags={'category': 'a'})) - da.append(Document(tags={'category': 'a'})) - return (da,) - - -def docarray_for_split_at_zero(): - da = DocumentArray() - da.append(Document(tags={'category': 0.0})) - da.append(Document(tags={'category': 0.0})) - da.append(Document(tags={'category': 1.0})) - da.append(Document(tags={'category': 2.0})) - da.append(Document(tags={'category': 2.0})) - return (da,) - - -def docarray_for_nest_split(): - da = DocumentArray() - da.append(Document(tags={'nest': {'category': 'c'}})) - da.append(Document(tags={'nest': {'category': 'c'}})) - da.append(Document(tags={'nest': {'category': 'b'}})) - da.append(Document(tags={'nest': {'category': 'a'}})) - da.append(Document(tags={'nest': {'category': 'a'}})) - return (da,) - - -@pytest.mark.parametrize('da', docarray_for_split()) -def test_split(da): - rv = da.split_by_tag('category') - assert isinstance(rv, dict) - assert sorted(list(rv.keys())) == ['a', 'b', 'c'] - # assure order is preserved c, b, a - assert list(rv.keys()) == ['c', 'b', 'a'] - # original input c, c, b, a, a - assert len(rv['c']) == 2 - assert len(rv['b']) == 1 - 
assert len(rv['a']) == 2 - rv = da.split_by_tag('random') - assert not rv # wrong tag returns empty dict - - -@pytest.mark.parametrize('da', docarray_for_split_at_zero()) -def test_split_at_zero(da): - rv = da.split_by_tag('category') - assert isinstance(rv, dict) - assert sorted(list(rv.keys())) == [0.0, 1.0, 2.0] - - -@pytest.mark.parametrize('da', docarray_for_nest_split()) -def test_dunder_split(da): - rv = da.split_by_tag('nest__category') - assert isinstance(rv, dict) - assert sorted(list(rv.keys())) == ['a', 'b', 'c'] - # assure order is preserved c, b, a - assert list(rv.keys()) == ['c', 'b', 'a'] - # original input c, c, b, a, a - assert len(rv['c']) == 2 - assert len(rv['b']) == 1 - assert len(rv['a']) == 2 - - assert len(da.split_by_tag('nest__random')) == 1 - - -@pytest.mark.parametrize('da', da_for_batching()) -@pytest.mark.parametrize('batch_size', [1, 5, 100, 200]) -@pytest.mark.parametrize('shuffle', [True, False]) -def test_batching(da, batch_size, shuffle): - all_ids = [] - for v in da.batch(batch_size=batch_size, shuffle=shuffle): - assert len(v) <= batch_size - all_ids.extend(v[:, 'id']) - - if shuffle: - assert all_ids != da[:, 'id'] - else: - assert all_ids == da[:, 'id'] diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py deleted file mode 100644 index 21c5510daff..00000000000 --- a/tests/unit/array/mixins/test_io.py +++ /dev/null @@ -1,107 +0,0 @@ -import os -import uuid - -import numpy as np -import pytest - -from docarray import DocumentArray -from tests import random_docs - - -def da_and_dam(): - da = DocumentArray(random_docs(100)) - return (da,) - - -@pytest.mark.slow -@pytest.mark.parametrize('method', ['json', 'binary']) -@pytest.mark.parametrize('da', da_and_dam()) -def test_document_save_load(method, tmp_path, da): - tmp_file = os.path.join(tmp_path, 'test') - da.save(tmp_file, file_format=method) - da_r = type(da).load(tmp_file, file_format=method) - - assert type(da) is type(da_r) - assert len(da) 
== len(da_r) - for d, d_r in zip(da, da_r): - assert d.id == d_r.id - np.testing.assert_equal(d.embedding, d_r.embedding) - assert d.content == d_r.content - - -@pytest.mark.parametrize('flatten_tags', [True, False]) -@pytest.mark.parametrize('da', da_and_dam()) -def test_da_csv_write(flatten_tags, tmp_path, da): - tmpfile = os.path.join(tmp_path, 'test.csv') - da.save_csv(tmpfile, flatten_tags) - with open(tmpfile) as fp: - assert len([v for v in fp]) == len(da) + 1 - - -@pytest.mark.parametrize('da', [DocumentArray]) -def test_from_ndarray(da): - _da = da.from_ndarray(np.random.random([10, 256])) - assert len(_da) == 10 - - -@pytest.mark.parametrize('da', [DocumentArray]) -def test_from_files(da): - assert len(da.from_files(patterns='*.*', to_dataturi=True, size=1)) == 1 - - -cur_dir = os.path.dirname(os.path.abspath(__file__)) - - -@pytest.mark.parametrize('da', [DocumentArray]) -def test_from_ndjson(da): - with open(os.path.join(cur_dir, 'docs.jsonlines')) as fp: - _da = da.from_ndjson(fp) - assert len(_da) == 2 - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -def test_from_to_pd_dataframe(da_cls): - # simple - assert len(da_cls.from_dataframe(da_cls.empty(2).to_dataframe())) == 2 - - # more complicated - da = da_cls.empty(2) - da.embeddings = [[1, 2, 3], [4, 5, 6]] - da.blobs = [[1, 2], [2, 1]] - da[0].tags = {'hello': 'world'} - da2 = da_cls.from_dataframe(da.to_dataframe()) - assert da2[0].tags == {'hello': 'world'} - assert da2[1].tags == {} - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -def test_from_to_bytes(da_cls): - # simple - assert len(da_cls.load_binary(bytes(da_cls.empty(2)))) == 2 - - # more complicated - da = da_cls.empty(2) - da.embeddings = [[1, 2, 3], [4, 5, 6]] - da.blobs = [[1, 2], [2, 1]] - da[0].tags = {'hello': 'world'} - da2 = da_cls.load_binary(bytes(da)) - assert da2.blobs == [[1, 2], [2, 1]] - assert da2.embeddings == [[1, 2, 3], [4, 5, 6]] - assert da2[0].tags == {'hello': 'world'} - assert da2[1].tags == {} - - 
-@pytest.mark.parametrize('da_cls', [DocumentArray]) -@pytest.mark.parametrize('show_progress', [True, False]) -def test_push_pull_io(da_cls, show_progress): - da1 = da_cls.empty(10) - da1.embeddings = np.random.random([len(da1), 256]) - random_texts = [str(uuid.uuid1()) for _ in da1] - da1.texts = random_texts - - da1.push('myda', show_progress=show_progress) - - da2 = da_cls.pull('myda', show_progress=show_progress) - - assert len(da1) == len(da2) == 10 - assert da1.texts == da2.texts == random_texts diff --git a/tests/unit/array/mixins/test_magic.py b/tests/unit/array/mixins/test_magic.py deleted file mode 100644 index 76ade193d7c..00000000000 --- a/tests/unit/array/mixins/test_magic.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest - -from docarray import DocumentArray - -N = 100 - - -def da_and_dam(): - da = DocumentArray.empty(N) - return (da,) - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_iter_len_bool(da): - j = 0 - for _ in da: - j += 1 - assert j == N - assert j == len(da) - assert da - da.clear() - assert not da - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_repr(da): - assert f'length={N}' in repr(da) - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_iadd(da): - oid = id(da) - dap = DocumentArray.empty(10) - da += dap - assert len(da) == N + len(dap) - nid = id(da) - assert nid == oid - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_add(da): - oid = id(da) - dap = DocumentArray.empty(10) - da = da + dap - assert len(da) == N + len(dap) - nid = id(da) - assert nid != oid diff --git a/tests/unit/array/mixins/test_match.py b/tests/unit/array/mixins/test_match.py deleted file mode 100644 index 43592259593..00000000000 --- a/tests/unit/array/mixins/test_match.py +++ /dev/null @@ -1,517 +0,0 @@ -import copy - -import numpy as np -import paddle -import pytest -import scipy.sparse as sp -import tensorflow as tf -import torch -from scipy.sparse import csr_matrix, bsr_matrix, coo_matrix, csc_matrix -from 
scipy.spatial.distance import cdist as scipy_cdist - -from docarray import Document, DocumentArray - - -@pytest.fixture() -def doc_lists(): - d1 = Document(embedding=np.array([0, 0, 0])) - d2 = Document(embedding=np.array([3, 0, 0])) - d3 = Document(embedding=np.array([1, 0, 0])) - d4 = Document(embedding=np.array([2, 0, 0])) - - d1_m = Document(embedding=np.array([1, 0, 0])) - d2_m = Document(embedding=np.array([2, 0, 0])) - d3_m = Document(embedding=np.array([0, 0, 1])) - d4_m = Document(embedding=np.array([0, 0, 2])) - d5_m = Document(embedding=np.array([0, 0, 3])) - - return [d1, d2, d3, d4], [d1_m, d2_m, d3_m, d4_m, d5_m] - - -@pytest.fixture -def docarrays_for_embedding_distance_computation(doc_lists): - D1, D2 = doc_lists - da1 = DocumentArray(D1) - da2 = DocumentArray(D2) - return da1, da2 - - -@pytest.fixture -def docarrays_for_embedding_distance_computation_sparse(): - d1 = Document(embedding=sp.csr_matrix([0, 0, 0])) - d2 = Document(embedding=sp.csr_matrix([3, 0, 0])) - d3 = Document(embedding=sp.csr_matrix([1, 0, 0])) - d4 = Document(embedding=sp.csr_matrix([2, 0, 0])) - - d1_m = Document(embedding=sp.csr_matrix([1, 0, 0])) - d2_m = Document(embedding=sp.csr_matrix([2, 0, 0])) - d3_m = Document(embedding=sp.csr_matrix([0, 0, 1])) - d4_m = Document(embedding=sp.csr_matrix([0, 0, 2])) - d5_m = Document(embedding=sp.csr_matrix([0, 0, 3])) - - D1 = DocumentArray([d1, d2, d3, d4]) - D2 = DocumentArray([d1_m, d2_m, d3_m, d4_m, d5_m]) - return D1, D2 - - -@pytest.fixture -def embeddings(): - return np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]]) - - -def doc_lists_to_doc_arrays(doc_lists, *args, **kwargs): - doc_list1, doc_list2 = doc_lists - D1 = DocumentArray() - D1.extend(doc_list1) - D2 = DocumentArray() - D2.extend(doc_list2) - return D1, D2 - - -@pytest.mark.parametrize( - 'limit, batch_size', [(1, None), (2, None), (None, None), (1, 1), (1, 2), (2, 1)] -) -@pytest.mark.parametrize('only_id', [True, False]) -def test_matching_retrieves_correct_number( - 
doc_lists, - limit, - batch_size, - tmpdir, - only_id, -): - D1, D2 = doc_lists_to_doc_arrays( - doc_lists, - ) - D1.match( - D2, metric='sqeuclidean', limit=limit, batch_size=batch_size, only_id=only_id - ) - for m in D1[:, 'matches']: - if limit is None: - assert len(m) == len(D2) - else: - assert len(m) == limit - - -@pytest.mark.parametrize('metric', ['sqeuclidean', 'cosine']) -@pytest.mark.parametrize('only_id', [True, False]) -def test_matching_same_results_with_sparse( - docarrays_for_embedding_distance_computation, - docarrays_for_embedding_distance_computation_sparse, - metric, - only_id, -): - D1, D2 = docarrays_for_embedding_distance_computation - D1_sp, D2_sp = docarrays_for_embedding_distance_computation_sparse - - # use match with numpy arrays - D1.match(D2, metric=metric, only_id=only_id) - distances = [] - for m in D1[:, 'matches']: - for d in m: - distances.extend([d.scores[metric].value]) - - # use match with sparse arrays - D1_sp.match(D2_sp, metric=metric, is_sparse=True) - distances_sparse = [] - for m in D1[:, 'matches']: - for d in m: - distances_sparse.extend([d.scores[metric].value]) - - np.testing.assert_equal(distances, distances_sparse) - - -@pytest.mark.parametrize('metric', ['sqeuclidean', 'cosine']) -@pytest.mark.parametrize('only_id', [True, False]) -def test_matching_same_results_with_batch( - docarrays_for_embedding_distance_computation, metric, only_id -): - D1, D2 = docarrays_for_embedding_distance_computation - D1_batch = copy.deepcopy(D1) - D2_batch = copy.deepcopy(D2) - - # use match without batches - D1.match(D2, metric=metric, only_id=only_id) - distances = [] - for m in D1[:, 'matches']: - for d in m: - distances.extend([d.scores[metric].value]) - - # use match with batches - D1_batch.match(D2_batch, metric=metric, batch_size=10) - - distances_batch = [] - for m in D1[:, 'matches']: - for d in m: - distances_batch.extend([d.scores[metric].value]) - - np.testing.assert_equal(distances, distances_batch) - - 
-@pytest.mark.parametrize('metric', ['euclidean', 'cosine']) -@pytest.mark.parametrize('only_id', [True, False]) -def test_matching_scipy_cdist( - docarrays_for_embedding_distance_computation, metric, only_id -): - def scipy_cdist_metric(X, Y, *args): - return scipy_cdist(X, Y, metric=metric) - - D1, D2 = docarrays_for_embedding_distance_computation - D1_scipy = copy.deepcopy(D1) - - # match with our custom metric - D1.match(D2, metric=metric) - distances = [] - for m in D1[:, 'matches']: - for d in m: - distances.extend([d.scores[metric].value]) - - # match with callable cdist function from scipy - D1_scipy.match(D2, metric=scipy_cdist_metric, only_id=only_id) - distances_scipy = [] - for m in D1[:, 'matches']: - for d in m: - distances_scipy.extend([d.scores[metric].value]) - - np.testing.assert_equal(distances, distances_scipy) - - -@pytest.mark.parametrize( - 'normalization, metric', - [ - ((0, 1), 'sqeuclidean'), - (None, 'euclidean'), - ((0, 1), 'euclidean'), - (None, 'cosine'), - ((0, 1), 'cosine'), - ], -) -@pytest.mark.parametrize('use_scipy', [True, False]) -@pytest.mark.parametrize('only_id', [True, False]) -def test_matching_retrieves_closest_matches( - doc_lists, - normalization, - metric, - use_scipy, - only_id, -): - """ - Tests if match.values are returned 'low to high' if normalization is True or 'high to low' otherwise - """ - D1, D2 = doc_lists_to_doc_arrays( - doc_lists, - ) - D1.match( - D2, - metric=metric, - limit=3, - normalization=normalization, - use_scipy=use_scipy, - only_id=only_id, - ) - expected_sorted_values = [ - D1[0].matches[i].scores['sqeuclidean'].value for i in range(3) - ] - if normalization: - assert min(expected_sorted_values) >= 0 - assert max(expected_sorted_values) <= 1 - else: - assert expected_sorted_values == sorted(expected_sorted_values) - - -@pytest.mark.parametrize('buffer_pool_size', [1000, 3]) -@pytest.mark.parametrize('first_memmap', [True, False]) -@pytest.mark.parametrize('second_memmap', [True, False]) 
-@pytest.mark.parametrize('only_id', [True, False]) -def test_2arity_function( - first_memmap, second_memmap, doc_lists, tmpdir, buffer_pool_size, only_id -): - def dotp(x, y, *args): - return np.dot(x, np.transpose(y)) - - D1, D2 = doc_lists_to_doc_arrays( - doc_lists, - tmpdir, - first_memmap, - second_memmap, - buffer_pool_size=buffer_pool_size, - ) - D1.match(D2, metric=dotp, use_scipy=True, only_id=only_id) - - for d in D1: - for m in d.matches: - assert 'dotp' in m.scores - - -@pytest.mark.parametrize('only_id', [True, False]) -def test_match_inclusive(only_id): - """Call match function, while the other :class:`DocumentArray` is itself - or have same :class:`Document`. - """ - # The document array da1 match with itself. - da1 = DocumentArray( - [ - Document(embedding=np.array([1, 2, 3])), - Document(embedding=np.array([1, 0, 1])), - Document(embedding=np.array([1, 1, 2])), - ] - ) - - da1.match(da1, only_id=only_id) - assert len(da1) == 3 - traversed = da1.traverse_flat(traversal_paths='m,mm,mmm') - assert len(traversed) == 9 - # The document array da2 shares same documents with da1 - da2 = DocumentArray([Document(embedding=np.array([4, 1, 3])), da1[0], da1[1]]) - da1.match(da2, only_id=only_id) - assert len(da2) == 3 - traversed = da1.traverse_flat(traversal_paths='m,mm,mmm') - assert len(traversed) == 9 - - -@pytest.mark.parametrize('exclude_self, num_matches', [(True, 1), (False, 2)]) -@pytest.mark.parametrize('only_id', [True, False]) -def test_match_exclude_self(exclude_self, num_matches, only_id): - da1 = DocumentArray( - [ - Document(id='1', embedding=np.array([1, 2])), - Document(id='2', embedding=np.array([3, 4])), - ] - ) - da2 = DocumentArray( - [ - Document(id='1', embedding=np.array([1, 2])), - Document(id='2', embedding=np.array([3, 4])), - ] - ) - da1.match(da2, exclude_self=exclude_self, only_id=only_id) - for d in da1: - assert len(d.matches) == num_matches - - -@pytest.fixture() -def get_pair_document_array(): - da1 = DocumentArray( - [ - 
Document(id='1', embedding=np.array([1, 2])), - Document(id='2', embedding=np.array([3, 4])), - ] - ) - da2 = DocumentArray( - [ - Document(id='1', embedding=np.array([1, 2])), - Document(id='2', embedding=np.array([3, 4])), - Document(id='3', embedding=np.array([4, 5])), - ] - ) - yield da1, da2 - - -@pytest.mark.parametrize( - 'limit, expect_len, exclude_self', - [ - (2, 2, True), - (1, 1, True), - (3, 2, True), - (2, 2, False), - (1, 1, False), - (3, 3, False), - ], -) -def test_match_exclude_self_limit_2( - get_pair_document_array, exclude_self, limit, expect_len -): - da1, da2 = get_pair_document_array - da1.match(da2, exclude_self=exclude_self, limit=limit) - for d in da1: - assert len(d.matches) == expect_len - - -@pytest.mark.parametrize( - 'lhs, rhs', - [ - (DocumentArray(), DocumentArray()), - ( - DocumentArray( - [ - Document(embedding=np.array([3, 4])), - Document(embedding=np.array([4, 5])), - ] - ), - DocumentArray( - [ - Document(embedding=np.array([3, 4])), - Document(embedding=np.array([4, 5])), - ] - ), - ), - ( - DocumentArray(), - DocumentArray( - [ - Document(embedding=np.array([3, 4])), - Document(embedding=np.array([4, 5])), - ] - ), - ), - ( - ( - DocumentArray( - [ - Document(embedding=np.array([3, 4])), - Document(embedding=np.array([4, 5])), - ] - ) - ), - DocumentArray(), - ), - (None, DocumentArray()), - (DocumentArray(), None), - ], -) -def test_match_none(lhs, rhs): - if lhs is not None: - lhs.match(rhs) - if rhs is not None: - rhs.match(lhs) - - -@pytest.fixture() -def get_two_docarray(): - d1 = Document(embedding=np.array([0, 0, 0])) - d1c1 = Document(embedding=np.array([0, 1, 0])) - - d2 = Document(embedding=np.array([1, 0, 0])) - d2c1 = Document(embedding=np.array([1, 1, 0])) - d2c2 = Document(embedding=np.array([1, 0, 1])) - - d3 = Document(embedding=np.array([2, 1, 1])) - d3c1 = Document(embedding=np.array([2, 1, 0])) - d3c2 = Document(embedding=np.array([2, 0, 1])) - d3c3 = Document(embedding=np.array([2, 0, 0])) - - d4 = 
Document(embedding=np.array([3, 1, 1])) - d4c1 = Document(embedding=np.array([3, 1, 0])) - d4c2 = Document(embedding=np.array([3, 0, 1])) - d4c3 = Document(embedding=np.array([3, 0, 0])) - d4c4 = Document(embedding=np.array([3, 1, 1])) - - d1.chunks.extend([d1c1]) - d2.chunks.extend([d2c1, d2c2]) - d3.chunks.extend([d3c1, d3c2, d3c3]) - d4.chunks.extend([d4c1, d4c2, d4c3, d4c4]) - - da1 = DocumentArray([d1, d2]) - da2 = DocumentArray([d3, d4]) - yield da1, da2 - - -def test_match_with_traversal_path(get_two_docarray): - da1, da2 = get_two_docarray - da1.match(da2.traverse_flat('c')) - assert len(da1[0].matches) == len(da2[0].chunks) + len(da2[1].chunks) - - da2.match(da1.traverse_flat('c')) - assert len(da2[0].matches) == len(da1[0].chunks) + len(da1[1].chunks) - - -def test_match_on_two_sides_chunks(get_two_docarray): - da1, da2 = get_two_docarray - da2.traverse_flat('c').match(da1.traverse_flat('c')) - assert len(da2[0].matches) == 0 - assert len(da2[0].chunks[0].matches) == len(da1[0].chunks) + len(da1[1].chunks) - - da1.traverse_flat('c').match(da2.traverse_flat('c')) - assert len(da1[0].matches) == 0 - assert len(da1[0].chunks[0].matches) == len(da2[0].chunks) + len(da2[1].chunks) - - -@pytest.mark.parametrize('exclude_self', [True, False]) -@pytest.mark.parametrize('limit', [1, 2, 3]) -def test_exclude_self_should_keep_limit(limit, exclude_self): - da = DocumentArray( - [ - Document(embedding=np.array([3, 1, 0])), - Document(embedding=np.array([3, 0, 1])), - Document(embedding=np.array([3, 0, 0])), - Document(embedding=np.array([3, 1, 1])), - ] - ) - da.match(da, exclude_self=exclude_self, limit=limit) - for d in da: - assert len(d.matches) == limit - if exclude_self: - for m in d.matches: - assert d.id != m.id - - -@pytest.mark.parametrize('only_id', [True, False]) -def test_only_id(docarrays_for_embedding_distance_computation, only_id): - D1, D2 = docarrays_for_embedding_distance_computation - D1.match(D2, only_id=only_id) - for d in D1: - for m in 
d.matches: - assert (m.embedding is None) == only_id - assert m.id - - -@pytest.mark.parametrize( - 'match_kwargs', - [ - dict(limit=5, normalization=(1, 0), batch_size=10), - dict(normalization=(1, 0), batch_size=10), - dict(normalization=(1, 0)), - dict(), - ], -) -@pytest.mark.parametrize('nnz_ratio', [0.5, 1]) -def test_dense_vs_sparse_match(match_kwargs, nnz_ratio): - N = 100 - D = 256 - sp_embed = np.random.random([N, D]) - sp_embed[sp_embed > nnz_ratio] = 0 - - da1 = DocumentArray.empty(N) - da2 = DocumentArray.empty(N) - - # use sparse embedding - da1.embeddings = sp.coo_matrix(sp_embed) - da1.texts = [str(j) for j in range(N)] - size_sp = sum(d.nbytes for d in da1) - da1.match(da1, **match_kwargs) - - sparse_result = [m.text for m in da1[0].matches] - - # use dense embedding - da2.embeddings = sp_embed - da2.texts = [str(j) for j in range(N)] - size_dense = sum(d.nbytes for d in da2) - da2.match(da2, **match_kwargs) - dense_result = [m.text for m in da2[0].matches] - - assert sparse_result == dense_result - - print( - f'sparse DA: {size_sp} bytes is {size_sp / size_dense * 100:.0f}% of dense DA {size_dense} bytes' - ) - - -def get_ndarrays(): - a = np.random.random([10, 3]) - a[a > 0.5] = 0 - return [ - a, - torch.tensor(a), - tf.constant(a), - paddle.to_tensor(a), - csr_matrix(a), - bsr_matrix(a), - coo_matrix(a), - csc_matrix(a), - ] - - -@pytest.mark.parametrize('ndarray_val', get_ndarrays()) -def test_diff_framework_match(ndarray_val): - da = DocumentArray.empty(10) - da.embeddings = ndarray_val - da.match(da) diff --git a/tests/unit/array/mixins/test_parallel.py b/tests/unit/array/mixins/test_parallel.py deleted file mode 100644 index b05f1f215e6..00000000000 --- a/tests/unit/array/mixins/test_parallel.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -import pytest - -from docarray import DocumentArray, Document - - -def foo(d: Document): - return ( - d.load_uri_to_image_blob() - .set_image_blob_normalization() - .set_image_blob_channel_axis(-1, 0) - 
.set_image_blob_shape((222, 222), 0) - ) - - -def foo_batch(da: DocumentArray): - for d in da: - foo(d) - return da - - -@pytest.mark.skipif( - 'GITHUB_WORKFLOW' in os.environ, - reason='this test somehow fail on Github CI, but it MUST run successfully on local', -) -@pytest.mark.parametrize( - 'da_cls', - [ - DocumentArray, - ], -) -@pytest.mark.parametrize('backend', ['process', 'thread']) -@pytest.mark.parametrize('num_worker', [1, 2, None]) -def test_parallel_map(pytestconfig, da_cls, backend, num_worker): - da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] - - # use a generator - for d in da.map(foo, backend, num_worker=num_worker): - assert d.blob.shape == (3, 222, 222) - - da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] - - # use as list, here the caveat is when using process backend you can not modify thing in-place - list(da.map(foo, backend, num_worker=num_worker)) - if backend == 'thread': - assert da.blobs.shape == (len(da), 3, 222, 222) - else: - assert da.blobs is None - - da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] - da_new = da.apply(foo) - assert da_new.blobs.shape == (len(da_new), 3, 222, 222) - - -@pytest.mark.skipif( - 'GITHUB_WORKFLOW' in os.environ, - reason='this test somehow fail on Github CI, but it MUST run successfully on local', -) -@pytest.mark.parametrize( - 'da_cls', - [ - DocumentArray, - ], -) -@pytest.mark.parametrize('backend', ['thread']) -@pytest.mark.parametrize('num_worker', [1, 2, None]) -@pytest.mark.parametrize('b_size', [1, 2, 256]) -def test_parallel_map_batch(pytestconfig, da_cls, backend, num_worker, b_size): - da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] - - # use a generator - for _da in da.map_batch( - foo_batch, batch_size=b_size, backend=backend, num_worker=num_worker - ): - for d in _da: - assert d.blob.shape == (3, 222, 222) - - da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] - - # use as list, here the caveat is when using 
process backend you can not modify thing in-place - list( - da.map_batch( - foo_batch, batch_size=b_size, backend=backend, num_worker=num_worker - ) - ) - if backend == 'thread': - assert da.blobs.shape == (len(da), 3, 222, 222) - else: - assert da.blobs is None - - da_new = da.apply_batch(foo_batch, batch_size=b_size) - assert da_new.blobs.shape == (len(da_new), 3, 222, 222) - - -@pytest.mark.skipif( - 'GITHUB_WORKFLOW' in os.environ, - reason='this test somehow fail on Github CI, but it MUST run successfully on local', -) -@pytest.mark.parametrize( - 'da_cls', - [ - DocumentArray, - ], -) -def test_map_lambda(pytestconfig, da_cls): - da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] - - for d in da: - assert d.blob is None - - for d in da.map(lambda x: x.load_uri_to_image_blob()): - assert d.blob is not None diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py deleted file mode 100644 index 1ec701244b8..00000000000 --- a/tests/unit/array/mixins/test_plot.py +++ /dev/null @@ -1,72 +0,0 @@ -import json -import os -import random - -import numpy as np -import pytest - -from docarray import DocumentArray, Document - - -def test_sprite_image_generator(pytestconfig, tmpdir): - da = DocumentArray.from_files( - [ - f'{pytestconfig.rootdir}/**/*.png', - f'{pytestconfig.rootdir}/**/*.jpg', - f'{pytestconfig.rootdir}/**/*.jpeg', - ] - ) - da.plot_image_sprites(tmpdir / 'sprint_da.png') - assert os.path.exists(tmpdir / 'sprint_da.png') - - -def da_and_dam(): - embeddings = np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]]) - doc_array = DocumentArray( - [ - Document(embedding=x, tags={'label': random.randint(0, 5)}) - for x in embeddings - ] - ) - - return (doc_array,) - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_plot_embeddings(da): - p = da.plot_embeddings(start_server=False) - assert os.path.exists(p) - assert os.path.exists(os.path.join(p, 'config.json')) - with open(os.path.join(p, 'config.json')) as fp: - config 
= json.load(fp) - assert len(config['embeddings']) == 1 - assert config['embeddings'][0]['tensorShape'] == list(da.embeddings.shape) - - -def test_plot_embeddings_same_path(tmpdir): - da1 = DocumentArray.empty(100) - da1.embeddings = np.random.random([100, 5]) - p1 = da1.plot_embeddings(start_server=False, path=tmpdir) - da2 = DocumentArray.empty(768) - da2.embeddings = np.random.random([768, 5]) - p2 = da2.plot_embeddings(start_server=False, path=tmpdir) - assert p1 == p2 - assert os.path.exists(p1) - with open(os.path.join(p1, 'config.json')) as fp: - config = json.load(fp) - assert len(config['embeddings']) == 2 - - -def test_summary_homo_hetero(): - da = DocumentArray.empty(100) - da._get_attributes() - da.summary() - - da[0].pop('id') - da.summary() - - -def test_empty_get_attributes(): - da = DocumentArray.empty(10) - da[0].pop('id') - print(da[:, 'id']) diff --git a/tests/unit/array/mixins/test_reduce.py b/tests/unit/array/mixins/test_reduce.py deleted file mode 100644 index 0d0851b6c7c..00000000000 --- a/tests/unit/array/mixins/test_reduce.py +++ /dev/null @@ -1,209 +0,0 @@ -from copy import deepcopy - -import numpy as np - -from docarray import DocumentArray, Document - - -def test_reduce_doc(): - doc1 = Document( - text='doc1', - matches=[Document(id='m0'), Document(id='m2')], - chunks=[Document(id='c0'), Document(id='c1')], - ) - - doc2 = Document( - text='doc2', - embedding=np.zeros(3), - matches=[Document(id='m0'), Document(id='m1')], - chunks=[Document(id='c0'), Document(id='c2')], - ) - - DocumentArray._reduce_doc(doc1, doc2) - - assert doc1.text == 'doc1' - assert (doc1.embedding == np.zeros(3)).all() - for i in range(3): - assert f'c{i}' in doc1.chunks - assert f'm{i}' in doc1.matches - - -def test_reduce(): - da1, da2 = ( - DocumentArray( - [ - Document( - id='r0', - text='da1', - matches=[ - Document(id='r0m0'), - Document(id='r0m2'), - Document(id='r0m1'), - ], - ), - Document( - id='r2', - text='da1', - matches=[ - Document(id='r2m0'), - 
Document(id='r2m2'), - Document(id='r2m1'), - ], - ), - ] - ), - DocumentArray( - [ - Document( - id='r0', - text='da2', - matches=[ - Document(id='r0m0'), - Document(id='r0m1'), - Document(id='r0m3'), - ], - ), - Document( - id='r1', - text='da2', - matches=[ - Document(id='r1m0'), - Document(id='r1m1'), - Document(id='r1m2'), - Document(id='r1m3'), - ], - ), - Document( - id='r2', - text='da2', - matches=[ - Document(id='r2m3'), - ], - ), - ] - ), - ) - - da1.reduce(da2) - - for i in range(3): - assert f'r{i}' in da1 - for j in range(4): - assert f'r{i}m{j}' in da1[f'r{i}'].matches - - assert da1['r0'].text == 'da1' - assert da1['r1'].text == 'da2' - assert da1['r2'].text == 'da1' - - -def test_reduce_nested(): - da1, da2 = ( - DocumentArray( - [ - Document( - id='r1', - chunks=[ - Document( - id='c1', - chunks=[Document(id='c1c2')], - matches=[Document(id='c1m2')], - ), - ], - matches=[ - Document( - id='m1', chunks=[Document(id='m1c1'), Document(id='m1c2')] - ), - Document(id='m2'), - ], - ), - ] - ), - DocumentArray( - [ - Document( - id='r1', - chunks=[ - Document( - id='c1', - chunks=[Document(id='c1c1')], - matches=[Document(id='c1m1')], - ), - Document( - id='c2', - chunks=[Document(id='c2c1'), Document(id='c2c2')], - matches=[Document(id='c2m1'), Document(id='c2m2')], - ), - ], - matches=[ - Document( - id='m1', matches=[Document(id='m1m1'), Document(id='m1m2')] - ), - Document( - id='m2', - chunks=[Document(id='m2c1'), Document(id='m2c2')], - matches=[Document(id='m2m1'), Document(id='m2m2')], - ), - ], - ), - ] - ), - ) - - da1.reduce(da2) - for i in range(1, 3): - assert f'c{i}' in da1[0].chunks - assert f'm{i}' in da1[0].matches - for j in range(1, 3): - assert f'c{i}c{j}' in da1[0].chunks[f'c{i}'].chunks - assert f'c{i}m{j}' in da1[0].chunks[f'c{i}'].matches - - assert f'm{i}c{j}' in da1[0].matches[f'm{i}'].chunks - assert f'm{i}m{j}' in da1[0].matches[f'm{i}'].matches - - -def test_reduce_mat(): - docs = DocumentArray([Document(id=f'r{i}') for i in 
range(10)]) - doc_matrix = [deepcopy(docs) for _ in range(10)] - for i, da in enumerate(doc_matrix): - for doc in da: - doc.matches.append(Document(id=str(i))) - - reduced_da = doc_matrix[0].reduce_all(doc_matrix[1:]) - for doc in reduced_da: - for i in range(10): - assert str(i) in doc.matches - - -def test_reduce_data_props(): - da1, da2, da3 = ( - DocumentArray([Document(id='r', text='doc1', chunks=[Document(id='c1')])]), - DocumentArray( - [ - Document( - id='r', - text='doc2', - embedding=np.zeros(3), - chunks=[Document(id='c2')], - ) - ] - ), - DocumentArray( - [ - Document( - id='r', - text='doc3', - tags={'a': 'b'}, - matches=[Document(id='m3')], - ) - ] - ), - ) - da1.reduce_all([da2, da3]) - assert da1[0].text == 'doc1' - assert da1[0].id == 'r' - - # chunks and matches merged, not overridden - assert da1[0].chunks[0].id == 'c1' - assert da1[0].chunks[1].id == 'c2' - assert da1[0].matches[0].id == 'm3' - assert da1[0].tags == {'a': 'b'} diff --git a/tests/unit/array/mixins/test_sample.py b/tests/unit/array/mixins/test_sample.py deleted file mode 100644 index ddb15dc4fe1..00000000000 --- a/tests/unit/array/mixins/test_sample.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest - -from docarray import DocumentArray - - -def da_and_dam(N): - da = DocumentArray.empty(N) - return (da,) - - -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_sample(da): - sampled = da.sample(1) - assert len(sampled) == 1 - sampled = da.sample(5) - assert len(sampled) == 5 - assert isinstance(sampled, DocumentArray) - with pytest.raises(ValueError): - da.sample(101) # can not sample with k greater than lenth of document array. 
- - -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_sample_with_seed(da): - sampled_1 = da.sample(5, seed=1) - sampled_2 = da.sample(5, seed=1) - sampled_3 = da.sample(5, seed=2) - assert len(sampled_1) == len(sampled_2) == len(sampled_3) == 5 - assert sampled_1 == sampled_2 - assert sampled_1 != sampled_3 - - -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_shuffle(da): - shuffled = da.shuffle() - assert len(shuffled) == len(da) - assert isinstance(shuffled, DocumentArray) - ids_before_shuffle = [d.id for d in da] - ids_after_shuffle = [d.id for d in shuffled] - assert ids_before_shuffle != ids_after_shuffle - assert sorted(ids_before_shuffle) == sorted(ids_after_shuffle) - - -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_shuffle_with_seed(da): - shuffled_1 = da.shuffle(seed=1) - shuffled_2 = da.shuffle(seed=1) - shuffled_3 = da.shuffle(seed=2) - assert len(shuffled_1) == len(shuffled_2) == len(shuffled_3) == len(da) - assert shuffled_1 == shuffled_2 - assert shuffled_1 != shuffled_3 diff --git a/tests/unit/array/mixins/test_text.py b/tests/unit/array/mixins/test_text.py deleted file mode 100644 index 07d8d2d343a..00000000000 --- a/tests/unit/array/mixins/test_text.py +++ /dev/null @@ -1,98 +0,0 @@ -import numpy as np -import pytest - -from docarray import DocumentArray, Document - - -def da_and_dam(): - da = DocumentArray( - [ - Document(text='hello'), - Document(text='hello world'), - Document(text='goodbye world!'), - ] - ) - - return (da,) - - -@pytest.mark.parametrize('min_freq', [1, 2, 3]) -@pytest.mark.parametrize('da', da_and_dam()) -def test_da_vocabulary(da, min_freq): - vocab = da.get_vocabulary(min_freq) - if min_freq <= 1: - assert set(vocab.values()) == {2, 3, 4} # 0,1 are reserved - assert set(vocab.keys()) == {'hello', 'world', 'goodbye'} - elif min_freq == 2: - assert set(vocab.values()) == {2, 3} # 0,1 are reserved - assert set(vocab.keys()) == {'hello', 'world'} - elif min_freq == 3: - assert not vocab.values() - 
assert not vocab.keys() - - -@pytest.mark.parametrize('test_docs', da_and_dam()) -def test_da_text_to_blob_non_max_len(test_docs): - vocab = test_docs.get_vocabulary() - for d in test_docs: - d.convert_text_to_blob(vocab) - np.testing.assert_array_equal(test_docs[0].blob, [2]) - np.testing.assert_array_equal(test_docs[1].blob, [2, 3]) - np.testing.assert_array_equal(test_docs[2].blob, [4, 3]) - for d in test_docs: - d.convert_blob_to_text(vocab) - - assert test_docs[0].text == 'hello' - assert test_docs[1].text == 'hello world' - assert test_docs[2].text == 'goodbye world' - - -@pytest.mark.parametrize('test_docs', da_and_dam()) -def test_da_text_to_blob_max_len_3(test_docs): - vocab = test_docs.get_vocabulary() - for d in test_docs: - d.convert_text_to_blob(vocab, max_length=3) - np.testing.assert_array_equal(test_docs[0].blob, [0, 0, 2]) - np.testing.assert_array_equal(test_docs[1].blob, [0, 2, 3]) - np.testing.assert_array_equal(test_docs[2].blob, [0, 4, 3]) - for d in test_docs: - d.convert_blob_to_text(vocab) - - assert test_docs[0].text == 'hello' - assert test_docs[1].text == 'hello world' - assert test_docs[2].text == 'goodbye world' - - -@pytest.mark.parametrize('test_docs', da_and_dam()) -def test_da_text_to_blob_max_len_1(test_docs): - vocab = test_docs.get_vocabulary() - for d in test_docs: - d.convert_text_to_blob(vocab, max_length=1) - np.testing.assert_array_equal(test_docs[0].blob, [2]) - np.testing.assert_array_equal(test_docs[1].blob, [3]) - np.testing.assert_array_equal(test_docs[2].blob, [3]) - for d in test_docs: - d.convert_blob_to_text(vocab) - - assert test_docs[0].text == 'hello' - assert test_docs[1].text == 'world' - assert test_docs[2].text == 'world' - - -@pytest.mark.parametrize('da', da_and_dam()) -def test_convert_text_blob_random_text(da): - texts = ['a short phrase', 'word', 'this is a much longer sentence'] - da.clear() - da.extend(Document(text=t) for t in texts) - vocab = da.get_vocabulary() - - # encoding - for d in da: - 
d.convert_text_to_blob(vocab, max_length=10) - - # decoding - for d in da: - d.convert_blob_to_text(vocab) - - assert texts - assert da.texts == texts diff --git a/tests/unit/array/mixins/test_traverse.py b/tests/unit/array/mixins/test_traverse.py deleted file mode 100644 index 627b7de1db0..00000000000 --- a/tests/unit/array/mixins/test_traverse.py +++ /dev/null @@ -1,368 +0,0 @@ -import itertools -import types - -import numpy as np -import pytest - -from docarray import Document, DocumentArray -from tests import random_docs - -# some random prime number for sanity check -num_docs = 7 -num_chunks_per_doc = 11 -num_matches_per_doc = 3 -num_matches_per_chunk = 5 - - -@pytest.fixture -def doc_req(): - """Build a dummy request that has docs""" - ds = list(random_docs(num_docs, num_chunks_per_doc)) - # add some random matches - for d in ds: - for _ in range(num_matches_per_doc): - d.matches.append(Document(content='hello')) - for c in d.chunks: - for _ in range(num_matches_per_chunk): - c.matches.append(Document(content='world')) - yield DocumentArray(ds) - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_type(doc_req, filter_fn): - ds = doc_req.traverse('r', filter_fn=filter_fn) - assert isinstance(ds, types.GeneratorType) - assert isinstance(list(ds)[0], DocumentArray) - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_root(doc_req, filter_fn): - ds = list(doc_req.traverse('r', filter_fn=filter_fn)) - assert len(ds) == 1 - assert len(ds[0]) == num_docs - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse('c', filter_fn=filter_fn)) - assert len(ds) == num_docs - assert len(ds[0]) == num_chunks_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_root_plus_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse('c,r', filter_fn=filter_fn)) - assert len(ds) == 
num_docs + 1 - assert len(ds[0]) == num_chunks_per_doc - assert len(ds[-1]) == num_docs - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_chunk_plus_root(doc_req, filter_fn): - ds = list(doc_req.traverse('r,c', filter_fn=filter_fn)) - assert len(ds) == 1 + num_docs - assert len(ds[-1]) == num_chunks_per_doc - assert len(ds[0]) == num_docs - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_match(doc_req, filter_fn): - ds = list(doc_req.traverse('m', filter_fn=filter_fn)) - assert len(ds) == num_docs - assert len(ds[0]) == num_matches_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_match_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse('cm', filter_fn=filter_fn)) - assert len(ds) == num_docs * num_chunks_per_doc - assert len(ds[0]) == num_matches_per_chunk - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_root_match_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse('r,c,m,cm', filter_fn=filter_fn)) - assert len(ds) == 1 + num_docs + num_docs + num_docs * num_chunks_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flatten_embedding(doc_req, filter_fn): - flattened_results = doc_req.traverse_flat('r,c', filter_fn=filter_fn) - ds = flattened_results.embeddings - assert ds.shape == (num_docs + num_chunks_per_doc * num_docs, 10) - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flatten_root(doc_req, filter_fn): - ds = list(doc_req.traverse_flat('r', filter_fn=filter_fn)) - assert len(ds) == num_docs - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flatten_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat('c', filter_fn=filter_fn)) - assert len(ds) == num_docs * num_chunks_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def 
test_traverse_flatten_root_plus_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat('c,r', filter_fn=filter_fn)) - assert len(ds) == num_docs + num_docs * num_chunks_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flatten_match(doc_req, filter_fn): - ds = list(doc_req.traverse_flat('m', filter_fn=filter_fn)) - assert len(ds) == num_docs * num_matches_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flatten_match_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat('cm', filter_fn=filter_fn)) - assert len(ds) == num_docs * num_chunks_per_doc * num_matches_per_chunk - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flatten_root_match_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat('r,c,m,cm', filter_fn=filter_fn)) - assert ( - len(ds) - == num_docs - + num_chunks_per_doc * num_docs - + num_matches_per_doc * num_docs - + num_docs * num_chunks_per_doc * num_matches_per_chunk - ) - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flattened_per_path_embedding(doc_req, filter_fn): - flattened_results = list(doc_req.traverse_flat_per_path('r,c', filter_fn=filter_fn)) - ds = flattened_results[0].embeddings - assert ds.shape == (num_docs, 10) - - ds = flattened_results[1].embeddings - assert ds.shape == (num_docs * num_chunks_per_doc, 10) - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flattened_per_path_root(doc_req, filter_fn): - ds = list(doc_req.traverse_flat_per_path('r', filter_fn=filter_fn)) - assert len(ds[0]) == num_docs - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flattened_per_path_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat_per_path('c', filter_fn=filter_fn)) - assert len(ds[0]) == num_docs * num_chunks_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: 
True), None]) -def test_traverse_flattened_per_path_root_plus_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat_per_path('c,r', filter_fn=filter_fn)) - assert len(ds[0]) == num_docs * num_chunks_per_doc - assert len(ds[1]) == num_docs - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flattened_per_path_match(doc_req, filter_fn): - ds = list(doc_req.traverse_flat_per_path('m', filter_fn=filter_fn)) - assert len(ds[0]) == num_docs * num_matches_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flattened_per_path_match_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat_per_path('cm', filter_fn=filter_fn)) - assert len(ds[0]) == num_docs * num_chunks_per_doc * num_matches_per_chunk - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_flattened_per_path_root_match_chunk(doc_req, filter_fn): - ds = list(doc_req.traverse_flat_per_path('r,c,m,cm', filter_fn=filter_fn)) - assert len(ds[0]) == num_docs - assert len(ds[1]) == num_chunks_per_doc * num_docs - assert len(ds[2]) == num_matches_per_doc * num_docs - assert len(ds[3]) == num_docs * num_chunks_per_doc * num_matches_per_chunk - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_docuset_traverse_over_iterator_HACKY(filter_fn): - # HACKY USAGE DO NOT RECOMMEND: can also traverse over "runtime"-documentarray - ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse( - 'r', filter_fn=filter_fn - ) - assert len(list(list(ds)[0])) == num_docs - - ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse( - 'c', filter_fn=filter_fn - ) - ds = list(ds) - assert len(ds) == num_docs - assert len(ds[0]) == num_chunks_per_doc - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_docuset_traverse_over_iterator_CAVEAT(filter_fn): - # HACKY USAGE's CAVEAT: but it can not iterate over an iterator twice - ds = 
DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse( - 'r,c', filter_fn=filter_fn - ) - # note that random_docs is a generator and can be only used once, - # therefore whoever comes first wil get iterated, and then it becomes empty - assert len(list(ds)) == 1 + num_docs - - ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse( - 'c,r', filter_fn=filter_fn - ) - assert len(list(ds)) == num_docs + 1 - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_doc_iter_method(filter_fn): - ds = list(random_docs(10)) - - for d in DocumentArray(ds): - assert d.text == 'hello world' - - for d in DocumentArray(ds).traverse_flat('c,r', filter_fn=filter_fn): - d.text = 'modified' - - for d in DocumentArray(ds): - assert d.text == 'modified' - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_matcharray(filter_fn): - doc = Document( - matches=[ - Document(id=f'm{i}', chunks=[Document(id=f'm{i}c{j}') for j in range(3)]) - for i in range(3) - ] - ) - flat_docs = doc.matches.traverse_flat('r,c', filter_fn=filter_fn) - assert isinstance(flat_docs, DocumentArray) - assert len(flat_docs) == 12 - - -@pytest.mark.parametrize('filter_fn', [(lambda d: True), None]) -def test_traverse_chunkarray(filter_fn): - doc = Document( - chunks=[ - Document(id=f'c{i}', matches=[Document(id=f'c{i}m{j}') for j in range(3)]) - for i in range(3) - ] - ) - flat_docs = doc.chunks.traverse_flat('r,m', filter_fn=filter_fn) - assert isinstance(flat_docs, DocumentArray) - assert len(flat_docs) == 12 - - -@pytest.mark.parametrize('use_dam', [True, False]) -@pytest.mark.parametrize( - ('filter_fn', 'docs_len'), - [ - (lambda d: False, 0), - (lambda d: d.text == 'hello', num_docs * num_matches_per_doc), - ( - lambda d: d.text == 'world', - num_docs * num_chunks_per_doc * num_matches_per_chunk, - ), - ( - lambda d: True, - num_docs - + num_docs * num_chunks_per_doc - + num_docs * num_matches_per_doc - + num_docs * 
num_chunks_per_doc * num_matches_per_chunk, - ), - ( - None, - num_docs - + num_docs * num_matches_per_doc - + num_docs * num_chunks_per_doc - + num_docs * num_chunks_per_doc * num_matches_per_chunk, - ), - ], -) -def test_filter_fn_traverse_flat(filter_fn, docs_len, doc_req, use_dam, tmp_path): - docs = doc_req - ds = list(docs.traverse_flat('r,c,m,cm', filter_fn=filter_fn)) - assert len(ds) == docs_len - assert all(isinstance(d, Document) for d in ds) - - -@pytest.mark.parametrize('use_dam', [True, False]) -@pytest.mark.parametrize( - ('filter_fn', 'docs_len'), - [ - (lambda d: False, [0, 0, 0, 0]), - (lambda d: d.text == 'hello', [0, 0, num_docs * num_matches_per_doc, 0]), - ( - lambda d: d.text == 'world', - [0, 0, 0, num_docs * num_chunks_per_doc * num_matches_per_chunk], - ), - ( - lambda d: True, - [ - num_docs, - num_docs * num_chunks_per_doc, - num_docs * num_matches_per_doc, - num_docs * num_chunks_per_doc * num_matches_per_chunk, - ], - ), - ( - None, - [ - num_docs, - num_docs * num_chunks_per_doc, - num_docs * num_matches_per_doc, - num_docs * num_chunks_per_doc * num_matches_per_chunk, - ], - ), - ], -) -def test_filter_fn_traverse_flat_per_path( - filter_fn, doc_req, docs_len, use_dam, tmp_path -): - docs = doc_req - ds = list(docs.traverse_flat_per_path('r,c,m,cm', filter_fn=filter_fn)) - assert len(ds) == 4 - for seq, length in zip(ds, docs_len): - assert isinstance(seq, DocumentArray) - assert len(list(seq)) == length - - -def test_traversal_path(): - da = DocumentArray([Document() for _ in range(6)]) - assert len(da) == 6 - - da.traverse_flat('r') - - -def test_traverse_flat_root_itself(): - da = DocumentArray([Document() for _ in range(100)]) - res = da.traverse_flat('r') - assert id(res) == id(da) - - -def da_and_dam(N): - da = DocumentArray(random_docs(N)) - return (da,) - - -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_flatten(da): - daf = da.flatten() - assert len(daf) == 600 - assert isinstance(daf, DocumentArray) - assert 
len(set(d.id for d in daf)) == 600 - - # flattened DA can not be flattened again - daf = daf.flatten() - assert len(daf) == 600 - - -def test_flatten_no_copy(): - da = da_and_dam(100)[0] - daf = da.flatten() - new_text = 'hi i changed it!' - daf[53].text = new_text - assert da[daf[53].id].text == new_text diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py deleted file mode 100644 index d87fdd80af9..00000000000 --- a/tests/unit/array/test_advance_indexing.py +++ /dev/null @@ -1,221 +0,0 @@ -import numpy as np -import pytest - -from docarray import DocumentArray, Document - - -@pytest.fixture -def docarray100(): - yield DocumentArray(Document(text=j) for j in range(100)) - - -def test_getter_int_str(docarray100): - # getter - assert docarray100[99].text == 99 - assert docarray100[np.int(99)].text == 99 - assert docarray100[-1].text == 99 - assert docarray100[0].text == 0 - # string index - assert docarray100[docarray100[0].id].text == 0 - assert docarray100[docarray100[99].id].text == 99 - assert docarray100[docarray100[-1].id].text == 99 - - with pytest.raises(IndexError): - docarray100[100] - - with pytest.raises(KeyError): - docarray100['adsad'] - - -def test_setter_int_str(docarray100): - # setter - docarray100[99] = Document(text='hello') - docarray100[0] = Document(text='world') - - assert docarray100[99].text == 'hello' - assert docarray100[-1].text == 'hello' - assert docarray100[0].text == 'world' - - docarray100[docarray100[2].id] = Document(text='doc2') - # string index - assert docarray100[docarray100[2].id].text == 'doc2' - - -def test_del_int_str(docarray100): - zero_id = docarray100[0].id - del docarray100[0] - assert len(docarray100) == 99 - assert zero_id not in docarray100 - - new_zero_id = docarray100[0].id - new_doc_zero = docarray100[0] - del docarray100[new_zero_id] - assert len(docarray100) == 98 - assert zero_id not in docarray100 - assert new_doc_zero not in docarray100 - - -def 
test_slice(docarray100): - # getter - assert len(docarray100[1:5]) == 4 - assert len(docarray100[1:100:5]) == 20 # 1 to 100, sep with 5 - - # setter - with pytest.raises(TypeError, match='can only assign an iterable'): - docarray100[1:5] = Document(text='repl') - - docarray100[1:5] = [Document(text=f'repl{j}') for j in range(4)] - for d in docarray100[1:5]: - assert d.text.startswith('repl') - assert len(docarray100) == 100 - - # del - zero_doc = docarray100[0] - twenty_doc = docarray100[20] - del docarray100[0:20] - assert len(docarray100) == 80 - assert zero_doc not in docarray100 - assert twenty_doc in docarray100 - - -def test_sequence_bool_index(docarray100): - # getter - mask = [True, False] * 50 - assert len(docarray100[mask]) == 50 - assert len(docarray100[[True, False]]) == 1 - - # setter - mask = [True, False] * 50 - docarray100[mask] = [Document(text=f'repl{j}') for j in range(50)] - - for idx, d in enumerate(docarray100): - if idx % 2 == 0: - # got replaced - assert d.text.startswith('repl') - else: - assert isinstance(d.text, int) - - # del - del docarray100[mask] - assert len(docarray100) == 50 - - del docarray100[mask] - assert len(docarray100) == 25 - - -@pytest.mark.parametrize('nparray', [lambda x: x, np.array, tuple]) -def test_sequence_int(docarray100, nparray): - # getter - idx = nparray([1, 3, 5, 7, -1, -2]) - assert len(docarray100[idx]) == len(idx) - - # setter - docarray100[idx] = [Document(text='repl') for _ in range(len(idx))] - for _id in idx: - assert docarray100[_id].text == 'repl' - - # del - idx = [-3, -4, -5, 9, 10, 11] - del docarray100[idx] - assert len(docarray100) == 100 - len(idx) - - -def test_sequence_str(docarray100): - # getter - idx = [d.id for d in docarray100[1, 3, 5, 7, -1, -2]] - - assert len(docarray100[idx]) == len(idx) - assert len(docarray100[tuple(idx)]) == len(idx) - - # setter - docarray100[idx] = [Document(text='repl') for _ in range(len(idx))] - idx = [d.id for d in docarray100[1, 3, 5, 7, -1, -2]] - for _id 
in idx: - assert docarray100[_id].text == 'repl' - - # del - idx = [d.id for d in docarray100[-3, -4, -5, 9, 10, 11]] - del docarray100[idx] - assert len(docarray100) == 100 - len(idx) - - -def test_docarray_list_tuple(docarray100): - assert isinstance(docarray100[99, 98], DocumentArray) - assert len(docarray100[99, 98]) == 2 - - -def test_path_syntax_indexing(): - da = DocumentArray().empty(3) - for d in da: - d.chunks = DocumentArray.empty(5) - d.matches = DocumentArray.empty(7) - for c in d.chunks: - c.chunks = DocumentArray.empty(3) - assert len(da['@c']) == 3 * 5 - assert len(da['@c:1']) == 3 - assert len(da['@c-1:']) == 3 - assert len(da['@c1']) == 3 - assert len(da['@c-2:']) == 3 * 2 - assert len(da['@c1:3']) == 3 * 2 - assert len(da['@c1:3c']) == (3 * 2) * 3 - assert len(da['@c1:3,c1:3c']) == (3 * 2) + (3 * 2) * 3 - assert len(da['@c 1:3 , c 1:3 c']) == (3 * 2) + (3 * 2) * 3 - assert len(da['@cc']) == 3 * 5 * 3 - assert len(da['@cc,m']) == 3 * 5 * 3 + 3 * 7 - assert len(da['@r:1cc,m']) == 1 * 5 * 3 + 3 * 7 - - -def test_attribute_indexing(): - da = DocumentArray.empty(10) - for v in da[:, 'id']: - assert v - da[:, 'mime_type'] = [f'type {j}' for j in range(10)] - for v in da[:, 'mime_type']: - assert v - del da[:, 'mime_type'] - for v in da[:, 'mime_type']: - assert not v - - da[:, ['text', 'mime_type']] = [ - [f'hello {j}' for j in range(10)], - [f'type {j}' for j in range(10)], - ] - da.summary() - - for v in da[:, ['mime_type', 'text']]: - for vv in v: - assert vv - - -def test_blob_attribute_selector(): - import scipy.sparse - - sp_embed = np.random.random([3, 10]) - sp_embed[sp_embed > 0.1] = 0 - sp_embed = scipy.sparse.coo_matrix(sp_embed) - - da = DocumentArray.empty(3) - - da[:, 'embedding'] = sp_embed - - assert da[:, 'embedding'].shape == (3, 10) - - for d in da: - assert d.embedding.shape == (1, 10) - - v1, v2 = da[:, ['embedding', 'id']] - assert isinstance(v1, scipy.sparse.coo_matrix) - assert isinstance(v2, list) - - v1, v2 = da[:, ['id', 
'embedding']] - assert isinstance(v2, scipy.sparse.coo_matrix) - assert isinstance(v1, list) - - -def test_advance_selector_mixed(): - da = DocumentArray.empty(10) - da.embeddings = np.random.random([10, 3]) - da.match(da, exclude_self=True) - - assert len(da[:, ('id', 'embedding', 'matches')]) == 3 - assert len(da[:, ('id', 'embedding', 'matches')][0]) == 10 diff --git a/tests/unit/array/test_construct.py b/tests/unit/array/test_construct.py deleted file mode 100644 index 302a6359c2c..00000000000 --- a/tests/unit/array/test_construct.py +++ /dev/null @@ -1,62 +0,0 @@ -import pytest - -from docarray import Document, DocumentArray - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -def test_construct_docarray(da_cls): - da = da_cls() - assert len(da) == 0 - - da = da_cls(Document()) - assert len(da) == 1 - - da = da_cls([Document(), Document()]) - assert len(da) == 2 - - da = da_cls((Document(), Document())) - assert len(da) == 2 - - da = da_cls((Document() for _ in range(10))) - assert len(da) == 10 - - da1 = da_cls(da) - assert len(da1) == 10 - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -@pytest.mark.parametrize('is_copy', [True, False]) -def test_docarray_copy_singleton(da_cls, is_copy): - d = Document() - da = da_cls(d, copy=is_copy) - d.id = 'hello' - if is_copy: - assert da[0].id != 'hello' - else: - assert da[0].id == 'hello' - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -@pytest.mark.parametrize('is_copy', [True, False]) -def test_docarray_copy_da(da_cls, is_copy): - d1 = Document() - d2 = Document() - da = da_cls([d1, d2], copy=is_copy) - d1.id = 'hello' - if is_copy: - assert da[0].id != 'hello' - else: - assert da[0].id == 'hello' - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -@pytest.mark.parametrize('is_copy', [True, False]) -def test_docarray_copy_list(da_cls, is_copy): - d1 = Document() - d2 = Document() - da = da_cls([d1, d2], copy=is_copy) - d1.id = 'hello' - if is_copy: - assert da[0].id != 'hello' - else: - 
assert da[0].id == 'hello' diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py deleted file mode 100644 index 2ac85ca9364..00000000000 --- a/tests/unit/array/test_from_to_bytes.py +++ /dev/null @@ -1,89 +0,0 @@ -import numpy as np -import pytest -import tensorflow as tf -import torch -from scipy.sparse import csr_matrix, coo_matrix, bsr_matrix, csc_matrix - -from docarray import DocumentArray -from docarray.math.ndarray import to_numpy_array -from tests import random_docs - - -def get_ndarrays_for_ravel(): - a = np.random.random([100, 3]) - a[a > 0.5] = 0 - return [ - (a, False), - (torch.tensor(a), False), - (tf.constant(a), False), - (torch.tensor(a).to_sparse(), True), - # (tf.sparse.from_dense(a), True), - (csr_matrix(a), True), - (bsr_matrix(a), True), - (coo_matrix(a), True), - (csc_matrix(a), True), - ] - - -@pytest.mark.parametrize('ndarray_val, is_sparse', get_ndarrays_for_ravel()) -@pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) -@pytest.mark.parametrize( - 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] -) -@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) -def test_to_from_bytes(target_da, protocol, compress, ndarray_val, is_sparse): - bstr = target_da.to_bytes(protocol=protocol, compress=compress) - print(protocol, compress, len(bstr)) - da2 = DocumentArray.from_bytes(bstr, protocol=protocol, compress=compress) - assert len(da2) == len(target_da) - - target_da.embeddings = ndarray_val - target_da.blobs = ndarray_val - bstr = target_da.to_bytes(protocol=protocol, compress=compress) - print(protocol, compress, len(bstr)) - da2 = DocumentArray.from_bytes(bstr, protocol=protocol, compress=compress) - assert len(da2) == len(target_da) - - np.testing.assert_almost_equal( - to_numpy_array(target_da.embeddings), to_numpy_array(da2.embeddings) - ) - np.testing.assert_almost_equal( - to_numpy_array(target_da.blobs), 
to_numpy_array(da2.blobs) - ) - - -@pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) -@pytest.mark.parametrize( - 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] -) -@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) -def test_save_bytes(target_da, protocol, compress, tmpfile): - target_da.save_binary(tmpfile, protocol=protocol, compress=compress) - target_da.save_binary(str(tmpfile), protocol=protocol, compress=compress) - - with open(tmpfile, 'wb') as fp: - target_da.save_binary(fp, protocol=protocol, compress=compress) - - DocumentArray.load_binary(tmpfile, protocol=protocol, compress=compress) - DocumentArray.load_binary(str(tmpfile), protocol=protocol, compress=compress) - with open(tmpfile, 'rb') as fp: - DocumentArray.load_binary(fp, protocol=protocol, compress=compress) - - -@pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) -def test_from_to_protobuf(target_da): - DocumentArray.from_protobuf(target_da.to_protobuf()) - - -@pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) -def test_from_to_safe_list(target_da): - DocumentArray.from_list(target_da.to_list()) - - -@pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) -@pytest.mark.parametrize('show_progress', [True, False]) -def test_push_pull_show_progress(show_progress, protocol): - da = DocumentArray.empty(1000) - r = da.to_bytes(_show_progress=show_progress, protocol=protocol) - da_r = DocumentArray.from_bytes(r, _show_progress=show_progress, protocol=protocol) - assert da == da_r diff --git a/tests/unit/array/test_ravel_unravel.py b/tests/unit/array/test_ravel_unravel.py deleted file mode 100644 index 6ba872294f7..00000000000 --- a/tests/unit/array/test_ravel_unravel.py +++ /dev/null @@ -1,96 +0,0 @@ -import numpy as np -import paddle -import pytest -import tensorflow as tf - -from docarray.math.ndarray import to_numpy_array -import torch -from 
scipy.sparse import csr_matrix, coo_matrix, bsr_matrix, csc_matrix - -from docarray import DocumentArray, Document - - -def get_ndarrays_for_ravel(): - a = np.random.random([10, 3]) - a[a > 0.5] = 0 - return [ - (a, False), - (torch.tensor(a), False), - (tf.constant(a), False), - (paddle.to_tensor(a), False), - (torch.tensor(a).to_sparse(), True), - # (tf.sparse.from_dense(a), True), - (csr_matrix(a), True), - (bsr_matrix(a), True), - (coo_matrix(a), True), - (csc_matrix(a), True), - ] - - -@pytest.mark.parametrize('ndarray_val, is_sparse', get_ndarrays_for_ravel()) -@pytest.mark.parametrize('attr', ['embeddings', 'blobs']) -def test_ravel_embeddings_blobs(ndarray_val, attr, is_sparse): - da = DocumentArray.empty(10) - setattr(da, attr, ndarray_val) - ndav = getattr(da, attr) - - # test read/getter - assert type(ndav) is type(ndarray_val) - - if is_sparse: - if hasattr(ndav, 'todense'): - ndav = (ndav.todense(),) - ndarray_val = ndarray_val.todense() - if hasattr(ndav, 'to_dense'): - ndav = (ndav.to_dense(),) - ndarray_val = ndarray_val.to_dense() - if isinstance(ndav, tf.SparseTensor): - ndav = tf.sparse.to_dense(ndav) - ndarray_val = tf.sparse.to_dense(ndarray_val) - - if isinstance(ndav, tuple): - ndav = ndav[0] - if hasattr(ndav, 'numpy'): - ndav = ndav.numpy() - ndarray_val = ndarray_val.numpy() - - np.testing.assert_almost_equal(ndav, ndarray_val) - - -@pytest.mark.parametrize('sparse_cls', [csr_matrix, csc_matrix, bsr_matrix, coo_matrix]) -def test_bsr_coo_unravel(sparse_cls): - a = np.random.random([10, 72]) - a[a > 0.5] = 0 - - da = DocumentArray.empty(10) - for d, a_row in zip(da, a): - d.embedding = sparse_cls(a_row) - - np.testing.assert_almost_equal(a, da.embeddings.todense()) - - -def get_ndarrays(): - a = np.random.random([10, 3]) - a[a > 0.5] = 0 - return [ - (a, False), - (torch.tensor(a), False), - (tf.constant(a), False), - (paddle.to_tensor(a), False), - (torch.tensor(a).to_sparse(), True), - (tf.sparse.from_dense(a), True), - (csr_matrix(a), 
True), - (bsr_matrix(a), True), - (coo_matrix(a), True), - (csc_matrix(a), True), - ] - - -@pytest.mark.parametrize('ndarray_val, is_sparse', get_ndarrays()) -@pytest.mark.parametrize('attr', ['embedding', 'blob']) -def test_ndarray_force_numpy(ndarray_val, attr, is_sparse): - d = Document() - setattr(d, attr, ndarray_val) - ndav = to_numpy_array(getattr(d, attr)) - assert isinstance(ndav, np.ndarray) - assert ndav.shape == (10, 3) diff --git a/tests/unit/array/test_sequence.py b/tests/unit/array/test_sequence.py deleted file mode 100644 index 6ba6679f936..00000000000 --- a/tests/unit/array/test_sequence.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - -from docarray import Document, DocumentArray - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -def test_insert(da_cls): - da = da_cls() - assert not len(da) - da.insert(0, Document(text='hello')) - da.insert(0, Document(text='world')) - assert len(da) == 2 - assert da[0].text == 'world' - assert da[1].text == 'hello' - - -@pytest.mark.parametrize('da_cls', [DocumentArray]) -def test_append_extend(da_cls): - da = da_cls() - da.append(Document()) - da.append(Document()) - assert len(da) == 2 - da.extend([Document(), Document()]) - assert len(da) == 4 diff --git a/tests/unit/document/__init__.py b/tests/unit/document/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/unit/document/test_converters.py b/tests/unit/document/test_converters.py deleted file mode 100644 index 37923f84e1e..00000000000 --- a/tests/unit/document/test_converters.py +++ /dev/null @@ -1,236 +0,0 @@ -import os - -import numpy as np -import pytest - -from docarray import Document -from docarray.document.generators import from_files -from docarray.helper import __windows__ - -cur_dir = os.path.dirname(os.path.abspath(__file__)) - - -def test_video_convert_pipe(pytestconfig, tmpdir): - num_d = 0 - fname = str(tmpdir / f'tmp{num_d}.mp4') - d = Document(uri=os.path.join(cur_dir, 'toydata/mov_bbb.mp4')) - 
d.load_uri_to_video_blob() - d.save_video_blob_to_file(fname) - assert os.path.exists(fname) - - -def test_audio_convert_pipe(pytestconfig, tmpdir): - num_d = 0 - for d in from_files(f'{cur_dir}/toydata/*.wav'): - fname = str(tmpdir / f'tmp{num_d}.wav') - d.load_uri_to_audio_blob() - d.blob = d.blob[::-1] - d.save_audio_blob_to_file(fname) - assert os.path.exists(fname) - num_d += 1 - assert num_d - - -def test_image_convert_pipe(pytestconfig): - for d in from_files(f'{pytestconfig.rootdir}/.github/**/*.png'): - ( - d.load_uri_to_image_blob() - .convert_uri_to_datauri() - .set_image_blob_shape((64, 64)) - .set_image_blob_normalization() - .set_image_blob_channel_axis(-1, 0) - ) - assert d.blob.shape == (3, 64, 64) - assert d.uri - - -def test_uri_to_blob(): - doc = Document(uri=os.path.join(cur_dir, 'toydata/test.png')) - doc.load_uri_to_image_blob() - assert isinstance(doc.blob, np.ndarray) - assert doc.blob.shape == (85, 152, 3) # h,w,c - assert doc.mime_type == 'image/png' - - -def test_datauri_to_blob(): - doc = Document(uri=os.path.join(cur_dir, 'toydata/test.png')) - doc.convert_uri_to_datauri() - assert not doc.blob - assert doc.mime_type == 'image/png' - - -def test_buffer_to_blob(): - doc = Document(uri=os.path.join(cur_dir, 'toydata/test.png')) - doc.load_uri_to_buffer() - doc.convert_buffer_to_image_blob() - assert isinstance(doc.blob, np.ndarray) - assert doc.mime_type == 'image/png' - assert doc.blob.shape == (85, 152, 3) # h,w,c - - -def test_convert_buffer_to_blob(): - rand_state = np.random.RandomState(0) - array = rand_state.random([10, 10]) - doc = Document(content=array.tobytes()) - assert doc.content_type == 'buffer' - intialiazed_buffer = doc.buffer - - doc.convert_buffer_to_blob() - assert doc.content_type == 'blob' - converted_buffer_in_one_of = doc.buffer - assert intialiazed_buffer != converted_buffer_in_one_of - np.testing.assert_almost_equal(doc.content.reshape([10, 10]), array) - - -@pytest.mark.parametrize('shape, channel_axis', [((3, 
32, 32), 0), ((32, 32, 3), -1)]) -def test_image_normalize(shape, channel_axis): - doc = Document(content=np.random.randint(0, 255, shape, dtype=np.uint8)) - doc.set_image_blob_normalization(channel_axis=channel_axis) - assert doc.blob.ndim == 3 - assert doc.blob.shape == shape - assert doc.blob.dtype == np.float32 - - -@pytest.mark.parametrize( - 'arr_size, channel_axis, height, width', - [ - ([32, 28, 3], -1, 32, 28), # h, w, c (rgb) - ([3, 32, 28], 0, 32, 28), # c, h, w (rgb) - ([1, 32, 28], 0, 32, 28), # c, h, w, (greyscale) - ([32, 28, 1], -1, 32, 28), # h, w, c, (greyscale) - ], -) -def test_convert_image_blob_to_uri(arr_size, channel_axis, width, height): - doc = Document(content=np.random.randint(0, 255, arr_size)) - assert doc.blob.any() - assert not doc.uri - doc.set_image_blob_shape(channel_axis=channel_axis, shape=(width, height)) - - doc.convert_image_blob_to_uri(channel_axis=channel_axis) - assert doc.uri.startswith('data:image/png;base64,') - assert doc.mime_type == 'image/png' - assert doc.blob.any() # assure after conversion blob still exist. 
- - -@pytest.mark.xfail( - condition=__windows__, reason='x-python is not detected on windows CI' -) -@pytest.mark.parametrize( - 'uri, mimetype', - [ - (__file__, 'text/x-python'), - ('http://google.com/index.html', 'text/html'), - ('https://google.com/index.html', 'text/html'), - ], -) -def test_convert_uri_to_buffer(uri, mimetype): - d = Document(uri=uri) - assert not d.buffer - d.load_uri_to_buffer() - assert d.buffer - assert d.mime_type == mimetype - - -@pytest.mark.parametrize( - 'converter', ['convert_buffer_to_datauri', 'convert_content_to_datauri'] -) -def test_convert_buffer_to_uri(converter): - d = Document(content=open(__file__).read().encode(), mime_type='text/x-python') - assert d.buffer - getattr(d, converter)() - assert d.uri.startswith('data:text/x-python;') - - -@pytest.mark.parametrize( - 'converter', ['convert_text_to_datauri', 'convert_content_to_datauri'] -) -def test_convert_text_to_uri(converter): - d = Document(content=open(__file__).read()) - assert d.text - getattr(d, converter)() - assert d.uri.startswith('data:text/plain;') - - -@pytest.mark.xfail( - condition=__windows__, reason='x-python is not detected on windows CI' -) -@pytest.mark.parametrize( - 'uri, mimetype', - [ - pytest.param( - __file__, - 'text/x-python', - marks=pytest.mark.xfail( - condition=__windows__, reason='x-python is not detected on windows CI' - ), - ), - ('http://google.com/index.html', 'text/html'), - ('https://google.com/index.html', 'text/html'), - ], -) -def test_convert_uri_to_text(uri, mimetype): - doc = Document(uri=uri, mime_type=mimetype) - doc.load_uri_to_text() - if mimetype == 'text/html': - assert '' in doc.text - elif mimetype == 'text/x-python': - text_from_file = open(__file__).read() - assert doc.text == text_from_file - - -def test_convert_text_to_uri_and_back(): - text_from_file = open(__file__).read() - doc = Document(content=text_from_file, mime_type='text/x-python') - assert doc.text - assert doc.mime_type == 'text/x-python' - 
doc.convert_text_to_datauri() - doc.load_uri_to_text() - assert doc.mime_type == 'text/plain' - assert doc.text == text_from_file - - -def test_convert_text_diff_encoding(tmpfile): - otext = 'testä' - text = otext.encode('iso8859') - with open(tmpfile, 'wb') as fp: - fp.write(text) - with pytest.raises(UnicodeDecodeError): - d = Document(uri=str(tmpfile)).load_uri_to_text() - - d = Document(uri=str(tmpfile)).load_uri_to_text(charset='iso8859') - assert d.text == otext - - with open(tmpfile, 'w', encoding='iso8859') as fp: - fp.write(otext) - with pytest.raises(UnicodeDecodeError): - d = Document(uri=str(tmpfile)).load_uri_to_text() - - d = Document(uri=str(tmpfile)).load_uri_to_text(charset='iso8859') - assert d.text == otext - - -def test_convert_content_to_uri(): - d = Document(content=np.random.random([10, 10])) - with pytest.raises(NotImplementedError): - d.convert_content_to_datauri() - - -@pytest.mark.parametrize( - 'uri, mimetype', - [ - (__file__, 'text/x-python'), - ('http://google.com/index.html', 'text/html'), - ('https://google.com/index.html', 'text/html'), - ], -) -def test_convert_uri_to_data_uri(uri, mimetype): - doc = Document(uri=uri, mime_type=mimetype) - doc.convert_uri_to_datauri() - assert doc.uri.startswith(f'data:{mimetype}') - assert doc.mime_type == mimetype - - -def test_glb_converters(): - doc = Document(uri=os.path.join(cur_dir, 'toydata/test.glb')) - doc.load_uri_to_point_cloud_blob(2000) - assert doc.blob.shape == (2000, 3) diff --git a/tests/unit/document/test_docdata.py b/tests/unit/document/test_docdata.py deleted file mode 100644 index ff39b24bb68..00000000000 --- a/tests/unit/document/test_docdata.py +++ /dev/null @@ -1,263 +0,0 @@ -import numpy as np -import pytest - -from docarray import Document, DocumentArray -from docarray.array.chunk import ChunkArray -from docarray.array.match import MatchArray -from docarray.score import NamedScore - - -@pytest.mark.parametrize('init_args', [None, dict(id=123), Document()]) 
-@pytest.mark.parametrize('copy', [True, False]) -def test_construct_doc(init_args, copy): - Document(init_args, copy) - - -def test_doc_hash_identical(): - d1 = Document(text='hello') - d2 = Document(text='hello') - assert hash(d1) != hash(d2) - assert d1 != d2 - d1.id = d2.id - assert hash(d1) == hash(d2) - assert d1 == d2 - - -def test_doc_hash_complicate_content(): - d1 = Document(text='hello', embedding=np.array([1, 2, 3]), id=1) - d2 = Document(text='hello', embedding=np.array([1, 2, 3]), id=1) - assert d1 == d2 - assert hash(d1) == hash(d2) - - -def test_pop_field(): - d1 = Document(text='hello', embedding=np.array([1, 2, 3]), id=1) - assert d1.non_empty_fields == ('id', 'mime_type', 'text', 'embedding') - d1.pop('text') - assert d1.non_empty_fields == ('id', 'mime_type', 'embedding') - d1.pop('id', 'embedding', 'mime_type') - assert d1.non_empty_fields == tuple() - - d1.pop('foobar') - with pytest.raises(AttributeError): - assert d1.foobar - - -def test_clear_fields(): - d1 = Document(text='hello', embedding=np.array([1, 2, 3]), id=1) - d1.clear() - assert d1.non_empty_fields == tuple() - - -def test_exclusive_content(): - d = Document(text='hello') - assert d.content_type == 'text' - d.buffer = b'123' - assert d.buffer - assert not d.text - assert not d.blob - assert d.content_type == 'buffer' - d.blob = [1, 2, 3] - assert d.blob - assert not d.buffer - assert not d.text - assert d.content_type == 'blob' - d.text = 'hello' - assert d.text - assert not d.buffer - assert not d.blob - assert d.content_type == 'text' - - -def test_content_setter(): - d = Document() - assert not d.content_type - d.content = 'hello' - assert d.content_type == 'text' - d.content = None - assert not d.content_type - - -def test_chunks_matches_setter(): - d = Document(chunks=[Document()], matches=[Document(), Document()]) - assert len(d.chunks) == 1 - assert len(d.matches) == 2 - assert isinstance(d.chunks, DocumentArray) - assert isinstance(d.chunks, ChunkArray) - assert 
isinstance(d.matches, DocumentArray) - assert isinstance(d.matches, MatchArray) - - -def test_empty_doc_chunks_matches(): - assert isinstance(Document().chunks, DocumentArray) - assert isinstance(Document().matches, DocumentArray) - assert isinstance(Document().matches, MatchArray) - assert isinstance(Document().chunks, ChunkArray) - - d = Document() - d.chunks.append(Document()) - assert isinstance(d.chunks, ChunkArray) - - d.chunks = [Document(), Document()] - assert isinstance(d.chunks, ChunkArray) - - -def test_chunk_match_increase_granularity(): - d = Document() - d.chunks.append(Document()) - assert d.chunks[0].granularity == 1 - assert id(d.chunks.reference_doc) == id(d) - d.matches.append(Document()) - assert d.matches[0].adjacency == 1 - assert id(d.matches.reference_doc) == id(d) - - d = d.chunks[0] - d.chunks.append(Document()) - assert d.chunks[0].granularity == 2 - assert id(d.chunks.reference_doc) == id(d) - - d.matches.append(Document()) - assert d.matches[0].adjacency == 1 - assert id(d.matches.reference_doc) == id(d) - - -def test_offset(): - d1 = Document(offset=1.0) - d2 = Document() - d2.offset = 1.0 - assert d1.offset == d2.offset == 1.0 - - -def test_exclusive_content_2(): - d = Document(text='hello', buffer=b'sda') - assert len(d.non_empty_fields) == 3 - d.content = b'sda' - assert d.content == b'sda' - assert 'buffer' in d.non_empty_fields - d = Document(content='hello') - assert d.content_type == 'text' - d = Document(content=b'hello') - assert d.content_type == 'buffer' - d = Document(content=[1, 2, 3]) - assert d.content_type == 'blob' - - -def test_get_attr_values(): - d = Document( - **{ - 'id': '123', - 'text': 'document', - 'feature1': 121, - 'name': 'name', - 'tags': {'id': 'identity', 'a': 'b', 'c': 'd', 'e': [0, 1, {'f': 'g'}]}, - } - ) - d.scores['metric'] = NamedScore(value=42) - - required_keys = [ - 'id', - 'text', - 'tags__name', - 'tags__feature1', - 'scores__metric__value', - 'tags__c', - 'tags__id', - 'tags__e__2__f', - ] - 
res = d._get_attributes(*required_keys) - assert len(res) == len(required_keys) - assert res[required_keys.index('id')] == '123' - assert res[required_keys.index('tags__feature1')] == 121 - assert res[required_keys.index('tags__name')] == 'name' - assert res[required_keys.index('text')] == 'document' - assert res[required_keys.index('tags__c')] == 'd' - assert res[required_keys.index('tags__id')] == 'identity' - assert res[required_keys.index('scores__metric__value')] == 42 - assert res[required_keys.index('tags__e__2__f')] == 'g' - - required_keys_2 = ['tags', 'text'] - res2 = d._get_attributes(*required_keys_2) - assert len(res2) == 2 - assert res2[required_keys_2.index('text')] == 'document' - assert res2[required_keys_2.index('tags')] == d.tags - - d = Document({'id': '123', 'tags': {'outterkey': {'innerkey': 'real_value'}}}) - required_keys_3 = ['tags__outterkey__innerkey'] - res3 = d._get_attributes(*required_keys_3) - assert res3 == 'real_value' - - d = Document(content=np.array([1, 2, 3])) - res4 = np.stack(d._get_attributes(*['blob'])) - np.testing.assert_equal(res4, np.array([1, 2, 3])) - - -def test_set_get_mime(): - a = Document() - a.mime_type = 'jpg' - assert a.mime_type == 'image/jpeg' - b = Document() - b.mime_type = 'jpeg' - assert b.mime_type == 'image/jpeg' - c = Document() - c.mime_type = '.jpg' - assert c.mime_type == 'image/jpeg' - - -def test_doc_content(): - d = Document() - assert d.content is None - d.text = 'abc' - assert d.content == 'abc' - c = np.random.random([10, 10]) - d.blob = c - np.testing.assert_equal(d.content, c) - d.buffer = b'123' - assert d.buffer == b'123' - - -def test_dict_constructor(): - - d1 = Document( - uri='https://jina.ai', mime_type='text/plain', granularity=1, adjacency=3 - ) - - d2 = Document( - dict(uri='https://jina.ai', mime_type='text/plain', granularity=1, adjacency=3) - ) - - d3 = Document( - { - 'uri': 'https://jina.ai', - 'mime_type': 'text/plain', - 'granularity': 1, - 'adjacency': 3, - } - ) - - 
assert d1 != d2 - d1.id = None - d2.id = None - d3.id = None - assert d1 == d2 == d3 - - -def test_unknown_fields_behavior(): - d = Document(hello='world') - assert d.tags == {'hello': 'world'} - - d = Document(hello='world', unknown_fields_handler='drop') - assert d.tags == {} - - with pytest.raises(AttributeError): - d = Document(hello='world', unknown_fields_handler='raise') - - -def test_content_setter_as_proxy(): - d = Document(content='hello') - assert d.content == 'hello' - - assert 'content' not in d.non_empty_fields - assert 'text' in d.non_empty_fields - d.content = [1, 2, 3] - assert 'blob' in d.non_empty_fields - assert 'text' not in d.non_empty_fields diff --git a/tests/unit/document/test_feature_hashing.py b/tests/unit/document/test_feature_hashing.py deleted file mode 100644 index edd81932709..00000000000 --- a/tests/unit/document/test_feature_hashing.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from docarray import DocumentArray -from docarray.math.ndarray import to_numpy_array - - -@pytest.mark.parametrize('n_dim', [2, 4, 100]) -@pytest.mark.parametrize('sparse', [True, False]) -@pytest.mark.parametrize('metric', ['jaccard', 'cosine']) -def test_feature_hashing(n_dim, sparse, metric): - da = DocumentArray.empty(3) - da.texts = ['hello world', 'world, bye', 'hello bye'] - da.apply(lambda d: d.embed_feature_hashing(n_dim=n_dim, sparse=sparse)) - assert da.embeddings.shape == (3, n_dim) - da.embeddings = to_numpy_array(da.embeddings) - da.match(da, metric=metric, use_scipy=True) - result = da['@m', ('id', f'scores__{metric}__value')] - assert len(result) == 2 - assert result[1][0] == 0.0 - assert result[1][1] > 0.0 diff --git a/tests/unit/document/test_io.py b/tests/unit/document/test_io.py deleted file mode 100644 index 4f226d6b954..00000000000 --- a/tests/unit/document/test_io.py +++ /dev/null @@ -1,306 +0,0 @@ -import os -import shutil - -import numpy as np -import pytest - -from docarray import Document, DocumentArray -from 
docarray.document.generators import ( - from_files, - from_ndarray, - from_lines, - from_csv, - from_huggingface_datasets, -) - -cur_dir = os.path.dirname(os.path.abspath(__file__)) - - -@pytest.fixture(scope='function') -def filepath(tmpdir): - input_filepath = os.path.join(tmpdir, 'input_file.csv') - with open(input_filepath, 'w') as input_file: - input_file.writelines(["1\n", "2\n", "3\n"]) - return input_filepath - - -@pytest.fixture(scope='function') -def dataset_configs(): - config = { - 'adversarial': { - 'dataset_path': 'adversarial_qa', - 'name': 'adversarialQA', - 'split': 'test', - }, - 'tweet_eval': { - 'dataset_path': 'tweet_eval', - 'name': 'emoji', - 'split': 'train+test', - }, - } - return config - - -def test_input_lines_with_filepath(filepath): - result = list(from_lines(filepath=filepath, size=2)) - assert len(result) == 2 - assert isinstance(result[0], Document) - - -def test_input_csv_from_file(): - with open(os.path.join(cur_dir, 'toydata/docs.csv')) as fp: - result = list(from_csv(fp)) - assert len(result) == 2 - assert isinstance(result[0], Document) - assert result[0].tags['source'] == 'testsrc' - - -def test_input_csv_from_lines(): - with open(os.path.join(cur_dir, 'toydata/docs.csv')) as fp: - result = list(from_lines(fp, line_format='csv')) - assert len(result) == 2 - assert isinstance(result[0], Document) - assert result[0].tags['source'] == 'testsrc' - - -def test_input_csv_from_lines_field_resolver(): - with open(os.path.join(cur_dir, 'toydata/docs.csv')) as fp: - result = list( - from_lines(fp, line_format='csv', field_resolver={'question': 'text'}) - ) - assert len(result) == 2 - assert isinstance(result[0], Document) - assert result[0].tags['source'] == 'testsrc' - assert not result[0].uri - assert result[0].text - - -@pytest.mark.parametrize( - 'da', - [ - DocumentArray, - ], -) -def test_input_csv_from_strings(da): - result = da.from_csv(os.path.join(cur_dir, 'toydata/docs.csv')) - assert len(result) == 2 - assert 
isinstance(result[0], Document) - assert result[0].tags['source'] == 'testsrc' - - -def test_input_lines_with_empty_filepath_and_lines(): - with pytest.raises(ValueError): - lines = from_lines(lines=None, filepath=None) - for _ in lines: - pass - - -def test_input_lines_with_jsonlines_docs(): - result = list(from_lines(filepath=os.path.join(cur_dir, 'toydata/docs.jsonlines'))) - assert len(result) == 2 - assert result[0].text == "a" - assert result[1].text == "b" - - -@pytest.mark.parametrize( - 'size, sampling_rate', - [ - (None, None), - (1, None), - (None, 0.5), - ], -) -@pytest.mark.parametrize( - 'da', - [ - DocumentArray, - ], -) -def test_input_lines_with_jsonlines_file(size, sampling_rate, da): - result = da.from_lines( - filepath=os.path.join(cur_dir, 'toydata/docs.jsonlines'), - size=size, - sampling_rate=sampling_rate, - ) - - assert len(result) == size if size is not None else 2 - if sampling_rate is None: - assert result[0].text == "a" - if size is None: - assert result[1].text == "b" - - -@pytest.mark.parametrize( - 'size, sampling_rate', - [ - (None, None), - (1, None), - (None, 0.5), - ], -) -def test_input_lines_with_jsonslines(size, sampling_rate): - with open(os.path.join(cur_dir, 'toydata/docs.jsonlines')) as fp: - lines = fp.readlines() - result = list( - from_lines( - lines=lines, line_format='json', size=size, sampling_rate=sampling_rate - ) - ) - assert len(result) == size if size is not None else 2 - if sampling_rate is None: - assert result[0].text == "a" - if size is None: - assert result[1].text == "b" - - -def test_input_lines_with_jsonlines_docs_groundtruth(): - result = list( - from_lines(filepath=os.path.join(cur_dir, 'toydata/docs_groundtruth.jsonlines')) - ) - assert len(result) == 2 - assert result[0][0].text == "a" - assert result[0][1].text == "b" - assert result[1][0].text == "c" - assert result[1][1].text == "d" - - -@pytest.mark.parametrize( - 'size, sampling_rate', - [ - (None, None), - (1, None), - (None, 0.5), - ], -) -def 
test_input_huggingface_datasets_from_path(dataset_configs, size, sampling_rate): - result = list( - from_huggingface_datasets( - dataset_configs['adversarial']['dataset_path'], - size=size, - name=dataset_configs['adversarial']['name'], - sampling_rate=sampling_rate, - split=dataset_configs['adversarial']['split'], - ) - ) - - if size is not None: - assert len(result) == size - - assert isinstance(result[0], Document) - - -def test_input_huggingface_datasets_with_tweet_dataset(dataset_configs): - result = list( - from_huggingface_datasets( - dataset_configs['tweet_eval']['dataset_path'], - name=dataset_configs['tweet_eval']['name'], - split=dataset_configs['tweet_eval']['split'], - ) - ) - assert isinstance(result[0], Document) - assert result[0].text - - -@pytest.mark.parametrize( - 'da', - [ - DocumentArray, - ], -) -def test_input_huggingface_datasets_from_csv_file(dataset_configs, da): - field_resolver = {'question': 'text'} - result = da.from_huggingface_datasets( - 'csv', - field_resolver=field_resolver, - data_files=os.path.join(cur_dir, 'toydata/docs.csv'), - split='train', - ) - - assert len(result) == 2 - assert isinstance(result[0], Document) - assert result[0].text == 'What are the symptoms?' 
- assert result[0].tags['source'] == 'testsrc' - - -@pytest.mark.parametrize( - 'da', - [ - DocumentArray, - ], -) -def test_input_huggingface_datasets_with_field_resolver(dataset_configs, da): - field_resolver = {'question': 'text'} - result = da.from_huggingface_datasets( - dataset_configs['adversarial']['dataset_path'], - field_resolver=field_resolver, - name=dataset_configs['adversarial']['name'], - split=dataset_configs['adversarial']['split'], - ) - - assert isinstance(result[0], Document) - assert result[0].text - assert 'title' in result[0].tags - - -def test_input_huggingface_datasets_with_filter_fields(dataset_configs): - field_resolver = {'question': 'text'} - result = list( - from_huggingface_datasets( - dataset_configs['adversarial']['dataset_path'], - field_resolver=field_resolver, - filter_fields=True, - name=dataset_configs['adversarial']['name'], - split=dataset_configs['adversarial']['split'], - ) - ) - assert isinstance(result[0], Document) - assert result[0].text - assert not 'title' in result[0].tags - - -def test_input_huggingface_datasets_with_no_split(dataset_configs): - with pytest.raises(ValueError): - result = from_huggingface_datasets( - dataset_configs['adversarial']['dataset_path'], - name=dataset_configs['adversarial']['name'], - ) - for _ in result: - pass - - -def test_input_huggingface_datasets_with_filter_fields_and_no_resolver(dataset_configs): - with pytest.raises(ValueError): - result = from_huggingface_datasets( - dataset_configs['adversarial']['dataset_path'], - name=dataset_configs['adversarial']['name'], - filter_fields=True, - ) - for _ in result: - pass - - -@pytest.mark.parametrize( - 'patterns, recursive, size, sampling_rate, read_mode', - [ - ('*.*', True, None, None, None), - ('*.*', False, None, None, None), - ('*.*', True, 2, None, None), - ('*.*', True, 2, None, 'rb'), - ('*.*', True, None, 0.5, None), - ], -) -def test_input_files(patterns, recursive, size, sampling_rate, read_mode): - DocumentArray( - from_files( 
- patterns=patterns, - recursive=recursive, - size=size, - sampling_rate=sampling_rate, - read_mode=read_mode, - ) - ) - - -def test_from_files_with_uri(): - for d in from_files(patterns='*.*', to_dataturi=True, size=10): - assert d.uri.startswith('data:') diff --git a/tests/unit/document/test_namedscore.py b/tests/unit/document/test_namedscore.py deleted file mode 100644 index c128a2d014a..00000000000 --- a/tests/unit/document/test_namedscore.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest - -from docarray.score import NamedScore - - -@pytest.mark.parametrize( - 'init_args', [None, dict(value=123, description='hello'), NamedScore()] -) -@pytest.mark.parametrize('copy', [True, False]) -def test_construct_ns(init_args, copy): - NamedScore(init_args, copy) diff --git a/tests/unit/document/test_ndarray.py b/tests/unit/document/test_ndarray.py deleted file mode 100644 index 91bb76d1410..00000000000 --- a/tests/unit/document/test_ndarray.py +++ /dev/null @@ -1,35 +0,0 @@ -import numpy as np -import paddle -import pytest -import tensorflow as tf -import torch -from scipy.sparse import csr_matrix, coo_matrix, bsr_matrix, csc_matrix - -from docarray import Document - - -def get_ndarrays(): - a = np.random.random([10, 3]) - a[a > 0.5] = 0 - return [ - (a, False), - (torch.tensor(a), False), - (tf.constant(a), False), - (paddle.to_tensor(a), False), - (torch.tensor(a).to_sparse(), True), - (tf.sparse.from_dense(a), True), - (csr_matrix(a), True), - (bsr_matrix(a), True), - (coo_matrix(a), True), - (csc_matrix(a), True), - ] - - -@pytest.mark.parametrize('ndarray_val, is_sparse', get_ndarrays()) -@pytest.mark.parametrize('attr', ['embedding', 'blob']) -def test_ndarray_force_numpy(ndarray_val, attr, is_sparse): - d = Document() - setattr(d, attr, ndarray_val) - assert type(getattr(Document.from_protobuf(d.to_protobuf()), attr)) is type( - ndarray_val - ) diff --git a/tests/unit/document/test_pickle.py b/tests/unit/document/test_pickle.py deleted file mode 100644 index 
10df0ad29e8..00000000000 --- a/tests/unit/document/test_pickle.py +++ /dev/null @@ -1,35 +0,0 @@ -import pickle - -import pytest - -from docarray import Document -from docarray.document.data import DocumentData -from docarray.base import BaseDCType -from tests import random_docs - - -@pytest.mark.parametrize('cls', BaseDCType.__subclasses__()) -def test_pickle_dump_load(cls): - r = pickle.loads(pickle.dumps(cls())) - isinstance(r, cls) - - -def test_pickle_dump_load_real_doc(): - for d in random_docs(10): - dr = pickle.loads(pickle.dumps(d)) - assert dr == d - assert dr.embedding is not None - assert len(dr.chunks) == len(d.chunks) - - -def test_pickle_rely_on_data_class_and_document_class(): - # TODO (Han): This is not really a designed behavior, but atm I see no harm - # of having it, and no real usecases that against it. - - d = Document() - d.id = 'hello' - setattr(d, 'foo', 'bar') - assert getattr(d, 'foo') == 'bar' - r_d = Document.from_bytes(d.to_bytes(protocol='pickle')) - assert r_d.id == d.id - assert getattr(r_d, 'foo') == 'bar' diff --git a/tests/unit/document/test_porting.py b/tests/unit/document/test_porting.py deleted file mode 100644 index e90b927fc67..00000000000 --- a/tests/unit/document/test_porting.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest - -from docarray import Document, DocumentArray -from tests import random_docs - - -@pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) -@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) -def test_to_from_bytes(protocol, compress): - d = Document(embedding=[1, 2, 3, 4, 5], text='hello') - bstr = d.to_bytes(protocol=protocol, compress=compress) - print(protocol, compress, len(bstr)) - d2 = Document.from_bytes(bstr, protocol=protocol, compress=compress) - assert d2.non_empty_fields == d.non_empty_fields - - -@pytest.mark.parametrize('target', [DocumentArray.empty(10), random_docs(10)]) -def test_dict_json(target): - for d in target: - d1 = 
Document.from_dict(d.to_dict()) - d2 = Document.from_json(d.to_json()) - assert d1 == d2 diff --git a/tests/unit/document/test_protobuf.py b/tests/unit/document/test_protobuf.py deleted file mode 100644 index d844e12aeac..00000000000 --- a/tests/unit/document/test_protobuf.py +++ /dev/null @@ -1,34 +0,0 @@ -import numpy as np -import pytest - -from docarray import Document -from docarray.score import NamedScore - - -@pytest.mark.parametrize( - 'doc', - [ - Document(tags={'hello': 'world', 'sad': {'nest': 123}, 'hello12': 1.2}), - Document(scores={'hello': NamedScore(value=1.0, description='hello')}), - Document(location=[1.0, 2.0, 3.0]), - Document(chunks=[Document()], matches=[Document(), Document()]), - ], -) -def test_to_from_protobuf(doc): - docr = Document.from_protobuf(doc.to_protobuf()) - assert docr == doc - - -def test_to_protobuf(): - with pytest.raises(TypeError): - Document(text='hello', embedding=np.array([1, 2, 3]), id=1).to_protobuf() - - with pytest.raises(AttributeError): - Document(tags=1).to_protobuf() - - assert ( - Document(text='hello', embedding=np.array([1, 2, 3])).to_protobuf().text - == 'hello' - ) - assert Document(tags={'hello': 'world'}).to_protobuf().tags - assert len(Document(chunks=[Document(), Document()]).to_protobuf().chunks) == 2 diff --git a/tests/unit/document/test_repr_str.py b/tests/unit/document/test_repr_str.py deleted file mode 100644 index d793440aec4..00000000000 --- a/tests/unit/document/test_repr_str.py +++ /dev/null @@ -1,37 +0,0 @@ -import pytest - -from docarray import Document -from docarray.array.chunk import ChunkArray -from docarray.array.match import MatchArray -from docarray.score import NamedScore - - -@pytest.mark.parametrize( - 'obj', - [ - Document(), - NamedScore(), - MatchArray([Document()], Document()), - ChunkArray([Document()], Document()), - ], -) -def test_builtin_str_repr_no_content(obj): - print(obj) - print(f'{obj!r}') - - -@pytest.mark.parametrize( - 'obj', - [ - Document(content='123', 
chunks=[Document(content='abc')]), - NamedScore( - op_name='operation', - value=10.0, - ref_id='10' * 16, - description='score description', - ), - ], -) -def test_builtin_str_repr_has_content(obj): - print(obj) - print(f'{obj!r}') diff --git a/tests/unit/document/test_summary.py b/tests/unit/document/test_summary.py deleted file mode 100644 index 6516dbc50c2..00000000000 --- a/tests/unit/document/test_summary.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - -from docarray import Document - -cur_dir = os.path.dirname(os.path.abspath(__file__)) - - -def test_single_doc_summary(): - # empty doc - Document().summary() - # nested doc - Document( - chunks=[ - Document(), - Document(chunks=[Document()]), - Document(), - ], - matches=[Document(), Document()], - ).summary() - - -def test_plot_image(): - d = Document(uri=os.path.join(cur_dir, 'toydata/test.png')) - d.plot() - - d.load_uri_to_image_blob() - d.uri = None - - d.plot() diff --git a/tests/unit/document/toydata/docs.jsonlines b/tests/unit/document/toydata/docs.jsonlines deleted file mode 100644 index 58b3fbfe534..00000000000 --- a/tests/unit/document/toydata/docs.jsonlines +++ /dev/null @@ -1,2 +0,0 @@ -{"text": "a"} -{"text": "b"} diff --git a/tests/unit/document/toydata/mov_bbb.mp4 b/tests/unit/document/toydata/mov_bbb.mp4 deleted file mode 100644 index 0a4dd5b4017..00000000000 Binary files a/tests/unit/document/toydata/mov_bbb.mp4 and /dev/null differ diff --git a/tests/unit/document/toydata/olleh.wav b/tests/unit/document/toydata/olleh.wav deleted file mode 100644 index 28523a7750f..00000000000 Binary files a/tests/unit/document/toydata/olleh.wav and /dev/null differ diff --git a/tests/unit/test_placeholder.py b/tests/unit/test_placeholder.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/units/__init__.py b/tests/units/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data 
foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/array/__init__.py b/tests/units/array/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/array/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/units/array/stack/__init__.py b/tests/units/array/stack/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/array/stack/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/array/stack/storage/__init__.py b/tests/units/array/stack/storage/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/array/stack/storage/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/array/stack/storage/test_array_stack_with_optional.py b/tests/units/array/stack/storage/test_array_stack_with_optional.py new file mode 100644 index 00000000000..182b0178593 --- /dev/null +++ b/tests/units/array/stack/storage/test_array_stack_with_optional.py @@ -0,0 +1,88 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional + +import numpy as np +import pytest + +from docarray import BaseDoc, DocList, DocVec +from docarray.typing import NdArray + + +class Nested(BaseDoc): + tensor: NdArray + + +class Image(BaseDoc): + features: Optional[Nested] = None + + +def test_optional_field(): + docs = DocVec[Image]([Image() for _ in range(10)]) + + assert docs.features is None + + docs.features = DocList[Nested]([Nested(tensor=np.zeros(10)) for _ in range(10)]) + + assert docs.features.tensor.shape == (10, 10) + + for doc in docs: + assert doc.features.tensor.shape == (10,) + + +def test_set_none(): + docs = DocVec[Image]( + [Image(features=Nested(tensor=np.zeros(10))) for _ in range(10)] + ) + assert docs.features.tensor.shape == (10, 10) + + docs.features = None + + assert docs.features is None + + for doc in docs: + assert doc.features is None + + +def test_set_doc(): + docs = DocVec[Image]( + [Image(features=Nested(tensor=np.zeros(10))) for _ in range(10)] + ) + assert docs.features.tensor.shape == (10, 10) + + for doc in docs: + doc.features = Nested(tensor=np.ones(10)) + + with pytest.raises(ValueError): + doc.features = None + + +def test_set_doc_none(): + docs = DocVec[Image]([Image() for _ in range(10)]) + + assert docs.features is None + + for doc in docs: + with pytest.raises(ValueError): + doc.features = Nested(tensor=np.ones(10)) + + +def test_no_uniform_none(): + with pytest.raises(ValueError): + DocVec[Image]([Image(), Image(features=Nested(tensor=np.zeros(10)))]) + + with pytest.raises(ValueError): + DocVec[Image]([Image(features=Nested(tensor=np.zeros(10))), Image()]) diff --git a/tests/units/array/stack/storage/test_storage.py b/tests/units/array/stack/storage/test_storage.py new file mode 100644 index 00000000000..b91585d3737 --- /dev/null +++ b/tests/units/array/stack/storage/test_storage.py @@ -0,0 +1,104 @@ +import numpy as np + +from docarray import BaseDoc +from docarray.array import DocVec +from docarray.array.doc_vec.column_storage import 
ColumnStorageView +from docarray.typing import AnyTensor + + +def test_column_storage_init(): + class InnerDoc(BaseDoc): + price: int + + class MyDoc(BaseDoc): + tensor: AnyTensor + name: str + doc: InnerDoc + + docs = [ + MyDoc(tensor=np.zeros(10), name='hello', doc=InnerDoc(price=i)) + for i in range(4) + ] + + storage = DocVec[MyDoc](docs)._storage + + assert (storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() + for name in storage.any_columns['name']: + assert name == 'hello' + inner_docs = storage.doc_columns['doc'] + assert isinstance(inner_docs, DocVec) + for i, doc in enumerate(inner_docs): + assert isinstance(doc, InnerDoc) + assert doc.price == i + + +def test_column_storage_view(): + class MyDoc(BaseDoc): + tensor: AnyTensor + name: str + + docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=str(i)) for i in range(4)] + + storage = DocVec[MyDoc](docs)._storage + + view = ColumnStorageView(0, storage) + + assert view['id'] == '0' + assert (view['tensor'] == np.zeros(10)).all() + assert view['name'] == 'hello' + + view['id'] = '1' + view['tensor'] = np.ones(10) + view['name'] = 'byebye' + + assert storage.any_columns['id'][0] == '1' + assert (storage.tensor_columns['tensor'][0] == np.ones(10)).all() + assert storage.any_columns['name'][0] == 'byebye' + + +def test_column_storage_to_dict(): + class MyDoc(BaseDoc): + tensor: AnyTensor + name: str + + docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=str(i)) for i in range(4)] + + storage = DocVec[MyDoc](docs)._storage + + view = ColumnStorageView(0, storage) + + dict_view = view.to_dict() + + assert dict_view['id'] == '0' + assert (dict_view['tensor'] == np.zeros(10)).all() + assert np.may_share_memory(dict_view['tensor'], view['tensor']) + assert dict_view['name'] == 'hello' + + +def test_storage_view_dict_like(): + class MyDoc(BaseDoc): + tensor: AnyTensor + name: str + + docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=str(i)) for i in range(4)] + + storage = 
DocVec[MyDoc](docs)._storage + + view = ColumnStorageView(0, storage) + + assert list(view.keys()) == ['id', 'name', 'tensor'] + + # since boolean value of np array is ambiguous, we iterate manually + for val_view, val_reference in zip(view.values(), ['0', 'hello', np.zeros(10)]): + if isinstance(val_view, np.ndarray): + assert (val_view == val_reference).all() + else: + assert val_view == val_reference + for item_view, item_reference in zip( + view.items(), [('id', '0'), ('name', 'hello'), ('tensor', np.zeros(10))] + ): + if isinstance(item_view[1], np.ndarray): + assert item_view[0] == item_reference[0] + assert (item_view[1] == item_reference[1]).all() + else: + assert item_view == item_reference diff --git a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py new file mode 100644 index 00000000000..b1b385840dd --- /dev/null +++ b/tests/units/array/stack/test_array_stacked.py @@ -0,0 +1,680 @@ +from typing import Dict, Optional, Union + +import numpy as np +import pytest +import torch +from pydantic import parse_obj_as + +from docarray import BaseDoc, DocList +from docarray.array import DocVec +from docarray.documents import ImageDoc +from docarray.exceptions.exceptions import UnusableObjectError +from docarray.typing import AnyEmbedding, AnyTensor, NdArray, TorchTensor + + +@pytest.fixture() +def batch(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + batch = DocVec[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] + ) + + return batch + + +@pytest.fixture() +def nested_batch(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + class MMdoc(BaseDoc): + img: DocList[ImageDoc] + + batch = DocList[MMdoc]( + [ + MMdoc( + img=DocList[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] + ) + ) + for _ in range(10) + ] + ) + + return batch.to_doc_vec() + + +def test_create_from_list_docs(): + list_ = [ImageDoc(tensor=torch.zeros(3, 224, 224)) 
for _ in range(10)] + da_stacked = DocVec[ImageDoc](docs=list_, tensor_type=TorchTensor) + assert len(da_stacked) == 10 + assert da_stacked.tensor.shape == tuple([10, 3, 224, 224]) + + +def test_len(batch): + assert len(batch) == 10 + + +def test_create_from_None(): + with pytest.raises(ValueError): + DocVec[ImageDoc]([]) + + +def test_getitem(batch): + for i in range(len(batch)): + assert (batch[i].tensor == torch.zeros(3, 224, 224)).all() + + +def test_iterator(batch): + for doc in batch: + assert (doc.tensor == torch.zeros(3, 224, 224)).all() + + +def test_stack_setter(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + batch = DocList[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] + ) + + batch = batch.to_doc_vec() + batch.tensor = torch.ones(10, 3, 224, 224) + + assert (batch.tensor == torch.ones(10, 3, 224, 224)).all() + + for i, doc in enumerate(batch): + assert (doc.tensor == batch.tensor[i]).all() + + +def test_stack_setter_np(): + class ImageDoc(BaseDoc): + tensor: NdArray[3, 224, 224] + + batch = DocList[ImageDoc]( + [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] + ) + + batch = batch.to_doc_vec() + batch.tensor = np.ones((10, 3, 224, 224)) + + assert (batch.tensor == np.ones((10, 3, 224, 224))).all() + + for i, doc in enumerate(batch): + assert (doc.tensor == batch.tensor[i]).all() + + +def test_stack_optional(batch): + assert ( + batch._storage.tensor_columns['tensor'] == torch.zeros(10, 3, 224, 224) + ).all() + assert (batch.tensor == torch.zeros(10, 3, 224, 224)).all() + + +def test_stack_numpy(): + class ImageDoc(BaseDoc): + tensor: NdArray[3, 224, 224] + + batch = DocList[ImageDoc]( + [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] + ) + + batch = batch.to_doc_vec() + + assert ( + batch._storage.tensor_columns['tensor'] == np.zeros((10, 3, 224, 224)) + ).all() + assert (batch.tensor == np.zeros((10, 3, 224, 224))).all() + assert ( + batch.tensor.ctypes.data == 
batch._storage.tensor_columns['tensor'].ctypes.data + ) + + +def test_stack(batch): + assert ( + batch._storage.tensor_columns['tensor'] == torch.zeros(10, 3, 224, 224) + ).all() + assert (batch.tensor == torch.zeros(10, 3, 224, 224)).all() + assert batch._storage.tensor_columns['tensor'].data_ptr() == batch.tensor.data_ptr() + + for doc, tensor in zip(batch, batch.tensor): + assert doc.tensor.data_ptr() == tensor.data_ptr() + + for i in range(len(batch)): + assert batch[i].tensor.data_ptr() == batch.tensor[i].data_ptr() + + +def test_stack_mod_nested_document(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + class MMdoc(BaseDoc): + img: ImageDoc + + batch = DocList[MMdoc]( + [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] + ) + + batch = batch.to_doc_vec() + + assert ( + batch._storage.doc_columns['img']._storage.tensor_columns['tensor'] + == torch.zeros(10, 3, 224, 224) + ).all() + + assert (batch.img.tensor == torch.zeros(10, 3, 224, 224)).all() + + assert ( + batch._storage.doc_columns['img']._storage.tensor_columns['tensor'].data_ptr() + == batch.img.tensor.data_ptr() + ) + + +def test_stack_nested_DocArray(nested_batch): + for i in range(len(nested_batch)): + assert ( + nested_batch[i].img._storage.tensor_columns['tensor'] + == torch.zeros(10, 3, 224, 224) + ).all() + assert (nested_batch[i].img.tensor == torch.zeros(10, 3, 224, 224)).all() + assert ( + nested_batch[i].img._storage.tensor_columns['tensor'].data_ptr() + == nested_batch[i].img.tensor.data_ptr() + ) + + +def test_convert_to_da(batch): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + batch = DocList[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] + ) + + batch = batch.to_doc_vec() + da = batch.to_doc_list() + + for doc in da: + assert (doc.tensor == torch.zeros(3, 224, 224)).all() + + +def test_unstack_nested_document(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + class MMdoc(BaseDoc): 
+ img: ImageDoc + + batch = DocList[MMdoc]( + [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] + ) + + batch = batch.to_doc_vec() + + da = batch.to_doc_list() + + for doc in da: + assert (doc.img.tensor == torch.zeros(3, 224, 224)).all() + + +def test_unstack_nested_DocArray(nested_batch): + batch = nested_batch.to_doc_list() + for i in range(len(batch)): + assert isinstance(batch[i].img, DocList) + for doc in batch[i].img: + assert (doc.tensor == torch.zeros(3, 224, 224)).all() + + +def test_stack_call(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + da = DocList[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] + ) + + da = da.to_doc_vec() + + assert len(da) == 10 + + assert da.tensor.shape == (10, 3, 224, 224) + + +def test_stack_union(): + class ImageDoc(BaseDoc): + tensor: Union[NdArray[3, 224, 224], TorchTensor[3, 224, 224]] + + batch = DocList[ImageDoc]( + [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] + ) + batch[3].tensor = np.zeros((3, 224, 224)) + + # union fields aren't actually doc_vec + # just checking that there is no error + batch.to_doc_vec() + + +@pytest.mark.parametrize( + 'tensor_type,tensor', + [(TorchTensor, torch.zeros(3, 224, 224)), (NdArray, np.zeros((3, 224, 224)))], +) +def test_any_tensor_with_torch(tensor_type, tensor): + class ImageDoc(BaseDoc): + tensor: AnyTensor + + da = DocVec[ImageDoc]( + [ImageDoc(tensor=tensor) for _ in range(10)], + tensor_type=tensor_type, + ) + + for i in range(len(da)): + assert (da[i].tensor == tensor).all() + + assert 'tensor' in da._storage.tensor_columns.keys() + assert isinstance(da._storage.tensor_columns['tensor'], tensor_type) + + +def test_any_tensor_with_optional(): + tensor = torch.zeros(3, 224, 224) + + class ImageDoc(BaseDoc): + tensor: Optional[AnyTensor] = None + + class TopDoc(BaseDoc): + img: ImageDoc + + da = DocVec[TopDoc]( + [TopDoc(img=ImageDoc(tensor=tensor)) for _ in range(10)], + 
tensor_type=TorchTensor, + ) + + for i in range(len(da)): + assert (da.img[i].tensor == tensor).all() + + assert 'tensor' in da.img._storage.tensor_columns.keys() + assert isinstance(da.img._storage.tensor_columns['tensor'], TorchTensor) + + +def test_dict_stack(): + class MyDoc(BaseDoc): + my_dict: Dict[str, int] + + da = DocVec[MyDoc]([MyDoc(my_dict={'a': 1, 'b': 2}) for _ in range(10)]) + + da.my_dict + + +def test_get_from_slice_stacked(): + class Doc(BaseDoc): + text: str + tensor: NdArray + + N = 10 + + da = DocVec[Doc]( + [Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)] + ) + + da_sliced = da[0:10:2] + assert isinstance(da_sliced, DocVec) + + tensors = da_sliced.tensor + assert tensors.shape == (5, 3, 224, 224) + + texts = da_sliced.text + assert len(texts) == 5 + for i, text in enumerate(texts): + assert text == f'hello{i * 2}' + + +def test_stack_embedding(): + class MyDoc(BaseDoc): + embedding: AnyEmbedding + + da = DocVec[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) + + assert 'embedding' in da._storage.tensor_columns.keys() + assert (da.embedding == np.zeros((10, 10))).all() + + +@pytest.mark.parametrize('tensor_backend', [TorchTensor, NdArray]) +def test_stack_none(tensor_backend): + class MyDoc(BaseDoc): + tensor: Optional[AnyTensor] = None + + da = DocVec[MyDoc]( + [MyDoc(tensor=None) for _ in range(10)], tensor_type=tensor_backend + ) + + assert 'tensor' in da._storage.tensor_columns.keys() + + +def test_to_device(): + da = DocVec[ImageDoc]([ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor) + assert da.tensor.device == torch.device('cpu') + da.to('meta') + assert da.tensor.device == torch.device('meta') + + +def test_to_device_with_nested_da(): + class Video(BaseDoc): + images: DocVec[ImageDoc] + + da_image = DocVec[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor + ) + + da = DocVec[Video]([Video(images=da_image)]) + assert da.images[0].tensor.device == torch.device('cpu') 
+ da.to('meta') + assert da.images[0].tensor.device == torch.device('meta') + + +def test_to_device_nested(): + class MyDoc(BaseDoc): + tensor: TorchTensor + docs: ImageDoc + + da = DocVec[MyDoc]( + [MyDoc(tensor=torch.zeros(3, 5), docs=ImageDoc(tensor=torch.zeros(3, 5)))], + tensor_type=TorchTensor, + ) + assert da.tensor.device == torch.device('cpu') + assert da.docs.tensor.device == torch.device('cpu') + da.to('meta') + assert da.tensor.device == torch.device('meta') + assert da.docs.tensor.device == torch.device('meta') + + +def test_to_device_numpy(): + da = DocVec[ImageDoc]([ImageDoc(tensor=np.zeros((3, 5)))], tensor_type=NdArray) + with pytest.raises(NotImplementedError): + da.to('meta') + + +def test_keep_dtype_torch(): + class MyDoc(BaseDoc): + tensor: TorchTensor + + da = DocList[MyDoc]( + [MyDoc(tensor=torch.zeros([2, 4], dtype=torch.int32)) for _ in range(3)] + ) + assert da[0].tensor.dtype == torch.int32 + + da = da.to_doc_vec() + assert da[0].tensor.dtype == torch.int32 + assert da.tensor.dtype == torch.int32 + + +def test_keep_dtype_np(): + class MyDoc(BaseDoc): + tensor: NdArray + + da = DocList[MyDoc]( + [MyDoc(tensor=np.zeros([2, 4], dtype=np.int32)) for _ in range(3)] + ) + assert da[0].tensor.dtype == np.int32 + + da = da.to_doc_vec() + assert da[0].tensor.dtype == np.int32 + assert da.tensor.dtype == np.int32 + + +def test_del_item(batch): + assert len(batch) == 10 + assert batch.tensor.shape[0] == 10 + with pytest.raises(NotImplementedError): + del batch[2] + + +def test_np_scalar(): + class MyDoc(BaseDoc): + scalar: NdArray + + da = DocList[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) + assert all(doc.scalar.ndim == 0 for doc in da) + assert all(doc.scalar == 2.0 for doc in da) + + stacked_da = da.to_doc_vec() + assert type(stacked_da.scalar) == NdArray + + assert all(type(doc.scalar) == NdArray for doc in stacked_da) + assert all(doc.scalar.ndim == 1 for doc in stacked_da) + assert all(doc.scalar == 2.0 for doc in stacked_da) + + 
# Make sure they share memory + stacked_da.scalar[0] = 3.0 + assert stacked_da[0].scalar == 3.0 + + +def test_torch_scalar(): + class MyDoc(BaseDoc): + scalar: TorchTensor + + da = DocList[MyDoc]( + [MyDoc(scalar=torch.tensor(2.0)) for _ in range(3)], + ) + assert all(doc.scalar.ndim == 0 for doc in da) + assert all(doc.scalar == 2.0 for doc in da) + stacked_da = da.to_doc_vec(tensor_type=TorchTensor) + assert type(stacked_da.scalar) == TorchTensor + + assert all(type(doc.scalar) == TorchTensor for doc in stacked_da) + assert all(doc.scalar.ndim == 1 for doc in stacked_da) # TODO failing here + assert all(doc.scalar == 2.0 for doc in stacked_da) + + stacked_da.scalar[0] = 3.0 + assert stacked_da[0].scalar == 3.0 + + +def test_np_nan(): + class MyDoc(BaseDoc): + scalar: Optional[NdArray] = None + + da = DocList[MyDoc]([MyDoc() for _ in range(3)]) + assert all(doc.scalar is None for doc in da) + assert all(doc.scalar == doc.scalar for doc in da) + stacked_da = da.to_doc_vec() + assert stacked_da.scalar is None + + assert all(doc.scalar is None for doc in stacked_da) + # Stacking them turns them into np.nan + + +def test_from_storage(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + batch = DocVec[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] + ) + + DocVec[ImageDoc].from_columns_storage(batch._storage) + + +def test_validate_from_da(): + class ImageDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + batch = DocList[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] + ) + + da = parse_obj_as(DocVec[ImageDoc], batch) + + assert isinstance(da, DocVec) + for d in da: + assert isinstance(d, ImageDoc) + + +def test_validation_column_tensor(batch): + batch.tensor = torch.zeros(10, 3, 224, 244) + assert isinstance(batch.tensor, TorchTensor) + + +def test_validation_column_tensor_fail(batch): + with pytest.raises(ValueError): + batch.tensor = ['hello'] * 10 + + with pytest.raises(ValueError): + 
batch.tensor = torch.zeros(11, 3, 224, 244) + + +@pytest.fixture() +def batch_nested_doc(): + class Inner(BaseDoc): + hello: str + + class Doc(BaseDoc): + inner: Inner + + batch = DocVec[Doc]([Doc(inner=Inner(hello='hello')) for _ in range(10)]) + return batch, Doc, Inner + + +def test_validation_column_doc(batch_nested_doc): + batch, Doc, Inner = batch_nested_doc + + batch.inner = DocList[Inner]([Inner(hello='hello') for _ in range(10)]) + assert isinstance(batch.inner, DocVec) + for d in batch.inner: + assert isinstance(d, Inner) + + +def test_validation_list_doc(batch_nested_doc): + batch, Doc, Inner = batch_nested_doc + + batch.inner = [Inner(hello='hello') for _ in range(10)] + assert isinstance(batch.inner, DocVec) + for d in batch.inner: + assert isinstance(d, Inner) + + +def test_validation_col_doc_fail(batch_nested_doc): + batch, Doc, Inner = batch_nested_doc + + with pytest.raises(ValueError): + batch.inner = ['hello'] * 10 + + with pytest.raises(ValueError): + batch.inner = DocList[Inner]([Inner(hello='hello') for _ in range(11)]) + + +def test_doc_view_update(batch): + batch[0].tensor = 12 * torch.ones(3, 224, 224) + assert (batch.tensor[0] == 12 * torch.ones(3, 224, 224)).all() + + +def test_doc_view_nested(batch_nested_doc): + batch, Doc, Inner = batch_nested_doc + batch[0].inner = Inner(hello='world') + assert batch.inner[0].hello == 'world' + + +def test_type_error_no_doc_type(): + + with pytest.raises(TypeError): + DocVec([BaseDoc() for _ in range(10)]) + + +def test_doc_view_dict(batch: DocVec[ImageDoc]): + doc_view = batch[0] + assert doc_view.is_view() + d = doc_view.dict() + assert d['tensor'].shape == (3, 224, 224) + assert d['id'] == doc_view.id + + doc_view_two = batch[1] + assert doc_view_two.is_view() + d = doc_view_two.dict() + assert d['tensor'].shape == (3, 224, 224) + assert d['id'] == doc_view_two.id + + +def test_doc_vec_equality(): + class Text(BaseDoc): + text: str + + da = DocVec[Text]([Text(text='hello') for _ in range(10)]) + 
da2 = DocList[Text]([Text(text='hello') for _ in range(10)]) + + assert da != da2 + assert da == da2.to_doc_vec() + + +@pytest.mark.parametrize('tensor_type', [TorchTensor, NdArray]) +def test_doc_vec_equality_tensor(tensor_type): + class Text(BaseDoc): + tens: tensor_type + + da = DocVec[Text]( + [Text(tens=[1, 2, 3, 4]) for _ in range(10)], tensor_type=tensor_type + ) + da2 = DocVec[Text]( + [Text(tens=[1, 2, 3, 4]) for _ in range(10)], tensor_type=tensor_type + ) + assert da == da2 + + da2 = DocVec[Text]( + [Text(tens=[1, 2, 3, 4, 5]) for _ in range(10)], tensor_type=tensor_type + ) + assert da != da2 + + +@pytest.mark.tensorflow +def test_doc_vec_equality_tf(): + from docarray.typing import TensorFlowTensor + + class Text(BaseDoc): + tens: TensorFlowTensor + + da = DocVec[Text]( + [Text(tens=[1, 2, 3, 4]) for _ in range(10)], tensor_type=TensorFlowTensor + ) + da2 = DocVec[Text]( + [Text(tens=[1, 2, 3, 4]) for _ in range(10)], tensor_type=TensorFlowTensor + ) + assert da == da2 + + da2 = DocVec[Text]( + [Text(tens=[1, 2, 3, 4, 5]) for _ in range(10)], tensor_type=TensorFlowTensor + ) + assert da != da2 + + +def test_doc_vec_nested(batch_nested_doc): + batch, Doc, Inner = batch_nested_doc + batch2 = DocVec[Doc]([Doc(inner=Inner(hello='hello')) for _ in range(10)]) + + assert batch == batch2 + + +def test_doc_vec_tensor_type(): + class ImageDoc(BaseDoc): + tensor: AnyTensor + + da = DocVec[ImageDoc]([ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) + + da2 = DocVec[ImageDoc]( + [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)], + tensor_type=TorchTensor, + ) + + assert da != da2 + + +def teste_unusable_state_raises_exception(): + from docarray import DocVec + from docarray.documents import ImageDoc + + docs = DocVec[ImageDoc]([ImageDoc(url='http://url.com/foo.png') for _ in range(10)]) + + docs.to_doc_list() + + with pytest.raises(UnusableObjectError): + docs.url + + with pytest.raises(UnusableObjectError): + docs.url = 'hi' diff --git 
a/tests/units/array/stack/test_array_stacked_jax.py b/tests/units/array/stack/test_array_stacked_jax.py new file mode 100644 index 00000000000..86f1399a40d --- /dev/null +++ b/tests/units/array/stack/test_array_stacked_jax.py @@ -0,0 +1,301 @@ +from typing import Optional, Union + +import pytest + +from docarray import BaseDoc, DocList +from docarray.array import DocVec +from docarray.typing import ( + AnyEmbedding, + AnyTensor, + AudioTensor, + ImageTensor, + NdArray, + VideoTensor, +) +from docarray.utils._internal.misc import is_jax_available + +jax_available = is_jax_available() +if jax_available: + import jax.numpy as jnp + + from docarray.typing import JaxArray + + +@pytest.fixture() +@pytest.mark.jax +def batch(): + + import jax.numpy as jnp + + class Image(BaseDoc): + tensor: JaxArray[3, 224, 224] + + batch = DocList[Image]([Image(tensor=jnp.zeros((3, 224, 224))) for _ in range(10)]) + + return batch.to_doc_vec() + + +@pytest.fixture() +@pytest.mark.jax +def nested_batch(): + class Image(BaseDoc): + tensor: JaxArray[3, 224, 224] + + class MMdoc(BaseDoc): + img: DocList[Image] + + batch = DocVec[MMdoc]( + [ + MMdoc( + img=DocList[Image]( + [Image(tensor=jnp.zeros((3, 224, 224))) for _ in range(10)] + ) + ) + for _ in range(10) + ] + ) + + return batch + + +@pytest.mark.jax +def test_len(batch): + assert len(batch) == 10 + + +@pytest.mark.jax +def test_getitem(batch): + for i in range(len(batch)): + item = batch[i] + assert isinstance(item.tensor, JaxArray) + assert jnp.allclose(item.tensor.tensor, jnp.zeros((3, 224, 224))) + + +@pytest.mark.jax +def test_get_slice(batch): + sliced = batch[0:2] + assert isinstance(sliced, DocVec) + assert len(sliced) == 2 + + +@pytest.mark.jax +def test_iterator(batch): + for doc in batch: + assert jnp.allclose(doc.tensor.tensor, jnp.zeros((3, 224, 224))) + + +@pytest.mark.jax +def test_set_after_stacking(): + class Image(BaseDoc): + tensor: JaxArray[3, 224, 224] + + batch = DocVec[Image]([Image(tensor=jnp.zeros((3, 224, 
224))) for _ in range(10)]) + + batch.tensor = jnp.ones((10, 3, 224, 224)) + assert jnp.allclose(batch.tensor.tensor, jnp.ones((10, 3, 224, 224))) + for i, doc in enumerate(batch): + assert jnp.allclose(doc.tensor.tensor, batch.tensor.tensor[i]) + + +@pytest.mark.jax +def test_stack_optional(batch): + assert jnp.allclose( + batch._storage.tensor_columns['tensor'].tensor, jnp.zeros((10, 3, 224, 224)) + ) + assert jnp.allclose(batch.tensor.tensor, jnp.zeros((10, 3, 224, 224))) + + +@pytest.mark.jax +def test_stack_mod_nested_document(): + class Image(BaseDoc): + tensor: JaxArray[3, 224, 224] + + class MMdoc(BaseDoc): + img: Image + + batch = DocList[MMdoc]( + [MMdoc(img=Image(tensor=jnp.zeros((3, 224, 224)))) for _ in range(10)] + ).to_doc_vec() + + assert jnp.allclose( + batch._storage.doc_columns['img']._storage.tensor_columns['tensor'].tensor, + jnp.zeros((10, 3, 224, 224)), + ) + + assert jnp.allclose(batch.img.tensor.tensor, jnp.zeros((10, 3, 224, 224))) + + +@pytest.mark.jax +def test_stack_nested_DocArray(nested_batch): + for i in range(len(nested_batch)): + assert jnp.allclose( + nested_batch[i].img._storage.tensor_columns['tensor'].tensor, + jnp.zeros((10, 3, 224, 224)), + ) + + assert jnp.allclose( + nested_batch[i].img.tensor.tensor, jnp.zeros((10, 3, 224, 224)) + ) + + +@pytest.mark.jax +def test_convert_to_da(batch): + da = batch.to_doc_list() + + for doc in da: + assert jnp.allclose(doc.tensor.tensor, jnp.zeros((3, 224, 224))) + + +@pytest.mark.jax +def test_unstack_nested_document(): + class Image(BaseDoc): + tensor: JaxArray[3, 224, 224] + + class MMdoc(BaseDoc): + img: Image + + batch = DocVec[MMdoc]( + [MMdoc(img=Image(tensor=jnp.zeros((3, 224, 224)))) for _ in range(10)] + ) + assert isinstance(batch.img._storage.tensor_columns['tensor'], JaxArray) + da = batch.to_doc_list() + + for doc in da: + assert jnp.allclose(doc.img.tensor.tensor, jnp.zeros((3, 224, 224))) + + +@pytest.mark.jax +def test_unstack_nested_DocArray(nested_batch): + batch = 
nested_batch.to_doc_list() + for i in range(len(batch)): + assert isinstance(batch[i].img, DocList) + for doc in batch[i].img: + assert jnp.allclose(doc.tensor.tensor, jnp.zeros((3, 224, 224))) + + +@pytest.mark.jax +def test_stack_call(): + class Image(BaseDoc): + tensor: JaxArray[3, 224, 224] + + da = DocList[Image]([Image(tensor=jnp.zeros((3, 224, 224))) for _ in range(10)]) + + da = da.to_doc_vec() + + assert len(da) == 10 + + assert da.tensor.tensor.shape == (10, 3, 224, 224) + + +@pytest.mark.jax +def test_stack_union(): + class Image(BaseDoc): + tensor: Union[JaxArray[3, 224, 224], NdArray[3, 224, 224]] + + DocVec[Image]( + [Image(tensor=jnp.zeros((3, 224, 224))) for _ in range(10)], + tensor_type=JaxArray, + ) + + # union fields aren't actually doc_vec + # just checking that there is no error + + +@pytest.mark.jax +def test_setitem_tensor(batch): + batch[3].tensor.tensor = jnp.zeros((3, 224, 224)) + + +@pytest.mark.jax +@pytest.mark.skip('not working yet') +def test_setitem_tensor_direct(batch): + batch[3].tensor = jnp.zeros((3, 224, 224)) + + +@pytest.mark.jax +@pytest.mark.parametrize( + 'cls_tensor', [ImageTensor, AudioTensor, VideoTensor, AnyEmbedding, AnyTensor] +) +def test_generic_tensors_with_jnp(cls_tensor): + tensor = jnp.zeros((3, 224, 224)) + + class Image(BaseDoc): + tensor: cls_tensor + + da = DocVec[Image]( + [Image(tensor=tensor) for _ in range(10)], + tensor_type=JaxArray, + ) + + for i in range(len(da)): + assert jnp.allclose(da[i].tensor.tensor, tensor) + + assert 'tensor' in da._storage.tensor_columns.keys() + assert isinstance(da._storage.tensor_columns['tensor'], JaxArray) + + +@pytest.mark.jax +@pytest.mark.parametrize( + 'cls_tensor', [ImageTensor, AudioTensor, VideoTensor, AnyEmbedding, AnyTensor] +) +def test_generic_tensors_with_optional(cls_tensor): + tensor = jnp.zeros((3, 224, 224)) + + class Image(BaseDoc): + tensor: Optional[cls_tensor] = None + + class TopDoc(BaseDoc): + img: Image + + da = DocVec[TopDoc]( + 
[TopDoc(img=Image(tensor=tensor)) for _ in range(10)], + tensor_type=JaxArray, + ) + + for i in range(len(da)): + assert jnp.allclose(da.img[i].tensor.tensor, tensor) + + assert 'tensor' in da.img._storage.tensor_columns.keys() + assert isinstance(da.img._storage.tensor_columns['tensor'], JaxArray) + assert isinstance(da.img._storage.tensor_columns['tensor'].tensor, jnp.ndarray) + + +@pytest.mark.jax +def test_get_from_slice_stacked(): + class Doc(BaseDoc): + text: str + tensor: JaxArray + + da = DocVec[Doc]( + [Doc(text=f'hello{i}', tensor=jnp.zeros((3, 224, 224))) for i in range(10)] + ) + + da_sliced = da[0:10:2] + assert isinstance(da_sliced, DocVec) + + tensors = da_sliced.tensor.tensor + assert tensors.shape == (5, 3, 224, 224) + + +@pytest.mark.jax +def test_stack_none(): + class MyDoc(BaseDoc): + tensor: Optional[AnyTensor] = None + + da = DocVec[MyDoc]([MyDoc(tensor=None) for _ in range(10)], tensor_type=JaxArray) + assert 'tensor' in da._storage.tensor_columns.keys() + + +@pytest.mark.jax +def test_keep_dtype_jnp(): + class MyDoc(BaseDoc): + tensor: JaxArray + + da = DocList[MyDoc]( + [MyDoc(tensor=jnp.zeros([2, 4], dtype=jnp.int32)) for _ in range(3)] + ) + assert da[0].tensor.tensor.dtype == jnp.int32 + + da = da.to_doc_vec() + assert da[0].tensor.tensor.dtype == jnp.int32 + assert da.tensor.tensor.dtype == jnp.int32 diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py new file mode 100644 index 00000000000..da055fcd8ee --- /dev/null +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -0,0 +1,303 @@ +from typing import Optional, Union + +import pytest + +from docarray import BaseDoc, DocList +from docarray.array import DocVec +from docarray.typing import ( + AnyEmbedding, + AnyTensor, + AudioTensor, + ImageTensor, + NdArray, + VideoTensor, +) +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + import 
tensorflow._api.v2.experimental.numpy as tnp + + from docarray.typing import TensorFlowTensor + + +@pytest.fixture() +def batch(): + class Image(BaseDoc): + tensor: TensorFlowTensor[3, 224, 224] + + import tensorflow as tf + + batch = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) + + return batch.to_doc_vec() + + +@pytest.fixture() +def nested_batch(): + class Image(BaseDoc): + tensor: TensorFlowTensor[3, 224, 224] + + class MMdoc(BaseDoc): + img: DocList[Image] + + import tensorflow as tf + + batch = DocVec[MMdoc]( + [ + MMdoc( + img=DocList[Image]( + [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] + ) + ) + for _ in range(10) + ] + ) + + return batch + + +@pytest.mark.tensorflow +def test_len(batch): + assert len(batch) == 10 + + +@pytest.mark.tensorflow +def test_getitem(batch): + for i in range(len(batch)): + item = batch[i] + assert isinstance(item.tensor, TensorFlowTensor) + assert tnp.allclose(item.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_get_slice(batch): + sliced = batch[0:2] + assert isinstance(sliced, DocVec) + assert len(sliced) == 2 + + +@pytest.mark.tensorflow +def test_iterator(batch): + for doc in batch: + assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_set_after_stacking(): + class Image(BaseDoc): + tensor: TensorFlowTensor[3, 224, 224] + + batch = DocVec[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) + + batch.tensor = tf.ones((10, 3, 224, 224)) + assert tnp.allclose(batch.tensor.tensor, tf.ones((10, 3, 224, 224))) + for i, doc in enumerate(batch): + assert tnp.allclose(doc.tensor.tensor, batch.tensor.tensor[i]) + + +@pytest.mark.tensorflow +def test_stack_optional(batch): + assert tnp.allclose( + batch._storage.tensor_columns['tensor'].tensor, tf.zeros((10, 3, 224, 224)) + ) + assert tnp.allclose(batch.tensor.tensor, tf.zeros((10, 3, 224, 224))) + + +@pytest.mark.tensorflow +def 
test_stack_mod_nested_document(): + class Image(BaseDoc): + tensor: TensorFlowTensor[3, 224, 224] + + class MMdoc(BaseDoc): + img: Image + + batch = DocList[MMdoc]( + [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] + ).to_doc_vec() + + assert tnp.allclose( + batch._storage.doc_columns['img']._storage.tensor_columns['tensor'].tensor, + tf.zeros((10, 3, 224, 224)), + ) + + assert tnp.allclose(batch.img.tensor.tensor, tf.zeros((10, 3, 224, 224))) + + +@pytest.mark.tensorflow +def test_stack_nested_DocArray(nested_batch): + for i in range(len(nested_batch)): + assert tnp.allclose( + nested_batch[i].img._storage.tensor_columns['tensor'].tensor, + tf.zeros((10, 3, 224, 224)), + ) + + assert tnp.allclose( + nested_batch[i].img.tensor.tensor, tf.zeros((10, 3, 224, 224)) + ) + + +@pytest.mark.tensorflow +def test_convert_to_da(batch): + da = batch.to_doc_list() + + for doc in da: + assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_unstack_nested_document(): + class Image(BaseDoc): + tensor: TensorFlowTensor[3, 224, 224] + + class MMdoc(BaseDoc): + img: Image + + batch = DocVec[MMdoc]( + [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] + ) + assert isinstance(batch.img._storage.tensor_columns['tensor'], TensorFlowTensor) + da = batch.to_doc_list() + + for doc in da: + assert tnp.allclose(doc.img.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_unstack_nested_DocArray(nested_batch): + batch = nested_batch.to_doc_list() + for i in range(len(batch)): + assert isinstance(batch[i].img, DocList) + for doc in batch[i].img: + assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_stack_call(): + class Image(BaseDoc): + tensor: TensorFlowTensor[3, 224, 224] + + da = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) + + da = da.to_doc_vec() + + assert len(da) == 10 + + assert 
da.tensor.tensor.shape == (10, 3, 224, 224) + + +@pytest.mark.tensorflow +def test_stack_union(): + class Image(BaseDoc): + tensor: Union[TensorFlowTensor[3, 224, 224], NdArray[3, 224, 224]] + + DocVec[Image]( + [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)], + tensor_type=TensorFlowTensor, + ) + + # union fields aren't actually doc_vec + # just checking that there is no error + + +@pytest.mark.tensorflow +def test_setitem_tensor(batch): + batch[3].tensor.tensor = tf.zeros((3, 224, 224)) + + +@pytest.mark.skip('not working yet') +@pytest.mark.tensorflow +def test_setitem_tensor_direct(batch): + batch[3].tensor = tf.zeros((3, 224, 224)) + + +@pytest.mark.parametrize( + 'cls_tensor', [ImageTensor, AudioTensor, VideoTensor, AnyEmbedding, AnyTensor] +) +@pytest.mark.tensorflow +def test_generic_tensors_with_tf(cls_tensor): + tensor = tf.zeros((3, 224, 224)) + + class Image(BaseDoc): + tensor: cls_tensor + + da = DocVec[Image]( + [Image(tensor=tensor) for _ in range(10)], + tensor_type=TensorFlowTensor, + ) + + for i in range(len(da)): + assert tnp.allclose(da[i].tensor.tensor, tensor) + + assert 'tensor' in da._storage.tensor_columns.keys() + assert isinstance(da._storage.tensor_columns['tensor'], TensorFlowTensor) + + +@pytest.mark.parametrize( + 'cls_tensor', [ImageTensor, AudioTensor, VideoTensor, AnyEmbedding, AnyTensor] +) +@pytest.mark.tensorflow +def test_generic_tensors_with_optional(cls_tensor): + tensor = tf.zeros((3, 224, 224)) + + class Image(BaseDoc): + tensor: Optional[cls_tensor] + + class TopDoc(BaseDoc): + img: Image + + da = DocVec[TopDoc]( + [TopDoc(img=Image(tensor=tensor)) for _ in range(10)], + tensor_type=TensorFlowTensor, + ) + + for i in range(len(da)): + assert tnp.allclose(da.img[i].tensor.tensor, tensor) + + assert 'tensor' in da.img._storage.tensor_columns.keys() + assert isinstance(da.img._storage.tensor_columns['tensor'], TensorFlowTensor) + assert isinstance(da.img._storage.tensor_columns['tensor'].tensor, tf.Tensor) + + 
+@pytest.mark.tensorflow +def test_get_from_slice_stacked(): + class Doc(BaseDoc): + text: str + tensor: TensorFlowTensor + + da = DocVec[Doc]( + [Doc(text=f'hello{i}', tensor=tf.zeros((3, 224, 224))) for i in range(10)] + ) + + da_sliced = da[0:10:2] + assert isinstance(da_sliced, DocVec) + + tensors = da_sliced.tensor.tensor + assert tensors.shape == (5, 3, 224, 224) + + +@pytest.mark.tensorflow +def test_stack_none(): + class MyDoc(BaseDoc): + tensor: Optional[AnyTensor] = None + + da = DocVec[MyDoc]( + [MyDoc(tensor=None) for _ in range(10)], tensor_type=TensorFlowTensor + ) + assert 'tensor' in da._storage.tensor_columns.keys() + + +@pytest.mark.tensorflow +def test_keep_dtype_tf(): + class MyDoc(BaseDoc): + tensor: TensorFlowTensor + + da = DocList[MyDoc]( + [MyDoc(tensor=tf.zeros([2, 4], dtype=tf.int32)) for _ in range(3)] + ) + assert da[0].tensor.tensor.dtype == tf.int32 + + da = da.to_doc_vec() + assert da[0].tensor.tensor.dtype == tf.int32 + assert da.tensor.tensor.dtype == tf.int32 diff --git a/tests/units/array/stack/test_init.py b/tests/units/array/stack/test_init.py new file mode 100644 index 00000000000..232c9276002 --- /dev/null +++ b/tests/units/array/stack/test_init.py @@ -0,0 +1,48 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + +from docarray import BaseDoc +from docarray.array.doc_vec.doc_vec import DocVec +from docarray.typing import AnyTensor, NdArray + + +def test_da_init(): + class MyDoc(BaseDoc): + tensor: AnyTensor + name: str + + docs = [MyDoc(tensor=np.zeros(10), name='hello') for _ in range(4)] + + da = DocVec[MyDoc](docs, tensor_type=NdArray) + + assert (da._storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() + assert da._storage.any_columns['name'] == ['hello' for _ in range(4)] + + +def test_da_iter(): + class MyDoc(BaseDoc): + tensor: AnyTensor + name: str + + docs = [MyDoc(tensor=i * np.zeros((10, 10)), name=f'hello{i}') for i in range(4)] + + da = DocVec[MyDoc](docs, tensor_type=NdArray) + + for i, doc in enumerate(da): + assert isinstance(doc, MyDoc) + assert (doc.tensor == i * np.zeros((10, 10))).all() + assert doc.name == f'hello{i}' diff --git a/tests/units/array/stack/test_proto.py b/tests/units/array/stack/test_proto.py new file mode 100644 index 00000000000..d46766cde30 --- /dev/null +++ b/tests/units/array/stack/test_proto.py @@ -0,0 +1,348 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from typing import Dict, Optional, Union + +import numpy as np +import pytest +import torch + +from docarray import BaseDoc, DocList +from docarray.array import DocVec +from docarray.typing import NdArray, TorchTensor + + +@pytest.fixture() +def batch(): + class Image(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + batch = DocList[Image]([Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)]) + + return batch.to_doc_vec() + + +@pytest.mark.proto +def test_proto_stacked_mode_torch(batch): + batch.from_protobuf(batch.to_protobuf()) + + +@pytest.mark.proto +def test_proto_stacked_mode_numpy(): + class MyDoc(BaseDoc): + tensor: NdArray[3, 224, 224] + + da = DocList[MyDoc]([MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) + + da = da.to_doc_vec() + + da.from_protobuf(da.to_protobuf()) + + +@pytest.mark.proto +def test_stacked_proto(): + class CustomDocument(BaseDoc): + image: NdArray + + da = DocList[CustomDocument]( + [CustomDocument(image=np.zeros((3, 224, 224))) for _ in range(10)] + ).to_doc_vec() + + da2 = DocVec[CustomDocument].from_protobuf(da.to_protobuf()) + + assert isinstance(da2, DocVec) + assert da.doc_type == da2.doc_type + assert (da2.image == da.image).all() + + +@pytest.mark.proto +def test_proto_none_tensor_column(): + class MyOtherDoc(BaseDoc): + embedding: Union[NdArray, None] = None + other_embedding: NdArray + third_embedding: Union[NdArray, None] = None + + da = DocVec[MyOtherDoc]( + [ + MyOtherDoc( + other_embedding=np.random.random(512), + ), + MyOtherDoc(other_embedding=np.random.random(512)), + ] + ) + assert da._storage.tensor_columns['embedding'] is None + assert da._storage.tensor_columns['other_embedding'] is not None + assert da._storage.tensor_columns['third_embedding'] is None + + proto = da.to_protobuf() + da_after = DocVec[MyOtherDoc].from_protobuf(proto) + + assert da_after._storage.tensor_columns['embedding'] is None + assert da_after._storage.tensor_columns['other_embedding'] is not None + assert ( + 
da_after._storage.tensor_columns['other_embedding'] + == da._storage.tensor_columns['other_embedding'] + ).all() + assert da_after._storage.tensor_columns['third_embedding'] is None + + +@pytest.mark.proto +def test_proto_none_doc_column(): + class InnerDoc(BaseDoc): + embedding: NdArray + + class MyDoc(BaseDoc): + inner: Union[InnerDoc, None] = None + other_inner: Union[InnerDoc, None] = None + + da = DocVec[MyDoc]( + [ + MyDoc(other_inner=InnerDoc(embedding=np.random.random(512))), + MyDoc(other_inner=InnerDoc(embedding=np.random.random(512))), + ] + ) + assert da._storage.doc_columns['inner'] is None + assert len(da._storage.doc_columns['other_inner']) == 2 + + proto = da.to_protobuf() + da_after = DocVec[MyDoc].from_protobuf(proto) + + assert da_after._storage.doc_columns['inner'] is None + assert len(da._storage.doc_columns['other_inner']) == 2 + assert (da.other_inner.embedding == da_after.other_inner.embedding).all() + + +@pytest.mark.proto +def test_proto_none_docvec_column(): + class InnerDoc(BaseDoc): + embedding: NdArray + + class MyDoc(BaseDoc): + inner_l: Union[DocList[InnerDoc], None] = None + inner_v: Union[DocVec[InnerDoc], None] = None + inner_exists_v: Union[DocVec[InnerDoc], None] = None + inner_exists_l: Union[DocList[InnerDoc], None] = None + + def _make_inner_list(): + return DocList[InnerDoc]( + [ + InnerDoc(embedding=np.random.random(512)), + InnerDoc(embedding=np.random.random(512)), + ] + ) + + da = DocVec[MyDoc]( + [ + MyDoc( + inner_exists_l=_make_inner_list(), + inner_exists_v=_make_inner_list().to_doc_vec(), + ), + MyDoc( + inner_exists_l=_make_inner_list(), + inner_exists_v=_make_inner_list().to_doc_vec(), + ), + ] + ) + assert da._storage.docs_vec_columns['inner_l'] is None + assert da._storage.docs_vec_columns['inner_v'] is None + assert len(da._storage.docs_vec_columns['inner_exists_l']) == 2 + assert len(da._storage.docs_vec_columns['inner_exists_v']) == 2 + assert da.inner_exists_l[0].embedding.shape == (2, 512) + assert 
da.inner_exists_l[1].embedding.shape == (2, 512) + assert da.inner_exists_v[0].embedding.shape == (2, 512) + assert da.inner_exists_v[1].embedding.shape == (2, 512) + + proto = da.to_protobuf() + da_after = DocVec[MyDoc].from_protobuf(proto) + + assert da_after._storage.docs_vec_columns['inner_l'] is None + assert da_after._storage.docs_vec_columns['inner_v'] is None + assert len(da._storage.docs_vec_columns['inner_exists_l']) == 2 + assert len(da._storage.docs_vec_columns['inner_exists_v']) == 2 + assert ( + da.inner_exists_l[0].embedding == da_after.inner_exists_l[0].embedding + ).all() + assert ( + da.inner_exists_l[1].embedding == da_after.inner_exists_l[1].embedding + ).all() + assert ( + da.inner_exists_v[0].embedding == da_after.inner_exists_v[0].embedding + ).all() + assert ( + da.inner_exists_v[1].embedding == da_after.inner_exists_v[1].embedding + ).all() + + +@pytest.mark.proto +def test_proto_any_column(): + class MyDoc(BaseDoc): + embedding: NdArray + text: str + d: Dict + + da = DocVec[MyDoc]( + [ + MyDoc( + embedding=np.random.random(512), + text='hi', + d={'a': 1}, + ), + MyDoc(embedding=np.random.random(512), text='there', d={'b': 2}), + ] + ) + assert da._storage.tensor_columns['embedding'].shape == (2, 512) + assert da._storage.any_columns['text'] == ['hi', 'there'] + assert da._storage.any_columns['d'] == [{'a': 1}, {'b': 2}] + + proto = da.to_protobuf() + da_after = DocVec[MyDoc].from_protobuf(proto) + + assert da_after.doc_type == da.doc_type + assert da._storage.tensor_columns['embedding'].shape == (2, 512) + assert ( + da_after._storage.tensor_columns['embedding'] + == da._storage.tensor_columns['embedding'] + ).all() + assert da._storage.any_columns['text'] == ['hi', 'there'] + assert da._storage.any_columns['d'] == [{'a': 1}, {'b': 2}] + + assert (da_after.embedding == da.embedding).all() + assert da_after.text == da.text + assert da_after.d == da.d + + +@pytest.mark.proto +def test_proto_none_any_column(): + class MyDoc(BaseDoc): + text: 
Optional[str] = None + d: Optional[Dict] = None + + da = DocVec[MyDoc]( + [ + MyDoc(), + MyDoc(), + ] + ) + assert da._storage.any_columns['text'] == [None, None] + assert da._storage.any_columns['d'] == [None, None] + + proto = da.to_protobuf() + da_after = DocVec[MyDoc].from_protobuf(proto) + + assert da_after._storage.any_columns['text'] == [None, None] + assert da_after._storage.any_columns['d'] == [None, None] + + +@pytest.mark.skipif('GITHUB_WORKFLOW' in os.environ, reason='Flaky in Github') +@pytest.mark.proto +@pytest.mark.parametrize('tensor_type', [NdArray, TorchTensor]) +def test_proto_tensor_type(tensor_type): + class InnerDoc(BaseDoc): + embedding: tensor_type + + class MyDoc(BaseDoc): + tensor: tensor_type + inner: InnerDoc + inner_v: DocVec[InnerDoc] + + def _get_rand_tens(): + arr = np.random.random(512) + return tensor_type.from_ndarray(arr) if tensor_type == TorchTensor else arr + + da = DocVec[MyDoc]( + [ + MyDoc( + tensor=_get_rand_tens(), + inner=InnerDoc(embedding=_get_rand_tens()), + inner_v=DocVec[InnerDoc]([InnerDoc(embedding=_get_rand_tens())]), + ), + MyDoc( + tensor=_get_rand_tens(), + inner=InnerDoc(embedding=_get_rand_tens()), + inner_v=DocVec[InnerDoc]([InnerDoc(embedding=_get_rand_tens())]), + ), + ] + ) + assert isinstance(da.tensor, tensor_type) + assert da.tensor.shape == (2, 512) + assert isinstance(da.inner.embedding, tensor_type) + assert da.inner.embedding.shape == (2, 512) + assert isinstance(da.inner_v[0].embedding, tensor_type) + assert da.inner_v[0].embedding.shape == (1, 512) + + proto = da.to_protobuf() + da_after = DocVec[MyDoc].from_protobuf(proto, tensor_type=tensor_type) + + assert isinstance(da_after.tensor, tensor_type) + assert (da.tensor == da_after.tensor).all() + assert isinstance(da_after.inner.embedding, tensor_type) + assert (da.inner.embedding == da_after.inner.embedding).all() + assert isinstance(da_after.inner_v[0].embedding, tensor_type) + assert (da.inner_v[0].embedding == 
da_after.inner_v[0].embedding).all() + + +@pytest.mark.tensorflow +def test_proto_tensor_type_tf(): + import tensorflow as tf + + from docarray.typing import TensorFlowTensor + + class InnerDoc(BaseDoc): + embedding: TensorFlowTensor + + class MyDoc(BaseDoc): + tensor: TensorFlowTensor + inner: InnerDoc + inner_v: DocVec[InnerDoc] + + def _get_rand_tens(): + arr = np.random.random(512) + return TensorFlowTensor.from_ndarray(arr) + + da = DocVec[MyDoc]( + [ + MyDoc( + tensor=_get_rand_tens(), + inner=InnerDoc(embedding=_get_rand_tens()), + inner_v=DocVec[InnerDoc]([InnerDoc(embedding=_get_rand_tens())]), + ), + MyDoc( + tensor=_get_rand_tens(), + inner=InnerDoc(embedding=_get_rand_tens()), + inner_v=DocVec[InnerDoc]([InnerDoc(embedding=_get_rand_tens())]), + ), + ] + ) + assert isinstance(da.tensor, TensorFlowTensor) + assert len(da.tensor) == 2 + assert isinstance(da.inner.embedding, TensorFlowTensor) + assert len(da.inner.embedding) == 2 + assert isinstance(da.inner_v[0].embedding, TensorFlowTensor) + assert len(da.inner_v[0].embedding) == 1 + + proto = da.to_protobuf() + da_after = DocVec[MyDoc].from_protobuf(proto, tensor_type=TensorFlowTensor) + + assert isinstance(da_after.tensor, TensorFlowTensor) + assert tf.math.reduce_all(tf.equal(da.tensor.tensor, da_after.tensor.tensor)) + assert isinstance(da_after.inner.embedding, TensorFlowTensor) + assert tf.math.reduce_all( + tf.equal(da.inner.embedding.tensor, da_after.inner.embedding.tensor) + ) + assert isinstance(da_after.inner_v[0].embedding, TensorFlowTensor) + assert tf.math.reduce_all( + tf.equal(da.inner_v[0].embedding.tensor, da_after.inner_v[0].embedding.tensor) + ) diff --git a/tests/units/array/test_array.py b/tests/units/array/test_array.py new file mode 100644 index 00000000000..8e51cc1c37e --- /dev/null +++ b/tests/units/array/test_array.py @@ -0,0 +1,524 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, TypeVar, Union + +import numpy as np +import pytest +import torch +from pydantic import parse_obj_as + +from docarray import BaseDoc, DocList +from docarray.typing import ImageUrl, NdArray, TorchTensor +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + from docarray.typing import TensorFlowTensor + + +@pytest.fixture() +def da(): + class Text(BaseDoc): + text: str + + return DocList[Text]([Text(text=f'hello {i}') for i in range(10)]) + + +def test_iterate(da): + for doc, doc2 in zip(da, da): + assert doc.id == doc2.id + + +def test_append(): + class Text(BaseDoc): + text: str + + da = DocList[Text]([]) + + da.append(Text(text='hello', id='1')) + + assert len(da) == 1 + assert da[0].id == '1' + + +def test_extend(): + class Text(BaseDoc): + text: str + + da = DocList[Text]([Text(text='hello', id=str(i)) for i in range(10)]) + + da.extend([Text(text='hello', id=str(10 + i)) for i in range(10)]) + + assert len(da) == 20 + for da, i in zip(da, range(20)): + assert da.id == str(i) + + +def test_extend_itself(): + class Text(BaseDoc): + text: str + + da = DocList[Text]([Text(text='hello', id=str(i)) for i in range(10)]) + + da.extend(da) + + assert len(da) == 20 + + +def 
test_slice(da): + da2 = da[0:5] + assert type(da2) == da.__class__ + assert len(da2) == 5 + + +def test_document_array(): + class Text(BaseDoc): + text: str + + da = DocList([Text(text='hello') for _ in range(10)]) + + assert len(da) == 10 + + +def test_empty_array(): + da = DocList() + len(da) == 0 + + +def test_document_array_fixed_type(): + class Text(BaseDoc): + text: str + + da = DocList[Text]([Text(text='hello') for _ in range(10)]) + + assert len(da) == 10 + + +def test_ndarray_equality(): + class Text(BaseDoc): + tensor: NdArray + + arr1 = Text(tensor=np.zeros(5)) + arr2 = Text(tensor=np.zeros(5)) + arr3 = Text(tensor=np.ones(5)) + arr4 = Text(tensor=np.zeros(4)) + + assert arr1 == arr2 + assert arr1 != arr3 + assert arr1 != arr4 + + +def test_tensor_equality(): + class Text(BaseDoc): + tensor: TorchTensor + + torch1 = Text(tensor=torch.zeros(128)) + torch2 = Text(tensor=torch.zeros(128)) + torch3 = Text(tensor=torch.zeros(126)) + torch4 = Text(tensor=torch.ones(128)) + + assert torch1 == torch2 + assert torch1 != torch3 + assert torch1 != torch4 + + +def test_documentarray(): + class Text(BaseDoc): + text: str + + da1 = DocList([Text(text='hello')]) + da2 = DocList([Text(text='hello')]) + + assert da1 == da2 + assert da1 == [Text(text='hello') for _ in range(len(da1))] + assert da2 == [Text(text='hello') for _ in range(len(da2))] + + +@pytest.mark.tensorflow +def test_tensorflowtensor_equality(): + class Text(BaseDoc): + tensor: TensorFlowTensor + + tensor1 = Text(tensor=tf.constant([1, 2, 3, 4, 5, 6])) + tensor2 = Text(tensor=tf.constant([1, 2, 3, 4, 5, 6])) + tensor3 = Text(tensor=tf.constant([[1.0, 2.0], [3.0, 5.0]])) + tensor4 = Text(tensor=tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])) + + assert tensor1 == tensor2 + assert tensor1 != tensor3 + assert tensor1 != tensor4 + + +def test_text_tensor(): + class Text1(BaseDoc): + tensor: NdArray + + class Text2(BaseDoc): + tensor: TorchTensor + + arr_tensor1 = Text1(tensor=np.zeros(2)) + arr_tensor2 = 
Text2(tensor=torch.zeros(2)) + + assert arr_tensor1 == arr_tensor2 + + +def test_get_bulk_attributes_function(): + class Mmdoc(BaseDoc): + text: str + tensor: NdArray + + N = 10 + + da = DocList[Mmdoc]( + (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) + ) + + tensors = da._get_data_column('tensor') + + assert len(tensors) == N + for tensor in tensors: + assert tensor.shape == (3, 224, 224) + + texts = da._get_data_column('text') + + assert len(texts) == N + for i, text in enumerate(texts): + assert text == f'hello{i}' + + +def test_set_attributes(): + class InnerDoc(BaseDoc): + text: str + + class Mmdoc(BaseDoc): + inner: InnerDoc + + N = 10 + + da = DocList[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) + + list_docs = [InnerDoc(text=f'hello{i}') for i in range(N)] + da._set_data_column('inner', list_docs) + + for doc, list_doc in zip(da, list_docs): + assert doc.inner == list_doc + + +def test_get_bulk_attributes(): + class Mmdoc(BaseDoc): + text: str + tensor: NdArray + + N = 10 + + da = DocList[Mmdoc]( + (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) + ) + + tensors = da.tensor + + assert len(tensors) == N + for tensor in tensors: + assert tensor.shape == (3, 224, 224) + + texts = da.text + + assert len(texts) == N + for i, text in enumerate(texts): + assert text == f'hello{i}' + + +def test_get_bulk_attributes_document(): + class InnerDoc(BaseDoc): + text: str + + class Mmdoc(BaseDoc): + inner: InnerDoc + + N = 10 + + da = DocList[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) + + assert isinstance(da.inner, DocList) + + +def test_get_bulk_attributes_optional_type(): + class Mmdoc(BaseDoc): + text: str + tensor: Optional[NdArray] + + N = 10 + + da = DocList[Mmdoc]( + (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) + ) + + tensors = da.tensor + + assert len(tensors) == N + for tensor in tensors: + assert tensor.shape == (3, 224, 224) + 
+ texts = da.text + + assert len(texts) == N + for i, text in enumerate(texts): + assert text == f'hello{i}' + + +def test_get_bulk_attributes_union_type(): + class Mmdoc(BaseDoc): + text: str + tensor: Union[NdArray, TorchTensor] + + N = 10 + + da = DocList[Mmdoc]( + (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) + ) + + tensors = da.tensor + + assert len(tensors) == N + assert isinstance(tensors, list) + for tensor in tensors: + assert tensor.shape == (3, 224, 224) + + texts = da.text + + assert len(texts) == N + for i, text in enumerate(texts): + assert text == f'hello{i}' + + +@pytest.mark.tensorflow +def test_get_bulk_attributes_union_type_nested(): + class MyDoc(BaseDoc): + embedding: Union[Optional[TorchTensor], Optional[NdArray]] + embedding2: Optional[Union[TorchTensor, NdArray, TensorFlowTensor]] + embedding3: Optional[Optional[TorchTensor]] + embedding4: Union[ + Optional[Union[TorchTensor, NdArray, TensorFlowTensor]], TorchTensor + ] + + da = DocList[MyDoc]( + [ + MyDoc( + embedding=torch.rand(10), + embedding2=torch.rand(10), + embedding3=torch.rand(10), + embedding4=torch.rand(10), + ) + for _ in range(10) + ] + ) + + for attr in ['embedding', 'embedding2', 'embedding3', 'embedding4']: + tensors = getattr(da, attr) + assert len(tensors) == 10 + assert isinstance(tensors, list) + for tensor in tensors: + assert tensor.shape == (10,) + + +def test_get_from_slice(): + class Doc(BaseDoc): + text: str + tensor: NdArray + + N = 10 + + da = DocList[Doc]( + (Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) + ) + + da_sliced = da[0:10:2] + assert isinstance(da_sliced, DocList) + + tensors = da_sliced.tensor + assert len(tensors) == 5 + for tensor in tensors: + assert tensor.shape == (3, 224, 224) + + texts = da_sliced.text + assert len(texts) == 5 + for i, text in enumerate(texts): + assert text == f'hello{i * 2}' + + +def test_del_item(da): + assert len(da) == 10 + del da[2] + assert len(da) == 9 + assert 
da.text == [ + 'hello 0', + 'hello 1', + 'hello 3', + 'hello 4', + 'hello 5', + 'hello 6', + 'hello 7', + 'hello 8', + 'hello 9', + ] + del da[0:2] + assert len(da) == 7 + assert da.text == [ + 'hello 3', + 'hello 4', + 'hello 5', + 'hello 6', + 'hello 7', + 'hello 8', + 'hello 9', + ] + + +def test_generic_type_var(): + T = TypeVar('T', bound=BaseDoc) + + def f(a: DocList[T]) -> DocList[T]: + return a + + def g(a: DocList['BaseDoc']) -> DocList['BaseDoc']: + return a + + a = DocList() + f(a) + g(a) + + +def test_construct(): + class Text(BaseDoc): + text: str + + docs = [Text(text=f'hello {i}') for i in range(10)] + [BaseDoc()] + + da = DocList[Text].construct(docs) + + assert type(da[-1]) == BaseDoc + + +def test_reverse(): + class Text(BaseDoc): + text: str + + docs = [Text(text=f'hello {i}') for i in range(10)] + + da = DocList[Text](docs) + da.reverse() + assert da[-1].text == 'hello 0' + assert da[0].text == 'hello 9' + + +class Image(BaseDoc): + tensor: Optional[NdArray] = None + url: ImageUrl + + +def test_remove(): + images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] + da = DocList[Image](images) + da.remove(images[1]) + assert len(da) == 2 + assert da[0] == images[0] + assert da[1] == images[2] + + +def test_pop(): + images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] + da = DocList[Image](images) + popped = da.pop(1) + assert len(da) == 2 + assert popped == images[1] + assert da[0] == images[0] + assert da[1] == images[2] + + +def test_sort(): + images = [ + Image(url=f'http://url.com/foo_{i}.png', tensor=NdArray(i)) for i in [2, 0, 1] + ] + da = DocList[Image](images) + da.sort(key=lambda img: len(img.tensor)) + assert len(da) == 3 + assert da[0].url == 'http://url.com/foo_0.png' + assert da[1].url == 'http://url.com/foo_1.png' + + +def test_optional_field(): + from typing import Optional + + from docarray import BaseDoc, DocList + from docarray.typing import ImageUrl, NdArray + + class Nested(BaseDoc): + tensor: 
NdArray + + class Image(BaseDoc): + url: ImageUrl + features: Optional[Nested] = None + + docs = DocList[Image]([Image(url='http://url.com/foo.png') for _ in range(10)]) + + assert docs.features == [None for _ in range(10)] + assert isinstance(docs.features, list) + assert not isinstance(docs.features, DocList) + + +def test_validate_list_dict(): + images = [ + dict(url=f'http://url.com/foo_{i}.png', tensor=NdArray(i)) for i in [2, 0, 1] + ] + + # docs = DocList[Image]([Image(url=image['url'], tensor=image['tensor']) for image in images]) + + docs = parse_obj_as(DocList[Image], images) + + assert docs.url == [ + 'http://url.com/foo_2.png', + 'http://url.com/foo_0.png', + 'http://url.com/foo_1.png', + ] + + +def test_legacy_doc(): + from docarray.documents.legacy import LegacyDocument + + newDoc = LegacyDocument() + da = DocList[LegacyDocument]([newDoc]) + da.summary() + + +def test_parameterize_list(): + from docarray import DocList, BaseDoc + + with pytest.raises(TypeError) as excinfo: + da = DocList[BaseDoc()] + assert da is None + + assert str(excinfo.value) == 'Expecting a type, got object instead' + + +def test_not_double_subcriptable(): + from docarray import DocList + from docarray.documents import TextDoc + + with pytest.raises(TypeError) as excinfo: + da = DocList[TextDoc][TextDoc] + assert da is None diff --git a/tests/units/array/test_array_from_to_bytes.py b/tests/units/array/test_array_from_to_bytes.py new file mode 100644 index 00000000000..0ab952ce4a7 --- /dev/null +++ b/tests/units/array/test_array_from_to_bytes.py @@ -0,0 +1,148 @@ +import pytest + +from docarray import BaseDoc, DocList, DocVec +from docarray.documents import ImageDoc +from docarray.typing import NdArray, TorchTensor + + +class MyDoc(BaseDoc): + embedding: NdArray + text: str + image: ImageDoc + + +@pytest.mark.parametrize( + 'protocol', ['pickle-array', 'protobuf-array', 'protobuf', 'pickle'] +) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) 
+@pytest.mark.parametrize('show_progress', [False, True]) +@pytest.mark.parametrize('array_cls', [DocList, DocVec]) +def test_from_to_bytes(protocol, compress, show_progress, array_cls): + da = array_cls[MyDoc]( + [ + MyDoc( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ] + ) + bytes_da = da.to_bytes( + protocol=protocol, compress=compress, show_progress=show_progress + ) + da2 = array_cls[MyDoc].from_bytes( + bytes_da, protocol=protocol, compress=compress, show_progress=show_progress + ) + assert len(da2) == 2 + assert len(da) == len(da2) + for d1, d2 in zip(da, da2): + assert d1.embedding.tolist() == d2.embedding.tolist() + assert d1.text == d2.text + assert d1.image.url == d2.image.url + assert da[1].image.url is None + assert da2[1].image.url is None + + +@pytest.mark.parametrize( + 'protocol', ['pickle-array', 'protobuf-array', 'protobuf', 'pickle'] +) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +@pytest.mark.parametrize('show_progress', [False, True]) # [False, True]) +@pytest.mark.parametrize('array_cls', [DocList, DocVec]) +def test_from_to_base64(protocol, compress, show_progress, array_cls): + da = array_cls[MyDoc]( + [ + MyDoc( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ] + ) + bytes_da = da.to_base64( + protocol=protocol, compress=compress, show_progress=show_progress + ) + da2 = array_cls[MyDoc].from_base64( + bytes_da, protocol=protocol, compress=compress, show_progress=show_progress + ) + assert len(da2) == 2 + assert len(da) == len(da2) + for d1, d2 in zip(da, da2): + assert d1.embedding.tolist() == d2.embedding.tolist() + assert d1.text == d2.text + assert d1.image.url == d2.image.url + + assert da[1].image.url is None + assert da2[1].image.url is None + + +# test_from_to_base64('protobuf', 
'lz4', False, DocVec) +class MyTensorTypeDocNdArray(BaseDoc): + embedding: NdArray + text: str + image: ImageDoc + + +class MyTensorTypeDocTorchTensor(BaseDoc): + embedding: TorchTensor + text: str + image: ImageDoc + + +@pytest.mark.parametrize( + 'doc_type, tensor_type', + [(MyTensorTypeDocNdArray, NdArray), (MyTensorTypeDocTorchTensor, TorchTensor)], +) +@pytest.mark.parametrize('protocol', ['protobuf-array', 'pickle-array']) +def test_from_to_base64_tensor_type(doc_type, tensor_type, protocol): + da = DocVec[doc_type]( + [ + doc_type( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + doc_type(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ], + tensor_type=tensor_type, + ) + bytes_da = da.to_base64(protocol=protocol) + da2 = DocVec[doc_type].from_base64( + bytes_da, tensor_type=tensor_type, protocol=protocol + ) + assert da2.tensor_type == tensor_type + assert isinstance(da2.embedding, tensor_type) + + +@pytest.mark.parametrize('tensor_type', [NdArray, TorchTensor]) +def test_from_to_bytes_tensor_type(tensor_type): + da = DocVec[MyDoc]( + [ + MyDoc( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ], + tensor_type=tensor_type, + ) + bytes_da = da.to_bytes() + da2 = DocVec[MyDoc].from_bytes(bytes_da, tensor_type=tensor_type) + assert da2.tensor_type == tensor_type + assert isinstance(da2.embedding, tensor_type) + + +def test_union_type_error(tmp_path): + from typing import Union + + from docarray.documents import TextDoc + + class CustomDoc(BaseDoc): + ud: Union[TextDoc, ImageDoc] = TextDoc(text='union type') + + docs = DocList[CustomDoc]([CustomDoc(ud=TextDoc(text='union type'))]) + + with pytest.raises(ValueError): + docs.from_bytes(docs.to_bytes()) + + class BasisUnion(BaseDoc): + ud: Union[int, str] + + docs_basic = DocList[BasisUnion]([BasisUnion(ud="hello")]) + docs_copy = 
DocList[BasisUnion].from_bytes(docs_basic.to_bytes()) + assert docs_copy == docs_basic diff --git a/tests/units/array/test_array_from_to_csv.py b/tests/units/array/test_array_from_to_csv.py new file mode 100644 index 00000000000..07d353ffc0f --- /dev/null +++ b/tests/units/array/test_array_from_to_csv.py @@ -0,0 +1,166 @@ +import os +from typing import Optional + +import pytest + +from docarray import BaseDoc, DocList, DocVec +from docarray.documents import ImageDoc +from tests import TOYDATA_DIR + + +@pytest.fixture() +def nested_doc_cls(): + class MyDoc(BaseDoc): + count: Optional[int] = None + text: str + + class MyDocNested(MyDoc): + image: ImageDoc + image2: ImageDoc + + return MyDocNested + + +def test_to_from_csv(tmpdir, nested_doc_cls): + da = DocList[nested_doc_cls]( + [ + nested_doc_cls( + count=0, + text='hello', + image=ImageDoc(url='aux.png'), + image2=ImageDoc(url='aux.png'), + ), + nested_doc_cls(text='hello world', image=ImageDoc(), image2=ImageDoc()), + ] + ) + tmp_file = str(tmpdir / 'tmp.csv') + da.to_csv(tmp_file) + assert os.path.isfile(tmp_file) + + da_from = DocList[nested_doc_cls].from_csv(tmp_file) + assert isinstance(da_from, DocList) + for doc1, doc2 in zip(da, da_from): + assert doc1 == doc2 + + +def test_from_csv_nested(nested_doc_cls): + da = DocList[nested_doc_cls].from_csv( + file_path=str(TOYDATA_DIR / 'docs_nested.csv') + ) + assert isinstance(da, DocList) + assert len(da) == 3 + + for i, doc in enumerate(da): + assert doc.count.__class__ == int + assert doc.count == int(f'{i}{i}{i}') + + assert doc.text.__class__ == str + assert doc.text == f'hello {i}' + + assert doc.image.__class__ == ImageDoc + assert doc.image.tensor is None + assert doc.image.embedding is None + assert doc.image.bytes_ is None + + assert doc.image2.__class__ == ImageDoc + assert doc.image2.tensor is None + assert doc.image2.embedding is None + assert doc.image2.bytes_ is None + + assert da[0].image2.url == 'image_10.png' + assert da[1].image2.url is None + 
assert da[2].image2.url is None + + +@pytest.fixture() +def nested_doc(): + class Inner(BaseDoc): + img: Optional[ImageDoc] = None + + class Middle(BaseDoc): + img: Optional[ImageDoc] = None + inner: Optional[Inner] = None + + class Outer(BaseDoc): + img: Optional[ImageDoc] = None + middle: Optional[Middle] = None + + doc = Outer( + img=ImageDoc(), middle=Middle(img=ImageDoc(), inner=Inner(img=ImageDoc())) + ) + return doc + + +def test_from_csv_without_schema_raise_exception(): + with pytest.raises(TypeError, match='no document schema defined'): + DocList.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv')) + + +def test_from_csv_with_wrong_schema_raise_exception(nested_doc): + with pytest.raises(ValueError, match='Column names do not match the schema'): + DocList[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv')) + + +def test_from_remote_csv_file(): + remote_url = 'https://github.com/docarray/docarray/blob/main/tests/toydata/books.csv?raw=true' + + class Book(BaseDoc): + title: str + author: str + year: int + + books = DocList[Book].from_csv(file_path=remote_url) + assert isinstance(books, DocList) + + assert len(books) == 3 + + +def test_doc_list_error(tmpdir): + class Book(BaseDoc): + title: str + + # not testing DocVec bc it already fails here (as it should!) 
+ docs = DocList([Book(title='hello'), Book(title='world')]) + tmp_file = str(tmpdir / 'tmp.csv') + with pytest.raises(TypeError): + docs.to_csv(tmp_file) + + +def test_union_type_error(tmp_path): + from typing import Union + + from docarray.documents import TextDoc + + class CustomDoc(BaseDoc): + ud: Union[TextDoc, ImageDoc] = TextDoc(text='union type') + + docs = DocList[CustomDoc]([CustomDoc(ud=TextDoc(text='union type'))]) + + with pytest.raises(ValueError): + docs.to_csv(str(tmp_path) + ".csv") + DocList[CustomDoc].from_csv(str(tmp_path) + ".csv") + + class BasisUnion(BaseDoc): + ud: Union[int, str] + + docs_basic = DocList[BasisUnion]([BasisUnion(ud="hello")]) + docs_basic.to_csv(str(tmp_path) + ".csv") + docs_copy = DocList[BasisUnion].from_csv(str(tmp_path) + ".csv") + assert docs_copy == docs_basic + + +def test_to_from_csv_docvec_raises(): + class Book(BaseDoc): + title: str + author: str + year: int + + books = DocVec[Book]( + [Book(title='It\'s me, hi', author='I\'m the problem it\'s me', year=2022)] + ) + + with pytest.raises(NotImplementedError): + books.to_csv('dummy/file/path') + + with pytest.raises(NotImplementedError): + DocVec[Book].from_csv('dummy/file/path') diff --git a/tests/units/array/test_array_from_to_json.py b/tests/units/array/test_array_from_to_json.py new file mode 100644 index 00000000000..2324652c6d0 --- /dev/null +++ b/tests/units/array/test_array_from_to_json.py @@ -0,0 +1,180 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Dict, List + +import numpy as np +import pytest +import torch + +from docarray import BaseDoc, DocList, DocVec +from docarray.documents import ImageDoc +from docarray.typing import NdArray, TorchTensor + + +class MyDoc(BaseDoc): + embedding: NdArray + text: str + image: ImageDoc + + +def test_from_to_json_doclist(): + da = DocList[MyDoc]( + [ + MyDoc( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ] + ) + json_da = da.to_json() + da2 = DocList[MyDoc].from_json(json_da) + assert len(da2) == 2 + assert len(da) == len(da2) + for d1, d2 in zip(da, da2): + assert d1.embedding.tolist() == d2.embedding.tolist() + assert d1.text == d2.text + assert d1.image.url == d2.image.url + assert da[1].image.url is None + assert da2[1].image.url is None + + +@pytest.mark.parametrize('tensor_type', [TorchTensor, NdArray]) +def test_from_to_json_docvec(tensor_type): + def generate_docs(tensor_type): + class InnerDoc(BaseDoc): + tens: tensor_type + + class MyDoc(BaseDoc): + text: str + num: Optional[int] = None + tens: tensor_type + tens_none: Optional[tensor_type] = None + inner: InnerDoc + inner_none: Optional[InnerDoc] = None + inner_vec: DocVec[InnerDoc] + inner_vec_none: Optional[DocVec[InnerDoc]] = None + + def _rand_vec_gen(tensor_type): + arr = np.random.rand(5) + if tensor_type == TorchTensor: + arr = torch.from_numpy(arr).to(torch.float32) + return arr + + inner = InnerDoc(tens=_rand_vec_gen(tensor_type)) + inner_vec = 
DocVec[InnerDoc]([inner, inner], tensor_type=tensor_type) + vec = DocVec[MyDoc]( + [ + MyDoc( + text=str(i), + num=None, + tens=_rand_vec_gen(tensor_type), + inner=inner, + inner_none=None, + inner_vec=inner_vec, + inner_vec_none=None, + ) + for i in range(5) + ], + tensor_type=tensor_type, + ) + return vec + + v = generate_docs(tensor_type) + json_str = v.to_json() + + v_after = DocVec[v.doc_type].from_json(json_str, tensor_type=tensor_type) + + assert v_after.tensor_type == v.tensor_type + assert set(v_after._storage.columns.keys()) == set(v._storage.columns.keys()) + assert v_after._storage == v._storage + + +@pytest.mark.tensorflow +def test_from_to_json_docvec_tf(): + from docarray.typing import TensorFlowTensor + + def generate_docs(): + class InnerDoc(BaseDoc): + tens: TensorFlowTensor + + class MyDoc(BaseDoc): + text: str + num: Optional[int] = None + tens: TensorFlowTensor + tens_none: Optional[TensorFlowTensor] = None + inner: InnerDoc + inner_none: Optional[InnerDoc] = None + inner_vec: DocVec[InnerDoc] + inner_vec_none: Optional[DocVec[InnerDoc]] = None + + inner = InnerDoc(tens=np.random.rand(5)) + inner_vec = DocVec[InnerDoc]([inner, inner], tensor_type=TensorFlowTensor) + vec = DocVec[MyDoc]( + [ + MyDoc( + text=str(i), + num=None, + tens=np.random.rand(5), + inner=inner, + inner_none=None, + inner_vec=inner_vec, + inner_vec_none=None, + ) + for i in range(5) + ], + tensor_type=TensorFlowTensor, + ) + return vec + + v = generate_docs() + json_str = v.to_json() + + v_after = DocVec[v.doc_type].from_json(json_str, tensor_type=TensorFlowTensor) + + assert v_after.tensor_type == v.tensor_type + assert set(v_after._storage.columns.keys()) == set(v._storage.columns.keys()) + assert v_after._storage == v._storage + + +def test_union_type(): + from typing import Union + + from docarray.documents import TextDoc + + class CustomDoc(BaseDoc): + ud: Union[TextDoc, ImageDoc] = TextDoc(text='union type') + + docs = 
DocList[CustomDoc]([CustomDoc(ud=TextDoc(text='union type'))]) + + docs_copy = docs.from_json(docs.to_json()) + assert docs == docs_copy + + +@pytest.mark.parametrize('tensor_type', [NdArray, TorchTensor]) +def test_from_to_json_tensor_type(tensor_type): + da = DocVec[MyDoc]( + [ + MyDoc( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ], + tensor_type=tensor_type, + ) + json_da = da.to_json() + da2 = DocVec[MyDoc].from_json(json_da, tensor_type=tensor_type) + assert da2.tensor_type == tensor_type + assert isinstance(da2.embedding, tensor_type) diff --git a/tests/units/array/test_array_from_to_pandas.py b/tests/units/array/test_array_from_to_pandas.py new file mode 100644 index 00000000000..440398562ff --- /dev/null +++ b/tests/units/array/test_array_from_to_pandas.py @@ -0,0 +1,162 @@ +from typing import List, Optional + +import pandas as pd +import pytest + +from docarray import BaseDoc, DocList, DocVec +from docarray.documents import ImageDoc +from docarray.typing import NdArray, TorchTensor + + +@pytest.fixture() +def nested_doc_cls(): + class MyDoc(BaseDoc): + count: Optional[int] = None + text: str + + class MyDocNested(MyDoc): + image: ImageDoc + lst: List[str] + + return MyDocNested + + +@pytest.mark.parametrize('doc_vec', [False, True]) +def test_to_from_pandas_df(nested_doc_cls, doc_vec): + da = DocList[nested_doc_cls]( + [ + nested_doc_cls( + count=0, + text='hello', + image=ImageDoc(url='aux.png'), + lst=["hello", "world"], + ), + nested_doc_cls( + text='hello world', image=ImageDoc(), lst=["hello", "world"] + ), + ] + ) + if doc_vec: + da = da.to_doc_vec() + df = da.to_dataframe() + assert isinstance(df, pd.DataFrame) + assert len(df) == 2 + assert ( + df.columns + == [ + 'id', + 'count', + 'text', + 'image__id', + 'image__url', + 'image__tensor', + 'image__embedding', + 'image__bytes_', + 'lst', + ] + ).all() + + if doc_vec: + da_from_df = 
DocVec[nested_doc_cls].from_dataframe(df) + assert isinstance(da_from_df, DocVec) + else: + da_from_df = DocList[nested_doc_cls].from_dataframe(df) + assert isinstance(da_from_df, DocList) + for doc1, doc2 in zip(da, da_from_df): + assert doc1 == doc2 + + +@pytest.fixture() +def nested_doc(): + class Inner(BaseDoc): + img: Optional[ImageDoc] = None + + class Middle(BaseDoc): + img: Optional[ImageDoc] = None + inner: Optional[Inner] = None + + class Outer(BaseDoc): + img: Optional[ImageDoc] = None + middle: Optional[Middle] = None + + doc = Outer( + img=ImageDoc(), middle=Middle(img=ImageDoc(), inner=Inner(img=ImageDoc())) + ) + return doc + + +@pytest.mark.parametrize('array_cls', [DocList, DocVec]) +def test_from_pandas_without_schema_raise_exception(array_cls): + with pytest.raises(TypeError, match='no document schema defined'): + df = pd.DataFrame( + columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] + ) + array_cls.from_dataframe(df=df) + + +@pytest.mark.parametrize('array_cls', [DocList, DocVec]) +def test_from_pandas_with_wrong_schema_raise_exception(nested_doc, array_cls): + with pytest.raises(ValueError, match='Column names do not match the schema'): + df = pd.DataFrame( + columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] + ) + array_cls[nested_doc.__class__].from_dataframe(df=df) + + +def test_doc_list_error(): + class Book(BaseDoc): + title: str + + # not testing DocVec bc it already fails here (as it should!) 
+ docs = DocList([Book(title='hello'), Book(title='world')]) + with pytest.raises(TypeError): + docs.to_dataframe() + + +@pytest.mark.proto +def test_union_type_error(): + from typing import Union + + from docarray.documents import TextDoc + + class CustomDoc(BaseDoc): + ud: Union[TextDoc, ImageDoc] = TextDoc(text='union type') + + docs = DocList[CustomDoc]([CustomDoc(ud=TextDoc(text='union type'))]) + + with pytest.raises(ValueError): + DocList[CustomDoc].from_dataframe(docs.to_dataframe()) + + class BasisUnion(BaseDoc): + ud: Union[int, str] + + docs_basic = DocList[BasisUnion]([BasisUnion(ud="hello")]) + docs_copy = DocList[BasisUnion].from_dataframe(docs_basic.to_dataframe()) + assert docs_copy == docs_basic + + +@pytest.mark.parametrize('tensor_type', [NdArray, TorchTensor]) +@pytest.mark.parametrize('tensor_len', [0, 5]) +def test_from_to_pandas_tensor_type(tensor_type, tensor_len): + class MyDoc(BaseDoc): + embedding: tensor_type + text: str + image: ImageDoc + + da = DocVec[MyDoc]( + [ + MyDoc( + embedding=list(range(tensor_len)), + text='hello', + image=ImageDoc(url='aux.png'), + ), + MyDoc( + embedding=list(range(tensor_len)), text='hello world', image=ImageDoc() + ), + ], + tensor_type=tensor_type, + ) + df_da = da.to_dataframe() + da2 = DocVec[MyDoc].from_dataframe(df_da, tensor_type=tensor_type) + assert da2.tensor_type == tensor_type + assert isinstance(da2.embedding, tensor_type) diff --git a/tests/units/array/test_array_proto.py b/tests/units/array/test_array_proto.py new file mode 100644 index 00000000000..8b6cc172725 --- /dev/null +++ b/tests/units/array/test_array_proto.py @@ -0,0 +1,167 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pytest +from typing import Dict, List + +from docarray import BaseDoc, DocList +from docarray.base_doc import AnyDoc +from docarray.documents import ImageDoc, TextDoc +from docarray.typing import NdArray + + +@pytest.mark.proto +def test_simple_proto(): + class CustomDoc(BaseDoc): + text: str + tensor: NdArray + + da = DocList( + [CustomDoc(text='hello', tensor=np.zeros((3, 224, 224))) for _ in range(10)] + ) + + new_da = DocList[CustomDoc].from_protobuf(da.to_protobuf()) + + for doc1, doc2 in zip(da, new_da): + assert doc1.text == doc2.text + assert (doc1.tensor == doc2.tensor).all() + + +@pytest.mark.proto +def test_nested_proto(): + class CustomDocument(BaseDoc): + text: TextDoc + image: ImageDoc + + da = DocList[CustomDocument]( + [ + CustomDocument( + text=TextDoc(text='hello'), + image=ImageDoc(tensor=np.zeros((3, 224, 224))), + ) + for _ in range(10) + ] + ) + + DocList[CustomDocument].from_protobuf(da.to_protobuf()) + + +@pytest.mark.proto +def test_nested_proto_any_doc(): + class CustomDocument(BaseDoc): + text: TextDoc + image: ImageDoc + + da = DocList[CustomDocument]( + [ + CustomDocument( + text=TextDoc(text='hello'), + image=ImageDoc(tensor=np.zeros((3, 224, 224))), + ) + for _ in range(10) + ] + ) + + DocList.from_protobuf(da.to_protobuf()) + + +@pytest.mark.proto +def test_any_doc_list_proto(): + doc = AnyDoc(hello='world') + pt = 
DocList([doc]).to_protobuf() + docs = DocList.from_protobuf(pt) + assert docs[0].hello == 'world' + + +@pytest.mark.proto +def test_any_nested_doc_list_proto(): + from docarray import BaseDoc, DocList + + class TextDocWithId(BaseDoc): + id: str + text: str + + class ResultTestDoc(BaseDoc): + matches: DocList[TextDocWithId] + + index_da = DocList[TextDocWithId]( + [TextDocWithId(id=f'{i}', text=f'ID {i}') for i in range(10)] + ) + + out_da = DocList[ResultTestDoc]([ResultTestDoc(matches=index_da[0:2])]) + pb = out_da.to_protobuf() + docs = DocList.from_protobuf(pb) + assert docs[0].matches[0].id == '0' + assert len(docs[0].matches) == 2 + assert len(docs) == 1 + + +@pytest.mark.proto +def test_union_type_error(): + from typing import Union + + class CustomDoc(BaseDoc): + ud: Union[TextDoc, ImageDoc] = TextDoc(text='union type') + + docs = DocList[CustomDoc]([CustomDoc(ud=TextDoc(text='union type'))]) + + with pytest.raises(ValueError): + DocList[CustomDoc].from_protobuf(docs.to_protobuf()) + + class BasisUnion(BaseDoc): + ud: Union[int, str] + + docs_basic = DocList[BasisUnion]([BasisUnion(ud="hello")]) + docs_copy = DocList[BasisUnion].from_protobuf(docs_basic.to_protobuf()) + assert docs_copy == docs_basic + + +class MySimpleDoc(BaseDoc): + title: str + + +class MyComplexDoc(BaseDoc): + content_dict_doclist: Dict[str, DocList[MySimpleDoc]] + content_dict_list: Dict[str, List[MySimpleDoc]] + aux_dict: Dict[str, int] + + +def test_to_from_proto_complex(): + da = DocList[MyComplexDoc]( + [ + MyComplexDoc( + content_dict_doclist={ + 'test1': DocList[MySimpleDoc]( + [MySimpleDoc(title='123'), MySimpleDoc(title='456')] + ) + }, + content_dict_list={ + 'test1': [MySimpleDoc(title='123'), MySimpleDoc(title='456')] + }, + aux_dict={'a': 0}, + ) + ] + ) + da2 = DocList[MyComplexDoc].from_protobuf(da.to_protobuf()) + assert len(da2) == 1 + d2 = da2[0] + assert d2.aux_dict == {'a': 0} + assert len(d2.content_dict_doclist['test1']) == 2 + assert 
d2.content_dict_doclist['test1'][0].title == '123' + assert d2.content_dict_doclist['test1'][1].title == '456' + assert len(d2.content_dict_list['test1']) == 2 + assert d2.content_dict_list['test1'][0].title == '123' + assert d2.content_dict_list['test1'][1].title == '456' diff --git a/tests/units/array/test_array_save_load.py b/tests/units/array/test_array_save_load.py new file mode 100644 index 00000000000..b5ee6b616e4 --- /dev/null +++ b/tests/units/array/test_array_save_load.py @@ -0,0 +1,137 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import numpy as np +import pytest + +from docarray import BaseDoc, DocList, DocVec +from docarray.documents import ImageDoc +from docarray.typing import NdArray, TorchTensor + + +class MyDoc(BaseDoc): + embedding: NdArray + text: str + image: ImageDoc + + +@pytest.mark.slow +@pytest.mark.parametrize( + 'protocol', ['pickle-array', 'protobuf-array', 'protobuf', 'pickle', 'json-array'] +) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +@pytest.mark.parametrize('show_progress', [False, True]) +@pytest.mark.parametrize('array_cls', [DocList, DocVec]) +def test_array_save_load_binary(protocol, compress, tmp_path, show_progress, array_cls): + tmp_file = os.path.join(tmp_path, 'test') + + da = array_cls[MyDoc]( + [ + MyDoc( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ] + ) + + da.save_binary( + tmp_file, protocol=protocol, compress=compress, show_progress=show_progress + ) + + da2 = array_cls[MyDoc].load_binary( + tmp_file, protocol=protocol, compress=compress, show_progress=show_progress + ) + + assert len(da2) == 2 + assert len(da) == len(da2) + for d1, d2 in zip(da, da2): + assert d1.embedding.tolist() == d2.embedding.tolist() + assert d1.text == d2.text + assert d1.image.url == d2.image.url + assert da[1].image.url is None + assert da2[1].image.url is None + + +@pytest.mark.slow +@pytest.mark.parametrize( + 'protocol', ['pickle-array', 'protobuf-array', 'protobuf', 'pickle', 'json-array'] +) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +@pytest.mark.parametrize('show_progress', [False, True]) +@pytest.mark.parametrize('to_doc_vec', [True, False]) +def test_array_save_load_binary_streaming( + protocol, compress, tmp_path, show_progress, to_doc_vec +): + tmp_file = os.path.join(tmp_path, 'test') + array_cls = DocVec if to_doc_vec else DocList + + da = DocList[MyDoc]() + + 
def _extend_da(num_docs=100): + for _ in range(num_docs): + da.extend( + [ + MyDoc( + embedding=np.random.rand(3, 2), + text='hello', + image=ImageDoc(url='aux.png'), + ), + ] + ) + + _extend_da() + if to_doc_vec: + da = da.to_doc_vec() + + da.save_binary( + tmp_file, protocol=protocol, compress=compress, show_progress=show_progress + ) + + da_after = array_cls[MyDoc].load_binary( + tmp_file, protocol=protocol, compress=compress, show_progress=show_progress + ) + + for i, doc in enumerate(da_after): + assert doc.id == da[i].id + assert doc.text == da[i].text + assert doc.image.url == da[i].image.url + + assert i == 99 + + +@pytest.mark.parametrize('tensor_type', [NdArray, TorchTensor]) +def test_save_load_tensor_type(tensor_type, tmp_path): + tmp_file = os.path.join(tmp_path, 'test123') + + class MyDoc(BaseDoc): + embedding: tensor_type + text: str + image: ImageDoc + + da = DocVec[MyDoc]( + [ + MyDoc( + embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') + ), + MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()), + ], + tensor_type=tensor_type, + ) + da.save_binary(tmp_file) + da2 = DocVec[MyDoc].load_binary(tmp_file, tensor_type=tensor_type) + assert da2.tensor_type == tensor_type + assert isinstance(da2.embedding, tensor_type) diff --git a/tests/units/array/test_batching.py b/tests/units/array/test_batching.py new file mode 100644 index 00000000000..0387b7a2b91 --- /dev/null +++ b/tests/units/array/test_batching.py @@ -0,0 +1,61 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pytest + +from docarray import BaseDoc, DocList +from docarray.typing import NdArray + + +@pytest.mark.parametrize('shuffle', [False, True]) +@pytest.mark.parametrize('stack', [False, True]) +@pytest.mark.parametrize('batch_size,n_batches', [(16, 7), (10, 10)]) +def test_batch(shuffle, stack, batch_size, n_batches): + class MyDoc(BaseDoc): + id: int + tensor: NdArray + + t_shape = (32, 32) + da = DocList[MyDoc]( + [ + MyDoc( + id=str(i), + tensor=np.zeros(t_shape), + ) + for i in range(100) + ] + ) + if stack: + da = da.to_doc_vec() + + batches = list(da._batch(batch_size=batch_size, shuffle=shuffle)) + assert len(batches) == n_batches + + for i, batch in enumerate(batches): + if i < n_batches - 1: + assert len(batch) == batch_size + if stack: + assert batch.tensor.shape == (batch_size, *t_shape) + else: + assert len(batch) <= batch_size + + non_shuffled_ids = [ + i for i in range(i * batch_size, min((i + 1) * batch_size, len(da))) + ] + if not shuffle: + assert batch.id == non_shuffled_ids + else: + assert not (batch.id == non_shuffled_ids) diff --git a/tests/units/array/test_doclist_schema.py b/tests/units/array/test_doclist_schema.py new file mode 100644 index 00000000000..02a5f562807 --- /dev/null +++ b/tests/units/array/test_doclist_schema.py @@ -0,0 +1,22 @@ +import pytest +from docarray import BaseDoc, DocList +from docarray.utils._internal.pydantic import is_pydantic_v2 + + +@pytest.mark.skipif(not is_pydantic_v2, reason='Feature only available for Pydantic V2') +def test_schema_nested(): + # check issue 
https://github.com/docarray/docarray/issues/1521 + + class Doc1Test(BaseDoc): + aux: str + + class DocDocTest(BaseDoc): + docs: DocList[Doc1Test] + + assert 'Doc1Test' in DocDocTest.schema()['$defs'] + d = DocDocTest(docs=DocList[Doc1Test]([Doc1Test(aux='aux')])) + + assert isinstance(d.docs, DocList) + for dd in d.docs: + assert isinstance(dd, Doc1Test) + assert d.docs.aux == ['aux'] diff --git a/tests/units/array/test_generic_array.py b/tests/units/array/test_generic_array.py new file mode 100644 index 00000000000..92d77d2a405 --- /dev/null +++ b/tests/units/array/test_generic_array.py @@ -0,0 +1,34 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from docarray import BaseDoc, DocList +from docarray.base_doc import AnyDoc + + +def test_generic_init(): + class Text(BaseDoc): + text: str + + da = DocList[Text]([]) + da.doc_type == Text + + assert isinstance(da, DocList) + + +def test_normal_access_init(): + da = DocList([]) + da.doc_type == AnyDoc + + assert isinstance(da, DocList) diff --git a/tests/units/array/test_indexing.py b/tests/units/array/test_indexing.py new file mode 100644 index 00000000000..f733b1e6630 --- /dev/null +++ b/tests/units/array/test_indexing.py @@ -0,0 +1,265 @@ +import numpy as np +import pytest +import torch + +from docarray import DocList, DocVec +from docarray.documents import TextDoc +from docarray.typing import TorchTensor + + +@pytest.fixture() +def da(): + texts = [f'hello {i}' for i in range(10)] + tensors = [torch.ones((4,)) * i for i in range(10)] + return DocList[TextDoc]( + [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], + ) + + +@pytest.fixture() +def da_to_set(): + texts = [f'hello {2*i}' for i in range(5)] + tensors = [torch.ones((4,)) * i * 2 for i in range(5)] + return DocList[TextDoc]( + [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], + ) + + +########### +# getitem +########### + + +@pytest.mark.parametrize('stack', [True, False]) +def test_simple_getitem(stack, da): + if stack: + da = da.to_doc_vec(tensor_type=TorchTensor) + + assert torch.all(da[0].embedding == torch.zeros((4,))) + assert da[0].text == 'hello 0' + + +@pytest.mark.parametrize('stack', [True, False]) +def test_get_none(stack, da): + if stack: + da = da.to_doc_vec(tensor_type=TorchTensor) + + assert da[None] is da + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize('index', [(1, 2, 3, 4, 6), [1, 2, 3, 4, 6]]) +def test_iterable_getitem(stack, da, index): + if stack: + da = da.to_doc_vec(tensor_type=TorchTensor) + + indexed_da = da[index] + + for pos, d in zip(index, indexed_da): + assert d.text == f'hello {pos}' + 
assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize('index_dtype', [torch.int64]) +def test_torchtensor_getitem(stack, da, index_dtype): + if stack: + da = da.to_doc_vec(tensor_type=TorchTensor) + + index = torch.tensor([1, 2, 3, 4, 6], dtype=index_dtype) + + indexed_da = da[index] + + for pos, d in zip(index, indexed_da): + assert d.text == f'hello {pos}' + assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize('index_dtype', [int, np.int_, np.int32, np.int64]) +def test_nparray_getitem(stack, da, index_dtype): + if stack: + da = da.to_doc_vec(tensor_type=TorchTensor) + + index = np.array([1, 2, 3, 4, 6], dtype=index_dtype) + + indexed_da = da[index] + for pos, d in zip(index, indexed_da): + assert d.text == f'hello {pos}' + assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize( + 'index', + [ + [False, True, True, True, True, False, True, False, False, False], + (False, True, True, True, True, False, True, False, False, False), + torch.tensor([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=torch.bool), + np.array([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=bool), + ], +) +def test_boolmask_getitem(stack, da, index): + if stack: + da = da.to_doc_vec(tensor_type=TorchTensor) + + indexed_da = da[index] + + mask_true_idx = [1, 2, 3, 4, 6] + + for pos, d in zip(mask_true_idx, indexed_da): + assert d.text == f'hello {pos}' + assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +########### +# setitem +########### + + +@pytest.mark.parametrize('stack_left', [True, False]) +def test_simple_setitem(stack_left, da, da_to_set): + if stack_left: + da = da.to_doc_vec(tensor_type=TorchTensor) + + da[0] = da_to_set[0] + + assert torch.all(da[0].embedding == da_to_set[0].embedding) + assert da[0].text == da_to_set[0].text + + 
+@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize('index', [(1, 2, 3, 4, 6), [1, 2, 3, 4, 6]]) +def test_iterable_setitem(stack_left, stack_right, da, da_to_set, index): + if stack_left: + da = da.to_doc_vec(tensor_type=TorchTensor) + if stack_right: + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) + + da[index] = da_to_set + + i_da_to_set = 0 + for i, d in enumerate(da): + if i in index: + d_reference = da_to_set[i_da_to_set] + assert d.text == d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i) + + +@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize('index_dtype', [torch.int64]) +def test_torchtensor_setitem(stack_left, stack_right, da, da_to_set, index_dtype): + if stack_left: + da = da.to_doc_vec(tensor_type=TorchTensor) + if stack_right: + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) + + index = torch.tensor([1, 2, 3, 4, 6], dtype=index_dtype) + + da[index] = da_to_set + + i_da_to_set = 0 + for i, d in enumerate(da): + if i in index: + d_reference = da_to_set[i_da_to_set] + assert d.text == d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i) + + +@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize('index_dtype', [int, np.int_, np.int32, np.int64]) +def test_nparray_setitem(stack_left, stack_right, da, da_to_set, index_dtype): + if stack_left: + da = da.to_doc_vec(tensor_type=TorchTensor) + if stack_right: + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) + + index = np.array([1, 2, 3, 4, 6], 
dtype=index_dtype) + + da[index] = da_to_set + + i_da_to_set = 0 + for i, d in enumerate(da): + if i in index: + d_reference = da_to_set[i_da_to_set] + assert d.text == d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i) + + +@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize( + 'index', + [ + [False, True, True, True, True, False, True, False, False, False], + (False, True, True, True, True, False, True, False, False, False), + torch.tensor([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=torch.bool), + np.array([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=bool), + ], +) +def test_boolmask_setitem(stack_left, stack_right, da, da_to_set, index): + if stack_left: + da = da.to_doc_vec(tensor_type=TorchTensor) + if stack_right: + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) + + da[index] = da_to_set + + mask_true_idx = [1, 2, 3, 4, 6] + i_da_to_set = 0 + for i, d in enumerate(da): + if i in mask_true_idx: + d_reference = da_to_set[i_da_to_set] + assert d.text == d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i) + + +def test_setitem_update_column(): + texts = [f'hello {i}' for i in range(10)] + tensors = [torch.ones((4,)) * (i + 1) for i in range(10)] + da = DocVec[TextDoc]( + [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], + tensor_type=TorchTensor, + ) + + da[0] = TextDoc(text='hello', embedding=torch.zeros((4,))) + + assert da[0].text == 'hello' + assert (da[0].embedding == torch.zeros((4,))).all() + assert (da.embedding[0] == torch.zeros((4,))).all() + + assert da._storage.any_columns['text'][0] == 'hello' + assert (da._storage.tensor_columns['embedding'][0] == torch.zeros((4,))).all() + 
assert (da._storage.tensor_columns['embedding'][0] == torch.zeros((4,))).all() + + +@pytest.mark.parametrize( + 'index', + [ + [False, True, True, True, True, False, True, False, False, False], + (False, True, True, True, True, False, True, False, False, False), + torch.tensor([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=torch.bool), + np.array([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=bool), + ], +) +def test_del_getitem(da, index): + del da[index] diff --git a/tests/units/array/test_traverse.py b/tests/units/array/test_traverse.py new file mode 100644 index 00000000000..4c513148bd4 --- /dev/null +++ b/tests/units/array/test_traverse.py @@ -0,0 +1,118 @@ +from typing import Optional + +import pytest +import torch + +from docarray import BaseDoc, DocList +from docarray.array.any_array import AnyDocArray +from docarray.documents import TextDoc +from docarray.typing import TorchTensor + +num_docs = 5 +num_sub_docs = 2 +num_sub_sub_docs = 3 + + +@pytest.fixture +def multi_model_docs(): + class SubSubDoc(BaseDoc): + sub_sub_text: TextDoc + sub_sub_tensor: TorchTensor[2] + + class SubDoc(BaseDoc): + sub_text: TextDoc + sub_da: DocList[SubSubDoc] + + class MultiModalDoc(BaseDoc): + mm_text: TextDoc + mm_tensor: Optional[TorchTensor[3, 2, 2]] = None + mm_da: DocList[SubDoc] + + docs = DocList[MultiModalDoc]( + [ + MultiModalDoc( + mm_text=TextDoc(text=f'hello{i}'), + mm_da=[ + SubDoc( + sub_text=TextDoc(text=f'sub_{i}_1'), + sub_da=DocList[SubSubDoc]( + [ + SubSubDoc( + sub_sub_text=TextDoc(text='subsub'), + sub_sub_tensor=torch.zeros(2), + ) + for _ in range(num_sub_sub_docs) + ] + ), + ) + for _ in range(num_sub_docs) + ], + ) + for i in range(num_docs) + ] + ) + + return docs + + +@pytest.mark.parametrize( + 'access_path,len_result', + [ + ('mm_text', num_docs), # List of 5 Text objs + ('mm_text__text', num_docs), # List of 5 strings + ('mm_da', num_docs * num_sub_docs), # List of 5 * 2 SubDoc objs + ('mm_da__sub_text', num_docs * num_sub_docs), # List of 5 * 2 Text objs + ( + 
'mm_da__sub_da', + num_docs * num_sub_docs * num_sub_sub_docs, + ), # List of 5 * 2 * 3 SubSubDoc objs + ( + 'mm_da__sub_da__sub_sub_text', + num_docs * num_sub_docs * num_sub_sub_docs, + ), # List of 5 * 2 * 3 Text objs + ], +) +def test_traverse_flat(multi_model_docs, access_path, len_result): + traversed = multi_model_docs.traverse_flat(access_path) + assert len(traversed) == len_result + + +def test_traverse_stacked_da(): + class Image(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + batch = DocList[Image]( + [ + Image( + tensor=torch.zeros(3, 224, 224), + ) + for _ in range(2) + ] + ) + + batch_stacked = batch.to_doc_vec() + tensors = batch_stacked.traverse_flat(access_path='tensor') + + assert tensors.shape == (2, 3, 224, 224) + assert isinstance(tensors, torch.Tensor) + + +@pytest.mark.parametrize( + 'input_list,output_list', + [ + ([1, 2, 3], [1, 2, 3]), + ([[1], [2], [3]], [1, 2, 3]), + ([[[1]], [[2]], [[3]]], [[1], [2], [3]]), + ], +) +def test_flatten_one_level(input_list, output_list): + flattened = AnyDocArray._flatten_one_level(sequence=input_list) + assert flattened == output_list + + +def test_flatten_one_level_list_of_da(): + doc = BaseDoc() + input_list = [DocList([doc, doc, doc])] + + flattened = AnyDocArray._flatten_one_level(sequence=input_list) + assert flattened == [doc, doc, doc] diff --git a/tests/units/computation_backends/__init__.py b/tests/units/computation_backends/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/computation_backends/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/computation_backends/backend_comparisons/__init__.py b/tests/units/computation_backends/backend_comparisons/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/computation_backends/backend_comparisons/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/computation_backends/backend_comparisons/test_metrics.py b/tests/units/computation_backends/backend_comparisons/test_metrics.py new file mode 100644 index 00000000000..f899bc44d39 --- /dev/null +++ b/tests/units/computation_backends/backend_comparisons/test_metrics.py @@ -0,0 +1,74 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from docarray.computation.numpy_backend import NumpyCompBackend +from docarray.computation.torch_backend import TorchCompBackend + +np_metrics = NumpyCompBackend.Metrics +torch_metrics = TorchCompBackend.Metrics + + +def test_cosine_sim_compare(): + a = torch.rand(128) + b = torch.rand(128) + torch.testing.assert_close( + torch_metrics.cosine_sim(a, b), + torch.from_numpy(np_metrics.cosine_sim(a.numpy(), b.numpy())), + ) + + a = torch.rand(10, 3) + b = torch.rand(5, 3) + torch.testing.assert_close( + torch_metrics.cosine_sim(a, b), + torch.from_numpy(np_metrics.cosine_sim(a.numpy(), b.numpy())), + ) + + +def test_euclidean_dist_compare(): + a = torch.rand(128) + b = torch.rand(128) + torch.testing.assert_close( + torch_metrics.euclidean_dist(a, b), + torch.from_numpy(np_metrics.euclidean_dist(a.numpy(), b.numpy())).to( + torch.float32 + ), + ) + + a = torch.rand(10, 3) + b = torch.rand(5, 3) + torch.testing.assert_close( + torch_metrics.euclidean_dist(a, b), + torch.from_numpy(np_metrics.euclidean_dist(a.numpy(), b.numpy())), + ) + + +def test_sqeuclidean_dist_compare(): + a = torch.rand(128) + b = torch.rand(128) + torch.testing.assert_close( + torch_metrics.sqeuclidean_dist(a, b), + torch.from_numpy(np_metrics.sqeuclidean_dist(a.numpy(), b.numpy())).to( + torch.float32 + 
), + ) + + a = torch.rand(10, 3) + b = torch.rand(5, 3) + torch.testing.assert_close( + torch_metrics.sqeuclidean_dist(a, b), + torch.from_numpy(np_metrics.sqeuclidean_dist(a.numpy(), b.numpy())), + ) diff --git a/tests/units/computation_backends/jax_backend/__init__.py b/tests/units/computation_backends/jax_backend/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/computation_backends/jax_backend/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/computation_backends/jax_backend/test_basics.py b/tests/units/computation_backends/jax_backend/test_basics.py new file mode 100644 index 00000000000..db064430c9b --- /dev/null +++ b/tests/units/computation_backends/jax_backend/test_basics.py @@ -0,0 +1,163 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest + +from docarray.utils._internal.misc import is_jax_available + +jax_available = is_jax_available() +if jax_available: + print("is jax available", jax_available) + import jax + import jax.numpy as jnp + + from docarray.computation.jax_backend import JaxCompBackend + from docarray.typing import JaxArray + + jax.config.update("jax_enable_x64", True) + + +@pytest.mark.jax +@pytest.mark.parametrize( + 'shape,result', + [ + ((5), 1), + ((1, 5), 2), + ((5, 5), 2), + ((), 0), + ], +) +def test_n_dim(shape, result): + + array = JaxArray(jnp.zeros(shape)) + assert JaxCompBackend.n_dim(array) == result + + +@pytest.mark.jax +@pytest.mark.parametrize( + 'shape,result', + [ + ((10,), (10,)), + ((5, 5), (5, 5)), + ((), ()), + ], +) +def test_shape(shape, result): + array = JaxArray(jnp.zeros(shape)) + shape = JaxCompBackend.shape(array) + assert shape == result + assert type(shape) == tuple + + +@pytest.mark.jax +def test_to_device(): + array = JaxArray(jnp.zeros((3))) + array = JaxCompBackend.to_device(array, 'cpu') + assert array.tensor.device().platform.endswith('cpu') + + +@pytest.mark.jax +@pytest.mark.parametrize( + 'dtype,result_type', + [ + ('int64', 'int64'), + ('float64', 'float64'), + ('int8', 'int8'), + ('double', 'float64'), + ], +) +def test_dtype(dtype, result_type): + array = JaxArray(jnp.array([1, 2, 3], dtype=dtype)) + assert JaxCompBackend.dtype(array) == result_type + + +@pytest.mark.jax +def test_empty(): + array = JaxCompBackend.empty((10, 3)) + assert array.tensor.shape == (10, 3) + + +@pytest.mark.jax +def test_empty_dtype(): + 
tf_tensor = JaxCompBackend.empty((10, 3), dtype=jnp.int32) + assert tf_tensor.tensor.shape == (10, 3) + assert tf_tensor.tensor.dtype == jnp.int32 + + +@pytest.mark.jax +def test_empty_device(): + tensor = JaxCompBackend.empty((10, 3), device='cpu') + assert tensor.tensor.shape == (10, 3) + assert tensor.tensor.device().platform.endswith('cpu') + + +@pytest.mark.jax +def test_squeeze(): + tensor = JaxArray(jnp.zeros(shape=(1, 1, 3, 1))) + squeezed = JaxCompBackend.squeeze(tensor) + assert squeezed.tensor.shape == (3,) + + +@pytest.mark.jax +@pytest.mark.parametrize( + 'data_input,t_range,x_range,data_result', + [ + ( + [0, 1, 2, 3, 4, 5], + (0, 10), + None, + [0, 2, 4, 6, 8, 10], + ), + ( + [0, 1, 2, 3, 4, 5], + (0, 10), + (0, 10), + [0, 1, 2, 3, 4, 5], + ), + ( + [[0.0, 1.0], [0.0, 1.0]], + (0, 10), + None, + [[0.0, 10.0], [0.0, 10.0]], + ), + ], +) +def test_minmax_normalize(data_input, t_range, x_range, data_result): + array = JaxArray(jnp.array(data_input)) + output = JaxCompBackend.minmax_normalize( + tensor=array, t_range=t_range, x_range=x_range + ) + assert jnp.allclose(output.tensor, jnp.array(data_result)) + + +@pytest.mark.jax +def test_reshape(): + tensor = JaxArray(jnp.zeros((3, 224, 224))) + reshaped = JaxCompBackend.reshape(tensor, (224, 224, 3)) + assert reshaped.tensor.shape == (224, 224, 3) + + +@pytest.mark.jax +def test_stack(): + t0 = JaxArray(jnp.zeros((3, 224, 224))) + t1 = JaxArray(jnp.ones((3, 224, 224))) + + stacked1 = JaxCompBackend.stack([t0, t1], dim=0) + assert isinstance(stacked1, JaxArray) + assert stacked1.tensor.shape == (2, 3, 224, 224) + + stacked2 = JaxCompBackend.stack([t0, t1], dim=-1) + assert isinstance(stacked2, JaxArray) + assert stacked2.tensor.shape == (3, 224, 224, 2) diff --git a/tests/units/computation_backends/jax_backend/test_metrics.py b/tests/units/computation_backends/jax_backend/test_metrics.py new file mode 100644 index 00000000000..50dc6339d63 --- /dev/null +++ 
# b/tests/units/computation_backends/jax_backend/test_metrics.py
# @@ -0,0 +1,81 @@
import pytest

from docarray.utils._internal.misc import is_jax_available

jax_available = is_jax_available()
if jax_available:
    import jax
    import jax.numpy as jnp

    from docarray.computation.jax_backend import JaxCompBackend
    from docarray.typing import JaxArray

    metrics = JaxCompBackend.Metrics
else:
    metrics = None


@pytest.mark.jax
def test_cosine_sim_jax():
    """Check shape, symmetry and self-similarity of `cosine_sim`."""
    vec_a = JaxArray(jax.random.uniform(jax.random.PRNGKey(0), shape=(128,)))
    vec_b = JaxArray(jax.random.uniform(jax.random.PRNGKey(1), shape=(128,)))
    sim_ab = metrics.cosine_sim(vec_a, vec_b).tensor
    sim_ba = metrics.cosine_sim(vec_b, vec_a).tensor
    assert sim_ab.shape == (1,)
    assert sim_ab == sim_ba

    self_sim = metrics.cosine_sim(vec_a, vec_a).tensor
    assert jnp.allclose(self_sim, jnp.ones((1,)))

    mat_a = JaxArray(jax.random.uniform(jax.random.PRNGKey(2), shape=(10, 3)))
    mat_b = JaxArray(jax.random.uniform(jax.random.PRNGKey(3), shape=(5, 3)))
    assert metrics.cosine_sim(mat_a, mat_b).tensor.shape == (10, 5)
    assert metrics.cosine_sim(mat_b, mat_a).tensor.shape == (5, 10)
    # The diagonal holds each row compared with itself.
    self_diag = jnp.diagonal(metrics.cosine_sim(mat_b, mat_b).tensor)
    assert jnp.allclose(self_diag, jnp.ones((5,)))


# NOTE(review): skipped without a stated reason -- confirm why.
@pytest.mark.jax
@pytest.mark.skip
def test_euclidean_dist_jax():
    """Check shape, symmetry and known values of `euclidean_dist`."""
    vec_a = JaxArray(jax.random.normal(jax.random.PRNGKey(0), shape=(128,)))
    vec_b = JaxArray(jax.random.normal(jax.random.PRNGKey(1), shape=(128,)))
    assert metrics.euclidean_dist(vec_a, vec_b).tensor.shape == (1,)
    dist_ab = metrics.euclidean_dist(vec_a, vec_b).tensor
    dist_ba = metrics.euclidean_dist(vec_b, vec_a).tensor
    assert jnp.allclose(dist_ab, dist_ba)

    assert jnp.allclose(
        metrics.euclidean_dist(vec_a, vec_a).tensor, jnp.zeros((1,))
    )

    zeros_1x1 = JaxArray(jnp.zeros((1, 1)))
    ones_4x1 = JaxArray(jnp.ones((4, 1)))
    assert metrics.euclidean_dist(zeros_1x1, ones_4x1).tensor.shape == (4,)
    dist_ab = metrics.euclidean_dist(zeros_1x1, ones_4x1).tensor
    dist_ba = metrics.euclidean_dist(ones_4x1, zeros_1x1).tensor
    assert jnp.allclose(dist_ab, dist_ba)
    assert jnp.allclose(
        metrics.euclidean_dist(zeros_1x1, zeros_1x1).tensor, jnp.zeros((1,))
    )

    point_a = JaxArray(jnp.array([0.0, 2.0, 0.0]))
    point_b = JaxArray(jnp.array([0.0, 0.0, 2.0]))
    expected_single = jnp.sqrt(jnp.array([2.0**2.0 + 2.0**2.0]))
    assert jnp.allclose(
        metrics.euclidean_dist(point_a, point_b).tensor, expected_single
    )

    rows_a = JaxArray(jnp.array([[0.0, 2.0, 0.0], [0.0, 0.0, 2.0]]))
    rows_b = JaxArray(jnp.array([[0.0, 0.0, 2.0], [0.0, 2.0, 0.0]]))
    expected_matrix = jnp.array([[2.828427, 0.0], [0.0, 2.828427]])

    assert jnp.allclose(
        metrics.euclidean_dist(rows_a, rows_b).tensor, expected_matrix
    )


@pytest.mark.jax
def test_sqeuclidea_dist_jnp():
    """`sqeuclidean_dist` equals `euclidean_dist` squared, 1-D and 2-D."""
    vec_a = JaxArray(jax.random.uniform(jax.random.PRNGKey(0), shape=(128,)))
    vec_b = JaxArray(jax.random.uniform(jax.random.PRNGKey(1), shape=(128,)))
    assert metrics.sqeuclidean_dist(vec_a, vec_b).tensor.shape == (1,)
    squared = metrics.sqeuclidean_dist(vec_a, vec_b).tensor
    plain = metrics.euclidean_dist(vec_a, vec_b).tensor
    assert jnp.allclose(squared, plain ** 2)

    mat_a = JaxArray(jax.random.uniform(jax.random.PRNGKey(2), shape=(10, 3)))
    mat_b = JaxArray(jax.random.uniform(jax.random.PRNGKey(3), shape=(5, 3)))
    assert metrics.sqeuclidean_dist(mat_a, mat_b).tensor.shape == (10, 5)
    squared = metrics.sqeuclidean_dist(mat_a, mat_b).tensor
    plain = metrics.euclidean_dist(mat_a, mat_b).tensor
    assert jnp.allclose(squared, plain ** 2)


# diff --git a/tests/units/computation_backends/jax_backend/test_retrieval.py b/tests/units/computation_backends/jax_backend/test_retrieval.py
# new file mode 100644
# index 00000000000..7d827e2d383
# --- /dev/null
# +++ b/tests/units/computation_backends/jax_backend/test_retrieval.py
# @@ -0,0 +1,81 @@
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest

from docarray.utils._internal.misc import is_jax_available

jax_available = is_jax_available()
if jax_available:
    import jax.numpy as jnp

    from docarray.computation.jax_backend import JaxCompBackend
    from docarray.typing import JaxArray

    metrics = JaxCompBackend.Metrics
else:
    metrics = None


@pytest.mark.jax
def test_top_k_descending_false():
    """`top_k(..., descending=False)` returns the smallest values first.

    The input contains the value 2 twice (indices 2 and 6), so either
    ordering of those two indices is accepted.
    """
    top_k = JaxCompBackend.Retrieval.top_k

    a = JaxArray(jnp.array([1, 4, 2, 7, 4, 9, 2]))
    vals, indices = top_k(a, 3, descending=False)

    assert vals.tensor.shape == (1, 3)
    assert indices.tensor.shape == (1, 3)
    assert jnp.allclose(jnp.squeeze(vals.tensor), jnp.array([1, 2, 2]))
    # The fallback used to be a two-element tuple (always truthy) built on
    # `jnp.squeeze.indices.tensor`; it is now a real comparison against the
    # alternative tie order.
    flat_indices = jnp.squeeze(indices.tensor)
    assert jnp.allclose(flat_indices, jnp.array([0, 2, 6])) or jnp.allclose(
        flat_indices, jnp.array([0, 6, 2])
    )

    a = JaxArray(jnp.array([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]]))
    vals, indices = top_k(a, 3, descending=False)
    assert vals.tensor.shape == (2, 3)
    assert indices.tensor.shape == (2, 3)
    assert jnp.allclose(vals.tensor[0], jnp.array([1, 2, 2]))
    assert jnp.allclose(indices.tensor[0], jnp.array([0, 2, 6])) or jnp.allclose(
        indices.tensor[0], jnp.array([0, 6, 2])
    )
    assert jnp.allclose(vals.tensor[1], jnp.array([2, 3, 4]))
    assert jnp.allclose(indices.tensor[1], jnp.array([2, 4, 6]))


@pytest.mark.jax
def test_top_k_descending_true():
    """`top_k(..., descending=True)` returns the largest values first."""
    top_k = JaxCompBackend.Retrieval.top_k

    a = JaxArray(jnp.array([1, 4, 2, 7, 4, 9, 2]))
    vals, indices = top_k(a, 3, descending=True)

    assert vals.tensor.shape == (1, 3)
    assert indices.tensor.shape == (1, 3)
    assert jnp.allclose(jnp.squeeze(vals.tensor), jnp.array([9, 7, 4]))
    assert jnp.allclose(jnp.squeeze(indices.tensor), jnp.array([5, 3, 1]))

    a = JaxArray(jnp.array([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]]))
    vals, indices = top_k(a, 3, descending=True)

    assert vals.tensor.shape == (2, 3)
    assert indices.tensor.shape == (2, 3)

    assert jnp.allclose(vals.tensor[0], jnp.array([9, 7, 4]))
    assert jnp.allclose(indices.tensor[0], jnp.array([5, 3, 1]))

    assert jnp.allclose(vals.tensor[1], jnp.array([11, 10, 7]))
    assert jnp.allclose(indices.tensor[1], jnp.array([0, 5, 3]))


# diff --git a/tests/units/computation_backends/numpy_backend/__init__.py b/tests/units/computation_backends/numpy_backend/__init__.py
# new file mode 100644
# index 00000000000..74f8f7582cd
# --- /dev/null
# +++ b/tests/units/computation_backends/numpy_backend/__init__.py
# @@ -0,0 +1,15 @@
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
diff --git a/tests/units/computation_backends/numpy_backend/test_basics.py b/tests/units/computation_backends/numpy_backend/test_basics.py new file mode 100644 index 00000000000..7ab511db9ad --- /dev/null +++ b/tests/units/computation_backends/numpy_backend/test_basics.py @@ -0,0 +1,119 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import numpy as np
import pytest
from pydantic import parse_obj_as

from docarray.computation.numpy_backend import NumpyCompBackend
from docarray.typing import NdArray


def test_to_device():
    """`to_device` raises `NotImplementedError` for the NumPy backend."""
    data = np.random.rand(10, 3)
    with pytest.raises(NotImplementedError):
        NumpyCompBackend.to_device(data, 'meta')


@pytest.mark.parametrize(
    'array,result',
    [
        (np.zeros((5)), 1),
        (np.zeros((1, 5)), 2),
        (np.zeros((5, 5)), 2),
        (np.zeros(()), 0),
    ],
)
def test_n_dim(array, result):
    """`n_dim` reports the expected rank."""
    rank = NumpyCompBackend.n_dim(array)
    assert rank == result


@pytest.mark.parametrize(
    'array,result',
    [
        (np.zeros((10,)), (10,)),
        (np.zeros((5, 5)), (5, 5)),
        (np.zeros(()), ()),
    ],
)
def test_shape(array, result):
    """`shape` returns the expected value as a plain ``tuple``."""
    actual = NumpyCompBackend.shape(array)
    assert actual == result
    # Exact type check on purpose: a tuple subclass must not pass.
    assert type(actual) is tuple


def test_device():
    """`device` returns ``None`` for a NumPy array."""
    assert NumpyCompBackend.device(np.array([1, 2, 3])) is None


@pytest.mark.parametrize('dtype', [np.int64, np.float64, int, float])
def test_dtype(dtype):
    """`dtype` compares equal to the dtype the array was built with."""
    values = np.array([1, 2, 3], dtype=dtype)
    assert NumpyCompBackend.dtype(values) == dtype


def test_empty():
    """`empty` honours the requested shape."""
    assert NumpyCompBackend.empty((10, 3)).shape == (10, 3)


def test_empty_dtype():
    """`empty` honours the requested shape and dtype."""
    created = NumpyCompBackend.empty((10, 3), dtype=np.int32)
    assert created.shape == (10, 3)
    assert created.dtype == np.int32


def test_empty_device():
    """`empty` raises `NotImplementedError` when a device is passed."""
    with pytest.raises(NotImplementedError):
        NumpyCompBackend.empty((10, 3), device='meta')


def test_squeeze():
    """`squeeze` drops every singleton axis."""
    padded = np.zeros(shape=(1, 1, 3, 1))
    assert NumpyCompBackend.squeeze(padded).shape == (3,)


@pytest.mark.parametrize(
    'array,t_range,x_range,result',
    [
        (np.array([0, 1, 2, 3, 4, 5]), (0, 10), None, np.array([0, 2, 4, 6, 8, 10])),
        (np.array([0, 1, 2, 3, 4, 5]), (0, 10), (0, 10), np.array([0, 1, 2, 3, 4, 5])),
        (
            np.array([[0.0, 1.0], [0.0, 1.0]]),
            (0, 10),
            None,
            np.array([[0.0, 10.0], [0.0, 10.0]]),
        ),
    ],
)
def test_minmax_normalize(array, t_range, x_range, result):
    """`minmax_normalize` produces the expected values for each case."""
    normalized = NumpyCompBackend.minmax_normalize(
        tensor=array, t_range=t_range, x_range=x_range
    )
    assert np.allclose(normalized, result)


def test_stack():
    """`stack` returns an ``np.ndarray`` with the new axis at `dim`."""
    zeros = parse_obj_as(NdArray, np.zeros((3, 224, 224)))
    ones = parse_obj_as(NdArray, np.ones((3, 224, 224)))

    leading = NumpyCompBackend.stack([zeros, ones], dim=0)
    assert isinstance(leading, np.ndarray)
    assert leading.shape == (2, 3, 224, 224)

    trailing = NumpyCompBackend.stack([zeros, ones], dim=-1)
    assert isinstance(trailing, np.ndarray)
    assert trailing.shape == (3, 224, 224, 2)


# diff --git a/tests/units/computation_backends/numpy_backend/test_metrics.py b/tests/units/computation_backends/numpy_backend/test_metrics.py
# new file mode 100644
# index 00000000000..2cd0369e876
# --- /dev/null
# +++ b/tests/units/computation_backends/numpy_backend/test_metrics.py
# @@ -0,0 +1,67 @@
import numpy as np

from docarray.computation.numpy_backend import NumpyCompBackend

metrics = NumpyCompBackend.Metrics


def test_cosine_sim_np():
    """Check shape, symmetry and self-similarity of `cosine_sim`."""
    vec_a = np.random.rand(128)
    vec_b = np.random.rand(128)
    assert metrics.cosine_sim(vec_a, vec_b).shape == (1,)
    assert metrics.cosine_sim(vec_a, vec_b) == metrics.cosine_sim(vec_b, vec_a)
    np.testing.assert_array_almost_equal(
        metrics.cosine_sim(vec_a, vec_a), np.ones((1,))
    )

    mat_a = np.random.rand(10, 3)
    mat_b = np.random.rand(5, 3)
    assert metrics.cosine_sim(mat_a, mat_b).shape == (10, 5)
    assert metrics.cosine_sim(mat_b, mat_a).shape == (5, 10)
    # The diagonal holds each row compared with itself.
    self_diag = np.diagonal(metrics.cosine_sim(mat_b, mat_b))
    np.testing.assert_array_almost_equal(self_diag, np.ones((5,)))


def test_euclidean_dist_np():
    """Check shape, symmetry and known values of `euclidean_dist`."""
    vec_a = np.random.rand(128)
    vec_b = np.random.rand(128)
    assert metrics.euclidean_dist(vec_a, vec_b).shape == (1,)
    assert metrics.euclidean_dist(vec_a, vec_b) == metrics.euclidean_dist(
        vec_b, vec_a
    )
    np.testing.assert_array_almost_equal(
        metrics.euclidean_dist(vec_a, vec_a), np.zeros((1,))
    )

    mat_a = np.random.rand(10, 3)
    mat_b = np.random.rand(5, 3)
    assert metrics.euclidean_dist(mat_a, mat_b).shape == (10, 5)
    assert metrics.euclidean_dist(mat_b, mat_a).shape == (5, 10)
    # The diagonal holds each row compared with itself.
    self_diag = np.diagonal(metrics.euclidean_dist(mat_b, mat_b))
    np.testing.assert_array_almost_equal(self_diag, np.zeros((5,)))

    point_a = np.array([0.0, 2.0, 0.0])
    point_b = np.array([0.0, 0.0, 2.0])
    expected_single = np.sqrt(np.array([2.0**2.0 + 2.0**2.0]))
    np.testing.assert_array_almost_equal(
        metrics.euclidean_dist(point_a, point_b), expected_single
    )

    rows_a = np.array([[0.0, 2.0, 0.0], [0.0, 0.0, 2.0]])
    rows_b = np.array([[0.0, 0.0, 2.0], [0.0, 2.0, 0.0]])
    off_distance = expected_single.item()
    expected_matrix = np.array([[off_distance, 0.0], [0.0, off_distance]])
    np.testing.assert_array_almost_equal(
        metrics.euclidean_dist(rows_a, rows_b), expected_matrix
    )


def test_sqeuclidea_dist_np():
    """`sqeuclidean_dist` equals `euclidean_dist` squared, 1-D and 2-D."""
    vec_a = np.random.rand(128)
    vec_b = np.random.rand(128)
    assert metrics.sqeuclidean_dist(vec_a, vec_b).shape == (1,)
    np.testing.assert_array_almost_equal(
        metrics.sqeuclidean_dist(vec_a, vec_b),
        metrics.euclidean_dist(vec_a, vec_b) ** 2,
    )

    mat_a = np.random.rand(10, 3)
    mat_b = np.random.rand(5, 3)
    assert metrics.sqeuclidean_dist(mat_a, mat_b).shape == (10, 5)
    np.testing.assert_array_almost_equal(
        metrics.sqeuclidean_dist(mat_a, mat_b),
        metrics.euclidean_dist(mat_a, mat_b) ** 2,
    )


# diff --git a/tests/units/computation_backends/numpy_backend/test_retrieval.py b/tests/units/computation_backends/numpy_backend/test_retrieval.py
# new file mode 100644
# index 00000000000..5fa693dde61
# --- /dev/null
# +++ b/tests/units/computation_backends/numpy_backend/test_retrieval.py
# @@ -0,0 +1,42 @@
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from docarray.computation.numpy_backend import NumpyCompBackend


def test_topk_numpy():
    """`top_k` returns the three smallest values and their indices.

    The value 2 appears twice in the first row (indices 2 and 6), so
    either ordering of those two indices is accepted.
    """
    top_k = NumpyCompBackend.Retrieval.top_k
    tie_orders = (np.array([0, 2, 6]), np.array([0, 6, 2]))

    flat = np.array([1, 4, 2, 7, 4, 9, 2])
    vals, indices = top_k(flat, 3)
    assert vals.shape == (1, 3)
    assert indices.shape == (1, 3)
    assert (vals.squeeze() == np.array([1, 2, 2])).all()
    flat_indices = indices.squeeze()
    assert any((flat_indices == order).all() for order in tie_orders)

    batch = np.array([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]])
    vals, indices = top_k(batch, 3)
    assert vals.shape == (2, 3)
    assert indices.shape == (2, 3)
    assert (vals[0] == np.array([1, 2, 2])).all()
    assert any((indices[0] == order).all() for order in tie_orders)
    assert (vals[1] == np.array([2, 3, 4])).all()
    assert (indices[1] == np.array([2, 4, 6])).all()


# diff --git a/tests/units/computation_backends/tensorflow_backend/__init__.py b/tests/units/computation_backends/tensorflow_backend/__init__.py
# new file mode 100644
# index 00000000000..74f8f7582cd
# --- /dev/null
# +++ b/tests/units/computation_backends/tensorflow_backend/__init__.py
# @@ -0,0 +1,15 @@
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/computation_backends/tensorflow_backend/test_basics.py b/tests/units/computation_backends/tensorflow_backend/test_basics.py new file mode 100644 index 00000000000..6747eecb87e --- /dev/null +++ b/tests/units/computation_backends/tensorflow_backend/test_basics.py @@ -0,0 +1,159 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import numpy as np
import pytest

from docarray.utils._internal.misc import is_tf_available

tf_available = is_tf_available()
if tf_available:
    import tensorflow as tf

    from docarray.computation.tensorflow_backend import TensorFlowCompBackend
    from docarray.typing import TensorFlowTensor


@pytest.mark.tensorflow
@pytest.mark.parametrize(
    'shape,result',
    [
        ((5), 1),
        ((1, 5), 2),
        ((5, 5), 2),
        ((), 0),
    ],
)
def test_n_dim(shape, result):
    """`n_dim` reports the expected rank for a zero tensor of `shape`."""
    wrapped = TensorFlowTensor(tf.zeros(shape))
    assert TensorFlowCompBackend.n_dim(wrapped) == result


@pytest.mark.tensorflow
@pytest.mark.parametrize(
    'shape,result',
    [
        ((10,), (10,)),
        ((5, 5), (5, 5)),
        ((), ()),
    ],
)
def test_shape(shape, result):
    """`shape` returns the expected value as a plain ``tuple``."""
    wrapped = TensorFlowTensor(tf.zeros(shape))
    actual = TensorFlowCompBackend.shape(wrapped)
    assert actual == result
    # Exact type check on purpose: a tuple subclass must not pass.
    assert type(actual) is tuple


@pytest.mark.tensorflow
def test_to_device():
    """`to_device(..., 'CPU:0')` yields a tensor whose device ends in 'CPU:0'."""
    wrapped = TensorFlowTensor(tf.constant([1, 2, 3]))
    moved = TensorFlowCompBackend.to_device(wrapped, 'CPU:0')
    assert moved.tensor.device.endswith('CPU:0')


@pytest.mark.tensorflow
@pytest.mark.parametrize(
    'dtype,result_type',
    [
        ('int64', 'int64'),
        ('float64', 'float64'),
        ('int8', 'int8'),
        ('double', 'float64'),
    ],
)
def test_dtype(dtype, result_type):
    """`dtype` compares equal to the expected dtype name."""
    tf_dtype = getattr(tf, dtype)
    wrapped = TensorFlowTensor(tf.constant([1, 2, 3], dtype=tf_dtype))
    assert TensorFlowCompBackend.dtype(wrapped) == result_type


@pytest.mark.tensorflow
def test_empty():
    """`empty` honours the requested shape."""
    created = TensorFlowCompBackend.empty((10, 3))
    assert created.tensor.shape == (10, 3)


@pytest.mark.tensorflow
def test_empty_dtype():
    """`empty` honours the requested shape and dtype."""
    created = TensorFlowCompBackend.empty((10, 3), dtype=tf.int32)
    assert created.tensor.shape == (10, 3)
    assert created.tensor.dtype == tf.int32


@pytest.mark.tensorflow
def test_empty_device():
    """`empty` honours the requested shape and device."""
    created = TensorFlowCompBackend.empty((10, 3), device='CPU:0')
    assert created.tensor.shape == (10, 3)
    assert created.tensor.device.endswith('CPU:0')
@pytest.mark.tensorflow
def test_squeeze():
    """`squeeze` drops every singleton axis."""
    tensor = TensorFlowTensor(tf.zeros(shape=(1, 1, 3, 1)))
    squeezed = TensorFlowCompBackend.squeeze(tensor)
    assert squeezed.tensor.shape == (3,)


@pytest.mark.tensorflow
@pytest.mark.parametrize(
    'data_input,t_range,x_range,data_result',
    [
        (
            [0, 1, 2, 3, 4, 5],
            (0, 10),
            None,
            [0, 2, 4, 6, 8, 10],
        ),
        (
            [0, 1, 2, 3, 4, 5],
            (0, 10),
            (0, 10),
            [0, 1, 2, 3, 4, 5],
        ),
        (
            [[0.0, 1.0], [0.0, 1.0]],
            (0, 10),
            None,
            [[0.0, 10.0], [0.0, 10.0]],
        ),
    ],
)
def test_minmax_normalize(data_input, t_range, x_range, data_result):
    """`minmax_normalize` produces the expected values for each case."""
    array = TensorFlowTensor(tf.constant(data_input))
    output = TensorFlowCompBackend.minmax_normalize(
        tensor=array, t_range=t_range, x_range=x_range
    )
    assert np.allclose(output.tensor, tf.constant(data_result))


@pytest.mark.tensorflow
def test_reshape():
    """`reshape` returns a tensor with the requested shape."""
    tensor = TensorFlowTensor(tf.zeros((3, 224, 224)))
    reshaped = TensorFlowCompBackend.reshape(tensor, (224, 224, 3))
    assert reshaped.tensor.shape == (224, 224, 3)


@pytest.mark.tensorflow
def test_stack():
    """`stack` returns a `TensorFlowTensor` with the new axis at `dim`."""
    t0 = TensorFlowTensor(tf.zeros((3, 224, 224)))
    t1 = TensorFlowTensor(tf.ones((3, 224, 224)))

    stacked1 = TensorFlowCompBackend.stack([t0, t1], dim=0)
    assert isinstance(stacked1, TensorFlowTensor)
    assert stacked1.tensor.shape == (2, 3, 224, 224)

    stacked2 = TensorFlowCompBackend.stack([t0, t1], dim=-1)
    assert isinstance(stacked2, TensorFlowTensor)
    assert stacked2.tensor.shape == (3, 224, 224, 2)


# diff --git a/tests/units/computation_backends/tensorflow_backend/test_metrics.py b/tests/units/computation_backends/tensorflow_backend/test_metrics.py
# new file mode 100644
# index 00000000000..196297adf0b
# --- /dev/null
# +++ b/tests/units/computation_backends/tensorflow_backend/test_metrics.py
# @@ -0,0 +1,82 @@
import pytest

from docarray.utils._internal.misc import is_tf_available

tf_available = is_tf_available()
if tf_available:
    import tensorflow as tf

    from docarray.computation.tensorflow_backend import TensorFlowCompBackend
    from docarray.typing import TensorFlowTensor

    metrics = TensorFlowCompBackend.Metrics
else:
    metrics = None

# NOTE(review): the `allclose` checks below used to be bare expressions whose
# result was discarded, so they verified nothing. They are now asserted and
# may surface backend discrepancies that were previously hidden.


@pytest.mark.tensorflow
def test_cosine_sim_tf():
    """Check shape, symmetry and self-similarity of `cosine_sim`."""
    a = TensorFlowTensor(tf.random.normal((128,)))
    b = TensorFlowTensor(tf.random.normal((128,)))
    assert metrics.cosine_sim(a, b).tensor.shape == (1,)
    assert metrics.cosine_sim(a, b).tensor == metrics.cosine_sim(b, a).tensor
    assert tf.experimental.numpy.allclose(
        metrics.cosine_sim(a, a).tensor, tf.ones(1)
    )

    a = TensorFlowTensor(tf.random.normal((10, 3)))
    b = TensorFlowTensor(tf.random.normal((5, 3)))
    assert metrics.cosine_sim(a, b).tensor.shape == (10, 5)
    assert metrics.cosine_sim(b, a).tensor.shape == (5, 10)
    # `diag_part` extracts the diagonal (self-comparisons); `tf.linalg.diag`
    # would instead build diagonal matrices from the rows.
    diag_dists = tf.linalg.diag_part(metrics.cosine_sim(b, b).tensor)
    assert tf.experimental.numpy.allclose(diag_dists, tf.ones(5))


@pytest.mark.tensorflow
def test_euclidean_dist_tf():
    """Check shape, symmetry and known values of `euclidean_dist`."""
    a = TensorFlowTensor(tf.random.normal((128,)))
    b = TensorFlowTensor(tf.random.normal((128,)))
    assert metrics.euclidean_dist(a, b).tensor.shape == (1,)
    assert metrics.euclidean_dist(a, b).tensor == metrics.euclidean_dist(b, a).tensor
    assert tf.experimental.numpy.allclose(
        metrics.euclidean_dist(a, a).tensor, tf.zeros(1)
    )

    a = TensorFlowTensor(tf.zeros((1, 1)))
    b = TensorFlowTensor(tf.ones((4, 1)))
    assert metrics.euclidean_dist(a, b).tensor.shape == (4,)
    assert tf.experimental.numpy.allclose(
        metrics.euclidean_dist(a, b).tensor, metrics.euclidean_dist(b, a).tensor
    )
    assert tf.experimental.numpy.allclose(
        metrics.euclidean_dist(a, a).tensor, tf.zeros(1)
    )

    a = TensorFlowTensor(tf.constant([0.0, 2.0, 0.0]))
    b = TensorFlowTensor(tf.constant([0.0, 0.0, 2.0]))
    desired_output_singleton: tf.Tensor = tf.math.sqrt(
        tf.constant([2.0**2.0 + 2.0**2.0])
    )
    assert tf.experimental.numpy.allclose(
        metrics.euclidean_dist(a, b).tensor, desired_output_singleton
    )

    a = TensorFlowTensor(tf.constant([[0.0, 2.0, 0.0], [0.0, 0.0, 2.0]]))
    b = TensorFlowTensor(tf.constant([[0.0, 0.0, 2.0], [0.0, 2.0, 0.0]]))
    desired_output_matrix = tf.constant([[2.828427, 0.0], [0.0, 2.828427]])
    assert tf.experimental.numpy.allclose(
        metrics.euclidean_dist(a, b).tensor, desired_output_matrix
    )


# NOTE(review): the `_torch` suffix is kept so the test id is unchanged,
# although this module exercises the TensorFlow backend.
@pytest.mark.tensorflow
def test_sqeuclidean_dist_torch():
    """`sqeuclidean_dist` equals `euclidean_dist` squared."""
    a = TensorFlowTensor(tf.random.normal((128,)))
    b = TensorFlowTensor(tf.random.normal((128,)))
    assert metrics.sqeuclidean_dist(a, b).tensor.shape == (1,)
    assert tf.experimental.numpy.allclose(
        metrics.sqeuclidean_dist(a, b).tensor,
        metrics.euclidean_dist(a, b).tensor ** 2,
    )

    a = TensorFlowTensor(tf.random.normal((1, 1)))
    b = TensorFlowTensor(tf.random.normal((4, 1)))
    assert metrics.sqeuclidean_dist(b, a).tensor.shape == (4,)
    assert tf.experimental.numpy.allclose(
        metrics.sqeuclidean_dist(a, b).tensor,
        metrics.euclidean_dist(a, b).tensor ** 2,
    )


# diff --git a/tests/units/computation_backends/tensorflow_backend/test_retrieval.py b/tests/units/computation_backends/tensorflow_backend/test_retrieval.py
# new file mode 100644
# index 00000000000..f4d40e7a317
# --- /dev/null
# +++ b/tests/units/computation_backends/tensorflow_backend/test_retrieval.py
# @@ -0,0 +1,78 @@
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest

from docarray.utils._internal.misc import is_tf_available

tf_available = is_tf_available()
if tf_available:
    import tensorflow as tf
    import tensorflow._api.v2.experimental.numpy as tnp

    from docarray.computation.tensorflow_backend import TensorFlowCompBackend
    from docarray.typing import TensorFlowTensor


@pytest.mark.tensorflow
def test_top_k_descending_false():
    """`top_k(..., descending=False)` returns the smallest values first.

    The input contains the value 2 twice (indices 2 and 6), so either
    ordering of those two indices is accepted.
    """
    top_k = TensorFlowCompBackend.Retrieval.top_k

    a = TensorFlowTensor(tf.constant([1, 4, 2, 7, 4, 9, 2]))
    vals, indices = top_k(a, 3, descending=False)

    assert vals.tensor.shape == (1, 3)
    assert indices.tensor.shape == (1, 3)
    assert tnp.allclose(tnp.squeeze(vals.tensor), tf.constant([1, 2, 2]))
    # The fallback used to be a two-element tuple (always truthy) built on
    # `tnp.squeeze.indices.tensor`; it is now a real comparison against the
    # alternative tie order.
    flat_indices = tnp.squeeze(indices.tensor)
    assert tnp.allclose(flat_indices, tf.constant([0, 2, 6])) or tnp.allclose(
        flat_indices, tf.constant([0, 6, 2])
    )

    a = TensorFlowTensor(tf.constant([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]]))
    vals, indices = top_k(a, 3, descending=False)
    assert vals.tensor.shape == (2, 3)
    assert indices.tensor.shape == (2, 3)
    assert tnp.allclose(vals.tensor[0], tf.constant([1, 2, 2]))
    assert tnp.allclose(indices.tensor[0], tf.constant([0, 2, 6])) or tnp.allclose(
        indices.tensor[0], tf.constant([0, 6, 2])
    )
    assert tnp.allclose(vals.tensor[1], tf.constant([2, 3, 4]))
    assert tnp.allclose(indices.tensor[1], tf.constant([2, 4, 6]))


@pytest.mark.tensorflow
def test_top_k_descending_true():
    """`top_k(..., descending=True)` returns the largest values first."""
    top_k = TensorFlowCompBackend.Retrieval.top_k

    a = TensorFlowTensor(tf.constant([1, 4, 2, 7, 4, 9, 2]))
    vals, indices = top_k(a, 3, descending=True)

    assert vals.tensor.shape == (1, 3)
    assert indices.tensor.shape == (1, 3)
    assert tnp.allclose(tnp.squeeze(vals.tensor), tf.constant([9, 7, 4]))
    assert tnp.allclose(tnp.squeeze(indices.tensor), tf.constant([5, 3, 1]))

    a = TensorFlowTensor(tf.constant([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]]))
    vals, indices = top_k(a, 3, descending=True)

    assert vals.tensor.shape == (2, 3)
    assert indices.tensor.shape == (2, 3)

    assert tnp.allclose(vals.tensor[0], tf.constant([9, 7, 4]))
    assert tnp.allclose(indices.tensor[0], tf.constant([5, 3, 1]))

    assert tnp.allclose(vals.tensor[1], tf.constant([11, 10, 7]))
    assert tnp.allclose(indices.tensor[1], tf.constant([0, 5, 3]))


# diff --git a/tests/units/computation_backends/torch_backend/__init__.py b/tests/units/computation_backends/torch_backend/__init__.py
# new file mode 100644
# index 00000000000..74f8f7582cd
# --- /dev/null
# +++ b/tests/units/computation_backends/torch_backend/__init__.py
# @@ -0,0 +1,15 @@
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# diff --git a/tests/units/computation_backends/torch_backend/test_basics.py b/tests/units/computation_backends/torch_backend/test_basics.py
# new file mode 100644
# index 00000000000..b0b98980b7e
# --- /dev/null
# +++ b/tests/units/computation_backends/torch_backend/test_basics.py
# @@ -0,0 +1,161 @@
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pytest
import torch

from docarray.computation.torch_backend import TorchCompBackend


def test_to_device():
    """`to_device` moves a CPU tensor to the 'meta' device."""
    source = torch.rand(10, 3)
    assert source.device == torch.device('cpu')
    moved = TorchCompBackend.to_device(source, 'meta')
    assert moved.device == torch.device('meta')


@pytest.mark.parametrize(
    'array,result',
    [
        (torch.zeros((5)), 1),
        (torch.zeros((1, 5)), 2),
        (torch.zeros((5, 5)), 2),
        (torch.zeros(()), 0),
    ],
)
def test_n_dim(array, result):
    """`n_dim` reports the expected rank."""
    rank = TorchCompBackend.n_dim(array)
    assert rank == result


@pytest.mark.parametrize(
    'array,result',
    [
        (torch.zeros((10,)), (10,)),
        (torch.zeros((5, 5)), (5, 5)),
        (torch.zeros(()), ()),
    ],
)
def test_shape(array, result):
    """`shape` returns the expected value as a plain ``tuple``."""
    actual = TorchCompBackend.shape(array)
    assert actual == result
    # Exact type check on purpose: a tuple subclass must not pass.
    assert type(actual) is tuple


@pytest.mark.parametrize('dtype', [torch.int64, torch.float64, torch.int, torch.float])
def test_dtype(dtype):
    """`dtype` returns the dtype the tensor was built with."""
    values = torch.tensor([1, 2, 3], dtype=dtype)
    assert TorchCompBackend.dtype(values) == dtype


def test_device():
    """`device` returns 'cpu' for a default-constructed tensor."""
    assert TorchCompBackend.device(torch.tensor([1, 2, 3])) == 'cpu'


def test_empty():
    """`empty` honours the requested shape."""
    assert TorchCompBackend.empty((10, 3)).shape == (10, 3)


def test_empty_dtype():
    """`empty` honours the requested shape and dtype."""
    created = TorchCompBackend.empty((10, 3), dtype=torch.int32)
    assert created.shape == (10, 3)
    assert created.dtype == torch.int32


def test_empty_device():
    """`empty` honours the requested shape and device."""
    created = TorchCompBackend.empty((10, 3), device='meta')
    assert created.shape == (10, 3)
    assert created.device == torch.device('meta')


def test_squeeze():
    """`squeeze` drops every singleton axis."""
    padded = torch.zeros(size=(1, 1, 3, 1))
    assert TorchCompBackend.squeeze(padded).shape == (3,)


@pytest.mark.parametrize(
    'array,t_range,x_range,result',
    [
        (
            torch.tensor([0, 1, 2, 3, 4, 5]),
            (0, 10),
            None,
            torch.tensor([0, 2, 4, 6, 8, 10]),
        ),
        (
            torch.tensor([0, 1, 2, 3, 4, 5]),
            (0, 10),
            (0, 10),
            torch.tensor([0, 1, 2, 3, 4, 5]),
        ),
        (
            torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
            (0, 10),
            None,
            torch.tensor([[0.0, 10.0], [0.0, 10.0]]),
        ),
    ],
)
def test_minmax_normalize(array, t_range, x_range, result):
    """`minmax_normalize` produces the expected values for each case."""
    normalized = TorchCompBackend.minmax_normalize(
        tensor=array, t_range=t_range, x_range=x_range
    )
    assert torch.allclose(normalized, result)


def test_reshape():
    """`reshape` to (2, 3) drops the leading singleton axis and keeps the values."""
    nested = torch.tensor([[[1, 2, 3], [4, 5, 6]]])
    flat = TorchCompBackend.reshape(nested, (2, 3))
    expected = torch.tensor([[1, 2, 3], [4, 5, 6]])
    assert torch.equal(flat, expected)


def test_copy():
    """`copy` returns a tensor equal to its input."""
    original = torch.tensor([1, 2, 3])
    duplicate = TorchCompBackend.copy(original)
    assert torch.equal(original, duplicate)


def test_stack():
    """`stack` along dim 0 places the inputs as rows."""
    first = torch.tensor([1, 2, 3])
    second = torch.tensor([4, 5, 6])
    combined = TorchCompBackend.stack([first, second], dim=0)
    expected = torch.tensor([[1, 2, 3], [4, 5, 6]])
    assert torch.equal(combined, expected)


def test_empty_all():
    """`empty` honours shape, dtype and device passed positionally."""
    shape = (2, 3)
    dtype = torch.float32
    device = 'cpu'
    created = TorchCompBackend.empty(shape, dtype, device)
    assert created.shape == shape
    assert created.dtype == dtype
    assert created.device.type == device


def test_to_numpy():
    """`to_numpy` yields an array equal to ``np.array`` of the tensor."""
    source = torch.tensor([1, 2, 3])
    converted = TorchCompBackend.to_numpy(source)
    assert np.array_equal(converted, np.array(source))


def test_none_value():
    """`none_value` is NaN according to ``torch.isnan``."""
    placeholder = TorchCompBackend.none_value()
    assert torch.isnan(placeholder)


def test_detach():
    """`detach` returns a tensor that no longer requires grad."""
    tracked = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    detached = TorchCompBackend.detach(tracked)
    assert not detached.requires_grad


# diff --git
a/tests/units/computation_backends/torch_backend/test_metrics.py b/tests/units/computation_backends/torch_backend/test_metrics.py new file mode 100644 index 00000000000..c353e4f77bd --- /dev/null +++ b/tests/units/computation_backends/torch_backend/test_metrics.py @@ -0,0 +1,65 @@ +import torch + +from docarray.computation.torch_backend import TorchCompBackend + +metrics = TorchCompBackend.Metrics + + +def test_cosine_sim_torch(): + a = torch.rand(128) + b = torch.rand(128) + assert metrics.cosine_sim(a, b).shape == (1,) + assert metrics.cosine_sim(a, b) == metrics.cosine_sim(b, a) + torch.testing.assert_close(metrics.cosine_sim(a, a), torch.ones(1)) + + a = torch.rand(10, 3) + b = torch.rand(5, 3) + assert metrics.cosine_sim(a, b).shape == (10, 5) + assert metrics.cosine_sim(b, a).shape == (5, 10) + diag_dists = torch.diagonal(metrics.cosine_sim(b, b)) # self-comparisons + torch.testing.assert_allclose(diag_dists, torch.ones(5)) + + +def test_euclidean_dist_torch(): + a = torch.rand(128) + b = torch.rand(128) + assert metrics.euclidean_dist(a, b).shape == (1,) + assert metrics.euclidean_dist(a, b) == metrics.euclidean_dist(b, a) + torch.testing.assert_close(metrics.euclidean_dist(a, a), torch.zeros(1)) + + a = torch.rand(10, 3) + b = torch.rand(5, 3) + assert metrics.euclidean_dist(a, b).shape == (10, 5) + assert metrics.euclidean_dist(b, a).shape == (5, 10) + diag_dists = torch.diagonal(metrics.euclidean_dist(b, b)) # self-comparisons + torch.testing.assert_allclose(diag_dists, torch.zeros(5)) + + a = torch.tensor([0.0, 2.0, 0.0]) + b = torch.tensor([0.0, 0.0, 2.0]) + desired_output_singleton = torch.sqrt(torch.tensor([2.0**2.0 + 2.0**2.0])) + torch.testing.assert_close(metrics.euclidean_dist(a, b), desired_output_singleton) + + a = torch.tensor([[0.0, 2.0, 0.0], [0.0, 0.0, 2.0]]) + b = torch.tensor([[0.0, 0.0, 2.0], [0.0, 2.0, 0.0]]) + desired_output_singleton = torch.tensor( + [[desired_output_singleton.item(), 0.0], [0.0, desired_output_singleton.item()]] + ) 
+ torch.testing.assert_close(metrics.euclidean_dist(a, b), desired_output_singleton) + + +def test_sqeuclidean_dist_torch(): + a = torch.rand(128) + b = torch.rand(128) + assert metrics.sqeuclidean_dist(a, b).shape == (1,) + torch.testing.assert_close( + metrics.sqeuclidean_dist(a, b), + metrics.euclidean_dist(a, b) ** 2, + ) + + a = torch.rand(10, 3) + b = torch.rand(5, 3) + assert metrics.sqeuclidean_dist(a, b).shape == (10, 5) + torch.testing.assert_close( + metrics.sqeuclidean_dist(a, b), + metrics.euclidean_dist(a, b) ** 2, + ) diff --git a/tests/units/computation_backends/torch_backend/test_retrieval.py b/tests/units/computation_backends/torch_backend/test_retrieval.py new file mode 100644 index 00000000000..56fc63afc18 --- /dev/null +++ b/tests/units/computation_backends/torch_backend/test_retrieval.py @@ -0,0 +1,42 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch + +from docarray.computation.torch_backend import TorchCompBackend + + +def test_topk(): + top_k = TorchCompBackend.Retrieval.top_k + + a = torch.tensor([1, 4, 2, 7, 4, 9, 2]) + vals, indices = top_k(a, 3) + assert vals.shape == (1, 3) + assert indices.shape == (1, 3) + assert (vals.squeeze() == torch.tensor([1, 2, 2])).all() + assert (indices.squeeze() == torch.tensor([0, 2, 6])).all() or ( + indices.squeeze() == torch.tensor([0, 6, 2]) + ).all() + + a = torch.tensor([[1, 4, 2, 7, 4, 9, 2], [11, 6, 2, 7, 3, 10, 4]]) + vals, indices = top_k(a, 3) + assert vals.shape == (2, 3) + assert indices.shape == (2, 3) + assert (vals[0] == torch.tensor([1, 2, 2])).all() + assert (indices[0] == torch.tensor([0, 2, 6])).all() or ( + indices[0] == torch.tensor([0, 6, 2]) + ).all() + assert (vals[1] == torch.tensor([2, 3, 4])).all() + assert (indices[1] == torch.tensor([2, 4, 6])).all() diff --git a/tests/units/document/__init__.py b/tests/units/document/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/document/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/units/document/proto/__init__.py b/tests/units/document/proto/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/document/proto/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/document/proto/test_document_proto.py b/tests/units/document/proto/test_document_proto.py new file mode 100644 index 00000000000..0fc16482c6d --- /dev/null +++ b/tests/units/document/proto/test_document_proto.py @@ -0,0 +1,387 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Optional, Set, Tuple + +import numpy as np +import pytest +import torch + +from docarray import DocList +from docarray.base_doc import AnyDoc, BaseDoc +from docarray.documents.image import ImageDoc +from docarray.typing import NdArray, TorchTensor +from docarray.utils._internal.misc import is_tf_available + +if is_tf_available(): + import tensorflow as tf + + +@pytest.mark.proto +def test_proto_simple(): + class CustomDoc(BaseDoc): + text: str + + doc = CustomDoc(text='hello') + + CustomDoc.from_protobuf(doc.to_protobuf()) + + +@pytest.mark.proto +def test_proto_ndarray(): + class CustomDoc(BaseDoc): + tensor: NdArray + + tensor = np.zeros((3, 224, 224)) + doc = CustomDoc(tensor=tensor) + + new_doc = CustomDoc.from_protobuf(doc.to_protobuf()) + + assert (new_doc.tensor == tensor).all() + + +@pytest.mark.proto +def test_proto_with_nested_doc(): + class CustomInnerDoc(BaseDoc): + tensor: NdArray + + class CustomDoc(BaseDoc): + text: str + inner: CustomInnerDoc + + doc = CustomDoc(text='hello', inner=CustomInnerDoc(tensor=np.zeros((3, 224, 224)))) + + CustomDoc.from_protobuf(doc.to_protobuf()) + + +@pytest.mark.proto +def test_proto_with_chunks_doc(): + class CustomInnerDoc(BaseDoc): + tensor: NdArray + + class CustomDoc(BaseDoc): + text: str + chunks: DocList[CustomInnerDoc] + + doc = CustomDoc( + text='hello', + chunks=DocList[CustomInnerDoc]( + [CustomInnerDoc(tensor=np.zeros((3, 224, 224))) for _ in range(5)], + ), + ) + + new_doc = CustomDoc.from_protobuf(doc.to_protobuf()) + + for chunk1, chunk2 in zip(doc.chunks, new_doc.chunks): + assert (chunk1.tensor == chunk2.tensor).all() + + +@pytest.mark.proto +def test_proto_with_nested_doc_pytorch(): + class CustomInnerDoc(BaseDoc): + tensor: TorchTensor + + class CustomDoc(BaseDoc): + text: str + inner: CustomInnerDoc + + doc = CustomDoc( + text='hello', 
inner=CustomInnerDoc(tensor=torch.zeros((3, 224, 224))) + ) + + CustomDoc.from_protobuf(doc.to_protobuf()) + + +@pytest.mark.proto +def test_proto_with_chunks_doc_pytorch(): + class CustomInnerDoc(BaseDoc): + tensor: TorchTensor + + class CustomDoc(BaseDoc): + text: str + chunks: DocList[CustomInnerDoc] + + doc = CustomDoc( + text='hello', + chunks=DocList[CustomInnerDoc]( + [CustomInnerDoc(tensor=torch.zeros((3, 224, 224))) for _ in range(5)], + ), + ) + + new_doc = CustomDoc.from_protobuf(doc.to_protobuf()) + + for chunk1, chunk2 in zip(doc.chunks, new_doc.chunks): + assert (chunk1.tensor == chunk2.tensor).all() + + +@pytest.mark.proto +def test_optional_field_in_doc(): + class CustomDoc(BaseDoc): + text: Optional[str] = None + + CustomDoc.from_protobuf(CustomDoc().to_protobuf()) + + +@pytest.mark.proto +def test_optional_field_nested_in_doc(): + class InnerDoc(BaseDoc): + title: str + + class CustomDoc(BaseDoc): + text: Optional[InnerDoc] = None + + CustomDoc.from_protobuf(CustomDoc().to_protobuf()) + + +@pytest.mark.proto +def test_integer_field(): + class Meow(BaseDoc): + age: int + wealth: float + registered: bool + + d = Meow(age=30, wealth=100.5, registered=True) + rebuilt_doc = Meow.from_protobuf(d.to_protobuf()) + assert rebuilt_doc.age == 30 + assert rebuilt_doc.wealth == 100.5 + assert rebuilt_doc.registered + + +@pytest.mark.proto +def test_list_set_dict_tuple_field(): + class MyDoc(BaseDoc): + list_: List + dict_: Dict + tuple_: Tuple + set_: Set + + d = MyDoc( + list_=[0, 1, 2], dict_={'a': 0, 'b': 1}, tuple_=tuple([0, 1]), set_={0, 1} + ) + rebuilt_doc = MyDoc.from_protobuf(d.to_protobuf()) + assert rebuilt_doc.list_ == [0, 1, 2] + assert rebuilt_doc.dict_ == {'a': 0, 'b': 1} + assert rebuilt_doc.tuple_ == (0, 1) + assert rebuilt_doc.set_ == {0, 1} + + +@pytest.mark.proto +@pytest.mark.parametrize( + 'dtype', + [ + np.uint, + np.uint8, + np.uint64, + int, + np.int8, + np.int64, + float, + np.float16, + np.longfloat, + np.double, + ], +) +def 
test_ndarray_dtype(dtype): + class MyDoc(BaseDoc): + tensor: NdArray + + doc = MyDoc(tensor=np.ndarray([1, 2, 3], dtype=dtype)) + assert doc.tensor.dtype == dtype + assert MyDoc.from_protobuf(doc.to_protobuf()).tensor.dtype == dtype + assert MyDoc.parse_obj(doc.dict()).tensor.dtype == dtype + + +@pytest.mark.proto +@pytest.mark.parametrize( + 'dtype', + [ + torch.uint8, + torch.int, + torch.int8, + torch.int64, + torch.float, + torch.float64, + torch.double, + ], +) +def test_torch_dtype(dtype): + class MyDoc(BaseDoc): + tensor: TorchTensor + + doc = MyDoc(tensor=torch.zeros([5, 5], dtype=dtype)) + assert doc.tensor.dtype == dtype + assert MyDoc.from_protobuf(doc.to_protobuf()).tensor.dtype == dtype + assert MyDoc.parse_obj(doc.dict()).tensor.dtype == dtype + + +@pytest.mark.proto +def test_nested_dict(): + class MyDoc(BaseDoc): + data: Dict + + doc = MyDoc(data={'data': (1, 2)}) + + MyDoc.from_protobuf(doc.to_protobuf()) + + +@pytest.mark.proto +def test_nested_dict_error(): + class MyDoc(BaseDoc): + data: Dict + + doc = MyDoc(data={0: (1, 2)}) + + with pytest.raises(ValueError, match="Protobuf only support string as key"): + doc.to_protobuf() + + +@pytest.mark.proto +def test_tuple_complex(): + class MyDoc(BaseDoc): + data: Tuple + + doc = MyDoc(data=(1, 2)) + + doc2 = MyDoc.from_protobuf(doc.to_protobuf()) + + assert doc2.data == (1, 2) + + +@pytest.mark.proto +def test_list_complex(): + class MyDoc(BaseDoc): + data: List + + doc = MyDoc(data=[(1, 2)]) + + doc2 = MyDoc.from_protobuf(doc.to_protobuf()) + + assert doc2.data == [(1, 2)] + + +@pytest.mark.proto +def test_nested_tensor_list(): + class MyDoc(BaseDoc): + data: List + + doc = MyDoc(data=[np.zeros(10)]) + + doc2 = MyDoc.from_protobuf(doc.to_protobuf()) + + assert isinstance(doc2.data[0], np.ndarray) + assert isinstance(doc2.data[0], NdArray) + + assert (doc2.data[0] == np.zeros(10)).all() + + +@pytest.mark.proto +def test_nested_tensor_dict(): + class MyDoc(BaseDoc): + data: Dict + + doc = 
MyDoc(data={'hello': np.zeros(10)}) + + doc2 = MyDoc.from_protobuf(doc.to_protobuf()) + + assert isinstance(doc2.data['hello'], np.ndarray) + assert isinstance(doc2.data['hello'], NdArray) + + assert (doc2.data['hello'] == np.zeros(10)).all() + + +@pytest.mark.proto +def test_super_complex_nested(): + class MyDoc(BaseDoc): + data: Dict + + data = {'hello': (torch.zeros(55), 1, 'hi', [torch.ones(55), np.zeros(10), (1, 2)])} + doc = MyDoc(data=data) + + doc2 = MyDoc.from_protobuf(doc.to_protobuf()) + + (doc2.data['hello'][3][0] == torch.ones(55)).all() + + +@pytest.mark.tensorflow +def test_super_complex_nested_tensorflow(): + class MyDoc(BaseDoc): + data: Dict + + data = {'hello': (torch.zeros(55), 1, 'hi', [tf.ones(55), np.zeros(10), (1, 2)])} + doc = MyDoc(data=data) + + MyDoc.from_protobuf(doc.to_protobuf()) + + +@pytest.mark.proto +def test_any_doc_proto(): + doc = AnyDoc(hello='world') + pt = doc.to_protobuf() + doc2 = AnyDoc.from_protobuf(pt) + assert doc2.hello == 'world' + + +@pytest.mark.proto +def test_nested_list(): + from typing import List + + from docarray import BaseDoc, DocList + from docarray.documents import TextDoc + + class TextDocWithId(TextDoc): + id: str + + class ResultTestDoc(BaseDoc): + matches: List[TextDocWithId] + + da = DocList[ResultTestDoc]( + [ + ResultTestDoc(matches=[TextDocWithId(id=f'{i}') for _ in range(10)]) + for i in range(10) + ] + ) + + DocList[ResultTestDoc].from_protobuf(da.to_protobuf()) + + +@pytest.mark.proto +def test_nested_dict_typed(): + from docarray import BaseDoc, DocList + from docarray.documents import TextDoc + + class TextDocWithId(TextDoc): + id: str + + class ResultTestDoc(BaseDoc): + matches: Dict[str, TextDocWithId] + + da = DocList[ResultTestDoc]( + [ + ResultTestDoc(matches={f'{i}': TextDocWithId(id=f'{i}') for _ in range(10)}) + for i in range(10) + ] + ) + + DocList[ResultTestDoc].from_protobuf(da.to_protobuf()) + + +def test_image_doc_proto(): + + doc = ImageDoc(url="aux.png") + pt = 
doc.to_protobuf() + assert "aux.png" in str(pt) + d2 = ImageDoc.from_protobuf(pt) + + assert doc.url == d2.url diff --git a/tests/units/document/proto/test_proto_based_object.py b/tests/units/document/proto/test_proto_based_object.py new file mode 100644 index 00000000000..69849dc99f6 --- /dev/null +++ b/tests/units/document/proto/test_proto_based_object.py @@ -0,0 +1,48 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +import pytest + +from docarray.proto import DocProto, NodeProto +from docarray.typing import NdArray + + +@pytest.mark.proto +def test_ndarray(): + original_ndarray = np.zeros((3, 224, 224)) + + custom_ndarray = NdArray._docarray_from_native(original_ndarray) + + tensor = NdArray.from_protobuf(custom_ndarray.to_protobuf()) + + assert (tensor == original_ndarray).all() + + +@pytest.mark.proto +def test_document_proto_set(): + data = {} + + nested_item1 = NodeProto(text='hello') + + ndarray = NdArray._docarray_from_native(np.zeros((3, 224, 224))) + nd_proto = ndarray.to_protobuf() + + nested_item2 = NodeProto(ndarray=nd_proto) + + data['a'] = nested_item1 + data['b'] = nested_item2 + + DocProto(data=data) diff --git a/tests/units/document/test_any_document.py b/tests/units/document/test_any_document.py new file mode 100644 index 00000000000..7a235b45fed --- /dev/null +++ b/tests/units/document/test_any_document.py @@ -0,0 +1,92 @@ +from typing import Dict, List + +import numpy as np +import pytest + +from docarray import DocList +from docarray.base_doc import AnyDoc, BaseDoc +from docarray.typing import NdArray + + +def test_any_doc(): + class InnerDocument(BaseDoc): + text: str + tensor: NdArray + + class CustomDoc(BaseDoc): + inner: InnerDocument + text: str + + doc = CustomDoc( + text='bye', inner=InnerDocument(text='hello', tensor=np.zeros((3, 224, 224))) + ) + + any_doc = AnyDoc(**doc.__dict__) + + assert any_doc.text == doc.text + assert any_doc.inner.text == doc.inner.text + assert (any_doc.inner.tensor == doc.inner.tensor).all() + + +@pytest.mark.parametrize('protocol', ['proto', 'json']) +def test_any_document_from_to(protocol): + class InnerDoc(BaseDoc): + text: str + t: Dict[str, str] + + class DocTest(BaseDoc): + text: str + tags: Dict[str, int] + l_: List[int] + d: InnerDoc + ld: DocList[InnerDoc] + + inner_doc = InnerDoc(text='I am inner', t={'a': 'b'}) + da = DocList[DocTest]( + [ + DocTest( + text='type1', + tags={'type': 1}, + 
l_=[1, 2], + d=inner_doc, + ld=DocList[InnerDoc]([inner_doc]), + ), + DocTest( + text='type2', + tags={'type': 2}, + l_=[1, 2], + d=inner_doc, + ld=DocList[InnerDoc]([inner_doc]), + ), + ] + ) + + from docarray.base_doc import AnyDoc + + if protocol == 'proto': + aux = DocList[AnyDoc].from_protobuf(da.to_protobuf()) + else: + aux = DocList[AnyDoc].from_json(da.to_json()) + assert len(aux) == 2 + assert len(aux.id) == 2 + for i, d in enumerate(aux): + assert d.tags['type'] == i + 1 + assert d.text == f'type{i + 1}' + assert d.l_ == [1, 2] + if protocol == 'proto': + assert isinstance(d.d, AnyDoc) + assert d.d.text == 'I am inner' # inner Document is a Dict + assert d.d.t == {'a': 'b'} + else: + assert isinstance(d.d, dict) + assert d.d['text'] == 'I am inner' # inner Document is a Dict + assert d.d['t'] == {'a': 'b'} + assert len(d.ld) == 1 + if protocol == 'proto': + assert isinstance(d.ld[0], AnyDoc) + assert d.ld[0].text == 'I am inner' + assert d.ld[0].t == {'a': 'b'} + else: + assert isinstance(d.ld[0], dict) + assert d.ld[0]['text'] == 'I am inner' + assert d.ld[0]['t'] == {'a': 'b'} diff --git a/tests/units/document/test_base_document.py b/tests/units/document/test_base_document.py new file mode 100644 index 00000000000..2bd80af3763 --- /dev/null +++ b/tests/units/document/test_base_document.py @@ -0,0 +1,189 @@ +from typing import Any, List, Optional, Tuple + +import numpy as np +import orjson +import pytest + +from docarray import DocList, DocVec +from docarray.base_doc.doc import BaseDoc +from docarray.base_doc.io.json import orjson_dumps_and_decode +from docarray.typing import NdArray +from docarray.typing.tensor.abstract_tensor import AbstractTensor +from docarray.utils._internal.pydantic import is_pydantic_v2 + + +def test_base_document_init(): + doc = BaseDoc() + + assert doc.id is not None + + +def test_update(): + class MyDocument(BaseDoc): + content: str + title: Optional[str] = None + tags_: List + + doc1 = MyDocument( + content='Core content of 
the document', title='Title', tags_=['python', 'AI'] + ) + doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + + doc1.update(doc2) + assert doc1.content == 'Core content updated' + assert doc1.title == 'Title' + assert doc1.tags_ == ['python', 'AI', 'docarray'] + + +def test_equal_nested_docs(): + import numpy as np + + from docarray import BaseDoc, DocList + from docarray.typing import NdArray + + class SimpleDoc(BaseDoc): + simple_tens: NdArray[10] + + class NestedDoc(BaseDoc): + docs: DocList[SimpleDoc] + + nested_docs = NestedDoc( + docs=DocList[SimpleDoc]([SimpleDoc(simple_tens=np.ones(10)) for j in range(2)]), + ) + + assert nested_docs == nested_docs + + +@pytest.fixture +def nested_docs(): + class SimpleDoc(BaseDoc): + simple_tens: NdArray[10] + + class NestedDoc(BaseDoc): + docs: DocList[SimpleDoc] + hello: str = 'world' + + nested_docs = NestedDoc( + docs=DocList[SimpleDoc]([SimpleDoc(simple_tens=np.ones(10)) for j in range(2)]), + ) + + return nested_docs + + +@pytest.fixture +def nested_docs_docvec(): + class SimpleDoc(BaseDoc): + simple_tens: NdArray[10] + + class NestedDoc(BaseDoc): + docs: DocVec[SimpleDoc] + hello: str = 'world' + + nested_docs = NestedDoc( + docs=DocList[SimpleDoc]([SimpleDoc(simple_tens=np.ones(10)) for j in range(2)]), + ) + + return nested_docs + + +def test_nested_to_dict(nested_docs): + d = nested_docs.dict() + assert (d['docs'][0]['simple_tens'] == np.ones(10)).all() + assert isinstance(d['docs'], list) + assert not isinstance(d['docs'], DocList) + + +def test_nested_docvec_to_dict(nested_docs_docvec): + d = nested_docs_docvec.dict() + assert (d['docs'][0]['simple_tens'] == np.ones(10)).all() + + +def test_nested_to_dict_exclude(nested_docs): + d = nested_docs.dict(exclude={'docs'}) + assert 'docs' not in d.keys() + + +def test_nested_to_dict_exclude_set(nested_docs): + d = nested_docs.dict(exclude={'hello'}) + assert 'hello' not in d.keys() + + +def test_nested_to_dict_exclude_dict(nested_docs): + d = 
nested_docs.dict(exclude={'hello': True}) + assert 'hello' not in d.keys() + + +def test_nested_to_json(nested_docs): + d = nested_docs.json() + nested_docs.__class__.parse_raw(d) + + +@pytest.fixture +def nested_none_docs(): + class SimpleDoc(BaseDoc): + simple_tens: NdArray[10] + + class NestedDoc(BaseDoc): + docs: Optional[DocList[SimpleDoc]] = None + hello: str = 'world' + + nested_docs = NestedDoc() + + return nested_docs + + +def test_nested_none_to_dict(nested_none_docs): + d = nested_none_docs.dict() + assert d == {'docs': None, 'hello': 'world', 'id': nested_none_docs.id} + + +def test_nested_none_to_json(nested_none_docs): + d = nested_none_docs.json() + d = nested_none_docs.__class__.parse_raw(d) + assert d.dict() == {'docs': None, 'hello': 'world', 'id': nested_none_docs.id} + + +def test_get_get_field_inner_type(): + class MyDoc(BaseDoc): + tuple_: Tuple + + field_type = MyDoc._get_field_inner_type("tuple_") + + assert field_type == Any + + +@pytest.mark.skipif( + is_pydantic_v2, reason="syntax only working with pydantic v1 for now" +) +def test_subclass_config(): + class MyDoc(BaseDoc): + x: str + + class Config(BaseDoc.Config): + arbitrary_types_allowed = True # just an example setting + + assert MyDoc.Config.json_loads == orjson.loads + assert MyDoc.Config.json_dumps == orjson_dumps_and_decode + assert ( + MyDoc.Config.json_encoders[AbstractTensor](3) == 3 + ) # dirty check that it is identity + assert MyDoc.Config.validate_assignment + assert not MyDoc.Config._load_extra_fields_from_protobuf + assert MyDoc.Config.arbitrary_types_allowed + + +@pytest.mark.skipif(not (is_pydantic_v2), reason="syntax only working with pydantic v2") +def test_subclass_config_v2(): + class MyDoc(BaseDoc): + x: str + + model_config = BaseDoc.ConfigDocArray( + arbitrary_types_allowed=True + ) # just an example setting + + assert ( + MyDoc.model_config['json_encoders'][AbstractTensor](3) == 3 + ) # dirty check that it is identity + assert 
MyDoc.model_config['validate_assignment'] + assert not MyDoc.model_config['_load_extra_fields_from_protobuf'] + assert MyDoc.model_config['arbitrary_types_allowed'] diff --git a/tests/units/document/test_doc_wo_id.py b/tests/units/document/test_doc_wo_id.py new file mode 100644 index 00000000000..4e2a8bba118 --- /dev/null +++ b/tests/units/document/test_doc_wo_id.py @@ -0,0 +1,31 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from docarray import DocList +from docarray.base_doc.doc import BaseDocWithoutId + + +def test_doc_list(): + class A(BaseDocWithoutId): + text: str + + cls_doc_list = DocList[A] + + da = cls_doc_list([A(text='hey here')]) + + assert isinstance(da, DocList) + for d in da: + assert isinstance(d, A) + assert not hasattr(d, 'id') diff --git a/tests/units/document/test_docs_operators.py b/tests/units/document/test_docs_operators.py new file mode 100644 index 00000000000..36cfc258811 --- /dev/null +++ b/tests/units/document/test_docs_operators.py @@ -0,0 +1,40 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from docarray.documents.text import TextDoc + + +def test_text_document_operators(): + doc = TextDoc(text='text', url='http://url.com') + + assert doc == 'text' + assert doc != 'http://url.com' + + doc2 = TextDoc(id=doc.id, text='text', url='http://url.com') + assert doc == doc2 + + doc3 = TextDoc(id='other-id', text='text', url='http://url.com') + assert doc == doc3 + + assert 't' in doc + assert 'a' not in doc + + t = TextDoc(text='this is my text document') + assert 'text' in t + assert 'docarray' not in t + + text = TextDoc() + assert text is not None + assert text.text is None diff --git a/tests/units/document/test_from_to_bytes.py b/tests/units/document/test_from_to_bytes.py new file mode 100644 index 00000000000..9ee971eb5c5 --- /dev/null +++ b/tests/units/document/test_from_to_bytes.py @@ -0,0 +1,117 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from typing import Dict, List + +from docarray import BaseDoc, DocList +from docarray.documents import ImageDoc +from docarray.typing import NdArray + + +class MyDoc(BaseDoc): + embedding: NdArray + text: str + image: ImageDoc + + +class MySimpleDoc(BaseDoc): + title: str + + +class MyComplexDoc(BaseDoc): + content_dict_doclist: Dict[str, DocList[MySimpleDoc]] + content_dict_list: Dict[str, List[MySimpleDoc]] + aux_dict: Dict[str, int] + + +@pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +def test_to_from_bytes(protocol, compress): + d = MyDoc(embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png')) + + assert d.text == 'hello' + assert d.embedding.tolist() == [1, 2, 3, 4, 5] + assert d.image.url == 'aux.png' + bstr = d.to_bytes(protocol=protocol, compress=compress) + d2 = MyDoc.from_bytes(bstr, protocol=protocol, compress=compress) + assert d2.text == 'hello' + assert d2.embedding.tolist() == [1, 2, 3, 4, 5] + assert d2.image.url == 'aux.png' + + +@pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +def test_to_from_base64(protocol, compress): + d = MyDoc(embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png')) + + assert d.text == 'hello' + assert d.embedding.tolist() == [1, 2, 3, 4, 5] + assert d.image.url == 'aux.png' + bstr = d.to_base64(protocol=protocol, compress=compress) + d2 = MyDoc.from_base64(bstr, 
protocol=protocol, compress=compress) + assert d2.text == 'hello' + assert d2.embedding.tolist() == [1, 2, 3, 4, 5] + assert d2.image.url == 'aux.png' + + +@pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +def test_to_from_bytes_complex(protocol, compress): + d = MyComplexDoc( + content_dict_doclist={ + 'test1': DocList[MySimpleDoc]( + [MySimpleDoc(title='123'), MySimpleDoc(title='456')] + ) + }, + content_dict_list={ + 'test1': [MySimpleDoc(title='123'), MySimpleDoc(title='456')] + }, + aux_dict={'a': 0}, + ) + bstr = d.to_bytes(protocol=protocol, compress=compress) + d2 = MyComplexDoc.from_bytes(bstr, protocol=protocol, compress=compress) + assert d2.aux_dict == {'a': 0} + assert len(d2.content_dict_doclist['test1']) == 2 + assert d2.content_dict_doclist['test1'][0].title == '123' + assert d2.content_dict_doclist['test1'][1].title == '456' + assert len(d2.content_dict_list['test1']) == 2 + assert d2.content_dict_list['test1'][0].title == '123' + assert d2.content_dict_list['test1'][1].title == '456' + + +@pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +def test_to_from_base64_complex(protocol, compress): + d = MyComplexDoc( + content_dict_doclist={ + 'test1': DocList[MySimpleDoc]( + [MySimpleDoc(title='123'), MySimpleDoc(title='456')] + ) + }, + content_dict_list={ + 'test1': [MySimpleDoc(title='123'), MySimpleDoc(title='456')] + }, + aux_dict={'a': 0}, + ) + bstr = d.to_base64(protocol=protocol, compress=compress) + d2 = MyComplexDoc.from_base64(bstr, protocol=protocol, compress=compress) + assert d2.aux_dict == {'a': 0} + assert len(d2.content_dict_doclist['test1']) == 2 + assert d2.content_dict_doclist['test1'][0].title == '123' + assert d2.content_dict_doclist['test1'][1].title == '456' + assert len(d2.content_dict_list['test1']) == 2 + assert 
d2.content_dict_list['test1'][0].title == '123' + assert d2.content_dict_list['test1'][1].title == '456' diff --git a/tests/units/document/test_text_document.py b/tests/units/document/test_text_document.py new file mode 100644 index 00000000000..153e2922ead --- /dev/null +++ b/tests/units/document/test_text_document.py @@ -0,0 +1,30 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from docarray.documents import TextDoc + + +def test_text_document_init(): + text = TextDoc('hello world') + assert text.text == 'hello world' + assert text == 'hello world' + + text = TextDoc(text='hello world') + assert text.text == 'hello world' + assert text == 'hello world' + + text = TextDoc() + assert text is not None + assert text.text is None diff --git a/tests/units/document/test_to_schema.py b/tests/units/document/test_to_schema.py new file mode 100644 index 00000000000..ad0b7444acd --- /dev/null +++ b/tests/units/document/test_to_schema.py @@ -0,0 +1,92 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pytest + +from docarray import BaseDoc +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import NdArray, TorchTensor + + +class NpDoc(BaseDoc): + embedding: NdArray[3, 4] + embedding_no_shape: NdArray + + +class TorchDoc(BaseDoc): + embedding: TorchTensor[3, 4] + embedding_no_shape: TorchTensor + + +def test_np_schema(): + schema = NpDoc.schema() + assert schema['properties']['embedding']['tensor/array shape'] == '[3, 4]' + assert schema['properties']['embedding']['type'] == 'array' + assert schema['properties']['embedding']['items']['type'] == 'number' + assert ( + schema['properties']['embedding']['example'] + == orjson_dumps(np.zeros([3, 4])).decode() + ) + + assert ( + schema['properties']['embedding_no_shape']['tensor/array shape'] + == 'not specified' + ) + assert schema['properties']['embedding_no_shape']['type'] == 'array' + assert schema['properties']['embedding']['items']['type'] == 'number' + + +def test_torch_schema(): + schema = TorchDoc.schema() + assert schema['properties']['embedding']['tensor/array shape'] == '[3, 4]' + assert schema['properties']['embedding']['type'] == 'array' + assert schema['properties']['embedding']['items']['type'] == 'number' + assert ( + schema['properties']['embedding']['example'] + == orjson_dumps(np.zeros([3, 4])).decode() + ) + + assert ( + schema['properties']['embedding_no_shape']['tensor/array shape'] 
+ == 'not specified' + ) + assert schema['properties']['embedding_no_shape']['type'] == 'array' + assert schema['properties']['embedding']['items']['type'] == 'number' + + +@pytest.mark.tensorflow +def test_tensorflow_schema(): + from docarray.typing import TensorFlowTensor + + class TensorflowDoc(BaseDoc): + embedding: TensorFlowTensor[3, 4] + embedding_no_shape: TensorFlowTensor + + schema = TensorflowDoc.schema() + assert schema['properties']['embedding']['tensor/array shape'] == '[3, 4]' + assert schema['properties']['embedding']['type'] == 'array' + assert schema['properties']['embedding']['items']['type'] == 'number' + assert ( + schema['properties']['embedding']['example'] + == orjson_dumps(np.zeros([3, 4])).decode() + ) + + assert ( + schema['properties']['embedding_no_shape']['tensor/array shape'] + == 'not specified' + ) + assert schema['properties']['embedding_no_shape']['type'] == 'array' + assert schema['properties']['embedding']['items']['type'] == 'number' diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py new file mode 100644 index 00000000000..5e76caa0dc2 --- /dev/null +++ b/tests/units/document/test_update.py @@ -0,0 +1,104 @@ +from typing import Dict, List, Optional, Set + +import pytest + +from docarray import BaseDoc, DocList +from docarray.documents import ImageDoc + + +class InnerDoc(BaseDoc): + integer: int + inner_list: List + + +class MMDoc(BaseDoc): + text: str = '' + price: int = 0 + categories: Optional[List[str]] = None + image: Optional[ImageDoc] = None + matches: Optional[DocList] = None + matches_with_same_id: Optional[DocList] = None + opt_int: Optional[int] = None + test_set: Optional[Set] = None + inner_doc: Optional[InnerDoc] = None + test_dict: Optional[Dict] = None + + +@pytest.fixture +def doc1(): + return MMDoc( + text='hey here', + categories=['a', 'b', 'c'], + price=10, + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', 
matches=DocList[MMDoc]([MMDoc()]))] + ), + test_set={'a', 'a'}, + inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), + test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3}, + ) + + +@pytest.fixture +def doc2(doc1): + return MMDoc( + id=doc1.id, + text='hey here 2', + categories=['d', 'e', 'f'], + price=5, + opt_int=5, + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] + ), + test_set={'a', 'b'}, + inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), + test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None}, + ) + + +def test_update_complex(doc1, doc2): + doc1.update(doc2) + # doc1 is changed in place (no extra memory) + assert doc1.text == 'hey here 2' + assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f'] + assert len(doc1.matches) == 2 + assert doc1.opt_int == 5 + assert doc1.price == 5 + assert doc1.test_set == {'a', 'b'} + assert len(doc1.matches_with_same_id) == 1 + assert len(doc1.matches_with_same_id[0].matches) == 2 + assert doc1.inner_doc.integer == 3 + assert doc1.inner_doc.inner_list == ['c', 'd', 'a', 'b'] + assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None} + + +def test_update_simple(): + class MyDocument(BaseDoc): + content: str + title: Optional[str] = None + tags_: List + + my_doc1 = MyDocument( + content='Core content of the document', title='Title', tags_=['python', 'AI'] + ) + my_doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + + my_doc1.update(my_doc2) + assert my_doc1.content == 'Core content updated' + assert my_doc1.title == 'Title' + assert my_doc1.tags_ == ['python', 'AI', 'docarray'] + + +def test_update_different_schema_fails(): + class DocA(BaseDoc): + content: str + + class DocB(BaseDoc): + image: Optional[ImageDoc] = None + + docA = DocA(content='haha') + docB = DocB() + with pytest.raises(Exception): + docA.update(docB) diff --git a/tests/units/document/test_view.py b/tests/units/document/test_view.py new file mode 100644 
index 00000000000..ecd53a918fa --- /dev/null +++ b/tests/units/document/test_view.py @@ -0,0 +1,52 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + +from docarray import BaseDoc +from docarray.array import DocVec +from docarray.array.doc_vec.column_storage import ColumnStorageView +from docarray.typing import AnyTensor + + +def test_document_view(): + class MyDoc(BaseDoc): + tensor: AnyTensor + name: str + + docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=str(i)) for i in range(4)] + + doc_vec = DocVec[MyDoc](docs) + storage = doc_vec._storage + + result = str(doc_vec[0]) + assert 'MyDoc' in result + assert 'id' in result + assert 'tensor' in result + assert 'name' in result + + doc = MyDoc.from_view(ColumnStorageView(0, storage)) + assert doc.is_view() + assert doc.id == '0' + assert (doc.tensor == np.zeros(10)).all() + assert doc.name == 'hello' + + storage.columns['id'][0] = '12345' + storage.columns['tensor'][0] = np.ones(10) + storage.columns['name'][0] = 'byebye' + + assert doc.id == '12345' + assert (doc.tensor == np.ones(10)).all() + assert doc.name == 'byebye' diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py new file mode 100644 index 00000000000..0c68fe9884d --- /dev/null +++ 
b/tests/units/test_helper.py @@ -0,0 +1,181 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +import pytest + +from docarray import BaseDoc, DocList +from docarray.documents import ImageDoc +from docarray.helper import ( + _access_path_dict_to_nested_dict, + _access_path_to_dict, + _dict_to_access_paths, + _is_access_path_valid, + _update_nested_dicts, + get_paths, +) + + +@pytest.fixture() +def nested_doc(): + class Inner(BaseDoc): + img: Optional[ImageDoc] + + class Middle(BaseDoc): + img: Optional[ImageDoc] + inner: Optional[Inner] + + class Outer(BaseDoc): + img: Optional[ImageDoc] + middle: Optional[Middle] + da: DocList[Inner] + + doc = Outer( + img=ImageDoc(), + middle=Middle(img=ImageDoc(), inner=Inner(img=ImageDoc())), + da=DocList[Inner]([Inner(img=ImageDoc(url='test.png'))]), + ) + return doc + + +def test_is_access_path_valid(nested_doc): + assert _is_access_path_valid(nested_doc.__class__, 'img') + assert _is_access_path_valid(nested_doc.__class__, 'middle__img') + assert _is_access_path_valid(nested_doc.__class__, 'middle__inner__img') + assert _is_access_path_valid(nested_doc.__class__, 'middle') + assert _is_access_path_valid(nested_doc.__class__, 'da__img__url') + + +def 
test_is_access_path_not_valid(nested_doc): + assert not _is_access_path_valid(nested_doc.__class__, 'inner') + assert not _is_access_path_valid(nested_doc.__class__, 'some__other__path') + assert not _is_access_path_valid(nested_doc.__class__, 'middle.inner') + + +def test_get_access_paths(): + class Painting(BaseDoc): + title: str + img: ImageDoc + + access_paths = Painting._get_access_paths() + assert access_paths == [ + 'id', + 'title', + 'img__id', + 'img__url', + 'img__tensor', + 'img__embedding', + 'img__bytes_', + ] + + +def test_dict_to_access_paths(): + d = { + 'a0': {'b0': {'c0': 0}, 'b1': {'c0': 1}}, + 'a1': {'b0': {'c0': 2, 'c1': 3}, 'b1': 4}, + } + casted = _dict_to_access_paths(d) + assert casted == { + 'a0__b0__c0': 0, + 'a0__b1__c0': 1, + 'a1__b0__c0': 2, + 'a1__b0__c1': 3, + 'a1__b1': 4, + } + + +def test_access_path_to_dict(): + access_path = 'a__b__c__d__e' + value = 1 + result = {'a': {'b': {'c': {'d': {'e': value}}}}} + assert _access_path_to_dict(access_path, value) == result + + +def test_access_path_dict_to_nested_dict(): + d = { + 'a0__b0__c0': 0, + 'a0__b1__c0': 1, + 'a1__b0__c0': 2, + 'a1__b0__c1': 3, + 'a1__b1': 4, + } + casted = _access_path_dict_to_nested_dict(d) + assert casted == { + 'a0': {'b0': {'c0': 0}, 'b1': {'c0': 1}}, + 'a1': {'b0': {'c0': 2, 'c1': 3}, 'b1': 4}, + } + + +def test_update_nested_dict(): + d1 = {'text': 'hello', 'image': {'tensor': None}} + d2 = {'image': {'url': 'some.png'}} + + _update_nested_dicts(d1, d2) + assert d1 == {'text': 'hello', 'image': {'tensor': None, 'url': 'some.png'}} + + +def test_get_paths(): + paths = list(get_paths(patterns='*.py')) + for path in paths: + assert path.endswith('.py') + + +def test_get_paths_recursive(): + paths_rec = list(get_paths(patterns='**', recursive=True)) + paths_not_rec = list(get_paths(patterns='**', recursive=False)) + + assert len(paths_rec) > len(paths_not_rec) + + +def test_get_paths_exclude(): + paths = list(get_paths(patterns='*.py')) + paths_wo_init = 
list(get_paths(patterns='*.py', exclude_regex='__init__.[a-z]*')) + + assert len(paths_wo_init) <= len(paths) + assert '__init__.py' not in paths_wo_init + + +def test_shallow_copy(): + from torch import rand + + from docarray import BaseDoc + from docarray.helper import _shallow_copy_doc + from docarray.typing import TorchTensor, VideoUrl + + class VideoDoc(BaseDoc): + url: VideoUrl + tensor_video: TorchTensor + + class MyDoc(BaseDoc): + docs: DocList[VideoDoc] + tensor: TorchTensor + + doc_ori = MyDoc( + docs=DocList[VideoDoc]( + [ + VideoDoc( + url=f'http://example.ai/videos/{i}', + tensor_video=rand(256), + ) + for i in range(10) + ] + ), + tensor=rand(256), + ) + + doc_copy = _shallow_copy_doc(doc_ori) + + assert doc_copy == doc_ori diff --git a/tests/units/typing/__init__.py b/tests/units/typing/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/typing/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/units/typing/da/__init__.py b/tests/units/typing/da/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/typing/da/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/typing/da/test_relations.py b/tests/units/typing/da/test_relations.py new file mode 100644 index 00000000000..cadac712f5a --- /dev/null +++ b/tests/units/typing/da/test_relations.py @@ -0,0 +1,59 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from docarray import BaseDoc, DocList +from docarray.utils._internal.pydantic import is_pydantic_v2 + + +@pytest.mark.skipif( + is_pydantic_v2, + reason="Subscripted generics cannot be used with class and instance checks", +) +def test_instance_and_equivalence(): + class MyDoc(BaseDoc): + text: str + + docs = DocList[MyDoc]([MyDoc(text='hello')]) + + assert issubclass(DocList[MyDoc], DocList[MyDoc]) + assert issubclass(docs.__class__, DocList[MyDoc]) + + assert isinstance(docs, DocList[MyDoc]) + + +@pytest.mark.skipif( + is_pydantic_v2, + reason="Subscripted generics cannot be used with class and instance checks", +) +def test_subclassing(): + class MyDoc(BaseDoc): + text: str + + class MyDocList(DocList[MyDoc]): + pass + + docs = MyDocList([MyDoc(text='hello')]) + + assert issubclass(MyDocList, DocList[MyDoc]) + assert issubclass(docs.__class__, DocList[MyDoc]) + + assert isinstance(docs, MyDocList) + assert isinstance(docs, DocList[MyDoc]) + + assert issubclass(MyDoc, BaseDoc) + assert not issubclass(DocList[MyDoc], DocList[BaseDoc]) + assert not issubclass(MyDocList, DocList[BaseDoc]) diff --git a/tests/units/typing/tensor/__init__.py b/tests/units/typing/tensor/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/typing/tensor/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/typing/tensor/test_audio_tensor.py b/tests/units/typing/tensor/test_audio_tensor.py new file mode 100644 index 00000000000..45b54caf654 --- /dev/null +++ b/tests/units/typing/tensor/test_audio_tensor.py @@ -0,0 +1,182 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import numpy as np +import pytest +import torch +from pydantic import parse_obj_as + +from docarray import BaseDoc +from docarray.typing import AudioTensor +from docarray.typing.bytes.audio_bytes import AudioBytes +from docarray.typing.tensor.audio.audio_ndarray import AudioNdArray +from docarray.typing.tensor.audio.audio_torch_tensor import AudioTorchTensor +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp + + from docarray.typing.tensor.audio import AudioTensorFlowTensor + + +@pytest.mark.parametrize( + 'tensor,cls_audio_tensor,cls_tensor', + [ + (torch.zeros(1000, 2), AudioTorchTensor, torch.Tensor), + (np.zeros((1000, 2)), AudioNdArray, np.ndarray), + ], +) +def test_set_audio_tensor(tensor, cls_audio_tensor, cls_tensor): + class MyAudioDoc(BaseDoc): + tensor: cls_audio_tensor + + doc = MyAudioDoc(tensor=tensor) + assert isinstance(doc.tensor, cls_audio_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert (doc.tensor == tensor).all() + + +@pytest.mark.tensorflow +def test_set_audio_tensorflow_tensor(): + class MyAudioDoc(BaseDoc): + tensor: AudioTensorFlowTensor + + doc = MyAudioDoc(tensor=tf.zeros((1000, 2))) + assert isinstance(doc.tensor, AudioTensorFlowTensor) + assert isinstance(doc.tensor.tensor, tf.Tensor) + assert tnp.allclose(doc.tensor.tensor, tf.zeros((1000, 2))) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor', + [ + (AudioNdArray, np.zeros((1000, 2))), + (AudioTorchTensor, torch.zeros(1000, 2)), + (AudioTorchTensor, np.zeros((1000, 2))), + ], +) +def test_validation(cls_tensor, tensor): + arr = parse_obj_as(cls_tensor, tensor) + assert isinstance(arr, cls_tensor) + + +@pytest.mark.tensorflow +def test_validation_tensorflow(): + arr = parse_obj_as(AudioTensorFlowTensor, tf.zeros((1000, 2))) + assert isinstance(arr, AudioTensorFlowTensor) + + +@pytest.mark.parametrize( + 
'cls_tensor,tensor,expect_error', + [ + (AudioNdArray, torch.zeros(1000, 2), False), + (AudioNdArray, 'hello', True), + (AudioTorchTensor, 'hello', True), + ], +) +def test_illegal_validation(cls_tensor, tensor, expect_error): + if expect_error: + with pytest.raises(ValueError): + parse_obj_as(cls_tensor, tensor) + else: + parse_obj_as(cls_tensor, tensor) + + +@pytest.mark.proto +@pytest.mark.parametrize( + 'cls_tensor,tensor,proto_key', + [ + (AudioTorchTensor, torch.zeros(1000, 2), AudioTorchTensor._proto_type_name), + (AudioNdArray, np.zeros((1000, 2)), AudioNdArray._proto_type_name), + ], +) +def test_proto_tensor(cls_tensor, tensor, proto_key): + tensor = parse_obj_as(cls_tensor, tensor) + proto = tensor._to_node_protobuf() + assert proto_key in str(proto) + + +@pytest.mark.tensorflow +def test_proto_tensor_tensorflow(): + tensor = parse_obj_as(AudioTensorFlowTensor, tf.zeros((1000, 2))) + proto = tensor._to_node_protobuf() + assert AudioTensorFlowTensor._proto_type_name in str(proto) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor', + [ + (AudioTorchTensor, torch.zeros(1000, 2)), + (AudioNdArray, np.zeros((1000, 2))), + ], +) +def test_save_audio_tensor_to_wav_file(cls_tensor, tensor, tmpdir): + tmp_file = str(tmpdir / 'tmp.wav') + audio_tensor = parse_obj_as(cls_tensor, tensor) + audio_tensor.save(tmp_file) + assert os.path.isfile(tmp_file) + + +@pytest.mark.tensorflow +def test_save_audio_tensorflow_tensor_to_wav_file(tmpdir): + tmp_file = str(tmpdir / 'tmp.wav') + audio_tensor = parse_obj_as(AudioTensorFlowTensor, tf.zeros((1000, 2))) + audio_tensor.save(tmp_file) + assert os.path.isfile(tmp_file) + + +@pytest.mark.parametrize( + 'audio_tensor', + [ + parse_obj_as(AudioTorchTensor, torch.zeros(1000, 2)), + parse_obj_as(AudioNdArray, np.zeros((1000, 2))), + ], +) +def test_save_audio_tensor_to_bytes(audio_tensor): + b = audio_tensor.to_bytes() + isinstance(b, bytes) + isinstance(b, AudioBytes) + + +@pytest.mark.parametrize( + 
'tensor,cls_audio_tensor,cls_tensor', + [ + (torch.zeros(1000, 2), AudioTorchTensor, torch.Tensor), + (np.zeros((1000, 2)), AudioNdArray, np.ndarray), + ], +) +def test_torch_ndarray_to_audio_tensor(tensor, cls_audio_tensor, cls_tensor): + class MyAudioDoc(BaseDoc): + tensor: AudioTensor + + doc = MyAudioDoc(tensor=tensor) + assert isinstance(doc.tensor, cls_audio_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert (doc.tensor == tensor).all() + + +@pytest.mark.tensorflow +def test_tensorflow_to_audio_tensor(): + class MyAudioDoc(BaseDoc): + tensor: AudioTensor + + doc = MyAudioDoc(tensor=tf.zeros((1000, 2))) + assert isinstance(doc.tensor, AudioTensorFlowTensor) + assert isinstance(doc.tensor.tensor, tf.Tensor) + assert tnp.allclose(doc.tensor.tensor, tf.zeros((1000, 2))) diff --git a/tests/units/typing/tensor/test_cross_backend.py b/tests/units/typing/tensor/test_cross_backend.py new file mode 100644 index 00000000000..cd5403c49c7 --- /dev/null +++ b/tests/units/typing/tensor/test_cross_backend.py @@ -0,0 +1,44 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +import pytest +from pydantic import parse_obj_as + +from docarray.typing import NdArray, TorchTensor + +try: + from docarray.typing import TensorFlowTensor +except (ImportError, TypeError): + pass + + +@pytest.mark.tensorflow +def test_coercion_behavior(): + t_np = parse_obj_as(NdArray[128], np.zeros(128)) + t_th = parse_obj_as(TorchTensor[128], np.zeros(128)) + t_tf = parse_obj_as(TensorFlowTensor[128], np.zeros(128)) + + assert isinstance(t_np, NdArray[128]) + assert not isinstance(t_np, TensorFlowTensor[128]) + assert not isinstance(t_np, TorchTensor[128]) + + assert isinstance(t_th, TorchTensor[128]) + assert not isinstance(t_th, NdArray[128]) + assert not isinstance(t_th, TensorFlowTensor[128]) + + assert isinstance(t_tf, TensorFlowTensor[128]) + assert not isinstance(t_tf, TorchTensor[128]) + assert not isinstance(t_tf, NdArray[128]) diff --git a/tests/units/typing/tensor/test_embedding.py b/tests/units/typing/tensor/test_embedding.py new file mode 100644 index 00000000000..078cb7fddbb --- /dev/null +++ b/tests/units/typing/tensor/test_embedding.py @@ -0,0 +1,75 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +import pytest +import torch +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray import BaseDoc +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import AnyEmbedding, NdArrayEmbedding, TorchEmbedding +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + from docarray.computation.tensorflow_backend import tnp + from docarray.typing.tensor.embedding import TensorFlowEmbedding + + +@pytest.mark.proto +def test_proto_embedding(): + embedding = parse_obj_as(AnyEmbedding, np.zeros((3, 224, 224))) + + embedding._to_node_protobuf() + + +def test_json_schema(): + schema_json_of(AnyEmbedding) + + +def test_dump_json(): + tensor = parse_obj_as(AnyEmbedding, np.zeros((3, 224, 224))) + orjson_dumps(tensor) + + +@pytest.mark.parametrize( + 'tensor,cls_audio_tensor,cls_tensor', + [ + (torch.zeros(1000, 2), TorchEmbedding, torch.Tensor), + (np.zeros((1000, 2)), NdArrayEmbedding, np.ndarray), + ], +) +def test_torch_ndarray_to_any_embedding(tensor, cls_audio_tensor, cls_tensor): + class MyEmbeddingDoc(BaseDoc): + tensor: AnyEmbedding + + doc = MyEmbeddingDoc(tensor=tensor) + assert isinstance(doc.tensor, cls_audio_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert (doc.tensor == tensor).all() + + +@pytest.mark.tensorflow +def test_tensorflow_to_any_embedding(): + class MyEmbeddingDoc(BaseDoc): + tensor: AnyEmbedding + + doc = MyEmbeddingDoc(tensor=tf.zeros((1000, 2))) + assert isinstance(doc.tensor, TensorFlowEmbedding) + assert isinstance(doc.tensor.tensor, tf.Tensor) + assert tnp.allclose(doc.tensor.tensor, tf.zeros((1000, 2))) diff --git a/tests/units/typing/tensor/test_image_tensor.py b/tests/units/typing/tensor/test_image_tensor.py new file mode 100644 index 00000000000..b05a71403a2 --- /dev/null +++ b/tests/units/typing/tensor/test_image_tensor.py @@ -0,0 +1,95 @@ +# Licensed to the LF AI & Data foundation under 
one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import pytest +import torch +from pydantic import parse_obj_as + +from docarray import BaseDoc +from docarray.typing import ImageBytes, ImageNdArray, ImageTensor, ImageTorchTensor +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + from docarray.computation.tensorflow_backend import tnp + from docarray.typing.tensor.image import ImageTensorFlowTensor + + +@pytest.mark.parametrize( + 'cls_tensor,tensor', + [ + (ImageTorchTensor, torch.zeros((224, 224, 3))), + (ImageNdArray, np.zeros((224, 224, 3))), + ], +) +def test_save_image_tensor_to_file(cls_tensor, tensor, tmpdir): + tmp_file = str(tmpdir / 'tmp.jpg') + image_tensor = parse_obj_as(cls_tensor, tensor) + image_tensor.save(tmp_file) + assert os.path.isfile(tmp_file) + + +@pytest.mark.tensorflow +def test_save_image_tensorflow_tensor_to_file(tmpdir): + tmp_file = str(tmpdir / 'tmp.jpg') + image_tensor = parse_obj_as(ImageTensorFlowTensor, tf.zeros((224, 224, 3))) + image_tensor.save(tmp_file) + assert os.path.isfile(tmp_file) + + +@pytest.mark.parametrize( + 'image_tensor', + [ + parse_obj_as(ImageTorchTensor, torch.zeros(224, 224, 3)), + parse_obj_as(ImageNdArray, 
np.zeros((224, 224, 3))), + ], +) +def test_save_image_tensor_to_bytes(image_tensor): + b = image_tensor.to_bytes() + isinstance(b, bytes) + isinstance(b, ImageBytes) + + +@pytest.mark.parametrize( + 'tensor,cls_audio_tensor,cls_tensor', + [ + (torch.zeros(1000, 2), ImageTorchTensor, torch.Tensor), + (np.zeros((1000, 2)), ImageNdArray, np.ndarray), + ], +) +def test_torch_ndarray_to_image_tensor(tensor, cls_audio_tensor, cls_tensor): + class MyImageDoc(BaseDoc): + tensor: ImageTensor + + doc = MyImageDoc(tensor=tensor) + assert isinstance(doc.tensor, cls_audio_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert (doc.tensor == tensor).all() + + +@pytest.mark.tensorflow +def test_tensorflow_to_image_tensor(): + class MyImageDoc(BaseDoc): + tensor: ImageTensor + + doc = MyImageDoc(tensor=tf.zeros((1000, 2))) + assert isinstance(doc.tensor, ImageTensorFlowTensor) + assert isinstance(doc.tensor.tensor, tf.Tensor) + assert tnp.allclose(doc.tensor.tensor, tf.zeros((1000, 2))) diff --git a/tests/units/typing/tensor/test_jax_array.py b/tests/units/typing/tensor/test_jax_array.py new file mode 100644 index 00000000000..34b4c979dfc --- /dev/null +++ b/tests/units/typing/tensor/test_jax_array.py @@ -0,0 +1,201 @@ +import numpy as np +import pytest +from pydantic import schema_json_of +from pydantic.tools import parse_obj_as + +from docarray.base_doc.io.json import orjson_dumps +from docarray.utils._internal.misc import is_jax_available + +jax_available = is_jax_available() +if jax_available: + import jax.numpy as jnp + from jax._src.core import InconclusiveDimensionOperation + + from docarray.typing import JaxArray + + +@pytest.mark.jax +def test_proto_tensor(): + from docarray.proto.pb2.docarray_pb2 import NdArrayProto + + tensor = parse_obj_as(JaxArray, jnp.zeros((3, 224, 224))) + proto = tensor.to_protobuf() + assert isinstance(proto, NdArrayProto) + + from_proto = JaxArray.from_protobuf(proto) + assert isinstance(from_proto, JaxArray) + assert 
jnp.allclose(tensor.tensor, from_proto.tensor) + + +@pytest.mark.jax +def test_json_schema(): + schema_json_of(JaxArray) + + +@pytest.mark.jax +@pytest.mark.skip +def test_dump_json(): + tensor = parse_obj_as(JaxArray, jnp.zeros((2, 56, 56))) + orjson_dumps(tensor) + + +@pytest.mark.jax +def test_unwrap(): + tf_tensor = parse_obj_as(JaxArray, jnp.zeros((3, 224, 224))) + unwrapped = tf_tensor.unwrap() + + assert not isinstance(unwrapped, JaxArray) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(unwrapped, jnp.ndarray) + + assert np.allclose(unwrapped, np.zeros((3, 224, 224))) + + +@pytest.mark.jax +def test_from_ndarray(): + nd = np.array([1, 2, 3]) + tensor = JaxArray.from_ndarray(nd) + assert isinstance(tensor, JaxArray) + assert isinstance(tensor.tensor, jnp.ndarray) + + +@pytest.mark.jax +def test_ellipsis_in_shape(): + # ellipsis in the end, two extra dimensions needed + tf_tensor = parse_obj_as(JaxArray[3, ...], jnp.zeros((3, 128, 224))) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (3, 128, 224) + + # ellipsis in the beginning, two extra dimensions needed + tf_tensor = parse_obj_as(JaxArray[..., 224], jnp.zeros((3, 128, 224))) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (3, 128, 224) + + # more than one ellipsis in the shape + with pytest.raises(ValueError): + parse_obj_as(JaxArray[3, ..., 128, ...], jnp.zeros((3, 128, 224))) + + # wrong shape + with pytest.raises(ValueError): + parse_obj_as(JaxArray[3, 224, ...], jnp.zeros((3, 128, 224))) + + +@pytest.mark.jax +def test_parametrized(): + # correct shape, single axis + tf_tensor = parse_obj_as(JaxArray[128], jnp.zeros(128)) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (128,) + + # correct shape, multiple axis + tf_tensor = parse_obj_as(JaxArray[3, 224, 
224], jnp.zeros((3, 224, 224))) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (3, 224, 224) + + # wrong but reshapable shape + tf_tensor = parse_obj_as(JaxArray[3, 224, 224], jnp.zeros((224, 3, 224))) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (3, 224, 224) + + # wrong and not reshapable shape + with pytest.raises(InconclusiveDimensionOperation): + parse_obj_as(JaxArray[3, 224, 224], jnp.zeros((224, 224))) + + +@pytest.mark.jax +def test_parametrized_with_str(): + # test independent variable dimensions + tf_tensor = parse_obj_as(JaxArray[3, 'x', 'y'], jnp.zeros((3, 224, 224))) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (3, 224, 224) + + tf_tensor = parse_obj_as(JaxArray[3, 'x', 'y'], jnp.zeros((3, 60, 128))) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (3, 60, 128) + + with pytest.raises(ValueError): + parse_obj_as(JaxArray[3, 'x', 'y'], jnp.zeros((4, 224, 224))) + + with pytest.raises(ValueError): + parse_obj_as(JaxArray[3, 'x', 'y'], jnp.zeros((100, 1))) + + # test dependent variable dimensions + tf_tensor = parse_obj_as(JaxArray[3, 'x', 'x'], jnp.zeros((3, 224, 224))) + assert isinstance(tf_tensor, JaxArray) + assert isinstance(tf_tensor.tensor, jnp.ndarray) + assert tf_tensor.tensor.shape == (3, 224, 224) + + with pytest.raises(ValueError): + _ = parse_obj_as(JaxArray[3, 'x', 'x'], jnp.zeros((3, 60, 128))) + + with pytest.raises(ValueError): + _ = parse_obj_as(JaxArray[3, 'x', 'x'], jnp.zeros((3, 60))) + + +@pytest.mark.jax +@pytest.mark.parametrize('shape', [(3, 224, 224), (224, 224, 3)]) +def test_parameterized_tensor_class_name(shape): + MyTFT = JaxArray[3, 224, 224] + tensor = parse_obj_as(MyTFT, jnp.zeros(shape)) + + assert 
MyTFT.__name__ == 'JaxArray[3, 224, 224]' + assert MyTFT.__qualname__ == 'JaxArray[3, 224, 224]' + + assert tensor.__class__.__name__ == 'JaxArray' + assert tensor.__class__.__qualname__ == 'JaxArray' + assert f'{tensor.tensor[0][0][0]}' == '0.0' + + +@pytest.mark.jax +def test_parametrized_subclass(): + c1 = JaxArray[128] + c2 = JaxArray[128] + assert issubclass(c1, c2) + assert issubclass(c1, JaxArray) + + assert not issubclass(c1, JaxArray[256]) + + +@pytest.mark.jax +def test_parametrized_instance(): + t = parse_obj_as(JaxArray[128], jnp.zeros((128,))) + assert isinstance(t, JaxArray[128]) + assert isinstance(t, JaxArray) + # assert isinstance(t, jnp.ndarray) + + assert not isinstance(t, JaxArray[256]) + assert not isinstance(t, JaxArray[2, 128]) + assert not isinstance(t, JaxArray[2, 2, 64]) + + +@pytest.mark.jax +def test_parametrized_equality(): + t1 = parse_obj_as(JaxArray[128], jnp.zeros((128,))) + t2 = parse_obj_as(JaxArray[128], jnp.zeros((128,))) + assert jnp.allclose(t1.tensor, t2.tensor) + + +@pytest.mark.jax +def test_parametrized_operations(): + t1 = parse_obj_as(JaxArray[128], jnp.zeros((128,))) + t2 = parse_obj_as(JaxArray[128], jnp.zeros((128,))) + t_result = t1.tensor + t2.tensor + assert isinstance(t_result, jnp.ndarray) + assert not isinstance(t_result, JaxArray) + assert not isinstance(t_result, JaxArray[128]) + + +@pytest.mark.jax +def test_set_item(): + t = JaxArray(tensor=jnp.zeros((3, 224, 224))) + t[0] = jnp.ones((1, 224, 224)) + assert jnp.allclose(t.tensor[0], jnp.ones((1, 224, 224))) + assert jnp.allclose(t.tensor[1], jnp.zeros((1, 224, 224))) + assert jnp.allclose(t.tensor[2], jnp.zeros((1, 224, 224))) diff --git a/tests/units/typing/tensor/test_ndarray.py b/tests/units/typing/tensor/test_ndarray.py new file mode 100644 index 00000000000..93ed58b3824 --- /dev/null +++ b/tests/units/typing/tensor/test_ndarray.py @@ -0,0 +1,279 @@ +import numpy as np +import orjson +import pytest +import torch +from pydantic.tools import parse_obj_as, 
schema_json_of + +from docarray import BaseDoc +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import AudioNdArray, NdArray, TorchTensor +from docarray.typing.tensor import NdArrayEmbedding +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + +@pytest.mark.proto +def test_proto_tensor(): + tensor = parse_obj_as(NdArray, np.zeros((3, 224, 224))) + + tensor._to_node_protobuf() + + +def test_from_list(): + tensor = parse_obj_as(NdArray, [[0.0, 0.0], [0.0, 0.0]]) + + assert (tensor == np.zeros((2, 2))).all() + + +def test_json_schema(): + schema_json_of(NdArray) + + +def test_dump_json(): + tensor = parse_obj_as(NdArray, np.zeros((3, 224, 224))) + orjson_dumps(tensor) + + +def test_load_json(): + tensor = parse_obj_as(NdArray, np.zeros((2, 2))) + + json = orjson_dumps(tensor) + print(json) + print(type(json)) + new_tensor = orjson.loads(json) + + assert (new_tensor == tensor).all() + + +def test_unwrap(): + tensor = parse_obj_as(NdArray, np.zeros((3, 224, 224))) + ndarray = tensor.unwrap() + + assert not isinstance(ndarray, NdArray) + assert isinstance(ndarray, np.ndarray) + assert isinstance(tensor, NdArray) + assert (ndarray == np.zeros((3, 224, 224))).all() + + +@pytest.mark.parametrize( + 'tensor_class, tensor_type, tensor_fn', + [(NdArray, np.ndarray, np.zeros), (TorchTensor, torch.Tensor, torch.zeros)], +) +def test_ellipsis_in_shape(tensor_class, tensor_type, tensor_fn): + # ellipsis in the end, two extra dimensions needed + tensor = parse_obj_as(tensor_class[3, ...], tensor_fn((3, 128, 224))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (3, 128, 224) + + # ellipsis in the middle, one extra dimension needed + tensor = parse_obj_as(tensor_class[3, ..., 224], tensor_fn((3, 128, 224))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape 
== (3, 128, 224) + + # ellipsis in the beginning, two extra dimensions needed + tensor = parse_obj_as(tensor_class[..., 224], tensor_fn((3, 128, 224))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (3, 128, 224) + + # more than one ellipsis in the shape + with pytest.raises(ValueError): + parse_obj_as(tensor_class[3, ..., 128, ...], tensor_fn((3, 128, 224))) + + # bigger dimension than expected + with pytest.raises(ValueError): + parse_obj_as(tensor_class[3, 128, 224, ...], tensor_fn((3, 128))) + + # no extra dimension needed + with pytest.raises(ValueError): + parse_obj_as(tensor_class[3, 128, 224, ...], tensor_fn((3, 128, 224))) + + # wrong shape + with pytest.raises(ValueError): + parse_obj_as(tensor_class[3, 224, ...], tensor_fn((3, 128, 224))) + + # passing only ellipsis as a shape + with pytest.raises(TypeError): + parse_obj_as(tensor_class[...], tensor_fn((3, 128, 224))) + + +@pytest.mark.parametrize( + 'tensor_class, tensor_type, tensor_fn', + [(NdArray, np.ndarray, np.zeros), (TorchTensor, torch.Tensor, torch.zeros)], +) +def test_parametrized(tensor_class, tensor_type, tensor_fn): + # correct shape, single axis + tensor = parse_obj_as(tensor_class[128], tensor_fn(128)) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (128,) + + # correct shape, multiple axis + tensor = parse_obj_as(tensor_class[3, 224, 224], tensor_fn((3, 224, 224))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (3, 224, 224) + + # wrong but reshapable shape + tensor = parse_obj_as(tensor_class[3, 224, 224], tensor_fn((3, 224, 224))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (3, 224, 224) + + # wrong and not reshapable shape + with pytest.raises(ValueError): + parse_obj_as(tensor_class[3, 224, 224], tensor_fn((224, 224))) + + # test independent 
variable dimensions + tensor = parse_obj_as(tensor_class[3, 'x', 'y'], tensor_fn((3, 224, 224))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (3, 224, 224) + + tensor = parse_obj_as(tensor_class[3, 'x', 'y'], tensor_fn((3, 60, 128))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (3, 60, 128) + + with pytest.raises(ValueError): + parse_obj_as(tensor_class[3, 'x', 'y'], tensor_fn((4, 224, 224))) + + with pytest.raises(ValueError): + parse_obj_as(tensor_class[3, 'x', 'y'], tensor_fn((100, 1))) + + # test dependent variable dimensions + tensor = parse_obj_as(tensor_class[3, 'x', 'x'], tensor_fn((3, 224, 224))) + assert isinstance(tensor, tensor_class) + assert isinstance(tensor, tensor_type) + assert tensor.shape == (3, 224, 224) + + with pytest.raises(ValueError): + tensor = parse_obj_as(tensor_class[3, 'x', 'x'], tensor_fn((3, 60, 128))) + + with pytest.raises(ValueError): + tensor = parse_obj_as(tensor_class[3, 'x', 'x'], tensor_fn((3, 60))) + + +def test_np_embedding(): + # correct shape + tensor = parse_obj_as(NdArrayEmbedding[128], np.zeros((128,))) + assert isinstance(tensor, NdArrayEmbedding) + assert isinstance(tensor, NdArray) + assert isinstance(tensor, np.ndarray) + assert tensor.shape == (128,) + + # wrong shape at data setting time + with pytest.raises(ValueError): + parse_obj_as(NdArrayEmbedding[128], np.zeros((256,))) + + # illegal shape at class creation time + with pytest.raises(ValueError): + parse_obj_as(NdArrayEmbedding[128, 128], np.zeros((128, 128))) + + +def test_parametrized_subclass(): + c1 = NdArray[128] + c2 = NdArray[128] + assert issubclass(c1, c2) + assert issubclass(c1, NdArray) + assert issubclass(c1, np.ndarray) + + assert not issubclass(c1, NdArray[256]) + + +def test_parametrized_instance(): + t = parse_obj_as(NdArray[128], np.zeros(128)) + assert isinstance(t, NdArray[128]) + assert isinstance(t, NdArray) + 
assert isinstance(t, np.ndarray) + + assert not isinstance(t, NdArray[256]) + assert not isinstance(t, NdArray[2, 64]) + assert not isinstance(t, NdArray[2, 2, 32]) + + +def test_parametrized_equality(): + t1 = parse_obj_as(NdArray[128], np.zeros(128)) + t2 = parse_obj_as(NdArray[128], np.zeros(128)) + t3 = parse_obj_as(NdArray[128], np.ones(128)) + assert (t1 == t2).all() + assert not (t1 == t3).any() + + +def test_parametrized_operations(): + t1 = parse_obj_as(NdArray[128], np.zeros(128)) + t2 = parse_obj_as(NdArray[128], np.zeros(128)) + t_result = t1 + t2 + assert isinstance(t_result, np.ndarray) + assert isinstance(t_result, NdArray) + assert isinstance(t_result, NdArray[128]) + + +def test_class_equality(): + assert NdArray == NdArray + assert NdArray[128] == NdArray[128] + assert NdArray[128] != NdArray[256] + assert NdArray[128] != NdArray[2, 64] + assert not NdArray[128] == NdArray[2, 64] + + assert NdArrayEmbedding == NdArrayEmbedding + assert NdArrayEmbedding[128] == NdArrayEmbedding[128] + assert NdArrayEmbedding[128] != NdArrayEmbedding[256] + + assert AudioNdArray == AudioNdArray + assert AudioNdArray[128] == AudioNdArray[128] + assert AudioNdArray[128] != AudioNdArray[256] + + +def test_class_hash(): + assert hash(NdArray) == hash(NdArray) + assert hash(NdArray[128]) == hash(NdArray[128]) + assert hash(NdArray[128]) != hash(NdArray[256]) + assert hash(NdArray[128]) != hash(NdArray[2, 64]) + assert not hash(NdArray[128]) == hash(NdArray[2, 64]) + + assert hash(NdArrayEmbedding) == hash(NdArrayEmbedding) + assert hash(NdArrayEmbedding[128]) == hash(NdArrayEmbedding[128]) + assert hash(NdArrayEmbedding[128]) != hash(NdArrayEmbedding[256]) + + assert hash(AudioNdArray) == hash(AudioNdArray) + assert hash(AudioNdArray[128]) == hash(AudioNdArray[128]) + assert hash(AudioNdArray[128]) != hash(AudioNdArray[256]) + + +@pytest.mark.parametrize( + 'tensor', + [ + torch.zeros(10), + TorchTensor(torch.zeros(10)), + np.zeros(10), + ], +) +def 
test_torch_numpy_to_ndarray(tensor): + class MyAudioDoc(BaseDoc): + tensor: NdArray + + doc = MyAudioDoc(tensor=tensor) + assert isinstance(doc.tensor, np.ndarray) + assert isinstance(doc.tensor, NdArray) + assert isinstance(doc.tensor, NdArray[10]) + + +@pytest.mark.tensorflow +def test_tensorflow_to_ndarray(): + class MyAudioDoc(BaseDoc): + tensor: NdArray + + doc = MyAudioDoc( + tensor=tf.zeros( + 10, + ) + ) + assert isinstance(doc.tensor, np.ndarray) + assert isinstance(doc.tensor, NdArray) + assert isinstance(doc.tensor, NdArray[10]) diff --git a/tests/units/typing/tensor/test_np_ops.py b/tests/units/typing/tensor/test_np_ops.py new file mode 100644 index 00000000000..27da03c5aee --- /dev/null +++ b/tests/units/typing/tensor/test_np_ops.py @@ -0,0 +1,38 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np + +from docarray import BaseDoc +from docarray.typing import NdArray + + +def test_tensor_ops(): + class A(BaseDoc): + tensor: NdArray[3, 224, 224] + + class B(BaseDoc): + tensor: NdArray[3, 112, 224] + + tensor = A(tensor=np.ones((3, 224, 224))).tensor + tensord = A(tensor=np.ones((3, 224, 224))).tensor + tensorn = np.zeros((3, 224, 224)) + tensorhalf = B(tensor=np.ones((3, 112, 224))).tensor + tensorfull = np.concatenate([tensorhalf, tensorhalf], axis=1) + + assert type(tensor) == NdArray + assert type(tensor + tensord) == NdArray + assert type(tensor + tensorn) == NdArray + assert type(tensor + tensorfull) == NdArray diff --git a/tests/units/typing/tensor/test_tensor.py b/tests/units/typing/tensor/test_tensor.py new file mode 100644 index 00000000000..6d506201cbc --- /dev/null +++ b/tests/units/typing/tensor/test_tensor.py @@ -0,0 +1,48 @@ +import numpy as np +import pytest +import torch + +from docarray import BaseDoc +from docarray.typing import AnyTensor, NdArray, TorchTensor +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + from docarray.computation.tensorflow_backend import tnp + from docarray.typing import TensorFlowTensor + + +@pytest.mark.parametrize( + 'tensor,cls_audio_tensor,cls_tensor', + [ + (torch.zeros(1000, 2), TorchTensor, torch.Tensor), + (np.zeros((1000, 2)), NdArray, np.ndarray), + ], +) +def test_torch_ndarray_to_any_tensor(tensor, cls_audio_tensor, cls_tensor): + class MyTensorDoc(BaseDoc): + tensor: AnyTensor + + doc = MyTensorDoc(tensor=tensor) + assert isinstance(doc.tensor, cls_audio_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert doc.tensor.shape == (1000, 2) + assert (doc.tensor == tensor).all() + + +@pytest.mark.tensorflow +def test_tensorflow_to_any_tensor(): + class MyTensorDoc(BaseDoc): + tensor: AnyTensor + + doc = MyTensorDoc(tensor=tf.zeros((1000, 2))) + assert isinstance(doc.tensor, TensorFlowTensor) + 
assert isinstance(doc.tensor.tensor, tf.Tensor) + assert tnp.allclose(doc.tensor.tensor, tf.zeros((1000, 2))) + + +def test_equals_type(): + # see https://github.com/docarray/docarray/pull/1739 + assert not (TorchTensor == type) diff --git a/tests/units/typing/tensor/test_tensor_coercion.py b/tests/units/typing/tensor/test_tensor_coercion.py new file mode 100644 index 00000000000..e358e0eb7ee --- /dev/null +++ b/tests/units/typing/tensor/test_tensor_coercion.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest +import torch +from pydantic import parse_obj_as + +from docarray.typing import NdArray, TorchTensor +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + from docarray.typing import TensorFlowTensor +else: + + ### This is needed to fake the import of tensorflow when it is not installed + class TfNotInstalled: + def zeros(self, *args, **kwargs): + return 0 + + class TensorFlowTensor: + def _docarray_from_native(self, *args, **kwargs): + return 0 + + tf = TfNotInstalled() + + +pure_tensor_to_test = [ + np.zeros((3, 224, 224)), + torch.zeros(3, 224, 224), + tf.zeros((3, 224, 224)), +] + +docarray_tensor_to_test = [ + NdArray._docarray_from_native(np.zeros((3, 224, 224))), + TorchTensor._docarray_from_native(torch.zeros(3, 224, 224)), + TensorFlowTensor._docarray_from_native(tf.zeros((3, 224, 224))), +] + + +@pytest.mark.tensorflow +@pytest.mark.parametrize('tensor', pure_tensor_to_test + docarray_tensor_to_test) +@pytest.mark.parametrize('tensor_cls', [NdArray, TorchTensor, TensorFlowTensor]) +def test_torch_tensor_coerse(tensor_cls, tensor): + t = parse_obj_as(tensor_cls, tensor) + assert isinstance(t, tensor_cls) + + t_numpy = t._docarray_to_ndarray() + assert t_numpy.shape == (3, 224, 224) + assert (t_numpy == np.zeros((3, 224, 224))).all() diff --git a/tests/units/typing/tensor/test_tensor_flow_tensor.py b/tests/units/typing/tensor/test_tensor_flow_tensor.py new file 
mode 100644 index 00000000000..3a51f5a95aa --- /dev/null +++ b/tests/units/typing/tensor/test_tensor_flow_tensor.py @@ -0,0 +1,201 @@ +import numpy as np +import pytest +from pydantic import schema_json_of +from pydantic.tools import parse_obj_as + +from docarray.base_doc.io.json import orjson_dumps +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp # type: ignore + from tensorflow.python.framework.errors_impl import InvalidArgumentError + + from docarray.typing import TensorFlowTensor + + +@pytest.mark.tensorflow +def test_proto_tensor(): + from docarray.proto.pb2.docarray_pb2 import NdArrayProto + + tensor = parse_obj_as(TensorFlowTensor, tf.zeros((3, 224, 224))) + proto = tensor.to_protobuf() + assert isinstance(proto, NdArrayProto) + + from_proto = TensorFlowTensor.from_protobuf(proto) + assert isinstance(from_proto, TensorFlowTensor) + assert tnp.allclose(tensor.tensor, from_proto.tensor) + + +@pytest.mark.tensorflow +def test_json_schema(): + schema_json_of(TensorFlowTensor) + + +@pytest.mark.tensorflow +def test_dump_json(): + tensor = parse_obj_as(TensorFlowTensor, tf.zeros((3, 224, 224))) + orjson_dumps(tensor) + + +@pytest.mark.tensorflow +def test_unwrap(): + tf_tensor = parse_obj_as(TensorFlowTensor, tf.zeros((3, 224, 224))) + unwrapped = tf_tensor.unwrap() + + assert not isinstance(unwrapped, TensorFlowTensor) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(unwrapped, tf.Tensor) + + assert np.allclose(unwrapped, np.zeros((3, 224, 224))) + + +@pytest.mark.tensorflow +def test_from_ndarray(): + nd = np.array([1, 2, 3]) + tensor = TensorFlowTensor.from_ndarray(nd) + assert isinstance(tensor, TensorFlowTensor) + assert isinstance(tensor.tensor, tf.Tensor) + + +@pytest.mark.tensorflow +def test_ellipsis_in_shape(): + # ellipsis in the end, two extra dimensions needed + tf_tensor = 
parse_obj_as(TensorFlowTensor[3, ...], tf.zeros((3, 128, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 128, 224) + + # ellipsis in the beginning, two extra dimensions needed + tf_tensor = parse_obj_as(TensorFlowTensor[..., 224], tf.zeros((3, 128, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 128, 224) + + # more than one ellipsis in the shape + with pytest.raises(ValueError): + parse_obj_as(TensorFlowTensor[3, ..., 128, ...], tf.zeros((3, 128, 224))) + + # wrong shape + with pytest.raises(ValueError): + parse_obj_as(TensorFlowTensor[3, 224, ...], tf.zeros((3, 128, 224))) + + +@pytest.mark.tensorflow +def test_parametrized(): + # correct shape, single axis + tf_tensor = parse_obj_as(TensorFlowTensor[128], tf.zeros(128)) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (128,) + + # correct shape, multiple axis + tf_tensor = parse_obj_as(TensorFlowTensor[3, 224, 224], tf.zeros((3, 224, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + # wrong but reshapable shape + tf_tensor = parse_obj_as(TensorFlowTensor[3, 224, 224], tf.zeros((224, 3, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + # wrong and not reshapable shape + with pytest.raises(InvalidArgumentError): + parse_obj_as(TensorFlowTensor[3, 224, 224], tf.zeros((224, 224))) + + +@pytest.mark.tensorflow +def test_parametrized_with_str(): + # test independent variable dimensions + tf_tensor = parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((3, 224, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert 
isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + tf_tensor = parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((3, 60, 128))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 60, 128) + + with pytest.raises(ValueError): + parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((4, 224, 224))) + + with pytest.raises(ValueError): + parse_obj_as(TensorFlowTensor[3, 'x', 'y'], tf.zeros((100, 1))) + + # test dependent variable dimensions + tf_tensor = parse_obj_as(TensorFlowTensor[3, 'x', 'x'], tf.zeros((3, 224, 224))) + assert isinstance(tf_tensor, TensorFlowTensor) + assert isinstance(tf_tensor.tensor, tf.Tensor) + assert tf_tensor.tensor.shape == (3, 224, 224) + + with pytest.raises(ValueError): + _ = parse_obj_as(TensorFlowTensor[3, 'x', 'x'], tf.zeros((3, 60, 128))) + + with pytest.raises(ValueError): + _ = parse_obj_as(TensorFlowTensor[3, 'x', 'x'], tf.zeros((3, 60))) + + +@pytest.mark.tensorflow +@pytest.mark.parametrize('shape', [(3, 224, 224), (224, 224, 3)]) +def test_parameterized_tensor_class_name(shape): + MyTFT = TensorFlowTensor[3, 224, 224] + tensor = parse_obj_as(MyTFT, tf.zeros(shape)) + + assert MyTFT.__name__ == 'TensorFlowTensor[3, 224, 224]' + assert MyTFT.__qualname__ == 'TensorFlowTensor[3, 224, 224]' + + assert tensor.__class__.__name__ == 'TensorFlowTensor' + assert tensor.__class__.__qualname__ == 'TensorFlowTensor' + assert f'{tensor.tensor[0][0][0]}' == '0.0' + + +@pytest.mark.tensorflow +def test_parametrized_subclass(): + c1 = TensorFlowTensor[128] + c2 = TensorFlowTensor[128] + assert issubclass(c1, c2) + assert issubclass(c1, TensorFlowTensor) + + assert not issubclass(c1, TensorFlowTensor[256]) + + +@pytest.mark.tensorflow +def test_parametrized_instance(): + t = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + assert isinstance(t, TensorFlowTensor[128]) + assert isinstance(t, TensorFlowTensor) + # 
assert isinstance(t, tf.Tensor) + + assert not isinstance(t, TensorFlowTensor[256]) + assert not isinstance(t, TensorFlowTensor[2, 128]) + assert not isinstance(t, TensorFlowTensor[2, 2, 64]) + + +@pytest.mark.tensorflow +def test_parametrized_equality(): + t1 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + t2 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + assert tf.experimental.numpy.allclose(t1.tensor, t2.tensor) + + +@pytest.mark.tensorflow +def test_parametrized_operations(): + t1 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + t2 = parse_obj_as(TensorFlowTensor[128], tf.zeros((128,))) + t_result = t1.tensor + t2.tensor + assert isinstance(t_result, tf.Tensor) + assert not isinstance(t_result, TensorFlowTensor) + assert not isinstance(t_result, TensorFlowTensor[128]) + + +@pytest.mark.tensorflow +def test_set_item(): + t = TensorFlowTensor(tensor=tf.zeros((3, 224, 224))) + t[0] = tf.ones((1, 224, 224)) + assert tnp.allclose(t.tensor[0], tf.ones((1, 224, 224))) + assert tnp.allclose(t.tensor[1], tf.zeros((1, 224, 224))) + assert tnp.allclose(t.tensor[2], tf.zeros((1, 224, 224))) diff --git a/tests/units/typing/tensor/test_torch_ops.py b/tests/units/typing/tensor/test_torch_ops.py new file mode 100644 index 00000000000..7e6e4a54f96 --- /dev/null +++ b/tests/units/typing/tensor/test_torch_ops.py @@ -0,0 +1,38 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from docarray import BaseDoc +from docarray.typing import TorchTensor + + +def test_tensor_ops(): + class A(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + class B(BaseDoc): + tensor: TorchTensor[3, 112, 224] + + tensor = A(tensor=torch.ones(3, 224, 224)).tensor + tensord = A(tensor=torch.ones(3, 224, 224)).tensor + tensorn = torch.zeros(3, 224, 224) + tensorhalf = B(tensor=torch.ones(3, 112, 224)).tensor + tensorfull = torch.cat([tensorhalf, tensorhalf], dim=1) + + assert type(tensor) == TorchTensor + assert type(tensor + tensord) == TorchTensor + assert type(tensor + tensorn) == TorchTensor + assert type(tensor + tensorfull) == TorchTensor diff --git a/tests/units/typing/tensor/test_torch_tensor.py b/tests/units/typing/tensor/test_torch_tensor.py new file mode 100644 index 00000000000..dbe8b58a8e5 --- /dev/null +++ b/tests/units/typing/tensor/test_torch_tensor.py @@ -0,0 +1,250 @@ +import pytest +import torch +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray import BaseDoc +from docarray.base_doc.io.json import orjson_dumps +from docarray.proto import DocProto +from docarray.typing import TorchEmbedding, TorchTensor + + +class MyDoc(BaseDoc): + tens: TorchTensor + + +@pytest.mark.proto +def test_proto_tensor(): + tensor = parse_obj_as(TorchTensor, torch.zeros(3, 224, 224)) + + tensor._to_node_protobuf() + + +def test_json_schema(): + schema_json_of(TorchTensor) + + +def test_dump_json(): + tensor = parse_obj_as(TorchTensor, torch.zeros(3, 224, 224)) + orjson_dumps(tensor) + + +def test_unwrap(): + tensor = 
parse_obj_as(TorchTensor, torch.zeros(3, 224, 224)) + ndarray = tensor.unwrap() + + assert not isinstance(ndarray, TorchTensor) + assert isinstance(tensor, TorchTensor) + assert isinstance(ndarray, torch.Tensor) + + assert tensor.data_ptr() == ndarray.data_ptr() + + assert (ndarray == torch.zeros(3, 224, 224)).all() + + +def test_parametrized_correct_axis_shape(): + # correct shape, single axis + tensor = parse_obj_as(TorchTensor[128], torch.zeros(128)) + assert isinstance(tensor, TorchTensor) + assert isinstance(tensor, torch.Tensor) + assert tensor.shape == (128,) + + +def test_correct_shape_multiple_axis(): + # correct shape, multiple axis + tensor = parse_obj_as(TorchTensor[3, 224, 224], torch.zeros(3, 224, 224)) + assert isinstance(tensor, TorchTensor) + assert isinstance(tensor, torch.Tensor) + assert tensor.shape == (3, 224, 224) + + +def test_wrong_but_reshapable(): + # wrong but reshapable shape + tensor = parse_obj_as(TorchTensor[3, 224, 224], torch.zeros(224, 3, 224)) + assert isinstance(tensor, TorchTensor) + assert isinstance(tensor, torch.Tensor) + assert tensor.shape == (3, 224, 224) + + # wrong and not reshapable shape + with pytest.raises(ValueError): + parse_obj_as(TorchTensor[3, 224, 224], torch.zeros(224, 224)) + + +def test_independent_variable_dim(): + # test independent variable dimensions + tensor = parse_obj_as(TorchTensor[3, 'x', 'y'], torch.zeros(3, 224, 224)) + assert isinstance(tensor, TorchTensor) + assert isinstance(tensor, torch.Tensor) + assert tensor.shape == (3, 224, 224) + + +def test_param(): + tensor = parse_obj_as(TorchTensor[3, 'x', 'y'], torch.zeros(3, 60, 128)) + assert isinstance(tensor, TorchTensor) + assert isinstance(tensor, torch.Tensor) + assert tensor.shape == (3, 60, 128) + + with pytest.raises(ValueError): + parse_obj_as(TorchTensor[3, 'x', 'y'], torch.zeros(4, 224, 224)) + + with pytest.raises(ValueError): + parse_obj_as(TorchTensor[3, 'x', 'y'], torch.zeros(100, 1)) + + +def test_dependent_variable_dim(): + # 
test dependent variable dimensions + tensor = parse_obj_as(TorchTensor[3, 'x', 'x'], torch.zeros(3, 224, 224)) + assert isinstance(tensor, TorchTensor) + assert isinstance(tensor, torch.Tensor) + assert tensor.shape == (3, 224, 224) + + with pytest.raises(ValueError): + _ = parse_obj_as(TorchTensor[3, 'x', 'x'], torch.zeros(3, 60, 128)) + + with pytest.raises(ValueError): + _ = parse_obj_as(TorchTensor[3, 'x', 'x'], torch.zeros(3, 60)) + + +@pytest.mark.parametrize('shape', [(3, 224, 224), (224, 224, 3)]) +def test_parameterized_tensor_class_name(shape): + MyTT = TorchTensor[3, 224, 224] + tensor = parse_obj_as(MyTT, torch.zeros(shape)) + + assert MyTT.__name__ == 'TorchTensor[3, 224, 224]' + assert MyTT.__qualname__ == 'TorchTensor[3, 224, 224]' + + assert tensor.__class__.__name__ == 'TorchTensor' + assert tensor.__class__.__qualname__ == 'TorchTensor' + assert f'{tensor[0][0][0]}' == 'TorchTensor(0.)' + + +def test_torch_embedding(): + # correct shape + tensor = parse_obj_as(TorchEmbedding[128], torch.zeros(128)) + assert isinstance(tensor, TorchEmbedding) + assert isinstance(tensor, torch.Tensor) + assert tensor.shape == (128,) + + # wrong shape at data setting time + with pytest.raises(ValueError): + parse_obj_as(TorchEmbedding[128], torch.zeros(256)) + + # illegal shape at class creation time + with pytest.raises(ValueError): + parse_obj_as(TorchEmbedding[128, 128], torch.zeros(128, 128)) + + +def test_parametrized_subclass(): + c1 = TorchTensor[128] + c2 = TorchTensor[128] + assert issubclass(c1, c2) + assert issubclass(c1, TorchTensor) + assert issubclass(c1, torch.Tensor) + + assert not issubclass(c1, TorchTensor[256]) + + +def test_parametrized_instance(): + t = parse_obj_as(TorchTensor[128], torch.zeros(128)) + assert isinstance(t, TorchTensor[128]) + assert isinstance(t, TorchTensor) + assert isinstance(t, torch.Tensor) + + assert not isinstance(t, TorchTensor[256]) + assert not isinstance(t, TorchTensor[2, 128]) + assert not isinstance(t, 
TorchTensor[2, 2, 64]) + + +def test_parametrized_equality(): + t1 = parse_obj_as(TorchTensor[128], torch.zeros(128)) + t2 = parse_obj_as(TorchTensor[128], torch.zeros(128)) + assert (t1 == t2).all() + + +def test_parametrized_operations(): + t1 = parse_obj_as(TorchTensor[128], torch.zeros(128)) + t2 = parse_obj_as(TorchTensor[128], torch.zeros(128)) + t_result = t1 + t2 + assert isinstance(t_result, torch.Tensor) + assert isinstance(t_result, TorchTensor) + assert isinstance(t_result, TorchTensor[128]) + + +def test_deepcopy(): + from docarray import BaseDoc + + class MMdoc(BaseDoc): + embedding: TorchEmbedding + + doc = MMdoc(embedding=torch.randn(32)) + doc_copy = doc.copy(deep=True) + + assert doc.embedding.data_ptr() != doc_copy.embedding.data_ptr() + assert (doc.embedding == doc_copy.embedding).all() + + doc_copy.embedding = torch.randn(32) + assert not (doc.embedding == doc_copy.embedding).all() + + +def test_deepcopy_tensor(): + from docarray import BaseDoc + + class MMdoc(BaseDoc): + embedding: TorchTensor + + doc = MMdoc(embedding=torch.randn(32)) + doc_copy = doc.copy(deep=True) + + assert doc.embedding.data_ptr() != doc_copy.embedding.data_ptr() + assert (doc.embedding == doc_copy.embedding).all() + + doc_copy.embedding = torch.randn(32) + assert not (doc.embedding == doc_copy.embedding).all() + + +@pytest.mark.parametrize('requires_grad', [True]) # , False]) +def test_json_serialization(requires_grad: bool): + orig_doc = MyDoc(tens=torch.rand(10, requires_grad=requires_grad)) + serialized_doc = orig_doc.to_json() + assert serialized_doc + assert isinstance(serialized_doc, str) + + new_doc = MyDoc.from_json(serialized_doc) + assert len(new_doc.tens) == 10 + + +@pytest.mark.parametrize('protocol', ['pickle', 'protobuf']) +@pytest.mark.parametrize('requires_grad', [True, False]) +def test_bytes_serialization(requires_grad, protocol): + orig_doc = MyDoc(tens=torch.rand(10, requires_grad=requires_grad)) + serialized_doc = 
orig_doc.to_bytes(protocol=protocol) + assert serialized_doc + assert isinstance(serialized_doc, bytes) + + conv_doc = MyDoc.from_bytes(serialized_doc, protocol=protocol) + assert isinstance(conv_doc.tens, TorchTensor) + assert conv_doc.tens.shape == (10,) + + +@pytest.mark.parametrize('protocol', ['pickle', 'protobuf']) +@pytest.mark.parametrize('requires_grad', [True, False]) +def test_base64_serialization(requires_grad, protocol): + orig_doc = MyDoc(tens=torch.rand(10, requires_grad=requires_grad)) + serialized_doc = orig_doc.to_base64(protocol=protocol) + assert serialized_doc + assert isinstance(serialized_doc, str) + + conv_doc = MyDoc.from_base64(serialized_doc, protocol=protocol) + assert isinstance(conv_doc.tens, TorchTensor) + assert conv_doc.tens.shape == (10,) + + +@pytest.mark.parametrize('requires_grad', [True, False]) +def test_protobuf_serialization(requires_grad: bool): + orig_doc = MyDoc(tens=torch.rand(10, requires_grad=requires_grad)) + serialized_doc = orig_doc.to_protobuf() + assert serialized_doc + assert isinstance(serialized_doc, DocProto) + + conv_doc = MyDoc.from_protobuf(serialized_doc) + assert isinstance(conv_doc.tens, TorchTensor) + assert conv_doc.tens.shape == (10,) diff --git a/tests/units/typing/tensor/test_video_tensor.py b/tests/units/typing/tensor/test_video_tensor.py new file mode 100644 index 00000000000..7cd44537d18 --- /dev/null +++ b/tests/units/typing/tensor/test_video_tensor.py @@ -0,0 +1,219 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import pytest +import torch +from pydantic.tools import parse_obj_as + +from docarray import BaseDoc +from docarray.typing import ( + AudioNdArray, + AudioTorchTensor, + VideoBytes, + VideoNdArray, + VideoTensor, + VideoTorchTensor, +) +from docarray.utils._internal.misc import is_tf_available + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + import tensorflow._api.v2.experimental.numpy as tnp + + from docarray.typing.tensor.video import VideoTensorFlowTensor + + +@pytest.mark.parametrize( + 'tensor,cls_video_tensor,cls_tensor', + [ + (torch.zeros(1, 224, 224, 3), VideoTorchTensor, torch.Tensor), + (np.zeros((1, 224, 224, 3)), VideoNdArray, np.ndarray), + ], +) +def test_set_video_tensor(tensor, cls_video_tensor, cls_tensor): + class MyVideoDoc(BaseDoc): + tensor: cls_video_tensor + + doc = MyVideoDoc(tensor=tensor) + + assert isinstance(doc.tensor, cls_video_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert (doc.tensor == tensor).all() + + +@pytest.mark.tensorflow +def test_set_video_tensor_tensorflow(): + class MyVideoDoc(BaseDoc): + tensor: VideoTensorFlowTensor + + doc = MyVideoDoc(tensor=tf.zeros((1, 224, 224, 3))) + + assert isinstance(doc.tensor, VideoTensorFlowTensor) + assert isinstance(doc.tensor.tensor, tf.Tensor) + assert tnp.allclose(doc.tensor.tensor, tf.zeros((1, 224, 224, 3))) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor', + [ + (VideoNdArray, np.zeros((1, 224, 224, 3))), + (VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + (VideoTorchTensor, np.zeros((1, 
224, 224, 3))), + ], +) +def test_validation(cls_tensor, tensor): + arr = parse_obj_as(cls_tensor, tensor) + assert isinstance(arr, cls_tensor) + + +@pytest.mark.tensorflow +def test_validation_tensorflow(): + arr = parse_obj_as(VideoTensorFlowTensor, np.zeros((1, 224, 224, 3))) + assert isinstance(arr, VideoTensorFlowTensor) + + arr = parse_obj_as(VideoTensorFlowTensor, tf.zeros((1, 224, 224, 3))) + assert isinstance(arr, VideoTensorFlowTensor) + + arr = parse_obj_as(VideoTensorFlowTensor, torch.zeros((1, 224, 224, 3))) + assert isinstance(arr, VideoTensorFlowTensor) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor,expect_error', + [ + (VideoNdArray, torch.zeros(1, 224, 224, 3), False), + (VideoNdArray, torch.zeros(1, 224, 224, 100), True), + (VideoTorchTensor, torch.zeros(1, 224, 224, 3), False), + (VideoTorchTensor, torch.zeros(1, 224, 224, 100), True), + (VideoNdArray, 'hello', True), + (VideoTorchTensor, 'hello', True), + ], +) +def test_illegal_validation(cls_tensor, tensor, expect_error): + if expect_error: + with pytest.raises(ValueError): + parse_obj_as(cls_tensor, tensor) + else: + parse_obj_as(cls_tensor, tensor) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor,proto_key', + [ + ( + VideoTorchTensor, + torch.zeros(1, 224, 224, 3), + VideoTorchTensor._proto_type_name, + ), + (VideoNdArray, np.zeros((1, 224, 224, 3)), VideoNdArray._proto_type_name), + ], +) +def test_proto_tensor(cls_tensor, tensor, proto_key): + tensor = parse_obj_as(cls_tensor, tensor) + proto = tensor._to_node_protobuf() + assert proto_key in str(proto) + + +@pytest.mark.tensorflow +def test_proto_tensor_tensorflow(): + tensor = parse_obj_as(VideoTensorFlowTensor, tf.zeros((1, 224, 224, 3))) + proto = tensor._to_node_protobuf() + assert VideoTensorFlowTensor._proto_type_name in str(proto) + + +@pytest.mark.parametrize( + 'video_tensor', + [ + parse_obj_as(VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + parse_obj_as(VideoNdArray, np.zeros((1, 224, 224, 3))), + ], +) +def 
test_save_video_tensor_to_file(video_tensor, tmpdir): + tmp_file = str(tmpdir / 'tmp.mp4') + video_tensor.save(tmp_file) + assert os.path.isfile(tmp_file) + + +@pytest.mark.parametrize( + 'video_tensor', + [ + parse_obj_as(VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + parse_obj_as(VideoNdArray, np.zeros((1, 224, 224, 3))), + ], +) +def test_save_video_tensor_to_bytes(video_tensor): + b = video_tensor.to_bytes() + assert isinstance(b, bytes) + assert isinstance(b, VideoBytes) + + +@pytest.mark.tensorflow +def test_save_video_tensorflow_tensor_to_file(tmpdir): + tmp_file = str(tmpdir / 'tmp.mp4') + video_tensor = parse_obj_as(VideoTensorFlowTensor, tf.zeros((1, 224, 224, 3))) + video_tensor.save(tmp_file) + assert os.path.isfile(tmp_file) + + +@pytest.mark.parametrize( + 'video_tensor', + [ + parse_obj_as(VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + parse_obj_as(VideoNdArray, np.zeros((1, 224, 224, 3))), + ], +) +@pytest.mark.parametrize( + 'audio_tensor', + [ + parse_obj_as(AudioTorchTensor, torch.randn(100, 1, 1024).to(torch.float32)), + parse_obj_as(AudioNdArray, np.random.randn(100, 1, 1024).astype('float32')), + ], +) +def test_save_video_tensor_to_file_including_audio(video_tensor, audio_tensor, tmpdir): + tmp_file = str(tmpdir / 'tmp.mp4') + video_tensor.save(tmp_file, audio_tensor=audio_tensor) + assert os.path.isfile(tmp_file) + + +@pytest.mark.parametrize( + 'tensor,cls_audio_tensor,cls_tensor', + [ + (torch.zeros(2, 10, 10, 3), VideoTorchTensor, torch.Tensor), + (np.zeros((2, 10, 10, 3)), VideoNdArray, np.ndarray), + ], +) +def test_torch_ndarray_to_video_tensor(tensor, cls_audio_tensor, cls_tensor): + class MyAudioDoc(BaseDoc): + tensor: VideoTensor + + doc = MyAudioDoc(tensor=tensor) + assert isinstance(doc.tensor, cls_audio_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert (doc.tensor == tensor).all() + + +@pytest.mark.tensorflow +def test_tensorflow_to_video_tensor(): + class MyAudioDoc(BaseDoc): + tensor: VideoTensor + + doc = 
MyAudioDoc(tensor=tf.zeros((2, 10, 10, 3))) + assert isinstance(doc.tensor, VideoTensorFlowTensor) + assert isinstance(doc.tensor.tensor, tf.Tensor) + assert tnp.allclose(doc.tensor.tensor, tf.zeros((2, 10, 10, 3))) diff --git a/tests/units/typing/test_bytes.py b/tests/units/typing/test_bytes.py new file mode 100644 index 00000000000..4415f809db5 --- /dev/null +++ b/tests/units/typing/test_bytes.py @@ -0,0 +1,40 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +from pydantic import parse_obj_as + +from docarray.typing import ImageBytes, ImageTensor, ImageUrl + +CUR_DIR = os.path.dirname(os.path.abspath(__file__)) +PATH_TO_IMAGE_DATA = os.path.join(CUR_DIR, '..', '..', 'toydata', 'image-data') +IMAGE_PATHS = { + 'png': os.path.join(PATH_TO_IMAGE_DATA, 'so_good.png'), + 'jpg': os.path.join(PATH_TO_IMAGE_DATA, '05984.jpg'), + 'jpeg': os.path.join(PATH_TO_IMAGE_DATA, '05984-2.jpeg'), +} + + +def test_bytes_load(): + url = parse_obj_as(ImageUrl, IMAGE_PATHS['png']) + + tensor = parse_obj_as(ImageTensor, url.load()) + + bytes_ = parse_obj_as(ImageBytes, tensor.to_bytes()) + + tensor_new = parse_obj_as(ImageTensor, bytes_.load()) + + assert (tensor_new == tensor).all() diff --git a/tests/units/typing/test_id.py b/tests/units/typing/test_id.py new file mode 100644 index 00000000000..10eb46694b4 --- /dev/null +++ b/tests/units/typing/test_id.py @@ -0,0 +1,52 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from uuid import UUID + +import pytest +from pydantic import schema_json_of +from pydantic.tools import parse_obj_as + +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import ID + + +@pytest.mark.parametrize( + 'id', ['1234', 1234, UUID('cf57432e-809e-4353-adbd-9d5c0d733868')] +) +def test_id_validation(id): + parsed_id = parse_obj_as(ID, id) + + assert parsed_id == str(id) + + +def test_json_schema(): + schema_json_of(ID) + + +def test_dump_json(): + id = parse_obj_as(ID, 1234) + orjson_dumps(id) + + +@pytest.mark.parametrize( + 'id', ['1234', 1234, UUID('cf57432e-809e-4353-adbd-9d5c0d733868')] +) +def test_operators(id): + parsed_id = parse_obj_as(ID, id) + assert parsed_id == str(id) + assert parsed_id != 'aljdñjd' + assert str(id)[0:1] in parsed_id + assert 'docarray' not in parsed_id diff --git a/tests/units/typing/url/__init__.py b/tests/units/typing/url/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/typing/url/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/units/typing/url/test_any_url.py b/tests/units/typing/url/test_any_url.py new file mode 100644 index 00000000000..d6633f1fe8a --- /dev/null +++ b/tests/units/typing/url/test_any_url.py @@ -0,0 +1,59 @@ +import pytest +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import AnyUrl + + +@pytest.mark.proto +def test_proto_any_url(): + uri = parse_obj_as(AnyUrl, 'http://jina.ai/img.png') + + uri._to_node_protobuf() + + +def test_json_schema(): + schema_json_of(AnyUrl) + + +def test_dump_json(): + url = parse_obj_as(AnyUrl, 'http://jina.ai/img.png') + orjson_dumps(url) + + +@pytest.mark.parametrize( + 'relative_path', + [ + 'data/05978.jpg', + '../../data/05978.jpg', + ], +) +def test_relative_path(relative_path): + # see issue: https://github.com/docarray/docarray/issues/978 + url = parse_obj_as(AnyUrl, relative_path) + assert url == relative_path + + +def test_operators(): + url = parse_obj_as(AnyUrl, 'data/05978.jpg') + assert url == 'data/05978.jpg' + assert url != 'aljdñjd' + assert 'data' in url + assert 'docarray' not in url + + +def test_get_url_extension(): + # Test with a URL with extension + assert AnyUrl._get_url_extension('https://jina.ai/hey.md?model=gpt-4') == 'md' + assert AnyUrl._get_url_extension('https://jina.ai/text.txt') == 'txt' + assert AnyUrl._get_url_extension('bla.jpg') == 'jpg' + + # Test with a URL without extension + assert not AnyUrl._get_url_extension('https://jina.ai') + assert not AnyUrl._get_url_extension('https://jina.ai/?model=gpt-4') + + # Test with a text without extension + assert not AnyUrl._get_url_extension('some_text') + + # Test with empty input + assert not AnyUrl._get_url_extension('') diff --git a/tests/units/typing/url/test_audio_url.py b/tests/units/typing/url/test_audio_url.py new file mode 100644 index 00000000000..a787847abb0 --- /dev/null +++ b/tests/units/typing/url/test_audio_url.py @@ -0,0 +1,155 @@ +import os 
+from typing import Optional + +import numpy as np +import pytest +import torch +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray import BaseDoc +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import AudioBytes, AudioTorchTensor, AudioUrl +from docarray.typing.url.mimetypes import ( + AUDIO_MIMETYPE, + IMAGE_MIMETYPE, + OBJ_MIMETYPE, + TEXT_MIMETYPE, + VIDEO_MIMETYPE, +) +from docarray.utils._internal.misc import is_tf_available +from tests import TOYDATA_DIR + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + from docarray.typing.tensor.audio import AudioTensorFlowTensor + +AUDIO_FILES = [ + str(TOYDATA_DIR / 'hello.wav'), + str(TOYDATA_DIR / 'olleh.wav'), +] +REMOTE_AUDIO_FILE = 'https://github.com/docarray/docarray/blob/main/tests/toydata/olleh.wav?raw=true' # noqa: E501 + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [*AUDIO_FILES, REMOTE_AUDIO_FILE], +) +def test_audio_url(file_url): + uri = parse_obj_as(AudioUrl, file_url) + tensor, _ = uri.load() + assert isinstance(tensor, np.ndarray) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [*AUDIO_FILES, REMOTE_AUDIO_FILE], +) +def test_load_audio_url_to_audio_torch_tensor_field(file_url): + class MyAudioDoc(BaseDoc): + audio_url: AudioUrl + tensor: Optional[AudioTorchTensor] = None + + doc = MyAudioDoc(audio_url=file_url) + doc.tensor, _ = doc.audio_url.load() + + assert isinstance(doc.tensor, torch.Tensor) + assert isinstance(doc.tensor, AudioTorchTensor) + + +@pytest.mark.tensorflow +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [*AUDIO_FILES, REMOTE_AUDIO_FILE], +) +def test_load_audio_url_to_audio_tensorflow_tensor_field(file_url): + class MyAudioDoc(BaseDoc): + audio_url: AudioUrl + tensor: Optional[AudioTensorFlowTensor] = None + + doc = MyAudioDoc(audio_url=file_url) + doc.tensor, _ = doc.audio_url.load() + + 
assert isinstance(doc.tensor, AudioTensorFlowTensor) + assert isinstance(doc.tensor.tensor, tf.Tensor) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [*AUDIO_FILES, REMOTE_AUDIO_FILE], +) +def test_load(file_url): + url = parse_obj_as(AudioUrl, file_url) + tensor, _ = url.load() + assert isinstance(tensor, np.ndarray) + + +def test_json_schema(): + schema_json_of(AudioUrl) + + +def test_dump_json(): + url = parse_obj_as(AudioUrl, REMOTE_AUDIO_FILE) + orjson_dumps(url) + + +@pytest.mark.parametrize( + 'path_to_file', + [*AUDIO_FILES, REMOTE_AUDIO_FILE], +) +def test_validation(path_to_file): + url = parse_obj_as(AudioUrl, path_to_file) + assert isinstance(url, AudioUrl) + assert isinstance(url, str) + + +@pytest.mark.proto +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [*AUDIO_FILES, REMOTE_AUDIO_FILE], +) +def test_proto_audio_url(file_url): + uri = parse_obj_as(AudioUrl, file_url) + proto = uri._to_node_protobuf() + assert 'audio_url' in str(proto) + + +def test_load_bytes(): + uri = parse_obj_as(AudioUrl, REMOTE_AUDIO_FILE) + audio_bytes = uri.load_bytes() + assert isinstance(audio_bytes, bytes) + assert isinstance(audio_bytes, AudioBytes) + assert len(audio_bytes) > 0 + + +@pytest.mark.parametrize( + 'file_type, file_source', + [ + (AUDIO_MIMETYPE, AUDIO_FILES[0]), + (AUDIO_MIMETYPE, AUDIO_FILES[1]), + (AUDIO_MIMETYPE, REMOTE_AUDIO_FILE), + (IMAGE_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.png')), + (VIDEO_MIMETYPE, os.path.join(TOYDATA_DIR, 'mov_bbb.mp4')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test', 'test.html')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test', 'test.md')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'penal_colony.txt')), + (OBJ_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.glb')), + ], +) +def test_file_validation(file_type, file_source): + if file_type != AudioUrl.mime_type(): + with pytest.raises(ValueError): + parse_obj_as(AudioUrl, file_source) + else: + 
parse_obj_as(AudioUrl, file_source) diff --git a/tests/units/typing/url/test_image_url.py b/tests/units/typing/url/test_image_url.py new file mode 100644 index 00000000000..e5cc246da55 --- /dev/null +++ b/tests/units/typing/url/test_image_url.py @@ -0,0 +1,223 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import urllib + +import numpy as np +import PIL +import pytest +from PIL import Image +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import ImageUrl +from docarray.typing.url.mimetypes import ( + OBJ_MIMETYPE, + AUDIO_MIMETYPE, + VIDEO_MIMETYPE, + IMAGE_MIMETYPE, + TEXT_MIMETYPE, +) +from tests import TOYDATA_DIR + +CUR_DIR = os.path.dirname(os.path.abspath(__file__)) +PATH_TO_IMAGE_DATA = os.path.join(CUR_DIR, '..', '..', '..', 'toydata', 'image-data') +IMAGE_PATHS = { + 'png': os.path.join(PATH_TO_IMAGE_DATA, 'so_good.png'), + 'jpg': os.path.join(PATH_TO_IMAGE_DATA, '05984.jpg'), + 'jpeg': os.path.join(PATH_TO_IMAGE_DATA, '05984-2.jpeg'), +} +REMOTE_JPG = ( + 'https://upload.wikimedia.org/wikipedia/commons/8/80/' + 'Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg' +) + + +@pytest.mark.slow +@pytest.mark.internet +def test_image_url(): + uri = parse_obj_as(ImageUrl, REMOTE_JPG) + + tensor = uri.load() + + assert isinstance(tensor, np.ndarray) + + +@pytest.mark.proto +def test_proto_image_url(): + uri = parse_obj_as(ImageUrl, REMOTE_JPG) + + uri._to_node_protobuf() + + +def test_json_schema(): + schema_json_of(ImageUrl) + + +def test_dump_json(): + url = parse_obj_as(ImageUrl, 'http://jina.ai/img.png') + orjson_dumps(url) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('remote-jpg', REMOTE_JPG), + ], +) +def test_load(image_format, path_to_img): + url = parse_obj_as(ImageUrl, path_to_img) + tensor = url.load() + assert isinstance(tensor, np.ndarray) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('remote-jpg', REMOTE_JPG), + ], +) +def 
test_load_pil(image_format, path_to_img): + url = parse_obj_as(ImageUrl, path_to_img) + img = url.load_pil() + assert isinstance(img, PIL.Image.Image) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('remote-jpg', REMOTE_JPG), + ], +) +@pytest.mark.parametrize('width,height', [(224, None), (None, 224), (224, 224)]) +def test_load_width_height(image_format, path_to_img, width, height): + url = parse_obj_as(ImageUrl, path_to_img) + tensor = url.load(width=width, height=height) + assert isinstance(tensor, np.ndarray) + + shape = tensor.shape + if width: + assert shape[1] == width + if height: + assert shape[0] == height + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('remote-jpg', REMOTE_JPG), + ], +) +@pytest.mark.parametrize( + 'axis_layout', + [ + ('H', 'W', 'C'), + ('H', 'C', 'W'), + ('C', 'H', 'W'), + ('C', 'W', 'H'), + ('W', 'C', 'H'), + ('W', 'H', 'C'), + ], +) +def test_load_channel_axis(image_format, path_to_img, axis_layout): + sizes = {'H': 100, 'W': 200, 'C': 3} + url = parse_obj_as(ImageUrl, path_to_img) + tensor = url.load(axis_layout=axis_layout, height=sizes['H'], width=sizes['W']) + assert isinstance(tensor, np.ndarray) + + shape = tensor.shape + for axis, axis_name in enumerate(axis_layout): + assert shape[axis] == sizes[axis_name] + + +@pytest.mark.internet +def test_load_timeout(): + url = parse_obj_as(ImageUrl, REMOTE_JPG) + with pytest.raises(urllib.error.URLError): + _ = url.load(timeout=0.001) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('jpg', REMOTE_JPG), + ], +) +def 
test_load_to_bytes(image_format, path_to_img): + url = parse_obj_as(ImageUrl, path_to_img) + _bytes = url.load_bytes() + assert isinstance(_bytes, bytes) + img = Image.frombytes(mode='1', size=(224, 224), data=_bytes) + assert isinstance(img, Image.Image) + + +@pytest.mark.parametrize( + 'path_to_img', + [*IMAGE_PATHS.values(), REMOTE_JPG], +) +def test_validation(path_to_img): + url = parse_obj_as(ImageUrl, path_to_img) + assert isinstance(url, ImageUrl) + assert isinstance(url, str) + + +@pytest.mark.parametrize( + 'file_type, file_source', + [ + (IMAGE_MIMETYPE, IMAGE_PATHS['png']), + (IMAGE_MIMETYPE, IMAGE_PATHS['jpg']), + (IMAGE_MIMETYPE, IMAGE_PATHS['jpeg']), + (IMAGE_MIMETYPE, REMOTE_JPG), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.mp3')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.wav')), + (VIDEO_MIMETYPE, os.path.join(TOYDATA_DIR, 'mov_bbb.mp4')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test', 'test.html')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test', 'test.md')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'penal_colony.txt')), + (OBJ_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.glb')), + ], +) +def test_file_validation(file_type, file_source): + if file_type != ImageUrl.mime_type(): + with pytest.raises(ValueError): + parse_obj_as(ImageUrl, file_source) + else: + parse_obj_as(ImageUrl, file_source) diff --git a/tests/units/typing/url/test_mesh_url.py b/tests/units/typing/url/test_mesh_url.py new file mode 100644 index 00000000000..df807ffa501 --- /dev/null +++ b/tests/units/typing/url/test_mesh_url.py @@ -0,0 +1,126 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import pytest +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import Mesh3DUrl, NdArray +from docarray.typing.url.mimetypes import ( + OBJ_MIMETYPE, + AUDIO_MIMETYPE, + VIDEO_MIMETYPE, + IMAGE_MIMETYPE, + TEXT_MIMETYPE, +) +from tests import TOYDATA_DIR + +MESH_FILES = { + 'obj': str(TOYDATA_DIR / 'tetrahedron.obj'), + 'glb': str(TOYDATA_DIR / 'test.glb'), + 'ply': str(TOYDATA_DIR / 'cube.ply'), +} +REMOTE_OBJ_FILE = 'https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj' + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_format, file_path', + [ + ('obj', MESH_FILES['obj']), + ('glb', MESH_FILES['glb']), + ('ply', MESH_FILES['ply']), + ('remote-obj', REMOTE_OBJ_FILE), + ], +) +def test_load(file_format, file_path): + url = parse_obj_as(Mesh3DUrl, file_path) + tensors = url.load() + + assert isinstance(tensors.vertices, np.ndarray) + assert isinstance(tensors.vertices, NdArray) + assert isinstance(tensors.faces, np.ndarray) + assert isinstance(tensors.faces, NdArray) + assert tensors.vertices.shape[1] == 3 + assert tensors.faces.shape[1] == 3 + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_path', + [*MESH_FILES.values(), REMOTE_OBJ_FILE], +) +@pytest.mark.parametrize('field', ['vertices', 'faces']) +def test_load_one_of_fields(file_path, field): + url = parse_obj_as(Mesh3DUrl, file_path) + field = getattr(url.load(), field) + + assert isinstance(field, np.ndarray) + assert 
isinstance(field, NdArray) + + +def test_json_schema(): + schema_json_of(Mesh3DUrl) + + +def test_dump_json(): + url = parse_obj_as(Mesh3DUrl, REMOTE_OBJ_FILE) + orjson_dumps(url) + + +@pytest.mark.parametrize( + 'path_to_file', + [*MESH_FILES.values(), REMOTE_OBJ_FILE], +) +def test_validation(path_to_file): + url = parse_obj_as(Mesh3DUrl, path_to_file) + assert isinstance(url, Mesh3DUrl) + assert isinstance(url, str) + + +@pytest.mark.proto +def test_proto_mesh_url(): + uri = parse_obj_as(Mesh3DUrl, REMOTE_OBJ_FILE) + uri._to_node_protobuf() + + +@pytest.mark.parametrize( + 'file_type, file_source', + [ + (OBJ_MIMETYPE, MESH_FILES['obj']), + (OBJ_MIMETYPE, MESH_FILES['glb']), + (OBJ_MIMETYPE, MESH_FILES['ply']), + (OBJ_MIMETYPE, REMOTE_OBJ_FILE), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.aac')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.mp3')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.ogg')), + (VIDEO_MIMETYPE, os.path.join(TOYDATA_DIR, 'mov_bbb.mp4')), + (IMAGE_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.png')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test', 'test.html')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test', 'test.md')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'penal_colony.txt')), + ], +) +def test_file_validation(file_type, file_source): + if file_type != Mesh3DUrl.mime_type(): + with pytest.raises(ValueError): + parse_obj_as(Mesh3DUrl, file_source) + else: + parse_obj_as(Mesh3DUrl, file_source) diff --git a/tests/units/typing/url/test_point_cloud_url.py b/tests/units/typing/url/test_point_cloud_url.py new file mode 100644 index 00000000000..3deb3e5779a --- /dev/null +++ b/tests/units/typing/url/test_point_cloud_url.py @@ -0,0 +1,130 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import pytest +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import NdArray, PointCloud3DUrl +from docarray.typing.url.mimetypes import ( + OBJ_MIMETYPE, + AUDIO_MIMETYPE, + VIDEO_MIMETYPE, + IMAGE_MIMETYPE, + TEXT_MIMETYPE, +) +from tests import TOYDATA_DIR + +MESH_FILES = { + 'obj': str(TOYDATA_DIR / 'tetrahedron.obj'), + 'glb': str(TOYDATA_DIR / 'test.glb'), + 'ply': str(TOYDATA_DIR / 'cube.ply'), +} +REMOTE_OBJ_FILE = 'https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj' + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_format, file_path', + [ + ('obj', MESH_FILES['obj']), + ('glb', MESH_FILES['glb']), + ('ply', MESH_FILES['ply']), + ('remote-obj', REMOTE_OBJ_FILE), + ], +) +def test_load(file_format, file_path): + n_samples = 100 + url = parse_obj_as(PointCloud3DUrl, file_path) + tensors = url.load(samples=n_samples) + + assert isinstance(tensors.points, np.ndarray) + assert isinstance(tensors.points, NdArray) + assert tensors.points.shape == (n_samples, 3) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_format, file_path', + [ + ('obj', MESH_FILES['obj']), + ('glb', MESH_FILES['glb']), + ('ply', MESH_FILES['ply']), + ('remote-obj', REMOTE_OBJ_FILE), + ], +) +def 
test_load_with_multiple_geometries_true(file_format, file_path): + n_samples = 100 + url = parse_obj_as(PointCloud3DUrl, file_path) + tensors = url.load(samples=n_samples, multiple_geometries=True) + + assert isinstance(tensors.points, np.ndarray) + assert len(tensors.points.shape) == 3 + assert tensors.points.shape[1:] == (100, 3) + + +def test_json_schema(): + schema_json_of(PointCloud3DUrl) + + +def test_dump_json(): + url = parse_obj_as(PointCloud3DUrl, REMOTE_OBJ_FILE) + orjson_dumps(url) + + +@pytest.mark.parametrize( + 'path_to_file', + [*MESH_FILES.values(), REMOTE_OBJ_FILE], +) +def test_validation(path_to_file): + url = parse_obj_as(PointCloud3DUrl, path_to_file) + assert isinstance(url, PointCloud3DUrl) + assert isinstance(url, str) + + +@pytest.mark.proto +def test_proto_point_cloud_url(): + uri = parse_obj_as(PointCloud3DUrl, REMOTE_OBJ_FILE) + uri._to_node_protobuf() + + +@pytest.mark.parametrize( + 'file_type, file_source', + [ + (OBJ_MIMETYPE, MESH_FILES['obj']), + (OBJ_MIMETYPE, MESH_FILES['glb']), + (OBJ_MIMETYPE, MESH_FILES['ply']), + (OBJ_MIMETYPE, REMOTE_OBJ_FILE), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.aac')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.mp3')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.ogg')), + (VIDEO_MIMETYPE, os.path.join(TOYDATA_DIR, 'mov_bbb.mp4')), + (IMAGE_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.png')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test' 'test.html')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test' 'test.md')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'penal_colony.txt')), + ], +) +def test_file_validation(file_type, file_source): + if file_type != PointCloud3DUrl.mime_type(): + with pytest.raises(ValueError): + parse_obj_as(PointCloud3DUrl, file_source) + else: + parse_obj_as(PointCloud3DUrl, file_source) diff --git a/tests/units/typing/url/test_text_url.py b/tests/units/typing/url/test_text_url.py new file mode 100644 index 00000000000..a755344f394 --- /dev/null 
+++ b/tests/units/typing/url/test_text_url.py @@ -0,0 +1,119 @@ +import os +import urllib + +import pytest +from pydantic import parse_obj_as, schema_json_of + +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import TextUrl +from docarray.typing.url.mimetypes import ( + OBJ_MIMETYPE, + AUDIO_MIMETYPE, + VIDEO_MIMETYPE, + IMAGE_MIMETYPE, + TEXT_MIMETYPE, +) +from tests import TOYDATA_DIR + +REMOTE_TEXT_FILE = 'https://de.wikipedia.org/wiki/Brixen' +CUR_DIR = os.path.dirname(os.path.abspath(__file__)) +LOCAL_TEXT_FILES = [ + str(TOYDATA_DIR / 'penal_colony.txt'), + str(TOYDATA_DIR / 'test.md'), + str(TOYDATA_DIR / 'test.html'), + str(TOYDATA_DIR / 'test.css'), + str(TOYDATA_DIR / 'test.csv'), + str(TOYDATA_DIR / 'test.log'), +] +LOCAL_TEXT_FILES_AND_BEGINNING = [ + (str(TOYDATA_DIR / 'penal_colony.txt'), '“It’s a peculiar apparatus,”'), + (str(TOYDATA_DIR / 'test.md'), "# Hello"), + (str(TOYDATA_DIR / 'test.html'), ""), + (str(TOYDATA_DIR / 'test.css'), 'body {'), + (str(TOYDATA_DIR / 'test.csv'), "John,Doe"), + (str(TOYDATA_DIR / 'test.log'), "2022-11-25 12:34:56 INFO: Program started"), +] + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'url,expected_beginning', + [(REMOTE_TEXT_FILE, ''), *LOCAL_TEXT_FILES_AND_BEGINNING], +) +def test_load(url, expected_beginning): + uri = parse_obj_as(TextUrl, url) + + txt = uri.load() + assert txt.startswith(expected_beginning) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize('url', [REMOTE_TEXT_FILE, *LOCAL_TEXT_FILES]) +def test_load_to_bytes(url): + uri = parse_obj_as(TextUrl, url) + + txt_bytes = uri.load_bytes() + assert isinstance(txt_bytes, bytes) + + +@pytest.mark.proto +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize('url', [REMOTE_TEXT_FILE, *LOCAL_TEXT_FILES]) +def test_proto_text_url(url): + uri = parse_obj_as(TextUrl, url) + + proto = uri._to_node_protobuf() + assert 'text_url' in str(proto) + + +@pytest.mark.internet +def 
test_load_timeout(): + url = parse_obj_as(TextUrl, REMOTE_TEXT_FILE) + with pytest.raises(urllib.error.URLError): + _ = url.load(timeout=0.001) + with pytest.raises(urllib.error.URLError): + _ = url.load_bytes(timeout=0.001) + + +def test_json_schema(): + schema_json_of(TextUrl) + + +@pytest.mark.internet +def test_dump_json(): + url = parse_obj_as(TextUrl, REMOTE_TEXT_FILE) + orjson_dumps(url) + + +@pytest.mark.parametrize( + 'path_to_file', + [REMOTE_TEXT_FILE, *LOCAL_TEXT_FILES], +) +def test_validation(path_to_file): + url = parse_obj_as(TextUrl, path_to_file) + assert isinstance(url, TextUrl) + assert isinstance(url, str) + + +@pytest.mark.parametrize( + 'file_type, file_source', + [ + *[(TEXT_MIMETYPE, file) for file in LOCAL_TEXT_FILES], + (TEXT_MIMETYPE, REMOTE_TEXT_FILE), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.aac')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.mp3')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.ogg')), + (IMAGE_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.png')), + (VIDEO_MIMETYPE, os.path.join(TOYDATA_DIR, 'mov_bbb.mp4')), + (OBJ_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.glb')), + ], +) +def test_file_validation(file_type, file_source): + if file_type != TextUrl.mime_type(): + with pytest.raises(ValueError): + parse_obj_as(TextUrl, file_source) + else: + parse_obj_as(TextUrl, file_source) diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py new file mode 100644 index 00000000000..0bd889f37bf --- /dev/null +++ b/tests/units/typing/url/test_video_url.py @@ -0,0 +1,179 @@ +import os +from typing import Optional + +import numpy as np +import pytest +import torch +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray import BaseDoc +from docarray.base_doc.io.json import orjson_dumps +from docarray.typing import ( + AudioNdArray, + NdArray, + VideoBytes, + VideoNdArray, + VideoTorchTensor, + VideoUrl, +) +from docarray.typing.url.mimetypes import ( + 
AUDIO_MIMETYPE, + IMAGE_MIMETYPE, + OBJ_MIMETYPE, + TEXT_MIMETYPE, + VIDEO_MIMETYPE, +) +from docarray.utils._internal.misc import is_tf_available +from tests import TOYDATA_DIR + +tf_available = is_tf_available() +if tf_available: + import tensorflow as tf + + from docarray.typing.tensor.video import VideoTensorFlowTensor + +LOCAL_VIDEO_FILE = str(TOYDATA_DIR / 'mov_bbb.mp4') +REMOTE_VIDEO_FILE = 'https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true' # noqa: E501 + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_load(file_url): + url = parse_obj_as(VideoUrl, file_url) + video, audio, indices = url.load() + + assert isinstance(audio, np.ndarray) + assert isinstance(audio, AudioNdArray) + + assert isinstance(video, np.ndarray) + assert isinstance(video, VideoNdArray) + + assert isinstance(indices, np.ndarray) + assert isinstance(indices, NdArray) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +@pytest.mark.parametrize( + 'field, attr_cls', + [ + ('video', VideoNdArray), + ('audio', AudioNdArray), + ('key_frame_indices', NdArray), + ], +) +def test_load_one_of_named_tuple_results(file_url, field, attr_cls): + url = parse_obj_as(VideoUrl, file_url) + result = getattr(url.load(), field) + + assert isinstance(result, np.ndarray) + assert isinstance(result, attr_cls) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_load_video_url_to_video_torch_tensor_field(file_url): + class MyVideoDoc(BaseDoc): + video_url: VideoUrl + tensor: Optional[VideoTorchTensor] = None + + doc = MyVideoDoc(video_url=file_url) + doc.tensor = doc.video_url.load().video + + assert isinstance(doc.tensor, torch.Tensor) + assert isinstance(doc.tensor, VideoTorchTensor) + + +@pytest.mark.tensorflow 
+@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_load_video_url_to_video_tensorflow_tensor_field(file_url): + class MyVideoDoc(BaseDoc): + video_url: VideoUrl + tensor: Optional[VideoTensorFlowTensor] = None + + doc = MyVideoDoc(video_url=file_url) + doc.tensor = doc.video_url.load().video + + assert isinstance(doc.tensor, VideoTensorFlowTensor) + assert isinstance(doc.tensor.tensor, tf.Tensor) + + +def test_json_schema(): + schema_json_of(VideoUrl) + + +def test_dump_json(): + url = parse_obj_as(VideoUrl, REMOTE_VIDEO_FILE) + orjson_dumps(url) + + +@pytest.mark.parametrize( + 'path_to_file', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_validation(path_to_file): + url = parse_obj_as(VideoUrl, path_to_file) + assert isinstance(url, VideoUrl) + assert isinstance(url, str) + + +@pytest.mark.proto +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_proto_video_url(file_url): + uri = parse_obj_as(VideoUrl, file_url) + proto = uri._to_node_protobuf() + assert 'video_url' in str(proto) + + +def test_load_bytes(): + file_url = LOCAL_VIDEO_FILE + uri = parse_obj_as(VideoUrl, file_url) + video_bytes = uri.load_bytes() + assert isinstance(video_bytes, bytes) + assert isinstance(video_bytes, VideoBytes) + assert len(video_bytes) > 0 + + +@pytest.mark.parametrize( + 'file_type, file_source', + [ + (VIDEO_MIMETYPE, LOCAL_VIDEO_FILE), + (VIDEO_MIMETYPE, REMOTE_VIDEO_FILE), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.aac')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.mp3')), + (AUDIO_MIMETYPE, os.path.join(TOYDATA_DIR, 'hello.ogg')), + (IMAGE_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.png')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test' 'test.html')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'test' 'test.md')), + (TEXT_MIMETYPE, os.path.join(TOYDATA_DIR, 'penal_colony.txt')), + 
(OBJ_MIMETYPE, os.path.join(TOYDATA_DIR, 'test.glb')), + ], +) +def test_file_validation(file_type, file_source): + if file_type != VideoUrl.mime_type(): + with pytest.raises(ValueError): + parse_obj_as(VideoUrl, file_source) + else: + parse_obj_as(VideoUrl, file_source) diff --git a/tests/units/util/__init__.py b/tests/units/util/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/util/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/util/query_language/__init__.py b/tests/units/util/query_language/__init__.py new file mode 100644 index 00000000000..74f8f7582cd --- /dev/null +++ b/tests/units/util/query_language/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/units/util/query_language/test_lookup.py b/tests/units/util/query_language/test_lookup.py new file mode 100644 index 00000000000..844f5475b9e --- /dev/null +++ b/tests/units/util/query_language/test_lookup.py @@ -0,0 +1,68 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest + +from docarray.utils._internal.query_language.lookup import dunder_get, lookup + + +class A: + class B: + c = 0 + d = 'docarray' + e = [0, 1] + f = {} + + b: B = B() + + +@pytest.mark.parametrize('input', [A(), {'b': {'c': 0, 'd': 'docarray'}}]) +def test_dunder_get(input): + assert dunder_get(input, 'b__c') == 0 + + expected_exception = KeyError if isinstance(input, dict) else AttributeError + with pytest.raises(expected_exception): + _ = dunder_get(input, 'z') + + with pytest.raises(expected_exception): + _ = dunder_get(input, 'b__z') + + +@pytest.mark.parametrize( + 'input', [A(), {'b': {'c': 0, 'd': 'docarray', 'e': [0, 1], 'f': {}}}] +) +def test_lookup(input): + assert lookup('b__c.exact', 0, input) + assert not lookup('b__c.gt', 0, input) + assert lookup('b__c.gte', 0, input) + assert not lookup('b__c.lt', 0, input) + assert lookup('b__c.lte', 0, input) + assert lookup('b__d.regex', 'array*', input) + assert lookup('b__d.contains', 'array', input) + assert lookup('b__d.icontains', 'Array', input) + assert lookup('b__d.in', ['a', 'docarray'], input) + assert lookup('b__d.nin', ['a', 'b'], input) + assert lookup('b__d.startswith', 'doc', input) + assert lookup('b__d.istartswith', 'Doc', input) + assert lookup('b__d.endswith', 'array', input) + assert lookup('b__d.iendswith', 'Array', input) + assert lookup('b__e.size', 2, input) + assert not lookup('b__e.size', 3, input) + assert lookup('b__d.size', len('docarray'), input) + assert not lookup('b__e.size', len('docarray') + 1, input) + assert not lookup('b__z.exists', True, input) + assert lookup('b__z.exists', False, input) + assert not lookup('b__f__z.exists', True, input) + assert lookup('b__f__z.exists', False, input) diff --git a/tests/units/util/test_create_dynamic_code_class.py b/tests/units/util/test_create_dynamic_code_class.py new file mode 100644 index 00000000000..b7df497816d --- /dev/null +++ b/tests/units/util/test_create_dynamic_code_class.py @@ -0,0 +1,358 @@ +from typing import 
Any, Dict, List, Optional, Union, ClassVar + +import numpy as np +import pytest +from pydantic import Field + +from docarray import BaseDoc, DocList +from docarray.documents import TextDoc +from docarray.typing import AnyTensor, ImageUrl +from docarray.utils.create_dynamic_doc_class import ( + create_base_doc_from_schema, + create_pure_python_type_model, +) +from docarray.utils._internal.pydantic import is_pydantic_v2 + + +@pytest.mark.parametrize('transformation', ['proto', 'json']) +def test_create_pydantic_model_from_schema(transformation): + class Nested2Doc(BaseDoc): + value: str + classvar: ClassVar[str] = 'classvar2' + + class Nested1Doc(BaseDoc): + nested: Nested2Doc + classvar: ClassVar[str] = 'classvar1' + + class CustomDoc(BaseDoc): + tensor: Optional[AnyTensor] = None + url: ImageUrl + num: float = 0.5 + num_num: List[float] = [1.5, 2.5] + lll: List[List[List[int]]] = [[[5]]] + fff: List[List[List[float]]] = [[[5.2]]] + single_text: TextDoc + texts: DocList[TextDoc] + d: Dict[str, str] = {'a': 'b'} + di: Optional[Dict[str, int]] = None + u: Union[str, int] + lu: List[Union[str, int]] = [0, 1, 2] + tags: Optional[Dict[str, Any]] = None + nested: Nested1Doc + classvar: ClassVar[str] = 'classvar' + + CustomDocCopy = create_pure_python_type_model(CustomDoc) + new_custom_doc_model = create_base_doc_from_schema( + CustomDocCopy.schema(), 'CustomDoc', {} + ) + print(f'new_custom_doc_model {new_custom_doc_model.schema()}') + + original_custom_docs = DocList[CustomDoc]( + [ + CustomDoc( + num=3.5, + num_num=[4.5, 5.5], + url='photo.jpg', + lll=[[[40]]], + fff=[[[40.2]]], + d={'b': 'a'}, + texts=DocList[TextDoc]([TextDoc(text='hey ha', embedding=np.zeros(3))]), + single_text=TextDoc(text='single hey ha', embedding=np.zeros(2)), + u='a', + lu=[3, 4], + nested=Nested1Doc(nested=Nested2Doc(value='hello world')), + ) + ] + ) + for doc in original_custom_docs: + doc.tensor = np.zeros((10, 10, 10)) + doc.di = {'a': 2} + + if transformation == 'proto': + 
custom_partial_da = DocList[new_custom_doc_model].from_protobuf( + original_custom_docs.to_protobuf() + ) + original_back = DocList[CustomDoc].from_protobuf( + custom_partial_da.to_protobuf() + ) + elif transformation == 'json': + custom_partial_da = DocList[new_custom_doc_model].from_json( + original_custom_docs.to_json() + ) + original_back = DocList[CustomDoc].from_json(custom_partial_da.to_json()) + + assert len(custom_partial_da) == 1 + assert custom_partial_da[0].url == 'photo.jpg' + assert custom_partial_da[0].num == 3.5 + assert custom_partial_da[0].num_num == [4.5, 5.5] + assert custom_partial_da[0].lll == [[[40]]] + if is_pydantic_v2: + assert custom_partial_da[0].lu == [3, 4] + else: + assert custom_partial_da[0].lu == ['3', '4'] # Union validates back to string + assert custom_partial_da[0].fff == [[[40.2]]] + assert custom_partial_da[0].di == {'a': 2} + assert custom_partial_da[0].d == {'b': 'a'} + assert len(custom_partial_da[0].texts) == 1 + assert custom_partial_da[0].texts[0].text == 'hey ha' + assert custom_partial_da[0].texts[0].embedding.shape == (3,) + assert custom_partial_da[0].tensor.shape == (10, 10, 10) + assert custom_partial_da[0].u == 'a' + assert custom_partial_da[0].single_text.text == 'single hey ha' + assert custom_partial_da[0].single_text.embedding.shape == (2,) + assert original_back[0].nested.nested.value == 'hello world' + assert original_back[0].num == 3.5 + assert original_back[0].num_num == [4.5, 5.5] + assert original_back[0].classvar == 'classvar' + assert original_back[0].nested.classvar == 'classvar1' + assert original_back[0].nested.nested.classvar == 'classvar2' + + assert len(original_back) == 1 + assert original_back[0].url == 'photo.jpg' + assert original_back[0].lll == [[[40]]] + if is_pydantic_v2: + assert original_back[0].lu == [3, 4] # Union validates back to string + else: + assert original_back[0].lu == ['3', '4'] # Union validates back to string + assert original_back[0].fff == [[[40.2]]] + assert 
original_back[0].di == {'a': 2} + assert original_back[0].d == {'b': 'a'} + assert len(original_back[0].texts) == 1 + assert original_back[0].texts[0].text == 'hey ha' + assert original_back[0].texts[0].embedding.shape == (3,) + assert original_back[0].tensor.shape == (10, 10, 10) + assert original_back[0].u == 'a' + assert original_back[0].single_text.text == 'single hey ha' + assert original_back[0].single_text.embedding.shape == (2,) + + class TextDocWithId(BaseDoc): + ia: str + + TextDocWithIdCopy = create_pure_python_type_model(TextDocWithId) + new_textdoc_with_id_model = create_base_doc_from_schema( + TextDocWithIdCopy.schema(), 'TextDocWithId', {} + ) + print(f'new_textdoc_with_id_model {new_textdoc_with_id_model.schema()}') + + original_text_doc_with_id = DocList[TextDocWithId]( + [TextDocWithId(ia=f'ID {i}') for i in range(10)] + ) + if transformation == 'proto': + custom_da = DocList[new_textdoc_with_id_model].from_protobuf( + original_text_doc_with_id.to_protobuf() + ) + original_back = DocList[TextDocWithId].from_protobuf(custom_da.to_protobuf()) + elif transformation == 'json': + custom_da = DocList[new_textdoc_with_id_model].from_json( + original_text_doc_with_id.to_json() + ) + original_back = DocList[TextDocWithId].from_json(custom_da.to_json()) + + assert len(custom_da) == 10 + for i, doc in enumerate(custom_da): + assert doc.ia == f'ID {i}' + + assert len(original_back) == 10 + for i, doc in enumerate(original_back): + assert doc.ia == f'ID {i}' + + class ResultTestDoc(BaseDoc): + matches: DocList[TextDocWithId] + + ResultTestDocCopy = create_pure_python_type_model(ResultTestDoc) + new_result_test_doc_with_id_model = create_base_doc_from_schema( + ResultTestDocCopy.schema(), 'ResultTestDoc', {} + ) + result_test_docs = DocList[ResultTestDoc]( + [ResultTestDoc(matches=original_text_doc_with_id)] + ) + + if transformation == 'proto': + custom_da = DocList[new_result_test_doc_with_id_model].from_protobuf( + result_test_docs.to_protobuf() + ) + 
original_back = DocList[ResultTestDoc].from_protobuf(custom_da.to_protobuf()) + elif transformation == 'json': + custom_da = DocList[new_result_test_doc_with_id_model].from_json( + result_test_docs.to_json() + ) + original_back = DocList[ResultTestDoc].from_json(custom_da.to_json()) + + assert len(custom_da) == 1 + assert len(custom_da[0].matches) == 10 + for i, doc in enumerate(custom_da[0].matches): + assert doc.ia == f'ID {i}' + + assert len(original_back) == 1 + assert len(original_back[0].matches) == 10 + for i, doc in enumerate(original_back[0].matches): + assert doc.ia == f'ID {i}' + + +@pytest.mark.parametrize('transformation', ['proto', 'json']) +def test_create_empty_doc_list_from_schema(transformation): + class CustomDoc(BaseDoc): + tensor: Optional[AnyTensor] + url: ImageUrl + lll: List[List[List[int]]] = [[[5]]] + fff: List[List[List[float]]] = [[[5.2]]] + single_text: TextDoc + texts: DocList[TextDoc] + d: Dict[str, str] = {'a': 'b'} + di: Optional[Dict[str, int]] = None + u: Union[str, int] + lu: List[Union[str, int]] = [0, 1, 2] + tags: Optional[Dict[str, Any]] = None + lf: List[float] = [3.0, 4.1] + + CustomDocCopy = create_pure_python_type_model(CustomDoc) + new_custom_doc_model = create_base_doc_from_schema( + CustomDocCopy.schema(), 'CustomDoc' + ) + print(f'new_custom_doc_model {new_custom_doc_model.schema()}') + + original_custom_docs = DocList[CustomDoc]() + if transformation == 'proto': + custom_partial_da = DocList[new_custom_doc_model].from_protobuf( + original_custom_docs.to_protobuf() + ) + original_back = DocList[CustomDoc].from_protobuf( + custom_partial_da.to_protobuf() + ) + elif transformation == 'json': + custom_partial_da = DocList[new_custom_doc_model].from_json( + original_custom_docs.to_json() + ) + original_back = DocList[CustomDoc].from_json(custom_partial_da.to_json()) + + assert len(custom_partial_da) == 0 + assert len(original_back) == 0 + + class TextDocWithId(BaseDoc): + ia: str + + TextDocWithIdCopy = 
create_pure_python_type_model(TextDocWithId) + new_textdoc_with_id_model = create_base_doc_from_schema( + TextDocWithIdCopy.schema(), 'TextDocWithId', {} + ) + print(f'new_textdoc_with_id_model {new_textdoc_with_id_model.schema()}') + + original_text_doc_with_id = DocList[TextDocWithId]() + if transformation == 'proto': + custom_da = DocList[new_textdoc_with_id_model].from_protobuf( + original_text_doc_with_id.to_protobuf() + ) + original_back = DocList[TextDocWithId].from_protobuf(custom_da.to_protobuf()) + elif transformation == 'json': + custom_da = DocList[new_textdoc_with_id_model].from_json( + original_text_doc_with_id.to_json() + ) + original_back = DocList[TextDocWithId].from_json(custom_da.to_json()) + + assert len(original_back) == 0 + assert len(custom_da) == 0 + + class ResultTestDoc(BaseDoc): + matches: DocList[TextDocWithId] + + ResultTestDocCopy = create_pure_python_type_model(ResultTestDoc) + new_result_test_doc_with_id_model = create_base_doc_from_schema( + ResultTestDocCopy.schema(), 'ResultTestDoc', {} + ) + print( + f'new_result_test_doc_with_id_model {new_result_test_doc_with_id_model.schema()}' + ) + result_test_docs = DocList[ResultTestDoc]() + + if transformation == 'proto': + custom_da = DocList[new_result_test_doc_with_id_model].from_protobuf( + result_test_docs.to_protobuf() + ) + original_back = DocList[ResultTestDoc].from_protobuf(custom_da.to_protobuf()) + elif transformation == 'json': + custom_da = DocList[new_result_test_doc_with_id_model].from_json( + result_test_docs.to_json() + ) + original_back = DocList[ResultTestDoc].from_json(custom_da.to_json()) + + assert len(original_back) == 0 + assert len(custom_da) == 0 + + +def test_create_with_field_info(): + class CustomDoc(BaseDoc): + """Here I have the description of the class""" + + a: str = Field(examples=['Example here'], another_extra='I am another extra') + + CustomDocCopy = create_pure_python_type_model(CustomDoc) + new_custom_doc_model = create_base_doc_from_schema( + 
CustomDocCopy.schema(), 'CustomDoc' + ) + assert new_custom_doc_model.schema().get('properties')['a']['examples'] == [ + 'Example here' + ] + assert ( + new_custom_doc_model.schema().get('properties')['a']['another_extra'] + == 'I am another extra' + ) + assert ( + new_custom_doc_model.schema().get('description') + == 'Here I have the description of the class' + ) + + +def test_dynamic_class_creation_multiple_doclist_nested(): + from docarray import BaseDoc, DocList + + class MyTextDoc(BaseDoc): + text: str + + class QuoteFile(BaseDoc): + texts: DocList[MyTextDoc] + + class SearchResult(BaseDoc): + results: DocList[QuoteFile] = None + + models_created_by_name = {} + SearchResult_aux = create_pure_python_type_model(SearchResult) + m = create_base_doc_from_schema( + SearchResult_aux.schema(), 'SearchResult', models_created_by_name + ) + print(f'm {m.schema()}') + QuoteFile_reconstructed_in_gateway_from_Search_results = models_created_by_name[ + 'QuoteFile' + ] + textlist = DocList[models_created_by_name['MyTextDoc']]( + [models_created_by_name['MyTextDoc'](id='11', text='hey')] + ) + + reconstructed_in_gateway_from_Search_results = ( + QuoteFile_reconstructed_in_gateway_from_Search_results(id='0', texts=textlist) + ) + assert reconstructed_in_gateway_from_Search_results.texts[0].text == 'hey' + + +def test_id_optional(): + from docarray import BaseDoc + import json + + class MyTextDoc(BaseDoc): + text: str + opt: Optional[str] = None + + MyTextDoc_aux = create_pure_python_type_model(MyTextDoc) + td = create_base_doc_from_schema(MyTextDoc_aux.schema(), 'MyTextDoc') + print(f'{td.schema()}') + direct = MyTextDoc.from_json(json.dumps({"text": "text"})) + aux = MyTextDoc_aux.from_json(json.dumps({"text": "text"})) + indirect = td.from_json(json.dumps({"text": "text"})) + assert direct.text == 'text' + assert aux.text == 'text' + assert indirect.text == 'text' + direct = MyTextDoc(text='hey') + aux = MyTextDoc_aux(text='hey') + indirect = td(text='hey') + assert 
direct.text == 'hey' + assert aux.text == 'hey' + assert indirect.text == 'hey' diff --git a/tests/units/util/test_filter.py b/tests/units/util/test_filter.py new file mode 100644 index 00000000000..417bde4232e --- /dev/null +++ b/tests/units/util/test_filter.py @@ -0,0 +1,289 @@ +import json +from typing import Any, Dict, List, Optional + +import pytest + +from docarray import BaseDoc, DocList +from docarray.documents import ImageDoc, TextDoc +from docarray.utils.filter import filter_docs + + +class MMDoc(BaseDoc): + text_doc: TextDoc + text: str = '' + image: Optional[ImageDoc] = None + price: int = 0 + optional_num: Optional[int] = None + boolean: bool = False + categories: Optional[List[str]] = None + sub_docs: Optional[List[TextDoc]] = None + dictionary: Optional[Dict[str, Any]] = None + + +@pytest.fixture +def docs(): + mmdoc1 = MMDoc( + text_doc=TextDoc(text='Text Doc of Document 1'), + text='Text of Document 1', + sub_docs=[TextDoc(text='subtext1'), TextDoc(text='subtext2')], + dictionary={}, + ) + mmdoc2 = MMDoc( + text_doc=TextDoc(text='Text Doc of Document 2'), + text='Text of Document 2', + image=ImageDoc(url='exampleimage.jpg'), + price=3, + dictionary={'a': 0, 'b': 1, 'c': 2, 'd': {'e': 3}}, + ) + mmdoc3 = MMDoc( + text_doc=TextDoc(text='Text Doc of Document 3'), + text='Text of Document 3', + price=1000, + boolean=True, + categories=['cat1', 'cat2'], + sub_docs=[TextDoc(text='subtext1'), TextDoc(text='subtext2')], + optional_num=30, + dictionary={'a': 0, 'b': 1}, + ) + docs = DocList[MMDoc]([mmdoc1, mmdoc2, mmdoc3]) + + return docs + + +@pytest.mark.parametrize('dict_api', [True, False]) +def test_empty_filter(docs, dict_api): + q = {} if dict_api else '{}' + result = filter_docs(docs, q) + assert len(result) == len(docs) + + +@pytest.mark.parametrize('dict_api', [True, False]) +def test_simple_filter(docs, dict_api): + if dict_api: + method = lambda query: filter_docs(docs, query) # noqa: E731 + else: + method = lambda query: filter_docs(docs, 
json.dumps(query)) # noqa: E731 + + result = method({'text': {'$eq': 'Text of Document 1'}}) + assert len(result) == 1 + assert result[0].text == 'Text of Document 1' + + result = method({'text': {'$neq': 'Text of Document 1'}}) + assert len(result) == 2 + + result = method({'text_doc': {'$eq': 'Text Doc of Document 1'}}) + assert len(result) == 1 + assert result[0].text_doc == 'Text Doc of Document 1' + + result = method({'text_doc': {'$neq': 'Text Doc of Document 1'}}) + assert len(result) == 2 + + result = method({'text': {'$regex': 'Text*'}}) + assert len(result) == 3 + + result = method({'text': {'$regex': 'TeAxt*'}}) + assert len(result) == 0 + + result = method({'text_doc': {'$regex': 'Text*'}}) + assert len(result) == 3 + + result = method({'text_doc': {'$regex': 'TeAxt*'}}) + assert len(result) == 0 + + result = method({'price': {'$gte': 500}}) + assert len(result) == 1 + + result = method({'price': {'$lte': 500}}) + assert len(result) == 2 + + result = method({'dictionary': {'$eq': {}}}) + assert len(result) == 1 + assert result[0].dictionary == {} + + result = method({'dictionary': {'$eq': {'a': 0, 'b': 1}}}) + assert len(result) == 1 + assert result[0].dictionary == {'a': 0, 'b': 1} + + result = method({'text': {'$neq': 'Text of Document 1'}}) + assert len(result) == 2 + + # EXISTS DOES NOT SEEM TO WORK + result = method({'optional_num': {'$exists': True}}) + assert len(result) == 3 + result = method({'optional_num': {'$exists': False}}) + assert len(result) == 0 + + result = method({'price': {'$exists': True}}) + assert len(result) == 3 + result = method({'price': {'$exists': False}}) + assert len(result) == 0 + + # DOES NOT SEEM TO WORK WITH OPTIONAL NUMBERS + result = method({'optional_num': {'$gte': 20}}) + assert len(result) == 1 + + result = method({'optional_num': {'$lte': 20}}) + assert len(result) == 0 + + +@pytest.mark.parametrize('dict_api', [True, False]) +def test_nested_filter(docs, dict_api): + if dict_api: + method = lambda query: 
filter_docs(docs, query) # noqa: E731 + else: + method = lambda query: filter_docs(docs, json.dumps(query)) # noqa: E731 + + result = method({'dictionary__a': {'$eq': 0}}) + assert len(result) == 2 + for res in result: + assert res.dictionary['a'] == 0 + + result = method({'dictionary__c': {'$exists': True}}) + assert len(result) == 1 + assert result[0].dictionary['c'] == 2 + + result = method({'dictionary__d__e': {'$exists': True}}) + assert len(result) == 1 + assert result[0].dictionary['d'] == {'e': 3} + + result = method({'dictionary__d__e': {'$eq': 3}}) + assert len(result) == 1 + assert result[0].dictionary['d'] == {'e': 3} + + result = method({'image__url': {'$eq': 'exampleimage.jpg'}}) + assert len(result) == 1 + assert result[0].image.url == 'exampleimage.jpg' + + +@pytest.mark.parametrize('dict_api', [True, False]) +def test_array_simple_filters(docs, dict_api): + if dict_api: + method = lambda query: filter_docs(docs, query) # noqa: E731 + else: + method = lambda query: filter_docs(docs, json.dumps(query)) # noqa: E731 + + # SIZE DOES NOT SEEM TO WORK + result = method({'sub_docs': {'$size': 2}}) + assert len(result) == 2 + + result = method({'categories': {'$size': 2}}) + assert len(result) == 1 + + +@pytest.mark.parametrize('dict_api', [True, False]) +def test_placehold_filter(dict_api): + docs = DocList[MMDoc]( + [ + MMDoc(text='A', text_doc=TextDoc(text='A')), + MMDoc(text='A', text_doc=TextDoc(text='B')), + ] + ) + + if dict_api: + method = lambda query: filter_docs(docs, query) # noqa: E731 + else: + method = lambda query: filter_docs(docs, json.dumps(query)) # noqa: E731 + + # DOES NOT SEEM TO WORK + result = method({'text': {'$eq': '{text_doc}'}}) + assert len(result) == 1 + + result = method({'text_doc': {'$eq': '{text}'}}) + assert len(result) == 1 + + +@pytest.mark.parametrize('dict_api', [True, False]) +def test_logic_filter(docs, dict_api): + if dict_api: + method = lambda query: filter_docs(docs, query) # noqa: E731 + else: + method = 
lambda query: filter_docs(docs, json.dumps(query)) # noqa: E731 + result = method( + { + '$or': { + 'text': {'$eq': 'Text of Document 1'}, + 'text_doc': {'$eq': 'Text Doc of Document 2'}, + } + } + ) + assert len(result) == 2 + + result = method( + { + '$not': { + '$or': { + 'text': {'$eq': 'Text of Document 1'}, + 'text_doc': {'$eq': 'Text Doc of Document 2'}, + } + } + } + ) + assert len(result) == 1 + + result = method( + { + '$and': { + 'text': {'$eq': 'Text of Document 1'}, + 'text_doc': {'$eq': 'Text Doc of Document 2'}, + } + } + ) + assert len(result) == 0 + + result = method( + { + '$not': { + '$and': { + 'text': {'$eq': 'Text of Document 1'}, + 'text_doc': {'$eq': 'Text Doc of Document 2'}, + } + } + } + ) + assert len(result) == 3 + + +@pytest.mark.parametrize('dict_api', [True, False]) +def test_from_docstring(dict_api): + class MyDocument(BaseDoc): + caption: TextDoc + image: ImageDoc + price: int + + docs = DocList[MyDocument]( + [ + MyDocument( + caption='A tiger in the jungle', + image=ImageDoc(url='tigerphoto.png'), + price=100, + ), + MyDocument( + caption='A swimming turtle', + image=ImageDoc(url='turtlepic.png'), + price=50, + ), + MyDocument( + caption='A couple birdwatching with binoculars', + image=ImageDoc(url='binocularsphoto.png'), + price=30, + ), + ] + ) + + query = { + '$and': { + 'image__url': {'$regex': 'photo'}, + 'price': {'$lte': 50}, + } + } + + if dict_api: + method = lambda query: filter_docs(docs, query) # noqa: E731 + else: + method = lambda query: filter_docs(docs, json.dumps(query)) # noqa: E731 + + results = method(query) + assert len(results) == 1 + assert results[0].price == 30 + assert results[0].caption == 'A couple birdwatching with binoculars' + assert results[0].image.url == 'binocularsphoto.png' diff --git a/tests/units/util/test_find.py b/tests/units/util/test_find.py new file mode 100644 index 00000000000..ca7cbe7160a --- /dev/null +++ b/tests/units/util/test_find.py @@ -0,0 +1,413 @@ +# Licensed to the LF AI & 
Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Union + +import numpy as np +import pytest +import torch + +from docarray import BaseDoc, DocList +from docarray.typing import NdArray, TorchTensor +from docarray.utils.find import find, find_batched + + +class TorchDoc(BaseDoc): + tensor: TorchTensor + + +class NdDoc(BaseDoc): + tensor: NdArray + + +@pytest.fixture() +def random_torch_query(): + return TorchDoc(tensor=torch.rand(128)) + + +@pytest.fixture() +def random_torch_batch_query(): + return DocList[TorchDoc]([TorchDoc(tensor=torch.rand(128)) for _ in range(5)]) + + +@pytest.fixture() +def random_nd_query(): + return NdDoc(tensor=np.random.rand(128)) + + +@pytest.fixture() +def random_nd_batch_query(): + return DocList[NdDoc]([NdDoc(tensor=np.random.rand(128)) for _ in range(5)]) + + +@pytest.fixture() +def random_torch_index(): + return DocList[TorchDoc](TorchDoc(tensor=torch.rand(128)) for _ in range(10)) + + +@pytest.fixture() +def random_nd_index(): + return DocList[NdDoc](NdDoc(tensor=np.random.rand(128)) for _ in range(10)) + + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_find_torch(random_torch_query, random_torch_index, metric): + top_k, scores = find( + random_torch_index, 
+ random_torch_query, + search_field='tensor', + limit=7, + metric=metric, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert top_k.doc_type == random_torch_index.doc_type + + if metric.endswith('_dist'): + assert (torch.stack(sorted(scores)) == scores).all() + else: + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + +def test_find_torch_tensor_query(random_torch_query, random_torch_index): + query = random_torch_query.tensor + top_k, scores = find( + random_torch_index, + query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + +def test_find_torch_stacked(random_torch_query, random_torch_index): + random_torch_index = random_torch_index.to_doc_vec() + top_k, scores = find( + random_torch_index, + random_torch_query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_find_np(random_nd_query, random_nd_index, metric): + top_k, scores = find( + random_nd_index, + random_nd_query, + search_field='tensor', + limit=7, + metric=metric, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + if metric.endswith('_dist'): + assert (sorted(scores) == scores).all() + else: + assert (sorted(scores, reverse=True) == scores).all() + + +def test_find_np_tensor_query(random_nd_query, random_nd_index): + query = random_nd_query.tensor + top_k, scores = find( + random_nd_index, + query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (sorted(scores, reverse=True) == scores).all() + + +def test_find_np_stacked(random_nd_query, random_nd_index): + random_nd_index = random_nd_index.to_doc_vec() + top_k, scores 
= find( + random_nd_index, + random_nd_query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (sorted(scores, reverse=True) == scores).all() + + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_find_batched_torch(random_torch_batch_query, random_torch_index, metric): + documents, scores = find_batched( + random_torch_index, + random_torch_batch_query, + search_field='tensor', + limit=7, + metric=metric, + ) + assert len(documents) == len(random_torch_batch_query) + assert len(scores) == len(random_torch_batch_query) + for top_k, top_scores in zip(documents, scores): + assert len(top_k) == 7 + assert len(top_scores) == 7 + assert top_k.doc_type == random_torch_index.doc_type + + for sc in scores: + if metric.endswith('_dist'): + assert (torch.stack(sorted(sc)) == sc).all() + else: + assert (torch.stack(sorted(sc, reverse=True)) == sc).all() + + +def test_find_batched_torch_tensor_query(random_torch_batch_query, random_torch_index): + query = torch.stack(random_torch_batch_query.tensor) + documents, scores = find_batched( + random_torch_index, + query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(documents) == len(random_torch_batch_query) + assert len(scores) == len(random_torch_batch_query) + for top_k, top_scores in zip(documents, scores): + assert len(top_k) == 7 + assert len(top_scores) == 7 + for sc in scores: + assert (torch.stack(sorted(sc, reverse=True)) == sc).all() + + +@pytest.mark.parametrize('stack_what', ['index', 'query', 'both']) +def test_find_batched_torch_stacked( + random_torch_batch_query, random_torch_index, stack_what +): + if stack_what in ('index', 'both'): + random_torch_index = random_torch_index.to_doc_vec() + if stack_what in ('query', 'both'): + random_torch_batch_query = random_torch_batch_query.to_doc_vec() + + documents, scores = find_batched( + random_torch_index, + 
random_torch_batch_query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(documents) == len(random_torch_batch_query) + assert len(scores) == len(random_torch_batch_query) + for top_k, top_scores in zip(documents, scores): + assert len(top_k) == 7 + assert len(top_scores) == 7 + for sc in scores: + assert (torch.stack(sorted(sc, reverse=True)) == sc).all() + + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_find_batched_np(random_nd_batch_query, random_nd_index, metric): + documents, scores = find_batched( + random_nd_index, + random_nd_batch_query, + search_field='tensor', + limit=7, + metric=metric, + ) + assert len(documents) == len(random_nd_batch_query) + assert len(scores) == len(random_nd_batch_query) + for top_k, top_scores in zip(documents, scores): + assert len(top_k) == 7 + assert len(top_scores) == 7 + for sc in scores: + if metric.endswith('_dist'): + assert (sorted(sc) == sc).all() + else: + assert (sorted(sc, reverse=True) == sc).all() + + +def test_find_batched_np_tensor_query(random_nd_batch_query, random_nd_index): + query = np.stack(random_nd_batch_query.tensor) + documents, scores = find_batched( + random_nd_index, + query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(documents) == len(random_nd_batch_query) + assert len(scores) == len(random_nd_batch_query) + for top_k, top_scores in zip(documents, scores): + assert len(top_k) == 7 + assert len(top_scores) == 7 + for sc in scores: + assert (sorted(sc, reverse=True) == sc).all() + + +@pytest.mark.parametrize('stack_what', ['index', 'query', 'both']) +def test_find_batched_np_stacked(random_nd_batch_query, random_nd_index, stack_what): + if stack_what in ('index', 'both'): + random_nd_index = random_nd_index.to_doc_vec() + if stack_what in ('query', 'both'): + random_nd_batch_query = random_nd_batch_query.to_doc_vec() + documents, scores = find_batched( + random_nd_index, + 
random_nd_batch_query, + search_field='tensor', + limit=7, + metric='cosine_sim', + ) + assert len(documents) == len(random_nd_batch_query) + assert len(scores) == len(random_nd_batch_query) + for top_k, top_scores in zip(documents, scores): + assert len(top_k) == 7 + assert len(top_scores) == 7 + for sc in scores: + assert (sorted(sc, reverse=True) == sc).all() + + +def test_find_optional(): + class MyDoc(BaseDoc): + embedding: Optional[TorchTensor] + + query = MyDoc(embedding=torch.rand(10)) + index = DocList[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + + top_k, scores = find( + index, + query, + search_field='embedding', + limit=7, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + +def test_find_union(): + class MyDoc(BaseDoc): + embedding: Union[TorchTensor, NdArray] + + query = MyDoc(embedding=torch.rand(10)) + index = DocList[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + + top_k, scores = find( + index, + query, + search_field='embedding', + limit=7.0, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + +@pytest.mark.parametrize('stack', [False, True]) +def test_find_nested(stack): + class InnerDoc(BaseDoc): + title: str + embedding: TorchTensor + + class MyDoc(BaseDoc): + inner: InnerDoc + + query = MyDoc(inner=InnerDoc(title='query', embedding=torch.rand(2))) + index = DocList[MyDoc]( + [ + MyDoc(inner=InnerDoc(title=f'doc {i}', embedding=torch.rand(2))) + for i in range(10) + ] + ) + if stack: + index = index.to_doc_vec() + + top_k, scores = find( + index, + query, + search_field='inner__embedding', + limit=7, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + +def test_find_nested_union_optional(): + class MyDoc(BaseDoc): + embedding: Union[Optional[TorchTensor], Optional[NdArray]] + 
embedding2: Optional[Union[TorchTensor, NdArray]] + embedding3: Optional[Optional[TorchTensor]] + embedding4: Union[Optional[Union[TorchTensor, NdArray]], TorchTensor] + + query = MyDoc( + embedding=torch.rand(10), + embedding2=torch.rand(10), + embedding3=torch.rand(10), + embedding4=torch.rand(10), + ) + index = DocList[MyDoc]( + [ + MyDoc( + embedding=torch.rand(10), + embedding2=torch.rand(10), + embedding3=torch.rand(10), + embedding4=torch.rand(10), + ) + for _ in range(10) + ] + ) + + top_k, scores = find( + index, + query, + search_field='embedding', + limit=7, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + top_k, scores = find( + index, + query, + search_field='embedding2', + limit=7.0, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + top_k, scores = find( + index, + query, + search_field='embedding3', + limit=7, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() + + top_k, scores = find( + index, + query, + search_field='embedding4', + limit=7, + ) + assert len(top_k) == 7 + assert len(scores) == 7 + assert (torch.stack(sorted(scores, reverse=True)) == scores).all() diff --git a/tests/units/util/test_map.py b/tests/units/util/test_map.py new file mode 100644 index 00000000000..65dd3c17389 --- /dev/null +++ b/tests/units/util/test_map.py @@ -0,0 +1,101 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Generator, Optional + +import pytest + +from docarray import BaseDoc, DocList +from docarray.documents import ImageDoc +from docarray.typing import ImageUrl, NdArray +from docarray.utils.map import map_docs, map_docs_batched +from tests.units.typing.test_bytes import IMAGE_PATHS + +N_DOCS = 2 + + +def load_from_doc(d: ImageDoc) -> ImageDoc: + if d.url is not None: + d.tensor = d.url.load() + return d + + +@pytest.fixture() +def da(): + da = DocList[ImageDoc]([ImageDoc(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)]) + return da + + +@pytest.mark.parametrize('backend', ['thread', 'process']) +def test_map(da, backend): + for tensor in da.tensor: + assert tensor is None + + docs = list(map_docs(docs=da, func=load_from_doc, backend=backend)) + + assert len(docs) == N_DOCS + for doc in docs: + assert doc.tensor is not None + + +def test_map_multiprocessing_lambda_func_raise_exception(da): + with pytest.raises(ValueError, match='Multiprocessing does not allow'): + list(map_docs(docs=da, func=lambda x: x, backend='process')) + + +def test_map_multiprocessing_local_func_raise_exception(da): + def local_func(x): + return x + + with pytest.raises(ValueError, match='Multiprocessing does not allow'): + list(map_docs(docs=da, func=local_func, backend='process')) + + +@pytest.mark.parametrize('backend', ['thread', 'process']) +def test_check_order(backend): + da = DocList[ImageDoc]([ImageDoc(id=str(i)) for i in range(N_DOCS)]) + + docs = list(map_docs(docs=da, func=load_from_doc, backend=backend)) + + assert len(docs) == N_DOCS + for i, doc in 
enumerate(docs): + assert doc.id == str(i) + + +def load_from_da(da: DocList) -> DocList: + for doc in da: + doc.tensor = doc.url.load() + return da + + +class MyImage(BaseDoc): + tensor: Optional[NdArray] = None + url: ImageUrl + + +@pytest.mark.slow +@pytest.mark.parametrize('n_docs,batch_size', [(10, 5), (10, 8)]) +@pytest.mark.parametrize('backend', ['thread', 'process']) +def test_map_docs_batched(n_docs, batch_size, backend): + da = DocList[MyImage]([MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)]) + it = map_docs_batched( + docs=da, func=load_from_da, batch_size=batch_size, backend=backend + ) + assert isinstance(it, Generator) + + for batch in it: + assert isinstance(batch, DocList) + for d in batch: + assert isinstance(d, MyImage) diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py new file mode 100644 index 00000000000..816796831fc --- /dev/null +++ b/tests/units/util/test_reduce.py @@ -0,0 +1,139 @@ +from typing import Dict, List, Optional, Set + +import pytest + +from docarray import BaseDoc, DocList +from docarray.documents import ImageDoc +from docarray.utils.reduce import reduce, reduce_all + + +class InnerDoc(BaseDoc): + integer: int + inner_list: List + + +class MMDoc(BaseDoc): + text: str = '' + price: int = 0 + categories: Optional[List[str]] = None + image: Optional[ImageDoc] = None + matches: Optional[DocList] = None + matches_with_same_id: Optional[DocList] = None + opt_int: Optional[int] = None + test_set: Optional[Set] = None + inner_doc: Optional[InnerDoc] = None + test_dict: Optional[Dict] = None + + +@pytest.fixture +def doc1(): + return MMDoc( + text='hey here', + categories=['a', 'b', 'c'], + price=10, + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] + ), + test_set={'a', 'a'}, + inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), + test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3}, + ) + + +@pytest.fixture +def doc2(doc1): + 
return MMDoc( + id=doc1.id, + text='hey here 2', + categories=['d', 'e', 'f'], + price=5, + opt_int=5, + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] + ), + test_set={'a', 'b'}, + inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), + test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None}, + ) + + +def test_reduce_different_ids(): + da1 = DocList[MMDoc]([MMDoc() for _ in range(10)]) + da2 = DocList[MMDoc]([MMDoc() for _ in range(10)]) + result = reduce(da1, da2) + assert len(result) == 20 + # da1 is changed in place (no extra memory) + assert len(da1) == 20 + + +def test_reduce(doc1, doc2): + da1 = DocList[MMDoc]([doc1, MMDoc()]) + da2 = DocList[MMDoc]([MMDoc(), doc2]) + result = reduce(da1, da2) + assert len(result) == 3 + # da1 is changed in place (no extra memory) + assert len(da1) == 3 + merged_doc = result[0] + assert merged_doc.text == 'hey here 2' + assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f'] + assert len(merged_doc.matches) == 2 + assert merged_doc.opt_int == 5 + assert merged_doc.price == 5 + assert merged_doc.test_set == {'a', 'b'} + assert len(merged_doc.matches_with_same_id) == 1 + assert len(merged_doc.matches_with_same_id[0].matches) == 2 + assert merged_doc.inner_doc.integer == 3 + assert merged_doc.inner_doc.inner_list == ['c', 'd', 'a', 'b'] + + +def test_reduce_all(doc1, doc2): + da1 = DocList[MMDoc]([doc1, MMDoc()]) + da2 = DocList[MMDoc]([MMDoc(), doc2]) + da3 = DocList[MMDoc]([MMDoc(), MMDoc(), doc1]) + result = reduce_all([da1, da2, da3]) + assert len(result) == 5 + # da1 is changed in place (no extra memory) + assert len(da1) == 5 + merged_doc = result[0] + assert merged_doc.text == 'hey here 2' + assert merged_doc.categories == [ + 'a', + 'b', + 'c', + 'd', + 'e', + 'f', + 'a', + 'b', + 'c', + 'd', + 'e', + 'f', + ] + assert len(merged_doc.matches) == 2 + assert merged_doc.opt_int == 5 + assert merged_doc.price == 5 + assert merged_doc.test_set == 
{'a', 'b'} + assert len(merged_doc.matches_with_same_id) == 1 + assert len(merged_doc.matches_with_same_id[0].matches) == 2 + assert merged_doc.inner_doc.integer == 3 + assert merged_doc.inner_doc.inner_list == ['c', 'd', 'a', 'b', 'c', 'd', 'a', 'b'] + + +def test_update_ndarray(): + from docarray.typing import NdArray + + import numpy as np + + class MyDoc(BaseDoc): + embedding: NdArray[128] + + embedding1 = np.random.rand(128) + embedding2 = np.random.rand(128) + + doc1 = MyDoc(id='0', embedding=embedding1) + doc2 = MyDoc(id='0', embedding=embedding2) + doc1.update(doc2) + assert (doc1.embedding == embedding2).all() diff --git a/tests/units/util/test_typing.py b/tests/units/util/test_typing.py new file mode 100644 index 00000000000..f40fde4ab21 --- /dev/null +++ b/tests/units/util/test_typing.py @@ -0,0 +1,108 @@ +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, List, Optional, Set, Tuple, Union

import pytest

from docarray.typing import NdArray, TorchTensor
from docarray.typing.tensor.abstract_tensor import AbstractTensor
from docarray.utils._internal._typing import (
    is_tensor_union,
    is_type_tensor,
    safe_issubclass,
)
from docarray.utils._internal.misc import is_tf_available

# TensorFlow is an optional dependency here: the name ``TensorFlowTensor``
# defaults to ``None`` and is only replaced by the real class when
# ``is_tf_available()`` says so. The parametrize tables below reference the
# name at import time, so it has to exist either way.
TensorFlowTensor = None
tf_available = is_tf_available()
if tf_available:
    from docarray.typing import TensorFlowTensor


@pytest.mark.parametrize(
    'type_, is_tensor',
    [
        (int, False),
        (TorchTensor, True),
        (NdArray, True),
        (AbstractTensor, True),
        (Optional[TorchTensor], False),
        (Union[TorchTensor, NdArray], False),
        (None, False),
        (Dict, False),
    ],
)
def test_is_type_tensor(type_, is_tensor):
    """Compare ``is_type_tensor`` with the expected flag for each type.

    The table expects ``True`` for the bare tensor classes and ``False`` for
    ``Optional``/``Union`` wrappers, ``None`` and non-tensor types.
    """
    outcome = is_type_tensor(type_)
    assert outcome == is_tensor


@pytest.mark.tensorflow
@pytest.mark.parametrize(
    'type_, is_tensor',
    [
        (TensorFlowTensor, True),
        (Optional[TensorFlowTensor], False),
    ],
)
def test_is_type_tensor_with_tf(type_, is_tensor):
    """Same check as ``test_is_type_tensor``, for the TensorFlow tensor type.

    NOTE(review): when TensorFlow is missing, ``TensorFlowTensor`` is ``None``
    and the first case cannot hold; presumably the ``tensorflow`` marker is
    used to deselect this test in that environment -- confirm.
    """
    outcome = is_type_tensor(type_)
    assert outcome == is_tensor


@pytest.mark.parametrize(
    'type_, is_union_tensor',
    [
        (int, False),
        (TorchTensor, False),
        (NdArray, False),
        (Optional[TorchTensor], True),
        (Optional[NdArray], True),
        (Union[NdArray, TorchTensor], True),
        (Union[NdArray, TorchTensor, AbstractTensor], True),
        (Union[NdArray, TorchTensor, Optional[TorchTensor]], True),
        (Union[NdArray, TorchTensor, None], True),
    ],
)
def test_is_union_type_tensor(type_, is_union_tensor):
    """Compare ``is_tensor_union`` with the expected flag for each type.

    The table expects ``True`` for ``Optional``/``Union`` combinations of
    tensor types and ``False`` for ``int`` and for bare tensor classes.
    """
    outcome = is_tensor_union(type_)
    assert outcome == is_union_tensor


@pytest.mark.tensorflow
@pytest.mark.parametrize(
    'type_, is_union_tensor',
    [
        (TensorFlowTensor, False),
        (Optional[TensorFlowTensor], True),
        (Union[NdArray, TorchTensor, TensorFlowTensor], True),
        (Union[NdArray, TorchTensor, Optional[TensorFlowTensor]], True),
    ],
)
def test_is_union_type_tensor_with_tf(type_, is_union_tensor):
    """Same check as ``test_is_union_type_tensor``, with TensorFlow types."""
    outcome = is_tensor_union(type_)
    assert outcome == is_union_tensor


@pytest.mark.parametrize(
    'type_, cls, is_subclass',
    [
        (List[str], object, False),
        (List[List[int]], object, False),
        (Set[str], object, False),
        (Dict, object, False),
        (Tuple[int, int], object, False),
    ],
)
def test_safe_issubclass(type_, cls, is_subclass):
    """Compare ``safe_issubclass`` with the expected answer for each case.

    Every case pairs a ``typing`` construct with ``object`` and expects
    ``False``.
    """
    verdict = safe_issubclass(type_, cls)
    assert verdict == is_subclass