From adf866164dd29c851295868a9e0507509dc9b7e7 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 7 Jun 2022 11:06:45 +0200 Subject: [PATCH 01/10] fix: cast column to its correct type before insertion in database --- docarray/array/storage/annlite/backend.py | 6 +++++- docarray/array/storage/base/backend.py | 5 ++++- docarray/array/storage/weaviate/backend.py | 7 +++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index 4523c057db0..4ee14bae822 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -33,7 +33,11 @@ class AnnliteConfig: class BackendMixin(BaseBackendMixin): """Provide necessary functions to enable this storage backend.""" - TYPE_MAP = {'str': 'TEXT', 'float': 'float', 'int': 'integer'} + TYPE_MAP = { + 'str': ('TEXT', str), + 'float': ('float', float), + 'int': ('integer', int), + } def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': if embedding is None: diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index 14f32434aa1..09e5f98a074 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -25,10 +25,13 @@ def _get_storage_infos(self) -> Optional[Dict]: def _map_id(self, _id: str) -> str: return _id + def _map_column(self, value, col_type) -> str: + return self.TYPE_MAP[col_type][1](value) + def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': from ....math.ndarray import to_numpy_array return to_numpy_array(embedding) def _map_type(self, col_type: str) -> str: - return self.TYPE_MAP[col_type] + return self.TYPE_MAP[col_type][0] diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index c693c1cecd9..03b6e39efe5 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -52,7 +52,7 @@ class WeaviateConfig: class BackendMixin(BaseBackendMixin): """Provide necessary functions to enable this storage backend.""" - TYPE_MAP = {'str': 'string', 'float': 'number', 'int': 'int'} + TYPE_MAP = {'str': ('string', str), 'float': ('number', float), 'int': ('int', int)} def _init_storage( self, @@ -310,7 +310,10 @@ def _doc2weaviate_create_payload(self, value: 'Document'): :param value: document to create a payload for :return: the payload dictionary """ - extra_columns = {col: value.tags.get(col) for col, _ in self._config.columns} + extra_columns = { + col: self._map_column(value.tags.get(col), dict(*self._config.columns)[col]) + for col, _ in self._config.columns + } return dict( data_object={ From 969909532c8544ec6633d87efbac4d8facb75e2a Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 7 Jun 2022 16:29:36 +0200 Subject: [PATCH 02/10] refactor: use namedtuple instead of dict --- docarray/array/storage/annlite/backend.py | 8 ++++---- docarray/array/storage/base/backend.py | 9 ++++++--- docarray/array/storage/weaviate/backend.py | 8 ++++++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index 4ee14bae822..b7f89c8554b 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -11,7 +11,7 @@ import numpy as np -from ..base.backend import BaseBackendMixin +from ..base.backend import BaseBackendMixin, TypeMap from ....helper import dataclass_from_dict, filter_dict if TYPE_CHECKING: @@ -34,9 +34,9 @@ class BackendMixin(BaseBackendMixin): """Provide necessary functions to enable this storage backend.""" TYPE_MAP = { - 'str': ('TEXT', str), - 'float': ('float', float), - 'int': ('integer', int), + 'str': TypeMap(type='TEXT', converter=str), + 'float': TypeMap(type='float', converter=float), + 'int': TypeMap(type='integer', converter=int), } def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index 09e5f98a074..1ec44b92e74 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -1,13 +1,16 @@ from abc import ABC +from collections import namedtuple from dataclasses import is_dataclass, asdict from typing import Dict, Optional, TYPE_CHECKING if TYPE_CHECKING: from ....typing import DocumentArraySourceType, ArrayType +TypeMap = namedtuple('TypeMap', ['type', 'converter']) + class BaseBackendMixin(ABC): - TYPE_MAP: Dict + TYPE_MAP: Dict[str, TypeMap] def _init_storage( self, @@ -26,7 +29,7 @@ def _map_id(self, _id: str) -> str: return _id def _map_column(self, value, col_type) -> str: - return self.TYPE_MAP[col_type][1](value) + return self.TYPE_MAP[col_type].converter(value) def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': from ....math.ndarray import to_numpy_array @@ -34,4 +37,4 @@ def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': return to_numpy_array(embedding) def _map_type(self, col_type: str) -> str: - return self.TYPE_MAP[col_type][0] + return self.TYPE_MAP[col_type].type diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 03b6e39efe5..97ff5a0ce67 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -15,7 +15,7 @@ from .... import Document from ....helper import dataclass_from_dict, filter_dict -from ..base.backend import BaseBackendMixin +from ..base.backend import BaseBackendMixin, TypeMap from ..registry import _REGISTRY if TYPE_CHECKING: @@ -52,7 +52,11 @@ class WeaviateConfig: class BackendMixin(BaseBackendMixin): """Provide necessary functions to enable this storage backend.""" - TYPE_MAP = {'str': ('string', str), 'float': ('number', float), 'int': ('int', int)} + TYPE_MAP = { + 'str': TypeMap(type='string', converter=str), + 'float': TypeMap(type='number', converter=float), + 'int': TypeMap(type='int', converter=int), + } def _init_storage( self, From 2a67ec94a34f56841ed5fafcd3814a664a02baa9 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 8 Jun 2022 14:23:50 +0200 Subject: [PATCH 03/10] fix: working dict creation --- docarray/array/storage/weaviate/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 97ff5a0ce67..d30b7536214 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -314,8 +314,9 @@ def _doc2weaviate_create_payload(self, value: 'Document'): :param value: document to create a payload for :return: the payload dictionary """ + columns_dict = {key: val for [key, val] in self._config.columns} extra_columns = { - col: self._map_column(value.tags.get(col), dict(*self._config.columns)[col]) + col: self._map_column(value.tags.get(col), columns_dict[col]) for col, _ in self._config.columns } From d5603a34376c0c9c2258c79716497e8bdc3b6a7c Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 8 Jun 2022 14:24:00 +0200 Subject: [PATCH 04/10] feat: add tests for weaviate --- .../unit/array/test_backend_configuration.py | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index 000dcb79d49..efa7919dcb4 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -1,6 +1,10 @@ +from typing import Tuple, Iterator + +import pytest import requests +import itertools -from docarray import DocumentArray +from docarray import DocumentArray, Document def test_weaviate_hnsw(start_storage): @@ -45,3 +49,49 @@ def test_weaviate_hnsw(start_storage): assert main_class.get('vectorIndexConfig', {}).get('cleanupIntervalSeconds') == 1000 assert main_class.get('vectorIndexConfig', {}).get('skip') is True assert main_class.get('vectorIndexConfig', {}).get('distance') == 'l2-squared' + + +def test_weaviate_da_w_protobuff(start_storage): + + N = 10 + + index = DocumentArray( + storage='weaviate', + config={ + 'name': 'Test', + 'columns': [('price', 'int')], + }, + ) + + docs = DocumentArray([Document(tags={'price': i}) for i in range(N)]) + docs = DocumentArray.from_protobuf( + docs.to_protobuf() + ) # same as streaming the da in jina + + index.extend(docs) + + assert len(index) == N + + +@pytest.mark.parametrize('type_da', [int, float, str]) +@pytest.mark.parametrize('type_column', ['int', 'float', 'str']) +def test_cast_columns_weaviate(start_storage, type_da, type_column, request): + + test_id = request.node.callspec.id.replace( + '-', '' + ) # remove '-' from the test id for the weaviate name + N = 10 + + index = DocumentArray( + storage='weaviate', + config={ + 'name': f'Test{test_id}', + 'columns': [('price', type_column)], + }, + ) + + docs = DocumentArray([Document(tags={'price': type_da(i)}) for i in range(10)]) + + index.extend(docs) + + assert len(index) == N From eb42738833257931cee10be6285c7adc99487658 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 8 Jun 2022 14:28:58 +0200 Subject: [PATCH 05/10] feat: add tests for annlite --- .../unit/array/test_backend_configuration.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index efa7919dcb4..1ad8eba4cbc 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -95,3 +95,24 @@ def test_cast_columns_weaviate(start_storage, type_da, type_column, request): index.extend(docs) assert len(index) == N + + +@pytest.mark.parametrize('type_da', [int, float, str]) +@pytest.mark.parametrize('type_column', ['int', 'float', 'str']) +def test_cast_columns_annlite(start_storage, type_da, type_column): + + N = 10 + + index = DocumentArray( + storage='annlite', + config={ + 'n_dim': 3, + 'columns': [('price', type_column)], + }, + ) + + docs = DocumentArray([Document(tags={'price': type_da(i)}) for i in range(10)]) + + index.extend(docs) + + assert len(index) == N From b7a969355398590d6fccb58e1ffcb195a2b88a8e Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 8 Jun 2022 16:09:05 +0200 Subject: [PATCH 06/10] feat: add tests for qdrant --- .../unit/array/test_backend_configuration.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index 1ad8eba4cbc..84ab30e06c9 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -116,3 +116,27 @@ def test_cast_columns_annlite(start_storage, type_da, type_column): index.extend(docs) assert len(index) == N + + +@pytest.mark.parametrize('type_da', [int, float, str]) +@pytest.mark.parametrize('type_column', ['int', 'float', 'str']) +def test_cast_columns_qdrant(start_storage, type_da, type_column, request): + + test_id = request.node.callspec.id.replace( + '-', '' + ) # remove '-' from the test id for the weaviate name + N = 10 + + index = DocumentArray( + storage='qdrant', + config={ + 'name': f'test{test_id}', + 'columns': [('price', type_column)], + }, + ) + + docs = DocumentArray([Document(tags={'price': type_da(i)}) for i in range(10)]) + + index.extend(docs) + + assert len(index) == N From 10e8c79c79ce4c6ed0cb96763104cc514b0f696f Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 9 Jun 2022 10:23:29 +0200 Subject: [PATCH 07/10] feat: update test --- tests/unit/array/test_backend_configuration.py | 3 ++- tests/unit/test_helper.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index 84ab30e06c9..86ddc969a52 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -130,7 +130,8 @@ def test_cast_columns_qdrant(start_storage, type_da, type_column, request): index = DocumentArray( storage='qdrant', config={ - 'name': f'test{test_id}', + 'collection_name': f'test{test_id}', + 'n_dim': 3, 'columns': [('price', type_column)], }, ) diff --git a/tests/unit/test_helper.py b/tests/unit/test_helper.py index e6bc4f6c4df..bb271f826e8 100644 --- a/tests/unit/test_helper.py +++ b/tests/unit/test_helper.py @@ -8,6 +8,7 @@ add_protocol_and_compress_to_file_path, filter_dict, get_full_version, + _safe_cast_int, ) @@ -61,3 +62,14 @@ def test_filter_dict(): def test_ci_vendor(): if 'GITHUB_WORKFLOW' in os.environ: assert get_full_version()['ci-vendor'] == 'GITHUB_ACTIONS' + + +@pytest.mark.parametrize('input,output', [(1, 1), (1.0, 1), ('1', 1)]) +def test_safe_cast(input, output): + assert output == _safe_cast_int(input) + + +@pytest.mark.parametrize('wrong_input', [1.5, 1.001, 2 / 3]) +def test_safe_cast_raise_error(wrong_input): + with pytest.raises(ValueError): + _safe_cast_int(wrong_input) From c1ece840c8b8263abff7dcc796a752b10f276b3c Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 9 Jun 2022 11:25:16 +0200 Subject: [PATCH 08/10] feat: add safe cast int --- docarray/array/storage/annlite/backend.py | 8 ++++---- docarray/array/storage/weaviate/backend.py | 4 ++-- docarray/helper.py | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index b7f89c8554b..25d737bc69e 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -12,7 +12,7 @@ import numpy as np from ..base.backend import BaseBackendMixin, TypeMap -from ....helper import dataclass_from_dict, filter_dict +from ....helper import dataclass_from_dict, filter_dict, _safe_cast_int if TYPE_CHECKING: from ....typing import DocumentArraySourceType, ArrayType @@ -34,9 +34,9 @@ class BackendMixin(BaseBackendMixin): """Provide necessary functions to enable this storage backend.""" TYPE_MAP = { - 'str': TypeMap(type='TEXT', converter=str), - 'float': TypeMap(type='float', converter=float), - 'int': TypeMap(type='integer', converter=int), + 'str': TypeMap(type='string', converter=str), + 'float': TypeMap(type='number', converter=float), + 'int': TypeMap(type='int', converter=_safe_cast_int), } def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index d30b7536214..2c3763c62b0 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -14,7 +14,7 @@ import weaviate from .... import Document -from ....helper import dataclass_from_dict, filter_dict +from ....helper import dataclass_from_dict, filter_dict, _safe_cast_int from ..base.backend import BaseBackendMixin, TypeMap from ..registry import _REGISTRY @@ -55,7 +55,7 @@ class BackendMixin(BaseBackendMixin): TYPE_MAP = { 'str': TypeMap(type='string', converter=str), 'float': TypeMap(type='number', converter=float), - 'int': TypeMap(type='int', converter=int), + 'int': TypeMap(type='int', converter=_safe_cast_int), } def _init_storage( diff --git a/docarray/helper.py b/docarray/helper.py index 30c074ae833..3be3ecb8473 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -5,7 +5,7 @@ import sys import uuid import warnings -from typing import Any, Dict, Optional, Sequence, Tuple +from typing import Any, Dict, Optional, Sequence, Tuple, Union __resources_path__ = os.path.join( os.path.dirname( @@ -441,3 +441,18 @@ def filter_dict(d: Dict) -> Dict: :return: filtered dict """ return dict(filter(lambda item: item[1] is not None, d.items())) + + +def _safe_cast_int(value: Union[str, int, float]) -> int: + """Safely cast string and float to an integer + It mainly avoids silently rounding down the float value + :param value: value to be cast + :return: cast integer + """ + if type(value) == float: + if value.is_integer(): + return int(value) + else: + raise ValueError(f"Can't safely cast {value} to an int") + else: + return int(value) From fa2f37c11b523a36c8dd685f20c5b33485734fe2 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 9 Jun 2022 11:34:45 +0200 Subject: [PATCH 09/10] feat: improve helper function Co-authored-by: AlaeddineAbdessalem --- docarray/helper.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/docarray/helper.py b/docarray/helper.py index 3be3ecb8473..e97b46d17d0 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -449,10 +449,6 @@ def _safe_cast_int(value: Union[str, int, float]) -> int: :param value: value to be cast :return: cast integer """ - if type(value) == float: - if value.is_integer(): - return int(value) - else: - raise ValueError(f"Can't safely cast {value} to an int") - else: - return int(value) + if isinstance(value, float) and not value.is_integer(): + raise ValueError(f"Can't safely cast {value} to an int") + return int(value) From eef0c206cd6712459c6fcba96cba3e2222d7ba81 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 9 Jun 2022 13:43:45 +0200 Subject: [PATCH 10/10] fix: good type dict for annlite --- docarray/array/storage/annlite/backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index 25d737bc69e..b56e94b5ae8 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -34,9 +34,9 @@ class BackendMixin(BaseBackendMixin): """Provide necessary functions to enable this storage backend.""" TYPE_MAP = { - 'str': TypeMap(type='string', converter=str), - 'float': TypeMap(type='number', converter=float), - 'int': TypeMap(type='int', converter=_safe_cast_int), + 'str': TypeMap(type='TEXT', converter=str), + 'float': TypeMap(type='float', converter=float), + 'int': TypeMap(type='integer', converter=_safe_cast_int), } def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType':