From ee0bcff70a811f801a41956a8bfaa8cbf38bdcee Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 28 Feb 2022 18:14:00 +0100 Subject: [PATCH 1/7] fix(weaviate): remove ndim requirement in weaviate --- docarray/array/storage/base/getsetdel.py | 3 ++- docarray/array/storage/weaviate/backend.py | 27 +++++----------------- docs/advanced/document-store/weaviate.md | 2 +- tests/unit/array/docker-compose.yml | 2 +- 4 files changed, 10 insertions(+), 24 deletions(-) diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index e6ca6a6b735..adc36f4c915 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -281,4 +281,5 @@ def _save_offset2ids(self): ... def __del__(self): - self._save_offset2ids() + if hasattr(self, '_offset2ids'): + self._save_offset2ids() diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 5c711cf6d75..f8464842af2 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -33,10 +33,9 @@ class WeaviateConfig: """This class stores the config variables to initialize connection to the Weaviate server""" - n_dim: int - host: Optional[str] = field(default="localhost") + host: Optional[str] = field(default='localhost') port: Optional[int] = field(default=8080) - protocol: Optional[int] = field(default="http") + protocol: Optional[str] = field(default='http') name: Optional[str] = None serialize_config: Dict = field(default_factory=dict) @@ -60,13 +59,12 @@ def _init_storage( """ if not config: - raise ValueError('Config object must be specified') + config = WeaviateConfig() elif isinstance(config, dict): config = dataclass_from_dict(WeaviateConfig, config) from ... import DocumentArray - self._n_dim = config.n_dim self._serialize_config = config.serialize_config if config.name and config.name != config.name.capitalize(): @@ -278,25 +276,12 @@ def _doc2weaviate_create_payload(self, value: 'Document'): :param value: document to create a payload for :return: the payload dictionary """ - if value.embedding is None: - embedding = np.zeros(self._n_dim) - else: + if value.embedding is not None: from ....math.ndarray import to_numpy_array embedding = to_numpy_array(value.embedding) - - if embedding.ndim > 1: - embedding = np.asarray(embedding).squeeze() - if embedding.shape != (self._n_dim,): - raise ValueError( - f'All documents must have embedding of shape n_dim: {self._n_dim}, receiving shape: {embedding.shape}' - ) - - # Weaviate expects vector to have dim 2 at least - # or get weaviate.exceptions.UnexpectedStatusCodeException: models.C11yVector - # hence we cast it to list of a single element - if len(embedding) == 1: - embedding = [embedding[0]] + else: + embedding = None return dict( data_object={'_serialized': value.to_base64(**self._serialize_config)}, diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index 8848b84480f..345762df99e 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -21,7 +21,7 @@ services: - '8080' - --scheme - http - image: semitechnologies/weaviate:1.9.0 + image: semitechnologies/weaviate:1.10.0 ports: - 8080:8080 restart: on-failure:0 diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index 175777badd7..21806311f3f 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml @@ -1,7 +1,7 @@ version: "3.3" services: weaviate: - image: semitechnologies/weaviate:1.9.0 + image: semitechnologies/weaviate:1.10.0 ports: - 8080:8080 environment: From 93d9aa316857d3343540966ca4aa8415b7740049 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 28 Feb 2022 18:19:36 +0100 Subject: [PATCH 2/7] docs(weaviate): remove ndim requirement in weaviate --- docs/advanced/document-store/weaviate.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index 345762df99e..e9614e1ca8c 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -48,7 +48,7 @@ Assuming service is started using the default configuration (i.e. server address ```python from docarray import DocumentArray -da = DocumentArray(storage='weaviate', config={'n_dim': 10}) +da = DocumentArray(storage='weaviate') ``` The usage would be the same as the ordinary DocumentArray. @@ -60,7 +60,7 @@ Note, that the `name` parameter in `config` needs to be capitalized. ```python from docarray import DocumentArray -da = DocumentArray(storage='weaviate', config={'name': 'Persisted', 'host': 'localhost', 'port': 1234, 'n_dim': 10}) +da = DocumentArray(storage='weaviate', config={'name': 'Persisted', 'host': 'localhost', 'port': 1234}) da.summary() ``` @@ -73,7 +73,6 @@ The following configs can be set: | Name | Description | Default | |--------------------|----------------------------------------------------------------------------------------|-----------------------------| -| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | | `host` | Hostname of the Weaviate server | 'localhost' | | `port` | port of the Weaviate server | 8080 | | `protocol` | protocol to be used. Can be 'http' or 'https' | 'http' | From ae891221890dbd3136aa7f62846b55066238cfe0 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 28 Feb 2022 18:31:41 +0100 Subject: [PATCH 3/7] docs(weaviate): remove ndim requirement in weaviate --- docarray/array/storage/weaviate/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index f8464842af2..c5a5fa5cc9f 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -38,6 +38,7 @@ class WeaviateConfig: protocol: Optional[str] = field(default='http') name: Optional[str] = None serialize_config: Dict = field(default_factory=dict) + n_dim: Optional[int] = None # deprecated, not used anymore since weaviate 1.10 class BackendMixin(BaseBackendMixin): @@ -186,7 +187,7 @@ def _update_offset2ids_meta(self): ) else: self._offset2ids_wid = str(uuid.uuid1()) - self._client.data_object.create( + self._client.data_object.up( data_object={'_offset2ids': self._offset2ids.ids}, class_name=self._meta_name, uuid=self._offset2ids_wid, From c0a4f027892b30e639b2ec1a1e4ab40a5af8fe52 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 28 Feb 2022 20:07:25 +0100 Subject: [PATCH 4/7] docs(weaviate): remove ndim requirement in weaviate --- docarray/array/storage/weaviate/backend.py | 2 +- docarray/array/storage/weaviate/getsetdel.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index c5a5fa5cc9f..a811302540a 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -187,7 +187,7 @@ def _update_offset2ids_meta(self): ) else: self._offset2ids_wid = str(uuid.uuid1()) - self._client.data_object.up( + self._client.data_object.create( data_object={'_offset2ids': self._offset2ids.ids}, class_name=self._meta_name, uuid=self._offset2ids_wid, diff --git a/docarray/array/storage/weaviate/getsetdel.py b/docarray/array/storage/weaviate/getsetdel.py index 2998cdbe7bb..2d71b965845 100644 --- a/docarray/array/storage/weaviate/getsetdel.py +++ b/docarray/array/storage/weaviate/getsetdel.py @@ -37,11 +37,12 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): """ if _id != value.id: self._del_doc_by_id(_id) - wid = self._wmap(value.id) + payload = self._doc2weaviate_create_payload(value) - if self._client.data_object.exists(wid): - self._client.data_object.delete(wid) - self._client.data_object.create(**payload) + if self._client.data_object.exists(payload['uuid']): + self._client.data_object.update(**payload) + else: + self._client.data_object.create(**payload) def _del_doc_by_id(self, _id: str): """Concrete implementation of base class' ``_del_doc_by_id`` From 04f4c078ad5449127f69bb526de11c8ba6ff4a76 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 28 Feb 2022 20:33:40 +0100 Subject: [PATCH 5/7] docs(weaviate): remove ndim requirement in weaviate --- docarray/array/storage/weaviate/getsetdel.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docarray/array/storage/weaviate/getsetdel.py b/docarray/array/storage/weaviate/getsetdel.py index 2d71b965845..f3a0da4af0e 100644 --- a/docarray/array/storage/weaviate/getsetdel.py +++ b/docarray/array/storage/weaviate/getsetdel.py @@ -14,12 +14,13 @@ def _getitem(self, wid: str) -> 'Document': :raises KeyError: raise error when weaviate id does not exist in storage :return: Document """ - resp = self._client.data_object.get_by_id(wid, with_vector=True) - if not resp: - raise KeyError(wid) - return Document.from_base64( - resp['properties']['_serialized'], **self._serialize_config - ) + try: + resp = self._client.data_object.get_by_id(wid, with_vector=True) + return Document.from_base64( + resp['properties']['_serialized'], **self._serialize_config + ) + except Exception as ex: + raise KeyError(wid) from ex def _get_doc_by_id(self, _id: str) -> 'Document': """Concrete implementation of base class' ``_get_doc_by_id`` @@ -40,9 +41,8 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): payload = self._doc2weaviate_create_payload(value) if self._client.data_object.exists(payload['uuid']): - self._client.data_object.update(**payload) - else: - self._client.data_object.create(**payload) + self._client.data_object.delete(payload['uuid']) + self._client.data_object.create(**payload) def _del_doc_by_id(self, _id: str): """Concrete implementation of base class' ``_del_doc_by_id`` From 6f5c9aeb7f7dd594f99f483e45a8590a9a79184b Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 28 Feb 2022 20:36:43 +0100 Subject: [PATCH 6/7] docs(weaviate): remove ndim requirement in weaviate --- docarray/array/storage/weaviate/backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index a811302540a..13aabb03ca8 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -281,6 +281,15 @@ def _doc2weaviate_create_payload(self, value: 'Document'): from ....math.ndarray import to_numpy_array embedding = to_numpy_array(value.embedding) + + if embedding.ndim > 1: + embedding = np.asarray(embedding).squeeze() + + # Weaviate expects vector to have dim 2 at least + # or get weaviate.exceptions.UnexpectedStatusCodeException: models.C11yVector + # hence we cast it to list of a single element + if len(embedding) == 1: + embedding = [embedding[0]] else: embedding = None From 77ee8d1a070aa743fdc4b71e7a9ea9f680746f9b Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 28 Feb 2022 20:43:27 +0100 Subject: [PATCH 7/7] docs(weaviate): remove ndim requirement in weaviate --- tests/unit/array/mixins/test_content.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/unit/array/mixins/test_content.py b/tests/unit/array/mixins/test_content.py index d38a423aa0c..8659341e1b9 100644 --- a/tests/unit/array/mixins/test_content.py +++ b/tests/unit/array/mixins/test_content.py @@ -147,16 +147,3 @@ def test_embeddings_setter(da_len, da_cls, config, start_storage): da.embeddings = np.random.rand(da_len, 5) for doc in da: assert doc.embedding.shape == (5,) - - -@pytest.mark.parametrize('da_len', [0, 1]) -@pytest.mark.parametrize('da_cls', [DocumentArrayWeaviate]) -@pytest.mark.parametrize( - 'config, n_dim', [({'n_dim': 1}, 1), (WeaviateConfig(n_dim=5), 5)] -) -def test_content_by_config(da_len, da_cls, config, n_dim): - with pytest.raises(ValueError): - da_cls(da_len) - - da = da_cls.empty(da_len, config=config) - assert da._n_dim == n_dim