From c2616fa4230ec356b7b1b816ed02d62d665d8eb1 Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Mon, 31 Jul 2023 12:25:07 +0200 Subject: [PATCH 1/4] feat: add example to ID of BaseDoc --- docarray/base_doc/doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 9328a040957..05ef2e523ea 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -72,7 +72,7 @@ class MyDoc(BaseDoc): https://docs.pydantic.dev/usage/models/) and can be used in a similar way. """ - id: Optional[ID] = Field(default_factory=lambda: ID(os.urandom(16).hex())) + id: Optional[ID] = Field(default_factory=lambda: ID(os.urandom(16).hex()), example=os.urandom(16).hex()) class Config: json_loads = orjson.loads From 3f4e86fbdab2ad556c1b0d2820a9f8789923768b Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Mon, 31 Jul 2023 12:27:48 +0200 Subject: [PATCH 2/4] feat: add example and description to ID of BaseDoc Signed-off-by: Joan Fontanals Martinez --- docarray/base_doc/doc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 05ef2e523ea..3e8f0a09a12 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -72,7 +72,11 @@ class MyDoc(BaseDoc): https://docs.pydantic.dev/usage/models/) and can be used in a similar way. """ - id: Optional[ID] = Field(default_factory=lambda: ID(os.urandom(16).hex()), example=os.urandom(16).hex()) + id: Optional[ID] = Field( + description='The ID of the BaseDoc. This is useful for indexing in vector stores. If not set by user, it will automatically be assigned a random value', + default_factory=lambda: ID(os.urandom(16).hex()), + example=os.urandom(16).hex(), + ) class Config: json_loads = orjson.loads From 46bcf5a5cf02f9c4d1c78debfe3d01a0c35e39da Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Mon, 31 Jul 2023 17:02:16 +0200 Subject: [PATCH 3/4] fix: fix the hnswlib tests Signed-off-by: Joan Fontanals Martinez --- docarray/index/backends/elasticv7.py | 13 ------------- docarray/index/backends/hnswlib.py | 2 +- tests/index/base_classes/test_base_doc_store.py | 16 ++++++++-------- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py index db6e229bf25..96e47da1206 100644 --- a/docarray/index/backends/elasticv7.py +++ b/docarray/index/backends/elasticv7.py @@ -130,19 +130,6 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: # Helpers # ############################################### - # ElasticSearch helpers - def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: - """Create a new HNSW index for a column, and initialize it.""" - - index = col.config.copy() - if 'type' not in index: - index['type'] = col.db_type - - if col.db_type == 'dense_vector' and col.n_dim: - index['dims'] = col.n_dim - - return index - def _form_search_body(self, query: np.ndarray, limit: int, search_field: str = '') -> Dict[str, Any]: # type: ignore body = { 'size': limit, diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 8f08ae5c39d..c0ee904fb48 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -115,7 +115,7 @@ def __init__(self, db_config=None, **kwargs): sub_docs_exist = True if safe_issubclass(col.docarray_type, AnyDocArray): continue - if not col.config: + if not col.config or 'dim' not in col.config: # non-tensor type; don't create an index continue if not load_existing and ( diff --git a/tests/index/base_classes/test_base_doc_store.py b/tests/index/base_classes/test_base_doc_store.py index 09f46ee4535..fa9444dcf5e 100644 --- a/tests/index/base_classes/test_base_doc_store.py +++ b/tests/index/base_classes/test_base_doc_store.py @@ -157,7 +157,7 @@ def test_create_columns(): assert index._column_infos['id'].docarray_type == ID assert index._column_infos['id'].db_type == str assert index._column_infos['id'].n_dim is None - assert index._column_infos['id'].config == {'hi': 'there'} + assert index._column_infos['id'].config['hi'] == 'there' assert issubclass(index._column_infos['tens'].docarray_type, AbstractTensor) assert index._column_infos['tens'].db_type == str @@ -171,7 +171,7 @@ def test_create_columns(): assert index._column_infos['id'].docarray_type == ID assert index._column_infos['id'].db_type == str assert index._column_infos['id'].n_dim is None - assert index._column_infos['id'].config == {'hi': 'there'} + assert index._column_infos['id'].config['hi'] == 'there' assert issubclass(index._column_infos['tens_one'].docarray_type, AbstractTensor) assert index._column_infos['tens_one'].db_type == str @@ -190,7 +190,7 @@ def test_create_columns(): assert index._column_infos['id'].docarray_type == ID assert index._column_infos['id'].db_type == str assert index._column_infos['id'].n_dim is None - assert index._column_infos['id'].config == {'hi': 'there'} + assert index._column_infos['id'].config['hi'] == 'there' assert issubclass(index._column_infos['d__tens'].docarray_type, AbstractTensor) assert index._column_infos['d__tens'].db_type == str @@ -214,7 +214,7 @@ def test_create_columns(): assert index._subindices['d']._column_infos['id'].docarray_type == ID assert index._subindices['d']._column_infos['id'].db_type == str assert index._subindices['d']._column_infos['id'].n_dim is None - assert index._subindices['d']._column_infos['id'].config == {'hi': 'there'} + assert index._subindices['d']._column_infos['id'].config['hi'] == 'there' assert issubclass( index._subindices['d']._column_infos['tens'].docarray_type, AbstractTensor @@ -262,10 +262,10 @@ def test_create_columns(): assert ( index._subindices['d_root']._subindices['d']._column_infos['id'].n_dim is None ) - assert index._subindices['d_root']._subindices['d']._column_infos['id'].config == { - 'hi': 'there' - } - + assert ( + index._subindices['d_root']._subindices['d']._column_infos['id'].config['hi'] + == 'there' + ) assert issubclass( index._subindices['d_root'] ._subindices['d'] From c1b4aa5963719be2b433d2fbeef1369a238794ac Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Tue, 1 Aug 2023 02:48:29 +0200 Subject: [PATCH 4/4] fix: fix elastic v7 test Signed-off-by: Joan Fontanals Martinez --- docarray/index/backends/elastic.py | 23 +++++++++++++---------- docarray/index/backends/elasticv7.py | 7 +++++-- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index b83ce826411..c008fa29de0 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -67,6 +67,9 @@ class ElasticDocIndex(BaseDocIndex, Generic[TSchema]): + _index_vector_params: Optional[Tuple[str]] = ('dims', 'similarity', 'index') + _index_vector_options: Optional[Tuple[str]] = ('m', 'ef_construction') + def __init__(self, db_config=None, **kwargs): """Initialize ElasticDocIndex""" super().__init__(db_config=db_config, **kwargs) @@ -82,9 +85,6 @@ def __init__(self, db_config=None, **kwargs): self._logger.debug('ElasticSearch client has been created') # ElasticSearh index setup - self._index_vector_params = ('dims', 'similarity', 'index') - self._index_vector_options = ('m', 'ef_construction') - mappings: Dict[str, Any] = { 'dynamic': True, '_source': {'enabled': 'true'}, @@ -572,20 +572,23 @@ def _filter_by_parent_id(self, id: str) -> List[str]: # Helpers # ############################################### - def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: + @classmethod + def _create_index_mapping(cls, col: '_ColumnInfo') -> Dict[str, Any]: """Create a new HNSW index for a column, and initialize it.""" index = {'type': col.config['type'] if 'type' in col.config else col.db_type} if col.db_type == 'dense_vector': - for k in self._index_vector_params: - index[k] = col.config[k] + if cls._index_vector_params is not None: + for k in cls._index_vector_params: + index[k] = col.config[k] if col.n_dim: index['dims'] = col.n_dim - index['index_options'] = dict( - (k, col.config[k]) for k in self._index_vector_options - ) - index['index_options']['type'] = 'hnsw' + if cls._index_vector_options is not None: + index['index_options'] = dict( + (k, col.config[k]) for k in cls._index_vector_options + ) + index['index_options']['type'] = 'hnsw' return index def _send_requests( diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py index 96e47da1206..6ff428b0436 100644 --- a/docarray/index/backends/elasticv7.py +++ b/docarray/index/backends/elasticv7.py @@ -1,13 +1,13 @@ import warnings from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union +from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union, Tuple import numpy as np from pydantic import parse_obj_as from docarray import BaseDoc from docarray.index import ElasticDocIndex -from docarray.index.abstract import BaseDocIndex, _ColumnInfo +from docarray.index.abstract import BaseDocIndex from docarray.typing import AnyTensor from docarray.typing.tensor.ndarray import NdArray from docarray.utils.find import _FindResult @@ -17,6 +17,9 @@ class ElasticV7DocIndex(ElasticDocIndex): + _index_vector_params: Optional[Tuple[str]] = ('dims',) + _index_vector_options: Optional[Tuple[str]] = None + def __init__(self, db_config=None, **kwargs): """Initialize ElasticV7DocIndex""" from elasticsearch import __version__ as __es__version__