diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 9328a040957..3e8f0a09a12 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -72,7 +72,11 @@ class MyDoc(BaseDoc): https://docs.pydantic.dev/usage/models/) and can be used in a similar way. """ - id: Optional[ID] = Field(default_factory=lambda: ID(os.urandom(16).hex())) + id: Optional[ID] = Field( + description='The ID of the BaseDoc. This is useful for indexing in vector stores. If not set by user, it will automatically be assigned a random value', + default_factory=lambda: ID(os.urandom(16).hex()), + example=os.urandom(16).hex(), + ) class Config: json_loads = orjson.loads diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index b83ce826411..c008fa29de0 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -67,6 +67,9 @@ class ElasticDocIndex(BaseDocIndex, Generic[TSchema]): + _index_vector_params: Optional[Tuple[str]] = ('dims', 'similarity', 'index') + _index_vector_options: Optional[Tuple[str]] = ('m', 'ef_construction') + def __init__(self, db_config=None, **kwargs): """Initialize ElasticDocIndex""" super().__init__(db_config=db_config, **kwargs) @@ -82,9 +85,6 @@ def __init__(self, db_config=None, **kwargs): self._logger.debug('ElasticSearch client has been created') # ElasticSearh index setup - self._index_vector_params = ('dims', 'similarity', 'index') - self._index_vector_options = ('m', 'ef_construction') - mappings: Dict[str, Any] = { 'dynamic': True, '_source': {'enabled': 'true'}, @@ -572,20 +572,23 @@ def _filter_by_parent_id(self, id: str) -> List[str]: # Helpers # ############################################### - def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: + @classmethod + def _create_index_mapping(cls, col: '_ColumnInfo') -> Dict[str, Any]: """Create a new HNSW index for a column, and initialize it.""" index = {'type': col.config['type'] if 'type' in col.config else col.db_type} if col.db_type == 'dense_vector': - for k in self._index_vector_params: - index[k] = col.config[k] + if cls._index_vector_params is not None: + for k in cls._index_vector_params: + index[k] = col.config[k] if col.n_dim: index['dims'] = col.n_dim - index['index_options'] = dict( - (k, col.config[k]) for k in self._index_vector_options - ) - index['index_options']['type'] = 'hnsw' + if cls._index_vector_options is not None: + index['index_options'] = dict( + (k, col.config[k]) for k in cls._index_vector_options + ) + index['index_options']['type'] = 'hnsw' return index def _send_requests( diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py index db6e229bf25..6ff428b0436 100644 --- a/docarray/index/backends/elasticv7.py +++ b/docarray/index/backends/elasticv7.py @@ -1,13 +1,13 @@ import warnings from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union +from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union, Tuple import numpy as np from pydantic import parse_obj_as from docarray import BaseDoc from docarray.index import ElasticDocIndex -from docarray.index.abstract import BaseDocIndex, _ColumnInfo +from docarray.index.abstract import BaseDocIndex from docarray.typing import AnyTensor from docarray.typing.tensor.ndarray import NdArray from docarray.utils.find import _FindResult @@ -17,6 +17,9 @@ class ElasticV7DocIndex(ElasticDocIndex): + _index_vector_params: Optional[Tuple[str]] = ('dims',) + _index_vector_options: Optional[Tuple[str]] = None + def __init__(self, db_config=None, **kwargs): """Initialize ElasticV7DocIndex""" from elasticsearch import __version__ as __es__version__ @@ -130,19 +133,6 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: # Helpers # ############################################### - # ElasticSearch helpers - def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: - """Create a new HNSW index for a column, and initialize it.""" - - index = col.config.copy() - if 'type' not in index: - index['type'] = col.db_type - - if col.db_type == 'dense_vector' and col.n_dim: - index['dims'] = col.n_dim - - return index - def _form_search_body(self, query: np.ndarray, limit: int, search_field: str = '') -> Dict[str, Any]: # type: ignore body = { 'size': limit, diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 8f08ae5c39d..c0ee904fb48 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -115,7 +115,7 @@ def __init__(self, db_config=None, **kwargs): sub_docs_exist = True if safe_issubclass(col.docarray_type, AnyDocArray): continue - if not col.config: + if not col.config or 'dim' not in col.config: # non-tensor type; don't create an index continue if not load_existing and ( diff --git a/tests/index/base_classes/test_base_doc_store.py b/tests/index/base_classes/test_base_doc_store.py index 09f46ee4535..fa9444dcf5e 100644 --- a/tests/index/base_classes/test_base_doc_store.py +++ b/tests/index/base_classes/test_base_doc_store.py @@ -157,7 +157,7 @@ def test_create_columns(): assert index._column_infos['id'].docarray_type == ID assert index._column_infos['id'].db_type == str assert index._column_infos['id'].n_dim is None - assert index._column_infos['id'].config == {'hi': 'there'} + assert index._column_infos['id'].config['hi'] == 'there' assert issubclass(index._column_infos['tens'].docarray_type, AbstractTensor) assert index._column_infos['tens'].db_type == str @@ -171,7 +171,7 @@ def test_create_columns(): assert index._column_infos['id'].docarray_type == ID assert index._column_infos['id'].db_type == str assert index._column_infos['id'].n_dim is None - assert index._column_infos['id'].config == {'hi': 'there'} + assert index._column_infos['id'].config['hi'] == 'there' assert issubclass(index._column_infos['tens_one'].docarray_type, AbstractTensor) assert index._column_infos['tens_one'].db_type == str @@ -190,7 +190,7 @@ def test_create_columns(): assert index._column_infos['id'].docarray_type == ID assert index._column_infos['id'].db_type == str assert index._column_infos['id'].n_dim is None - assert index._column_infos['id'].config == {'hi': 'there'} + assert index._column_infos['id'].config['hi'] == 'there' assert issubclass(index._column_infos['d__tens'].docarray_type, AbstractTensor) assert index._column_infos['d__tens'].db_type == str @@ -214,7 +214,7 @@ def test_create_columns(): assert index._subindices['d']._column_infos['id'].docarray_type == ID assert index._subindices['d']._column_infos['id'].db_type == str assert index._subindices['d']._column_infos['id'].n_dim is None - assert index._subindices['d']._column_infos['id'].config == {'hi': 'there'} + assert index._subindices['d']._column_infos['id'].config['hi'] == 'there' assert issubclass( index._subindices['d']._column_infos['tens'].docarray_type, AbstractTensor @@ -262,10 +262,10 @@ def test_create_columns(): assert ( index._subindices['d_root']._subindices['d']._column_infos['id'].n_dim is None ) - assert index._subindices['d_root']._subindices['d']._column_infos['id'].config == { - 'hi': 'there' - } - + assert ( + index._subindices['d_root']._subindices['d']._column_infos['id'].config['hi'] + == 'there' + ) assert issubclass( index._subindices['d_root'] ._subindices['d']