diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index be9984fc153..e06a3b9df21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -271,6 +271,14 @@ This allows: * the reviewer to be very confident that the feature does what it is supposed to do before merging it into the code base. * the contributors to be sure that they don't break already-merged features when refactoring or modifying the code base. + +## Enable logging +If you need to monitor and debug your code, you can enable docarray logging: +```python +import logging +logging.getLogger('docarray').setLevel(logging.DEBUG) +``` + ## Compiling protobuf diff --git a/README.md b/README.md index 0f11bafe950..7a2b56b59b3 100644 --- a/README.md +++ b/README.md @@ -444,6 +444,38 @@ match = store.find( ) ``` +## Enable logging +You can see more logs by setting the log level to `DEBUG` or `INFO`. + +Example: +```python +from pydantic import Field +from docarray import BaseDocument +from docarray.doc_index.backends.hnswlib_doc_index import HnswDocumentIndex +from docarray.typing import NdArray +# import logging and set the level to DEBUG +import logging +logging.getLogger('docarray').setLevel(logging.DEBUG) + + +# define a simple document and create a document index +class SimpleDoc(BaseDocument): + vector: NdArray = Field(dim=10) + +doc_store = HnswDocumentIndex[SimpleDoc](work_dir='temp_path/') +``` + +```console +INFO - docarray - DB config created +INFO - docarray - Runtime config created +DEBUG - docarray - Working directory set to temp_path/ +WARNING - docarray - No index was created for `id` as it does not have a config +INFO - docarray - Created a new index for column `vector` +DEBUG - docarray - DB path set to temp_path/docs_sqlite.db +INFO - docarray - Connection to DB has been established +INFO - docarray - HnswDocumentIndex[SimpleDoc] has been initialized +``` + ## Install the alpha to try out the alpha you can install it via git: diff --git a/docarray/__init__.py b/docarray/__init__.py index 5e3e700a61f..03e65750c14 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -2,5 +2,13 @@ from docarray.array import DocumentArray, DocumentArrayStacked from docarray.base_document.document import BaseDocument +import logging __all__ = ['BaseDocument', 'DocumentArray', 'DocumentArrayStacked'] + +logger = logging.getLogger('docarray') + +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) diff --git a/docarray/doc_index/abstract_doc_index.py b/docarray/doc_index/abstract_doc_index.py index 6a527b43bbb..e1fd847c841 100644 --- a/docarray/doc_index/abstract_doc_index.py +++ b/docarray/doc_index/abstract_doc_index.py @@ -27,6 +27,7 @@ from docarray.typing import AnyTensor from docarray.utils.find import FindResult, _FindResult from docarray.utils.misc import torch_imported +import logging if TYPE_CHECKING: from pydantic.fields import ModelField @@ -87,10 +88,13 @@ def __init__(self, db_config=None, **kwargs): 'A DocumentIndex must be typed with a Document type.' 'To do so, use the syntax: DocumentIndex[DocumentType]' ) + self._logger = logging.getLogger('docarray') self._db_config = db_config or self.DBConfig(**kwargs) if not isinstance(self._db_config, self.DBConfig): raise ValueError(f'db_config must be of type {self.DBConfig}') + self._logger.info('DB config created') self._runtime_config = self.RuntimeConfig() + self._logger.info('Runtime config created') self._column_infos: Dict[str, _ColumnInfo] = self._create_column_infos( self._schema ) @@ -346,6 +350,7 @@ def __delitem__(self, key: Union[str, Sequence[str]]): :param key: id or ids to delete from the Document index """ + self._logger.info(f'Deleting documents with id(s) {key} from the index') if isinstance(key, str): key = [key] self._del_items(key) @@ -371,10 +376,15 @@ def configure(self, runtime_config=None, **kwargs): def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): """index Documents into the index. - :param docs: Documents to index. NOTE: passing a Sequence of Documents that is - not a DocumentArray comes at a performance penalty, since compatibility - with the Index's schema need to be checked for every Document individually. + :param docs: Documents to index. """ + if not isinstance(docs, (BaseDocument, DocumentArray)): + self._logger.warning( + 'Passing a sequence of Documents that is not a DocumentArray comes at ' + 'a performance penalty, since compatibility with the schema of Index ' + 'needs to be checked for every Document individually.' + ) + self._logger.debug(f'Indexing {len(docs)} documents') docs_validated = self._validate_docs(docs) data_by_columns = self._get_col_value_dict(docs_validated) self._index(data_by_columns, **kwargs) @@ -397,6 +407,7 @@ def find( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ + self._logger.debug(f'Executing `find` for search field {search_field}') if isinstance(query, BaseDocument): query_vec = self._get_values_by_column([query], search_field)[0] else: @@ -430,6 +441,7 @@ def find_batched( :param limit: maximum number of documents to return per query :return: a named tuple containing `documents` and `scores` """ + self._logger.debug(f'Executing `find_batched` for search field {search_field}') if isinstance(queries, Sequence): query_vec_list = self._get_values_by_column(queries, search_field) query_vec_np = np.stack( @@ -459,6 +471,7 @@ def filter( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ + self._logger.debug(f'Executing `filter` for the query {filter_query}') docs = self._filter(filter_query, limit=limit, **kwargs) if isinstance(docs, List): @@ -478,6 +491,9 @@ def filter_batched( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ + self._logger.debug( + f'Executing `filter_batched` for the queries {filter_queries}' + ) da_list = self._filter_batched(filter_queries, limit=limit, **kwargs) if len(da_list) > 0 and isinstance(da_list[0], List): @@ -499,6 +515,7 @@ def text_search( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ + self._logger.debug(f'Executing `text_search` for search field {search_field}') if isinstance(query, BaseDocument): query_text = self._get_values_by_column([query], search_field)[0] else: @@ -526,6 +543,9 @@ def text_search_batched( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ + self._logger.debug( + f'Executing `text_search_batched` for search field {search_field}' + ) if isinstance(queries[0], BaseDocument): query_docs: Sequence[BaseDocument] = cast(Sequence[BaseDocument], queries) query_texts: Sequence[str] = self._get_values_by_column( diff --git a/docarray/doc_index/backends/hnswlib_doc_index.py b/docarray/doc_index/backends/hnswlib_doc_index.py index 3c8c2df14e3..3d6923fff80 100644 --- a/docarray/doc_index/backends/hnswlib_doc_index.py +++ b/docarray/doc_index/backends/hnswlib_doc_index.py @@ -63,6 +63,7 @@ def __init__(self, db_config=None, **kwargs): super().__init__(db_config=db_config, **kwargs) self._db_config = cast(HnswDocumentIndex.DBConfig, self._db_config) self._work_dir = self._db_config.work_dir + self._logger.debug(f'Working directory set to {self._work_dir}') load_existing = os.path.exists(self._work_dir) and os.listdir(self._work_dir) Path(self._work_dir).mkdir(parents=True, exist_ok=True) @@ -83,18 +84,26 @@ def __init__(self, db_config=None, **kwargs): self._hnsw_indices = {} for col_name, col in self._column_infos.items(): if not col.config: - continue # do not create column index if no config is given + self._logger.warning( + f'No index was created for `{col_name}` as it does not have a config' + ) + continue if load_existing: self._hnsw_indices[col_name] = self._load_index(col_name, col) + self._logger.info(f'Loading an existing index for column `{col_name}`') else: self._hnsw_indices[col_name] = self._create_index(col) + self._logger.info(f'Created a new index for column `{col_name}`') # SQLite setup self._sqlite_db_path = os.path.join(self._work_dir, 'docs_sqlite.db') + self._logger.debug(f'DB path set to {self._sqlite_db_path}') self._sqlite_conn = sqlite3.connect(self._sqlite_db_path) + self._logger.info('Connection to DB has been established') self._sqlite_cursor = self._sqlite_conn.cursor() self._create_docs_table() self._sqlite_conn.commit() + self._logger.info(f'{self.__class__.__name__} has been initialized') ############################################### # Inner classes for query builder and configs # @@ -161,10 +170,11 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): """index a document into the store""" if kwargs: raise ValueError(f'{list(kwargs.keys())} are not valid keyword arguments') + + self._logger.debug(f'Indexing {len(docs)} documents') docs_validated = self._validate_docs(docs) data_by_columns = self._get_col_value_dict(docs_validated) hashed_ids = tuple(self._to_hashed_id(doc.id) for doc in docs_validated) - # indexing into HNSWLib and SQLite sequentially # could be improved by processing in parallel for col_name, index in self._hnsw_indices.items(): @@ -196,6 +206,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: elif op == 'filter': filter_conditions.append(op_kwargs['filter_query']) + self._logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: da_cls = DocumentArray.__class_getitem__( @@ -203,6 +214,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: ) docs_filtered = da_cls(filter_docs(docs_filtered, cond)) + self._logger.debug(f'{len(docs_filtered)} results found') docs_and_scores = zip( docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) ) @@ -240,7 +252,6 @@ def _filter( filter_query: Any, limit: int, ) -> DocumentArray: - raise NotImplementedError( f'{type(self)} does not support filter-only queries.' f' To perform post-filtering on a query, use' diff --git a/tests/doc_index/conftest.py b/tests/doc_index/conftest.py new file mode 100644 index 00000000000..497a740ae43 --- /dev/null +++ b/tests/doc_index/conftest.py @@ -0,0 +1,8 @@ +import pytest +import logging + + +@pytest.fixture(autouse=True) +def set_logger_level(): + logger = logging.getLogger('docarray') + logger.setLevel(logging.INFO)