From e9ab146433c05408a16adef88fbec1f6f3346276 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 20 Mar 2023 09:38:37 +0100 Subject: [PATCH 01/10] feat: add minimal logger Signed-off-by: jupyterjazz --- docarray/__init__.py | 11 +++++++++++ docarray/doc_index/abstract_doc_index.py | 15 +++++++++++++++ .../doc_index/backends/hnswlib_doc_index.py | 19 +++++++++++++++++-- tests/doc_index/conftest.py | 8 ++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 tests/doc_index/conftest.py diff --git a/docarray/__init__.py b/docarray/__init__.py index 20f1af1dbc7..886c8e3cfae 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -7,3 +7,14 @@ 'BaseDocument', 'DocumentArray', ] + +import logging + +logger = logging.getLogger('docarray') + + +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") + +handler.setFormatter(formatter) +logger.addHandler(handler) diff --git a/docarray/doc_index/abstract_doc_index.py b/docarray/doc_index/abstract_doc_index.py index 0a066bbbc8c..e91783a73bc 100644 --- a/docarray/doc_index/abstract_doc_index.py +++ b/docarray/doc_index/abstract_doc_index.py @@ -26,6 +26,11 @@ from docarray.utils.find import FindResult, _FindResult from docarray.utils.misc import torch_imported +import logging + +logger = logging.getLogger(__name__) + + if TYPE_CHECKING: from pydantic.fields import ModelField @@ -88,7 +93,9 @@ def __init__(self, db_config=None, **kwargs): self._db_config = db_config or self.DBConfig(**kwargs) if not isinstance(self._db_config, self.DBConfig): raise ValueError(f'db_config must be of type {self.DBConfig}') + logger.info('DB config created') self._runtime_config = self.RuntimeConfig() + logger.info('Runtime config created') self._column_infos: Dict[str, _ColumnInfo] = self._create_columns(self._schema) ############################################### @@ -342,6 +349,7 @@ def __delitem__(self, key: Union[str, Sequence[str]]): :param key: id or ids to delete from the Document Index """ + logger.info(f'Deleting documents with id(s) {key} from the index') if isinstance(key, str): key = [key] self._del_items(key) @@ -369,6 +377,7 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): :param docs: Documents to index """ + logger.info(f'Indexing {len(docs)} documents') data_by_columns = self._get_col_value_dict(docs) self._index(data_by_columns, **kwargs) @@ -390,6 +399,7 @@ def find( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ + logger.info(f'Executing `find` for search field {search_field}') if isinstance(query, BaseDocument): query_vec = self._get_values_by_column([query], search_field)[0] else: @@ -423,6 +433,7 @@ def find_batched( :param limit: maximum number of documents to return per query :return: a named tuple containing `documents` and `scores` """ + logger.info(f'Executing `find_batched` for search field {search_field}') if isinstance(queries, Sequence): query_vec_list = self._get_values_by_column(queries, search_field) query_vec_np = np.stack( @@ -452,6 +463,7 @@ def filter( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ + logger.info(f'Executing `filter` for the query {filter_query}') docs = self._filter(filter_query, limit=limit, **kwargs) if isinstance(docs, List): @@ -471,6 +483,7 @@ def filter_batched( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ + logger.info(f'Executing `filter_batched` for the queries {filter_queries}') da_list = self._filter_batched(filter_queries, limit=limit, **kwargs) if len(da_list) > 0 and isinstance(da_list[0], List): @@ -492,6 +505,7 @@ def text_search( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ + logger.info(f'Executing `text_search` for search field {search_field}') if isinstance(query, BaseDocument): query_text = self._get_values_by_column([query], search_field)[0] else: @@ -519,6 +533,7 @@ def text_search_batched( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ + logger.info(f'Executing `text_search_batched` for search field {search_field}') if isinstance(queries[0], BaseDocument): query_docs: Sequence[BaseDocument] = cast(Sequence[BaseDocument], queries) query_texts: Sequence[str] = self._get_values_by_column( diff --git a/docarray/doc_index/backends/hnswlib_doc_index.py b/docarray/doc_index/backends/hnswlib_doc_index.py index d80e022e5f5..af03886fd1b 100644 --- a/docarray/doc_index/backends/hnswlib_doc_index.py +++ b/docarray/doc_index/backends/hnswlib_doc_index.py @@ -33,6 +33,9 @@ from docarray.utils.filter import filter as da_filter from docarray.utils.find import _FindResult from docarray.utils.misc import is_np_int, torch_imported +import logging + +logger = logging.getLogger('docarray') TSchema = TypeVar('TSchema', bound=BaseDocument) T = TypeVar('T', bound='HnswDocumentIndex') @@ -63,6 +66,7 @@ def __init__(self, db_config=None, **kwargs): super().__init__(db_config=db_config, **kwargs) self._db_config = cast(HnswDocumentIndex.DBConfig, self._db_config) self._work_dir = self._db_config.work_dir + logger.debug(f'Working directory set to {self._work_dir}') load_existing = os.path.exists(self._work_dir) and os.listdir(self._work_dir) Path(self._work_dir).mkdir(parents=True, exist_ok=True) @@ -83,18 +87,26 @@ def __init__(self, db_config=None, **kwargs): self._hnsw_indices = {} for col_name, col in self._column_infos.items(): if not col.config: - continue # do not create column index if no config is given + logger.warning( + f'No index was created for {col_name} as it does not have a config' + ) + continue if load_existing: self._hnsw_indices[col_name] = self._load_index(col_name, col) + logger.info(f'Loading an existing index for column {col_name}') else: self._hnsw_indices[col_name] = self._create_index(col) + logger.info(f'Created a new index for column {col_name}') # SQLite setup self._sqlite_db_path = os.path.join(self._work_dir, 'docs_sqlite.db') + logger.debug(f'DB path set to {self._sqlite_db_path}') self._sqlite_conn = sqlite3.connect(self._sqlite_db_path) + logger.info('Connection to DB has been established') self._sqlite_cursor = self._sqlite_conn.cursor() self._create_docs_table() self._sqlite_conn.commit() + logger.info(f'{self.__class__.__name__} has been initialized') ############################################### # Inner classes for query builder and configs # @@ -161,6 +173,8 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): """Index a document into the store""" if kwargs: raise ValueError(f'{list(kwargs.keys())} are not valid keyword arguments') + + logger.info(f'Indexing {len(docs)} documents') doc_seq = docs if isinstance(docs, Sequence) else [docs] data_by_columns = self._get_col_value_dict(doc_seq) hashed_ids = tuple(self._to_hashed_id(doc.id) for doc in doc_seq) @@ -196,6 +210,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: elif op == 'filter': filter_conditions.append(op_kwargs['filter_query']) + logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: da_cls = DocumentArray.__class_getitem__( @@ -203,6 +218,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: ) docs_filtered = da_cls(da_filter(docs_filtered, cond)) + logger.info(f'{len(docs_filtered)} results found') docs_and_scores = zip( docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) ) @@ -236,7 +252,6 @@ def _filter( filter_query: Any, limit: int, ) -> DocumentArray: - raise NotImplementedError( f'{type(self)} does not support filter-only queries.' f' To perform post-filtering on a query, use' diff --git a/tests/doc_index/conftest.py b/tests/doc_index/conftest.py new file mode 100644 index 00000000000..483853d5fe6 --- /dev/null +++ b/tests/doc_index/conftest.py @@ -0,0 +1,8 @@ +import pytest +import logging + + +@pytest.fixture(autouse=True) +def set_logger_level(): + logger = logging.getLogger('docarray') + logger.setLevel(logging.DEBUG) From 48ab1087d77d4878ee0da74940117faaaf8e8ff5 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 20 Mar 2023 10:07:31 +0100 Subject: [PATCH 02/10] docs: add an example in contributing Signed-off-by: jupyterjazz --- CONTRIBUTING.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a6c14bb4dad..b01d0952e08 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -244,6 +244,36 @@ This allows: * the reviewer to be very confident that the feature does what it is supposed to do before merging it into the code base. * the contributors to be sure that they don't break already-merged features when refactoring or modifying the code base. + +## Enabling logging +See more logs about your code by setting the log level to `DEBUG`. + +Example: +```python +import ... +# import logging and set the level to DEBUG +import logging +logging.getLogger('docarray').setLevel(logging.DEBUG) + + +# define a simple document and create a document index +class SimpleDoc(BaseDocument): + vector: NdArray = Field(dim=10) + +doc_store = HnswDocumentIndex[SimpleDoc](work_dir='temp_path/') +``` + +```bash +INFO - docarray.doc_index.abstract_doc_index - DB config created +INFO - docarray.doc_index.abstract_doc_index - Runtime config created +DEBUG - docarray - Working directory set to temp_path/ +WARNING - docarray - No index was created for id as it does not have a config +INFO - docarray - Created a new index for column vector +DEBUG - docarray - DB path set to temp_path/docs_sqlite.db +INFO - docarray - Connection to DB has been established +INFO - docarray - HnswDocumentIndex[SimpleDoc] has been initialized +``` + ## Compiling protobuf From 2fe912c4d51df07e1d8ca4fc0db61b1bbd65118c Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 20 Mar 2023 10:09:17 +0100 Subject: [PATCH 03/10] docs: try different format Signed-off-by: jupyterjazz --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b01d0952e08..b1e2d5127d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -246,7 +246,7 @@ This allows: ## Enabling logging -See more logs about your code by setting the log level to `DEBUG`. +See more logs by setting the log level to `DEBUG`. Example: ```python @@ -263,7 +263,7 @@ class SimpleDoc(BaseDocument): doc_store = HnswDocumentIndex[SimpleDoc](work_dir='temp_path/') ``` -```bash +```console INFO - docarray.doc_index.abstract_doc_index - DB config created INFO - docarray.doc_index.abstract_doc_index - Runtime config created DEBUG - docarray - Working directory set to temp_path/ From 4a781087e0085a09134aaaf56d765f3c64203c20 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 20 Mar 2023 10:20:34 +0100 Subject: [PATCH 04/10] refactor: set one logger name Signed-off-by: jupyterjazz --- CONTRIBUTING.md | 4 ++-- docarray/doc_index/abstract_doc_index.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b1e2d5127d0..b5a50337185 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -264,8 +264,8 @@ doc_store = HnswDocumentIndex[SimpleDoc](work_dir='temp_path/') ``` ```console -INFO - docarray.doc_index.abstract_doc_index - DB config created -INFO - docarray.doc_index.abstract_doc_index - Runtime config created +INFO - docarray - DB config created +INFO - docarray - Runtime config created DEBUG - docarray - Working directory set to temp_path/ WARNING - docarray - No index was created for id as it does not have a config INFO - docarray - Created a new index for column vector diff --git a/docarray/doc_index/abstract_doc_index.py b/docarray/doc_index/abstract_doc_index.py index e91783a73bc..d6978a051a0 100644 --- a/docarray/doc_index/abstract_doc_index.py +++ b/docarray/doc_index/abstract_doc_index.py @@ -28,7 +28,7 @@ import logging -logger = logging.getLogger(__name__) +logger = logging.getLogger('docarray') if TYPE_CHECKING: From a61eac2107be3f2e19ef5fd13d3f22312cba2440 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 20 Mar 2023 10:35:33 +0100 Subject: [PATCH 05/10] docs: add quotes Signed-off-by: jupyterjazz --- CONTRIBUTING.md | 4 ++-- docarray/doc_index/backends/hnswlib_doc_index.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b5a50337185..2ae58390e83 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -267,8 +267,8 @@ doc_store = HnswDocumentIndex[SimpleDoc](work_dir='temp_path/') INFO - docarray - DB config created INFO - docarray - Runtime config created DEBUG - docarray - Working directory set to temp_path/ -WARNING - docarray - No index was created for id as it does not have a config -INFO - docarray - Created a new index for column vector +WARNING - docarray - No index was created for `id` as it does not have a config +INFO - docarray - Created a new index for column `vector` DEBUG - docarray - DB path set to temp_path/docs_sqlite.db INFO - docarray - Connection to DB has been established INFO - docarray - HnswDocumentIndex[SimpleDoc] has been initialized diff --git a/docarray/doc_index/backends/hnswlib_doc_index.py b/docarray/doc_index/backends/hnswlib_doc_index.py index af03886fd1b..78ac02f980e 100644 --- a/docarray/doc_index/backends/hnswlib_doc_index.py +++ b/docarray/doc_index/backends/hnswlib_doc_index.py @@ -88,15 +88,15 @@ def __init__(self, db_config=None, **kwargs): for col_name, col in self._column_infos.items(): if not col.config: logger.warning( - f'No index was created for {col_name} as it does not have a config' + f'No index was created for `{col_name}` as it does not have a config' ) continue if load_existing: self._hnsw_indices[col_name] = self._load_index(col_name, col) - logger.info(f'Loading an existing index for column {col_name}') + logger.info(f'Loading an existing index for column `{col_name}`') else: self._hnsw_indices[col_name] = self._create_index(col) - logger.info(f'Created a new index for column {col_name}') + logger.info(f'Created a new index for column `{col_name}`') # SQLite setup self._sqlite_db_path = os.path.join(self._work_dir, 'docs_sqlite.db') From 8c243e1587e87d2450bbe637ee6a0d42fa909d54 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 20 Mar 2023 13:54:50 +0100 Subject: [PATCH 06/10] refactor: put logger as a class attr Signed-off-by: jupyterjazz --- docarray/doc_index/abstract_doc_index.py | 29 ++++++++++--------- .../doc_index/backends/hnswlib_doc_index.py | 23 +++++++-------- tests/doc_index/conftest.py | 2 +- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/docarray/doc_index/abstract_doc_index.py b/docarray/doc_index/abstract_doc_index.py index d6978a051a0..ec36f43edd8 100644 --- a/docarray/doc_index/abstract_doc_index.py +++ b/docarray/doc_index/abstract_doc_index.py @@ -25,12 +25,8 @@ from docarray.typing import AnyTensor from docarray.utils.find import FindResult, _FindResult from docarray.utils.misc import torch_imported - import logging -logger = logging.getLogger('docarray') - - if TYPE_CHECKING: from pydantic.fields import ModelField @@ -90,12 +86,13 @@ def __init__(self, db_config=None, **kwargs): 'A DocumentIndex must be typed with a Document type.' 'To do so, use the syntax: DocumentIndex[DocumentType]' ) + self._logger = logging.getLogger('docarray') self._db_config = db_config or self.DBConfig(**kwargs) if not isinstance(self._db_config, self.DBConfig): raise ValueError(f'db_config must be of type {self.DBConfig}') - logger.info('DB config created') + self._logger.info('DB config created') self._runtime_config = self.RuntimeConfig() - logger.info('Runtime config created') + self._logger.info('Runtime config created') self._column_infos: Dict[str, _ColumnInfo] = self._create_columns(self._schema) ############################################### @@ -349,7 +346,7 @@ def __delitem__(self, key: Union[str, Sequence[str]]): :param key: id or ids to delete from the Document Index """ - logger.info(f'Deleting documents with id(s) {key} from the index') + self._logger.info(f'Deleting documents with id(s) {key} from the index') if isinstance(key, str): key = [key] self._del_items(key) @@ -377,7 +374,7 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): :param docs: Documents to index """ - logger.info(f'Indexing {len(docs)} documents') + self._logger.info(f'Indexing {len(docs)} documents') data_by_columns = self._get_col_value_dict(docs) self._index(data_by_columns, **kwargs) @@ -399,7 +396,7 @@ def find( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ - logger.info(f'Executing `find` for search field {search_field}') + self._logger.info(f'Executing `find` for search field {search_field}') if isinstance(query, BaseDocument): query_vec = self._get_values_by_column([query], search_field)[0] else: @@ -433,7 +430,7 @@ def find_batched( :param limit: maximum number of documents to return per query :return: a named tuple containing `documents` and `scores` """ - logger.info(f'Executing `find_batched` for search field {search_field}') + self._logger.info(f'Executing `find_batched` for search field {search_field}') if isinstance(queries, Sequence): query_vec_list = self._get_values_by_column(queries, search_field) query_vec_np = np.stack( @@ -463,7 +460,7 @@ def filter( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ - logger.info(f'Executing `filter` for the query {filter_query}') + self._logger.info(f'Executing `filter` for the query {filter_query}') docs = self._filter(filter_query, limit=limit, **kwargs) if isinstance(docs, List): @@ -483,7 +480,9 @@ def filter_batched( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ - logger.info(f'Executing `filter_batched` for the queries {filter_queries}') + self._logger.info( + f'Executing `filter_batched` for the queries {filter_queries}' + ) da_list = self._filter_batched(filter_queries, limit=limit, **kwargs) if len(da_list) > 0 and isinstance(da_list[0], List): @@ -505,7 +504,7 @@ def text_search( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ - logger.info(f'Executing `text_search` for search field {search_field}') + self._logger.info(f'Executing `text_search` for search field {search_field}') if isinstance(query, BaseDocument): query_text = self._get_values_by_column([query], search_field)[0] else: @@ -533,7 +532,9 @@ def text_search_batched( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ - logger.info(f'Executing `text_search_batched` for search field {search_field}') + self._logger.info( + f'Executing `text_search_batched` for search field {search_field}' + ) if isinstance(queries[0], BaseDocument): query_docs: Sequence[BaseDocument] = cast(Sequence[BaseDocument], queries) query_texts: Sequence[str] = self._get_values_by_column( diff --git a/docarray/doc_index/backends/hnswlib_doc_index.py b/docarray/doc_index/backends/hnswlib_doc_index.py index 78ac02f980e..cd4019d7167 100644 --- a/docarray/doc_index/backends/hnswlib_doc_index.py +++ b/docarray/doc_index/backends/hnswlib_doc_index.py @@ -33,9 +33,6 @@ from docarray.utils.filter import filter as da_filter from docarray.utils.find import _FindResult from docarray.utils.misc import is_np_int, torch_imported -import logging - -logger = logging.getLogger('docarray') TSchema = TypeVar('TSchema', bound=BaseDocument) T = TypeVar('T', bound='HnswDocumentIndex') @@ -66,7 +63,7 @@ def __init__(self, db_config=None, **kwargs): super().__init__(db_config=db_config, **kwargs) self._db_config = cast(HnswDocumentIndex.DBConfig, self._db_config) self._work_dir = self._db_config.work_dir - logger.debug(f'Working directory set to {self._work_dir}') + self._logger.debug(f'Working directory set to {self._work_dir}') load_existing = os.path.exists(self._work_dir) and os.listdir(self._work_dir) Path(self._work_dir).mkdir(parents=True, exist_ok=True) @@ -87,26 +84,26 @@ def __init__(self, db_config=None, **kwargs): self._hnsw_indices = {} for col_name, col in self._column_infos.items(): if not col.config: - logger.warning( + self._logger.warning( f'No index was created for `{col_name}` as it does not have a config' ) continue if load_existing: self._hnsw_indices[col_name] = self._load_index(col_name, col) - logger.info(f'Loading an existing index for column `{col_name}`') + self._logger.info(f'Loading an existing index for column `{col_name}`') else: self._hnsw_indices[col_name] = self._create_index(col) - logger.info(f'Created a new index for column `{col_name}`') + self._logger.info(f'Created a new index for column `{col_name}`') # SQLite setup self._sqlite_db_path = os.path.join(self._work_dir, 'docs_sqlite.db') - logger.debug(f'DB path set to {self._sqlite_db_path}') + self._logger.debug(f'DB path set to {self._sqlite_db_path}') self._sqlite_conn = sqlite3.connect(self._sqlite_db_path) - logger.info('Connection to DB has been established') + self._logger.info('Connection to DB has been established') self._sqlite_cursor = self._sqlite_conn.cursor() self._create_docs_table() self._sqlite_conn.commit() - logger.info(f'{self.__class__.__name__} has been initialized') + self._logger.info(f'{self.__class__.__name__} has been initialized') ############################################### # Inner classes for query builder and configs # @@ -174,7 +171,7 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): if kwargs: raise ValueError(f'{list(kwargs.keys())} are not valid keyword arguments') - logger.info(f'Indexing {len(docs)} documents') + self._logger.info(f'Indexing {len(docs)} documents') doc_seq = docs if isinstance(docs, Sequence) else [docs] data_by_columns = self._get_col_value_dict(doc_seq) hashed_ids = tuple(self._to_hashed_id(doc.id) for doc in doc_seq) @@ -210,7 +207,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: elif op == 'filter': filter_conditions.append(op_kwargs['filter_query']) - logger.debug(f'Executing query {query}') + self._logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: da_cls = DocumentArray.__class_getitem__( @@ -218,7 +215,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: ) docs_filtered = da_cls(da_filter(docs_filtered, cond)) - logger.info(f'{len(docs_filtered)} results found') + self._logger.info(f'{len(docs_filtered)} results found') docs_and_scores = zip( docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) ) diff --git a/tests/doc_index/conftest.py b/tests/doc_index/conftest.py index 483853d5fe6..497a740ae43 100644 --- a/tests/doc_index/conftest.py +++ b/tests/doc_index/conftest.py @@ -5,4 +5,4 @@ @pytest.fixture(autouse=True) def set_logger_level(): logger = logging.getLogger('docarray') - logger.setLevel(logging.DEBUG) + logger.setLevel(logging.INFO) From f75d287c0bedc5ee59fa3f3fbe21d6609a05779b Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 20 Mar 2023 14:16:18 +0100 Subject: [PATCH 07/10] docs: small change Signed-off-by: jupyterjazz --- CONTRIBUTING.md | 4 ++-- docarray/__init__.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6c710dfee93..41d46f4353d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -271,8 +271,8 @@ This allows: * the reviewer to be very confident that the feature does what it is supposed to do before merging it into the code base. * the contributors to be sure that they don't break already-merged features when refactoring or modifying the code base. - -## Enabling logging + +## Enable logging See more logs by setting the log level to `DEBUG`. Example: diff --git a/docarray/__init__.py b/docarray/__init__.py index c279b11c4b0..03e65750c14 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -2,16 +2,13 @@ from docarray.array import DocumentArray, DocumentArrayStacked from docarray.base_document.document import BaseDocument +import logging __all__ = ['BaseDocument', 'DocumentArray', 'DocumentArrayStacked'] -import logging - logger = logging.getLogger('docarray') - handler = logging.StreamHandler() formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") - handler.setFormatter(formatter) logger.addHandler(handler) From 6262ff30f334f4172eca1acdb86dd0b3dd8c3725 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 21 Mar 2023 12:46:03 +0100 Subject: [PATCH 08/10] fix: typo Signed-off-by: jupyterjazz --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ff4e9b0a520..803ea420c62 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -273,7 +273,7 @@ This allows: ## Enable logging -Moinitor and debug your code by enabling docarray logging: +Monitor and debug your code by enabling docarray logging: ```python import logging logging.getLogger('docarray').setLevel(logging.DEBUG) From 744176a0821293f84b9eaecbef895edbece8328f Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 21 Mar 2023 15:54:33 +0100 Subject: [PATCH 09/10] docs: rephrase text Signed-off-by: jupyterjazz --- CONTRIBUTING.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 803ea420c62..e06a3b9df21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -273,7 +273,7 @@ This allows: ## Enable logging -Monitor and debug your code by enabling docarray logging: +If you need to monitor and debug your code, you can enable docarray logging: ```python import logging logging.getLogger('docarray').setLevel(logging.DEBUG) diff --git a/README.md b/README.md index cafb436824b..97955df2b14 100644 --- a/README.md +++ b/README.md @@ -445,7 +445,7 @@ match = store.find( ``` ## Enable logging -See more logs by setting the log level to `DEBUG`. +You can see more logs by setting the log level to `DEBUG` or `INFO`. Example: ```python From 9ae165fd4a0e0c5b8a396e1ea4b145f550982d7c Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Wed, 22 Mar 2023 08:40:02 +0100 Subject: [PATCH 10/10] refactor: requested changes Signed-off-by: jupyterjazz --- README.md | 5 ++++- docarray/doc_index/abstract_doc_index.py | 16 ++++++++-------- docarray/doc_index/backends/hnswlib_doc_index.py | 4 ++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 97955df2b14..7a2b56b59b3 100644 --- a/README.md +++ b/README.md @@ -449,7 +449,10 @@ You can see more logs by setting the log level to `DEBUG` or `INFO`. Example: ```python -import ... +from pydantic import Field +from docarray import BaseDocument +from docarray.doc_index.backends.hnswlib_doc_index import HnswDocumentIndex +from docarray.typing import NdArray # import logging and set the level to DEBUG import logging logging.getLogger('docarray').setLevel(logging.DEBUG) diff --git a/docarray/doc_index/abstract_doc_index.py b/docarray/doc_index/abstract_doc_index.py index 409bc4b7a66..e1fd847c841 100644 --- a/docarray/doc_index/abstract_doc_index.py +++ b/docarray/doc_index/abstract_doc_index.py @@ -378,13 +378,13 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): :param docs: Documents to index. """ - if not isinstance(docs, BaseDocument) and not isinstance(docs, DocumentArray): + if not isinstance(docs, (BaseDocument, DocumentArray)): self._logger.warning( 'Passing a sequence of Documents that is not a DocumentArray comes at ' 'a performance penalty, since compatibility with the schema of Index ' 'needs to be checked for every Document individually.' ) - self._logger.info(f'Indexing {len(docs)} documents') + self._logger.debug(f'Indexing {len(docs)} documents') docs_validated = self._validate_docs(docs) data_by_columns = self._get_col_value_dict(docs_validated) self._index(data_by_columns, **kwargs) @@ -407,7 +407,7 @@ def find( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ - self._logger.info(f'Executing `find` for search field {search_field}') + self._logger.debug(f'Executing `find` for search field {search_field}') if isinstance(query, BaseDocument): query_vec = self._get_values_by_column([query], search_field)[0] else: @@ -441,7 +441,7 @@ def find_batched( :param limit: maximum number of documents to return per query :return: a named tuple containing `documents` and `scores` """ - self._logger.info(f'Executing `find_batched` for search field {search_field}') + self._logger.debug(f'Executing `find_batched` for search field {search_field}') if isinstance(queries, Sequence): query_vec_list = self._get_values_by_column(queries, search_field) query_vec_np = np.stack( @@ -471,7 +471,7 @@ def filter( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ - self._logger.info(f'Executing `filter` for the query {filter_query}') + self._logger.debug(f'Executing `filter` for the query {filter_query}') docs = self._filter(filter_query, limit=limit, **kwargs) if isinstance(docs, List): @@ -491,7 +491,7 @@ def filter_batched( :param limit: maximum number of documents to return :return: a DocumentArray containing the documents that match the filter query """ - self._logger.info( + self._logger.debug( f'Executing `filter_batched` for the queries {filter_queries}' ) da_list = self._filter_batched(filter_queries, limit=limit, **kwargs) @@ -515,7 +515,7 @@ def text_search( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ - self._logger.info(f'Executing `text_search` for search field {search_field}') + self._logger.debug(f'Executing `text_search` for search field {search_field}') if isinstance(query, BaseDocument): query_text = self._get_values_by_column([query], search_field)[0] else: @@ -543,7 +543,7 @@ def text_search_batched( :param limit: maximum number of documents to return :return: a named tuple containing `documents` and `scores` """ - self._logger.info( + self._logger.debug( f'Executing `text_search_batched` for search field {search_field}' ) if isinstance(queries[0], BaseDocument): diff --git a/docarray/doc_index/backends/hnswlib_doc_index.py b/docarray/doc_index/backends/hnswlib_doc_index.py index f38402714f6..3d6923fff80 100644 --- a/docarray/doc_index/backends/hnswlib_doc_index.py +++ b/docarray/doc_index/backends/hnswlib_doc_index.py @@ -171,7 +171,7 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): if kwargs: raise ValueError(f'{list(kwargs.keys())} are not valid keyword arguments') - self._logger.info(f'Indexing {len(docs)} documents') + self._logger.debug(f'Indexing {len(docs)} documents') docs_validated = self._validate_docs(docs) data_by_columns = self._get_col_value_dict(docs_validated) hashed_ids = tuple(self._to_hashed_id(doc.id) for doc in docs_validated) @@ -214,7 +214,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: ) docs_filtered = da_cls(filter_docs(docs_filtered, cond)) - self._logger.info(f'{len(docs_filtered)} results found') + self._logger.debug(f'{len(docs_filtered)} results found') docs_and_scores = zip( docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) )