From 6d31526e3cc1e1c42329b9788a56d4e401bfe0c6 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 18 Jul 2023 09:45:06 +0200 Subject: [PATCH 1/4] fix: slow hnsw by caching num docs Signed-off-by: jupyterjazz --- docarray/index/backends/hnswlib.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 00124c7fbd0..6b89c5b7dc6 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -127,6 +127,7 @@ def __init__(self, db_config=None, **kwargs): self._sqlite_cursor = self._sqlite_conn.cursor() self._create_docs_table() self._sqlite_conn.commit() + self._num_docs = self._get_num_docs_sqlite() self._logger.info(f'{self.__class__.__name__} has been initialized') @property @@ -259,6 +260,7 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): self._send_docs_to_sqlite(docs_validated) self._sqlite_conn.commit() + self._num_docs = self._get_num_docs_sqlite() def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: """ @@ -379,6 +381,7 @@ def _del_items(self, doc_ids: Sequence[str]): self._delete_docs_from_sqlite(doc_ids) self._sqlite_conn.commit() + self._num_docs = self._get_num_docs_sqlite() def _get_items(self, doc_ids: Sequence[str], out: bool = True) -> Sequence[TSchema]: """Get Documents from the hnswlib index, by `id`. @@ -403,7 +406,8 @@ def num_docs(self) -> int: """ Get the number of documents. """ - return self._get_num_docs_sqlite() + return self._num_docs + # return self._get_num_docs_sqlite() ############################################### # Helpers # From 232f6be08c24218f3b893ee244546b2a05ad381f Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 18 Jul 2023 09:46:16 +0200 Subject: [PATCH 2/4] chore: remove unused line Signed-off-by: jupyterjazz --- docarray/index/backends/hnswlib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 6b89c5b7dc6..d4929569c63 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -407,7 +407,6 @@ def num_docs(self) -> int: Get the number of documents. """ return self._num_docs - # return self._get_num_docs_sqlite() ############################################### # Helpers # From a411c5ec919846e6e87f310ad951a9bd10df4f31 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 18 Jul 2023 09:58:30 +0200 Subject: [PATCH 3/4] refactor: set num docs to 0 Signed-off-by: jupyterjazz --- docarray/index/backends/hnswlib.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index d4929569c63..d2ca9728d6b 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -127,7 +127,7 @@ def __init__(self, db_config=None, **kwargs): self._sqlite_cursor = self._sqlite_conn.cursor() self._create_docs_table() self._sqlite_conn.commit() - self._num_docs = self._get_num_docs_sqlite() + self._num_docs = 0 self._logger.info(f'{self.__class__.__name__} has been initialized') @property @@ -260,7 +260,7 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): self._send_docs_to_sqlite(docs_validated) self._sqlite_conn.commit() - self._num_docs = self._get_num_docs_sqlite() + self._num_docs = 0 # reset the cache def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: """ @@ -381,7 +381,7 @@ def _del_items(self, doc_ids: Sequence[str]): self._delete_docs_from_sqlite(doc_ids) self._sqlite_conn.commit() - self._num_docs = self._get_num_docs_sqlite() + self._num_docs = 0 # reset the cache def _get_items(self, doc_ids: Sequence[str], out: bool = True) -> Sequence[TSchema]: """Get Documents from the hnswlib index, by `id`. @@ -406,6 +406,7 @@ def num_docs(self) -> int: """ Get the number of documents. """ + self._num_docs = self._num_docs or self._get_num_docs_sqlite() return self._num_docs ############################################### From 916ed2749a96fbf4a8ad553e0709af9b027b747a Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 18 Jul 2023 10:12:53 +0200 Subject: [PATCH 4/4] refactor: go back to the initial solution Signed-off-by: jupyterjazz --- docarray/index/backends/hnswlib.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index d2ca9728d6b..d4929569c63 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -127,7 +127,7 @@ def __init__(self, db_config=None, **kwargs): self._sqlite_cursor = self._sqlite_conn.cursor() self._create_docs_table() self._sqlite_conn.commit() - self._num_docs = 0 + self._num_docs = self._get_num_docs_sqlite() self._logger.info(f'{self.__class__.__name__} has been initialized') @property @@ -260,7 +260,7 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): self._send_docs_to_sqlite(docs_validated) self._sqlite_conn.commit() - self._num_docs = 0 # reset the cache + self._num_docs = self._get_num_docs_sqlite() def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: """ @@ -381,7 +381,7 @@ def _del_items(self, doc_ids: Sequence[str]): self._delete_docs_from_sqlite(doc_ids) self._sqlite_conn.commit() - self._num_docs = 0 # reset the cache + self._num_docs = self._get_num_docs_sqlite() def _get_items(self, doc_ids: Sequence[str], out: bool = True) -> Sequence[TSchema]: """Get Documents from the hnswlib index, by `id`. @@ -406,7 +406,6 @@ def num_docs(self) -> int: """ Get the number of documents. """ - self._num_docs = self._num_docs or self._get_num_docs_sqlite() return self._num_docs ###############################################