docarray · alaeddine-13 · May 25, 2022 · May 9, 2022 · May 9, 2022 · May 10, 2022
diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py
@@ -87,22 +87,29 @@ def find(self: 'T', query: Dict, **kwargs) -> 'DocumentArray':
 
     def find(
         self: 'T',
-        query: Union['DocumentArray', 'Document', 'ArrayType', Dict, str, List[str]],
+        query: Union[
+            'DocumentArray', 'Document', 'ArrayType', Dict, str, List[str], None
+        ] = None,
         metric: Union[
             str, Callable[['ArrayType', 'ArrayType'], 'np.ndarray']
         ] = 'cosine',
         limit: Optional[Union[int, float]] = 20,
         metric_name: Optional[str] = None,
         exclude_self: bool = False,
+        filter: Optional[Dict] = None,
         only_id: bool = False,
         index: str = 'text',
         **kwargs,
     ) -> Union['DocumentArray', List['DocumentArray']]:
         """Returns matching Documents given an input query.
         If the query is a `DocumentArray`, `Document` or `ArrayType`, exhaustive or approximate nearest neighbor search
-        will be performed depending on whether the storage backend supports ANN.
-        If the query is a `dict` object, Documents will be filtered according to DocArray's query language and all
-        matching Documents that match the filter will be returned.
+        will be performed depending on whether the storage backend supports ANN. Furthermore, if filter is not None,
+        pre-filtering will be applied along with vector search.
+        If the query is a `dict` object or, query is None and filter is not None, Documents will be filtered and all
+        matching Documents that match the filter will be returned. In this case, query (if it's dict) or filter will be
+        used for filtering. The object must follow the backend-specific filter format if the backend supports filtering
+        or DocArray's query language format. In the latter case, filtering will be applied in the client side not the
+        backend side.
         If the query is a string or list of strings, a search by text will be performed if the backend supports
         indexing and searching text fields. If not, a `NotImplementedError` will be raised.
 
@@ -112,6 +119,7 @@ def find(
         :param metric: the distance metric.
         :param exclude_self: if set, Documents in results with same ``id`` as the query values will not be
                         considered as matches. This is only applied when the input query is Document or DocumentArray.
+        :param filter: filter query used for pre-filtering or filtering
         :param only_id: if set, then returning matches will only contain ``id``
         :param index: if the query is a string, text search will be performed on the `index` field, otherwise, this
                       parameter is ignored. By default, the Document `text` attribute will be used for search,
@@ -125,21 +133,35 @@ def find(
         from ... import Document, DocumentArray
 
         if isinstance(query, dict):
-            return self._filter(query)
-        elif isinstance(query, (DocumentArray, Document)):
-
-            if isinstance(query, Document):
-                query = DocumentArray(query)
-
-            _query = query.embeddings
+            if filter is None:
+                return self._filter(query)
+            else:
+                raise ValueError(
+                    'filter and query cannot be both dict type, set only one for filtering'
+                )
+        elif query is None:
+            if isinstance(filter, dict):
+                return self._filter(filter)
+            else:
+                raise ValueError('filter must be dict when query is None')
         elif isinstance(query, str) or (
             isinstance(query, list) and isinstance(query[0], str)
         ):
+            if filter is not None:
+                raise ValueError('cannot use filter with text search')
             result = self._find_by_text(query, index=index, limit=limit, **kwargs)
             if isinstance(query, str):
                 return result[0]
             else:
                 return result
+
+        # for all the rest, vector search will be performed
+        elif isinstance(query, (DocumentArray, Document)):
+
+            if isinstance(query, Document):
+                query = DocumentArray(query)
+
+            _query = query.embeddings
         else:
             _query = query
 
@@ -169,6 +191,7 @@ def find(
 
         _result = self._find(
             _query,
+            filter=filter,
             **kwargs,
         )
 
@@ -227,7 +250,7 @@ def find(
 
     @abc.abstractmethod
     def _find(
-        self, query: 'ArrayType', limit: int, **kwargs
+        self, query: 'ArrayType', limit: int, filter: Optional[Dict] = None, **kwargs
     ) -> Tuple['np.ndarray', 'np.ndarray']:
         raise NotImplementedError
 

diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py
@@ -5,6 +5,8 @@
     Optional,
     TYPE_CHECKING,
     Iterable,
+    List,
+    Tuple,
 )
 
 import numpy as np
@@ -25,11 +27,14 @@ class AnnliteConfig:
     ef_construction: Optional[int] = None
     ef_search: Optional[int] = None
     max_connection: Optional[int] = None
+    columns: Optional[List[Tuple[str, str]]] = None
 
 
 class BackendMixin(BaseBackendMixin):
     """Provide necessary functions to enable this storage backend."""
 
+    TYPE_MAP = {'str': 'TEXT', 'float': 'float', 'int': 'integer'}
+
     def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType':
         if embedding is None:
             embedding = np.zeros(self.n_dim, dtype=np.float32)
@@ -62,6 +67,15 @@ def _init_storage(
 
         self._config = config
 
+        if self._config.columns is None:
+            self._config.columns = []
+
+        for i in range(len(self._config.columns)):
+            self._config.columns[i] = (
+                self._config.columns[i][0],
+                self._map_type(self._config.columns[i][1]),
+            )
+
         config = asdict(config)
         self.n_dim = config.pop('n_dim')
 

diff --git a/docarray/array/storage/annlite/find.py b/docarray/array/storage/annlite/find.py
@@ -3,6 +3,7 @@
     Optional,
     TYPE_CHECKING,
     List,
+    Dict,
 )
 
 if TYPE_CHECKING:
@@ -16,13 +17,15 @@ def _find(
         query: 'np.ndarray',
         limit: Optional[Union[int, float]] = 20,
         only_id: bool = False,
+        filter: Optional[Dict] = None,
         **kwargs,
     ) -> List['DocumentArray']:
         """Returns approximate nearest neighbors given an input query.
 
         :param query: the query documents to search.
         :param limit: the number of results to get for each query document in search.
         :param only_id: if set, then returning matches will only contain ``id``
+        :param filter: filter query used for pre-filtering
         :param kwargs: other kwargs.
 
         :return: a list of DocumentArrays containing the closest Document objects for each of the queries in `query`.
@@ -34,7 +37,7 @@ def _find(
             query = query.reshape(1, -1)
 
         _, match_docs = self._annlite._search_documents(
-            query, limit=limit, include_metadata=not only_id
+            query, limit=limit, filter=filter or {}, include_metadata=not only_id
         )
 
         return match_docs
diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py
@@ -7,6 +7,8 @@
 
 
 class BaseBackendMixin(ABC):
+    TYPE_MAP: Dict
+
     def _init_storage(
         self,
         _docs: Optional['DocumentArraySourceType'] = None,
@@ -27,3 +29,6 @@ def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType':
         from ....math.ndarray import to_numpy_array
 
         return to_numpy_array(embedding)
+
+    def _map_type(self, col_type: str) -> str:
+        return self.TYPE_MAP[col_type]
diff --git a/docarray/array/storage/elastic/backend.py b/docarray/array/storage/elastic/backend.py
@@ -9,6 +9,7 @@
     List,
     Iterable,
     Any,
+    Tuple,
     Mapping,
 )
 
@@ -41,6 +42,7 @@ class ElasticConfig:
     batch_size: int = 64
     ef_construction: Optional[int] = None
     m: Optional[int] = None
+    columns: Optional[List[Tuple[str, str]]] = None
 
 
 class BackendMixin(BaseBackendMixin):

diff --git a/docarray/array/storage/elastic/find.py b/docarray/array/storage/elastic/find.py
@@ -4,6 +4,8 @@
     Sequence,
     List,
     Union,
+    Optional,
+    Dict,
 )
 
 import numpy as np
@@ -99,16 +101,22 @@ def _find(
         self,
         query: 'ElasticArrayType',
         limit: int = 10,
+        filter: Optional[Dict] = None,
         **kwargs,
     ) -> List['DocumentArray']:
         """Returns approximate nearest neighbors given a batch of input queries.
 
         :param query: input supported to be stored in Elastic. This includes any from the list '[np.ndarray, tensorflow.Tensor, torch.Tensor, Sequence[float]]'
         :param limit: number of retrieved items
+        :param filter: filter query used for pre-filtering
 
         :return: DocumentArray containing the closest documents to the query if it is a single query, otherwise a list of DocumentArrays containing
            the closest Document objects for each of the queries in `query`.
         """
+        if filter is not None:
+            raise ValueError(
+                'Filtered vector search is not supported for ElasticSearch backend'
+            )
         query = np.array(query)
         num_rows, n_dim = ndarray.get_array_rows(query)
         if n_dim != 2:

diff --git a/docarray/array/storage/memory/find.py b/docarray/array/storage/memory/find.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Tuple, Callable, TYPE_CHECKING
+from typing import Optional, Union, Tuple, Callable, TYPE_CHECKING, Dict
 
 import numpy as np
 
@@ -27,6 +27,7 @@ def _find(
         use_scipy: bool = False,
         device: str = 'cpu',
         num_worker: Optional[int] = 1,
+        filter: Optional[Dict] = None,
         **kwargs,
     ) -> Tuple['np.ndarray', 'np.ndarray']:
         """Returns approximate nearest neighbors given a batch of input queries.
@@ -47,11 +48,15 @@ def _find(
 
                 .. note::
                     This argument is only effective when ``batch_size`` is set.
-
+        :param filter: filter query used for pre-filtering
         :param kwargs: other kwargs.
 
         :return: a list of DocumentArrays containing the closest Document objects for each of the queries in `query`.
         """
+        if filter is not None:
+            raise ValueError(
+                'Filtered vector search is not supported for In-Memory backend'
+            )
 
         if batch_size is not None:
             if batch_size <= 0:

diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py
@@ -7,6 +7,7 @@
     Dict,
     Iterable,
     List,
+    Tuple,
 )
 
 import numpy as np
@@ -41,6 +42,7 @@ class QdrantConfig:
     ef_construct: Optional[int] = None
     full_scan_threshold: Optional[int] = None
     m: Optional[int] = None
+    columns: Optional[List[Tuple[str, str]]] = None
 
 
 class BackendMixin(BaseBackendMixin):
@@ -86,6 +88,9 @@ def _init_storage(
         self._config = config
         self._persist = bool(self._config.collection_name)
 
+        if self._config.columns is None:
+            self._config.columns = []
+
         self._config.collection_name = (
             self.__class__.__name__ + random_identity()
             if self._config.collection_name is None

diff --git a/docarray/array/storage/qdrant/find.py b/docarray/array/storage/qdrant/find.py
@@ -4,6 +4,8 @@
     TypeVar,
     Sequence,
     List,
+    Dict,
+    Optional,
 )
 
 from qdrant_client.http.models.models import Distance
@@ -48,13 +50,15 @@ def serialize_config(self) -> dict:
     def distance(self) -> 'Distance':
         raise NotImplementedError()
 
-    def _find_similar_vectors(self, q: 'QdrantArrayType', limit=10):
+    def _find_similar_vectors(
+        self, q: 'QdrantArrayType', limit: int = 10, filter: Optional[Dict] = None
+    ):
         query_vector = self._map_embedding(q)
 
         search_result = self.client.search(
             self.collection_name,
             query_vector=query_vector,
-            query_filter=None,
+            query_filter=filter,
             search_params=None,
             top=limit,
             append_payload=['_serialized'],
@@ -74,11 +78,16 @@ def _find_similar_vectors(self, q: 'QdrantArrayType', limit=10):
         return DocumentArray(docs)
 
     def _find(
-        self, query: 'QdrantArrayType', limit: int = 10, **kwargs
+        self,
+        query: 'QdrantArrayType',
+        limit: int = 10,
+        filter: Optional[Dict] = None,
+        **kwargs,
     ) -> List['DocumentArray']:
         """Returns approximate nearest neighbors given a batch of input queries.
         :param query: input supported to be used in Qdrant.
         :param limit: number of retrieved items
+        :param filter: filter query used for pre-filtering
 
 
         :return: a list of DocumentArrays containing the closest Document objects for each of the queries in `query`.
@@ -87,10 +96,10 @@ def _find(
         num_rows, _ = ndarray.get_array_rows(query)
 
         if num_rows == 1:
-            return [self._find_similar_vectors(query, limit=limit)]
+            return [self._find_similar_vectors(query, limit=limit, filter=filter)]
         else:
             closest_docs = []
             for q in query:
-                da = self._find_similar_vectors(q, limit=limit)
+                da = self._find_similar_vectors(q, limit=limit, filter=filter)
                 closest_docs.append(da)
             return closest_docs
diff --git a/docarray/array/storage/qdrant/getsetdel.py b/docarray/array/storage/qdrant/getsetdel.py
@@ -65,9 +65,13 @@ def _qdrant_to_document(self, qdrant_record: dict) -> 'Document':
         )
 
     def _document_to_qdrant(self, doc: 'Document') -> 'PointStruct':
+        extra_columns = {col: doc.tags.get(col) for col, _ in self._config.columns}
+
         return PointStruct(
             id=self._map_id(doc.id),
-            payload=dict(_serialized=doc.to_base64(**self.serialization_config)),
+            payload=dict(
+                _serialized=doc.to_base64(**self.serialization_config), **extra_columns
+            ),
             vector=self._map_embedding(doc.embedding),
         )
 

diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py
@@ -8,6 +8,8 @@
     Optional,
     TYPE_CHECKING,
     Union,
+    List,
+    Tuple,
 )
 
 from .helper import initialize_table
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,8 @@ @@
     Optional,
     TYPE_CHECKING,
     Union,
+    List,
+    Tuple,
 )
 from .helper import initialize_table
@@ Expand Down @@