Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
651ad8c
feat: add annlite filter
davidbp May 9, 2022
06be300
refactor: black style
davidbp May 9, 2022
2df4bc0
feat: add filter parameter to interface
alaeddine-13 May 10, 2022
4768f3e
fix: linting
alaeddine-13 May 10, 2022
035c8e9
feat: add columns to backend configs
davidbp May 11, 2022
30741b8
fix: missing type hint
davidbp May 11, 2022
1f63703
feat: add column to weaviate schema
davidbp May 16, 2022
232a419
fix: add col type mapping for weaviate
alaeddine-13 May 17, 2022
a430a1f
Merge branch 'main' into feat-add-annlite-filtering
alaeddine-13 May 18, 2022
d33ddf2
fix: default to empty list if None
alaeddine-13 May 18, 2022
38be2d7
Merge branch 'feat-add-annlite-filtering' of https://github.com/jina-…
alaeddine-13 May 18, 2022
c1a3fdb
feat: set attributes in _set_doc_by_id for weaviate
alaeddine-13 May 18, 2022
e7df1a0
fix: columns are added as properties not classes
alaeddine-13 May 18, 2022
d249573
feat: use filter in _find for weaviate
alaeddine-13 May 18, 2022
2b3d91a
fix: set indexInverted to True to enable filtering for weaviate
alaeddine-13 May 18, 2022
a16fe2a
fix: weaviate error handling message
davidbp May 19, 2022
64b5532
feat: pre filtering for qdrant
davidbp May 19, 2022
79b7db7
test: pre filtering in weaviate and qdrant
davidbp May 19, 2022
c3b5aea
refactor: unify pre-filtering tests of all backends
alaeddine-13 May 19, 2022
a6d5d30
fix: map col types in annlite
alaeddine-13 May 19, 2022
a548c9f
fix: start_storage at module level
alaeddine-13 May 20, 2022
aacf3fe
test: cover both API usages
alaeddine-13 May 20, 2022
1421ee7
chore: address review
alaeddine-13 May 20, 2022
1986f82
chore: address review
alaeddine-13 May 20, 2022
a497a85
fix: cryptographic random generator for weaviate classnames
alaeddine-13 May 20, 2022
df0c35f
Merge branch 'main' into feat-add-annlite-filtering
alaeddine-13 May 20, 2022
200b83c
refactor: make filter weaviate more readable
davidbp May 20, 2022
b4991df
docs: showcase pre-filtering in annlite
davidbp May 23, 2022
d0e18e8
docs: document filter parameter
alaeddine-13 May 24, 2022
4d65d70
refactor: refactor find type checking
alaeddine-13 May 24, 2022
d6bbb5a
docs: add qdrant filtering documentation
davidbp May 24, 2022
b9c9da8
Merge branch 'feat-add-annlite-filtering' of https://github.com/jina-…
davidbp May 24, 2022
cbfaafe
docs: add filter weaviate
davidbp May 24, 2022
d8d6664
docs: use context manager in examples
davidbp May 24, 2022
b2936bd
test: cover wrong filter format for weaviate
alaeddine-13 May 24, 2022
3f3f3e9
Merge branch 'feat-add-annlite-filtering' of https://github.com/jina-…
alaeddine-13 May 24, 2022
92c637a
test: cover unsupported pre-filtering for memory and elastic
alaeddine-13 May 24, 2022
e29434f
Merge branch 'main' into feat-add-annlite-filtering
alaeddine-13 May 24, 2022
240ef89
Merge branch 'main' into feat-add-annlite-filtering
alaeddine-13 May 24, 2022
d368a58
test: update test_embedding_ops_error
alaeddine-13 May 24, 2022
7cde83b
docs: add weaviate filter reference
davidbp May 24, 2022
eff948f
Merge branch 'feat-add-annlite-filtering' of https://github.com/jina-…
davidbp May 24, 2022
3d4d13f
docs: fix typos
davidbp May 24, 2022
d1dd27c
docs: reference backend filter query language definition
alaeddine-13 May 24, 2022
bbc1473
docs: fix weaviate example
alaeddine-13 May 24, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 35 additions & 12 deletions docarray/array/mixins/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,22 +87,29 @@ def find(self: 'T', query: Dict, **kwargs) -> 'DocumentArray':

def find(
self: 'T',
query: Union['DocumentArray', 'Document', 'ArrayType', Dict, str, List[str]],
query: Union[
'DocumentArray', 'Document', 'ArrayType', Dict, str, List[str], None
] = None,
metric: Union[
str, Callable[['ArrayType', 'ArrayType'], 'np.ndarray']
] = 'cosine',
limit: Optional[Union[int, float]] = 20,
metric_name: Optional[str] = None,
exclude_self: bool = False,
filter: Optional[Dict] = None,
Comment thread
alaeddine-13 marked this conversation as resolved.
only_id: bool = False,
index: str = 'text',
**kwargs,
) -> Union['DocumentArray', List['DocumentArray']]:
"""Returns matching Documents given an input query.
If the query is a `DocumentArray`, `Document` or `ArrayType`, exhaustive or approximate nearest neighbor search
will be performed depending on whether the storage backend supports ANN.
If the query is a `dict` object, Documents will be filtered according to DocArray's query language and all
matching Documents that match the filter will be returned.
will be performed depending on whether the storage backend supports ANN. Furthermore, if filter is not None,
pre-filtering will be applied along with vector search.
If the query is a `dict` object or, query is None and filter is not None, Documents will be filtered and all
matching Documents that match the filter will be returned. In this case, query (if it's dict) or filter will be
used for filtering. The object must follow the backend-specific filter format if the backend supports filtering
or DocArray's query language format. In the latter case, filtering will be applied in the client side not the
backend side.
If the query is a string or list of strings, a search by text will be performed if the backend supports
indexing and searching text fields. If not, a `NotImplementedError` will be raised.

Expand All @@ -112,6 +119,7 @@ def find(
:param metric: the distance metric.
:param exclude_self: if set, Documents in results with same ``id`` as the query values will not be
considered as matches. This is only applied when the input query is Document or DocumentArray.
:param filter: filter query used for pre-filtering or filtering
:param only_id: if set, then returning matches will only contain ``id``
:param index: if the query is a string, text search will be performed on the `index` field, otherwise, this
parameter is ignored. By default, the Document `text` attribute will be used for search,
Expand All @@ -125,21 +133,35 @@ def find(
from ... import Document, DocumentArray

if isinstance(query, dict):
return self._filter(query)
elif isinstance(query, (DocumentArray, Document)):

if isinstance(query, Document):
query = DocumentArray(query)

_query = query.embeddings
if filter is None:
return self._filter(query)
else:
raise ValueError(
'filter and query cannot be both dict type, set only one for filtering'
)
elif query is None:
if isinstance(filter, dict):
return self._filter(filter)
else:
raise ValueError('filter must be dict when query is None')
elif isinstance(query, str) or (
isinstance(query, list) and isinstance(query[0], str)
):
if filter is not None:
raise ValueError('cannot use filter with text search')
result = self._find_by_text(query, index=index, limit=limit, **kwargs)
if isinstance(query, str):
return result[0]
else:
return result

# for all the rest, vector search will be performed
elif isinstance(query, (DocumentArray, Document)):

if isinstance(query, Document):
query = DocumentArray(query)

_query = query.embeddings
else:
_query = query
Comment thread
mapleeit marked this conversation as resolved.

Expand Down Expand Up @@ -169,6 +191,7 @@ def find(

_result = self._find(
_query,
filter=filter,
**kwargs,
)

Expand Down Expand Up @@ -227,7 +250,7 @@ def find(

@abc.abstractmethod
def _find(
self, query: 'ArrayType', limit: int, **kwargs
self, query: 'ArrayType', limit: int, filter: Optional[Dict] = None, **kwargs
Comment thread
mapleeit marked this conversation as resolved.
) -> Tuple['np.ndarray', 'np.ndarray']:
raise NotImplementedError

Expand Down
14 changes: 14 additions & 0 deletions docarray/array/storage/annlite/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
Optional,
TYPE_CHECKING,
Iterable,
List,
Tuple,
)

import numpy as np
Expand All @@ -25,11 +27,14 @@ class AnnliteConfig:
ef_construction: Optional[int] = None
ef_search: Optional[int] = None
max_connection: Optional[int] = None
columns: Optional[List[Tuple[str, str]]] = None
Comment thread
alaeddine-13 marked this conversation as resolved.
Comment thread
alaeddine-13 marked this conversation as resolved.


class BackendMixin(BaseBackendMixin):
"""Provide necessary functions to enable this storage backend."""

TYPE_MAP = {'str': 'TEXT', 'float': 'float', 'int': 'integer'}

def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType':
if embedding is None:
embedding = np.zeros(self.n_dim, dtype=np.float32)
Expand Down Expand Up @@ -62,6 +67,15 @@ def _init_storage(

self._config = config

if self._config.columns is None:
self._config.columns = []

for i in range(len(self._config.columns)):
self._config.columns[i] = (
self._config.columns[i][0],
self._map_type(self._config.columns[i][1]),
)

config = asdict(config)
self.n_dim = config.pop('n_dim')

Expand Down
5 changes: 4 additions & 1 deletion docarray/array/storage/annlite/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Optional,
TYPE_CHECKING,
List,
Dict,
)

if TYPE_CHECKING:
Expand All @@ -16,13 +17,15 @@ def _find(
query: 'np.ndarray',
limit: Optional[Union[int, float]] = 20,
only_id: bool = False,
filter: Optional[Dict] = None,
**kwargs,
) -> List['DocumentArray']:
"""Returns approximate nearest neighbors given an input query.

:param query: the query documents to search.
:param limit: the number of results to get for each query document in search.
:param only_id: if set, then returning matches will only contain ``id``
:param filter: filter query used for pre-filtering
:param kwargs: other kwargs.

:return: a list of DocumentArrays containing the closest Document objects for each of the queries in `query`.
Expand All @@ -34,7 +37,7 @@ def _find(
query = query.reshape(1, -1)

_, match_docs = self._annlite._search_documents(
query, limit=limit, include_metadata=not only_id
query, limit=limit, filter=filter or {}, include_metadata=not only_id
)

return match_docs
5 changes: 5 additions & 0 deletions docarray/array/storage/base/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@


class BaseBackendMixin(ABC):
TYPE_MAP: Dict

def _init_storage(
self,
_docs: Optional['DocumentArraySourceType'] = None,
Expand All @@ -27,3 +29,6 @@ def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType':
from ....math.ndarray import to_numpy_array

return to_numpy_array(embedding)

def _map_type(self, col_type: str) -> str:
return self.TYPE_MAP[col_type]
2 changes: 2 additions & 0 deletions docarray/array/storage/elastic/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
List,
Iterable,
Any,
Tuple,
Mapping,
)

Expand Down Expand Up @@ -41,6 +42,7 @@ class ElasticConfig:
batch_size: int = 64
ef_construction: Optional[int] = None
m: Optional[int] = None
columns: Optional[List[Tuple[str, str]]] = None


class BackendMixin(BaseBackendMixin):
Expand Down
8 changes: 8 additions & 0 deletions docarray/array/storage/elastic/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
Sequence,
List,
Union,
Optional,
Dict,
)

import numpy as np
Expand Down Expand Up @@ -99,16 +101,22 @@ def _find(
self,
query: 'ElasticArrayType',
limit: int = 10,
filter: Optional[Dict] = None,
**kwargs,
) -> List['DocumentArray']:
"""Returns approximate nearest neighbors given a batch of input queries.

:param query: input supported to be stored in Elastic. This includes any from the list '[np.ndarray, tensorflow.Tensor, torch.Tensor, Sequence[float]]'
:param limit: number of retrieved items
:param filter: filter query used for pre-filtering

:return: DocumentArray containing the closest documents to the query if it is a single query, otherwise a list of DocumentArrays containing
the closest Document objects for each of the queries in `query`.
"""
if filter is not None:
raise ValueError(
'Filtered vector search is not supported for ElasticSearch backend'
)
query = np.array(query)
num_rows, n_dim = ndarray.get_array_rows(query)
if n_dim != 2:
Expand Down
9 changes: 7 additions & 2 deletions docarray/array/storage/memory/find.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Union, Tuple, Callable, TYPE_CHECKING
from typing import Optional, Union, Tuple, Callable, TYPE_CHECKING, Dict

import numpy as np

Expand Down Expand Up @@ -27,6 +27,7 @@ def _find(
use_scipy: bool = False,
device: str = 'cpu',
num_worker: Optional[int] = 1,
filter: Optional[Dict] = None,
**kwargs,
) -> Tuple['np.ndarray', 'np.ndarray']:
"""Returns approximate nearest neighbors given a batch of input queries.
Expand All @@ -47,11 +48,15 @@ def _find(

.. note::
This argument is only effective when ``batch_size`` is set.

:param filter: filter query used for pre-filtering
:param kwargs: other kwargs.

:return: a list of DocumentArrays containing the closest Document objects for each of the queries in `query`.
"""
if filter is not None:
raise ValueError(
'Filtered vector search is not supported for In-Memory backend'
)

if batch_size is not None:
if batch_size <= 0:
Expand Down
5 changes: 5 additions & 0 deletions docarray/array/storage/qdrant/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Dict,
Iterable,
List,
Tuple,
)

import numpy as np
Expand Down Expand Up @@ -41,6 +42,7 @@ class QdrantConfig:
ef_construct: Optional[int] = None
full_scan_threshold: Optional[int] = None
m: Optional[int] = None
columns: Optional[List[Tuple[str, str]]] = None


class BackendMixin(BaseBackendMixin):
Expand Down Expand Up @@ -86,6 +88,9 @@ def _init_storage(
self._config = config
self._persist = bool(self._config.collection_name)

if self._config.columns is None:
self._config.columns = []

self._config.collection_name = (
self.__class__.__name__ + random_identity()
if self._config.collection_name is None
Expand Down
19 changes: 14 additions & 5 deletions docarray/array/storage/qdrant/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
TypeVar,
Sequence,
List,
Dict,
Optional,
)

from qdrant_client.http.models.models import Distance
Expand Down Expand Up @@ -48,13 +50,15 @@ def serialize_config(self) -> dict:
def distance(self) -> 'Distance':
raise NotImplementedError()

def _find_similar_vectors(self, q: 'QdrantArrayType', limit=10):
def _find_similar_vectors(
self, q: 'QdrantArrayType', limit: int = 10, filter: Optional[Dict] = None
):
query_vector = self._map_embedding(q)

search_result = self.client.search(
self.collection_name,
query_vector=query_vector,
query_filter=None,
query_filter=filter,
search_params=None,
top=limit,
append_payload=['_serialized'],
Expand All @@ -74,11 +78,16 @@ def _find_similar_vectors(self, q: 'QdrantArrayType', limit=10):
return DocumentArray(docs)

def _find(
self, query: 'QdrantArrayType', limit: int = 10, **kwargs
self,
query: 'QdrantArrayType',
limit: int = 10,
filter: Optional[Dict] = None,
**kwargs,
) -> List['DocumentArray']:
"""Returns approximate nearest neighbors given a batch of input queries.
:param query: input supported to be used in Qdrant.
:param limit: number of retrieved items
:param filter: filter query used for pre-filtering


:return: a list of DocumentArrays containing the closest Document objects for each of the queries in `query`.
Expand All @@ -87,10 +96,10 @@ def _find(
num_rows, _ = ndarray.get_array_rows(query)

if num_rows == 1:
return [self._find_similar_vectors(query, limit=limit)]
return [self._find_similar_vectors(query, limit=limit, filter=filter)]
else:
closest_docs = []
for q in query:
da = self._find_similar_vectors(q, limit=limit)
da = self._find_similar_vectors(q, limit=limit, filter=filter)
closest_docs.append(da)
return closest_docs
6 changes: 5 additions & 1 deletion docarray/array/storage/qdrant/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,13 @@ def _qdrant_to_document(self, qdrant_record: dict) -> 'Document':
)

def _document_to_qdrant(self, doc: 'Document') -> 'PointStruct':
extra_columns = {col: doc.tags.get(col) for col, _ in self._config.columns}

return PointStruct(
id=self._map_id(doc.id),
payload=dict(_serialized=doc.to_base64(**self.serialization_config)),
payload=dict(
_serialized=doc.to_base64(**self.serialization_config), **extra_columns
),
vector=self._map_embedding(doc.embedding),
)

Expand Down
2 changes: 2 additions & 0 deletions docarray/array/storage/sqlite/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
Optional,
TYPE_CHECKING,
Union,
List,
Tuple,
)

from .helper import initialize_table
Expand Down
Loading