diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py index f6b3cd8d3b1..c237de5760e 100644 --- a/docarray/typing/tensor/video/video_ndarray.py +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -42,15 +42,15 @@ class MyVideoDoc(BaseDoc): video_tensor=np.random.random((100, 224, 224, 3)), ) - doc_1.video_tensor.save(file_path='file_1.mp4') + doc_1.video_tensor.save(file_path='/tmp/file_1.mp4') doc_2 = MyVideoDoc( title='my_second_video_doc', - url='file_1.mp4', + url='/tmp/file_1.mp4', ) doc_2.video_tensor = parse_obj_as(VideoNdArray, doc_2.url.load().video) - doc_2.video_tensor.save(file_path='file_2.mp4') + doc_2.video_tensor.save(file_path='/tmp/file_2.mp4') ``` --- diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index a68641a7b13..773cbbe815d 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -1,3 +1,5 @@ +__all__ = ['filter_docs'] + import json from typing import Dict, List, Union @@ -13,50 +15,55 @@ def filter_docs( Filter the Documents in the index according to the given filter query. - EXAMPLE USAGE - .. 
code-block:: python + --- - from docarray import DocArray, BaseDoc - from docarray.documents import Text, Image - from docarray.util.filter import filter_docs + ```python + from docarray import DocArray, BaseDoc + from docarray.documents import TextDoc, ImageDoc + from docarray.utils.filter import filter_docs - class MyDocument(BaseDoc): - caption: Text - image: Image - price: int + class MyDocument(BaseDoc): + caption: TextDoc + ImageDoc: ImageDoc + price: int - docs = DocArray[MyDocument]( - [ - MyDocument( - caption='A tiger in the jungle', - image=Image(url='tigerphoto.png'), - price=100, - ), - MyDocument( - caption='A swimming turtle', image=Image(url='turtlepic.png'), price=50 - ), - MyDocument( - caption='A couple birdwatching with binoculars', - image=Image(url='binocularsphoto.png'), - price=30, - ), - ] - ) - query = { - '$and': { - 'image__url': {'$regex': 'photo'}, - 'price': {'$lte': 50}, - } + docs = DocArray[MyDocument]( + [ + MyDocument( + caption='A tiger in the jungle', + ImageDoc=ImageDoc(url='tigerphoto.png'), + price=100, + ), + MyDocument( + caption='A swimming turtle', + ImageDoc=ImageDoc(url='turtlepic.png'), + price=50, + ), + MyDocument( + caption='A couple birdwatching with binoculars', + ImageDoc=ImageDoc(url='binocularsphoto.png'), + price=30, + ), + ] + ) + query = { + '$and': { + 'ImageDoc__url': {'$regex': 'photo'}, + 'price': {'$lte': 50}, } + } + + results = filter_docs(docs, query) + assert len(results) == 1 + assert results[0].price == 30 + assert results[0].caption == 'A couple birdwatching with binoculars' + assert results[0].ImageDoc.url == 'binocularsphoto.png' + ``` - results = filter_docs(docs, query) - assert len(results) == 1 - assert results[0].price == 30 - assert results[0].caption == 'A couple birdwatching with binoculars' - assert results[0].image.url == 'binocularsphoto.png' + --- :param docs: the DocArray where to apply the filter :param query: the query to filter by diff --git a/docarray/utils/find.py 
b/docarray/utils/find.py index b7029578b56..c7eeb787159 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -1,3 +1,5 @@ +__all__ = ['find', 'find_batched'] + from typing import Any, Dict, List, NamedTuple, Optional, Type, Union, cast from typing_inspect import is_union_type @@ -34,52 +36,48 @@ def find( Find the closest Documents in the index to the query. Supports PyTorch and NumPy embeddings. - .. note:: - This utility function is likely to be removed once - Document Stores are available. - At that point, and in-memory Document Store will serve the same purpose - by exposing a .find() method. - - .. note:: - This is a simple implementation that assumes the same embedding field name for - both query and index, does not support nested search, and does not support - hybrid (multi-vector) search. These shortcoming will be addressed in future - versions. + !!! note + This is a simple implementation of exact search. If you need to do advanced + search using approximate nearest neighbours search or hybrid search or + multi-vector search please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc] - EXAMPLE USAGE + --- - .. 
code-block:: python + ```python + from docarray import DocArray, BaseDoc + from docarray.typing import TorchTensor + from docarray.utils.find import find + import torch - from docarray import DocArray, BaseDoc - from docarray.typing import TorchTensor - from docarray.util.find import find + class MyDocument(BaseDoc): + embedding: TorchTensor - class MyDocument(BaseDoc): - embedding: TorchTensor + index = DocArray[MyDocument]( + [MyDocument(embedding=torch.rand(128)) for _ in range(100)] + ) - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) + # use Document as query + query = MyDocument(embedding=torch.rand(128)) + top_matches, scores = find( + index=index, + query=query, + embedding_field='embedding', + metric='cosine_sim', + ) - # use Document as query - query = MyDocument(embedding=torch.rand(128)) - top_matches, scores = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) + # use tensor as query + query = torch.rand(128) + top_matches, scores = find( + index=index, + query=query, + embedding_field='embedding', + metric='cosine_sim', + ) + ``` - # use tensor as query - query = torch.rand(128) - top_matches, scores = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) + --- :param index: the index of Documents to search in :param query: the query to search for @@ -123,54 +121,51 @@ def find_batched( Find the closest Documents in the index to the queries. Supports PyTorch and NumPy embeddings. - .. note:: - This utility function is likely to be removed once - Document Stores are available. - At that point, and in-memory Document Store will serve the same purpose - by exposing a .find() method. - - .. note:: - This is a simple implementation that assumes the same embedding field name for - both query and index, does not support nested search, and does not support - hybrid (multi-vector) search. 
These shortcoming will be addressed in future - versions. - - EXAMPLE USAGE - - .. code-block:: python - - from docarray import DocArray, BaseDoc - from docarray.typing import TorchTensor - from docarray.util.find import find - - - class MyDocument(BaseDoc): - embedding: TorchTensor - - - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) - - # use DocArray as query - query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) - results = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) - top_matches, scores = results[0] - - # use tensor as query - query = torch.rand(3, 128) - results, scores = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) - top_matches, scores = results[0] + !!! note + This is a simple implementation of exact search. If you need to do advanced + search using approximate nearest neighbours search or hybrid search or + multi-vector search please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc] + + + --- + + ```python + # from docarray import DocArray, BaseDoc + # from docarray.typing import TorchTensor + # from docarray.utils.find import find + # import torch + # + # + # class MyDocument(BaseDoc): + # embedding: TorchTensor + # + # + # index = DocArray[MyDocument]( + # [MyDocument(embedding=torch.rand(128)) for _ in range(100)] + # ) + # + # # use DocArray as query + # query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) + # results = find( + # index=index, + # query=query, + # embedding_field='embedding', + # metric='cosine_sim', + # ) + # top_matches, scores = results[0] + # + # # use tensor as query + # query = torch.rand(3, 128) + # results, scores = find( + # index=index, + # query=query, + # embedding_field='embedding', + # metric='cosine_sim', + # ) + # top_matches, scores = results[0] + ``` + + --- :param index: the index of Documents to 
search in :param query: the query to search for diff --git a/docarray/utils/map.py b/docarray/utils/map.py index 9642d38f10a..3f8af0f2b7a 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -1,3 +1,4 @@ +__all__ = ['map_docs', 'map_docs_batch'] from contextlib import nullcontext from math import ceil from multiprocessing.pool import Pool, ThreadPool @@ -25,27 +26,33 @@ def map_docs( Return an iterator that applies `func` to every Document in `da` in parallel, yielding the results. - EXAMPLE USAGE - - .. code-block:: python - - from docarray import DocArray - from docarray.documents import Image - from docarray.utils.map import map_docs - - - def load_url_to_tensor(img: Image) -> Image: - img.tensor = img.url.load() - return img - - - da = DocArray[Image]([Image(url='/path/to/img.png') for _ in range(100)]) - da = DocArray[Image]( - list(map_docs(da, load_url_to_tensor, backend='thread')) - ) # threading is usually a good option for IO-bound tasks such as loading an image from url - - for doc in da: - assert doc.tensor is not None + --- + + ```python + # from docarray import DocArray + # from docarray.documents import ImageDoc + # from docarray.utils.map import map_docs + # + # + # def load_url_to_tensor(img: ImageDoc) -> ImageDoc: + # img.tensor = img.url.load() + # return img + # + # + # url = 'https://github.com/docarray/artwork/blob/main/stacked/color/docarray-stacked-color.png' + # + # + # da = DocArray[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) + # da = DocArray[ImageDoc]( + # list(map_docs(da, load_url_to_tensor, backend='thread')) + # ) # threading is usually a good option for IO-bound tasks such as loading an + # # ImageDoc from url + # + # for doc in da: + # assert doc.tensor is not None + ``` + + --- :param da: DocArray to apply function to :param func: a function that takes a :class:`BaseDoc` as input and outputs @@ -72,7 +79,7 @@ def load_url_to_tensor(img: Image) -> Image: be responsible for closing the pool. 
:param show_progress: show a progress bar. Defaults to False. - :yield: Documents returned from `func` + :return: yield Documents returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): @@ -109,34 +116,39 @@ def map_docs_batch( yielding the results. Each element in the returned iterator is an :class:`AnyDocArray`. - EXAMPLE USAGE - - .. code-block:: python - from docarray import BaseDoc, DocArray - from docarray.utils.map import map_docs_batch - - - class MyDoc(BaseDoc): - name: str - - - def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: - da.name = [n.upper() for n in da.name] - return da - - - batch_size = 16 - da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) - it = map_docs_batch(da, upper_case_name, batch_size=batch_size) - for i, d in enumerate(it): - da[i * batch_size : (i + 1) * batch_size] = d - - assert len(da) == 100 - print(da.name[:3]) - - .. code-block:: text - - ['MY ORANGE CAT', 'MY ORANGE CAT', 'MY ORANGE CAT'] + --- + + ```python + # from docarray import BaseDoc, DocArray + # from docarray.utils.map import map_docs_batch + # + # + # class MyDoc(BaseDoc): + # name: str + # + # + # def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: + # da.name = [n.upper() for n in da.name] + # return da + # + # + # batch_size = 16 + # da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) + # it = map_docs_batch(da, upper_case_name, batch_size=batch_size) + # for i, d in enumerate(it): + # da[i * batch_size : (i + 1) * batch_size] = d + # + # assert len(da) == 100 + # print(da.name[:3]) + ``` + + --- + + ``` + ['MY ORANGE CAT', 'MY ORANGE CAT', 'MY ORANGE CAT'] + ``` + + --- :param da: DocArray to apply function to :param batch_size: Size of each generated batch (except the last one, which might @@ -166,7 +178,7 @@ def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: :param pool: use an existing/external pool. 
If given, `backend` is ignored and you will be responsible for closing the pool. - :yield: DocArrays returned from `func` + :return: yield DocArrays returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): raise ValueError( diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index c021febea00..abf677b7cc9 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,3 +1,5 @@ +__all__ = ['reduce', 'reduce_all'] + from typing import Dict, List, Optional from docarray import DocArray diff --git a/docs/api_references/utils/filter.md b/docs/api_references/utils/filter.md new file mode 100644 index 00000000000..8eedc91509c --- /dev/null +++ b/docs/api_references/utils/filter.md @@ -0,0 +1,7 @@ +# Filter + +::: docarray.utils.filter.filter_docs + + + + diff --git a/docs/api_references/utils/find.md b/docs/api_references/utils/find.md new file mode 100644 index 00000000000..097e7372d29 --- /dev/null +++ b/docs/api_references/utils/find.md @@ -0,0 +1,8 @@ +# Find + +::: docarray.utils.find.find +::: docarray.utils.find.find_batched + + + + diff --git a/docs/api_references/utils/maps_docs.md b/docs/api_references/utils/maps_docs.md new file mode 100644 index 00000000000..3f663b114b3 --- /dev/null +++ b/docs/api_references/utils/maps_docs.md @@ -0,0 +1,8 @@ +# Map + +::: docarray.utils.map.map_docs +::: docarray.utils.map.map_docs_batch + + + + diff --git a/file_1.mp4 b/file_1.mp4 deleted file mode 100644 index 15b58e871af..00000000000 Binary files a/file_1.mp4 and /dev/null differ diff --git a/file_2.mp4 b/file_2.mp4 deleted file mode 100644 index 15b58e871af..00000000000 Binary files a/file_2.mp4 and /dev/null differ diff --git a/tests/documentation/test_docstring.py b/tests/documentation/test_docstring.py index c6a8e3d9a40..91bc43468c1 100644 --- a/tests/documentation/test_docstring.py +++ b/tests/documentation/test_docstring.py @@ -15,6 +15,7 @@ import docarray.index import docarray.store import 
docarray.typing +from docarray.utils import filter, find, map SUB_MODULE_TO_CHECK = [ docarray, @@ -23,6 +24,9 @@ docarray.documents, docarray.store, docarray.typing, + find, + map, + filter, ] @@ -38,6 +42,7 @@ def get_obj_to_check(lib): for lib in SUB_MODULE_TO_CHECK: obj_to_check.extend(get_obj_to_check(lib)) + members = [] for obj in obj_to_check: members.extend(get_codeblock_members(obj))