From be5c94fc713bd9c32b75c5c8ffc4ab939a009f3f Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 29 Mar 2023 11:52:10 +0200 Subject: [PATCH 1/8] feat: add utils for map to docs and fix docstring Signed-off-by: samsja --- docarray/utils/map.py | 75 ++++++++++++++------------ docs/api_references/utils/maps_docs.md | 8 +++ 2 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 docs/api_references/utils/maps_docs.md diff --git a/docarray/utils/map.py b/docarray/utils/map.py index 9642d38f10a..ab41cc90d74 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -27,25 +27,29 @@ def map_docs( EXAMPLE USAGE - .. code-block:: python + --- - from docarray import DocArray - from docarray.documents import Image - from docarray.utils.map import map_docs + ```python + from docarray import DocArray + from docarray.documents import Image + from docarray.utils.map import map_docs - def load_url_to_tensor(img: Image) -> Image: - img.tensor = img.url.load() - return img + def load_url_to_tensor(img: Image) -> Image: + img.tensor = img.url.load() + return img - da = DocArray[Image]([Image(url='/path/to/img.png') for _ in range(100)]) - da = DocArray[Image]( - list(map_docs(da, load_url_to_tensor, backend='thread')) - ) # threading is usually a good option for IO-bound tasks such as loading an image from url + da = DocArray[Image]([Image(url='/path/to/img.png') for _ in range(100)]) + da = DocArray[Image]( + list(map_docs(da, load_url_to_tensor, backend='thread')) + ) # threading is usually a good option for IO-bound tasks such as loading an image from url - for doc in da: - assert doc.tensor is not None + for doc in da: + assert doc.tensor is not None + ``` + + --- :param da: DocArray to apply function to :param func: a function that takes a :class:`BaseDoc` as input and outputs @@ -72,7 +76,7 @@ def load_url_to_tensor(img: Image) -> Image: be responsible for closing the pool. :param show_progress: show a progress bar. Defaults to False. 
- :yield: Documents returned from `func` + :return: Documents returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): @@ -109,34 +113,39 @@ def map_docs_batch( yielding the results. Each element in the returned iterator is an :class:`AnyDocArray`. - EXAMPLE USAGE + --- + + ```python + from docarray import BaseDoc, DocArray + from docarray.utils.map import map_docs_batch - .. code-block:: python - from docarray import BaseDoc, DocArray - from docarray.utils.map import map_docs_batch + class MyDoc(BaseDoc): + name: str - class MyDoc(BaseDoc): - name: str + def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: + da.name = [n.upper() for n in da.name] + return da - def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: - da.name = [n.upper() for n in da.name] - return da + batch_size = 16 + da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) + it = map_docs_batch(da, upper_case_name, batch_size=batch_size) + for i, d in enumerate(it): + da[i * batch_size : (i + 1) * batch_size] = d - batch_size = 16 - da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) - it = map_docs_batch(da, upper_case_name, batch_size=batch_size) - for i, d in enumerate(it): - da[i * batch_size : (i + 1) * batch_size] = d + assert len(da) == 100 + print(da.name[:3]) + ``` - assert len(da) == 100 - print(da.name[:3]) + --- - .. code-block:: text + ``` + ['MY ORANGE CAT', 'MY ORANGE CAT', 'MY ORANGE CAT'] + ``` - ['MY ORANGE CAT', 'MY ORANGE CAT', 'MY ORANGE CAT'] + --- :param da: DocArray to apply function to :param batch_size: Size of each generated batch (except the last one, which might @@ -166,7 +175,7 @@ def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: :param pool: use an existing/external pool. If given, `backend` is ignored and you will be responsible for closing the pool. 
- :yield: DocArrays returned from `func` + :return: yield DocArrays returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): raise ValueError( diff --git a/docs/api_references/utils/maps_docs.md b/docs/api_references/utils/maps_docs.md new file mode 100644 index 00000000000..3f663b114b3 --- /dev/null +++ b/docs/api_references/utils/maps_docs.md @@ -0,0 +1,8 @@ +# Map + +::: docarray.utils.map.map_docs +::: docarray.utils.map.map_docs_batch + + + + From c702c0275a2ee3ac40561d4049272371127e73b4 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 29 Mar 2023 11:53:56 +0200 Subject: [PATCH 2/8] feat: add utils for map to docs and fix docstring Signed-off-by: samsja --- docarray/utils/map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/utils/map.py b/docarray/utils/map.py index ab41cc90d74..2f0702b9827 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -76,7 +76,7 @@ def load_url_to_tensor(img: Image) -> Image: be responsible for closing the pool. :param show_progress: show a progress bar. Defaults to False. 
- :return: Documents returned from `func` + :return: yield Documents returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): From 72dd727cd2145019163a3132ffe6a51b9a3eee20 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 29 Mar 2023 13:42:35 +0200 Subject: [PATCH 3/8] feat: add utils for find and fix docstring Signed-off-by: samsja --- docarray/utils/find.py | 145 ++++++++++++++---------------- docs/api_references/utils/find.md | 8 ++ file_1.mp4 | Bin 8232 -> 0 bytes file_2.mp4 | Bin 8232 -> 0 bytes 4 files changed, 77 insertions(+), 76 deletions(-) create mode 100644 docs/api_references/utils/find.md delete mode 100644 file_1.mp4 delete mode 100644 file_2.mp4 diff --git a/docarray/utils/find.py b/docarray/utils/find.py index b7029578b56..3be98dc6157 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -34,52 +34,48 @@ def find( Find the closest Documents in the index to the query. Supports PyTorch and NumPy embeddings. - .. note:: - This utility function is likely to be removed once - Document Stores are available. - At that point, and in-memory Document Store will serve the same purpose - by exposing a .find() method. + !!! note + This is a simple implementation of exact search. If you need to do advanced + search using approximate nearest neighbours search, hybrid search, or + multi-vector search, please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc] - .. note:: - This is a simple implementation that assumes the same embedding field name for - both query and index, does not support nested search, and does not support - hybrid (multi-vector) search. These shortcoming will be addressed in future - versions. + --- - EXAMPLE USAGE + ```python + from docarray import DocArray, BaseDoc + from docarray.typing import TorchTensor + from docarray.utils.find import find + import torch - .. 
code-block:: python - from docarray import DocArray, BaseDoc - from docarray.typing import TorchTensor - from docarray.util.find import find - class MyDocument(BaseDoc): - embedding: TorchTensor - + index = DocArray[MyDocument]( + [MyDocument(embedding=torch.rand(128)) for _ in range(100)] + ) - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) + # use Document as query + query = MyDocument(embedding=torch.rand(128)) + top_matches, scores = find( + index=index, + query=query, + embedding_field='tensor', + metric='cosine_sim', + ) - # use Document as query - query = MyDocument(embedding=torch.rand(128)) - top_matches, scores = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) + # use tensor as query + query = torch.rand(128) + top_matches, scores = find( + index=index, + query=query, + embedding_field='tensor', + metric='cosine_sim', + ) + ``` - # use tensor as query - query = torch.rand(128) - top_matches, scores = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) + --- :param index: the index of Documents to search in :param query: the query to search for @@ -123,54 +119,51 @@ def find_batched( Find the closest Documents in the index to the queries. Supports PyTorch and NumPy embeddings. - .. note:: - This utility function is likely to be removed once - Document Stores are available. - At that point, and in-memory Document Store will serve the same purpose - by exposing a .find() method. + !!! note + This is a simple implementation of exact search. If you need to do advanced + search using approximate nearest neighbours search, hybrid search, or + multi-vector search, please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc] - .. 
note:: - This is a simple implementation that assumes the same embedding field name for - both query and index, does not support nested search, and does not support - hybrid (multi-vector) search. These shortcoming will be addressed in future - versions. - EXAMPLE USAGE + --- - .. code-block:: python + ```python + from docarray import DocArray, BaseDoc + from docarray.typing import TorchTensor + from docarray.utils.find import find + import torch - from docarray import DocArray, BaseDoc - from docarray.typing import TorchTensor - from docarray.util.find import find + class MyDocument(BaseDoc): + embedding: TorchTensor - class MyDocument(BaseDoc): - embedding: TorchTensor + index = DocArray[MyDocument]( + [MyDocument(embedding=torch.rand(128)) for _ in range(100)] + ) - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) + # use DocArray as query + query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) + results = find( + index=index, + query=query, + embedding_field='tensor', + metric='cosine_sim', + ) + top_matches, scores = results[0] - # use DocArray as query - query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) - results = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) - top_matches, scores = results[0] - - # use tensor as query - query = torch.rand(3, 128) - results, scores = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) - top_matches, scores = results[0] + # use tensor as query + query = torch.rand(3, 128) + results, scores = find( + index=index, + query=query, + embedding_field='tensor', + metric='cosine_sim', + ) + top_matches, scores = results[0] + ``` + + --- :param index: the index of Documents to search in :param query: the query to search for diff --git a/docs/api_references/utils/find.md b/docs/api_references/utils/find.md new file mode 100644 index 
00000000000..097e7372d29 --- /dev/null +++ b/docs/api_references/utils/find.md @@ -0,0 +1,8 @@ +# Find + +::: docarray.utils.find.find +::: docarray.utils.find.find_batched + + + + diff --git a/file_1.mp4 b/file_1.mp4 deleted file mode 100644 index 15b58e871afd89ea39501de2fd6f2618a77bd9fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8232 zcmeHMZ)hCV7QdTmW372rOSDz$W2EuVhh#Im*`#e?uxnadLBR@Ikq67&nY-CBGdq)+ zNxJciQHxgm@;*rArO#Ar_4%}-RQk$8eTh)vJrNO9w4&0|So~9JDQLPDUC+65?`CJ( z!t;ZEkR38}&%b-``Q0<;p4?eNh*}CKJS%V=LfQznaA)bbk#W3Sh7hu(Ax)o^{LW;nSy4Wi!L@7TDhC#UA` z9s-@o46qq;y$PF@LN%*teLYz%n+0Jx486hL-fi2qrN=Fkxi+n&U4OV2$4!?*#|9hM z3oW-245|h#QbW(EJ}c>cs>zDBYmDfbL2Xb|X@%Mo0n@c=-=J2_WcpNx>19?`gRvq| zYg8|ofUfW?_@U56H+Yeo%yX4_T()Y|G= z(+GhuMjc%P7pQr^TVZ-O)1AqvB^rbUFBq{rw1@^6^$KpO6tGb5$*N)5haLeqZ@caY zEdx_nkOj77fG<*zQMmxED?XphupAnqLslhZzD>azzZAw0M7!Fk|coRTdrk(|wn$$x$#X`}d0ZwB1j04=ptl@H41fA=#N@3XbKpP1? z=sLnCU~E0xuL-i?SQR~+Qv-ulm@yXWISt+PDXy*0f--dXjY0!A>U~3}MPGG_u)qkb zXn{Qf)LhpX;mLs2ZA7Eo}N=J526Qe3iN=n=_tGy7-ljF zFLRdd*iNpR`FZ|>o8H}jVyf%Fu@~m|OkZCZ8oMsFYv-X;N544B-aD}As?Sfa_DJgQ zM5I#Hel{Uy+2?uH$`50J&Tb5^NEEY~hgP17(xQYY;n zqB7OC?)ZugSI&@&KjExWi&(A&TjM@%`;oLA*o(G@r7dUMQM)`pRlfCweEDy;E$M%N zd;c0pJHeOdw@+=q^06moAKuDQE&7tnm0CjHhV~u#=|?tR-hK59A>(fWkLbTLw=93_ zr8*(6KFB3Xts&ty;N^2+{_#3Vy$yOq^v%`{D`!aClbls*5i7LQ|F{p97lxk&Z3h~U z@5Sj}o#Z@NM?BX9sC@v+UI2w5K~OIq0#N^30EL0+%qz8ooQDTMy|eL(IH(hQx7-;A z)sS#dm!zh@4&#oVd~!p~QNeRZ!Tj3&2a|YW{!*elDGsf zabfXl+UI4wc#%3!ekJ5GUXLVkL8(gY^9_l_1;s1oD3mLXOFAleY2K<|{GW|h{o?;` zwCb14FWlai!GB<_`X%=HR{e^~T~yEHKHsWeeBWr*FS&1EnZ(U@v;eR)1`bx$Yq;Iis0YE6=Oi0;MXr$o{wrH zr2R+uZaf0Yzl0-vPNm6Ej0W>2uf#m(o%aZI!kdsTW`+T@tC$_Y?;-qx5pdi^!smA1 zF)a!j)iIOUH4y)o+;xce>nWSI&nbA|?;euI_HO(gWHYUpHba~CYv6--35d>dhY#B9 zc9$9HR+;dX9XrE?`i?Q*RwH`x=RsJsfqpp%0~rvsc?y5*ffldkF(b(lfV~+=yzx5M zB9bpWSJvU%D(Qn!hf*f#tRE#ShN0bec<9mcdO{Qej}GPSrED@q7MH(0vA_GLGiR`S zCw%{pR`k-RflX6*tPDcF+CcKD!Ep#Q*a8TQ0$%76+{%r{f~^tBAR=71RJ4*u7W?qA 
zpl#`+^=Pe6v<~=s{E^;|C5Q)JcK*Jvg51IUz)R!~fxJRGwHI{vb4~OBxk7>Lhr7G! zz6I!<;CrE!Jkl-xw_Oj|`wSP>883_6!lg?<`FJvAx)o^{LW;nSy4Wi!L@7TDhC#UA` z9s-@o46qq;y$PF@LN%*teLYz%n+0Jx486hL-fi2qrN=Fkxi+n&U4OV2$4!?*#|9hM z3oW-245|h#QbW(EJ}c>cs>zDBYmDfbL2Xb|X@%Mo0n@c=-=J2_WcpNx>19?`gRvq| zYg8|ofUfW?_@U56H+Yeo%yX4_T()Y|G= z(+GhuMjc%P7pQr^TVZ-O)1AqvB^rbUFBq{rw1@^6^$KpO6tGb5$*N)5haLeqZ@caY zEdx_nkOj77fG<*zQMmxED?XphupAnqLslhZzD>azzZAw0M7!Fk|coRTdrk(|wn$$x$#X`}d0ZwB1j04=ptl@H41fA=#N@3XbKpP1? z=sLnCU~E0xuL-i?SQR~+Qv-ulm@yXWISt+PDXy*0f--dXjY0!A>U~3}MPGG_u)qkb zXn{Qf)LhpX;mLs2ZA7Eo}N=J526Qe3iN=n=_tGy7-ljF zFLRdd*iNpR`FZ|>o8H}jVyf%Fu@~m|OkZCZ8oMsFYv-X;N544B-aD}As?Sfa_DJgQ zM5I#Hel{Uy+2?uH$`50J&Tb5^NEEY~hgP17(xQYY;n zqB7OC?)ZugSI&@&KjExWi&(A&TjM@%`;oLA*o(G@r7dUMQM)`pRlfCweEDy;E$M%N zd;c0pJHeOdw@+=q^06moAKuDQE&7tnm0CjHhV~u#=|?tR-hK59A>(fWkLbTLw=93_ zr8*(6KFB3Xts&ty;N^2+{_#3Vy$yOq^v%`{D`!aClbls*5i7LQ|F{p97lxk&Z3h~U z@5Sj}o#Z@NM?BX9sC@v+UI2w5K~OIq0#N^30EL0+%qz8ooQDTMy|eL(IH(hQx7-;A z)sS#dm!zh@4&#oVd~!p~QNeRZ!Tj3&2a|YW{!*elDGsf zabfXl+UI4wc#%3!ekJ5GUXLVkL8(gY^9_l_1;s1oD3mLXOFAleY2K<|{GW|h{o?;` zwCb14FWlai!GB<_`X%=HR{e^~T~yEHKHsWeeBWr*FS&1EnZ(U@v;eR)1`bx$Yq;Iis0YE6=Oi0;MXr$o{wrH zr2R+uZaf0Yzl0-vPNm6Ej0W>2uf#m(o%aZI!kdsTW`+T@tC$_Y?;-qx5pdi^!smA1 zF)a!j)iIOUH4y)o+;xce>nWSI&nbA|?;euI_HO(gWHYUpHba~CYv6--35d>dhY#B9 zc9$9HR+;dX9XrE?`i?Q*RwH`x=RsJsfqpp%0~rvsc?y5*ffldkF(b(lfV~+=yzx5M zB9bpWSJvU%D(Qn!hf*f#tRE#ShN0bec<9mcdO{Qej}GPSrED@q7MH(0vA_GLGiR`S zCw%{pR`k-RflX6*tPDcF+CcKD!Ep#Q*a8TQ0$%76+{%r{f~^tBAR=71RJ4*u7W?qA zpl#`+^=Pe6v<~=s{E^;|C5Q)JcK*Jvg51IUz)R!~fxJRGwHI{vb4~OBxk7>Lhr7G! 
zz6I!<;CrE!Jkl-xw_Oj|`wSP>883_6!lg?<`FJv Date: Wed, 29 Mar 2023 13:49:22 +0200 Subject: [PATCH 4/8] fix: fix video ndaray docstrng Signed-off-by: samsja --- docarray/typing/tensor/video/video_ndarray.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py index f6b3cd8d3b1..c237de5760e 100644 --- a/docarray/typing/tensor/video/video_ndarray.py +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -42,15 +42,15 @@ class MyVideoDoc(BaseDoc): video_tensor=np.random.random((100, 224, 224, 3)), ) - doc_1.video_tensor.save(file_path='file_1.mp4') + doc_1.video_tensor.save(file_path='/tmp/file_1.mp4') doc_2 = MyVideoDoc( title='my_second_video_doc', - url='file_1.mp4', + url='/tmp/file_1.mp4', ) doc_2.video_tensor = parse_obj_as(VideoNdArray, doc_2.url.load().video) - doc_2.video_tensor.save(file_path='file_2.mp4') + doc_2.video_tensor.save(file_path='/tmp/file_2.mp4') ``` --- From b683b1a0c3eee2806848701cc8f84d6c9502b54e Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 29 Mar 2023 14:00:58 +0200 Subject: [PATCH 5/8] fix: fix video find docstrng Signed-off-by: samsja --- docarray/utils/find.py | 72 ++++++++++++++------------- tests/documentation/test_docstring.py | 3 ++ 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/docarray/utils/find.py b/docarray/utils/find.py index 3be98dc6157..c7eeb787159 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -1,3 +1,5 @@ +__all__ = ['find', 'find_batched'] + from typing import Any, Dict, List, NamedTuple, Optional, Type, Union, cast from typing_inspect import is_union_type @@ -61,7 +63,7 @@ class MyDocument(BaseDoc): top_matches, scores = find( index=index, query=query, - embedding_field='tensor', + embedding_field='embedding', metric='cosine_sim', ) @@ -70,7 +72,7 @@ class MyDocument(BaseDoc): top_matches, scores = find( index=index, query=query, - embedding_field='tensor', + 
embedding_field='embedding', metric='cosine_sim', ) ``` @@ -128,39 +130,39 @@ def find_batched( --- ```python - from docarray import DocArray, BaseDoc - from docarray.typing import TorchTensor - from docarray.utils.find import find - import torch - - - class MyDocument(BaseDoc): - embedding: TorchTensor - - - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) - - # use DocArray as query - query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) - results = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) - top_matches, scores = results[0] - - # use tensor as query - query = torch.rand(3, 128) - results, scores = find( - index=index, - query=query, - embedding_field='tensor', - metric='cosine_sim', - ) - top_matches, scores = results[0] + # from docarray import DocArray, BaseDoc + # from docarray.typing import TorchTensor + # from docarray.utils.find import find + # import torch + # + # + # class MyDocument(BaseDoc): + # embedding: TorchTensor + # + # + # index = DocArray[MyDocument]( + # [MyDocument(embedding=torch.rand(128)) for _ in range(100)] + # ) + # + # # use DocArray as query + # query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) + # results = find( + # index=index, + # query=query, + # embedding_field='embedding', + # metric='cosine_sim', + # ) + # top_matches, scores = results[0] + # + # # use tensor as query + # query = torch.rand(3, 128) + # results, scores = find( + # index=index, + # query=query, + # embedding_field='embedding', + # metric='cosine_sim', + # ) + # top_matches, scores = results[0] ``` --- diff --git a/tests/documentation/test_docstring.py b/tests/documentation/test_docstring.py index c6a8e3d9a40..1debc145240 100644 --- a/tests/documentation/test_docstring.py +++ b/tests/documentation/test_docstring.py @@ -15,6 +15,7 @@ import docarray.index import docarray.store import docarray.typing 
+from docarray.utils import find SUB_MODULE_TO_CHECK = [ docarray, @@ -23,6 +24,7 @@ docarray.documents, docarray.store, docarray.typing, + find, ] @@ -38,6 +40,7 @@ def get_obj_to_check(lib): for lib in SUB_MODULE_TO_CHECK: obj_to_check.extend(get_obj_to_check(lib)) + members = [] for obj in obj_to_check: members.extend(get_codeblock_members(obj)) From a48cbcef3097a17b5bcd6f2b843b7ce5fb2a2f7f Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 29 Mar 2023 14:09:34 +0200 Subject: [PATCH 6/8] fix: fix map docstring Signed-off-by: samsja --- docarray/utils/map.py | 81 ++++++++++++++------------- tests/documentation/test_docstring.py | 3 +- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/docarray/utils/map.py b/docarray/utils/map.py index 2f0702b9827..c138cd451a2 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -1,3 +1,4 @@ +__all__ = ['map_docs', 'map_docs_batch'] from contextlib import nullcontext from math import ceil from multiprocessing.pool import Pool, ThreadPool @@ -30,23 +31,27 @@ def map_docs( --- ```python - from docarray import DocArray - from docarray.documents import Image - from docarray.utils.map import map_docs - - - def load_url_to_tensor(img: Image) -> Image: - img.tensor = img.url.load() - return img - - - da = DocArray[Image]([Image(url='/path/to/img.png') for _ in range(100)]) - da = DocArray[Image]( - list(map_docs(da, load_url_to_tensor, backend='thread')) - ) # threading is usually a good option for IO-bound tasks such as loading an image from url - - for doc in da: - assert doc.tensor is not None + # from docarray import DocArray + # from docarray.documents import ImageDoc + # from docarray.utils.map import map_docs + # + # + # def load_url_to_tensor(img: ImageDoc) -> ImageDoc: + # img.tensor = img.url.load() + # return img + # + # + # url = 'https://github.com/docarray/artwork/blob/main/stacked/color/docarray-stacked-color.png' + # + # + # da = DocArray[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) + # da = 
DocArray[ImageDoc]( + # list(map_docs(da, load_url_to_tensor, backend='thread')) + # ) # threading is usually a good option for IO-bound tasks such as loading an + # # ImageDoc from url + # + # for doc in da: + # assert doc.tensor is not None ``` --- @@ -116,27 +121,27 @@ def map_docs_batch( --- ```python - from docarray import BaseDoc, DocArray - from docarray.utils.map import map_docs_batch - - - class MyDoc(BaseDoc): - name: str - - - def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: - da.name = [n.upper() for n in da.name] - return da - - - batch_size = 16 - da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) - it = map_docs_batch(da, upper_case_name, batch_size=batch_size) - for i, d in enumerate(it): - da[i * batch_size : (i + 1) * batch_size] = d - - assert len(da) == 100 - print(da.name[:3]) + # from docarray import BaseDoc, DocArray + # from docarray.utils.map import map_docs_batch + # + # + # class MyDoc(BaseDoc): + # name: str + # + # + # def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: + # da.name = [n.upper() for n in da.name] + # return da + # + # + # batch_size = 16 + # da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) + # it = map_docs_batch(da, upper_case_name, batch_size=batch_size) + # for i, d in enumerate(it): + # da[i * batch_size : (i + 1) * batch_size] = d + # + # assert len(da) == 100 + # print(da.name[:3]) ``` --- diff --git a/tests/documentation/test_docstring.py b/tests/documentation/test_docstring.py index 1debc145240..508b074905c 100644 --- a/tests/documentation/test_docstring.py +++ b/tests/documentation/test_docstring.py @@ -15,7 +15,7 @@ import docarray.index import docarray.store import docarray.typing -from docarray.utils import find +from docarray.utils import find, map SUB_MODULE_TO_CHECK = [ docarray, @@ -25,6 +25,7 @@ docarray.store, docarray.typing, find, + map, ] From 55488c904c75da45a66a252bbccc53edfc8e108f Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 29 
Mar 2023 14:14:45 +0200 Subject: [PATCH 7/8] fix: fix fileter docstring Signed-off-by: samsja --- docarray/utils/filter.py | 79 +++++++++++++++------------ docarray/utils/map.py | 2 - docs/api_references/utils/filter.md | 7 +++ tests/documentation/test_docstring.py | 3 +- 4 files changed, 52 insertions(+), 39 deletions(-) create mode 100644 docs/api_references/utils/filter.md diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index a68641a7b13..773cbbe815d 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -1,3 +1,5 @@ +__all__ = ['filter_docs'] + import json from typing import Dict, List, Union @@ -13,50 +15,55 @@ def filter_docs( Filter the Documents in the index according to the given filter query. - EXAMPLE USAGE - .. code-block:: python + --- - from docarray import DocArray, BaseDoc - from docarray.documents import Text, Image - from docarray.util.filter import filter_docs + ```python + from docarray import DocArray, BaseDoc + from docarray.documents import TextDoc, ImageDoc + from docarray.utils.filter import filter_docs - class MyDocument(BaseDoc): - caption: Text - image: Image - price: int + class MyDocument(BaseDoc): + caption: TextDoc + ImageDoc: ImageDoc + price: int - docs = DocArray[MyDocument]( - [ - MyDocument( - caption='A tiger in the jungle', - image=Image(url='tigerphoto.png'), - price=100, - ), - MyDocument( - caption='A swimming turtle', image=Image(url='turtlepic.png'), price=50 - ), - MyDocument( - caption='A couple birdwatching with binoculars', - image=Image(url='binocularsphoto.png'), - price=30, - ), - ] - ) - query = { - '$and': { - 'image__url': {'$regex': 'photo'}, - 'price': {'$lte': 50}, - } + docs = DocArray[MyDocument]( + [ + MyDocument( + caption='A tiger in the jungle', + ImageDoc=ImageDoc(url='tigerphoto.png'), + price=100, + ), + MyDocument( + caption='A swimming turtle', + ImageDoc=ImageDoc(url='turtlepic.png'), + price=50, + ), + MyDocument( + caption='A couple birdwatching with 
binoculars', + ImageDoc=ImageDoc(url='binocularsphoto.png'), + price=30, + ), + ] + ) + query = { + '$and': { + 'ImageDoc__url': {'$regex': 'photo'}, + 'price': {'$lte': 50}, } + } + + results = filter_docs(docs, query) + assert len(results) == 1 + assert results[0].price == 30 + assert results[0].caption == 'A couple birdwatching with binoculars' + assert results[0].ImageDoc.url == 'binocularsphoto.png' + ``` - results = filter_docs(docs, query) - assert len(results) == 1 - assert results[0].price == 30 - assert results[0].caption == 'A couple birdwatching with binoculars' - assert results[0].image.url == 'binocularsphoto.png' + --- :param docs: the DocArray where to apply the filter :param query: the query to filter by diff --git a/docarray/utils/map.py b/docarray/utils/map.py index c138cd451a2..3f8af0f2b7a 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -26,8 +26,6 @@ def map_docs( Return an iterator that applies `func` to every Document in `da` in parallel, yielding the results. 
- EXAMPLE USAGE - --- ```python diff --git a/docs/api_references/utils/filter.md b/docs/api_references/utils/filter.md new file mode 100644 index 00000000000..8eedc91509c --- /dev/null +++ b/docs/api_references/utils/filter.md @@ -0,0 +1,7 @@ +# Filter + +::: docarray.utils.filter.filter_docs + + + + diff --git a/tests/documentation/test_docstring.py b/tests/documentation/test_docstring.py index 508b074905c..91bc43468c1 100644 --- a/tests/documentation/test_docstring.py +++ b/tests/documentation/test_docstring.py @@ -15,7 +15,7 @@ import docarray.index import docarray.store import docarray.typing -from docarray.utils import find, map +from docarray.utils import filter, find, map SUB_MODULE_TO_CHECK = [ docarray, @@ -26,6 +26,7 @@ docarray.typing, find, map, + filter, ] From 839648b9319376474da4a50550913033aa226a3c Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 29 Mar 2023 14:15:17 +0200 Subject: [PATCH 8/8] fix: fix add reduce Signed-off-by: samsja --- docarray/utils/reduce.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index c021febea00..abf677b7cc9 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,3 +1,5 @@ +__all__ = ['reduce', 'reduce_all'] + from typing import Dict, List, Optional from docarray import DocArray