From a5ef6ca3baf9bc9e5054d376643b8914040e4aeb Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Wed, 1 Feb 2023 19:08:53 +0100 Subject: [PATCH 01/11] feat: add reduce utils --- docarray/utils/reduce.py | 134 ++++++++++++++++++++++++++++++++ tests/units/util/test_reduce.py | 33 ++++++++ 2 files changed, 167 insertions(+) create mode 100644 docarray/utils/reduce.py create mode 100644 tests/units/util/test_reduce.py diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py new file mode 100644 index 00000000000..b26947cab8e --- /dev/null +++ b/docarray/utils/reduce.py @@ -0,0 +1,134 @@ +from docarray import DocumentArray +from typing import List, Optional, Dict, TYPE_CHECKING, Tuple, _GenericAlias +from typing_inspect import is_union_type + + +if TYPE_CHECKING: # pragma: no cover + from docarray.base_document import BaseDocument + + +def _non_empty_fields(doc: 'BaseDocument') -> Tuple[str]: + r: List[str] = [] + for field_name in doc.__fields__.keys(): + v = getattr(doc, field_name) + if v: + r.append(field_name) + return tuple(r) + + +def _array_fields(doc: 'BaseDocument') -> Tuple[str]: + ret: List[str] = [] + for field_name, field in doc.__fields__.items(): + field_type = field.outer_type_ + print(f'HEY {field_type} => {type(field_type)}') + print(f' {isinstance(field_type, _GenericAlias)}') + if isinstance(field_type, _GenericAlias): + print(field_type.__origin__) + if isinstance(field_type, DocumentArray) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list): + ret.append(field_name) + else: + print(f' hhey 2') + return tuple(ret) + + +""" +A mixin that provides reducing logic for :class:`DocumentArray` +Reducing 2 or more DocumentArrays consists in merging all Documents into the same DocumentArray. +If a Document belongs to 2 or more DocumentArrays, it is added once and data attributes are merged with priority to +the Document belonging to the left-most DocumentArray. Matches and chunks are also reduced in the same way. 
+Reduction is applied to all levels of DocumentArrays, that is, from root Documents to all their chunk and match +children. +""" + + +def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Optional[List[str]] = None): + """ + Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1. + Reducing 2 Documents consists in setting data properties of the second Document to the first Document if they + are empty (that is, priority to the left-most Document) and reducing the matches and the chunks of both + documents. + Non-data properties are ignored. + Reduction of matches and chunks relies on :class:`DocumentArray`.:method:`reduce`. + :param doc1: first Document + :param doc2: second Document + :param array_fields: + """ + doc1_fields = set(_non_empty_fields(doc1)) + doc2_fields = set(_non_empty_fields(doc2)) + + # update only fields that are set in doc2 and not set in doc1 + fields = doc2_fields - doc1_fields + + for field in fields: + setattr(doc1, field, getattr(doc2, field)) + + array_fields = array_fields or _array_fields(doc1) + for field in array_fields: + array1 = getattr(doc1, field) + array2 = getattr(doc2, field) + if array1 is None and array2 is not None: + setattr(doc1, field, array2) + elif array1 is not None and array2 is not None: + array1.extend(array2) + setattr(doc1, field, array1) # I am not sure if this is optimal, how can I do (doc1.field.extend()) + + return doc1 + + +def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None, + array_fields: Optional[List[str]] = None) -> 'DocumentArray': + """ + Reduces other and the current DocumentArray into one DocumentArray in-place. Changes are applied to the current + DocumentArray. + Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray to the first DocumentArray + if they do not exist. 
If a Document exists in both DocumentArrays, the data properties are merged with priority + to the first Document (that is, to the current DocumentArray's Document). The matches and chunks are also + reduced in the same way. + :param left: DocumentArray + :param other: DocumentArray + :param left_id_map: + :param array_fields: + + :return: DocumentArray + """ + left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)} + array_fields = array_fields or left[0].array_fields + + for doc in other: + if doc.id in left_id_map: + reduce_docs(left[left_id_map[doc.id]], doc, array_fields) + else: + left.append(doc) + + return left + + +def reduce_all(left: DocumentArray, others: List[DocumentArray]) -> DocumentArray: + """ + Reduces a list of DocumentArrays and this DocumentArray into one DocumentArray. Changes are applied to this + DocumentArray in-place. + + Reduction consists in reducing this DocumentArray with every DocumentArray in `others` sequentially using + :class:`DocumentArray`.:method:`reduce`. + The resulting DocumentArray contains Documents of all DocumentArrays. + If a Document exists in many DocumentArrays, data properties are merged with priority to the left-most + DocumentArrays (that is, if a data attribute is set in a Document belonging to many DocumentArrays, the + attribute value of the left-most DocumentArray is kept). + Matches and chunks of a Document belonging to many DocumentArrays are also reduced in the same way. + Other non-data properties are ignored. + + .. note:: + - Matches are not kept in a sorted order when they are reduced. You might want to re-sort them in a later + step. + - The final result depends on the order of DocumentArrays when applying reduction. 
+ + :param left: + :param others: List of DocumentArrays to be reduced + :return: the resulting DocumentArray + """ + assert len(left) > 0, 'In order to reduce DocumentArrays we should have a non empty DocumentArray' + left_id_map = {doc.id: i for i, doc in enumerate(left)} + array_fields = left[0].array_fields + for da in others: + reduce(left, da, left_id_map, array_fields) + return left diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py new file mode 100644 index 00000000000..eb3ccaa8b52 --- /dev/null +++ b/tests/units/util/test_reduce.py @@ -0,0 +1,33 @@ +from typing import Optional, List, Dict, Any +from docarray import BaseDocument, DocumentArray +from docarray.documents import Image +from docarray.utils.reduce import reduce_docs + + +class MMDoc(BaseDocument): + text: str = '' + price: int = 0 + categories: Optional[List[str]] = None + image: Optional[Image] = None + matches: Optional[DocumentArray] = None + dictionary: Optional[Dict[str, Any]] = None + opt_int: Optional[int] = None + + +def test_simple_reduce_arrays_concatenated(): + doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()])) + doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()])) + + result = reduce_docs(doc1, doc2) + assert result.text == 'hey here' + assert len(result.matches) == 2 + assert result.categories == ['a', 'b', 'c', 'd', 'e', 'f'] + assert result.opt_int == 5 + assert result.price == 10 + + # doc1 is changed in place (no extra memory) + assert doc1.text == 'hey here' + assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f'] + assert len(doc1.matches) == 2 + assert doc1.opt_int == 5 + assert doc1.price == 10 From 745d1c23afb1ea04809fe6fad9b12082c6ce8c34 Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Thu, 2 Feb 2023 12:38:55 +0100 Subject: [PATCH 02/11] feat: support sets reducing Signed-off-by: Joan Fontanals 
Martinez --- docarray/utils/reduce.py | 14 +++++--------- tests/units/util/test_reduce.py | 11 ++++++++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index b26947cab8e..fd63ad63b40 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,6 +1,5 @@ from docarray import DocumentArray from typing import List, Optional, Dict, TYPE_CHECKING, Tuple, _GenericAlias -from typing_inspect import is_union_type if TYPE_CHECKING: # pragma: no cover @@ -20,14 +19,8 @@ def _array_fields(doc: 'BaseDocument') -> Tuple[str]: ret: List[str] = [] for field_name, field in doc.__fields__.items(): field_type = field.outer_type_ - print(f'HEY {field_type} => {type(field_type)}') - print(f' {isinstance(field_type, _GenericAlias)}') - if isinstance(field_type, _GenericAlias): - print(field_type.__origin__) - if isinstance(field_type, DocumentArray) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list): + if (not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray)) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list) or (isinstance(field_type, _GenericAlias) and (field_type.__origin__ is set)): ret.append(field_name) - else: - print(f' hhey 2') return tuple(ret) @@ -69,7 +62,10 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Option if array1 is None and array2 is not None: setattr(doc1, field, array2) elif array1 is not None and array2 is not None: - array1.extend(array2) + if isinstance(array1, set): + array1.update(array2) + else: + array1.extend(array2) setattr(doc1, field, array1) # I am not sure if this is optimal, how can I do (doc1.field.extend()) return doc1 diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index eb3ccaa8b52..3c5d9d56aa1 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -1,4 +1,4 @@ -from typing import Optional, 
List, Dict, Any +from typing import Optional, List, Dict, Any, Set from docarray import BaseDocument, DocumentArray from docarray.documents import Image from docarray.utils.reduce import reduce_docs @@ -12,11 +12,14 @@ class MMDoc(BaseDocument): matches: Optional[DocumentArray] = None dictionary: Optional[Dict[str, Any]] = None opt_int: Optional[int] = None + test_set: Optional[Set] = None def test_simple_reduce_arrays_concatenated(): - doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()])) - doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()])) + doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={ + 'a', 'a'}) + doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={ + 'a', 'b'}) result = reduce_docs(doc1, doc2) assert result.text == 'hey here' @@ -24,6 +27,7 @@ def test_simple_reduce_arrays_concatenated(): assert result.categories == ['a', 'b', 'c', 'd', 'e', 'f'] assert result.opt_int == 5 assert result.price == 10 + assert result.test_set == {'a', 'b'} # doc1 is changed in place (no extra memory) assert doc1.text == 'hey here' @@ -31,3 +35,4 @@ def test_simple_reduce_arrays_concatenated(): assert len(doc1.matches) == 2 assert doc1.opt_int == 5 assert doc1.price == 10 + assert doc1.test_set == {'a', 'b'} From 9419ec5198f07162040077ea8b9729919bfd4622 Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Thu, 2 Feb 2023 12:49:51 +0100 Subject: [PATCH 03/11] feat: support sub docarrays reducing Signed-off-by: Joan Fontanals Martinez --- docarray/utils/reduce.py | 94 +++++++++++++++++++-------------- tests/units/util/test_reduce.py | 38 +++++++++++-- 2 files changed, 89 insertions(+), 43 deletions(-) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index 
fd63ad63b40..a3fd12266a0 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,27 +1,30 @@ from docarray import DocumentArray -from typing import List, Optional, Dict, TYPE_CHECKING, Tuple, _GenericAlias +from typing import List, Optional, Dict, Tuple, _GenericAlias +from docarray.base_document import BaseDocument -if TYPE_CHECKING: # pragma: no cover - from docarray.base_document import BaseDocument +def _types_analysis(doc: 'BaseDocument') -> Tuple[List[str]]: + simple_non_empty_fields: List[str] = [] + list_fields: List[str] = [] + set_fields: List[str] = [] + nested_docs_fields: List[str] = [] + nested_docarray_fields: List[str] = [] - -def _non_empty_fields(doc: 'BaseDocument') -> Tuple[str]: - r: List[str] = [] - for field_name in doc.__fields__.keys(): - v = getattr(doc, field_name) - if v: - r.append(field_name) - return tuple(r) - - -def _array_fields(doc: 'BaseDocument') -> Tuple[str]: - ret: List[str] = [] for field_name, field in doc.__fields__.items(): field_type = field.outer_type_ - if (not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray)) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list) or (isinstance(field_type, _GenericAlias) and (field_type.__origin__ is set)): - ret.append(field_name) - return tuple(ret) + if not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray): + nested_docarray_fields.append(field_name) + elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is list: + list_fields.append(field_name) + elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is set: + set_fields.append(field_name) + v = getattr(doc, field_name) + if v: + if isinstance(v, BaseDocument): + nested_docs_fields.append(field_name) + else: + simple_non_empty_fields.append(field_name) + return tuple([simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields]) """ @@ -34,7 +37,7 @@ def 
_array_fields(doc: 'BaseDocument') -> Tuple[str]: """ -def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Optional[List[str]] = None): +def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': """ Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1. Reducing 2 Documents consists in setting data properties of the second Document to the first Document if they @@ -44,35 +47,52 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Option Reduction of matches and chunks relies on :class:`DocumentArray`.:method:`reduce`. :param doc1: first Document :param doc2: second Document - :param array_fields: """ - doc1_fields = set(_non_empty_fields(doc1)) - doc2_fields = set(_non_empty_fields(doc2)) + doc1_simple_non_empty_fields, doc1_list_fields, doc1_set_fields, doc1_nested_docarray_fields, doc1_nested_docs_fields = _types_analysis( + doc1) + doc2_simple_non_empty_fields, doc2_list_fields, doc2_set_fields, doc2_nested_docarray_fields, doc2_nested_docs_fields = _types_analysis( + doc2) # update only fields that are set in doc2 and not set in doc1 - fields = doc2_fields - doc1_fields + update_simple_fields = set(doc2_simple_non_empty_fields) - set(doc1_simple_non_empty_fields) - for field in fields: + for field in update_simple_fields: setattr(doc1, field, getattr(doc2, field)) - array_fields = array_fields or _array_fields(doc1) - for field in array_fields: + for field in set(doc1_nested_docs_fields + doc2_nested_docs_fields): + setattr(doc1, field, reduce_docs(getattr(doc1, field), getattr(doc2, field))) + + for field in doc1_list_fields: array1 = getattr(doc1, field) array2 = getattr(doc2, field) if array1 is None and array2 is not None: setattr(doc1, field, array2) elif array1 is not None and array2 is not None: - if isinstance(array1, set): - array1.update(array2) - else: - array1.extend(array2) - setattr(doc1, field, array1) # I am not sure if this is optimal, how can I do 
(doc1.field.extend()) + array1.extend(array2) + setattr(doc1, field, array1) + + for field in doc1_set_fields: + array1 = getattr(doc1, field) + array2 = getattr(doc2, field) + if array1 is None and array2 is not None: + setattr(doc1, field, array2) + elif array1 is not None and array2 is not None: + array1.update(array2) + setattr(doc1, field, array1) + + for field in doc1_nested_docarray_fields: + array1 = getattr(doc1, field) + array2 = getattr(doc2, field) + if array1 is None and array2 is not None: + setattr(doc1, field, array2) + elif array1 is not None and array2 is not None: + array1 = reduce(array1, array2) + setattr(doc1, field, array1) return doc1 -def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None, - array_fields: Optional[List[str]] = None) -> 'DocumentArray': +def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None) -> 'DocumentArray': """ Reduces other and the current DocumentArray into one DocumentArray in-place. Changes are applied to the current DocumentArray. 
@@ -83,16 +103,13 @@ def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict :param left: DocumentArray :param other: DocumentArray :param left_id_map: - :param array_fields: - :return: DocumentArray """ left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)} - array_fields = array_fields or left[0].array_fields for doc in other: if doc.id in left_id_map: - reduce_docs(left[left_id_map[doc.id]], doc, array_fields) + reduce_docs(left[left_id_map[doc.id]], doc) else: left.append(doc) @@ -124,7 +141,6 @@ def reduce_all(left: DocumentArray, others: List[DocumentArray]) -> DocumentArra """ assert len(left) > 0, 'In order to reduce DocumentArrays we should have a non empty DocumentArray' left_id_map = {doc.id: i for i, doc in enumerate(left)} - array_fields = left[0].array_fields for da in others: - reduce(left, da, left_id_map, array_fields) + reduce(left, da, left_id_map) return left diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index 3c5d9d56aa1..9f27ea895a3 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -4,22 +4,43 @@ from docarray.utils.reduce import reduce_docs +class InnerDoc(BaseDocument): + integer: int + l: List + + class MMDoc(BaseDocument): text: str = '' price: int = 0 categories: Optional[List[str]] = None image: Optional[Image] = None matches: Optional[DocumentArray] = None + matches_with_same_id: Optional[DocumentArray] = None dictionary: Optional[Dict[str, Any]] = None opt_int: Optional[int] = None test_set: Optional[Set] = None + inner_doc: Optional[InnerDoc] = None def test_simple_reduce_arrays_concatenated(): - doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={ - 'a', 'a'}) - doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={ - 'a', 'b'}) + doc1 = MMDoc( + text='hey here', + 
categories=['a', 'b', 'c'], + price=10, + matches=DocumentArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]), + test_set={'a', 'a'}, + inner_doc=InnerDoc(integer=2, l=['c', 'd'])) + doc2 = MMDoc( + id=doc1.id, + text='hey here 2', + categories=['d', 'e', 'f'], + price=5, + opt_int=5, + matches=DocumentArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]), + test_set={'a', 'b'}, + inner_doc=InnerDoc(integer=3, l=['a', 'b'])) result = reduce_docs(doc1, doc2) assert result.text == 'hey here' @@ -28,6 +49,10 @@ def test_simple_reduce_arrays_concatenated(): assert result.opt_int == 5 assert result.price == 10 assert result.test_set == {'a', 'b'} + assert len(result.matches_with_same_id) == 1 + assert len(result.matches_with_same_id[0].matches) == 2 + assert result.inner_doc.integer == 2 + assert result.inner_doc.l == ['c', 'd', 'a', 'b'] # doc1 is changed in place (no extra memory) assert doc1.text == 'hey here' @@ -36,3 +61,8 @@ def test_simple_reduce_arrays_concatenated(): assert doc1.opt_int == 5 assert doc1.price == 10 assert doc1.test_set == {'a', 'b'} + assert len(doc1.matches_with_same_id) == 1 + assert len(doc1.matches_with_same_id[0].matches) == 2 + assert doc1.inner_doc.integer == 2 + assert doc1.inner_doc.l == ['c', 'd', 'a', 'b'] + From 5f0ecefb24a1c6ade474214de0230d26024d6eb2 Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Thu, 2 Feb 2023 17:04:48 +0100 Subject: [PATCH 04/11] feat: finish feature implementation and testing Signed-off-by: Joan Fontanals Martinez --- docarray/utils/reduce.py | 46 ++++++++++++----------- tests/units/util/test_reduce.py | 65 +++++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 25 deletions(-) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index a3fd12266a0..c0a69a4e33e 100644 --- a/docarray/utils/reduce.py +++ 
b/docarray/utils/reduce.py @@ -1,9 +1,9 @@ from docarray import DocumentArray -from typing import List, Optional, Dict, Tuple, _GenericAlias +from typing import List, Optional, Dict, _GenericAlias from docarray.base_document import BaseDocument -def _types_analysis(doc: 'BaseDocument') -> Tuple[List[str]]: +def _types_analysis(doc: 'BaseDocument') -> List[List[str]]: simple_non_empty_fields: List[str] = [] list_fields: List[str] = [] set_fields: List[str] = [] @@ -24,7 +24,7 @@ def _types_analysis(doc: 'BaseDocument') -> Tuple[List[str]]: nested_docs_fields.append(field_name) else: simple_non_empty_fields.append(field_name) - return tuple([simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields]) + return [simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields] """ @@ -48,10 +48,19 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': :param doc1: first Document :param doc2: second Document """ - doc1_simple_non_empty_fields, doc1_list_fields, doc1_set_fields, doc1_nested_docarray_fields, doc1_nested_docs_fields = _types_analysis( - doc1) - doc2_simple_non_empty_fields, doc2_list_fields, doc2_set_fields, doc2_nested_docarray_fields, doc2_nested_docs_fields = _types_analysis( - doc2) + doc1_fields = _types_analysis(doc1) + doc1_simple_non_empty_fields = doc1_fields[0] + doc1_list_fields = doc1_fields[1] + doc1_set_fields = doc1_fields[2] + doc1_nested_docarray_fields = doc1_fields[3] + doc1_nested_docs_fields = doc1_fields[4] + + doc2_fields = _types_analysis(doc2) + doc2_simple_non_empty_fields = doc2_fields[0] + doc2_list_fields = doc2_fields[1] + doc2_set_fields = doc2_fields[2] + doc2_nested_docarray_fields = doc2_fields[3] + doc2_nested_docs_fields = doc2_fields[4] # update only fields that are set in doc2 and not set in doc1 update_simple_fields = set(doc2_simple_non_empty_fields) - set(doc1_simple_non_empty_fields) @@ -116,30 +125,25 @@ def reduce(left: 
DocumentArray, other: DocumentArray, left_id_map: Optional[Dict return left -def reduce_all(left: DocumentArray, others: List[DocumentArray]) -> DocumentArray: +def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray: """ - Reduces a list of DocumentArrays and this DocumentArray into one DocumentArray. Changes are applied to this - DocumentArray in-place. + Reduces a list of DocumentArrays into one DocumentArray. Changes are applied to the first DocumentArray in-place. - Reduction consists in reducing this DocumentArray with every DocumentArray in `others` sequentially using - :class:`DocumentArray`.:method:`reduce`. The resulting DocumentArray contains Documents of all DocumentArrays. - If a Document exists in many DocumentArrays, data properties are merged with priority to the left-most + If a Document exists (identified by their ID) in many DocumentArrays, data properties are merged with priority to the left-most DocumentArrays (that is, if a data attribute is set in a Document belonging to many DocumentArrays, the attribute value of the left-most DocumentArray is kept). - Matches and chunks of a Document belonging to many DocumentArrays are also reduced in the same way. - Other non-data properties are ignored. - + Nested DocumentArrays belonging to many DocumentArrays are also reduced in the same way. .. note:: - - Matches are not kept in a sorted order when they are reduced. You might want to re-sort them in a later - step. + - Nested DocumentArrays order does not follow any specific rule. You might want to re-sort them in a later step. - The final result depends on the order of DocumentArrays when applying reduction. 
- :param left: - :param others: List of DocumentArrays to be reduced + :param docarrays: List of DocumentArrays to be reduced :return: the resulting DocumentArray """ - assert len(left) > 0, 'In order to reduce DocumentArrays we should have a non empty DocumentArray' + assert len(docarrays) > 1, 'In order to reduce DocumentArrays we should have more than one DocumentArray' + left = docarrays[0] + others = docarrays[1:] left_id_map = {doc.id: i for i, doc in enumerate(left)} for da in others: reduce(left, da, left_id_map) diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index 9f27ea895a3..2fb092b7e15 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -1,7 +1,8 @@ +import pytest from typing import Optional, List, Dict, Any, Set from docarray import BaseDocument, DocumentArray from docarray.documents import Image -from docarray.utils.reduce import reduce_docs +from docarray.utils.reduce import reduce_docs, reduce, reduce_all class InnerDoc(BaseDocument): @@ -22,8 +23,9 @@ class MMDoc(BaseDocument): inner_doc: Optional[InnerDoc] = None -def test_simple_reduce_arrays_concatenated(): - doc1 = MMDoc( +@pytest.fixture +def doc1(): + return MMDoc( text='hey here', categories=['a', 'b', 'c'], price=10, @@ -31,7 +33,11 @@ def test_simple_reduce_arrays_concatenated(): matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, l=['c', 'd'])) - doc2 = MMDoc( + + +@pytest.fixture +def doc2(doc1): + return MMDoc( id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], @@ -42,6 +48,8 @@ def test_simple_reduce_arrays_concatenated(): test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, l=['a', 'b'])) + +def test_reduce_docs(doc1, doc2): result = reduce_docs(doc1, doc2) assert result.text == 'hey here' assert len(result.matches) == 2 @@ -66,3 +74,52 @@ def test_simple_reduce_arrays_concatenated(): assert doc1.inner_doc.integer 
== 2 assert doc1.inner_doc.l == ['c', 'd', 'a', 'b'] + +def test_reduce_different_ids(): + da1 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)]) + da2 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)]) + result = reduce(da1, da2) + assert len(result) == 20 + # da1 is changed in place (no extra memory) + assert len(da1) == 20 + + +def test_reduce(doc1, doc2): + da1 = DocumentArray[MMDoc]([doc1, MMDoc()]) + da2 = DocumentArray[MMDoc]([MMDoc(), doc2]) + result = reduce(da1, da2) + assert len(result) == 3 + # da1 is changed in place (no extra memory) + assert len(da1) == 3 + merged_doc = result[0] + assert merged_doc.text == 'hey here' + assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f'] + assert len(merged_doc.matches) == 2 + assert merged_doc.opt_int == 5 + assert merged_doc.price == 10 + assert merged_doc.test_set == {'a', 'b'} + assert len(merged_doc.matches_with_same_id) == 1 + assert len(merged_doc.matches_with_same_id[0].matches) == 2 + assert merged_doc.inner_doc.integer == 2 + assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b'] + + +def test_reduce_all(doc1, doc2): + da1 = DocumentArray[MMDoc]([doc1, MMDoc()]) + da2 = DocumentArray[MMDoc]([MMDoc(), doc2]) + da3 = DocumentArray[MMDoc]([MMDoc(), MMDoc(), doc1]) + result = reduce_all([da1, da2, da3]) + assert len(result) == 5 + # da1 is changed in place (no extra memory) + assert len(da1) == 5 + merged_doc = result[0] + assert merged_doc.text == 'hey here' + assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f', 'a', 'b', 'c', 'd', 'e', 'f'] + assert len(merged_doc.matches) == 2 + assert merged_doc.opt_int == 5 + assert merged_doc.price == 10 + assert merged_doc.test_set == {'a', 'b'} + assert len(merged_doc.matches_with_same_id) == 1 + assert len(merged_doc.matches_with_same_id[0].matches) == 2 + assert merged_doc.inner_doc.integer == 2 + assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b', 'c', 'd', 'a', 'b'] From 5c439b8b500286660afa280edddc4ab8e972eddb Mon Sep 17 00:00:00 2001 
From: Joan Fontanals Martinez Date: Thu, 2 Feb 2023 17:36:25 +0100 Subject: [PATCH 05/11] docs: add documentation and fix ruff Signed-off-by: Joan Fontanals Martinez --- docarray/utils/reduce.py | 118 ++++++++++++++++++-------------- tests/units/util/test_reduce.py | 29 ++++++-- 2 files changed, 90 insertions(+), 57 deletions(-) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index c0a69a4e33e..f842cac3f75 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,5 +1,5 @@ from docarray import DocumentArray -from typing import List, Optional, Dict, _GenericAlias +from typing import List, Optional, Dict, _GenericAlias # type: ignore from docarray.base_document import BaseDocument @@ -12,7 +12,9 @@ def _types_analysis(doc: 'BaseDocument') -> List[List[str]]: for field_name, field in doc.__fields__.items(): field_type = field.outer_type_ - if not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray): + if not isinstance(field_type, _GenericAlias) and issubclass( + field_type, DocumentArray + ): nested_docarray_fields.append(field_name) elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is list: list_fields.append(field_name) @@ -24,46 +26,43 @@ def _types_analysis(doc: 'BaseDocument') -> List[List[str]]: nested_docs_fields.append(field_name) else: simple_non_empty_fields.append(field_name) - return [simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields] - - -""" -A mixin that provides reducing logic for :class:`DocumentArray` -Reducing 2 or more DocumentArrays consists in merging all Documents into the same DocumentArray. -If a Document belongs to 2 or more DocumentArrays, it is added once and data attributes are merged with priority to -the Document belonging to the left-most DocumentArray. Matches and chunks are also reduced in the same way. 
-Reduction is applied to all levels of DocumentArrays, that is, from root Documents to all their chunk and match -children. -""" + return [ + simple_non_empty_fields, + list_fields, + set_fields, + nested_docarray_fields, + nested_docs_fields, + ] def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': """ Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1. - Reducing 2 Documents consists in setting data properties of the second Document to the first Document if they - are empty (that is, priority to the left-most Document) and reducing the matches and the chunks of both - documents. - Non-data properties are ignored. - Reduction of matches and chunks relies on :class:`DocumentArray`.:method:`reduce`. - :param doc1: first Document - :param doc2: second Document + Reducing 2 Documents consists in setting data properties of the second Document + to the first Document if they are empty (priority to the left-most Document) + and reducing recursively its nested Documents and DocumentArrays + :param doc1: first Document to be reduced. 
Change is applied in-place + :param doc2: second Document to be reduced + :return The reduced Document """ doc1_fields = _types_analysis(doc1) doc1_simple_non_empty_fields = doc1_fields[0] - doc1_list_fields = doc1_fields[1] - doc1_set_fields = doc1_fields[2] - doc1_nested_docarray_fields = doc1_fields[3] - doc1_nested_docs_fields = doc1_fields[4] + doc1_list_fields = doc1_fields[1] + doc1_set_fields = doc1_fields[2] + doc1_nested_docarray_fields = doc1_fields[3] + doc1_nested_docs_fields = doc1_fields[4] doc2_fields = _types_analysis(doc2) doc2_simple_non_empty_fields = doc2_fields[0] - doc2_list_fields = doc2_fields[1] - doc2_set_fields = doc2_fields[2] - doc2_nested_docarray_fields = doc2_fields[3] - doc2_nested_docs_fields = doc2_fields[4] + doc2_list_fields = doc2_fields[1] + doc2_set_fields = doc2_fields[2] + doc2_nested_docarray_fields = doc2_fields[3] + doc2_nested_docs_fields = doc2_fields[4] # update only fields that are set in doc2 and not set in doc1 - update_simple_fields = set(doc2_simple_non_empty_fields) - set(doc1_simple_non_empty_fields) + update_simple_fields = set(doc2_simple_non_empty_fields) - set( + doc1_simple_non_empty_fields + ) for field in update_simple_fields: setattr(doc1, field, getattr(doc2, field)) @@ -71,7 +70,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': for field in set(doc1_nested_docs_fields + doc2_nested_docs_fields): setattr(doc1, field, reduce_docs(getattr(doc1, field), getattr(doc2, field))) - for field in doc1_list_fields: + for field in set(doc1_list_fields + doc2_list_fields): array1 = getattr(doc1, field) array2 = getattr(doc2, field) if array1 is None and array2 is not None: @@ -80,7 +79,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': array1.extend(array2) setattr(doc1, field, array1) - for field in doc1_set_fields: + for field in set(doc1_set_fields + doc2_set_fields): array1 = getattr(doc1, field) array2 = getattr(doc2, field) if array1 is None 
and array2 is not None: @@ -89,7 +88,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': array1.update(array2) setattr(doc1, field, array1) - for field in doc1_nested_docarray_fields: + for field in set(doc1_nested_docarray_fields + doc2_nested_docarray_fields): array1 = getattr(doc1, field) array2 = getattr(doc2, field) if array1 is None and array2 is not None: @@ -101,22 +100,29 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': return doc1 -def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None) -> 'DocumentArray': +def reduce( + left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None +) -> 'DocumentArray': """ - Reduces other and the current DocumentArray into one DocumentArray in-place. Changes are applied to the current - DocumentArray. - Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray to the first DocumentArray - if they do not exist. If a Document exists in both DocumentArrays, the data properties are merged with priority - to the first Document (that is, to the current DocumentArray's Document). The matches and chunks are also - reduced in the same way. - :param left: DocumentArray - :param other: DocumentArray - :param left_id_map: - :return: DocumentArray + Reduces left and right DocumentArray into one DocumentArray in-place. + Changes are applied to the left DocumentArray. + Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray + to the first DocumentArray if they do not exist. + If a Document exists in both DocumentArrays (identified by ID), + the data properties are merged with priority to the left Document. + + Nested DocumentArrays are also reduced in the same way. + :param left: First DocumentArray to be reduced. 
Changes will be applied to it + in-place + :param right: Second DocumentArray to be reduced + :param left_id_map: Optional parameter to be passed in repeated calls + for optimizations, keeping a map of the Document ID to its offset + in the DocumentArray + :return: Reduced DocumentArray """ left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)} - for doc in other: + for doc in right: if doc.id in left_id_map: reduce_docs(left[left_id_map[doc.id]], doc) else: @@ -127,21 +133,29 @@ def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray: """ - Reduces a list of DocumentArrays into one DocumentArray. Changes are applied to the first DocumentArray in-place. + Reduces a list of DocumentArrays into one DocumentArray. + Changes are applied to the first DocumentArray in-place. The resulting DocumentArray contains Documents of all DocumentArrays. - If a Document exists (identified by their ID) in many DocumentArrays, data properties are merged with priority to the left-most - DocumentArrays (that is, if a data attribute is set in a Document belonging to many DocumentArrays, the - attribute value of the left-most DocumentArray is kept). - Nested DocumentArrays belonging to many DocumentArrays are also reduced in the same way. + If a Document exists (identified by their ID) in many DocumentArrays, + data properties are merged with priority to the left-most + DocumentArrays (that is, if a data attribute is set in a Document + belonging to many DocumentArrays, the attribute value of the left-most + DocumentArray is kept). + Nested DocumentArrays belonging to many DocumentArrays + are also reduced in the same way. .. note:: - - Nested DocumentArrays order does not follow any specific rule. You might want to re-sort them in a later step. - - The final result depends on the order of DocumentArrays when applying reduction. 
+ - Nested DocumentArrays order does not follow any specific rule. + You might want to re-sort them in a later step. + - The final result depends on the order of DocumentArrays + when applying reduction. :param docarrays: List of DocumentArrays to be reduced :return: the resulting DocumentArray """ - assert len(docarrays) > 1, 'In order to reduce DocumentArrays we should have more than one DocumentArray' + assert ( + len(docarrays) > 1 + ), 'In order to reduce DocumentArrays we should have more than one DocumentArray' left = docarrays[0] others = docarrays[1:] left_id_map = {doc.id: i for i, doc in enumerate(left)} diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index 2fb092b7e15..ae10378dea3 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -30,9 +30,12 @@ def doc1(): categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]), + matches_with_same_id=DocumentArray[MMDoc]( + [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + ), test_set={'a', 'a'}, - inner_doc=InnerDoc(integer=2, l=['c', 'd'])) + inner_doc=InnerDoc(integer=2, l=['c', 'd']), + ) @pytest.fixture @@ -44,9 +47,12 @@ def doc2(doc1): price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]), + matches_with_same_id=DocumentArray[MMDoc]( + [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + ), test_set={'a', 'b'}, - inner_doc=InnerDoc(integer=3, l=['a', 'b'])) + inner_doc=InnerDoc(integer=3, l=['a', 'b']), + ) def test_reduce_docs(doc1, doc2): @@ -114,7 +120,20 @@ def test_reduce_all(doc1, doc2): assert len(da1) == 5 merged_doc = result[0] assert merged_doc.text == 'hey here' - assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f', 'a', 'b', 'c', 'd', 'e', 'f'] + assert merged_doc.categories == [ 
+ 'a', + 'b', + 'c', + 'd', + 'e', + 'f', + 'a', + 'b', + 'c', + 'd', + 'e', + 'f', + ] assert len(merged_doc.matches) == 2 assert merged_doc.opt_int == 5 assert merged_doc.price == 10 From 7c3d9f4f07c27dd000d87d5f2767eb4bd9905e2f Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Fri, 3 Feb 2023 10:46:35 +0100 Subject: [PATCH 06/11] fix: apply comments and support dicts Signed-off-by: Joan Fontanals Martinez --- docarray/utils/reduce.py | 82 +++++++++++++++++++++++---------- tests/units/util/test_reduce.py | 6 ++- 2 files changed, 62 insertions(+), 26 deletions(-) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index 8d4efb925b2..d2c2c3a021d 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -2,16 +2,32 @@ from typing import List, Optional, Dict, _GenericAlias # type: ignore from docarray.base_document import BaseDocument - -def _types_analysis(doc: 'BaseDocument') -> List[List[str]]: +from collections import namedtuple + +# Declaring namedtuple() +_FieldGroups = namedtuple( + '_FieldGroups', + [ + 'simple_non_empty_fields', + 'list_fields', + 'set_fields', + 'dict_fields', + 'nested_docarray_fields', + 'nested_docs_fields', + ], +) + + +def _group_fields(doc: 'BaseDocument') -> _FieldGroups: simple_non_empty_fields: List[str] = [] list_fields: List[str] = [] set_fields: List[str] = [] + dict_fields: List[str] = [] nested_docs_fields: List[str] = [] nested_docarray_fields: List[str] = [] for field_name, field in doc.__fields__.items(): - field_type = field.outer_type_ + field_type = doc._get_field_type(field_name) if not isinstance(field_type, _GenericAlias) and issubclass( field_type, DocumentArray ): @@ -20,44 +36,52 @@ def _types_analysis(doc: 'BaseDocument') -> List[List[str]]: list_fields.append(field_name) elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is set: set_fields.append(field_name) + elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is dict: + 
dict_fields.append(field_name) v = getattr(doc, field_name) if v: if isinstance(v, BaseDocument): nested_docs_fields.append(field_name) else: simple_non_empty_fields.append(field_name) - return [ + return _FieldGroups( simple_non_empty_fields, list_fields, set_fields, + dict_fields, nested_docarray_fields, nested_docs_fields, - ] + ) def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': """ Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1. - Reducing 2 Documents consists in setting data properties of the second Document - to the first Document if they are empty (priority to the left-most Document) - and reducing recursively its nested Documents and DocumentArrays + Reducing 2 Documents consists in the following: + - setting data properties of the second Document to the first Document + if they are empty (priority to the left-most Document) + - Concatenating lists and updating sets + - Reducing recursively Documents and DocumentArrays + - Updating Dictionaries of the left with the right :param doc1: first Document to be reduced. 
Change is applied in-place :param doc2: second Document to be reduced - :return The reduced Document + :return: The reduced Document """ - doc1_fields = _types_analysis(doc1) - doc1_simple_non_empty_fields = doc1_fields[0] - doc1_list_fields = doc1_fields[1] - doc1_set_fields = doc1_fields[2] - doc1_nested_docarray_fields = doc1_fields[3] - doc1_nested_docs_fields = doc1_fields[4] - - doc2_fields = _types_analysis(doc2) - doc2_simple_non_empty_fields = doc2_fields[0] - doc2_list_fields = doc2_fields[1] - doc2_set_fields = doc2_fields[2] - doc2_nested_docarray_fields = doc2_fields[3] - doc2_nested_docs_fields = doc2_fields[4] + doc1_fields = _group_fields(doc1) + doc1_simple_non_empty_fields = doc1_fields.simple_non_empty_fields + doc1_list_fields = doc1_fields.list_fields + doc1_set_fields = doc1_fields.set_fields + doc1_dict_fields = doc1_fields.dict_fields + doc1_nested_docarray_fields = doc1_fields.nested_docarray_fields + doc1_nested_docs_fields = doc1_fields.nested_docs_fields + + doc2_fields = _group_fields(doc2) + doc2_simple_non_empty_fields = doc2_fields.simple_non_empty_fields + doc2_list_fields = doc2_fields.list_fields + doc2_set_fields = doc2_fields.set_fields + doc2_dict_fields = doc2_fields.dict_fields + doc2_nested_docarray_fields = doc2_fields.nested_docarray_fields + doc2_nested_docs_fields = doc2_fields.nested_docs_fields # update only fields that are set in doc2 and not set in doc1 update_simple_fields = set(doc2_simple_non_empty_fields) - set( @@ -97,6 +121,15 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': array1 = reduce(array1, array2) setattr(doc1, field, array1) + for field in set(doc1_dict_fields + doc2_dict_fields): + dict1 = getattr(doc1, field) + dict2 = getattr(doc2, field) + if dict1 is None and dict2 is not None: + setattr(doc1, field, dict2) + elif dict1 is not None and dict2 is not None: + dict1.update(dict2) + setattr(doc1, field, dict1) + return doc1 @@ -153,9 +186,8 @@ def reduce_all(docarrays: 
List[DocumentArray]) -> DocumentArray: :param docarrays: List of DocumentArrays to be reduced :return: the resulting DocumentArray """ - assert ( - len(docarrays) > 1 - ), 'In order to reduce DocumentArrays we should have more than one DocumentArray' + if len(docarrays) <= 1: + raise Exception('In order to reduce DocumentArrays we should have more than one DocumentArray') left = docarrays[0] others = docarrays[1:] left_id_map = {doc.id: i for i, doc in enumerate(left)} diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index ae10378dea3..33e154957e2 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -17,10 +17,10 @@ class MMDoc(BaseDocument): image: Optional[Image] = None matches: Optional[DocumentArray] = None matches_with_same_id: Optional[DocumentArray] = None - dictionary: Optional[Dict[str, Any]] = None opt_int: Optional[int] = None test_set: Optional[Set] = None inner_doc: Optional[InnerDoc] = None + test_dict: Optional[Dict] = None @pytest.fixture @@ -35,6 +35,7 @@ def doc1(): ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, l=['c', 'd']), + test_dict={'a': 0, 'b': 2, 'd': 4} ) @@ -52,6 +53,7 @@ def doc2(doc1): ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, l=['a', 'b']), + test_dict={'a': 10, 'b': 10, 'c': 3} ) @@ -67,6 +69,7 @@ def test_reduce_docs(doc1, doc2): assert len(result.matches_with_same_id[0].matches) == 2 assert result.inner_doc.integer == 2 assert result.inner_doc.l == ['c', 'd', 'a', 'b'] + assert result.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4} # doc1 is changed in place (no extra memory) assert doc1.text == 'hey here' @@ -79,6 +82,7 @@ def test_reduce_docs(doc1, doc2): assert len(doc1.matches_with_same_id[0].matches) == 2 assert doc1.inner_doc.integer == 2 assert doc1.inner_doc.l == ['c', 'd', 'a', 'b'] + assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4} def test_reduce_different_ids(): From 5bb4262d59a949dc8417e3e326b05e14f2247d4e Mon Sep 17 00:00:00 
2001 From: Joan Fontanals Martinez Date: Fri, 3 Feb 2023 12:03:36 +0100 Subject: [PATCH 07/11] feat: add update method to BaseDocument and fix reduce behavior Signed-off-by: Joan Fontanals Martinez --- docarray/base_document/document.py | 36 +++++ docarray/utils/reduce.py | 165 ++++++++++----------- tests/units/document/test_base_document.py | 19 ++- tests/units/util/test_reduce.py | 38 ++--- 4 files changed, 149 insertions(+), 109 deletions(-) diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py index e891eef0fb2..72904d49020 100644 --- a/docarray/base_document/document.py +++ b/docarray/base_document/document.py @@ -46,3 +46,39 @@ def __str__(self): def _get_string_for_regex_filter(self): return str(self) + + def update(self, other: 'BaseDocument'): + """ + Updates the content of this Document with the contents of other using + :func:`~docarray.utils.reduce.reduce_docs`. + + It behaves as an update operation for Dictionaries, except that since + it is applied to a static schema type, the presence of the field is + given by the field not having a None value. + + EXAMPLE USAGE + + .. 
code-block:: python + + from docarray import BaseDocument + from docarray.documents import Text + + class MyDocument(BaseDocument): + content: str + title: Optional[str] = None + tags_: List + + doc1 = MyDocument(content='Core content of the document', + title='Title', tags_=['python', 'AI']) + doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + + doc1.update(doc2) + assert doc1.content == 'Core content updated' + assert doc1.title == 'Title' + assert doc1.tags_ == ['python', 'AI', 'docarray'] + + :param other: The Document with which to update the contents of this + """ + from docarray.utils.reduce import reduce_docs + + reduce_docs(self, other) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index d2c2c3a021d..1c53ebb1953 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,100 +1,96 @@ from docarray import DocumentArray -from typing import List, Optional, Dict, _GenericAlias # type: ignore +from typing import List, Optional, Dict, TypeVar, _GenericAlias # type: ignore from docarray.base_document import BaseDocument -from collections import namedtuple - -# Declaring namedtuple() -_FieldGroups = namedtuple( - '_FieldGroups', - [ - 'simple_non_empty_fields', - 'list_fields', - 'set_fields', - 'dict_fields', - 'nested_docarray_fields', - 'nested_docs_fields', - ], -) - - -def _group_fields(doc: 'BaseDocument') -> _FieldGroups: - simple_non_empty_fields: List[str] = [] - list_fields: List[str] = [] - set_fields: List[str] = [] - dict_fields: List[str] = [] - nested_docs_fields: List[str] = [] - nested_docarray_fields: List[str] = [] - - for field_name, field in doc.__fields__.items(): - field_type = doc._get_field_type(field_name) - if not isinstance(field_type, _GenericAlias) and issubclass( - field_type, DocumentArray - ): - nested_docarray_fields.append(field_name) - elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is list: - list_fields.append(field_name) - elif isinstance(field_type, 
_GenericAlias) and field_type.__origin__ is set: - set_fields.append(field_name) - elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is dict: - dict_fields.append(field_name) - v = getattr(doc, field_name) - if v: - if isinstance(v, BaseDocument): - nested_docs_fields.append(field_name) - else: - simple_non_empty_fields.append(field_name) - return _FieldGroups( - simple_non_empty_fields, - list_fields, - set_fields, - dict_fields, - nested_docarray_fields, - nested_docs_fields, - ) +T = TypeVar('T', bound='BaseDocument') -def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': +def reduce_docs(doc1: 'T', doc2: 'T') -> None: """ Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1. Reducing 2 Documents consists in the following: - setting data properties of the second Document to the first Document - if they are empty (priority to the left-most Document) + if they are not None - Concatenating lists and updating sets - Reducing recursively Documents and DocumentArrays - Updating Dictionaries of the left with the right :param doc1: first Document to be reduced. 
Change is applied in-place :param doc2: second Document to be reduced - :return: The reduced Document """ - doc1_fields = _group_fields(doc1) - doc1_simple_non_empty_fields = doc1_fields.simple_non_empty_fields - doc1_list_fields = doc1_fields.list_fields - doc1_set_fields = doc1_fields.set_fields - doc1_dict_fields = doc1_fields.dict_fields - doc1_nested_docarray_fields = doc1_fields.nested_docarray_fields - doc1_nested_docs_fields = doc1_fields.nested_docs_fields + from collections import namedtuple + + # Declaring namedtuple() + _FieldGroups = namedtuple( + '_FieldGroups', + [ + 'simple_non_empty_fields', + 'list_fields', + 'set_fields', + 'dict_fields', + 'nested_docarray_fields', + 'nested_docs_fields', + ], + ) + + FORBIDDEN_FIELDS_TO_UPDATE = ['ID'] + + def _group_fields(doc: 'BaseDocument') -> _FieldGroups: + simple_non_empty_fields: List[str] = [] + list_fields: List[str] = [] + set_fields: List[str] = [] + dict_fields: List[str] = [] + nested_docs_fields: List[str] = [] + nested_docarray_fields: List[str] = [] + + for field_name, field in doc.__fields__.items(): + if field_name not in FORBIDDEN_FIELDS_TO_UPDATE: + field_type = doc._get_field_type(field_name) + if not isinstance(field_type, _GenericAlias) and issubclass( + field_type, DocumentArray + ): + nested_docarray_fields.append(field_name) + elif ( + isinstance(field_type, _GenericAlias) + and field_type.__origin__ is list + ): + list_fields.append(field_name) + elif ( + isinstance(field_type, _GenericAlias) + and field_type.__origin__ is set + ): + set_fields.append(field_name) + elif ( + isinstance(field_type, _GenericAlias) + and field_type.__origin__ is dict + ): + dict_fields.append(field_name) + else: + v = getattr(doc, field_name) + if v: + if isinstance(v, BaseDocument): + nested_docs_fields.append(field_name) + else: + simple_non_empty_fields.append(field_name) + return _FieldGroups( + simple_non_empty_fields, + list_fields, + set_fields, + dict_fields, + nested_docarray_fields, + 
nested_docs_fields, + ) + doc1_fields = _group_fields(doc1) doc2_fields = _group_fields(doc2) - doc2_simple_non_empty_fields = doc2_fields.simple_non_empty_fields - doc2_list_fields = doc2_fields.list_fields - doc2_set_fields = doc2_fields.set_fields - doc2_dict_fields = doc2_fields.dict_fields - doc2_nested_docarray_fields = doc2_fields.nested_docarray_fields - doc2_nested_docs_fields = doc2_fields.nested_docs_fields - - # update only fields that are set in doc2 and not set in doc1 - update_simple_fields = set(doc2_simple_non_empty_fields) - set( - doc1_simple_non_empty_fields - ) - for field in update_simple_fields: + for field in doc2_fields.simple_non_empty_fields: setattr(doc1, field, getattr(doc2, field)) - for field in set(doc1_nested_docs_fields + doc2_nested_docs_fields): - setattr(doc1, field, reduce_docs(getattr(doc1, field), getattr(doc2, field))) + for field in set(doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields): + reduce_docs(getattr(doc1, field), getattr(doc2, field)) + setattr(doc1, field, getattr(doc1, field)) - for field in set(doc1_list_fields + doc2_list_fields): + for field in set(doc1_fields.list_fields + doc2_fields.list_fields): array1 = getattr(doc1, field) array2 = getattr(doc2, field) if array1 is None and array2 is not None: @@ -103,7 +99,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': array1.extend(array2) setattr(doc1, field, array1) - for field in set(doc1_set_fields + doc2_set_fields): + for field in set(doc1_fields.set_fields + doc2_fields.set_fields): array1 = getattr(doc1, field) array2 = getattr(doc2, field) if array1 is None and array2 is not None: @@ -112,7 +108,9 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': array1.update(array2) setattr(doc1, field, array1) - for field in set(doc1_nested_docarray_fields + doc2_nested_docarray_fields): + for field in set( + doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields + ): array1 
= getattr(doc1, field) array2 = getattr(doc2, field) if array1 is None and array2 is not None: @@ -121,7 +119,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': array1 = reduce(array1, array2) setattr(doc1, field, array1) - for field in set(doc1_dict_fields + doc2_dict_fields): + for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields): dict1 = getattr(doc1, field) dict2 = getattr(doc2, field) if dict1 is None and dict2 is not None: @@ -130,8 +128,6 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument': dict1.update(dict2) setattr(doc1, field, dict1) - return doc1 - def reduce( left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None @@ -157,7 +153,7 @@ def reduce( for doc in right: if doc.id in left_id_map: - reduce_docs(left[left_id_map[doc.id]], doc) + left[left_id_map[doc.id]].update(doc) else: left.append(doc) @@ -187,7 +183,10 @@ def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray: :return: the resulting DocumentArray """ if len(docarrays) <= 1: - raise Exception('In order to reduce DocumentArrays we should have more than one DocumentArray') + raise Exception( + 'In order to reduce DocumentArrays' + ' we should have more than one DocumentArray' + ) left = docarrays[0] others = docarrays[1:] left_id_map = {doc.id: i for i, doc in enumerate(left)} diff --git a/tests/units/document/test_base_document.py b/tests/units/document/test_base_document.py index be519424702..6a76c58f56b 100644 --- a/tests/units/document/test_base_document.py +++ b/tests/units/document/test_base_document.py @@ -1,8 +1,25 @@ +from typing import Optional, List from docarray.base_document.document import BaseDocument def test_base_document_init(): - doc = BaseDocument() assert doc.id is not None + + +def test_update(): + class MyDocument(BaseDocument): + content: str + title: Optional[str] = None + tags_: List + + doc1 = MyDocument( + content='Core content of the document', title='Title', 
tags_=['python', 'AI'] + ) + doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + + doc1.update(doc2) + assert doc1.content == 'Core content updated' + assert doc1.title == 'Title' + assert doc1.tags_ == ['python', 'AI', 'docarray'] diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index 33e154957e2..9759c4d24eb 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -35,7 +35,7 @@ def doc1(): ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, l=['c', 'd']), - test_dict={'a': 0, 'b': 2, 'd': 4} + test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3}, ) @@ -53,36 +53,24 @@ def doc2(doc1): ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, l=['a', 'b']), - test_dict={'a': 10, 'b': 10, 'c': 3} + test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None}, ) def test_reduce_docs(doc1, doc2): - result = reduce_docs(doc1, doc2) - assert result.text == 'hey here' - assert len(result.matches) == 2 - assert result.categories == ['a', 'b', 'c', 'd', 'e', 'f'] - assert result.opt_int == 5 - assert result.price == 10 - assert result.test_set == {'a', 'b'} - assert len(result.matches_with_same_id) == 1 - assert len(result.matches_with_same_id[0].matches) == 2 - assert result.inner_doc.integer == 2 - assert result.inner_doc.l == ['c', 'd', 'a', 'b'] - assert result.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4} - + reduce_docs(doc1, doc2) # doc1 is changed in place (no extra memory) - assert doc1.text == 'hey here' + assert doc1.text == 'hey here 2' assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f'] assert len(doc1.matches) == 2 assert doc1.opt_int == 5 - assert doc1.price == 10 + assert doc1.price == 5 assert doc1.test_set == {'a', 'b'} assert len(doc1.matches_with_same_id) == 1 assert len(doc1.matches_with_same_id[0].matches) == 2 - assert doc1.inner_doc.integer == 2 + assert doc1.inner_doc.integer == 3 assert doc1.inner_doc.l == ['c', 'd', 'a', 'b'] - assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 
4} + assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None} def test_reduce_different_ids(): @@ -102,15 +90,15 @@ def test_reduce(doc1, doc2): # da1 is changed in place (no extra memory) assert len(da1) == 3 merged_doc = result[0] - assert merged_doc.text == 'hey here' + assert merged_doc.text == 'hey here 2' assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f'] assert len(merged_doc.matches) == 2 assert merged_doc.opt_int == 5 - assert merged_doc.price == 10 + assert merged_doc.price == 5 assert merged_doc.test_set == {'a', 'b'} assert len(merged_doc.matches_with_same_id) == 1 assert len(merged_doc.matches_with_same_id[0].matches) == 2 - assert merged_doc.inner_doc.integer == 2 + assert merged_doc.inner_doc.integer == 3 assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b'] @@ -123,7 +111,7 @@ def test_reduce_all(doc1, doc2): # da1 is changed in place (no extra memory) assert len(da1) == 5 merged_doc = result[0] - assert merged_doc.text == 'hey here' + assert merged_doc.text == 'hey here 2' assert merged_doc.categories == [ 'a', 'b', @@ -140,9 +128,9 @@ def test_reduce_all(doc1, doc2): ] assert len(merged_doc.matches) == 2 assert merged_doc.opt_int == 5 - assert merged_doc.price == 10 + assert merged_doc.price == 5 assert merged_doc.test_set == {'a', 'b'} assert len(merged_doc.matches_with_same_id) == 1 assert len(merged_doc.matches_with_same_id[0].matches) == 2 - assert merged_doc.inner_doc.integer == 2 + assert merged_doc.inner_doc.integer == 3 assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b', 'c', 'd', 'a', 'b'] From 44c5b8aaf798c089923dc9a0525e18db94e12712 Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Fri, 3 Feb 2023 12:45:19 +0100 Subject: [PATCH 08/11] refactor: move reduce docs to update Signed-off-by: Joan Fontanals Martinez --- docarray/base_document/document.py | 139 ++++++++++++++++++++++++++-- docarray/utils/reduce.py | 129 +------------------------- tests/units/document/test_update.py | 102 
++++++++++++++++++++ tests/units/util/test_reduce.py | 20 +--- 4 files changed, 238 insertions(+), 152 deletions(-) create mode 100644 tests/units/document/test_update.py diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py index 72904d49020..ecbf8557c2d 100644 --- a/docarray/base_document/document.py +++ b/docarray/base_document/document.py @@ -1,5 +1,5 @@ import os -from typing import Type +from typing import Type, List, _GenericAlias # type: ignore import orjson from pydantic import BaseModel, Field, parse_obj_as @@ -49,8 +49,13 @@ def _get_string_for_regex_filter(self): def update(self, other: 'BaseDocument'): """ - Updates the content of this Document with the contents of other using - :func:`~docarray.utils.reduce.reduce_docs`. + Updates self with the content of other. Changes are applied to self. + Updating one Document with another consists in the following: + - setting data properties of the second Document to the first Document + if they are not None + - Concatenating lists and updating sets + - Updating recursively Documents and DocumentArrays + - Updating Dictionaries of the left with the right It behaves as an update operation for Dictionaries, except that since it is applied to a static schema type, the presence of the field is @@ -79,6 +84,128 @@ class MyDocument(BaseDocument): :param other: The Document with which to update the contents of this """ - from docarray.utils.reduce import reduce_docs - - reduce_docs(self, other) + if type(self) != type(other): + raise Exception( + f'Update operation can only be applied to ' + f'Documents of the same type. 
' + f'Trying to update Document of type ' + f'{type(self)} with Document of type ' + f'{type(other)}' + ) + from docarray.utils.reduce import reduce + from docarray import DocumentArray + + from collections import namedtuple + + # Declaring namedtuple() + _FieldGroups = namedtuple( + '_FieldGroups', + [ + 'simple_non_empty_fields', + 'list_fields', + 'set_fields', + 'dict_fields', + 'nested_docarray_fields', + 'nested_docs_fields', + ], + ) + + FORBIDDEN_FIELDS_TO_UPDATE = ['ID'] + + def _group_fields(doc: 'BaseDocument') -> _FieldGroups: + simple_non_empty_fields: List[str] = [] + list_fields: List[str] = [] + set_fields: List[str] = [] + dict_fields: List[str] = [] + nested_docs_fields: List[str] = [] + nested_docarray_fields: List[str] = [] + + for field_name, field in doc.__fields__.items(): + if field_name not in FORBIDDEN_FIELDS_TO_UPDATE: + field_type = doc._get_field_type(field_name) + if not isinstance(field_type, _GenericAlias) and issubclass( + field_type, DocumentArray + ): + nested_docarray_fields.append(field_name) + elif ( + isinstance(field_type, _GenericAlias) + and field_type.__origin__ is list + ): + list_fields.append(field_name) + elif ( + isinstance(field_type, _GenericAlias) + and field_type.__origin__ is set + ): + set_fields.append(field_name) + elif ( + isinstance(field_type, _GenericAlias) + and field_type.__origin__ is dict + ): + dict_fields.append(field_name) + else: + v = getattr(doc, field_name) + if v: + if isinstance(v, BaseDocument): + nested_docs_fields.append(field_name) + else: + simple_non_empty_fields.append(field_name) + return _FieldGroups( + simple_non_empty_fields, + list_fields, + set_fields, + dict_fields, + nested_docarray_fields, + nested_docs_fields, + ) + + doc1_fields = _group_fields(self) + doc2_fields = _group_fields(other) + + for field in doc2_fields.simple_non_empty_fields: + setattr(self, field, getattr(other, field)) + + for field in set( + doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields + ): 
+ sub_doc_1: BaseDocument = getattr(self, field) + sub_doc_2: BaseDocument = getattr(other, field) + sub_doc_1.update(sub_doc_2) + setattr(self, field, sub_doc_1) + + for field in set(doc1_fields.list_fields + doc2_fields.list_fields): + array1 = getattr(self, field) + array2 = getattr(other, field) + if array1 is None and array2 is not None: + setattr(self, field, array2) + elif array1 is not None and array2 is not None: + array1.extend(array2) + setattr(self, field, array1) + + for field in set(doc1_fields.set_fields + doc2_fields.set_fields): + array1 = getattr(self, field) + array2 = getattr(other, field) + if array1 is None and array2 is not None: + setattr(self, field, array2) + elif array1 is not None and array2 is not None: + array1.update(array2) + setattr(self, field, array1) + + for field in set( + doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields + ): + array1 = getattr(self, field) + array2 = getattr(other, field) + if array1 is None and array2 is not None: + setattr(self, field, array2) + elif array1 is not None and array2 is not None: + array1 = reduce(array1, array2) + setattr(self, field, array1) + + for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields): + dict1 = getattr(self, field) + dict2 = getattr(other, field) + if dict1 is None and dict2 is not None: + setattr(self, field, dict2) + elif dict1 is not None and dict2 is not None: + dict1.update(dict2) + setattr(self, field, dict1) diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index 1c53ebb1953..60493d04ea5 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,132 +1,5 @@ from docarray import DocumentArray -from typing import List, Optional, Dict, TypeVar, _GenericAlias # type: ignore -from docarray.base_document import BaseDocument - -T = TypeVar('T', bound='BaseDocument') - - -def reduce_docs(doc1: 'T', doc2: 'T') -> None: - """ - Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1. 
- Reducing 2 Documents consists in the following: - - setting data properties of the second Document to the first Document - if they are not None - - Concatenating lists and updating sets - - Reducing recursively Documents and DocumentArrays - - Updating Dictionaries of the left with the right - :param doc1: first Document to be reduced. Change is applied in-place - :param doc2: second Document to be reduced - """ - from collections import namedtuple - - # Declaring namedtuple() - _FieldGroups = namedtuple( - '_FieldGroups', - [ - 'simple_non_empty_fields', - 'list_fields', - 'set_fields', - 'dict_fields', - 'nested_docarray_fields', - 'nested_docs_fields', - ], - ) - - FORBIDDEN_FIELDS_TO_UPDATE = ['ID'] - - def _group_fields(doc: 'BaseDocument') -> _FieldGroups: - simple_non_empty_fields: List[str] = [] - list_fields: List[str] = [] - set_fields: List[str] = [] - dict_fields: List[str] = [] - nested_docs_fields: List[str] = [] - nested_docarray_fields: List[str] = [] - - for field_name, field in doc.__fields__.items(): - if field_name not in FORBIDDEN_FIELDS_TO_UPDATE: - field_type = doc._get_field_type(field_name) - if not isinstance(field_type, _GenericAlias) and issubclass( - field_type, DocumentArray - ): - nested_docarray_fields.append(field_name) - elif ( - isinstance(field_type, _GenericAlias) - and field_type.__origin__ is list - ): - list_fields.append(field_name) - elif ( - isinstance(field_type, _GenericAlias) - and field_type.__origin__ is set - ): - set_fields.append(field_name) - elif ( - isinstance(field_type, _GenericAlias) - and field_type.__origin__ is dict - ): - dict_fields.append(field_name) - else: - v = getattr(doc, field_name) - if v: - if isinstance(v, BaseDocument): - nested_docs_fields.append(field_name) - else: - simple_non_empty_fields.append(field_name) - return _FieldGroups( - simple_non_empty_fields, - list_fields, - set_fields, - dict_fields, - nested_docarray_fields, - nested_docs_fields, - ) - - doc1_fields = _group_fields(doc1) 
- doc2_fields = _group_fields(doc2) - - for field in doc2_fields.simple_non_empty_fields: - setattr(doc1, field, getattr(doc2, field)) - - for field in set(doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields): - reduce_docs(getattr(doc1, field), getattr(doc2, field)) - setattr(doc1, field, getattr(doc1, field)) - - for field in set(doc1_fields.list_fields + doc2_fields.list_fields): - array1 = getattr(doc1, field) - array2 = getattr(doc2, field) - if array1 is None and array2 is not None: - setattr(doc1, field, array2) - elif array1 is not None and array2 is not None: - array1.extend(array2) - setattr(doc1, field, array1) - - for field in set(doc1_fields.set_fields + doc2_fields.set_fields): - array1 = getattr(doc1, field) - array2 = getattr(doc2, field) - if array1 is None and array2 is not None: - setattr(doc1, field, array2) - elif array1 is not None and array2 is not None: - array1.update(array2) - setattr(doc1, field, array1) - - for field in set( - doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields - ): - array1 = getattr(doc1, field) - array2 = getattr(doc2, field) - if array1 is None and array2 is not None: - setattr(doc1, field, array2) - elif array1 is not None and array2 is not None: - array1 = reduce(array1, array2) - setattr(doc1, field, array1) - - for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields): - dict1 = getattr(doc1, field) - dict2 = getattr(doc2, field) - if dict1 is None and dict2 is not None: - setattr(doc1, field, dict2) - elif dict1 is not None and dict2 is not None: - dict1.update(dict2) - setattr(doc1, field, dict1) +from typing import List, Optional, Dict def reduce( diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py new file mode 100644 index 00000000000..90e2d813f9f --- /dev/null +++ b/tests/units/document/test_update.py @@ -0,0 +1,102 @@ +import pytest +from typing import Optional, List, Dict, Set +from docarray import BaseDocument, DocumentArray 
+from docarray.documents import Image + + +class InnerDoc(BaseDocument): + integer: int + l: List + + +class MMDoc(BaseDocument): + text: str = '' + price: int = 0 + categories: Optional[List[str]] = None + image: Optional[Image] = None + matches: Optional[DocumentArray] = None + matches_with_same_id: Optional[DocumentArray] = None + opt_int: Optional[int] = None + test_set: Optional[Set] = None + inner_doc: Optional[InnerDoc] = None + test_dict: Optional[Dict] = None + + +@pytest.fixture +def doc1(): + return MMDoc( + text='hey here', + categories=['a', 'b', 'c'], + price=10, + matches=DocumentArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocumentArray[MMDoc]( + [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + ), + test_set={'a', 'a'}, + inner_doc=InnerDoc(integer=2, l=['c', 'd']), + test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3}, + ) + + +@pytest.fixture +def doc2(doc1): + return MMDoc( + id=doc1.id, + text='hey here 2', + categories=['d', 'e', 'f'], + price=5, + opt_int=5, + matches=DocumentArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocumentArray[MMDoc]( + [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + ), + test_set={'a', 'b'}, + inner_doc=InnerDoc(integer=3, l=['a', 'b']), + test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None}, + ) + + +def test_update_complex(doc1, doc2): + doc1.update(doc2) + # doc1 is changed in place (no extra memory) + assert doc1.text == 'hey here 2' + assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f'] + assert len(doc1.matches) == 2 + assert doc1.opt_int == 5 + assert doc1.price == 5 + assert doc1.test_set == {'a', 'b'} + assert len(doc1.matches_with_same_id) == 1 + assert len(doc1.matches_with_same_id[0].matches) == 2 + assert doc1.inner_doc.integer == 3 + assert doc1.inner_doc.l == ['c', 'd', 'a', 'b'] + assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None} + + +def test_update_simple(): + class MyDocument(BaseDocument): + content: str + title: Optional[str] = None + tags_: List + + my_doc1 = 
MyDocument( + content='Core content of the document', title='Title', tags_=['python', 'AI'] + ) + my_doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + + my_doc1.update(my_doc2) + assert my_doc1.content == 'Core content updated' + assert my_doc1.title == 'Title' + assert my_doc1.tags_ == ['python', 'AI', 'docarray'] + + +def test_update_different_schema_fails(): + class DocA(BaseDocument): + content: str + + class DocB(BaseDocument): + image: Optional[Image] = None + + docA = DocA(content='haha') + docB = DocB() + with pytest.raises(Exception): + docA.update(docB) diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index 9759c4d24eb..7e82ddf181c 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -1,8 +1,8 @@ import pytest -from typing import Optional, List, Dict, Any, Set +from typing import Optional, List, Dict, Set from docarray import BaseDocument, DocumentArray from docarray.documents import Image -from docarray.utils.reduce import reduce_docs, reduce, reduce_all +from docarray.utils.reduce import reduce, reduce_all class InnerDoc(BaseDocument): @@ -57,22 +57,6 @@ def doc2(doc1): ) -def test_reduce_docs(doc1, doc2): - reduce_docs(doc1, doc2) - # doc1 is changed in place (no extra memory) - assert doc1.text == 'hey here 2' - assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f'] - assert len(doc1.matches) == 2 - assert doc1.opt_int == 5 - assert doc1.price == 5 - assert doc1.test_set == {'a', 'b'} - assert len(doc1.matches_with_same_id) == 1 - assert len(doc1.matches_with_same_id[0].matches) == 2 - assert doc1.inner_doc.integer == 3 - assert doc1.inner_doc.l == ['c', 'd', 'a', 'b'] - assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None} - - def test_reduce_different_ids(): da1 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)]) da2 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)]) From 72684cd435418f1683772c9fdcb0e1774ee38c11 Mon Sep 17 00:00:00 2001 From: 
samsja Date: Fri, 3 Feb 2023 15:26:53 +0100 Subject: [PATCH 09/11] refactor: use get origin instead of private _GenericAlias Signed-off-by: samsja --- docarray/base_document/document.py | 54 ++++++++++++++---------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py index ecbf8557c2d..c01749dabf7 100644 --- a/docarray/base_document/document.py +++ b/docarray/base_document/document.py @@ -1,9 +1,10 @@ import os -from typing import Type, List, _GenericAlias # type: ignore +from typing import List, Type import orjson from pydantic import BaseModel, Field, parse_obj_as from rich.console import Console +from typing_inspect import get_origin from docarray.base_document.abstract_document import AbstractDocument from docarray.base_document.base_node import BaseNode @@ -68,13 +69,16 @@ def update(self, other: 'BaseDocument'): from docarray import BaseDocument from docarray.documents import Text + class MyDocument(BaseDocument): content: str title: Optional[str] = None tags_: List - doc1 = MyDocument(content='Core content of the document', - title='Title', tags_=['python', 'AI']) + + doc1 = MyDocument( + content='Core content of the document', title='Title', tags_=['python', 'AI'] + ) doc2 = MyDocument(content='Core content updated', tags_=['docarray']) doc1.update(doc2) @@ -92,11 +96,11 @@ class MyDocument(BaseDocument): f'{type(self)} with Document of type ' f'{type(other)}' ) - from docarray.utils.reduce import reduce - from docarray import DocumentArray - from collections import namedtuple + from docarray import DocumentArray + from docarray.utils.reduce import reduce + # Declaring namedtuple() _FieldGroups = namedtuple( '_FieldGroups', @@ -123,32 +127,26 @@ def _group_fields(doc: 'BaseDocument') -> _FieldGroups: for field_name, field in doc.__fields__.items(): if field_name not in FORBIDDEN_FIELDS_TO_UPDATE: field_type = doc._get_field_type(field_name) - if not
isinstance(field_type, _GenericAlias) and issubclass( + + if isinstance(field_type, type) and issubclass( field_type, DocumentArray ): nested_docarray_fields.append(field_name) - elif ( - isinstance(field_type, _GenericAlias) - and field_type.__origin__ is list - ): - list_fields.append(field_name) - elif ( - isinstance(field_type, _GenericAlias) - and field_type.__origin__ is set - ): - set_fields.append(field_name) - elif ( - isinstance(field_type, _GenericAlias) - and field_type.__origin__ is dict - ): - dict_fields.append(field_name) else: - v = getattr(doc, field_name) - if v: - if isinstance(v, BaseDocument): - nested_docs_fields.append(field_name) - else: - simple_non_empty_fields.append(field_name) + origin = get_origin(field_type) + if origin is list: + list_fields.append(field_name) + elif origin is set: + set_fields.append(field_name) + elif origin is dict: + dict_fields.append(field_name) + else: + v = getattr(doc, field_name) + if v: + if isinstance(v, BaseDocument): + nested_docs_fields.append(field_name) + else: + simple_non_empty_fields.append(field_name) return _FieldGroups( simple_non_empty_fields, list_fields, From e9219372214785c1291cd5e1c6aa81b35803cab8 Mon Sep 17 00:00:00 2001 From: samsja Date: Fri, 3 Feb 2023 15:29:39 +0100 Subject: [PATCH 10/11] fix: fix ruff Signed-off-by: samsja --- docarray/base_document/document.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py index c01749dabf7..8ac2f4630ef 100644 --- a/docarray/base_document/document.py +++ b/docarray/base_document/document.py @@ -77,7 +77,9 @@ class MyDocument(BaseDocument): doc1 = MyDocument( - content='Core content of the document', title='Title', tags_=['python', 'AI'] + content='Core content of the document', + title='Title', + tags_=['python', 'AI'] ) doc2 = MyDocument(content='Core content updated', tags_=['docarray']) From dadffb50f2eb68341c3a2ccb02811e1236e3ed46 Mon Sep 17 00:00:00 
2001 From: Joan Fontanals Martinez Date: Mon, 6 Feb 2023 10:11:37 +0100 Subject: [PATCH 11/11] docs: add clarification about tuples --- docarray/base_document/document.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py index 8ac2f4630ef..088c7551714 100644 --- a/docarray/base_document/document.py +++ b/docarray/base_document/document.py @@ -60,7 +60,11 @@ def update(self, other: 'BaseDocument'): It behaves as an update operation for Dictionaries, except that since it is applied to a static schema type, the presence of the field is - given by the field not having a None value. + given by the field not having a None value and that DocumentArrays, + lists and sets are concatenated. It is worth mentioning that Tuples + are not merged together since they are meant to be immutable, + so they behave as regular types and the value of `self` is updated + with the value of `other`. EXAMPLE USAGE