From a5ef6ca3baf9bc9e5054d376643b8914040e4aeb Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Wed, 1 Feb 2023 19:08:53 +0100
Subject: [PATCH 01/11] feat: add reduce utils

---
 docarray/utils/reduce.py        | 134 ++++++++++++++++++++++++++++++++
 tests/units/util/test_reduce.py |  33 ++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 docarray/utils/reduce.py
 create mode 100644 tests/units/util/test_reduce.py

diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
new file mode 100644
index 00000000000..b26947cab8e
--- /dev/null
+++ b/docarray/utils/reduce.py
@@ -0,0 +1,134 @@
+from docarray import DocumentArray
+from typing import List, Optional, Dict, TYPE_CHECKING, Tuple, _GenericAlias
+from typing_inspect import is_union_type
+
+
+if TYPE_CHECKING:  # pragma: no cover
+    from docarray.base_document import BaseDocument
+
+
+def _non_empty_fields(doc: 'BaseDocument') -> Tuple[str]:
+    r: List[str] = []
+    for field_name in doc.__fields__.keys():
+        v = getattr(doc, field_name)
+        if v:
+            r.append(field_name)
+    return tuple(r)
+
+
+def _array_fields(doc: 'BaseDocument') -> Tuple[str]:
+    ret: List[str] = []
+    for field_name, field in doc.__fields__.items():
+        field_type = field.outer_type_
+        print(f'HEY {field_type} => {type(field_type)}')
+        print(f' {isinstance(field_type, _GenericAlias)}')
+        if isinstance(field_type, _GenericAlias):
+            print(field_type.__origin__)
+        if isinstance(field_type, DocumentArray) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list):
+            ret.append(field_name)
+        else:
+            print(f' hhey 2')
+    return tuple(ret)
+
+
+"""
+A mixin that provides reducing logic for :class:`DocumentArray`
+Reducing 2 or more DocumentArrays consists in merging all Documents into the same DocumentArray.
+If a Document belongs to 2 or more DocumentArrays, it is added once and data attributes are merged with priority to
+the Document belonging to the left-most DocumentArray. Matches and chunks are also reduced in the same way.
+Reduction is applied to all levels of DocumentArrays, that is, from root Documents to all their chunk and match
+children.
+"""
+
+
+def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Optional[List[str]] = None):
+    """
+    Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1.
+    Reducing 2 Documents consists in setting data properties of the second Document to the first Document if they
+    are empty (that is, priority to the left-most Document) and reducing the matches and the chunks of both
+    documents.
+    Non-data properties are ignored.
+    Reduction of matches and chunks relies on :class:`DocumentArray`.:method:`reduce`.
+    :param doc1: first Document
+    :param doc2: second Document
+    :param array_fields:
+    """
+    doc1_fields = set(_non_empty_fields(doc1))
+    doc2_fields = set(_non_empty_fields(doc2))
+
+    # update only fields that are set in doc2 and not set in doc1
+    fields = doc2_fields - doc1_fields
+
+    for field in fields:
+        setattr(doc1, field, getattr(doc2, field))
+
+    array_fields = array_fields or _array_fields(doc1)
+    for field in array_fields:
+        array1 = getattr(doc1, field)
+        array2 = getattr(doc2, field)
+        if array1 is None and array2 is not None:
+            setattr(doc1, field, array2)
+        elif array1 is not None and array2 is not None:
+            array1.extend(array2)
+            setattr(doc1, field, array1)  # I am not sure if this is optimal, how can I do (doc1.field.extend())
+
+    return doc1
+
+
+def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None,
+           array_fields: Optional[List[str]] = None) -> 'DocumentArray':
+    """
+    Reduces other and the current DocumentArray into one DocumentArray in-place. Changes are applied to the current
+    DocumentArray.
+    Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray to the first DocumentArray
+    if they do not exist. If a Document exists in both DocumentArrays, the data properties are merged with priority
+    to the first Document (that is, to the current DocumentArray's Document). The matches and chunks are also
+    reduced in the same way.
+    :param left: DocumentArray
+    :param other: DocumentArray
+    :param left_id_map:
+        :param array_fields:
+
+    :return: DocumentArray
+    """
+    left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)}
+    array_fields = array_fields or left[0].array_fields
+
+    for doc in other:
+        if doc.id in left_id_map:
+            reduce_docs(left[left_id_map[doc.id]], doc, array_fields)
+        else:
+            left.append(doc)
+
+    return left
+
+
+def reduce_all(left: DocumentArray, others: List[DocumentArray]) -> DocumentArray:
+    """
+    Reduces a list of DocumentArrays and this DocumentArray into one DocumentArray. Changes are applied to this
+    DocumentArray in-place.
+
+    Reduction consists in reducing this DocumentArray with every DocumentArray in `others` sequentially using
+    :class:`DocumentArray`.:method:`reduce`.
+    The resulting DocumentArray contains Documents of all DocumentArrays.
+    If a Document exists in many DocumentArrays, data properties are merged with priority to the left-most
+    DocumentArrays (that is, if a data attribute is set in a Document belonging to many DocumentArrays, the
+    attribute value of the left-most DocumentArray is kept).
+    Matches and chunks of a Document belonging to many DocumentArrays are also reduced in the same way.
+    Other non-data properties are ignored.
+
+    .. note::
+        - Matches are not kept in a sorted order when they are reduced. You might want to re-sort them in a later
+            step.
+        - The final result depends on the order of DocumentArrays when applying reduction.
+
+    :param left:
+    :param others: List of DocumentArrays to be reduced
+    :return: the resulting DocumentArray
+    """
+    assert len(left) > 0, 'In order to reduce DocumentArrays we should have a non empty DocumentArray'
+    left_id_map = {doc.id: i for i, doc in enumerate(left)}
+    array_fields = left[0].array_fields
+    for da in others:
+        reduce(left, da, left_id_map, array_fields)
+    return left
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
new file mode 100644
index 00000000000..eb3ccaa8b52
--- /dev/null
+++ b/tests/units/util/test_reduce.py
@@ -0,0 +1,33 @@
+from typing import Optional, List, Dict, Any
+from docarray import BaseDocument, DocumentArray
+from docarray.documents import Image
+from docarray.utils.reduce import reduce_docs
+
+
+class MMDoc(BaseDocument):
+    text: str = ''
+    price: int = 0
+    categories: Optional[List[str]] = None
+    image: Optional[Image] = None
+    matches: Optional[DocumentArray] = None
+    dictionary: Optional[Dict[str, Any]] = None
+    opt_int: Optional[int] = None
+
+
+def test_simple_reduce_arrays_concatenated():
+    doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()]))
+    doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()]))
+
+    result = reduce_docs(doc1, doc2)
+    assert result.text == 'hey here'
+    assert len(result.matches) == 2
+    assert result.categories == ['a', 'b', 'c', 'd', 'e', 'f']
+    assert result.opt_int == 5
+    assert result.price == 10
+
+    # doc1 is changed in place (no extra memory)
+    assert doc1.text == 'hey here'
+    assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f']
+    assert len(doc1.matches) == 2
+    assert doc1.opt_int == 5
+    assert doc1.price == 10

From 745d1c23afb1ea04809fe6fad9b12082c6ce8c34 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 2 Feb 2023 12:38:55 +0100
Subject: [PATCH 02/11] feat: support reducing set fields

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/reduce.py        | 14 +++++---------
 tests/units/util/test_reduce.py | 11 ++++++++---
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
index b26947cab8e..fd63ad63b40 100644
--- a/docarray/utils/reduce.py
+++ b/docarray/utils/reduce.py
@@ -1,6 +1,5 @@
 from docarray import DocumentArray
 from typing import List, Optional, Dict, TYPE_CHECKING, Tuple, _GenericAlias
-from typing_inspect import is_union_type
 
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -20,14 +19,8 @@ def _array_fields(doc: 'BaseDocument') -> Tuple[str]:
     ret: List[str] = []
     for field_name, field in doc.__fields__.items():
         field_type = field.outer_type_
-        print(f'HEY {field_type} => {type(field_type)}')
-        print(f' {isinstance(field_type, _GenericAlias)}')
-        if isinstance(field_type, _GenericAlias):
-            print(field_type.__origin__)
-        if isinstance(field_type, DocumentArray) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list):
+        if (not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray)) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list) or (isinstance(field_type, _GenericAlias) and (field_type.__origin__ is set)):
             ret.append(field_name)
-        else:
-            print(f' hhey 2')
     return tuple(ret)
 
 
@@ -69,7 +62,10 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Option
         if array1 is None and array2 is not None:
             setattr(doc1, field, array2)
         elif array1 is not None and array2 is not None:
-            array1.extend(array2)
+            if isinstance(array1, set):
+                array1.update(array2)
+            else:
+                array1.extend(array2)
             setattr(doc1, field, array1)  # I am not sure if this is optimal, how can I do (doc1.field.extend())
 
     return doc1
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
index eb3ccaa8b52..3c5d9d56aa1 100644
--- a/tests/units/util/test_reduce.py
+++ b/tests/units/util/test_reduce.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Set
 from docarray import BaseDocument, DocumentArray
 from docarray.documents import Image
 from docarray.utils.reduce import reduce_docs
@@ -12,11 +12,14 @@ class MMDoc(BaseDocument):
     matches: Optional[DocumentArray] = None
     dictionary: Optional[Dict[str, Any]] = None
     opt_int: Optional[int] = None
+    test_set: Optional[Set] = None
 
 
 def test_simple_reduce_arrays_concatenated():
-    doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()]))
-    doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()]))
+    doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={
+        'a', 'a'})
+    doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={
+        'a', 'b'})
 
     result = reduce_docs(doc1, doc2)
     assert result.text == 'hey here'
@@ -24,6 +27,7 @@ def test_simple_reduce_arrays_concatenated():
     assert result.categories == ['a', 'b', 'c', 'd', 'e', 'f']
     assert result.opt_int == 5
     assert result.price == 10
+    assert result.test_set == {'a', 'b'}
 
     # doc1 is changed in place (no extra memory)
     assert doc1.text == 'hey here'
@@ -31,3 +35,4 @@ def test_simple_reduce_arrays_concatenated():
     assert len(doc1.matches) == 2
     assert doc1.opt_int == 5
     assert doc1.price == 10
+    assert doc1.test_set == {'a', 'b'}

From 9419ec5198f07162040077ea8b9729919bfd4622 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 2 Feb 2023 12:49:51 +0100
Subject: [PATCH 03/11] feat: support reducing nested documents and DocumentArrays

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/reduce.py        | 94 +++++++++++++++++++--------------
 tests/units/util/test_reduce.py | 38 +++++++++++--
 2 files changed, 89 insertions(+), 43 deletions(-)

diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
index fd63ad63b40..a3fd12266a0 100644
--- a/docarray/utils/reduce.py
+++ b/docarray/utils/reduce.py
@@ -1,27 +1,30 @@
 from docarray import DocumentArray
-from typing import List, Optional, Dict, TYPE_CHECKING, Tuple, _GenericAlias
+from typing import List, Optional, Dict, Tuple, _GenericAlias
+from docarray.base_document import BaseDocument
 
 
-if TYPE_CHECKING:  # pragma: no cover
-    from docarray.base_document import BaseDocument
+def _types_analysis(doc: 'BaseDocument') -> Tuple[List[str]]:
+    simple_non_empty_fields: List[str] = []
+    list_fields: List[str] = []
+    set_fields: List[str] = []
+    nested_docs_fields: List[str] = []
+    nested_docarray_fields: List[str] = []
 
-
-def _non_empty_fields(doc: 'BaseDocument') -> Tuple[str]:
-    r: List[str] = []
-    for field_name in doc.__fields__.keys():
-        v = getattr(doc, field_name)
-        if v:
-            r.append(field_name)
-    return tuple(r)
-
-
-def _array_fields(doc: 'BaseDocument') -> Tuple[str]:
-    ret: List[str] = []
     for field_name, field in doc.__fields__.items():
         field_type = field.outer_type_
-        if (not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray)) or (isinstance(field_type, _GenericAlias) and field_type.__origin__ is list) or (isinstance(field_type, _GenericAlias) and (field_type.__origin__ is set)):
-            ret.append(field_name)
-    return tuple(ret)
+        if not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray):
+            nested_docarray_fields.append(field_name)
+        elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is list:
+            list_fields.append(field_name)
+        elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is set:
+            set_fields.append(field_name)
+        v = getattr(doc, field_name)
+        if v:
+            if isinstance(v, BaseDocument):
+                nested_docs_fields.append(field_name)
+            else:
+                simple_non_empty_fields.append(field_name)
+    return tuple([simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields])
 
 
 """
@@ -34,7 +37,7 @@ def _array_fields(doc: 'BaseDocument') -> Tuple[str]:
 """
 
 
-def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Optional[List[str]] = None):
+def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
     """
     Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1.
     Reducing 2 Documents consists in setting data properties of the second Document to the first Document if they
@@ -44,35 +47,52 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument', array_fields: Option
     Reduction of matches and chunks relies on :class:`DocumentArray`.:method:`reduce`.
     :param doc1: first Document
     :param doc2: second Document
-    :param array_fields:
     """
-    doc1_fields = set(_non_empty_fields(doc1))
-    doc2_fields = set(_non_empty_fields(doc2))
+    doc1_simple_non_empty_fields, doc1_list_fields, doc1_set_fields, doc1_nested_docarray_fields, doc1_nested_docs_fields = _types_analysis(
+        doc1)
+    doc2_simple_non_empty_fields, doc2_list_fields, doc2_set_fields, doc2_nested_docarray_fields, doc2_nested_docs_fields = _types_analysis(
+        doc2)
 
     # update only fields that are set in doc2 and not set in doc1
-    fields = doc2_fields - doc1_fields
+    update_simple_fields = set(doc2_simple_non_empty_fields) - set(doc1_simple_non_empty_fields)
 
-    for field in fields:
+    for field in update_simple_fields:
         setattr(doc1, field, getattr(doc2, field))
 
-    array_fields = array_fields or _array_fields(doc1)
-    for field in array_fields:
+    for field in set(doc1_nested_docs_fields + doc2_nested_docs_fields):
+        setattr(doc1, field, reduce_docs(getattr(doc1, field), getattr(doc2, field)))
+
+    for field in doc1_list_fields:
         array1 = getattr(doc1, field)
         array2 = getattr(doc2, field)
         if array1 is None and array2 is not None:
             setattr(doc1, field, array2)
         elif array1 is not None and array2 is not None:
-            if isinstance(array1, set):
-                array1.update(array2)
-            else:
-                array1.extend(array2)
-            setattr(doc1, field, array1)  # I am not sure if this is optimal, how can I do (doc1.field.extend())
+            array1.extend(array2)
+            setattr(doc1, field, array1)
+
+    for field in doc1_set_fields:
+        array1 = getattr(doc1, field)
+        array2 = getattr(doc2, field)
+        if array1 is None and array2 is not None:
+            setattr(doc1, field, array2)
+        elif array1 is not None and array2 is not None:
+            array1.update(array2)
+            setattr(doc1, field, array1)
+
+    for field in doc1_nested_docarray_fields:
+        array1 = getattr(doc1, field)
+        array2 = getattr(doc2, field)
+        if array1 is None and array2 is not None:
+            setattr(doc1, field, array2)
+        elif array1 is not None and array2 is not None:
+            array1 = reduce(array1, array2)
+            setattr(doc1, field, array1)
 
     return doc1
 
 
-def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None,
-           array_fields: Optional[List[str]] = None) -> 'DocumentArray':
+def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None) -> 'DocumentArray':
     """
     Reduces other and the current DocumentArray into one DocumentArray in-place. Changes are applied to the current
     DocumentArray.
@@ -83,16 +103,13 @@ def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict
     :param left: DocumentArray
     :param other: DocumentArray
     :param left_id_map:
-        :param array_fields:
-
     :return: DocumentArray
     """
     left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)}
-    array_fields = array_fields or left[0].array_fields
 
     for doc in other:
         if doc.id in left_id_map:
-            reduce_docs(left[left_id_map[doc.id]], doc, array_fields)
+            reduce_docs(left[left_id_map[doc.id]], doc)
         else:
             left.append(doc)
 
@@ -124,7 +141,6 @@ def reduce_all(left: DocumentArray, others: List[DocumentArray]) -> DocumentArra
     """
     assert len(left) > 0, 'In order to reduce DocumentArrays we should have a non empty DocumentArray'
     left_id_map = {doc.id: i for i, doc in enumerate(left)}
-    array_fields = left[0].array_fields
     for da in others:
-        reduce(left, da, left_id_map, array_fields)
+        reduce(left, da, left_id_map)
     return left
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
index 3c5d9d56aa1..9f27ea895a3 100644
--- a/tests/units/util/test_reduce.py
+++ b/tests/units/util/test_reduce.py
@@ -4,22 +4,43 @@
 from docarray.utils.reduce import reduce_docs
 
 
+class InnerDoc(BaseDocument):
+    integer: int
+    l: List
+
+
 class MMDoc(BaseDocument):
     text: str = ''
     price: int = 0
     categories: Optional[List[str]] = None
     image: Optional[Image] = None
     matches: Optional[DocumentArray] = None
+    matches_with_same_id: Optional[DocumentArray] = None
     dictionary: Optional[Dict[str, Any]] = None
     opt_int: Optional[int] = None
     test_set: Optional[Set] = None
+    inner_doc: Optional[InnerDoc] = None
 
 
 def test_simple_reduce_arrays_concatenated():
-    doc1 = MMDoc(text='hey here', categories=['a', 'b', 'c'], price=10, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={
-        'a', 'a'})
-    doc2 = MMDoc(id=doc1.id, text='hey here 2', categories=['d', 'e', 'f'], price=5, opt_int=5, matches=DocumentArray[MMDoc]([MMDoc()]), test_set={
-        'a', 'b'})
+    doc1 = MMDoc(
+        text='hey here',
+        categories=['a', 'b', 'c'],
+        price=10,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]),
+        test_set={'a', 'a'},
+        inner_doc=InnerDoc(integer=2, l=['c', 'd']))
+    doc2 = MMDoc(
+        id=doc1.id,
+        text='hey here 2',
+        categories=['d', 'e', 'f'],
+        price=5,
+        opt_int=5,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]),
+        test_set={'a', 'b'},
+        inner_doc=InnerDoc(integer=3, l=['a', 'b']))
 
     result = reduce_docs(doc1, doc2)
     assert result.text == 'hey here'
@@ -28,6 +49,10 @@ def test_simple_reduce_arrays_concatenated():
     assert result.opt_int == 5
     assert result.price == 10
     assert result.test_set == {'a', 'b'}
+    assert len(result.matches_with_same_id) == 1
+    assert len(result.matches_with_same_id[0].matches) == 2
+    assert result.inner_doc.integer == 2
+    assert result.inner_doc.l == ['c', 'd', 'a', 'b']
 
     # doc1 is changed in place (no extra memory)
     assert doc1.text == 'hey here'
@@ -36,3 +61,8 @@ def test_simple_reduce_arrays_concatenated():
     assert doc1.opt_int == 5
     assert doc1.price == 10
     assert doc1.test_set == {'a', 'b'}
+    assert len(doc1.matches_with_same_id) == 1
+    assert len(doc1.matches_with_same_id[0].matches) == 2
+    assert doc1.inner_doc.integer == 2
+    assert doc1.inner_doc.l == ['c', 'd', 'a', 'b']
+

From 5f0ecefb24a1c6ade474214de0230d26024d6eb2 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 2 Feb 2023 17:04:48 +0100
Subject: [PATCH 04/11] feat: finish feature implementation and testing

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/reduce.py        | 46 ++++++++++++-----------
 tests/units/util/test_reduce.py | 65 +++++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
index a3fd12266a0..c0a69a4e33e 100644
--- a/docarray/utils/reduce.py
+++ b/docarray/utils/reduce.py
@@ -1,9 +1,9 @@
 from docarray import DocumentArray
-from typing import List, Optional, Dict, Tuple, _GenericAlias
+from typing import List, Optional, Dict, _GenericAlias
 from docarray.base_document import BaseDocument
 
 
-def _types_analysis(doc: 'BaseDocument') -> Tuple[List[str]]:
+def _types_analysis(doc: 'BaseDocument') -> List[List[str]]:
     simple_non_empty_fields: List[str] = []
     list_fields: List[str] = []
     set_fields: List[str] = []
@@ -24,7 +24,7 @@ def _types_analysis(doc: 'BaseDocument') -> Tuple[List[str]]:
                 nested_docs_fields.append(field_name)
             else:
                 simple_non_empty_fields.append(field_name)
-    return tuple([simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields])
+    return [simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields]
 
 
 """
@@ -48,10 +48,19 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
     :param doc1: first Document
     :param doc2: second Document
     """
-    doc1_simple_non_empty_fields, doc1_list_fields, doc1_set_fields, doc1_nested_docarray_fields, doc1_nested_docs_fields = _types_analysis(
-        doc1)
-    doc2_simple_non_empty_fields, doc2_list_fields, doc2_set_fields, doc2_nested_docarray_fields, doc2_nested_docs_fields = _types_analysis(
-        doc2)
+    doc1_fields = _types_analysis(doc1)
+    doc1_simple_non_empty_fields = doc1_fields[0]
+    doc1_list_fields             = doc1_fields[1]
+    doc1_set_fields              = doc1_fields[2]
+    doc1_nested_docarray_fields  = doc1_fields[3]
+    doc1_nested_docs_fields      = doc1_fields[4]
+
+    doc2_fields = _types_analysis(doc2)
+    doc2_simple_non_empty_fields = doc2_fields[0]
+    doc2_list_fields             = doc2_fields[1]
+    doc2_set_fields              = doc2_fields[2]
+    doc2_nested_docarray_fields  = doc2_fields[3]
+    doc2_nested_docs_fields      = doc2_fields[4]
 
     # update only fields that are set in doc2 and not set in doc1
     update_simple_fields = set(doc2_simple_non_empty_fields) - set(doc1_simple_non_empty_fields)
@@ -116,30 +125,25 @@ def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict
     return left
 
 
-def reduce_all(left: DocumentArray, others: List[DocumentArray]) -> DocumentArray:
+def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray:
     """
-    Reduces a list of DocumentArrays and this DocumentArray into one DocumentArray. Changes are applied to this
-    DocumentArray in-place.
+    Reduces a list of DocumentArrays into one DocumentArray. Changes are applied to the first DocumentArray in-place.
 
-    Reduction consists in reducing this DocumentArray with every DocumentArray in `others` sequentially using
-    :class:`DocumentArray`.:method:`reduce`.
     The resulting DocumentArray contains Documents of all DocumentArrays.
-    If a Document exists in many DocumentArrays, data properties are merged with priority to the left-most
+    If a Document exists (identified by their ID) in many DocumentArrays, data properties are merged with priority to the left-most
     DocumentArrays (that is, if a data attribute is set in a Document belonging to many DocumentArrays, the
     attribute value of the left-most DocumentArray is kept).
-    Matches and chunks of a Document belonging to many DocumentArrays are also reduced in the same way.
-    Other non-data properties are ignored.
-
+    Nested DocumentArrays belonging to many DocumentArrays are also reduced in the same way.
     .. note::
-        - Matches are not kept in a sorted order when they are reduced. You might want to re-sort them in a later
-            step.
+        - Nested DocumentArrays order does not follow any specific rule. You might want to re-sort them in a later step.
         - The final result depends on the order of DocumentArrays when applying reduction.
 
-    :param left:
-    :param others: List of DocumentArrays to be reduced
+    :param docarrays: List of DocumentArrays to be reduced
     :return: the resulting DocumentArray
     """
-    assert len(left) > 0, 'In order to reduce DocumentArrays we should have a non empty DocumentArray'
+    assert len(docarrays) > 1, 'In order to reduce DocumentArrays we should have more than one DocumentArray'
+    left = docarrays[0]
+    others = docarrays[1:]
     left_id_map = {doc.id: i for i, doc in enumerate(left)}
     for da in others:
         reduce(left, da, left_id_map)
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
index 9f27ea895a3..2fb092b7e15 100644
--- a/tests/units/util/test_reduce.py
+++ b/tests/units/util/test_reduce.py
@@ -1,7 +1,8 @@
+import pytest
 from typing import Optional, List, Dict, Any, Set
 from docarray import BaseDocument, DocumentArray
 from docarray.documents import Image
-from docarray.utils.reduce import reduce_docs
+from docarray.utils.reduce import reduce_docs, reduce, reduce_all
 
 
 class InnerDoc(BaseDocument):
@@ -22,8 +23,9 @@ class MMDoc(BaseDocument):
     inner_doc: Optional[InnerDoc] = None
 
 
-def test_simple_reduce_arrays_concatenated():
-    doc1 = MMDoc(
+@pytest.fixture
+def doc1():
+    return MMDoc(
         text='hey here',
         categories=['a', 'b', 'c'],
         price=10,
@@ -31,7 +33,11 @@ def test_simple_reduce_arrays_concatenated():
         matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]),
         test_set={'a', 'a'},
         inner_doc=InnerDoc(integer=2, l=['c', 'd']))
-    doc2 = MMDoc(
+
+
+@pytest.fixture
+def doc2(doc1):
+    return MMDoc(
         id=doc1.id,
         text='hey here 2',
         categories=['d', 'e', 'f'],
@@ -42,6 +48,8 @@ def test_simple_reduce_arrays_concatenated():
         test_set={'a', 'b'},
         inner_doc=InnerDoc(integer=3, l=['a', 'b']))
 
+
+def test_reduce_docs(doc1, doc2):
     result = reduce_docs(doc1, doc2)
     assert result.text == 'hey here'
     assert len(result.matches) == 2
@@ -66,3 +74,52 @@ def test_simple_reduce_arrays_concatenated():
     assert doc1.inner_doc.integer == 2
     assert doc1.inner_doc.l == ['c', 'd', 'a', 'b']
 
+
+def test_reduce_different_ids():
+    da1 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)])
+    da2 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)])
+    result = reduce(da1, da2)
+    assert len(result) == 20
+    # da1 is changed in place (no extra memory)
+    assert len(da1) == 20
+
+
+def test_reduce(doc1, doc2):
+    da1 = DocumentArray[MMDoc]([doc1, MMDoc()])
+    da2 = DocumentArray[MMDoc]([MMDoc(), doc2])
+    result = reduce(da1, da2)
+    assert len(result) == 3
+    # da1 is changed in place (no extra memory)
+    assert len(da1) == 3
+    merged_doc = result[0]
+    assert merged_doc.text == 'hey here'
+    assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f']
+    assert len(merged_doc.matches) == 2
+    assert merged_doc.opt_int == 5
+    assert merged_doc.price == 10
+    assert merged_doc.test_set == {'a', 'b'}
+    assert len(merged_doc.matches_with_same_id) == 1
+    assert len(merged_doc.matches_with_same_id[0].matches) == 2
+    assert merged_doc.inner_doc.integer == 2
+    assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b']
+
+
+def test_reduce_all(doc1, doc2):
+    da1 = DocumentArray[MMDoc]([doc1, MMDoc()])
+    da2 = DocumentArray[MMDoc]([MMDoc(), doc2])
+    da3 = DocumentArray[MMDoc]([MMDoc(), MMDoc(), doc1])
+    result = reduce_all([da1, da2, da3])
+    assert len(result) == 5
+    # da1 is changed in place (no extra memory)
+    assert len(da1) == 5
+    merged_doc = result[0]
+    assert merged_doc.text == 'hey here'
+    assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f', 'a', 'b', 'c', 'd', 'e', 'f']
+    assert len(merged_doc.matches) == 2
+    assert merged_doc.opt_int == 5
+    assert merged_doc.price == 10
+    assert merged_doc.test_set == {'a', 'b'}
+    assert len(merged_doc.matches_with_same_id) == 1
+    assert len(merged_doc.matches_with_same_id[0].matches) == 2
+    assert merged_doc.inner_doc.integer == 2
+    assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b', 'c', 'd', 'a', 'b']

From 5c439b8b500286660afa280edddc4ab8e972eddb Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 2 Feb 2023 17:36:25 +0100
Subject: [PATCH 05/11] docs: add documentation and fix ruff

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/reduce.py        | 118 ++++++++++++++++++--------------
 tests/units/util/test_reduce.py |  29 ++++++--
 2 files changed, 90 insertions(+), 57 deletions(-)

diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
index c0a69a4e33e..f842cac3f75 100644
--- a/docarray/utils/reduce.py
+++ b/docarray/utils/reduce.py
@@ -1,5 +1,5 @@
 from docarray import DocumentArray
-from typing import List, Optional, Dict, _GenericAlias
+from typing import List, Optional, Dict, _GenericAlias # type: ignore
 from docarray.base_document import BaseDocument
 
 
@@ -12,7 +12,9 @@ def _types_analysis(doc: 'BaseDocument') -> List[List[str]]:
 
     for field_name, field in doc.__fields__.items():
         field_type = field.outer_type_
-        if not isinstance(field_type, _GenericAlias) and issubclass(field_type, DocumentArray):
+        if not isinstance(field_type, _GenericAlias) and issubclass(
+            field_type, DocumentArray
+        ):
             nested_docarray_fields.append(field_name)
         elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is list:
             list_fields.append(field_name)
@@ -24,46 +26,43 @@ def _types_analysis(doc: 'BaseDocument') -> List[List[str]]:
                 nested_docs_fields.append(field_name)
             else:
                 simple_non_empty_fields.append(field_name)
-    return [simple_non_empty_fields, list_fields, set_fields, nested_docarray_fields, nested_docs_fields]
-
-
-"""
-A mixin that provides reducing logic for :class:`DocumentArray`
-Reducing 2 or more DocumentArrays consists in merging all Documents into the same DocumentArray.
-If a Document belongs to 2 or more DocumentArrays, it is added once and data attributes are merged with priority to
-the Document belonging to the left-most DocumentArray. Matches and chunks are also reduced in the same way.
-Reduction is applied to all levels of DocumentArrays, that is, from root Documents to all their chunk and match
-children.
-"""
+    return [
+        simple_non_empty_fields,
+        list_fields,
+        set_fields,
+        nested_docarray_fields,
+        nested_docs_fields,
+    ]
 
 
 def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
     """
     Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1.
-    Reducing 2 Documents consists in setting data properties of the second Document to the first Document if they
-    are empty (that is, priority to the left-most Document) and reducing the matches and the chunks of both
-    documents.
-    Non-data properties are ignored.
-    Reduction of matches and chunks relies on :class:`DocumentArray`.:method:`reduce`.
-    :param doc1: first Document
-    :param doc2: second Document
+    Reducing 2 Documents consists in setting data properties of the second Document
+    to the first Document if they are empty (priority to the left-most Document)
+    and reducing recursively its nested Documents and DocumentArrays
+    :param doc1: first Document to be reduced. Change is applied in-place
+    :param doc2: second Document to be reduced
+    :return The reduced Document
     """
     doc1_fields = _types_analysis(doc1)
     doc1_simple_non_empty_fields = doc1_fields[0]
-    doc1_list_fields             = doc1_fields[1]
-    doc1_set_fields              = doc1_fields[2]
-    doc1_nested_docarray_fields  = doc1_fields[3]
-    doc1_nested_docs_fields      = doc1_fields[4]
+    doc1_list_fields = doc1_fields[1]
+    doc1_set_fields = doc1_fields[2]
+    doc1_nested_docarray_fields = doc1_fields[3]
+    doc1_nested_docs_fields = doc1_fields[4]
 
     doc2_fields = _types_analysis(doc2)
     doc2_simple_non_empty_fields = doc2_fields[0]
-    doc2_list_fields             = doc2_fields[1]
-    doc2_set_fields              = doc2_fields[2]
-    doc2_nested_docarray_fields  = doc2_fields[3]
-    doc2_nested_docs_fields      = doc2_fields[4]
+    doc2_list_fields = doc2_fields[1]
+    doc2_set_fields = doc2_fields[2]
+    doc2_nested_docarray_fields = doc2_fields[3]
+    doc2_nested_docs_fields = doc2_fields[4]
 
     # update only fields that are set in doc2 and not set in doc1
-    update_simple_fields = set(doc2_simple_non_empty_fields) - set(doc1_simple_non_empty_fields)
+    update_simple_fields = set(doc2_simple_non_empty_fields) - set(
+        doc1_simple_non_empty_fields
+    )
 
     for field in update_simple_fields:
         setattr(doc1, field, getattr(doc2, field))
@@ -71,7 +70,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
     for field in set(doc1_nested_docs_fields + doc2_nested_docs_fields):
         setattr(doc1, field, reduce_docs(getattr(doc1, field), getattr(doc2, field)))
 
-    for field in doc1_list_fields:
+    for field in set(doc1_list_fields + doc2_list_fields):
         array1 = getattr(doc1, field)
         array2 = getattr(doc2, field)
         if array1 is None and array2 is not None:
@@ -80,7 +79,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
             array1.extend(array2)
             setattr(doc1, field, array1)
 
-    for field in doc1_set_fields:
+    for field in set(doc1_set_fields + doc2_set_fields):
         array1 = getattr(doc1, field)
         array2 = getattr(doc2, field)
         if array1 is None and array2 is not None:
@@ -89,7 +88,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
             array1.update(array2)
             setattr(doc1, field, array1)
 
-    for field in doc1_nested_docarray_fields:
+    for field in set(doc1_nested_docarray_fields + doc2_nested_docarray_fields):
         array1 = getattr(doc1, field)
         array2 = getattr(doc2, field)
         if array1 is None and array2 is not None:
@@ -101,22 +100,29 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
     return doc1
 
 
-def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict] = None) -> 'DocumentArray':
+def reduce(
+    left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None
+) -> 'DocumentArray':
     """
-    Reduces other and the current DocumentArray into one DocumentArray in-place. Changes are applied to the current
-    DocumentArray.
-    Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray to the first DocumentArray
-    if they do not exist. If a Document exists in both DocumentArrays, the data properties are merged with priority
-    to the first Document (that is, to the current DocumentArray's Document). The matches and chunks are also
-    reduced in the same way.
-    :param left: DocumentArray
-    :param other: DocumentArray
-    :param left_id_map:
-    :return: DocumentArray
+    Reduces left and right DocumentArray into one DocumentArray in-place.
+    Changes are applied to the left DocumentArray.
+    Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray
+    to the first DocumentArray if they do not exist.
+    If a Document exists in both DocumentArrays (identified by ID),
+    the data properties are merged with priority to the left Document.
+
+    Nested DocumentArrays are also reduced in the same way.
+    :param left: First DocumentArray to be reduced. Changes will be applied to it
+    in-place
+    :param right: Second DocumentArray to be reduced
+    :param left_id_map: Optional parameter to be passed in repeated calls
+    for optimizations, keeping a map of the Document ID to its offset
+    in the DocumentArray
+    :return: Reduced DocumentArray
     """
     left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)}
 
-    for doc in other:
+    for doc in right:
         if doc.id in left_id_map:
             reduce_docs(left[left_id_map[doc.id]], doc)
         else:
@@ -127,21 +133,29 @@ def reduce(left: DocumentArray, other: DocumentArray, left_id_map: Optional[Dict
 
 def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray:
     """
-    Reduces a list of DocumentArrays into one DocumentArray. Changes are applied to the first DocumentArray in-place.
+    Reduces a list of DocumentArrays into one DocumentArray.
+    Changes are applied to the first DocumentArray in-place.
 
     The resulting DocumentArray contains Documents of all DocumentArrays.
-    If a Document exists (identified by their ID) in many DocumentArrays, data properties are merged with priority to the left-most
-    DocumentArrays (that is, if a data attribute is set in a Document belonging to many DocumentArrays, the
-    attribute value of the left-most DocumentArray is kept).
-    Nested DocumentArrays belonging to many DocumentArrays are also reduced in the same way.
+    If a Document exists (identified by their ID) in many DocumentArrays,
+    data properties are merged with priority to the left-most
+    DocumentArrays (that is, if a data attribute is set in a Document
+    belonging to many DocumentArrays, the attribute value of the left-most
+    DocumentArray is kept).
+    Nested DocumentArrays belonging to many DocumentArrays
+    are also reduced in the same way.
     .. note::
-        - Nested DocumentArrays order does not follow any specific rule. You might want to re-sort them in a later step.
-        - The final result depends on the order of DocumentArrays when applying reduction.
+        - Nested DocumentArrays order does not follow any specific rule.
+          You might want to re-sort them in a later step.
+        - The final result depends on the order of DocumentArrays
+          when applying reduction.
 
     :param docarrays: List of DocumentArrays to be reduced
     :return: the resulting DocumentArray
     """
-    assert len(docarrays) > 1, 'In order to reduce DocumentArrays we should have more than one DocumentArray'
+    assert (
+        len(docarrays) > 1
+    ), 'In order to reduce DocumentArrays we should have more than one DocumentArray'
     left = docarrays[0]
     others = docarrays[1:]
     left_id_map = {doc.id: i for i, doc in enumerate(left)}
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
index 2fb092b7e15..ae10378dea3 100644
--- a/tests/units/util/test_reduce.py
+++ b/tests/units/util/test_reduce.py
@@ -30,9 +30,12 @@ def doc1():
         categories=['a', 'b', 'c'],
         price=10,
         matches=DocumentArray[MMDoc]([MMDoc()]),
-        matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
         test_set={'a', 'a'},
-        inner_doc=InnerDoc(integer=2, l=['c', 'd']))
+        inner_doc=InnerDoc(integer=2, l=['c', 'd']),
+    )
 
 
 @pytest.fixture
@@ -44,9 +47,12 @@ def doc2(doc1):
         price=5,
         opt_int=5,
         matches=DocumentArray[MMDoc]([MMDoc()]),
-        matches_with_same_id=DocumentArray[MMDoc]([MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
         test_set={'a', 'b'},
-        inner_doc=InnerDoc(integer=3, l=['a', 'b']))
+        inner_doc=InnerDoc(integer=3, l=['a', 'b']),
+    )
 
 
 def test_reduce_docs(doc1, doc2):
@@ -114,7 +120,20 @@ def test_reduce_all(doc1, doc2):
     assert len(da1) == 5
     merged_doc = result[0]
     assert merged_doc.text == 'hey here'
-    assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f', 'a', 'b', 'c', 'd', 'e', 'f']
+    assert merged_doc.categories == [
+        'a',
+        'b',
+        'c',
+        'd',
+        'e',
+        'f',
+        'a',
+        'b',
+        'c',
+        'd',
+        'e',
+        'f',
+    ]
     assert len(merged_doc.matches) == 2
     assert merged_doc.opt_int == 5
     assert merged_doc.price == 10

From 7c3d9f4f07c27dd000d87d5f2767eb4bd9905e2f Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Fri, 3 Feb 2023 10:46:35 +0100
Subject: [PATCH 06/11] fix: apply comments and support dicts

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/reduce.py        | 82 +++++++++++++++++++++++----------
 tests/units/util/test_reduce.py |  6 ++-
 2 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
index 8d4efb925b2..d2c2c3a021d 100644
--- a/docarray/utils/reduce.py
+++ b/docarray/utils/reduce.py
@@ -2,16 +2,32 @@
 from typing import List, Optional, Dict, _GenericAlias  # type: ignore
 from docarray.base_document import BaseDocument
 
-
-def _types_analysis(doc: 'BaseDocument') -> List[List[str]]:
+from collections import namedtuple
+
+# Declaring namedtuple()
+_FieldGroups = namedtuple(
+    '_FieldGroups',
+    [
+        'simple_non_empty_fields',
+        'list_fields',
+        'set_fields',
+        'dict_fields',
+        'nested_docarray_fields',
+        'nested_docs_fields',
+    ],
+)
+
+
+def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
     simple_non_empty_fields: List[str] = []
     list_fields: List[str] = []
     set_fields: List[str] = []
+    dict_fields: List[str] = []
     nested_docs_fields: List[str] = []
     nested_docarray_fields: List[str] = []
 
     for field_name, field in doc.__fields__.items():
-        field_type = field.outer_type_
+        field_type = doc._get_field_type(field_name)
         if not isinstance(field_type, _GenericAlias) and issubclass(
             field_type, DocumentArray
         ):
@@ -20,44 +36,52 @@ def _types_analysis(doc: 'BaseDocument') -> List[List[str]]:
             list_fields.append(field_name)
         elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is set:
             set_fields.append(field_name)
+        elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is dict:
+            dict_fields.append(field_name)
         v = getattr(doc, field_name)
         if v:
             if isinstance(v, BaseDocument):
                 nested_docs_fields.append(field_name)
             else:
                 simple_non_empty_fields.append(field_name)
-    return [
+    return _FieldGroups(
         simple_non_empty_fields,
         list_fields,
         set_fields,
+        dict_fields,
         nested_docarray_fields,
         nested_docs_fields,
-    ]
+    )
 
 
 def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
     """
     Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1.
-    Reducing 2 Documents consists in setting data properties of the second Document
-    to the first Document if they are empty (priority to the left-most Document)
-    and reducing recursively its nested Documents and DocumentArrays
+    Reducing 2 Documents consists in the following:
+     - setting data properties of the second Document to the first Document
+     if they are empty (priority to the left-most Document)
+     - Concatenating lists and updating sets
+     - Reducing recursively Documents and DocumentArrays
+     - Updating Dictionaries of the left with the right
     :param doc1: first Document to be reduced. Change is applied in-place
     :param doc2: second Document to be reduced
-    :return The reduced Document
+    :return: The reduced Document
     """
-    doc1_fields = _types_analysis(doc1)
-    doc1_simple_non_empty_fields = doc1_fields[0]
-    doc1_list_fields = doc1_fields[1]
-    doc1_set_fields = doc1_fields[2]
-    doc1_nested_docarray_fields = doc1_fields[3]
-    doc1_nested_docs_fields = doc1_fields[4]
-
-    doc2_fields = _types_analysis(doc2)
-    doc2_simple_non_empty_fields = doc2_fields[0]
-    doc2_list_fields = doc2_fields[1]
-    doc2_set_fields = doc2_fields[2]
-    doc2_nested_docarray_fields = doc2_fields[3]
-    doc2_nested_docs_fields = doc2_fields[4]
+    doc1_fields = _group_fields(doc1)
+    doc1_simple_non_empty_fields = doc1_fields.simple_non_empty_fields
+    doc1_list_fields = doc1_fields.list_fields
+    doc1_set_fields = doc1_fields.set_fields
+    doc1_dict_fields = doc1_fields.dict_fields
+    doc1_nested_docarray_fields = doc1_fields.nested_docarray_fields
+    doc1_nested_docs_fields = doc1_fields.nested_docs_fields
+
+    doc2_fields = _group_fields(doc2)
+    doc2_simple_non_empty_fields = doc2_fields.simple_non_empty_fields
+    doc2_list_fields = doc2_fields.list_fields
+    doc2_set_fields = doc2_fields.set_fields
+    doc2_dict_fields = doc2_fields.dict_fields
+    doc2_nested_docarray_fields = doc2_fields.nested_docarray_fields
+    doc2_nested_docs_fields = doc2_fields.nested_docs_fields
 
     # update only fields that are set in doc2 and not set in doc1
     update_simple_fields = set(doc2_simple_non_empty_fields) - set(
@@ -97,6 +121,15 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
             array1 = reduce(array1, array2)
             setattr(doc1, field, array1)
 
+    for field in set(doc1_dict_fields + doc2_dict_fields):
+        dict1 = getattr(doc1, field)
+        dict2 = getattr(doc2, field)
+        if dict1 is None and dict2 is not None:
+            setattr(doc1, field, dict2)
+        elif dict1 is not None and dict2 is not None:
+            dict1.update(dict2)
+            setattr(doc1, field, dict1)
+
     return doc1
 
 
@@ -153,9 +186,8 @@ def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray:
     :param docarrays: List of DocumentArrays to be reduced
     :return: the resulting DocumentArray
     """
-    assert (
-        len(docarrays) > 1
-    ), 'In order to reduce DocumentArrays we should have more than one DocumentArray'
+    if len(docarrays) <= 1:
+        raise Exception('In order to reduce DocumentArrays we should have more than one DocumentArray')
     left = docarrays[0]
     others = docarrays[1:]
     left_id_map = {doc.id: i for i, doc in enumerate(left)}
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
index ae10378dea3..33e154957e2 100644
--- a/tests/units/util/test_reduce.py
+++ b/tests/units/util/test_reduce.py
@@ -17,10 +17,10 @@ class MMDoc(BaseDocument):
     image: Optional[Image] = None
     matches: Optional[DocumentArray] = None
     matches_with_same_id: Optional[DocumentArray] = None
-    dictionary: Optional[Dict[str, Any]] = None
     opt_int: Optional[int] = None
     test_set: Optional[Set] = None
     inner_doc: Optional[InnerDoc] = None
+    test_dict: Optional[Dict] = None
 
 
 @pytest.fixture
@@ -35,6 +35,7 @@ def doc1():
         ),
         test_set={'a', 'a'},
         inner_doc=InnerDoc(integer=2, l=['c', 'd']),
+        test_dict={'a': 0, 'b': 2, 'd': 4}
     )
 
 
@@ -52,6 +53,7 @@ def doc2(doc1):
         ),
         test_set={'a', 'b'},
         inner_doc=InnerDoc(integer=3, l=['a', 'b']),
+        test_dict={'a': 10, 'b': 10, 'c': 3}
     )
 
 
@@ -67,6 +69,7 @@ def test_reduce_docs(doc1, doc2):
     assert len(result.matches_with_same_id[0].matches) == 2
     assert result.inner_doc.integer == 2
     assert result.inner_doc.l == ['c', 'd', 'a', 'b']
+    assert result.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4}
 
     # doc1 is changed in place (no extra memory)
     assert doc1.text == 'hey here'
@@ -79,6 +82,7 @@ def test_reduce_docs(doc1, doc2):
     assert len(doc1.matches_with_same_id[0].matches) == 2
     assert doc1.inner_doc.integer == 2
     assert doc1.inner_doc.l == ['c', 'd', 'a', 'b']
+    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4}
 
 
 def test_reduce_different_ids():

From 5bb4262d59a949dc8417e3e326b05e14f2247d4e Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Fri, 3 Feb 2023 12:03:36 +0100
Subject: [PATCH 07/11] feat: add update method to BaseDocument and fix reduce
 behavior

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/base_document/document.py         |  36 +++++
 docarray/utils/reduce.py                   | 165 ++++++++++-----------
 tests/units/document/test_base_document.py |  19 ++-
 tests/units/util/test_reduce.py            |  38 ++---
 4 files changed, 149 insertions(+), 109 deletions(-)

diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py
index e891eef0fb2..72904d49020 100644
--- a/docarray/base_document/document.py
+++ b/docarray/base_document/document.py
@@ -46,3 +46,39 @@ def __str__(self):
 
     def _get_string_for_regex_filter(self):
         return str(self)
+
+    def update(self, other: 'BaseDocument'):
+        """
+        Updates the content of this Document with the contents of other using
+        :func:`~docarray.utils.reduce.reduce_docs`.
+
+        It behaves as an update operation for Dictionaries, except that since
+        it is applied to a static schema type, the presence of the field is
+        given by the field not having a None value.
+
+            EXAMPLE USAGE
+
+            .. code-block:: python
+
+                from typing import List, Optional
+                from docarray import BaseDocument
+
+                class MyDocument(BaseDocument):
+                    content: str
+                    title: Optional[str] = None
+                    tags_: List
+
+                doc1 = MyDocument(content='Core content of the document',
+                    title='Title', tags_=['python', 'AI'])
+                doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
+
+                doc1.update(doc2)
+                assert doc1.content == 'Core content updated'
+                assert doc1.title == 'Title'
+                assert doc1.tags_ == ['python', 'AI', 'docarray']
+
+        :param other: The Document with which to update the contents of this
+        """
+        from docarray.utils.reduce import reduce_docs
+
+        reduce_docs(self, other)
diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
index d2c2c3a021d..1c53ebb1953 100644
--- a/docarray/utils/reduce.py
+++ b/docarray/utils/reduce.py
@@ -1,100 +1,96 @@
 from docarray import DocumentArray
-from typing import List, Optional, Dict, _GenericAlias  # type: ignore
+from typing import List, Optional, Dict, TypeVar, _GenericAlias  # type: ignore
 from docarray.base_document import BaseDocument
 
-from collections import namedtuple
-
-# Declaring namedtuple()
-_FieldGroups = namedtuple(
-    '_FieldGroups',
-    [
-        'simple_non_empty_fields',
-        'list_fields',
-        'set_fields',
-        'dict_fields',
-        'nested_docarray_fields',
-        'nested_docs_fields',
-    ],
-)
-
-
-def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
-    simple_non_empty_fields: List[str] = []
-    list_fields: List[str] = []
-    set_fields: List[str] = []
-    dict_fields: List[str] = []
-    nested_docs_fields: List[str] = []
-    nested_docarray_fields: List[str] = []
-
-    for field_name, field in doc.__fields__.items():
-        field_type = doc._get_field_type(field_name)
-        if not isinstance(field_type, _GenericAlias) and issubclass(
-            field_type, DocumentArray
-        ):
-            nested_docarray_fields.append(field_name)
-        elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is list:
-            list_fields.append(field_name)
-        elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is set:
-            set_fields.append(field_name)
-        elif isinstance(field_type, _GenericAlias) and field_type.__origin__ is dict:
-            dict_fields.append(field_name)
-        v = getattr(doc, field_name)
-        if v:
-            if isinstance(v, BaseDocument):
-                nested_docs_fields.append(field_name)
-            else:
-                simple_non_empty_fields.append(field_name)
-    return _FieldGroups(
-        simple_non_empty_fields,
-        list_fields,
-        set_fields,
-        dict_fields,
-        nested_docarray_fields,
-        nested_docs_fields,
-    )
+T = TypeVar('T', bound='BaseDocument')
 
 
-def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
+def reduce_docs(doc1: 'T', doc2: 'T') -> None:
     """
     Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1.
     Reducing 2 Documents consists in the following:
      - setting data properties of the second Document to the first Document
-     if they are empty (priority to the left-most Document)
+     if they are not None
      - Concatenating lists and updating sets
      - Reducing recursively Documents and DocumentArrays
      - Updating Dictionaries of the left with the right
     :param doc1: first Document to be reduced. Change is applied in-place
     :param doc2: second Document to be reduced
-    :return: The reduced Document
     """
-    doc1_fields = _group_fields(doc1)
-    doc1_simple_non_empty_fields = doc1_fields.simple_non_empty_fields
-    doc1_list_fields = doc1_fields.list_fields
-    doc1_set_fields = doc1_fields.set_fields
-    doc1_dict_fields = doc1_fields.dict_fields
-    doc1_nested_docarray_fields = doc1_fields.nested_docarray_fields
-    doc1_nested_docs_fields = doc1_fields.nested_docs_fields
+    from collections import namedtuple
+
+    # Declaring namedtuple()
+    _FieldGroups = namedtuple(
+        '_FieldGroups',
+        [
+            'simple_non_empty_fields',
+            'list_fields',
+            'set_fields',
+            'dict_fields',
+            'nested_docarray_fields',
+            'nested_docs_fields',
+        ],
+    )
+
+    FORBIDDEN_FIELDS_TO_UPDATE = ['id']  # match the `id` field name exactly
+
+    def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
+        simple_non_empty_fields: List[str] = []
+        list_fields: List[str] = []
+        set_fields: List[str] = []
+        dict_fields: List[str] = []
+        nested_docs_fields: List[str] = []
+        nested_docarray_fields: List[str] = []
+
+        for field_name, field in doc.__fields__.items():
+            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
+                field_type = doc._get_field_type(field_name)
+                if not isinstance(field_type, _GenericAlias) and issubclass(
+                    field_type, DocumentArray
+                ):
+                    nested_docarray_fields.append(field_name)
+                elif (
+                    isinstance(field_type, _GenericAlias)
+                    and field_type.__origin__ is list
+                ):
+                    list_fields.append(field_name)
+                elif (
+                    isinstance(field_type, _GenericAlias)
+                    and field_type.__origin__ is set
+                ):
+                    set_fields.append(field_name)
+                elif (
+                    isinstance(field_type, _GenericAlias)
+                    and field_type.__origin__ is dict
+                ):
+                    dict_fields.append(field_name)
+                else:
+                    v = getattr(doc, field_name)
+                    if v:  # falsy values (None, 0, '', ...) count as unset
+                        if isinstance(v, BaseDocument):
+                            nested_docs_fields.append(field_name)
+                        else:
+                            simple_non_empty_fields.append(field_name)
+        return _FieldGroups(
+            simple_non_empty_fields,
+            list_fields,
+            set_fields,
+            dict_fields,
+            nested_docarray_fields,
+            nested_docs_fields,
+        )
 
+    doc1_fields = _group_fields(doc1)
     doc2_fields = _group_fields(doc2)
-    doc2_simple_non_empty_fields = doc2_fields.simple_non_empty_fields
-    doc2_list_fields = doc2_fields.list_fields
-    doc2_set_fields = doc2_fields.set_fields
-    doc2_dict_fields = doc2_fields.dict_fields
-    doc2_nested_docarray_fields = doc2_fields.nested_docarray_fields
-    doc2_nested_docs_fields = doc2_fields.nested_docs_fields
-
-    # update only fields that are set in doc2 and not set in doc1
-    update_simple_fields = set(doc2_simple_non_empty_fields) - set(
-        doc1_simple_non_empty_fields
-    )
 
-    for field in update_simple_fields:
+    for field in doc2_fields.simple_non_empty_fields:
         setattr(doc1, field, getattr(doc2, field))
 
-    for field in set(doc1_nested_docs_fields + doc2_nested_docs_fields):
-        setattr(doc1, field, reduce_docs(getattr(doc1, field), getattr(doc2, field)))
+    for field in set(doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields):
+        reduce_docs(getattr(doc1, field), getattr(doc2, field))
+        setattr(doc1, field, getattr(doc1, field))
 
-    for field in set(doc1_list_fields + doc2_list_fields):
+    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
         array1 = getattr(doc1, field)
         array2 = getattr(doc2, field)
         if array1 is None and array2 is not None:
@@ -103,7 +99,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
             array1.extend(array2)
             setattr(doc1, field, array1)
 
-    for field in set(doc1_set_fields + doc2_set_fields):
+    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
         array1 = getattr(doc1, field)
         array2 = getattr(doc2, field)
         if array1 is None and array2 is not None:
@@ -112,7 +108,9 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
             array1.update(array2)
             setattr(doc1, field, array1)
 
-    for field in set(doc1_nested_docarray_fields + doc2_nested_docarray_fields):
+    for field in set(
+        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
+    ):
         array1 = getattr(doc1, field)
         array2 = getattr(doc2, field)
         if array1 is None and array2 is not None:
@@ -121,7 +119,7 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
             array1 = reduce(array1, array2)
             setattr(doc1, field, array1)
 
-    for field in set(doc1_dict_fields + doc2_dict_fields):
+    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
         dict1 = getattr(doc1, field)
         dict2 = getattr(doc2, field)
         if dict1 is None and dict2 is not None:
@@ -130,8 +128,6 @@ def reduce_docs(doc1: 'BaseDocument', doc2: 'BaseDocument') -> 'BaseDocument':
             dict1.update(dict2)
             setattr(doc1, field, dict1)
 
-    return doc1
-
 
 def reduce(
     left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None
@@ -157,7 +153,7 @@ def reduce(
 
     for doc in right:
         if doc.id in left_id_map:
-            reduce_docs(left[left_id_map[doc.id]], doc)
+            left[left_id_map[doc.id]].update(doc)
         else:
             left.append(doc)
 
@@ -187,7 +183,10 @@ def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray:
     :return: the resulting DocumentArray
     """
     if len(docarrays) <= 1:
-        raise Exception('In order to reduce DocumentArrays we should have more than one DocumentArray')
+        raise Exception(
+            'In order to reduce DocumentArrays'
+            ' we should have more than one DocumentArray'
+        )
     left = docarrays[0]
     others = docarrays[1:]
     left_id_map = {doc.id: i for i, doc in enumerate(left)}
diff --git a/tests/units/document/test_base_document.py b/tests/units/document/test_base_document.py
index be519424702..6a76c58f56b 100644
--- a/tests/units/document/test_base_document.py
+++ b/tests/units/document/test_base_document.py
@@ -1,8 +1,25 @@
+from typing import Optional, List
 from docarray.base_document.document import BaseDocument
 
 
 def test_base_document_init():
-
     doc = BaseDocument()
 
     assert doc.id is not None
+
+
+def test_update():
+    class MyDocument(BaseDocument):
+        content: str
+        title: Optional[str] = None
+        tags_: List
+
+    doc1 = MyDocument(
+        content='Core content of the document', title='Title', tags_=['python', 'AI']
+    )
+    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
+
+    doc1.update(doc2)
+    assert doc1.content == 'Core content updated'  # scalar replaced by doc2's value
+    assert doc1.title == 'Title'  # doc2.title is None, so doc1's is kept
+    assert doc1.tags_ == ['python', 'AI', 'docarray']  # lists are concatenated
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
index 33e154957e2..9759c4d24eb 100644
--- a/tests/units/util/test_reduce.py
+++ b/tests/units/util/test_reduce.py
@@ -35,7 +35,7 @@ def doc1():
         ),
         test_set={'a', 'a'},
         inner_doc=InnerDoc(integer=2, l=['c', 'd']),
-        test_dict={'a': 0, 'b': 2, 'd': 4}
+        test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3},
     )
 
 
@@ -53,36 +53,24 @@ def doc2(doc1):
         ),
         test_set={'a', 'b'},
         inner_doc=InnerDoc(integer=3, l=['a', 'b']),
-        test_dict={'a': 10, 'b': 10, 'c': 3}
+        test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None},
     )
 
 
 def test_reduce_docs(doc1, doc2):
-    result = reduce_docs(doc1, doc2)
-    assert result.text == 'hey here'
-    assert len(result.matches) == 2
-    assert result.categories == ['a', 'b', 'c', 'd', 'e', 'f']
-    assert result.opt_int == 5
-    assert result.price == 10
-    assert result.test_set == {'a', 'b'}
-    assert len(result.matches_with_same_id) == 1
-    assert len(result.matches_with_same_id[0].matches) == 2
-    assert result.inner_doc.integer == 2
-    assert result.inner_doc.l == ['c', 'd', 'a', 'b']
-    assert result.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4}
-
+    reduce_docs(doc1, doc2)
     # doc1 is changed in place (no extra memory)
-    assert doc1.text == 'hey here'
+    assert doc1.text == 'hey here 2'
     assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f']
     assert len(doc1.matches) == 2
     assert doc1.opt_int == 5
-    assert doc1.price == 10
+    assert doc1.price == 5
     assert doc1.test_set == {'a', 'b'}
     assert len(doc1.matches_with_same_id) == 1
     assert len(doc1.matches_with_same_id[0].matches) == 2
-    assert doc1.inner_doc.integer == 2
+    assert doc1.inner_doc.integer == 3
     assert doc1.inner_doc.l == ['c', 'd', 'a', 'b']
-    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4}
+    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}
 
 
 def test_reduce_different_ids():
@@ -102,15 +90,15 @@ def test_reduce(doc1, doc2):
     # da1 is changed in place (no extra memory)
     assert len(da1) == 3
     merged_doc = result[0]
-    assert merged_doc.text == 'hey here'
+    assert merged_doc.text == 'hey here 2'
     assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f']
     assert len(merged_doc.matches) == 2
     assert merged_doc.opt_int == 5
-    assert merged_doc.price == 10
+    assert merged_doc.price == 5
     assert merged_doc.test_set == {'a', 'b'}
     assert len(merged_doc.matches_with_same_id) == 1
     assert len(merged_doc.matches_with_same_id[0].matches) == 2
-    assert merged_doc.inner_doc.integer == 2
+    assert merged_doc.inner_doc.integer == 3
     assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b']
 
 
@@ -123,7 +111,7 @@ def test_reduce_all(doc1, doc2):
     # da1 is changed in place (no extra memory)
     assert len(da1) == 5
     merged_doc = result[0]
-    assert merged_doc.text == 'hey here'
+    assert merged_doc.text == 'hey here 2'
     assert merged_doc.categories == [
         'a',
         'b',
@@ -140,9 +128,9 @@ def test_reduce_all(doc1, doc2):
     ]
     assert len(merged_doc.matches) == 2
     assert merged_doc.opt_int == 5
-    assert merged_doc.price == 10
+    assert merged_doc.price == 5
     assert merged_doc.test_set == {'a', 'b'}
     assert len(merged_doc.matches_with_same_id) == 1
     assert len(merged_doc.matches_with_same_id[0].matches) == 2
-    assert merged_doc.inner_doc.integer == 2
+    assert merged_doc.inner_doc.integer == 3
     assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b', 'c', 'd', 'a', 'b']

From 44c5b8aaf798c089923dc9a0525e18db94e12712 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Fri, 3 Feb 2023 12:45:19 +0100
Subject: [PATCH 08/11] refactor: move reduce docs to update

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/base_document/document.py  | 139 ++++++++++++++++++++++++++--
 docarray/utils/reduce.py            | 129 +-------------------------
 tests/units/document/test_update.py | 102 ++++++++++++++++++++
 tests/units/util/test_reduce.py     |  20 +---
 4 files changed, 238 insertions(+), 152 deletions(-)
 create mode 100644 tests/units/document/test_update.py

diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py
index 72904d49020..ecbf8557c2d 100644
--- a/docarray/base_document/document.py
+++ b/docarray/base_document/document.py
@@ -1,5 +1,5 @@
 import os
-from typing import Type
+from typing import Type, List, _GenericAlias  # type: ignore
 
 import orjson
 from pydantic import BaseModel, Field, parse_obj_as
@@ -49,8 +49,13 @@ def _get_string_for_regex_filter(self):
 
     def update(self, other: 'BaseDocument'):
         """
-        Updates the content of this Document with the contents of other using
-        :func:`~docarray.utils.reduce.reduce_docs`.
+        Updates self with the content of other. Changes are applied to self.
+        Updating one Document with another consists in the following:
+         - setting data properties of the second Document to the first Document
+         if they are not None
+         - Concatenating lists and updating sets
+         - Updating recursively Documents and DocumentArrays
+         - Updating Dictionaries of the left with the right
 
         It behaves as an update operation for Dictionaries, except that since
         it is applied to a static schema type, the presence of the field is
@@ -79,6 +84,128 @@ class MyDocument(BaseDocument):
 
         :param other: The Document with which to update the contents of this
         """
-        from docarray.utils.reduce import reduce_docs
-
-        reduce_docs(self, other)
+        if type(self) != type(other):
+            raise Exception(
+                f'Update operation can only be applied to '
+                f'Documents of the same type. '
+                f'Trying to update Document of type '
+                f'{type(self)} with Document of type '
+                f'{type(other)}'
+            )
+        from docarray.utils.reduce import reduce
+        from docarray import DocumentArray
+
+        from collections import namedtuple
+
+        # Declaring namedtuple()
+        _FieldGroups = namedtuple(
+            '_FieldGroups',
+            [
+                'simple_non_empty_fields',
+                'list_fields',
+                'set_fields',
+                'dict_fields',
+                'nested_docarray_fields',
+                'nested_docs_fields',
+            ],
+        )
+
+        FORBIDDEN_FIELDS_TO_UPDATE = ['ID']
+
+        def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
+            simple_non_empty_fields: List[str] = []
+            list_fields: List[str] = []
+            set_fields: List[str] = []
+            dict_fields: List[str] = []
+            nested_docs_fields: List[str] = []
+            nested_docarray_fields: List[str] = []
+
+            for field_name, field in doc.__fields__.items():
+                if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
+                    field_type = doc._get_field_type(field_name)
+                    if not isinstance(field_type, _GenericAlias) and issubclass(
+                        field_type, DocumentArray
+                    ):
+                        nested_docarray_fields.append(field_name)
+                    elif (
+                        isinstance(field_type, _GenericAlias)
+                        and field_type.__origin__ is list
+                    ):
+                        list_fields.append(field_name)
+                    elif (
+                        isinstance(field_type, _GenericAlias)
+                        and field_type.__origin__ is set
+                    ):
+                        set_fields.append(field_name)
+                    elif (
+                        isinstance(field_type, _GenericAlias)
+                        and field_type.__origin__ is dict
+                    ):
+                        dict_fields.append(field_name)
+                    else:
+                        v = getattr(doc, field_name)
+                        if v:
+                            if isinstance(v, BaseDocument):
+                                nested_docs_fields.append(field_name)
+                            else:
+                                simple_non_empty_fields.append(field_name)
+            return _FieldGroups(
+                simple_non_empty_fields,
+                list_fields,
+                set_fields,
+                dict_fields,
+                nested_docarray_fields,
+                nested_docs_fields,
+            )
+
+        doc1_fields = _group_fields(self)
+        doc2_fields = _group_fields(other)
+
+        for field in doc2_fields.simple_non_empty_fields:
+            setattr(self, field, getattr(other, field))
+
+        for field in set(
+            doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
+        ):
+            sub_doc_1: BaseDocument = getattr(self, field)
+            sub_doc_2: BaseDocument = getattr(other, field)
+            sub_doc_1.update(sub_doc_2)
+            setattr(self, field, sub_doc_1)
+
+        for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
+            array1 = getattr(self, field)
+            array2 = getattr(other, field)
+            if array1 is None and array2 is not None:
+                setattr(self, field, array2)
+            elif array1 is not None and array2 is not None:
+                array1.extend(array2)
+                setattr(self, field, array1)
+
+        for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
+            array1 = getattr(self, field)
+            array2 = getattr(other, field)
+            if array1 is None and array2 is not None:
+                setattr(self, field, array2)
+            elif array1 is not None and array2 is not None:
+                array1.update(array2)
+                setattr(self, field, array1)
+
+        for field in set(
+            doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
+        ):
+            array1 = getattr(self, field)
+            array2 = getattr(other, field)
+            if array1 is None and array2 is not None:
+                setattr(self, field, array2)
+            elif array1 is not None and array2 is not None:
+                array1 = reduce(array1, array2)
+                setattr(self, field, array1)
+
+        for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
+            dict1 = getattr(self, field)
+            dict2 = getattr(other, field)
+            if dict1 is None and dict2 is not None:
+                setattr(self, field, dict2)
+            elif dict1 is not None and dict2 is not None:
+                dict1.update(dict2)
+                setattr(self, field, dict1)
diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
index 1c53ebb1953..60493d04ea5 100644
--- a/docarray/utils/reduce.py
+++ b/docarray/utils/reduce.py
@@ -1,132 +1,5 @@
 from docarray import DocumentArray
-from typing import List, Optional, Dict, TypeVar, _GenericAlias  # type: ignore
-from docarray.base_document import BaseDocument
-
-T = TypeVar('T', bound='BaseDocument')
-
-
-def reduce_docs(doc1: 'T', doc2: 'T') -> None:
-    """
-    Reduces doc1 and doc2 into one Document in-place. Changes are applied to doc1.
-    Reducing 2 Documents consists in the following:
-     - setting data properties of the second Document to the first Document
-     if they are not None
-     - Concatenating lists and updating sets
-     - Reducing recursively Documents and DocumentArrays
-     - Updating Dictionaries of the left with the right
-    :param doc1: first Document to be reduced. Change is applied in-place
-    :param doc2: second Document to be reduced
-    """
-    from collections import namedtuple
-
-    # Declaring namedtuple()
-    _FieldGroups = namedtuple(
-        '_FieldGroups',
-        [
-            'simple_non_empty_fields',
-            'list_fields',
-            'set_fields',
-            'dict_fields',
-            'nested_docarray_fields',
-            'nested_docs_fields',
-        ],
-    )
-
-    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']
-
-    def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
-        simple_non_empty_fields: List[str] = []
-        list_fields: List[str] = []
-        set_fields: List[str] = []
-        dict_fields: List[str] = []
-        nested_docs_fields: List[str] = []
-        nested_docarray_fields: List[str] = []
-
-        for field_name, field in doc.__fields__.items():
-            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
-                field_type = doc._get_field_type(field_name)
-                if not isinstance(field_type, _GenericAlias) and issubclass(
-                    field_type, DocumentArray
-                ):
-                    nested_docarray_fields.append(field_name)
-                elif (
-                    isinstance(field_type, _GenericAlias)
-                    and field_type.__origin__ is list
-                ):
-                    list_fields.append(field_name)
-                elif (
-                    isinstance(field_type, _GenericAlias)
-                    and field_type.__origin__ is set
-                ):
-                    set_fields.append(field_name)
-                elif (
-                    isinstance(field_type, _GenericAlias)
-                    and field_type.__origin__ is dict
-                ):
-                    dict_fields.append(field_name)
-                else:
-                    v = getattr(doc, field_name)
-                    if v:
-                        if isinstance(v, BaseDocument):
-                            nested_docs_fields.append(field_name)
-                        else:
-                            simple_non_empty_fields.append(field_name)
-        return _FieldGroups(
-            simple_non_empty_fields,
-            list_fields,
-            set_fields,
-            dict_fields,
-            nested_docarray_fields,
-            nested_docs_fields,
-        )
-
-    doc1_fields = _group_fields(doc1)
-    doc2_fields = _group_fields(doc2)
-
-    for field in doc2_fields.simple_non_empty_fields:
-        setattr(doc1, field, getattr(doc2, field))
-
-    for field in set(doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields):
-        reduce_docs(getattr(doc1, field), getattr(doc2, field))
-        setattr(doc1, field, getattr(doc1, field))
-
-    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
-        array1 = getattr(doc1, field)
-        array2 = getattr(doc2, field)
-        if array1 is None and array2 is not None:
-            setattr(doc1, field, array2)
-        elif array1 is not None and array2 is not None:
-            array1.extend(array2)
-            setattr(doc1, field, array1)
-
-    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
-        array1 = getattr(doc1, field)
-        array2 = getattr(doc2, field)
-        if array1 is None and array2 is not None:
-            setattr(doc1, field, array2)
-        elif array1 is not None and array2 is not None:
-            array1.update(array2)
-            setattr(doc1, field, array1)
-
-    for field in set(
-        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
-    ):
-        array1 = getattr(doc1, field)
-        array2 = getattr(doc2, field)
-        if array1 is None and array2 is not None:
-            setattr(doc1, field, array2)
-        elif array1 is not None and array2 is not None:
-            array1 = reduce(array1, array2)
-            setattr(doc1, field, array1)
-
-    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
-        dict1 = getattr(doc1, field)
-        dict2 = getattr(doc2, field)
-        if dict1 is None and dict2 is not None:
-            setattr(doc1, field, dict2)
-        elif dict1 is not None and dict2 is not None:
-            dict1.update(dict2)
-            setattr(doc1, field, dict1)
+from typing import List, Optional, Dict
 
 
 def reduce(
diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py
new file mode 100644
index 00000000000..90e2d813f9f
--- /dev/null
+++ b/tests/units/document/test_update.py
@@ -0,0 +1,102 @@
+import pytest
+from typing import Optional, List, Dict, Set
+from docarray import BaseDocument, DocumentArray
+from docarray.documents import Image
+
+
+class InnerDoc(BaseDocument):
+    integer: int
+    l: List
+
+
+class MMDoc(BaseDocument):
+    text: str = ''
+    price: int = 0
+    categories: Optional[List[str]] = None
+    image: Optional[Image] = None
+    matches: Optional[DocumentArray] = None
+    matches_with_same_id: Optional[DocumentArray] = None
+    opt_int: Optional[int] = None
+    test_set: Optional[Set] = None
+    inner_doc: Optional[InnerDoc] = None
+    test_dict: Optional[Dict] = None
+
+
+@pytest.fixture
+def doc1():
+    return MMDoc(
+        text='hey here',
+        categories=['a', 'b', 'c'],
+        price=10,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
+        test_set={'a', 'a'},
+        inner_doc=InnerDoc(integer=2, l=['c', 'd']),
+        test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3},
+    )
+
+
+@pytest.fixture
+def doc2(doc1):
+    return MMDoc(
+        id=doc1.id,
+        text='hey here 2',
+        categories=['d', 'e', 'f'],
+        price=5,
+        opt_int=5,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
+        test_set={'a', 'b'},
+        inner_doc=InnerDoc(integer=3, l=['a', 'b']),
+        test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None},
+    )
+
+
+def test_update_complex(doc1, doc2):
+    doc1.update(doc2)
+    # doc1 is changed in place (no extra memory)
+    assert doc1.text == 'hey here 2'
+    assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f']
+    assert len(doc1.matches) == 2
+    assert doc1.opt_int == 5
+    assert doc1.price == 5
+    assert doc1.test_set == {'a', 'b'}
+    assert len(doc1.matches_with_same_id) == 1
+    assert len(doc1.matches_with_same_id[0].matches) == 2
+    assert doc1.inner_doc.integer == 3
+    assert doc1.inner_doc.l == ['c', 'd', 'a', 'b']
+    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}
+
+
+def test_update_simple():
+    class MyDocument(BaseDocument):
+        content: str
+        title: Optional[str] = None
+        tags_: List
+
+    my_doc1 = MyDocument(
+        content='Core content of the document', title='Title', tags_=['python', 'AI']
+    )
+    my_doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
+
+    my_doc1.update(my_doc2)
+    assert my_doc1.content == 'Core content updated'
+    assert my_doc1.title == 'Title'
+    assert my_doc1.tags_ == ['python', 'AI', 'docarray']
+
+
+def test_update_different_schema_fails():
+    class DocA(BaseDocument):
+        content: str
+
+    class DocB(BaseDocument):
+        image: Optional[Image] = None
+
+    docA = DocA(content='haha')
+    docB = DocB()
+    with pytest.raises(Exception):
+        docA.update(docB)
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
index 9759c4d24eb..7e82ddf181c 100644
--- a/tests/units/util/test_reduce.py
+++ b/tests/units/util/test_reduce.py
@@ -1,8 +1,8 @@
 import pytest
-from typing import Optional, List, Dict, Any, Set
+from typing import Optional, List, Dict, Set
 from docarray import BaseDocument, DocumentArray
 from docarray.documents import Image
-from docarray.utils.reduce import reduce_docs, reduce, reduce_all
+from docarray.utils.reduce import reduce, reduce_all
 
 
 class InnerDoc(BaseDocument):
@@ -57,22 +57,6 @@ def doc2(doc1):
     )
 
 
-def test_reduce_docs(doc1, doc2):
-    reduce_docs(doc1, doc2)
-    # doc1 is changed in place (no extra memory)
-    assert doc1.text == 'hey here 2'
-    assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f']
-    assert len(doc1.matches) == 2
-    assert doc1.opt_int == 5
-    assert doc1.price == 5
-    assert doc1.test_set == {'a', 'b'}
-    assert len(doc1.matches_with_same_id) == 1
-    assert len(doc1.matches_with_same_id[0].matches) == 2
-    assert doc1.inner_doc.integer == 3
-    assert doc1.inner_doc.l == ['c', 'd', 'a', 'b']
-    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}
-
-
 def test_reduce_different_ids():
     da1 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)])
     da2 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)])

From 72684cd435418f1683772c9fdcb0e1774ee38c11 Mon Sep 17 00:00:00 2001
From: samsja <sami.jaghouar@hotmail.fr>
Date: Fri, 3 Feb 2023 15:26:53 +0100
Subject: [PATCH 09/11] refactor: use get origin instead of private
 _GenericAlias

Signed-off-by: samsja <sami.jaghouar@hotmail.fr>
---
 docarray/base_document/document.py | 54 ++++++++++++++----------------
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py
index ecbf8557c2d..c01749dabf7 100644
--- a/docarray/base_document/document.py
+++ b/docarray/base_document/document.py
@@ -1,9 +1,10 @@
 import os
-from typing import Type, List, _GenericAlias  # type: ignore
+from typing import List, Type
 
 import orjson
 from pydantic import BaseModel, Field, parse_obj_as
 from rich.console import Console
+from typing_inspect import get_origin
 
 from docarray.base_document.abstract_document import AbstractDocument
 from docarray.base_document.base_node import BaseNode
@@ -68,13 +69,16 @@ def update(self, other: 'BaseDocument'):
                 from docarray import BaseDocument
                 from docarray.documents import Text
 
+
                 class MyDocument(BaseDocument):
                     content: str
                     title: Optional[str] = None
                     tags_: List
 
-                doc1 = MyDocument(content='Core content of the document',
-                    title='Title', tags_=['python', 'AI'])
+
+                doc1 = MyDocument(
+                    content='Core content of the document', title='Title', tags_=['python', 'AI']
+                )
                 doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
 
                 doc1.update(doc2)
@@ -92,11 +96,11 @@ class MyDocument(BaseDocument):
                 f'{type(self)} with Document of type '
                 f'{type(other)}'
             )
-        from docarray.utils.reduce import reduce
-        from docarray import DocumentArray
-
         from collections import namedtuple
 
+        from docarray import DocumentArray
+        from docarray.utils.reduce import reduce
+
         # Declaring namedtuple()
         _FieldGroups = namedtuple(
             '_FieldGroups',
@@ -123,32 +127,26 @@ def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
             for field_name, field in doc.__fields__.items():
                 if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                     field_type = doc._get_field_type(field_name)
-                    if not isinstance(field_type, _GenericAlias) and issubclass(
+
+                    if isinstance(field_type, type) and issubclass(
                         field_type, DocumentArray
                     ):
                         nested_docarray_fields.append(field_name)
-                    elif (
-                        isinstance(field_type, _GenericAlias)
-                        and field_type.__origin__ is list
-                    ):
-                        list_fields.append(field_name)
-                    elif (
-                        isinstance(field_type, _GenericAlias)
-                        and field_type.__origin__ is set
-                    ):
-                        set_fields.append(field_name)
-                    elif (
-                        isinstance(field_type, _GenericAlias)
-                        and field_type.__origin__ is dict
-                    ):
-                        dict_fields.append(field_name)
                     else:
-                        v = getattr(doc, field_name)
-                        if v:
-                            if isinstance(v, BaseDocument):
-                                nested_docs_fields.append(field_name)
-                            else:
-                                simple_non_empty_fields.append(field_name)
+                        origin = get_origin(field_type)
+                        if origin is list:
+                            list_fields.append(field_name)
+                        elif origin is set:
+                            set_fields.append(field_name)
+                        elif origin is dict:
+                            dict_fields.append(field_name)
+                        else:
+                            v = getattr(doc, field_name)
+                            if v:
+                                if isinstance(v, BaseDocument):
+                                    nested_docs_fields.append(field_name)
+                                else:
+                                    simple_non_empty_fields.append(field_name)
             return _FieldGroups(
                 simple_non_empty_fields,
                 list_fields,

From e9219372214785c1291cd5e1c6aa81b35803cab8 Mon Sep 17 00:00:00 2001
From: samsja <sami.jaghouar@hotmail.fr>
Date: Fri, 3 Feb 2023 15:29:39 +0100
Subject: [PATCH 10/11] fix: fix ruff

Signed-off-by: samsja <sami.jaghouar@hotmail.fr>
---
 docarray/base_document/document.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py
index c01749dabf7..8ac2f4630ef 100644
--- a/docarray/base_document/document.py
+++ b/docarray/base_document/document.py
@@ -77,7 +77,9 @@ class MyDocument(BaseDocument):
 
 
                 doc1 = MyDocument(
-                    content='Core content of the document', title='Title', tags_=['python', 'AI']
+                    content='Core content of the document',
+                    title='Title',
+                    tags_=['python', 'AI']
                 )
                 doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
 

From dadffb50f2eb68341c3a2ccb02811e1236e3ed46 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Mon, 6 Feb 2023 10:11:37 +0100
Subject: [PATCH 11/11] docs: add clarification about tuples

---
 docarray/base_document/document.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py
index 8ac2f4630ef..088c7551714 100644
--- a/docarray/base_document/document.py
+++ b/docarray/base_document/document.py
@@ -60,7 +60,11 @@ def update(self, other: 'BaseDocument'):
 
         It behaves as an update operation for Dictionaries, except that since
         it is applied to a static schema type, the presence of the field is
-        given by the field not having a None value.
+        given by the field not having a None value and that DocumentArrays,
+        lists and sets are concatenated. It is worth mentioning that Tuples
+        are not merged together since they are meant to be immutable,
+        so they behave as regular types and the value of `self` is updated
+        with the value of `other`.
 
             EXAMPLE USAGE