diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py
index e891eef0fb2..088c7551714 100644
--- a/docarray/base_document/document.py
+++ b/docarray/base_document/document.py
@@ -1,9 +1,10 @@
 import os
-from typing import Type
+from typing import List, Type
 
 import orjson
 from pydantic import BaseModel, Field, parse_obj_as
 from rich.console import Console
+from typing_inspect import get_origin
 
 from docarray.base_document.abstract_document import AbstractDocument
 from docarray.base_document.base_node import BaseNode
@@ -46,3 +47,169 @@ def __str__(self):
 
     def _get_string_for_regex_filter(self):
         return str(self)
+
+    def update(self, other: 'BaseDocument'):
+        """
+        Updates self with the content of other. Changes are applied to self.
+        Updating one Document with another consists in the following:
+         - setting data properties of the second Document to the first Document
+         if they are not None
+         - Concatenating lists and updating sets
+         - Updating recursively Documents and DocumentArrays
+         - Updating Dictionaries of the left with the right
+
+        It behaves as an update operation for Dictionaries, except that since
+        it is applied to a static schema type, the presence of the field is
+        given by the field not having a None value and that DocumentArrays,
+        lists and sets are concatenated. It is worth mentioning that Tuples
+        are not merged together since they are meant to be immutable,
+        so they behave as regular types and the value of `self` is updated
+        with the value of `other`.
+
+        The `id` of `self` is left untouched.
+
+        EXAMPLE USAGE
+
+        .. code-block:: python
+
+            from typing import List, Optional
+
+            from docarray import BaseDocument
+
+
+            class MyDocument(BaseDocument):
+                content: str
+                title: Optional[str] = None
+                tags_: List
+
+
+            doc1 = MyDocument(
+                content='Core content of the document',
+                title='Title',
+                tags_=['python', 'AI']
+            )
+            doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
+
+            doc1.update(doc2)
+            assert doc1.content == 'Core content updated'
+            assert doc1.title == 'Title'
+            assert doc1.tags_ == ['python', 'AI', 'docarray']
+
+        :param other: The Document with which to update the contents of this
+        :raises Exception: if `other` is not of the exact same type as `self`
+        """
+        if type(self) is not type(other):
+            raise Exception(
+                f'Update operation can only be applied to '
+                f'Documents of the same type. '
+                f'Trying to update Document of type '
+                f'{type(self)} with Document of type '
+                f'{type(other)}'
+            )
+        from collections import namedtuple
+
+        from docarray import DocumentArray
+        from docarray.utils.reduce import reduce
+
+        # Buckets of field names, one per merge strategy
+        _FieldGroups = namedtuple(
+            '_FieldGroups',
+            [
+                'simple_non_empty_fields',
+                'list_fields',
+                'set_fields',
+                'dict_fields',
+                'nested_docarray_fields',
+                'nested_docs_fields',
+            ],
+        )
+
+        # Field names (not type names) that must never be overwritten
+        FORBIDDEN_FIELDS_TO_UPDATE = ['id']
+
+        def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
+            """Sort the field names of `doc` into one bucket per merge strategy."""
+            simple_non_empty_fields: List[str] = []
+            list_fields: List[str] = []
+            set_fields: List[str] = []
+            dict_fields: List[str] = []
+            nested_docs_fields: List[str] = []
+            nested_docarray_fields: List[str] = []
+
+            for field_name in doc.__fields__:
+                if field_name in FORBIDDEN_FIELDS_TO_UPDATE:
+                    continue
+
+                field_type = doc._get_field_type(field_name)
+                if isinstance(field_type, type) and issubclass(
+                    field_type, DocumentArray
+                ):
+                    nested_docarray_fields.append(field_name)
+                    continue
+
+                origin = get_origin(field_type)
+                if origin is list:
+                    list_fields.append(field_name)
+                elif origin is set:
+                    set_fields.append(field_name)
+                elif origin is dict:
+                    dict_fields.append(field_name)
+                else:
+                    v = getattr(doc, field_name)
+                    # Compare against None instead of truth-testing: falsy
+                    # values (0, '', False) are valid updates, and array-like
+                    # values may refuse to be truth-tested at all
+                    if v is None:
+                        continue
+                    if isinstance(v, BaseDocument):
+                        nested_docs_fields.append(field_name)
+                    else:
+                        simple_non_empty_fields.append(field_name)
+            return _FieldGroups(
+                simple_non_empty_fields,
+                list_fields,
+                set_fields,
+                dict_fields,
+                nested_docarray_fields,
+                nested_docs_fields,
+            )
+
+        def _extend(left, right):
+            left.extend(right)
+            return left
+
+        def _update(left, right):
+            left.update(right)
+            return left
+
+        def _merge_containers(field_names, merge):
+            """Merge each container of `other` into the one held by `self`."""
+            for name in field_names:
+                mine = getattr(self, name)
+                theirs = getattr(other, name)
+                if theirs is None:
+                    continue
+                setattr(self, name, theirs if mine is None else merge(mine, theirs))
+
+        # `self` and `other` share the same type, so the type-based buckets are
+        # the same for both; the value-based ones only matter for `other`,
+        # because a None value in `other` never changes `self`
+        fields = _group_fields(other)
+
+        for field in fields.simple_non_empty_fields:
+            setattr(self, field, getattr(other, field))
+
+        for field in fields.nested_docs_fields:
+            sub_doc_1 = getattr(self, field)
+            sub_doc_2: BaseDocument = getattr(other, field)
+            if isinstance(sub_doc_1, BaseDocument):
+                sub_doc_1.update(sub_doc_2)
+                setattr(self, field, sub_doc_1)
+            else:
+                # nothing to merge into (e.g. None): adopt the Document of other
+                setattr(self, field, sub_doc_2)
+
+        _merge_containers(fields.list_fields, _extend)
+        _merge_containers(fields.set_fields, _update)
+        _merge_containers(fields.nested_docarray_fields, reduce)
+        _merge_containers(fields.dict_fields, _update)
diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
new file mode 100644
index 00000000000..60493d04ea5
--- /dev/null
+++ b/docarray/utils/reduce.py
@@ -0,0 +1,80 @@
+from typing import Dict, List, Optional
+
+from docarray import DocumentArray
+
+
+def reduce(
+    left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None
+) -> 'DocumentArray':
+    """
+    Reduces left and right DocumentArray into one DocumentArray in-place.
+    Changes are applied to the left DocumentArray.
+    Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray
+    to the first DocumentArray if they do not exist.
+    If a Document exists in both DocumentArrays (identified by ID),
+    the left Document is updated with the right one (see `BaseDocument.update`),
+    so the values set in the right Document take priority.
+
+    Nested DocumentArrays are also reduced in the same way.
+    :param left: First DocumentArray to be reduced. Changes will be applied to it
+        in-place
+    :param right: Second DocumentArray to be reduced
+    :param left_id_map: Optional parameter to be passed in repeated calls
+        for optimizations, keeping a map of the Document ID to its offset
+        in the DocumentArray. When given and not empty it is trusted as is,
+        and it is updated in-place with the Documents appended to `left`
+    :return: Reduced DocumentArray
+    """
+    if left_id_map is None:
+        left_id_map = {}
+    if not left_id_map:
+        # fill the given dict (instead of rebinding the name) so that a
+        # caller sharing it across calls sees the entries
+        left_id_map.update({doc.id: i for i, doc in enumerate(left)})
+
+    for doc in right:
+        if doc.id in left_id_map:
+            left[left_id_map[doc.id]].update(doc)
+        else:
+            # register the offset before appending, otherwise a later Document
+            # with the same ID would be appended again instead of merged
+            left_id_map[doc.id] = len(left)
+            left.append(doc)
+
+    return left
+
+
+def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray:
+    """
+    Reduces a list of DocumentArrays into one DocumentArray.
+    Changes are applied to the first DocumentArray in-place.
+
+    The resulting DocumentArray contains Documents of all DocumentArrays.
+    If a Document exists (identified by their ID) in many DocumentArrays,
+    data properties are merged with priority to the right-most
+    DocumentArrays (that is, if a data attribute is set in a Document
+    belonging to many DocumentArrays, the attribute value of the right-most
+    DocumentArray is kept).
+    Nested DocumentArrays belonging to many DocumentArrays
+    are also reduced in the same way.
+    .. note::
+        - Nested DocumentArrays order does not follow any specific rule.
+          You might want to re-sort them in a later step.
+        - The final result depends on the order of DocumentArrays
+          when applying reduction.
+
+    :param docarrays: List of DocumentArrays to be reduced
+    :return: the resulting DocumentArray
+    :raises Exception: if fewer than two DocumentArrays are given
+    """
+    if len(docarrays) <= 1:
+        raise Exception(
+            'In order to reduce DocumentArrays'
+            ' we should have more than one DocumentArray'
+        )
+    left = docarrays[0]
+    others = docarrays[1:]
+    left_id_map = {doc.id: i for i, doc in enumerate(left)}
+    for da in others:
+        reduce(left, da, left_id_map)
+    return left
diff --git a/tests/units/document/test_base_document.py b/tests/units/document/test_base_document.py
index be519424702..6a76c58f56b 100644
--- a/tests/units/document/test_base_document.py
+++ b/tests/units/document/test_base_document.py
@@ -1,8 +1,27 @@
+from typing import Optional, List
 from docarray.base_document.document import BaseDocument
 
 
 def test_base_document_init():
-
     doc = BaseDocument()
 
     assert doc.id is not None
+
+
+def test_update():
+    class MyDocument(BaseDocument):
+        content: str
+        title: Optional[str] = None
+        tags_: List
+
+    doc1 = MyDocument(
+        content='Core content of the document', title='Title', tags_=['python', 'AI']
+    )
+    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
+    doc1_id = doc1.id
+
+    doc1.update(doc2)
+    assert doc1.id == doc1_id  # the id of the updated Document is preserved
+    assert doc1.content == 'Core content updated'
+    assert doc1.title == 'Title'
+    assert doc1.tags_ == ['python', 'AI', 'docarray']
diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py
new file mode
100644
index 00000000000..90e2d813f9f
--- /dev/null
+++ b/tests/units/document/test_update.py
@@ -0,0 +1,104 @@
+import pytest
+from typing import Optional, List, Dict, Set
+from docarray import BaseDocument, DocumentArray
+from docarray.documents import Image
+
+
+class InnerDoc(BaseDocument):
+    integer: int
+    l: List
+
+
+class MMDoc(BaseDocument):
+    text: str = ''
+    price: int = 0
+    categories: Optional[List[str]] = None
+    image: Optional[Image] = None
+    matches: Optional[DocumentArray] = None
+    matches_with_same_id: Optional[DocumentArray] = None
+    opt_int: Optional[int] = None
+    test_set: Optional[Set] = None
+    inner_doc: Optional[InnerDoc] = None
+    test_dict: Optional[Dict] = None
+
+
+@pytest.fixture
+def doc1():
+    """Left-hand Document of the update scenario."""
+    return MMDoc(
+        text='hey here',
+        categories=['a', 'b', 'c'],
+        price=10,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
+        test_set={'a'},
+        inner_doc=InnerDoc(integer=2, l=['c', 'd']),
+        test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3},
+    )
+
+
+@pytest.fixture
+def doc2(doc1):
+    """Right-hand Document: same id as `doc1`, different values."""
+    return MMDoc(
+        id=doc1.id,
+        text='hey here 2',
+        categories=['d', 'e', 'f'],
+        price=5,
+        opt_int=5,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
+        test_set={'a', 'b'},
+        inner_doc=InnerDoc(integer=3, l=['a', 'b']),
+        test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None},
+    )
+
+
+def test_update_complex(doc1, doc2):
+    doc1.update(doc2)
+    # doc1 is changed in place (no extra memory)
+    assert doc1.text == 'hey here 2'
+    assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f']
+    assert len(doc1.matches) == 2
+    assert doc1.opt_int == 5
+    assert doc1.price == 5
+    assert doc1.test_set == {'a', 'b'}
+    assert len(doc1.matches_with_same_id) == 1
+    assert len(doc1.matches_with_same_id[0].matches) == 2
+    assert doc1.inner_doc.integer == 3
+    assert doc1.inner_doc.l == ['c', 'd', 'a', 'b']
+    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}
+
+
+def test_update_simple():
+    class MyDocument(BaseDocument):
+        content: str
+        title: Optional[str] = None
+        tags_: List
+
+    my_doc1 = MyDocument(
+        content='Core content of the document', title='Title', tags_=['python', 'AI']
+    )
+    my_doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
+
+    my_doc1.update(my_doc2)
+    assert my_doc1.content == 'Core content updated'
+    assert my_doc1.title == 'Title'
+    assert my_doc1.tags_ == ['python', 'AI', 'docarray']
+
+
+def test_update_different_schema_fails():
+    class DocA(BaseDocument):
+        content: str
+
+    class DocB(BaseDocument):
+        image: Optional[Image] = None
+
+    docA = DocA(content='haha')
+    docB = DocB()
+    with pytest.raises(Exception, match='Documents of the same type'):
+        docA.update(docB)
diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py
new file mode 100644
index 00000000000..7e82ddf181c
--- /dev/null
+++ b/tests/units/util/test_reduce.py
@@ -0,0 +1,128 @@
+import pytest
+from typing import Optional, List, Dict, Set
+from docarray import BaseDocument, DocumentArray
+from docarray.documents import Image
+from docarray.utils.reduce import reduce, reduce_all
+
+
+class InnerDoc(BaseDocument):
+    integer: int
+    l: List
+
+
+class MMDoc(BaseDocument):
+    text: str = ''
+    price: int = 0
+    categories: Optional[List[str]] = None
+    image: Optional[Image] = None
+    matches: Optional[DocumentArray] = None
+    matches_with_same_id: Optional[DocumentArray] = None
+    opt_int: Optional[int] = None
+    test_set: Optional[Set] = None
+    inner_doc: Optional[InnerDoc] = None
+    test_dict: Optional[Dict] = None
+
+
+@pytest.fixture
+def doc1():
+    """Left-hand Document of the reduce scenarios."""
+    return MMDoc(
+        text='hey here',
+        categories=['a', 'b', 'c'],
+        price=10,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
+        test_set={'a'},
+        inner_doc=InnerDoc(integer=2, l=['c', 'd']),
+        test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3},
+    )
+
+
+@pytest.fixture
+def doc2(doc1):
+    """Right-hand Document: same id as `doc1`, different values."""
+    return MMDoc(
+        id=doc1.id,
+        text='hey here 2',
+        categories=['d', 'e', 'f'],
+        price=5,
+        opt_int=5,
+        matches=DocumentArray[MMDoc]([MMDoc()]),
+        matches_with_same_id=DocumentArray[MMDoc](
+            [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))]
+        ),
+        test_set={'a', 'b'},
+        inner_doc=InnerDoc(integer=3, l=['a', 'b']),
+        test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None},
+    )
+
+
+def test_reduce_different_ids():
+    da1 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)])
+    da2 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)])
+    result = reduce(da1, da2)
+    assert len(result) == 20
+    # da1 is changed in place (no extra memory)
+    assert result is da1
+    assert len(da1) == 20
+
+
+def test_reduce(doc1, doc2):
+    da1 = DocumentArray[MMDoc]([doc1, MMDoc()])
+    da2 = DocumentArray[MMDoc]([MMDoc(), doc2])
+    result = reduce(da1, da2)
+    assert len(result) == 3
+    # da1 is changed in place (no extra memory)
+    assert result is da1
+    assert len(da1) == 3
+    merged_doc = result[0]
+    assert merged_doc.text == 'hey here 2'
+    assert merged_doc.categories == ['a', 'b', 'c', 'd', 'e', 'f']
+    assert len(merged_doc.matches) == 2
+    assert merged_doc.opt_int == 5
+    assert merged_doc.price == 5
+    assert merged_doc.test_set == {'a', 'b'}
+    assert len(merged_doc.matches_with_same_id) == 1
+    assert len(merged_doc.matches_with_same_id[0].matches) == 2
+    assert merged_doc.inner_doc.integer == 3
+    assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b']
+    assert merged_doc.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}
+
+
+def test_reduce_all(doc1, doc2):
+    da1 = DocumentArray[MMDoc]([doc1, MMDoc()])
+    da2 = DocumentArray[MMDoc]([MMDoc(), doc2])
+    da3 = DocumentArray[MMDoc]([MMDoc(), MMDoc(), doc1])
+    result = reduce_all([da1, da2, da3])
+    assert len(result) == 5
+    # da1 is changed in place (no extra memory)
+    assert result is da1
+    assert len(da1) == 5
+    merged_doc = result[0]
+    # doc1 is also part of da3, so it gets merged with itself: lists are doubled
+    assert merged_doc.text == 'hey here 2'
+    assert merged_doc.categories == [
+        'a',
+        'b',
+        'c',
+        'd',
+        'e',
+        'f',
+        'a',
+        'b',
+        'c',
+        'd',
+        'e',
+        'f',
+    ]
+    assert len(merged_doc.matches) == 2
+    assert merged_doc.opt_int == 5
+    assert merged_doc.price == 5
+    assert merged_doc.test_set == {'a', 'b'}
+    assert len(merged_doc.matches_with_same_id) == 1
+    assert len(merged_doc.matches_with_same_id[0].matches) == 2
+    assert merged_doc.inner_doc.integer == 3
+    assert merged_doc.inner_doc.l == ['c', 'd', 'a', 'b', 'c', 'd', 'a', 'b']
+    assert merged_doc.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}