From 4b018d8a066f2efdfdd3621f094caeb8838ef104 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Wed, 21 Jun 2023 12:46:34 +0200
Subject: [PATCH 1/8] feat: add method to create BaseDoc from schema

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/create.py        | 165 ++++++++++++++++++++++
 tests/units/util/test_create.py | 240 ++++++++++++++++++++++++++++++++
 2 files changed, 405 insertions(+)
 create mode 100644 docarray/utils/create.py
 create mode 100644 tests/units/util/test_create.py

diff --git a/docarray/utils/create.py b/docarray/utils/create.py
new file mode 100644
index 00000000000..b313cffbc08
--- /dev/null
+++ b/docarray/utils/create.py
@@ -0,0 +1,165 @@
+from docarray import DocList, BaseDoc
+from docarray.typing import AnyTensor
+from pydantic import create_model
+from typing import Dict, List, Any, Union, Optional
+
+
+def _create_aux_model_doc_list_to_list(model):
+    fields = {}
+    for field_name, field in model.__annotations__.items():
+        try:
+            if issubclass(field, DocList):
+                fields[field_name] = (List[field.doc_type], {})
+            else:
+                fields[field_name] = (field, {})
+        except TypeError:
+            fields[field_name] = (field, {})
+    return create_model(
+        model.__name__, __base__=model, __validators__=model.__validators__, **fields
+    )
+
+
+def _get_field_from_type(
+    field_schema,
+    field_name,
+    root_schema,
+    cached_models,
+    is_tensor=False,
+    num_recursions=0,
+):
+    field_type = field_schema.get('type', None)
+    tensor_shape = field_schema.get('tensor/array shape', None)
+    if 'anyOf' in field_schema:
+        any_of_types = []
+        for any_of_schema in field_schema['anyOf']:
+            if '$ref' in any_of_schema:
+                obj_ref = any_of_schema.get('$ref')
+                ref_name = obj_ref.split('/')[-1]
+                any_of_types.append(
+                    create_base_doc_from_schema(
+                        root_schema['definitions'][ref_name],
+                        ref_name,
+                        cached_models=cached_models,
+                    )
+                )
+            else:
+                any_of_types.append(
+                    _get_field_from_type(
+                        any_of_schema,
+                        field_name,
+                        root_schema=root_schema,
+                        cached_models=cached_models,
+                        is_tensor=tensor_shape is not None,
+                        num_recursions=0,
+                    )
+                )  # No Union of Lists
+        ret = Union[tuple(any_of_types)]
+        for rec in range(num_recursions):
+            ret = List[ret]
+    elif field_type == 'string':
+        ret = str
+        for rec in range(num_recursions):
+            ret = List[ret]
+    elif field_type == 'integer':
+        ret = int
+        for rec in range(num_recursions):
+            ret = List[ret]
+    elif field_type == 'number':
+        if num_recursions <= 1:
+            # This is a hack because AnyTensor is more generic than a simple List and it comes as simple List
+            if is_tensor:
+                ret = AnyTensor
+            else:
+                ret = List[float]
+        else:
+            ret = float
+            for rec in range(num_recursions):
+                ret = List[ret]
+    elif field_type == 'boolean':
+        ret = bool
+        for rec in range(num_recursions):
+            ret = List[ret]
+    elif field_type == 'object' or field_type is None:
+        if 'additionalProperties' in field_schema:  # handle Dictionaries
+            additional_props = field_schema['additionalProperties']
+            if additional_props.get('type') == 'object':
+                ret = Dict[
+                    str,
+                    create_base_doc_from_schema(
+                        additional_props, field_name, cached_models=cached_models
+                    ),
+                ]
+            else:
+                ret = Dict[str, Any]
+        else:
+            obj_ref = field_schema.get('$ref') or field_schema.get('allOf', [{}])[
+                0
+            ].get('$ref', None)
+            if num_recursions == 0:  # single object reference
+                if obj_ref:
+                    ref_name = obj_ref.split('/')[-1]
+                    ret = create_base_doc_from_schema(
+                        root_schema['definitions'][ref_name],
+                        ref_name,
+                        cached_models=cached_models,
+                    )
+                else:
+                    ret = Any
+            else:  # object reference in definitions
+                if obj_ref:
+                    ref_name = obj_ref.split('/')[-1]
+                    ret = DocList[
+                        create_base_doc_from_schema(
+                            root_schema['definitions'][ref_name],
+                            ref_name,
+                            cached_models=cached_models,
+                        )
+                    ]
+                else:
+                    ret = DocList[
+                        create_base_doc_from_schema(
+                            field_schema, field_name, cached_models=cached_models
+                        )
+                    ]
+    elif field_type == 'array':
+        ret = _get_field_from_type(
+            field_schema=field_schema.get('items', {}),
+            field_name=field_name,
+            root_schema=root_schema,
+            cached_models=cached_models,
+            is_tensor=tensor_shape is not None,
+            num_recursions=num_recursions + 1,
+        )
+    else:
+        if num_recursions > 0:
+            raise ValueError(
+                f"Unknown array item type: {field_type} for field_name {field_name}"
+            )
+        else:
+            raise ValueError(
+                f"Unknown field type: {field_type} for field_name {field_name}"
+            )
+    return ret
+
+
+def create_base_doc_from_schema(
+    schema: Dict[str, any], model_name: str, cached_models: Optional[Dict] = None
+) -> type:
+    cached_models = cached_models if cached_models is not None else {}
+    fields = {}
+    if model_name in cached_models:
+        return cached_models[model_name]
+    for field_name, field_schema in schema.get('properties', {}).items():
+        field_type = _get_field_from_type(
+            field_schema=field_schema,
+            field_name=field_name,
+            root_schema=schema,
+            cached_models=cached_models,
+            is_tensor=False,
+            num_recursions=0,
+        )
+        fields[field_name] = (field_type, field_schema.get('description'))
+
+    model = create_model(model_name, __base__=BaseDoc, **fields)
+    cached_models[model_name] = model
+    return model
diff --git a/tests/units/util/test_create.py b/tests/units/util/test_create.py
new file mode 100644
index 00000000000..74785c103b8
--- /dev/null
+++ b/tests/units/util/test_create.py
@@ -0,0 +1,240 @@
+import pytest
+from typing import List, Dict, Union, Any
+from docarray.utils.create import (
+    create_base_doc_from_schema,
+    _create_aux_model_doc_list_to_list,
+)
+import numpy as np
+from typing import Optional
+from docarray import BaseDoc, DocList
+from docarray.typing import AnyTensor, ImageUrl
+from docarray.documents import TextDoc
+
+
+@pytest.mark.parametrize('transformation', ['proto', 'json'])
+def test_create_pydantic_model_from_schema(transformation):
+    class CustomDoc(BaseDoc):
+        tensor: Optional[AnyTensor]
+        url: ImageUrl
+        lll: List[List[List[int]]] = [[[5]]]
+        fff: List[List[List[float]]] = [[[5.2]]]
+        single_text: TextDoc
+        texts: DocList[TextDoc]
+        d: Dict[str, str] = {'a': 'b'}
+        di: Optional[Dict[str, int]] = None
+        u: Union[str, int]
+        lu: List[Union[str, int]] = [0, 1, 2]
+        tags: Optional[Dict[str, Any]] = None
+
+    CustomDocCopy = _create_aux_model_doc_list_to_list(CustomDoc)
+    new_custom_doc_model = create_base_doc_from_schema(
+        CustomDocCopy.schema(), 'CustomDoc', {}
+    )
+
+    original_custom_docs = DocList[CustomDoc](
+        [
+            CustomDoc(
+                url='photo.jpg',
+                lll=[[[40]]],
+                fff=[[[40.2]]],
+                d={'b': 'a'},
+                texts=DocList[TextDoc]([TextDoc(text='hey ha', embedding=np.zeros(3))]),
+                single_text=TextDoc(text='single hey ha', embedding=np.zeros(2)),
+                u='a',
+                lu=[3, 4],
+            )
+        ]
+    )
+    for doc in original_custom_docs:
+        doc.tensor = np.zeros((10, 10, 10))
+        doc.di = {'a': 2}
+
+    if transformation == 'proto':
+        custom_partial_da = DocList[new_custom_doc_model].from_protobuf(
+            original_custom_docs.to_protobuf()
+        )
+        original_back = DocList[CustomDoc].from_protobuf(
+            custom_partial_da.to_protobuf()
+        )
+    elif transformation == 'json':
+        custom_partial_da = DocList[new_custom_doc_model].from_json(
+            original_custom_docs.to_json()
+        )
+        original_back = DocList[CustomDoc].from_json(custom_partial_da.to_json())
+
+    assert len(custom_partial_da) == 1
+    assert custom_partial_da[0].url == 'photo.jpg'
+    assert custom_partial_da[0].lll == [[[40]]]
+    assert custom_partial_da[0].lu == ['3', '4']  # Union validates back to string
+    assert custom_partial_da[0].fff == [[[40.2]]]
+    assert custom_partial_da[0].di == {'a': 2}
+    assert custom_partial_da[0].d == {'b': 'a'}
+    assert len(custom_partial_da[0].texts) == 1
+    assert custom_partial_da[0].texts[0].text == 'hey ha'
+    assert custom_partial_da[0].texts[0].embedding.shape == (3,)
+    assert custom_partial_da[0].tensor.shape == (10, 10, 10)
+    assert custom_partial_da[0].u == 'a'
+    assert custom_partial_da[0].single_text.text == 'single hey ha'
+    assert custom_partial_da[0].single_text.embedding.shape == (2,)
+
+    assert len(original_back) == 1
+    assert original_back[0].url == 'photo.jpg'
+    assert original_back[0].lll == [[[40]]]
+    assert original_back[0].lu == ['3', '4']  # Union validates back to string
+    assert original_back[0].fff == [[[40.2]]]
+    assert original_back[0].di == {'a': 2}
+    assert original_back[0].d == {'b': 'a'}
+    assert len(original_back[0].texts) == 1
+    assert original_back[0].texts[0].text == 'hey ha'
+    assert original_back[0].texts[0].embedding.shape == (3,)
+    assert original_back[0].tensor.shape == (10, 10, 10)
+    assert original_back[0].u == 'a'
+    assert original_back[0].single_text.text == 'single hey ha'
+    assert original_back[0].single_text.embedding.shape == (2,)
+
+    class TextDocWithId(BaseDoc):
+        ia: str
+
+    TextDocWithIdCopy = _create_aux_model_doc_list_to_list(TextDocWithId)
+    new_textdoc_with_id_model = create_base_doc_from_schema(
+        TextDocWithIdCopy.schema(), 'TextDocWithId', {}
+    )
+
+    original_text_doc_with_id = DocList[TextDocWithId](
+        [TextDocWithId(ia=f'ID {i}') for i in range(10)]
+    )
+    if transformation == 'proto':
+        custom_da = DocList[new_textdoc_with_id_model].from_protobuf(
+            original_text_doc_with_id.to_protobuf()
+        )
+        original_back = DocList[TextDocWithId].from_protobuf(custom_da.to_protobuf())
+    elif transformation == 'json':
+        custom_da = DocList[new_textdoc_with_id_model].from_json(
+            original_text_doc_with_id.to_json()
+        )
+        original_back = DocList[TextDocWithId].from_json(custom_da.to_json())
+
+    assert len(custom_da) == 10
+    for i, doc in enumerate(custom_da):
+        assert doc.ia == f'ID {i}'
+
+    assert len(original_back) == 10
+    for i, doc in enumerate(original_back):
+        assert doc.ia == f'ID {i}'
+
+    class ResultTestDoc(BaseDoc):
+        matches: DocList[TextDocWithId]
+
+    ResultTestDocCopy = _create_aux_model_doc_list_to_list(ResultTestDoc)
+    new_result_test_doc_with_id_model = create_base_doc_from_schema(
+        ResultTestDocCopy.schema(), 'ResultTestDoc', {}
+    )
+    result_test_docs = DocList[ResultTestDoc](
+        [ResultTestDoc(matches=original_text_doc_with_id)]
+    )
+
+    if transformation == 'proto':
+        custom_da = DocList[new_result_test_doc_with_id_model].from_protobuf(
+            result_test_docs.to_protobuf()
+        )
+        original_back = DocList[ResultTestDoc].from_protobuf(custom_da.to_protobuf())
+    elif transformation == 'json':
+        custom_da = DocList[new_result_test_doc_with_id_model].from_json(
+            result_test_docs.to_json()
+        )
+        original_back = DocList[ResultTestDoc].from_json(custom_da.to_json())
+
+    assert len(custom_da) == 1
+    assert len(custom_da[0].matches) == 10
+    for i, doc in enumerate(custom_da[0].matches):
+        assert doc.ia == f'ID {i}'
+
+    assert len(original_back) == 1
+    assert len(original_back[0].matches) == 10
+    for i, doc in enumerate(original_back[0].matches):
+        assert doc.ia == f'ID {i}'
+
+
+@pytest.mark.parametrize('transformation', ['proto', 'json'])
+def test_create_empty_doc_list_from_schema(transformation):
+    class CustomDoc(BaseDoc):
+        tensor: Optional[AnyTensor]
+        url: ImageUrl
+        lll: List[List[List[int]]] = [[[5]]]
+        fff: List[List[List[float]]] = [[[5.2]]]
+        single_text: TextDoc
+        texts: DocList[TextDoc]
+        d: Dict[str, str] = {'a': 'b'}
+        di: Optional[Dict[str, int]] = None
+        u: Union[str, int]
+        lu: List[Union[str, int]] = [0, 1, 2]
+        tags: Optional[Dict[str, Any]] = None
+        lf: List[float] = [3.0, 4.1]
+
+    CustomDocCopy = _create_aux_model_doc_list_to_list(CustomDoc)
+    new_custom_doc_model = create_base_doc_from_schema(
+        CustomDocCopy.schema(), 'CustomDoc'
+    )
+
+    original_custom_docs = DocList[CustomDoc]()
+    if transformation == 'proto':
+        custom_partial_da = DocList[new_custom_doc_model].from_protobuf(
+            original_custom_docs.to_protobuf()
+        )
+        original_back = DocList[CustomDoc].from_protobuf(
+            custom_partial_da.to_protobuf()
+        )
+    elif transformation == 'json':
+        custom_partial_da = DocList[new_custom_doc_model].from_json(
+            original_custom_docs.to_json()
+        )
+        original_back = DocList[CustomDoc].from_json(custom_partial_da.to_json())
+
+    assert len(custom_partial_da) == 0
+    assert len(original_back) == 0
+
+    class TextDocWithId(BaseDoc):
+        ia: str
+
+    TextDocWithIdCopy = _create_aux_model_doc_list_to_list(TextDocWithId)
+    new_textdoc_with_id_model = create_base_doc_from_schema(
+        TextDocWithIdCopy.schema(), 'TextDocWithId', {}
+    )
+
+    original_text_doc_with_id = DocList[TextDocWithId]()
+    if transformation == 'proto':
+        custom_da = DocList[new_textdoc_with_id_model].from_protobuf(
+            original_text_doc_with_id.to_protobuf()
+        )
+        original_back = DocList[TextDocWithId].from_protobuf(custom_da.to_protobuf())
+    elif transformation == 'json':
+        custom_da = DocList[new_textdoc_with_id_model].from_json(
+            original_text_doc_with_id.to_json()
+        )
+        original_back = DocList[TextDocWithId].from_json(custom_da.to_json())
+
+    assert len(original_back) == 0
+    assert len(custom_da) == 0
+
+    class ResultTestDoc(BaseDoc):
+        matches: DocList[TextDocWithId]
+
+    ResultTestDocCopy = _create_aux_model_doc_list_to_list(ResultTestDoc)
+    new_result_test_doc_with_id_model = create_base_doc_from_schema(
+        ResultTestDocCopy.schema(), 'ResultTestDoc', {}
+    )
+    result_test_docs = DocList[ResultTestDoc]()
+
+    if transformation == 'proto':
+        custom_da = DocList[new_result_test_doc_with_id_model].from_protobuf(
+            result_test_docs.to_protobuf()
+        )
+        original_back = DocList[ResultTestDoc].from_protobuf(custom_da.to_protobuf())
+    elif transformation == 'json':
+        custom_da = DocList[new_result_test_doc_with_id_model].from_json(
+            result_test_docs.to_json()
+        )
+        original_back = DocList[ResultTestDoc].from_json(custom_da.to_json())
+
+    assert len(original_back) == 0
+    assert len(custom_da) == 0

From e3e26a7eba3d6418a4fd3976225a1ba68efb2ad4 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Wed, 21 Jun 2023 18:19:40 +0200
Subject: [PATCH 2/8] chore: apply some mypy changes

---
 docarray/utils/create.py        | 12 +++++++-----
 tests/units/util/test_create.py | 14 +++++++-------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/docarray/utils/create.py b/docarray/utils/create.py
index b313cffbc08..b324be78876 100644
--- a/docarray/utils/create.py
+++ b/docarray/utils/create.py
@@ -1,11 +1,12 @@
 from docarray import DocList, BaseDoc
 from docarray.typing import AnyTensor
 from pydantic import create_model
-from typing import Dict, List, Any, Union, Optional
+from typing import Dict, List, Any, Union, Optional, Tuple, Type
+from typing_extensions import TypeAlias
 
 
-def _create_aux_model_doc_list_to_list(model):
-    fields = {}
+def create_new_model_cast_doclist_to_list(model: BaseDoc) -> BaseDoc:
+    fields: Dict[str, Tuple[Type, Dict]] = {}
     for field_name, field in model.__annotations__.items():
         try:
             if issubclass(field, DocList):
@@ -29,6 +30,7 @@ def _get_field_from_type(
 ):
     field_type = field_schema.get('type', None)
     tensor_shape = field_schema.get('tensor/array shape', None)
+    ret: TypeAlias
     if 'anyOf' in field_schema:
         any_of_types = []
         for any_of_schema in field_schema['anyOf']:
@@ -143,8 +145,8 @@ def _get_field_from_type(
 
 
 def create_base_doc_from_schema(
-    schema: Dict[str, any], model_name: str, cached_models: Optional[Dict] = None
-) -> type:
+    schema: Dict[str, Any], model_name: str, cached_models: Optional[Dict] = None
+) -> Type:
     cached_models = cached_models if cached_models is not None else {}
     fields = {}
     if model_name in cached_models:
diff --git a/tests/units/util/test_create.py b/tests/units/util/test_create.py
index 74785c103b8..5a9850949a5 100644
--- a/tests/units/util/test_create.py
+++ b/tests/units/util/test_create.py
@@ -2,7 +2,7 @@
 from typing import List, Dict, Union, Any
 from docarray.utils.create import (
     create_base_doc_from_schema,
-    _create_aux_model_doc_list_to_list,
+    create_new_model_cast_doclist_to_list,
 )
 import numpy as np
 from typing import Optional
@@ -26,7 +26,7 @@ class CustomDoc(BaseDoc):
         lu: List[Union[str, int]] = [0, 1, 2]
         tags: Optional[Dict[str, Any]] = None
 
-    CustomDocCopy = _create_aux_model_doc_list_to_list(CustomDoc)
+    CustomDocCopy = create_new_model_cast_doclist_to_list(CustomDoc)
     new_custom_doc_model = create_base_doc_from_schema(
         CustomDocCopy.schema(), 'CustomDoc', {}
     )
@@ -95,7 +95,7 @@ class CustomDoc(BaseDoc):
     class TextDocWithId(BaseDoc):
         ia: str
 
-    TextDocWithIdCopy = _create_aux_model_doc_list_to_list(TextDocWithId)
+    TextDocWithIdCopy = create_new_model_cast_doclist_to_list(TextDocWithId)
     new_textdoc_with_id_model = create_base_doc_from_schema(
         TextDocWithIdCopy.schema(), 'TextDocWithId', {}
     )
@@ -125,7 +125,7 @@ class TextDocWithId(BaseDoc):
     class ResultTestDoc(BaseDoc):
         matches: DocList[TextDocWithId]
 
-    ResultTestDocCopy = _create_aux_model_doc_list_to_list(ResultTestDoc)
+    ResultTestDocCopy = create_new_model_cast_doclist_to_list(ResultTestDoc)
     new_result_test_doc_with_id_model = create_base_doc_from_schema(
         ResultTestDocCopy.schema(), 'ResultTestDoc', {}
     )
@@ -171,7 +171,7 @@ class CustomDoc(BaseDoc):
         tags: Optional[Dict[str, Any]] = None
         lf: List[float] = [3.0, 4.1]
 
-    CustomDocCopy = _create_aux_model_doc_list_to_list(CustomDoc)
+    CustomDocCopy = create_new_model_cast_doclist_to_list(CustomDoc)
     new_custom_doc_model = create_base_doc_from_schema(
         CustomDocCopy.schema(), 'CustomDoc'
     )
@@ -196,7 +196,7 @@ class CustomDoc(BaseDoc):
     class TextDocWithId(BaseDoc):
         ia: str
 
-    TextDocWithIdCopy = _create_aux_model_doc_list_to_list(TextDocWithId)
+    TextDocWithIdCopy = create_new_model_cast_doclist_to_list(TextDocWithId)
     new_textdoc_with_id_model = create_base_doc_from_schema(
         TextDocWithIdCopy.schema(), 'TextDocWithId', {}
     )
@@ -219,7 +219,7 @@ class TextDocWithId(BaseDoc):
     class ResultTestDoc(BaseDoc):
         matches: DocList[TextDocWithId]
 
-    ResultTestDocCopy = _create_aux_model_doc_list_to_list(ResultTestDoc)
+    ResultTestDocCopy = create_new_model_cast_doclist_to_list(ResultTestDoc)
     new_result_test_doc_with_id_model = create_base_doc_from_schema(
         ResultTestDocCopy.schema(), 'ResultTestDoc', {}
     )

From e5f065ffd712948a528284e3bb00d68981e7d109 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Wed, 21 Jun 2023 18:54:44 +0200
Subject: [PATCH 3/8] chore: fix mypy more

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/create.py | 47 +++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/docarray/utils/create.py b/docarray/utils/create.py
index b324be78876..41de8eefb46 100644
--- a/docarray/utils/create.py
+++ b/docarray/utils/create.py
@@ -1,16 +1,16 @@
 from docarray import DocList, BaseDoc
 from docarray.typing import AnyTensor
 from pydantic import create_model
-from typing import Dict, List, Any, Union, Optional, Tuple, Type
-from typing_extensions import TypeAlias
+from typing import Dict, List, Any, Union, Optional, Type
 
 
-def create_new_model_cast_doclist_to_list(model: BaseDoc) -> BaseDoc:
-    fields: Dict[str, Tuple[Type, Dict]] = {}
+def create_new_model_cast_doclist_to_list(model: Any) -> BaseDoc:
+    fields: Dict[str, Any] = {}
     for field_name, field in model.__annotations__.items():
         try:
             if issubclass(field, DocList):
-                fields[field_name] = (List[field.doc_type], {})
+                t: Any = field.doc_type
+                fields[field_name] = (List[t], {})
             else:
                 fields[field_name] = (field, {})
         except TypeError:
@@ -30,7 +30,7 @@ def _get_field_from_type(
 ):
     field_type = field_schema.get('type', None)
     tensor_shape = field_schema.get('tensor/array shape', None)
-    ret: TypeAlias
+    ret: Any
     if 'anyOf' in field_schema:
         any_of_types = []
         for any_of_schema in field_schema['anyOf']:
@@ -82,15 +82,14 @@ def _get_field_from_type(
         for rec in range(num_recursions):
             ret = List[ret]
     elif field_type == 'object' or field_type is None:
+        doc_type: Any
         if 'additionalProperties' in field_schema:  # handle Dictionaries
             additional_props = field_schema['additionalProperties']
             if additional_props.get('type') == 'object':
-                ret = Dict[
-                    str,
-                    create_base_doc_from_schema(
-                        additional_props, field_name, cached_models=cached_models
-                    ),
-                ]
+                doc_type = create_base_doc_from_schema(
+                    additional_props, field_name, cached_models=cached_models
+                )
+                ret = Dict[str, doc_type]
             else:
                 ret = Dict[str, Any]
         else:
@@ -110,19 +109,17 @@ def _get_field_from_type(
             else:  # object reference in definitions
                 if obj_ref:
                     ref_name = obj_ref.split('/')[-1]
-                    ret = DocList[
-                        create_base_doc_from_schema(
-                            root_schema['definitions'][ref_name],
-                            ref_name,
-                            cached_models=cached_models,
-                        )
-                    ]
+                    doc_type = create_base_doc_from_schema(
+                        root_schema['definitions'][ref_name],
+                        ref_name,
+                        cached_models=cached_models,
+                    )
+                    ret = DocList[doc_type]
                 else:
-                    ret = DocList[
-                        create_base_doc_from_schema(
-                            field_schema, field_name, cached_models=cached_models
-                        )
-                    ]
+                    doc_type = create_base_doc_from_schema(
+                        field_schema, field_name, cached_models=cached_models
+                    )
+                    ret = DocList[doc_type]
     elif field_type == 'array':
         ret = _get_field_from_type(
             field_schema=field_schema.get('items', {}),
@@ -148,7 +145,7 @@ def create_base_doc_from_schema(
     schema: Dict[str, Any], model_name: str, cached_models: Optional[Dict] = None
 ) -> Type:
     cached_models = cached_models if cached_models is not None else {}
-    fields = {}
+    fields: Dict[str, Any] = {}
     if model_name in cached_models:
         return cached_models[model_name]
     for field_name, field_schema in schema.get('properties', {}).items():

From 89b15f4f98fc427f4c0ad0c20e95e304f5dee84b Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 22 Jun 2023 08:54:22 +0200
Subject: [PATCH 4/8] docs: add docstrings

---
 docarray/utils/create.py | 56 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/docarray/utils/create.py b/docarray/utils/create.py
index 41de8eefb46..6e044443c2e 100644
--- a/docarray/utils/create.py
+++ b/docarray/utils/create.py
@@ -5,6 +5,30 @@
 
 
 def create_new_model_cast_doclist_to_list(model: Any) -> BaseDoc:
+    """
+    Take a Pydantic model and cast DocList fields into List fields.
+
+    This can be needed because of this limitation of Pydantic:
+
+    (https://docs.pydantic.dev/latest/blog/pydantic-v2/)
+
+    ---
+
+    ```python
+    from docarray import BaseDoc
+    class MyDoc(BaseDoc):
+        tensor: Optional[AnyTensor]
+        url: ImageUrl
+        title: str
+        texts: DocList[TextDoc]
+
+    MyDocCorrected = create_new_model_cast_doclist_to_list(CustomDoc)
+    ```
+
+    ---
+    :param model: The input model
+    :return: A BaseDoc class dynamically created with List instead of DocList in the schema.
+    """
     fields: Dict[str, Any] = {}
     for field_name, field in model.__annotations__.items():
         try:
@@ -144,6 +168,38 @@ def _get_field_from_type(
 def create_base_doc_from_schema(
     schema: Dict[str, Any], model_name: str, cached_models: Optional[Dict] = None
 ) -> Type:
+    """
+    Dynamically create a `BaseDoc` class from a `schema` of another `BaseDoc`.
+    This method is intended to dynamically create a `BaseDoc` compatible with the schema
+    of another BaseDoc that is not available in the context. For instance, you may have stored the schema
+    as a JSON, or sent it to another service, etc ...
+
+    Due to this Pydantic limitation (https://docs.pydantic.dev/latest/blog/pydantic-v2/), we need to make sure that the
+    input schema uses `List` and not `DocList`, therefore this is recommended to be used in combination with `create_new_model_cast_doclist_to_list`
+    to make sure that `DocLists` in schema are converted to `List`.
+
+    ---
+
+    ```python
+    from docarray import BaseDoc
+    class MyDoc(BaseDoc):
+        tensor: Optional[AnyTensor]
+        url: ImageUrl
+        title: str
+        texts: DocList[TextDoc]
+
+    MyDocCorrected = create_new_model_cast_doclist_to_list(CustomDoc)
+    new_my_doc_cls = create_base_doc_from_schema(
+        CustomDocCopy.schema(), 'MyDoc'
+    )
+    ```
+
+    ---
+    :param schema: The schema of the original `BaseDoc` where DocLists are passed as regular Lists of Documents.
+    :param model_name: The name of the new pydantic model created.
+    :param cached_models: Parameter used when this method is called recursively to reuse partial nested classes.
+    :return: A BaseDoc class dynamically created following the `schema`.
+    """
     cached_models = cached_models if cached_models is not None else {}
     fields: Dict[str, Any] = {}
     if model_name in cached_models:

From f39cadbf2f3a850daee0207f51430b4788ec2f54 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 22 Jun 2023 09:57:22 +0200
Subject: [PATCH 5/8] docs: add more docstrings

---
 docarray/utils/create.py | 54 +++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/docarray/utils/create.py b/docarray/utils/create.py
index 6e044443c2e..7e357778704 100644
--- a/docarray/utils/create.py
+++ b/docarray/utils/create.py
@@ -16,12 +16,15 @@ def create_new_model_cast_doclist_to_list(model: Any) -> BaseDoc:
 
     ```python
     from docarray import BaseDoc
+
+
     class MyDoc(BaseDoc):
         tensor: Optional[AnyTensor]
         url: ImageUrl
         title: str
         texts: DocList[TextDoc]
 
+
     MyDocCorrected = create_new_model_cast_doclist_to_list(CustomDoc)
     ```
 
@@ -44,14 +47,24 @@ class MyDoc(BaseDoc):
     )
 
 
-def _get_field_from_type(
-    field_schema,
-    field_name,
-    root_schema,
-    cached_models,
-    is_tensor=False,
-    num_recursions=0,
-):
+def _get_field_type_from_schema(
+        field_schema: Dict[str, Any],
+        field_name: str,
+        root_schema: Dict[str, Any],
+        cached_models: Dict[str, Any],
+        is_tensor: bool = False,
+        num_recursions: int = 0,
+) -> type:
+    """
+    Private method used to extract the corresponding field type from the schema.
+    :param field_schema: The schema from which to extract the type
+    :param field_name: The name of the field to be created
+    :param root_schema: The schema of the root object, important to get references
+    :param cached_models: Parameter used when this method is called recursively to reuse partial nested classes.
+    :param is_tensor: Boolean used to tell between tensor and list
+    :param num_recursions: Number of recursions to properly handle nested types (Dict, List, etc ..)
+    :return: A type created from the schema
+    """
     field_type = field_schema.get('type', None)
     tensor_shape = field_schema.get('tensor/array shape', None)
     ret: Any
@@ -70,7 +83,7 @@ def _get_field_from_type(
                 )
             else:
                 any_of_types.append(
-                    _get_field_from_type(
+                    _get_field_type_from_schema(
                         any_of_schema,
                         field_name,
                         root_schema=root_schema,
@@ -145,7 +158,7 @@ def _get_field_from_type(
                     )
                     ret = DocList[doc_type]
     elif field_type == 'array':
-        ret = _get_field_from_type(
+        ret = _get_field_type_from_schema(
             field_schema=field_schema.get('items', {}),
             field_name=field_name,
             root_schema=root_schema,
@@ -166,7 +179,7 @@ def _get_field_from_type(
 
 
 def create_base_doc_from_schema(
-    schema: Dict[str, Any], model_name: str, cached_models: Optional[Dict] = None
+        schema: Dict[str, Any], base_doc_name: str, cached_models: Optional[Dict] = None
 ) -> Type:
     """
     Dynamically create a `BaseDoc` class from a `schema` of another `BaseDoc`.
@@ -182,30 +195,31 @@ def create_base_doc_from_schema(
 
     ```python
     from docarray import BaseDoc
+
+
     class MyDoc(BaseDoc):
         tensor: Optional[AnyTensor]
         url: ImageUrl
         title: str
         texts: DocList[TextDoc]
 
+
     MyDocCorrected = create_new_model_cast_doclist_to_list(CustomDoc)
-    new_my_doc_cls = create_base_doc_from_schema(
-        CustomDocCopy.schema(), 'MyDoc'
-    )
+    new_my_doc_cls = create_base_doc_from_schema(CustomDocCopy.schema(), 'MyDoc')
     ```
 
     ---
     :param schema: The schema of the original `BaseDoc` where DocLists are passed as regular Lists of Documents.
-    :param model_name: The name of the new pydantic model created.
+    :param base_doc_name: The name of the new pydantic model created.
     :param cached_models: Parameter used when this method is called recursively to reuse partial nested classes.
     :return: A BaseDoc class dynamically created following the `schema`.
     """
     cached_models = cached_models if cached_models is not None else {}
     fields: Dict[str, Any] = {}
-    if model_name in cached_models:
-        return cached_models[model_name]
+    if base_doc_name in cached_models:
+        return cached_models[base_doc_name]
     for field_name, field_schema in schema.get('properties', {}).items():
-        field_type = _get_field_from_type(
+        field_type = _get_field_type_from_schema(
             field_schema=field_schema,
             field_name=field_name,
             root_schema=schema,
@@ -215,6 +229,6 @@ class MyDoc(BaseDoc):
         )
         fields[field_name] = (field_type, field_schema.get('description'))
 
-    model = create_model(model_name, __base__=BaseDoc, **fields)
-    cached_models[model_name] = model
+    model = create_model(base_doc_name, __base__=BaseDoc, **fields)
+    cached_models[base_doc_name] = model
     return model

From be35141d18afb78d5c8e64bc432ff50953cb3ae7 Mon Sep 17 00:00:00 2001
From: Joan Fontanals <joan.martinez@jina.ai>
Date: Thu, 22 Jun 2023 09:57:55 +0200
Subject: [PATCH 6/8] docs: apply suggestions from code review

Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com>
Signed-off-by: Joan Fontanals <jfontanalsmartinez@gmail.com>
Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/create.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/docarray/utils/create.py b/docarray/utils/create.py
index 7e357778704..344dc85602b 100644
--- a/docarray/utils/create.py
+++ b/docarray/utils/create.py
@@ -8,9 +8,10 @@ def create_new_model_cast_doclist_to_list(model: Any) -> BaseDoc:
     """
     Take a Pydantic model and cast DocList fields into List fields.
 
-    This can be needed because of this limitation of Pydantic:
+    This may be necessary due to limitations in Pydantic:
 
-    (https://docs.pydantic.dev/latest/blog/pydantic-v2/)
+    https://github.com/docarray/docarray/issues/1521
+    https://github.com/pydantic/pydantic/issues/1457
 
     ---
 
@@ -30,7 +31,7 @@ class MyDoc(BaseDoc):
 
     ---
     :param model: The input model
-    :return: A BaseDoc class dynamically created with List instead of DocList in the schema.
+    :return: A new subclass of BaseDoc, where every DocList type in the schema is replaced by List.
     """
     fields: Dict[str, Any] = {}
     for field_name, field in model.__annotations__.items():
@@ -48,12 +49,12 @@ class MyDoc(BaseDoc):
 
 
 def _get_field_type_from_schema(
-        field_schema: Dict[str, Any],
-        field_name: str,
-        root_schema: Dict[str, Any],
-        cached_models: Dict[str, Any],
-        is_tensor: bool = False,
-        num_recursions: int = 0,
+    field_schema: Dict[str, Any],
+    field_name: str,
+    root_schema: Dict[str, Any],
+    cached_models: Dict[str, Any],
+    is_tensor: bool = False,
+    num_recursions: int = 0,
 ) -> type:
     """
     Private method used to extract the corresponding field type from the schema.
@@ -179,16 +180,17 @@ def _get_field_type_from_schema(
 
 
 def create_base_doc_from_schema(
-        schema: Dict[str, Any], base_doc_name: str, cached_models: Optional[Dict] = None
+    schema: Dict[str, Any], base_doc_name: str, cached_models: Optional[Dict] = None
 ) -> Type:
     """
-    Dynamically create a `BaseDoc` class from a `schema` of another `BaseDoc`.
+    Dynamically create a `BaseDoc` subclass from a `schema` of another `BaseDoc`.
+
     This method is intended to dynamically create a `BaseDoc` compatible with the schema
-    of another BaseDoc that is not available in the context. For instance, you may have stored the schema
-    as a JSON, or sent it to another service, etc ...
+    of another BaseDoc. This is useful when that other `BaseDoc` is not available in the current scope. For instance, you may have stored the schema
+    as a JSON, or sent it to another service, etc.
 
-    Due to this Pydantic limitation (https://docs.pydantic.dev/latest/blog/pydantic-v2/), we need to make sure that the
-    input schema uses `List` and not `DocList`, therefore this is recommended to be used in combination with `create_new_model_cast_doclist_to_list`
+    Due to this Pydantic limitation (https://github.com/docarray/docarray/issues/1521, https://github.com/pydantic/pydantic/issues/1457), we need to make sure that the
+    input schema uses `List` and not `DocList`. Therefore this is recommended to be used in combination with `create_new_model_cast_doclist_to_list`
     to make sure that `DocLists` in schema are converted to `List`.
 
     ---

From 97c07e5cb128ea72e8bfda79ddfe4ee1bb253482 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 22 Jun 2023 10:07:58 +0200
Subject: [PATCH 7/8] refactor: change name of files

Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
---
 docarray/utils/{create.py => create_dynamic_doc_class.py}       | 0
 .../util/{test_create.py => test_create_dynamic_code_class.py}  | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename docarray/utils/{create.py => create_dynamic_doc_class.py} (100%)
 rename tests/units/util/{test_create.py => test_create_dynamic_code_class.py} (99%)

diff --git a/docarray/utils/create.py b/docarray/utils/create_dynamic_doc_class.py
similarity index 100%
rename from docarray/utils/create.py
rename to docarray/utils/create_dynamic_doc_class.py
diff --git a/tests/units/util/test_create.py b/tests/units/util/test_create_dynamic_code_class.py
similarity index 99%
rename from tests/units/util/test_create.py
rename to tests/units/util/test_create_dynamic_code_class.py
index 5a9850949a5..4b78e389dc4 100644
--- a/tests/units/util/test_create.py
+++ b/tests/units/util/test_create_dynamic_code_class.py
@@ -1,6 +1,6 @@
 import pytest
 from typing import List, Dict, Union, Any
-from docarray.utils.create import (
+from docarray.utils.create_dynamic_doc_class import (
     create_base_doc_from_schema,
     create_new_model_cast_doclist_to_list,
 )

From 2a0b302f638febf9119128a9f216d3d4eeaa1cb4 Mon Sep 17 00:00:00 2001
From: Joan Fontanals Martinez <joan.martinez@jina.ai>
Date: Thu, 22 Jun 2023 12:24:43 +0200
Subject: [PATCH 8/8] chore: change method name

---
 docarray/utils/create_dynamic_doc_class.py         |  4 ++--
 tests/units/util/test_create_dynamic_code_class.py | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docarray/utils/create_dynamic_doc_class.py b/docarray/utils/create_dynamic_doc_class.py
index 344dc85602b..820743e1b66 100644
--- a/docarray/utils/create_dynamic_doc_class.py
+++ b/docarray/utils/create_dynamic_doc_class.py
@@ -4,7 +4,7 @@
 from typing import Dict, List, Any, Union, Optional, Type
 
 
-def create_new_model_cast_doclist_to_list(model: Any) -> BaseDoc:
+def create_pure_python_type_model(model: Any) -> BaseDoc:
     """
     Take a Pydantic model and cast DocList fields into List fields.
 
@@ -206,7 +206,7 @@ class MyDoc(BaseDoc):
         texts: DocList[TextDoc]
 
 
-    MyDocCorrected = create_new_model_cast_doclist_to_list(CustomDoc)
+    MyDocCorrected = create_pure_python_type_model(CustomDoc)
     new_my_doc_cls = create_base_doc_from_schema(CustomDocCopy.schema(), 'MyDoc')
     ```
 
diff --git a/tests/units/util/test_create_dynamic_code_class.py b/tests/units/util/test_create_dynamic_code_class.py
index 4b78e389dc4..ff7f6551403 100644
--- a/tests/units/util/test_create_dynamic_code_class.py
+++ b/tests/units/util/test_create_dynamic_code_class.py
@@ -2,7 +2,7 @@
 from typing import List, Dict, Union, Any
 from docarray.utils.create_dynamic_doc_class import (
     create_base_doc_from_schema,
-    create_new_model_cast_doclist_to_list,
+    create_pure_python_type_model,
 )
 import numpy as np
 from typing import Optional
@@ -26,7 +26,7 @@ class CustomDoc(BaseDoc):
         lu: List[Union[str, int]] = [0, 1, 2]
         tags: Optional[Dict[str, Any]] = None
 
-    CustomDocCopy = create_new_model_cast_doclist_to_list(CustomDoc)
+    CustomDocCopy = create_pure_python_type_model(CustomDoc)
     new_custom_doc_model = create_base_doc_from_schema(
         CustomDocCopy.schema(), 'CustomDoc', {}
     )
@@ -95,7 +95,7 @@ class CustomDoc(BaseDoc):
     class TextDocWithId(BaseDoc):
         ia: str
 
-    TextDocWithIdCopy = create_new_model_cast_doclist_to_list(TextDocWithId)
+    TextDocWithIdCopy = create_pure_python_type_model(TextDocWithId)
     new_textdoc_with_id_model = create_base_doc_from_schema(
         TextDocWithIdCopy.schema(), 'TextDocWithId', {}
     )
@@ -125,7 +125,7 @@ class TextDocWithId(BaseDoc):
     class ResultTestDoc(BaseDoc):
         matches: DocList[TextDocWithId]
 
-    ResultTestDocCopy = create_new_model_cast_doclist_to_list(ResultTestDoc)
+    ResultTestDocCopy = create_pure_python_type_model(ResultTestDoc)
     new_result_test_doc_with_id_model = create_base_doc_from_schema(
         ResultTestDocCopy.schema(), 'ResultTestDoc', {}
     )
@@ -171,7 +171,7 @@ class CustomDoc(BaseDoc):
         tags: Optional[Dict[str, Any]] = None
         lf: List[float] = [3.0, 4.1]
 
-    CustomDocCopy = create_new_model_cast_doclist_to_list(CustomDoc)
+    CustomDocCopy = create_pure_python_type_model(CustomDoc)
     new_custom_doc_model = create_base_doc_from_schema(
         CustomDocCopy.schema(), 'CustomDoc'
     )
@@ -196,7 +196,7 @@ class CustomDoc(BaseDoc):
     class TextDocWithId(BaseDoc):
         ia: str
 
-    TextDocWithIdCopy = create_new_model_cast_doclist_to_list(TextDocWithId)
+    TextDocWithIdCopy = create_pure_python_type_model(TextDocWithId)
     new_textdoc_with_id_model = create_base_doc_from_schema(
         TextDocWithIdCopy.schema(), 'TextDocWithId', {}
     )
@@ -219,7 +219,7 @@ class TextDocWithId(BaseDoc):
     class ResultTestDoc(BaseDoc):
         matches: DocList[TextDocWithId]
 
-    ResultTestDocCopy = create_new_model_cast_doclist_to_list(ResultTestDoc)
+    ResultTestDocCopy = create_pure_python_type_model(ResultTestDoc)
     new_result_test_doc_with_id_model = create_base_doc_from_schema(
         ResultTestDocCopy.schema(), 'ResultTestDoc', {}
     )