diff --git a/README.md b/README.md index 1cee23cf2c0..919c26a9652 100644 --- a/README.md +++ b/README.md @@ -242,10 +242,10 @@ dl.insert( And you can seamlessly switch between `DocVec` and `DocList`: ```python -vec_2 = dl.stack() +vec_2 = dl.to_doc_vec() assert isinstance(vec_2, DocVec) -dl_2 = vec_2.unstack() +dl_2 = vec_2.to_doc_list() assert isinstance(dl_2, DocList) ``` diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 8eb1a822d59..9d1ca90a916 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -251,13 +251,13 @@ def _set_data_column( for doc, value in zip(self, values): setattr(doc, field, value) - def stack( + def to_doc_vec( self, tensor_type: Type['AbstractTensor'] = NdArray, ) -> 'DocVec': """ Convert the `DocList` into a `DocVec`. `Self` cannot be used - afterwards + afterward :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor :return: A `DocVec` of the same document type as self diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index aa9ed59c09e..c7c94b393dd 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -160,7 +160,7 @@ def __init__( cast(AbstractTensor, tensor_columns[field_name])[i] = val elif issubclass(field_type, BaseDoc): - doc_columns[field_name] = getattr(docs, field_name).stack( + doc_columns[field_name] = getattr(docs, field_name).to_doc_vec( tensor_type=self.tensor_type ) @@ -169,7 +169,7 @@ def __init__( for doc in docs: docs_nested = getattr(doc, field_name) if isinstance(docs_nested, DocList): - docs_nested = docs_nested.stack( + docs_nested = docs_nested.to_doc_vec( tensor_type=self.tensor_type ) docs_list.append(docs_nested) @@ -213,7 +213,7 @@ def validate( if isinstance(value, cls): return value elif isinstance(value, DocList.__class_getitem__(cls.doc_type)): - return cast(T, value.stack()) + return cast(T, value.to_doc_vec()) elif isinstance(value, Sequence): return cls(value) elif isinstance(value, Iterable): @@ -328,7 +328,7 @@ def _set_data_and_columns( f'this DocVec schema : {self.doc_type}' ) processed_value = cast( - T, value.stack(tensor_type=self.tensor_type) + T, value.to_doc_vec(tensor_type=self.tensor_type) ) # we need to copy data here elif isinstance(value, DocVec): @@ -474,7 +474,7 @@ def to_protobuf(self) -> 'DocVecProto': any_columns=any_columns_proto, ) - def unstack(self: T) -> DocList[T_doc]: + def to_doc_list(self: T) -> DocList[T_doc]: """Convert DocVec into a DocList. Note this destroys the arguments and returns a new DocList @@ -486,10 +486,10 @@ def unstack(self: T) -> DocList[T_doc]: unstacked_any_column = self._storage.any_columns for field, doc_col in self._storage.doc_columns.items(): - unstacked_doc_column[field] = doc_col.unstack() + unstacked_doc_column[field] = doc_col.to_doc_list() for field, da_col in self._storage.docs_vec_columns.items(): - unstacked_da_column[field] = [docs.unstack() for docs in da_col] + unstacked_da_column[field] = [docs.to_doc_list() for docs in da_col] for field, tensor_col in list(self._storage.tensor_columns.items()): # list is needed here otherwise we cannot delete the column diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 2807ba9b06b..af099a5eac1 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -48,10 +48,10 @@ if tf is not None: from docarray.typing import TensorFlowTensor -HNSWLIB_PY_VEC_TYPES = [list, tuple, np.ndarray, AbstractTensor] +HNSWLIB_PY_VEC_TYPES: List[Any] = [list, tuple, np.ndarray, AbstractTensor] if torch is not None: - HNSWLIB_PY_VEC_TYPES.append(torch.Tensor) + HNSWLIB_PY_VEC_TYPES.append(torch.Tensor) # type: ignore if tf is not None: HNSWLIB_PY_VEC_TYPES.append(tf.Tensor) diff --git a/docs/how_to/multimodal_training_and_serving.md b/docs/how_to/multimodal_training_and_serving.md index 604545c7cd2..b89b852297f 100644 --- a/docs/how_to/multimodal_training_and_serving.md +++ b/docs/how_to/multimodal_training_and_serving.md @@ -366,7 +366,7 @@ async def embed_text(doc: Text) -> Text: with torch.autocast(device_type="cuda", dtype=torch.float16): with torch.inference_mode(): text_preprocess(doc) - da = DocList[Text]([doc], tensor_type=TorchTensor).stack() + da = DocList[Text]([doc], tensor_type=TorchTensor).to_doc_vec() da.to(DEVICE) doc.embedding = text_encoder(da)[0].to('cpu') return doc diff --git a/tests/integrations/array/test_torch_train.py b/tests/integrations/array/test_torch_train.py index e269659462a..753a793afa3 100644 --- a/tests/integrations/array/test_torch_train.py +++ b/tests/integrations/array/test_torch_train.py @@ -16,7 +16,7 @@ class Mmdoc(BaseDoc): batch = DocList[Mmdoc](Mmdoc(text=f'hello{i}') for i in range(N)) batch.tensor = torch.zeros(N, 3, 224, 224) - batch = batch.stack() + batch = batch.to_doc_vec() class Model(torch.nn.Module): def __init__(self): diff --git a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py index 14f5238873a..67f0bd44ba9 100644 --- a/tests/units/array/stack/test_array_stacked.py +++ b/tests/units/array/stack/test_array_stacked.py @@ -42,7 +42,7 @@ class MMdoc(BaseDoc): ] ) - return batch.stack() + return batch.to_doc_vec() def test_create_from_list_docs(): @@ -79,7 +79,7 @@ class ImageDoc(BaseDoc): [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - batch = batch.stack() + batch = batch.to_doc_vec() batch.tensor = torch.ones(10, 3, 224, 224) assert (batch.tensor == torch.ones(10, 3, 224, 224)).all() @@ -96,7 +96,7 @@ class ImageDoc(BaseDoc): [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) - batch = batch.stack() + batch = batch.to_doc_vec() batch.tensor = np.ones((10, 3, 224, 224)) assert (batch.tensor == np.ones((10, 3, 224, 224))).all() @@ -120,7 +120,7 @@ class ImageDoc(BaseDoc): [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) - batch = batch.stack() + batch = batch.to_doc_vec() assert ( batch._storage.tensor_columns['tensor'] == np.zeros((10, 3, 224, 224)) @@ -156,7 +156,7 @@ class MMdoc(BaseDoc): [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) - batch = batch.stack() + batch = batch.to_doc_vec() assert ( batch._storage.doc_columns['img']._storage.tensor_columns['tensor'] @@ -192,8 +192,8 @@ class ImageDoc(BaseDoc): [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - batch = batch.stack() - da = batch.unstack() + batch = batch.to_doc_vec() + da = batch.to_doc_list() for doc in da: assert (doc.tensor == torch.zeros(3, 224, 224)).all() @@ -210,16 +210,16 @@ class MMdoc(BaseDoc): [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) - batch = batch.stack() + batch = batch.to_doc_vec() - da = batch.unstack() + da = batch.to_doc_list() for doc in da: assert (doc.img.tensor == torch.zeros(3, 224, 224)).all() def test_unstack_nested_DocArray(nested_batch): - batch = nested_batch.unstack() + batch = nested_batch.to_doc_list() for i in range(len(batch)): assert isinstance(batch[i].img, DocList) for doc in batch[i].img: @@ -234,7 +234,7 @@ class ImageDoc(BaseDoc): [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - da = da.stack() + da = da.to_doc_vec() assert len(da) == 10 @@ -252,7 +252,7 @@ class ImageDoc(BaseDoc): # union fields aren't actually doc_vec # just checking that there is no error - batch.stack() + batch.to_doc_vec() @pytest.mark.parametrize( @@ -402,7 +402,7 @@ class MyDoc(BaseDoc): ) assert da[0].tensor.dtype == torch.int32 - da = da.stack() + da = da.to_doc_vec() assert da[0].tensor.dtype == torch.int32 assert da.tensor.dtype == torch.int32 @@ -416,7 +416,7 @@ class MyDoc(BaseDoc): ) assert da[0].tensor.dtype == np.int32 - da = da.stack() + da = da.to_doc_vec() assert da[0].tensor.dtype == np.int32 assert da.tensor.dtype == np.int32 @@ -436,7 +436,7 @@ class MyDoc(BaseDoc): assert all(doc.scalar.ndim == 0 for doc in da) assert all(doc.scalar == 2.0 for doc in da) - stacked_da = da.stack() + stacked_da = da.to_doc_vec() assert type(stacked_da.scalar) == NdArray assert all(type(doc.scalar) == NdArray for doc in stacked_da) @@ -457,7 +457,7 @@ class MyDoc(BaseDoc): ) assert all(doc.scalar.ndim == 0 for doc in da) assert all(doc.scalar == 2.0 for doc in da) - stacked_da = da.stack(tensor_type=TorchTensor) + stacked_da = da.to_doc_vec(tensor_type=TorchTensor) assert type(stacked_da.scalar) == TorchTensor assert all(type(doc.scalar) == TorchTensor for doc in stacked_da) @@ -475,7 +475,7 @@ class MyDoc(BaseDoc): da = DocList[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) - stacked_da = da.stack() + stacked_da = da.to_doc_vec() assert type(stacked_da.scalar) == NdArray assert all(type(doc.scalar) == NdArray for doc in stacked_da) # TODO fail here @@ -494,7 +494,7 @@ class MyDoc(BaseDoc): da = DocList[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) - stacked_da = da.stack(tensor_type=TorchTensor) + stacked_da = da.to_doc_vec(tensor_type=TorchTensor) assert type(stacked_da.scalar) == TorchTensor assert all(type(doc.scalar) == TorchTensor for doc in stacked_da) diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py index 4ad72559254..d61fe80566b 100644 --- a/tests/units/array/stack/test_array_stacked_tf.py +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -24,7 +24,7 @@ class Image(BaseDoc): batch = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) - return batch.stack() + return batch.to_doc_vec() @pytest.fixture() @@ -108,7 +108,7 @@ class MMdoc(BaseDoc): batch = DocList[MMdoc]( [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] - ).stack() + ).to_doc_vec() assert tnp.allclose( batch._storage.doc_columns['img']._storage.tensor_columns['tensor'].tensor, @@ -133,7 +133,7 @@ def test_stack_nested_DocArray(nested_batch): @pytest.mark.tensorflow def test_convert_to_da(batch): - da = batch.unstack() + da = batch.to_doc_list() for doc in da: assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) @@ -151,7 +151,7 @@ class MMdoc(BaseDoc): [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] ) assert isinstance(batch.img._storage.tensor_columns['tensor'], TensorFlowTensor) - da = batch.unstack() + da = batch.to_doc_list() for doc in da: assert tnp.allclose(doc.img.tensor.tensor, tf.zeros((3, 224, 224))) @@ -159,7 +159,7 @@ class MMdoc(BaseDoc): @pytest.mark.tensorflow def test_unstack_nested_DocArray(nested_batch): - batch = nested_batch.unstack() + batch = nested_batch.to_doc_list() for i in range(len(batch)): assert isinstance(batch[i].img, DocList) for doc in batch[i].img: @@ -173,7 +173,7 @@ class Image(BaseDoc): da = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) - da = da.stack() + da = da.to_doc_vec() assert len(da) == 10 @@ -285,6 +285,6 @@ class MyDoc(BaseDoc): ) assert da[0].tensor.tensor.dtype == tf.int32 - da = da.stack() + da = da.to_doc_vec() assert da[0].tensor.tensor.dtype == tf.int32 assert da.tensor.tensor.dtype == tf.int32 diff --git a/tests/units/array/stack/test_proto.py b/tests/units/array/stack/test_proto.py index 585bdcf8d05..0cda39db730 100644 --- a/tests/units/array/stack/test_proto.py +++ b/tests/units/array/stack/test_proto.py @@ -14,7 +14,7 @@ class Image(BaseDoc): batch = DocList[Image]([Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)]) - return batch.stack() + return batch.to_doc_vec() @pytest.mark.proto @@ -29,7 +29,7 @@ class MyDoc(BaseDoc): da = DocList[MyDoc]([MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) - da = da.stack() + da = da.to_doc_vec() da.from_protobuf(da.to_protobuf()) @@ -41,7 +41,7 @@ class CustomDocument(BaseDoc): da = DocList[CustomDocument]( [CustomDocument(image=np.zeros((3, 224, 224))) for _ in range(10)] - ).stack() + ).to_doc_vec() da2 = DocVec.from_protobuf(da.to_protobuf()) diff --git a/tests/units/array/test_batching.py b/tests/units/array/test_batching.py index 88689c0f644..98083216527 100644 --- a/tests/units/array/test_batching.py +++ b/tests/units/array/test_batching.py @@ -24,7 +24,7 @@ class MyDoc(BaseDoc): ] ) if stack: - da = da.stack() + da = da.to_doc_vec() batches = list(da._batch(batch_size=batch_size, shuffle=shuffle)) assert len(batches) == n_batches diff --git a/tests/units/array/test_indexing.py b/tests/units/array/test_indexing.py index eb225d97ec7..f733b1e6630 100644 --- a/tests/units/array/test_indexing.py +++ b/tests/units/array/test_indexing.py @@ -33,7 +33,7 @@ def da_to_set(): @pytest.mark.parametrize('stack', [True, False]) def test_simple_getitem(stack, da): if stack: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) assert torch.all(da[0].embedding == torch.zeros((4,))) assert da[0].text == 'hello 0' @@ -42,7 +42,7 @@ def test_simple_getitem(stack, da): @pytest.mark.parametrize('stack', [True, False]) def test_get_none(stack, da): if stack: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) assert da[None] is da @@ -51,7 +51,7 @@ def test_get_none(stack, da): @pytest.mark.parametrize('index', [(1, 2, 3, 4, 6), [1, 2, 3, 4, 6]]) def test_iterable_getitem(stack, da, index): if stack: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) indexed_da = da[index] @@ -64,7 +64,7 @@ def test_iterable_getitem(stack, da, index): @pytest.mark.parametrize('index_dtype', [torch.int64]) def test_torchtensor_getitem(stack, da, index_dtype): if stack: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) index = torch.tensor([1, 2, 3, 4, 6], dtype=index_dtype) @@ -79,7 +79,7 @@ def test_torchtensor_getitem(stack, da, index_dtype): @pytest.mark.parametrize('index_dtype', [int, np.int_, np.int32, np.int64]) def test_nparray_getitem(stack, da, index_dtype): if stack: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) index = np.array([1, 2, 3, 4, 6], dtype=index_dtype) @@ -101,7 +101,7 @@ def test_nparray_getitem(stack, da, index_dtype): ) def test_boolmask_getitem(stack, da, index): if stack: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) indexed_da = da[index] @@ -120,7 +120,7 @@ def test_boolmask_getitem(stack, da, index): @pytest.mark.parametrize('stack_left', [True, False]) def test_simple_setitem(stack_left, da, da_to_set): if stack_left: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) da[0] = da_to_set[0] @@ -133,9 +133,9 @@ def test_simple_setitem(stack_left, da, da_to_set): @pytest.mark.parametrize('index', [(1, 2, 3, 4, 6), [1, 2, 3, 4, 6]]) def test_iterable_setitem(stack_left, stack_right, da, da_to_set, index): if stack_left: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) if stack_right: - da_to_set = da_to_set.stack(tensor_type=TorchTensor) + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) da[index] = da_to_set @@ -156,9 +156,9 @@ def test_iterable_setitem(stack_left, stack_right, da, da_to_set, index): @pytest.mark.parametrize('index_dtype', [torch.int64]) def test_torchtensor_setitem(stack_left, stack_right, da, da_to_set, index_dtype): if stack_left: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) if stack_right: - da_to_set = da_to_set.stack(tensor_type=TorchTensor) + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) index = torch.tensor([1, 2, 3, 4, 6], dtype=index_dtype) @@ -181,9 +181,9 @@ def test_torchtensor_setitem(stack_left, stack_right, da, da_to_set, index_dtype @pytest.mark.parametrize('index_dtype', [int, np.int_, np.int32, np.int64]) def test_nparray_setitem(stack_left, stack_right, da, da_to_set, index_dtype): if stack_left: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) if stack_right: - da_to_set = da_to_set.stack(tensor_type=TorchTensor) + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) index = np.array([1, 2, 3, 4, 6], dtype=index_dtype) @@ -214,9 +214,9 @@ def test_nparray_setitem(stack_left, stack_right, da, da_to_set, index_dtype): ) def test_boolmask_setitem(stack_left, stack_right, da, da_to_set, index): if stack_left: - da = da.stack(tensor_type=TorchTensor) + da = da.to_doc_vec(tensor_type=TorchTensor) if stack_right: - da_to_set = da_to_set.stack(tensor_type=TorchTensor) + da_to_set = da_to_set.to_doc_vec(tensor_type=TorchTensor) da[index] = da_to_set diff --git a/tests/units/array/test_traverse.py b/tests/units/array/test_traverse.py index 281abe0ce0e..75d225ea5ec 100644 --- a/tests/units/array/test_traverse.py +++ b/tests/units/array/test_traverse.py @@ -90,7 +90,7 @@ class Image(BaseDoc): ] ) - batch_stacked = batch.stack() + batch_stacked = batch.to_doc_vec() tensors = batch_stacked.traverse_flat(access_path='tensor') assert tensors.shape == (2, 3, 224, 224) diff --git a/tests/units/util/test_find.py b/tests/units/util/test_find.py index 96837cddc7f..0f01f07caae 100644 --- a/tests/units/util/test_find.py +++ b/tests/units/util/test_find.py @@ -79,7 +79,7 @@ def test_find_torch_tensor_query(random_torch_query, random_torch_index): def test_find_torch_stacked(random_torch_query, random_torch_index): - random_torch_index = random_torch_index.stack() + random_torch_index = random_torch_index.to_doc_vec() top_k, scores = find( random_torch_index, random_torch_query, @@ -124,7 +124,7 @@ def test_find_np_tensor_query(random_nd_query, random_nd_index): def test_find_np_stacked(random_nd_query, random_nd_index): - random_nd_index = random_nd_index.stack() + random_nd_index = random_nd_index.to_doc_vec() top_k, scores = find( random_nd_index, random_nd_query, @@ -179,9 +179,9 @@ def test_find_batched_torch_stacked( random_torch_batch_query, random_torch_index, stack_what ): if stack_what in ('index', 'both'): - random_torch_index = random_torch_index.stack() + random_torch_index = random_torch_index.to_doc_vec() if stack_what in ('query', 'both'): - random_torch_batch_query = random_torch_batch_query.stack() + random_torch_batch_query = random_torch_batch_query.to_doc_vec() results = find_batched( random_torch_index, @@ -238,9 +238,9 @@ def test_find_batched_np_tensor_query(random_nd_batch_query, random_nd_index): @pytest.mark.parametrize('stack_what', ['index', 'query', 'both']) def test_find_batched_np_stacked(random_nd_batch_query, random_nd_index, stack_what): if stack_what in ('index', 'both'): - random_nd_index = random_nd_index.stack() + random_nd_index = random_nd_index.to_doc_vec() if stack_what in ('query', 'both'): - random_nd_batch_query = random_nd_batch_query.stack() + random_nd_batch_query = random_nd_batch_query.to_doc_vec() results = find_batched( random_nd_index, random_nd_batch_query, @@ -309,7 +309,7 @@ class MyDoc(BaseDoc): ] ) if stack: - index = index.stack() + index = index.to_doc_vec() top_k, scores = find( index,