diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 82d991947f7..7e14491a2d6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,7 +75,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install poetry - poetry install + poetry install --all-extras poetry run mypy docarray # prep-testbed: diff --git a/docarray/typing/url/any_url.py b/docarray/typing/url/any_url.py index b5cf67c3901..44dc6d7eea2 100644 --- a/docarray/typing/url/any_url.py +++ b/docarray/typing/url/any_url.py @@ -1,15 +1,22 @@ -from typing import Type, TypeVar +from typing import TYPE_CHECKING, Type, TypeVar from pydantic import AnyUrl as BaseAnyUrl -from pydantic import parse_obj_as +from pydantic import errors, parse_obj_as from docarray.document.base_node import BaseNode from docarray.proto import NodeProto +if TYPE_CHECKING: + from pydantic.networks import Parts + T = TypeVar('T', bound='AnyUrl') class AnyUrl(BaseAnyUrl, BaseNode): + host_required = ( + False # turn off host requirement to allow passing of local paths as URL + ) + def _to_node_protobuf(self) -> NodeProto: """Convert Document into a NodeProto protobuf message. This function should be called when the Document is nested into another Document that need to @@ -19,6 +26,29 @@ def _to_node_protobuf(self) -> NodeProto: """ return NodeProto(any_url=str(self)) + @classmethod + def validate_parts(cls, parts: 'Parts', validate_port: bool = True) -> 'Parts': + """ + A method used to validate parts of a URL. + Our URLs should be able to function both in local and remote settings. + Therefore, we allow missing `scheme`, making it possible to pass a file path. + """ + scheme = parts['scheme'] + if scheme is None: + pass # allow missing scheme, unlike pydantic + + elif cls.allowed_schemes and scheme.lower() not in cls.allowed_schemes: + raise errors.UrlSchemePermittedError(set(cls.allowed_schemes)) + + if validate_port: + cls._validate_port(parts['port']) + + user = parts['user'] + if cls.user_required and user is None: + raise errors.UrlUserInfoError() + + return parts + @classmethod def from_protobuf(cls: Type[T], pb_msg: 'str') -> T: """ diff --git a/docarray/typing/url/helper.py b/docarray/typing/url/helper.py new file mode 100644 index 00000000000..31979c81828 --- /dev/null +++ b/docarray/typing/url/helper.py @@ -0,0 +1,47 @@ +import os +import urllib.parse +import urllib.request +from contextlib import nullcontext + + +def _uri_to_blob(uri: str, timeout=None) -> bytes: + """Convert uri to blob + Internally it reads uri into blob. + :param uri: the uri of Document + :param timeout: timeout for urlopen. Only relevant if uri is not local + :return: blob bytes. + """ + if urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}: + req = urllib.request.Request(uri, headers={'User-Agent': 'Mozilla/5.0'}) + urlopen_kwargs = {'timeout': timeout} if timeout is not None else {} + with urllib.request.urlopen(req, **urlopen_kwargs) as fp: + return fp.read() + elif os.path.exists(uri): + with open(uri, 'rb') as fp: + return fp.read() + else: + raise FileNotFoundError(f'`{uri}` is not a URL or a valid local path') + + +def _get_file_context(file): + if hasattr(file, 'write'): + file_ctx = nullcontext(file) + else: + file_ctx = open(file, 'wb') + + return file_ctx + + +def _is_uri(value: str) -> bool: + scheme = urllib.parse.urlparse(value).scheme + return ( + (scheme in {'http', 'https'}) + or (scheme in {'data'}) + or os.path.exists(value) + or os.access(os.path.dirname(value), os.W_OK) + ) + + +def _is_datauri(value: str) -> bool: + scheme = urllib.parse.urlparse(value).scheme + return scheme in {'data'} diff --git a/docarray/typing/url/image_url.py b/docarray/typing/url/image_url.py index 062419fa53f..161fa5ecf0f 100644 --- a/docarray/typing/url/image_url.py +++ b/docarray/typing/url/image_url.py @@ -1,10 +1,29 @@ +import io +import struct +from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, TypeVar, Union + import numpy as np from docarray.proto import NodeProto from docarray.typing.url.any_url import AnyUrl +from docarray.typing.url.helper import _uri_to_blob + +if TYPE_CHECKING: + import PIL + from pydantic import BaseConfig + from pydantic.fields import ModelField + +T = TypeVar('T', bound='ImageUrl') + +IMAGE_FILE_FORMATS = ('png', 'jpeg', 'jpg') class ImageUrl(AnyUrl): + """ " + URL to a .png, .jpeg, or .jpg file. + Cane be remote (web) URL, or a local file path. + """ + def _to_node_protobuf(self) -> NodeProto: """Convert Document into a NodeProto protobuf message. This function should be called when the Document is nested into another Document that need to @@ -14,12 +33,231 @@ def _to_node_protobuf(self) -> NodeProto: """ return NodeProto(image_url=str(self)) - def load(self) -> np.ndarray: + @classmethod + def validate( + cls: Type[T], + value: Union[T, np.ndarray, Any], + field: 'ModelField', + config: 'BaseConfig', + ) -> T: + + url = super().validate(value, field, config) # basic url validation + has_image_extension = any(url.endswith(ext) for ext in IMAGE_FILE_FORMATS) + if not has_image_extension: + raise ValueError( + f'Image URL must have one of the following extensions:' + f'{IMAGE_FILE_FORMATS}' + ) + return cls(str(url), scheme=None) + + def load( + self, + width: Optional[int] = None, + height: Optional[int] = None, + axis_layout: Tuple[str, str, str] = ('H', 'W', 'C'), + timeout: Optional[float] = None, + ) -> np.ndarray: """ - transform the url in a image Tensor + Load the data from the url into a numpy.ndarray image tensor + + EXAMPLE USAGE + + .. code-block:: python + + from docarray import Document + from docarray.typing import ImageUrl + import numpy as np + + + class MyDoc(Document): + img_url: ImageUrl + + + doc = MyDoc( + img_url="https://upload.wikimedia.org/wikipedia/commons/8/80/" + "Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg" + ) - this is just a patch we will move the function from old docarray - :return: tensor image + img_tensor = doc.img_url.load() + assert isinstance(img_tensor, np.ndarray) + + img_tensor = doc.img_url.load(height=224, width=224) + assert img_tensor.shape == (224, 224, 3) + + layout = ('C', 'W', 'H') + img_tensor = doc.img_url.load(height=100, width=200, axis_layout=layout) + assert img_tensor.shape == (3, 200, 100) + + + :param width: width of the image tensor. + :param height: height of the image tensor. + :param axis_layout: ordering of the different image axes. + 'H' = height, 'W' = width, 'C' = color channel + :param timeout: timeout (sec) for urlopen network request. + Only relevant if URL is not local + :return: np.ndarray representing the image as RGB values """ - return np.zeros((3, 224, 224)) + buffer = _uri_to_blob(self, timeout=timeout) + tensor = _to_image_tensor(io.BytesIO(buffer), width=width, height=height) + return _move_channel_axis(tensor, axis_layout=axis_layout) + + def load_to_bytes( + self, + image_format: str = 'png', + width: Optional[int] = None, + height: Optional[int] = None, + timeout: Optional[float] = None, + ) -> bytes: + """Load image at URL to bytes (buffer). + + EXAMPLE USAGE + + .. code-block:: python + + from docarray import Document + from docarray.typing import ImageUrl + import numpy as np + + + class MyDoc(Document): + img_url: ImageUrl + + + doc = MyDoc( + img_url="https://upload.wikimedia.org/wikipedia/commons/8/80/" + "Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg" + ) + + img_tensor = doc.img_url.load_to_bytes(image_format='jpg') + assert isinstance(img_tensor, bytes) + + :param image_format: File format of the file located the the url. + Supported formats are `png`, `jpg`, and `jpeg`. + :param width: Before converting to bytes, resize the image to this width. + :param height: Before converting to bytes, resize the image to this height. + :param timeout: timeout (sec) for urlopen network request. + Only relevant if URL is not local + :return: The image as bytes (buffer). + """ + image_tensor = self.load(width=width, height=height, timeout=timeout) + return _image_tensor_to_bytes(image_tensor, image_format=image_format) + + +def _image_tensor_to_bytes(arr: np.ndarray, image_format: str) -> bytes: + """ + Convert image-ndarray to buffer bytes. + + :param arr: Data representations of the png. + :param image_format: `png` or `jpeg` + :return: Png in buffer bytes. + """ + + if image_format not in IMAGE_FILE_FORMATS: + raise ValueError( + f'image_format must be one of {IMAGE_FILE_FORMATS},' + f'receiving `{image_format}`' + ) + if image_format == 'jpg': + image_format = 'jpeg' # unify it to ISO standard + + arr = arr.astype(np.uint8).squeeze() + + if arr.ndim == 1: + # note this should be only used for MNIST/FashionMNIST dataset, + # because of the nature of these two datasets + # no other image data should flattened into 1-dim array. + image_bytes = _png_to_buffer_1d(arr, 28, 28) + elif arr.ndim == 2: + from PIL import Image + + im = Image.fromarray(arr).convert('L') + image_bytes = _pillow_image_to_buffer(im, image_format=image_format.upper()) + elif arr.ndim == 3: + from PIL import Image + + im = Image.fromarray(arr).convert('RGB') + image_bytes = _pillow_image_to_buffer(im, image_format=image_format.upper()) + else: + raise ValueError( + f'{arr.shape} ndarray can not be converted into an image buffer.' + ) + + return image_bytes + + +def _png_to_buffer_1d(arr: np.ndarray, width: int, height: int) -> bytes: + import zlib + + pixels = [] + for p in arr[::-1]: + pixels.extend([p, p, p, 255]) + buf = bytearray(pixels) + + # reverse the vertical line order and add null bytes at the start + width_byte_4 = width * 4 + raw_data = b''.join( + b'\x00' + buf[span : span + width_byte_4] + for span in range((height - 1) * width_byte_4, -1, -width_byte_4) + ) + + def png_pack(png_tag, data): + chunk_head = png_tag + data + return ( + struct.pack('!I', len(data)) + + chunk_head + + struct.pack('!I', 0xFFFFFFFF & zlib.crc32(chunk_head)) + ) + + png_bytes = b''.join( + [ + b'\x89PNG\r\n\x1a\n', + png_pack(b'IHDR', struct.pack('!2I5B', width, height, 8, 6, 0, 0, 0)), + png_pack(b'IDAT', zlib.compress(raw_data, 9)), + png_pack(b'IEND', b''), + ] + ) + + return png_bytes + + +def _pillow_image_to_buffer(image: 'PIL.Image.Image', image_format: str) -> bytes: + img_byte_arr = io.BytesIO() + image.save(img_byte_arr, format=image_format) + img_bytes = img_byte_arr.getvalue() + return img_bytes + + +def _move_channel_axis( + tensor: np.ndarray, axis_layout: Tuple[str, str, str] = ('H', 'W', 'C') +) -> np.ndarray: + """Moves channel axis around.""" + channel_to_offset = {'H': 0, 'W': 1, 'C': 2} + permutation = tuple(channel_to_offset[axis] for axis in axis_layout) + return np.transpose(tensor, permutation) + + +def _to_image_tensor( + source: Union[str, bytes, io.BytesIO], + width: Optional[int] = None, + height: Optional[int] = None, +) -> 'np.ndarray': + """ + Convert an image blob to tensor + + :param source: binary blob or file path + :param width: the width of the image tensor. + :param height: the height of the tensor. + :return: image tensor + """ + from PIL import Image as PILImage + + raw_img = PILImage.open(source) + if width or height: + new_width = width or raw_img.width + new_height = height or raw_img.height + raw_img = raw_img.resize((new_width, new_height)) + try: + return np.array(raw_img.convert('RGB')) + except Exception: + return np.array(raw_img) diff --git a/poetry.lock b/poetry.lock index 88db66d8d36..c1f4138a6cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -946,6 +946,18 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "pillow" +version = "9.3.0" +description = "Python Imaging Library (Fork)" +category = "main" +optional = true +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pkgutil-resolve-name" version = "1.3.10" @@ -1369,6 +1381,14 @@ python-versions = ">=3.7" docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest"] +[[package]] +name = "types-pillow" +version = "9.3.0.1" +description = "Typing stubs for Pillow" +category = "main" +optional = true +python-versions = "*" + [[package]] name = "types-protobuf" version = "3.20.4.5" @@ -1469,12 +1489,13 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [extras] common = ["protobuf"] +image = ["pillow", "types-pillow"] torch = ["torch"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "9f20e49f31a6f56c379c1dca4b3a327dabff31cf217980ca9731deea7a4b821c" +content-hash = "7fabdc150fb15e67a5eff53967ccf3846e464d78544b12784953635d2866a64a" [metadata.files] anyio = [ @@ -1961,6 +1982,69 @@ pickleshare = [ {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] +pillow = [ + {file = "Pillow-9.3.0-1-cp37-cp37m-win32.whl", hash = "sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74"}, + {file = "Pillow-9.3.0-1-cp37-cp37m-win_amd64.whl", hash = "sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa"}, + {file = "Pillow-9.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2"}, + {file = "Pillow-9.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3"}, + {file = "Pillow-9.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe"}, + {file = "Pillow-9.3.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be55f8457cd1eac957af0c3f5ece7bc3f033f89b114ef30f710882717670b2a8"}, + {file = "Pillow-9.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d77adcd56a42d00cc1be30843d3426aa4e660cab4a61021dc84467123f7a00c"}, + {file = "Pillow-9.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:829f97c8e258593b9daa80638aee3789b7df9da5cf1336035016d76f03b8860c"}, + {file = "Pillow-9.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:801ec82e4188e935c7f5e22e006d01611d6b41661bba9fe45b60e7ac1a8f84de"}, + {file = "Pillow-9.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:871b72c3643e516db4ecf20efe735deb27fe30ca17800e661d769faab45a18d7"}, + {file = "Pillow-9.3.0-cp310-cp310-win32.whl", hash = "sha256:655a83b0058ba47c7c52e4e2df5ecf484c1b0b0349805896dd350cbc416bdd91"}, + {file = "Pillow-9.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:9f47eabcd2ded7698106b05c2c338672d16a6f2a485e74481f524e2a23c2794b"}, + {file = "Pillow-9.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:57751894f6618fd4308ed8e0c36c333e2f5469744c34729a27532b3db106ee20"}, + {file = "Pillow-9.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7db8b751ad307d7cf238f02101e8e36a128a6cb199326e867d1398067381bff4"}, + {file = "Pillow-9.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3033fbe1feb1b59394615a1cafaee85e49d01b51d54de0cbf6aa8e64182518a1"}, + {file = "Pillow-9.3.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22b012ea2d065fd163ca096f4e37e47cd8b59cf4b0fd47bfca6abb93df70b34c"}, + {file = "Pillow-9.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9a65733d103311331875c1dca05cb4606997fd33d6acfed695b1232ba1df193"}, + {file = "Pillow-9.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:502526a2cbfa431d9fc2a079bdd9061a2397b842bb6bc4239bb176da00993812"}, + {file = "Pillow-9.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:90fb88843d3902fe7c9586d439d1e8c05258f41da473952aa8b328d8b907498c"}, + {file = "Pillow-9.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:89dca0ce00a2b49024df6325925555d406b14aa3efc2f752dbb5940c52c56b11"}, + {file = "Pillow-9.3.0-cp311-cp311-win32.whl", hash = "sha256:3168434d303babf495d4ba58fc22d6604f6e2afb97adc6a423e917dab828939c"}, + {file = "Pillow-9.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:18498994b29e1cf86d505edcb7edbe814d133d2232d256db8c7a8ceb34d18cef"}, + {file = "Pillow-9.3.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:772a91fc0e03eaf922c63badeca75e91baa80fe2f5f87bdaed4280662aad25c9"}, + {file = "Pillow-9.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa4107d1b306cdf8953edde0534562607fe8811b6c4d9a486298ad31de733b2"}, + {file = "Pillow-9.3.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4012d06c846dc2b80651b120e2cdd787b013deb39c09f407727ba90015c684f"}, + {file = "Pillow-9.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77ec3e7be99629898c9a6d24a09de089fa5356ee408cdffffe62d67bb75fdd72"}, + {file = "Pillow-9.3.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:6c738585d7a9961d8c2821a1eb3dcb978d14e238be3d70f0a706f7fa9316946b"}, + {file = "Pillow-9.3.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:828989c45c245518065a110434246c44a56a8b2b2f6347d1409c787e6e4651ee"}, + {file = "Pillow-9.3.0-cp37-cp37m-win32.whl", hash = "sha256:82409ffe29d70fd733ff3c1025a602abb3e67405d41b9403b00b01debc4c9a29"}, + {file = "Pillow-9.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:41e0051336807468be450d52b8edd12ac60bebaa97fe10c8b660f116e50b30e4"}, + {file = "Pillow-9.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:b03ae6f1a1878233ac620c98f3459f79fd77c7e3c2b20d460284e1fb370557d4"}, + {file = "Pillow-9.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4390e9ce199fc1951fcfa65795f239a8a4944117b5935a9317fb320e7767b40f"}, + {file = "Pillow-9.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40e1ce476a7804b0fb74bcfa80b0a2206ea6a882938eaba917f7a0f004b42502"}, + {file = "Pillow-9.3.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a0a06a052c5f37b4ed81c613a455a81f9a3a69429b4fd7bb913c3fa98abefc20"}, + {file = "Pillow-9.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03150abd92771742d4a8cd6f2fa6246d847dcd2e332a18d0c15cc75bf6703040"}, + {file = "Pillow-9.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:15c42fb9dea42465dfd902fb0ecf584b8848ceb28b41ee2b58f866411be33f07"}, + {file = "Pillow-9.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:51e0e543a33ed92db9f5ef69a0356e0b1a7a6b6a71b80df99f1d181ae5875636"}, + {file = "Pillow-9.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3dd6caf940756101205dffc5367babf288a30043d35f80936f9bfb37f8355b32"}, + {file = "Pillow-9.3.0-cp38-cp38-win32.whl", hash = "sha256:f1ff2ee69f10f13a9596480335f406dd1f70c3650349e2be67ca3139280cade0"}, + {file = "Pillow-9.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:276a5ca930c913f714e372b2591a22c4bd3b81a418c0f6635ba832daec1cbcfc"}, + {file = "Pillow-9.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:73bd195e43f3fadecfc50c682f5055ec32ee2c933243cafbfdec69ab1aa87cad"}, + {file = "Pillow-9.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c7c8ae3864846fc95f4611c78129301e203aaa2af813b703c55d10cc1628535"}, + {file = "Pillow-9.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e0918e03aa0c72ea56edbb00d4d664294815aa11291a11504a377ea018330d3"}, + {file = "Pillow-9.3.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0915e734b33a474d76c28e07292f196cdf2a590a0d25bcc06e64e545f2d146c"}, + {file = "Pillow-9.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af0372acb5d3598f36ec0914deed2a63f6bcdb7b606da04dc19a88d31bf0c05b"}, + {file = "Pillow-9.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:ad58d27a5b0262c0c19b47d54c5802db9b34d38bbf886665b626aff83c74bacd"}, + {file = "Pillow-9.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:97aabc5c50312afa5e0a2b07c17d4ac5e865b250986f8afe2b02d772567a380c"}, + {file = "Pillow-9.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9aaa107275d8527e9d6e7670b64aabaaa36e5b6bd71a1015ddd21da0d4e06448"}, + {file = "Pillow-9.3.0-cp39-cp39-win32.whl", hash = "sha256:bac18ab8d2d1e6b4ce25e3424f709aceef668347db8637c2296bcf41acb7cf48"}, + {file = "Pillow-9.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:b472b5ea442148d1c3e2209f20f1e0bb0eb556538690fa70b5e1f79fa0ba8dc2"}, + {file = "Pillow-9.3.0-pp37-pypy37_pp73-macosx_10_10_x86_64.whl", hash = "sha256:ab388aaa3f6ce52ac1cb8e122c4bd46657c15905904b3120a6248b5b8b0bc228"}, + {file = "Pillow-9.3.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbb8e7f2abee51cef77673be97760abff1674ed32847ce04b4af90f610144c7b"}, + {file = "Pillow-9.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca31dd6014cb8b0b2db1e46081b0ca7d936f856da3b39744aef499db5d84d02"}, + {file = "Pillow-9.3.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c7025dce65566eb6e89f56c9509d4f628fddcedb131d9465cacd3d8bac337e7e"}, + {file = "Pillow-9.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:ebf2029c1f464c59b8bdbe5143c79fa2045a581ac53679733d3a91d400ff9efb"}, + {file = "Pillow-9.3.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:b59430236b8e58840a0dfb4099a0e8717ffb779c952426a69ae435ca1f57210c"}, + {file = "Pillow-9.3.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12ce4932caf2ddf3e41d17fc9c02d67126935a44b86df6a206cf0d7161548627"}, + {file = "Pillow-9.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae5331c23ce118c53b172fa64a4c037eb83c9165aba3a7ba9ddd3ec9fa64a699"}, + {file = "Pillow-9.3.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:0b07fffc13f474264c336298d1b4ce01d9c5a011415b79d4ee5527bb69ae6f65"}, + {file = "Pillow-9.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:073adb2ae23431d3b9bcbcff3fe698b62ed47211d0716b067385538a1b0f28b8"}, + {file = "Pillow-9.3.0.tar.gz", hash = "sha256:c935a22a557a560108d780f9a0fc426dd7459940dc54faa49d83249c8d3e760f"}, +] pkgutil-resolve-name = [ {file = "pkgutil_resolve_name-1.3.10-py3-none-any.whl", hash = "sha256:ca27cc078d25c5ad71a9de0a7a330146c4e014c2462d9af19c6b828280649c5e"}, {file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"}, @@ -2359,6 +2443,10 @@ traitlets = [ {file = "traitlets-5.5.0-py3-none-any.whl", hash = "sha256:1201b2c9f76097195989cdf7f65db9897593b0dfd69e4ac96016661bb6f0d30f"}, {file = "traitlets-5.5.0.tar.gz", hash = "sha256:b122f9ff2f2f6c1709dab289a05555be011c87828e911c0cf4074b85cb780a79"}, ] +types-pillow = [ + {file = "types-Pillow-9.3.0.1.tar.gz", hash = "sha256:f3b7cada3fa496c78d75253c6b1f07a843d625f42e5639b320a72acaff6f7cfb"}, + {file = "types_Pillow-9.3.0.1-py3-none-any.whl", hash = "sha256:79837755fe9659f29efd1016e9903ac4a500e0c73260483f07296bd6ca47668b"}, +] types-protobuf = [ {file = "types-protobuf-3.20.4.5.tar.gz", hash = "sha256:e9b45008d106e1d10cc77a29d2d344b85c0f01e2e643aaccf32f69e9e81b0cdd"}, {file = "types_protobuf-3.20.4.5-py3-none-any.whl", hash = "sha256:97af5ce70d890fdb94cb0c906f5a6624ca2fef58bc04e27990a25509e992a950"}, diff --git a/pyproject.toml b/pyproject.toml index 1ed1d039fea..22f2edcbeb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,10 +11,13 @@ pydantic = "^1.10.2" numpy = "^1.23.4" protobuf = { version = "^4.21.9", optional = true } torch = { version = "^1.0.0", optional = true } +pillow = {version = "^9.3.0", optional = true } +types-pillow = {version = "^9.3.0.1", optional = true } [tool.poetry.extras] common = ["protobuf"] torch = ["torch"] +image = ["pillow", "types-pillow"] [tool.poetry.dev-dependencies] pytest = "^5.2" diff --git a/tests/integrations/predefined_document/test_image.py b/tests/integrations/predefined_document/test_image.py index 44d0dc6021f..fd16ba037cf 100644 --- a/tests/integrations/predefined_document/test_image.py +++ b/tests/integrations/predefined_document/test_image.py @@ -2,10 +2,15 @@ from docarray import Image +REMOTE_JPG = ( + 'https://upload.wikimedia.org/wikipedia/commons/8/80/' + 'Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg' +) + def test_image(): - image = Image(url='http://jina.ai') + image = Image(url=REMOTE_JPG) image.tensor = image.url.load() diff --git a/tests/integrations/typing/test_typing_proto.py b/tests/integrations/typing/test_typing_proto.py index d95a1bfaf36..4b93ee01eff 100644 --- a/tests/integrations/typing/test_typing_proto.py +++ b/tests/integrations/typing/test_typing_proto.py @@ -19,7 +19,7 @@ class Mymmdoc(Document): torch_tensor=torch.zeros((3, 224, 224)), embedding=np.zeros((100, 1)), any_url='http://jina.ai', - image_url='http://jina.ai', + image_url='http://jina.ai/bla.jpg', ) new_doc = AnyDocument.from_protobuf(doc.to_protobuf()) diff --git a/tests/toydata/cube.ply b/tests/toydata/cube.ply new file mode 100644 index 00000000000..681156a7fc4 --- /dev/null +++ b/tests/toydata/cube.ply @@ -0,0 +1,24 @@ +ply +format ascii 1.0 +comment created by platoply +element vertex 8 +property float32 x +property float32 y +property float32 z +element face 6 +property list uint8 int32 vertex_indices +end_header +-1 -1 -1 +1 -1 -1 +1 1 -1 +-1 1 -1 +-1 -1 1 +1 -1 1 +1 1 1 +-1 1 1 +4 0 1 2 3 +4 5 4 7 6 +4 6 2 1 5 +4 3 7 4 0 +4 7 3 2 6 +4 5 1 0 4 diff --git a/tests/toydata/docs.csv b/tests/toydata/docs.csv new file mode 100644 index 00000000000..e01131614fc --- /dev/null +++ b/tests/toydata/docs.csv @@ -0,0 +1,3 @@ +source,url,question,answer,wrong_answer +testsrc,https://jina.ai,What are the symptoms?,Symptoms ...,As cases ... +testsrc,https://jina.ai,When should I get tested?,Your doctor ...,If you ... diff --git a/tests/toydata/docs.jsonlines b/tests/toydata/docs.jsonlines new file mode 100644 index 00000000000..58b3fbfe534 --- /dev/null +++ b/tests/toydata/docs.jsonlines @@ -0,0 +1,2 @@ +{"text": "a"} +{"text": "b"} diff --git a/tests/toydata/docs_groundtruth.jsonlines b/tests/toydata/docs_groundtruth.jsonlines new file mode 100644 index 00000000000..d34b0083853 --- /dev/null +++ b/tests/toydata/docs_groundtruth.jsonlines @@ -0,0 +1,2 @@ +{"document": {"text": "a"}, "groundtruth": {"text": "b"}} +{"document": {"text": "c"}, "groundtruth": {"text": "d"}} diff --git a/tests/toydata/hello.wav b/tests/toydata/hello.wav new file mode 100644 index 00000000000..81769d19b55 Binary files /dev/null and b/tests/toydata/hello.wav differ diff --git a/tests/toydata/image-data/05978.jpg b/tests/toydata/image-data/05978.jpg new file mode 100644 index 00000000000..3f0bf32e01d Binary files /dev/null and b/tests/toydata/image-data/05978.jpg differ diff --git a/tests/toydata/image-data/05979.jpg b/tests/toydata/image-data/05979.jpg new file mode 100644 index 00000000000..dfe446c44d4 Binary files /dev/null and b/tests/toydata/image-data/05979.jpg differ diff --git a/tests/toydata/image-data/05980.jpg b/tests/toydata/image-data/05980.jpg new file mode 100644 index 00000000000..edcb3c43c11 Binary files /dev/null and b/tests/toydata/image-data/05980.jpg differ diff --git a/tests/toydata/image-data/05981.jpg b/tests/toydata/image-data/05981.jpg new file mode 100644 index 00000000000..da800d007a4 Binary files /dev/null and b/tests/toydata/image-data/05981.jpg differ diff --git a/tests/toydata/image-data/05982.jpg b/tests/toydata/image-data/05982.jpg new file mode 100644 index 00000000000..d1c2d774d7b Binary files /dev/null and b/tests/toydata/image-data/05982.jpg differ diff --git a/tests/toydata/image-data/05983.jpg b/tests/toydata/image-data/05983.jpg new file mode 100644 index 00000000000..8cdaa3911cf Binary files /dev/null and b/tests/toydata/image-data/05983.jpg differ diff --git a/tests/toydata/image-data/05984-2.jpeg b/tests/toydata/image-data/05984-2.jpeg new file mode 100644 index 00000000000..13e0ccb7aa2 Binary files /dev/null and b/tests/toydata/image-data/05984-2.jpeg differ diff --git a/tests/toydata/image-data/05984.jpg b/tests/toydata/image-data/05984.jpg new file mode 100644 index 00000000000..13e0ccb7aa2 Binary files /dev/null and b/tests/toydata/image-data/05984.jpg differ diff --git a/tests/toydata/image-data/so_good.png b/tests/toydata/image-data/so_good.png new file mode 100644 index 00000000000..fa78310610f Binary files /dev/null and b/tests/toydata/image-data/so_good.png differ diff --git a/tests/toydata/mov_bbb.mp4 b/tests/toydata/mov_bbb.mp4 new file mode 100644 index 00000000000..0a4dd5b4017 Binary files /dev/null and b/tests/toydata/mov_bbb.mp4 differ diff --git a/tests/toydata/olleh.wav b/tests/toydata/olleh.wav new file mode 100644 index 00000000000..28523a7750f Binary files /dev/null and b/tests/toydata/olleh.wav differ diff --git a/tests/toydata/test.glb b/tests/toydata/test.glb new file mode 100644 index 00000000000..6b321d7de25 Binary files /dev/null and b/tests/toydata/test.glb differ diff --git a/tests/toydata/test.png b/tests/toydata/test.png new file mode 100644 index 00000000000..6853a1c069c Binary files /dev/null and b/tests/toydata/test.png differ diff --git a/tests/toydata/tetrahedron.mtl b/tests/toydata/tetrahedron.mtl new file mode 100644 index 00000000000..1bccd4474e4 --- /dev/null +++ b/tests/toydata/tetrahedron.mtl @@ -0,0 +1,22 @@ + +newmtl red +Ka 0.4449 0.0000 0.0000 +Kd 0.7714 0.0000 0.0000 +Ks 0.8857 0.0000 0.0000 +illum 2 +Ns 136.4300 + +newmtl lime +Ka 0.0000 0.5000 0.0000 +Kd 0.0000 1.0000 0.0000 +Ks 0.0000 0.5000 0.0000 +illum 2 +Ns 65.8900 + +newmtl gold +Ka 0.5265 0.2735 0.0122 +Kd 1.0000 0.5184 0.0286 +Ks 0.3000 0.3000 0.3000 +illum 2 +Ns 123.2600 + diff --git a/tests/toydata/tetrahedron.obj b/tests/toydata/tetrahedron.obj new file mode 100644 index 00000000000..40347bad7b7 --- /dev/null +++ b/tests/toydata/tetrahedron.obj @@ -0,0 +1,20 @@ +# tetrahedron.obj +# + +mtllib tetrahedron.mtl + +g tetrahedron + +v 1.00 1.00 1.00 +v 2.00 1.00 1.00 +v 1.00 2.00 1.00 +v 1.00 1.00 2.00 + +usemtl lime +f 1 3 2 +usemtl gold +f 1 4 3 +usemtl lime +f 1 2 4 +usemtl red +f 2 3 4 diff --git a/tests/units/typing/url/test_image_url.py b/tests/units/typing/url/test_image_url.py index 37fcf525d23..22280398358 100644 --- a/tests/units/typing/url/test_image_url.py +++ b/tests/units/typing/url/test_image_url.py @@ -1,11 +1,28 @@ +import os +import urllib + import numpy as np +import PIL +import pytest from pydantic.tools import parse_obj_as from docarray.typing import ImageUrl +CUR_DIR = os.path.dirname(os.path.abspath(__file__)) +PATH_TO_IMAGE_DATA = os.path.join(CUR_DIR, '..', '..', '..', 'toydata', 'image-data') +IMAGE_PATHS = { + 'png': os.path.join(PATH_TO_IMAGE_DATA, 'so_good.png'), + 'jpg': os.path.join(PATH_TO_IMAGE_DATA, '05984.jpg'), + 'jpeg': os.path.join(PATH_TO_IMAGE_DATA, '05984-2.jpeg'), +} +REMOTE_JPG = ( + 'https://upload.wikimedia.org/wikipedia/commons/8/80/' + 'Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg' +) + def test_image_url(): - uri = parse_obj_as(ImageUrl, 'http://jina.ai/img.png') + uri = parse_obj_as(ImageUrl, REMOTE_JPG) tensor = uri.load() @@ -14,6 +31,120 @@ def test_image_url(): def test_proto_image_url(): - uri = parse_obj_as(ImageUrl, 'http://jina.ai/img.png') + uri = parse_obj_as(ImageUrl, REMOTE_JPG) uri._to_node_protobuf() + + +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('remote-jpg', REMOTE_JPG), + ], +) +def test_load(image_format, path_to_img): + url = parse_obj_as(ImageUrl, path_to_img) + tensor = url.load() + assert isinstance(tensor, np.ndarray) + + +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('remote-jpg', REMOTE_JPG), + ], +) +@pytest.mark.parametrize('width,height', [(224, None), (None, 224), (224, 224)]) +def test_load_width_height(image_format, path_to_img, width, height): + url = parse_obj_as(ImageUrl, path_to_img) + tensor = url.load(width=width, height=height) + assert isinstance(tensor, np.ndarray) + + shape = tensor.shape + if width: + assert shape[1] == width + if height: + assert shape[0] == height + + +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('remote-jpg', REMOTE_JPG), + ], +) +@pytest.mark.parametrize( + 'axis_layout', + [ + ('H', 'W', 'C'), + ('H', 'C', 'W'), + ('C', 'H', 'W'), + ('C', 'W', 'H'), + ('W', 'C', 'H'), + ('W', 'H', 'C'), + ], +) +def test_load_channel_axis(image_format, path_to_img, axis_layout): + sizes = {'H': 100, 'W': 200, 'C': 3} + url = parse_obj_as(ImageUrl, path_to_img) + tensor = url.load(axis_layout=axis_layout, height=sizes['H'], width=sizes['W']) + assert isinstance(tensor, np.ndarray) + + shape = tensor.shape + for axis, axis_name in enumerate(axis_layout): + assert shape[axis] == sizes[axis_name] + + +def test_load_timeout(): + url = parse_obj_as(ImageUrl, REMOTE_JPG) + with pytest.raises(urllib.error.URLError): + _ = url.load(timeout=0.001) + + +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('jpg', REMOTE_JPG), + ], +) +def test_load_to_bytes(image_format, path_to_img): + w, h = 224, 224 + url = parse_obj_as(ImageUrl, path_to_img) + _bytes = url.load_to_bytes(width=w, height=h) + assert isinstance(_bytes, bytes) + img = PIL.Image.frombytes(mode='1', size=(w, h), data=_bytes) + assert isinstance(img, PIL.Image.Image) + + +@pytest.mark.parametrize( + 'image_format,path_to_img', + [ + ('png', IMAGE_PATHS['png']), + ('jpg', IMAGE_PATHS['jpg']), + ('jpeg', IMAGE_PATHS['jpeg']), + ('jpg', REMOTE_JPG), + ('illegal', 'illegal'), + ('illegal', 'https://www.google.com'), + ('illegal', 'my/local/text/file.txt'), + ], +) +def test_validation(image_format, path_to_img): + if image_format == 'illegal': + with pytest.raises(ValueError): + parse_obj_as(ImageUrl, path_to_img) + else: + url = parse_obj_as(ImageUrl, path_to_img) + assert isinstance(url, ImageUrl) + assert isinstance(url, str)