Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
941c0f9
feat: add tensor type for ndarray
JohannesMessner Nov 11, 2022
4a0f7bf
fix: fix mypy typing
JohannesMessner Nov 11, 2022
f451044
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-rewri…
JohannesMessner Nov 15, 2022
04583d4
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-rewri…
JohannesMessner Nov 16, 2022
1d9eaaf
feat: torch tensor type
JohannesMessner Nov 16, 2022
e820675
fix: protobuf for pytorch type
JohannesMessner Nov 16, 2022
4b449aa
ci: install all extras in the ci
JohannesMessner Nov 16, 2022
633b701
refactor: make nice looking
JohannesMessner Nov 16, 2022
492659e
docs: update docarray/typing/tensor/torch_tensor.py
JohannesMessner Nov 16, 2022
a5c1a31
refactor: code style
JohannesMessner Nov 16, 2022
8df13c1
fix: black and mypy
JohannesMessner Nov 16, 2022
28f1ed0
fix: suppress mypy import error
JohannesMessner Nov 16, 2022
4df03ef
ci: fix ci install
JohannesMessner Nov 16, 2022
81f7810
feat: add new type for image urls
JohannesMessner Nov 17, 2022
d14d4af
feat: add new type for image urls
JohannesMessner Nov 17, 2022
370b7fa
Merge branch 'feat-rewrite-v2' into feat-image-url
JohannesMessner Nov 18, 2022
3b6d5b9
test: add real existing url to test
JohannesMessner Nov 18, 2022
ba8adf0
test: test output of image buffer loading
JohannesMessner Nov 18, 2022
75a440c
feat: add validation for image url
JohannesMessner Nov 18, 2022
5b90769
feat: specify image axis permutation
JohannesMessner Nov 18, 2022
289ee15
docs: add docstrings
JohannesMessner Nov 18, 2022
f8a8129
docs: make strings uniform
JohannesMessner Nov 18, 2022
4512286
test: pass valid url as imageurl
JohannesMessner Nov 18, 2022
5ed9ec1
Merge branch 'feat-rewrite-v2' into feat-image-url
JohannesMessner Nov 21, 2022
5452e09
Merge branch 'feat-rewrite-v2' into feat-image-url
JohannesMessner Nov 21, 2022
d9cbef6
chore: fix dependencies
JohannesMessner Nov 21, 2022
ec6ac68
chore(ci): add extras to mypy ci
samsja Nov 21, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install poetry
poetry install
poetry install --all-extras
poetry run mypy docarray

# prep-testbed:
Expand Down
34 changes: 32 additions & 2 deletions docarray/typing/url/any_url.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
from typing import Type, TypeVar
from typing import TYPE_CHECKING, Type, TypeVar

from pydantic import AnyUrl as BaseAnyUrl
from pydantic import parse_obj_as
from pydantic import errors, parse_obj_as

from docarray.document.base_node import BaseNode
from docarray.proto import NodeProto

if TYPE_CHECKING:
from pydantic.networks import Parts

T = TypeVar('T', bound='AnyUrl')


class AnyUrl(BaseAnyUrl, BaseNode):
host_required = (
False # turn off host requirement to allow passing of local paths as URL
)

def _to_node_protobuf(self) -> NodeProto:
"""Convert Document into a NodeProto protobuf message. This function should
be called when the Document is nested into another Document that need to
Expand All @@ -19,6 +26,29 @@ def _to_node_protobuf(self) -> NodeProto:
"""
return NodeProto(any_url=str(self))

@classmethod
def validate_parts(cls, parts: 'Parts', validate_port: bool = True) -> 'Parts':
    """
    Check the individual parts of a parsed URL.

    The URL types in this package have to work for remote resources as well
    as for local files, so a URL without a `scheme` (i.e. a plain file path)
    is accepted here.

    :param parts: the parsed URL parts; `scheme`, `port` and `user` are read
    :param validate_port: if True, `parts['port']` is handed to
        `cls._validate_port`
    :return: the unchanged `parts`
    :raises errors.UrlSchemePermittedError: if a scheme is given, the class
        restricts `allowed_schemes`, and the scheme is not one of them
    :raises errors.UrlUserInfoError: if the class requires a user and none
        is present
    """
    scheme = parts['scheme']
    # a missing scheme is tolerated on purpose; only a present scheme is
    # checked against the allow-list
    scheme_not_permitted = (
        scheme is not None
        and cls.allowed_schemes
        and scheme.lower() not in cls.allowed_schemes
    )
    if scheme_not_permitted:
        raise errors.UrlSchemePermittedError(set(cls.allowed_schemes))

    if validate_port:
        cls._validate_port(parts['port'])

    user = parts['user']
    user_missing = user is None
    if cls.user_required and user_missing:
        raise errors.UrlUserInfoError()

    return parts

@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'str') -> T:
"""
Expand Down
47 changes: 47 additions & 0 deletions docarray/typing/url/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
import urllib.parse
import urllib.request
from contextlib import nullcontext


def _uri_to_blob(uri: str, timeout=None) -> bytes:
"""Convert uri to blob
Internally it reads uri into blob.
:param uri: the uri of Document
:param timeout: timeout for urlopen. Only relevant if uri is not local
:return: blob bytes.
Comment thread
samsja marked this conversation as resolved.
"""
if urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}:
req = urllib.request.Request(uri, headers={'User-Agent': 'Mozilla/5.0'})
urlopen_kwargs = {'timeout': timeout} if timeout is not None else {}
with urllib.request.urlopen(req, **urlopen_kwargs) as fp:
return fp.read()
elif os.path.exists(uri):
with open(uri, 'rb') as fp:
return fp.read()
else:
raise FileNotFoundError(f'`{uri}` is not a URL or a valid local path')


def _get_file_context(file):
if hasattr(file, 'write'):
file_ctx = nullcontext(file)
else:
file_ctx = open(file, 'wb')

return file_ctx


def _is_uri(value: str) -> bool:
scheme = urllib.parse.urlparse(value).scheme
return (
(scheme in {'http', 'https'})
or (scheme in {'data'})
or os.path.exists(value)
or os.access(os.path.dirname(value), os.W_OK)
)


def _is_datauri(value: str) -> bool:
scheme = urllib.parse.urlparse(value).scheme
return scheme in {'data'}
248 changes: 243 additions & 5 deletions docarray/typing/url/image_url.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
import io
import struct
from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, TypeVar, Union

import numpy as np

from docarray.proto import NodeProto
from docarray.typing.url.any_url import AnyUrl
from docarray.typing.url.helper import _uri_to_blob

if TYPE_CHECKING:
import PIL
from pydantic import BaseConfig
from pydantic.fields import ModelField

T = TypeVar('T', bound='ImageUrl')

IMAGE_FILE_FORMATS = ('png', 'jpeg', 'jpg')


class ImageUrl(AnyUrl):
    """
    URL to a .png, .jpeg, or .jpg file.
    Can be remote (web) URL, or a local file path.
    """

    def _to_node_protobuf(self) -> NodeProto:
        """Convert this URL into a NodeProto protobuf message.

        :return: a NodeProto whose `image_url` field holds this URL as a string
        """
        return NodeProto(image_url=str(self))

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[T, np.ndarray, Any],
        field: 'ModelField',
        config: 'BaseConfig',
    ) -> T:
        """Validate `value` as a URL that points to an image file.

        The value first goes through the parent class' URL validation and is
        then required to end in one of the extensions in `IMAGE_FILE_FORMATS`.
        The extension must be preceded by a dot and is compared
        case-insensitively, so `img.PNG` is accepted while `imgpng` is not.

        :param value: the value to validate
        :param field: passed through to the parent class' `validate`
        :param config: passed through to the parent class' `validate`
        :return: an instance of this class built from the validated URL
        :raises ValueError: if the URL has no supported image extension
        """
        url = super().validate(value, field, config)  # basic url validation
        image_suffixes = tuple(f'.{ext}' for ext in IMAGE_FILE_FORMATS)
        if not url.lower().endswith(image_suffixes):
            raise ValueError(
                f'Image URL must have one of the following extensions:'
                f'{IMAGE_FILE_FORMATS}'
            )
        return cls(str(url), scheme=None)

    def load(
        self,
        width: Optional[int] = None,
        height: Optional[int] = None,
        axis_layout: Tuple[str, str, str] = ('H', 'W', 'C'),
        timeout: Optional[float] = None,
    ) -> np.ndarray:
        """
        Load the data from the url into a numpy.ndarray image tensor

        EXAMPLE USAGE

        .. code-block:: python

            from docarray import Document
            from docarray.typing import ImageUrl
            import numpy as np


            class MyDoc(Document):
                img_url: ImageUrl


            doc = MyDoc(
                img_url="https://upload.wikimedia.org/wikipedia/commons/8/80/"
                "Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg"
            )

            img_tensor = doc.img_url.load()
            assert isinstance(img_tensor, np.ndarray)

            img_tensor = doc.img_url.load(height=224, width=224)
            assert img_tensor.shape == (224, 224, 3)

            layout = ('C', 'W', 'H')
            img_tensor = doc.img_url.load(height=100, width=200, axis_layout=layout)
            assert img_tensor.shape == (3, 200, 100)


        :param width: width of the image tensor.
        :param height: height of the image tensor.
        :param axis_layout: ordering of the different image axes.
            'H' = height, 'W' = width, 'C' = color channel
        :param timeout: timeout (sec) for urlopen network request.
            Only relevant if URL is not local
        :return: np.ndarray representing the image as RGB values
        """
        # read the raw bytes, decode (and optionally resize) them, then
        # reorder the axes to the layout the caller asked for
        buffer = _uri_to_blob(self, timeout=timeout)
        tensor = _to_image_tensor(io.BytesIO(buffer), width=width, height=height)
        return _move_channel_axis(tensor, axis_layout=axis_layout)

    def load_to_bytes(
        self,
        image_format: str = 'png',
        width: Optional[int] = None,
        height: Optional[int] = None,
        timeout: Optional[float] = None,
    ) -> bytes:
        """Load image at URL to bytes (buffer).

        EXAMPLE USAGE

        .. code-block:: python

            from docarray import Document
            from docarray.typing import ImageUrl
            import numpy as np


            class MyDoc(Document):
                img_url: ImageUrl


            doc = MyDoc(
                img_url="https://upload.wikimedia.org/wikipedia/commons/8/80/"
                "Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg"
            )

            img_bytes = doc.img_url.load_to_bytes(image_format='jpg')
            assert isinstance(img_bytes, bytes)

        :param image_format: File format the loaded image is encoded into.
            Supported formats are `png`, `jpg`, and `jpeg`.
        :param width: Before converting to bytes, resize the image to this width.
        :param height: Before converting to bytes, resize the image to this height.
        :param timeout: timeout (sec) for urlopen network request.
            Only relevant if URL is not local
        :return: The image as bytes (buffer).
        """
        image_tensor = self.load(width=width, height=height, timeout=timeout)
        return _image_tensor_to_bytes(image_tensor, image_format=image_format)


def _image_tensor_to_bytes(arr: np.ndarray, image_format: str) -> bytes:
    """
    Encode an image ndarray into image file bytes.

    The array is cast to `uint8` and squeezed first. What happens next
    depends on the number of remaining dimensions:

    - 1 dim: handed to `_png_to_buffer_1d` as a 28 x 28 image (meant only
      for flattened MNIST/FashionMNIST samples); `image_format` is not used
      on this path.
    - 2 dims: encoded through Pillow in mode `L`.
    - 3 dims: encoded through Pillow in mode `RGB`.

    :param arr: the image data.
    :param image_format: one of `IMAGE_FILE_FORMATS`; `jpg` is treated as
        `jpeg`.
    :return: the encoded image as bytes.
    :raises ValueError: if `image_format` is not supported, or the squeezed
        array does not have 1, 2 or 3 dimensions.
    """
    if image_format not in IMAGE_FILE_FORMATS:
        raise ValueError(
            f'image_format must be one of {IMAGE_FILE_FORMATS},'
            f'receiving `{image_format}`'
        )
    # 'jpg' is only an alias; the encoder is addressed as 'JPEG'
    pillow_format = ('jpeg' if image_format == 'jpg' else image_format).upper()

    pixels = arr.astype(np.uint8).squeeze()
    n_dims = pixels.ndim

    if n_dims == 1:
        # note this should be only used for MNIST/FashionMNIST dataset,
        # no other image data should be flattened into a 1-dim array.
        return _png_to_buffer_1d(pixels, 28, 28)

    if n_dims not in (2, 3):
        raise ValueError(
            f'{pixels.shape} ndarray can not be converted into an image buffer.'
        )

    from PIL import Image

    pillow_mode = 'L' if n_dims == 2 else 'RGB'
    pillow_image = Image.fromarray(pixels).convert(pillow_mode)
    return _pillow_image_to_buffer(pillow_image, image_format=pillow_format)


def _png_to_buffer_1d(arr: np.ndarray, width: int, height: int) -> bytes:
import zlib

pixels = []
for p in arr[::-1]:
pixels.extend([p, p, p, 255])
buf = bytearray(pixels)

# reverse the vertical line order and add null bytes at the start
width_byte_4 = width * 4
raw_data = b''.join(
b'\x00' + buf[span : span + width_byte_4]
for span in range((height - 1) * width_byte_4, -1, -width_byte_4)
)

def png_pack(png_tag, data):
chunk_head = png_tag + data
return (
struct.pack('!I', len(data))
+ chunk_head
+ struct.pack('!I', 0xFFFFFFFF & zlib.crc32(chunk_head))
)

png_bytes = b''.join(
[
b'\x89PNG\r\n\x1a\n',
png_pack(b'IHDR', struct.pack('!2I5B', width, height, 8, 6, 0, 0, 0)),
png_pack(b'IDAT', zlib.compress(raw_data, 9)),
png_pack(b'IEND', b''),
]
)

return png_bytes


def _pillow_image_to_buffer(image: 'PIL.Image.Image', image_format: str) -> bytes:
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format=image_format)
img_bytes = img_byte_arr.getvalue()
return img_bytes


def _move_channel_axis(
tensor: np.ndarray, axis_layout: Tuple[str, str, str] = ('H', 'W', 'C')
) -> np.ndarray:
"""Moves channel axis around."""
channel_to_offset = {'H': 0, 'W': 1, 'C': 2}
permutation = tuple(channel_to_offset[axis] for axis in axis_layout)
return np.transpose(tensor, permutation)


def _to_image_tensor(
    source: Union[str, bytes, io.BytesIO],
    width: Optional[int] = None,
    height: Optional[int] = None,
) -> 'np.ndarray':
    """
    Open an image with Pillow and return it as a numpy array.

    If `width` and/or `height` is given (and non-zero), the image is resized
    first; a missing dimension keeps the image's own size.

    :param source: binary blob or file path
    :param width: the width of the image tensor.
    :param height: the height of the tensor.
    :return: the image converted to RGB as an array; if that conversion
        fails, the unconverted image as an array
    """
    from PIL import Image as PILImage

    image = PILImage.open(source)

    if width or height:
        target_size = (width or image.width, height or image.height)
        image = image.resize(target_size)

    try:
        rgb_image = image.convert('RGB')
        return np.array(rgb_image)
    except Exception:
        # best effort: fall back to the image as opened when the RGB
        # conversion is not possible
        return np.array(image)
Loading