diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..42b35de --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,71 @@ +# Base stage: Build your environment with CUDA and Python dependencies +FROM nvidia/cuda:12.8.0-devel-ubuntu24.04 AS base + +# Set environment variables for CUDA and Python +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + DEBIAN_FRONTEND=noninteractive \ + NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility + +# Install required packages +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + nano \ + git \ + apt-utils \ + sudo \ + wget \ + python3-dev \ + python3-pip \ + python3-venv \ + build-essential \ + libnss3 \ + libatk-bridge2.0-0 \ + libatk1.0-0 \ + libxcomposite1 \ + libxdamage1 \ + libxrandr2 \ + libgtk-3-0 \ + libgbm1 \ + xvfb \ + libpq-dev \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Ensure "python" points to "python3" +RUN ln -s /usr/bin/python3 /usr/bin/python + +# Upgrade pip and install TensorFlow with GPU support +RUN pip install --break-system-packages --upgrade --ignore-installed pip setuptools wheel \ +&& pip install --break-system-packages tensorflow[and-cuda]==2.19.0 + +# Define user parameters +ARG USERNAME=vscode +ARG USER_UID=1001 +ARG USER_GID=1001 + +# Create group and user if they don't exist +RUN if ! getent group $USER_GID; then groupadd --gid $USER_GID $USERNAME; fi && \ +if ! 
id -u $USERNAME >/dev/null 2>&1; then \ +useradd -s /bin/bash --uid $USER_UID --gid $USER_GID -m $USERNAME; \ +fi && \ +echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME && \ +chmod 0440 /etc/sudoers.d/$USERNAME + +# Set working directory and default user in base stage +WORKDIR /home/$USERNAME +USER $USERNAME + +# Final stage: Use the base image with all modifications +FROM base AS dev_containers_target_stage + +# Explicitly set the user and working directory (this ensures the user entry is present) +USER vscode +WORKDIR /home/vscode + +# Disable the warning about pip being installed in a system directory (this is a workaround for the warning) use only inside dockerfile +RUN python -m pip config set global.break-system-packages true + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..a53f3f1 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,32 @@ +{ + "name": "Python 3", + "context": "..", + "dockerFile": "Dockerfile", + + "remoteUser": "vscode", + + "customizations": { + "env": { + "PYTHONPATH": "/workspace:${PYTHONPATH}" + }, + "vscode": { + "extensions": [ + "ms-python.python", + "mhutchie.git-graph", + "vscode-icons-team.vscode-icons" + ] + } + }, + + // Add the following lines to enable GPU + "runArgs": [ + "--gpus", "all" // Explicitly target GPU 1 + ], + + // Ensures NVIDIA runtime is used + "hostRequirements": { + "gpu": true + } + // Uncomment the next line to run commands after the container is created. 
+ // "postCreateCommand": "bash .devcontainer/setup.sh" +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index e665169..7ccf541 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ __pycache__ *.egg-info *.pyc -venv +venv* Datasets/* Models/* diff --git a/CHANGELOG.md b/CHANGELOG.md index b9988a4..8707d4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,21 @@ +## [2.0.0.dev1] - 2025-03-31 +### Added +- New `use_cache` parameter in the handwriting recognition tutorial +- Type annotations and metadata handling in various callbacks and preprocessors +- Added .devcontainer configuration for VSCode, enabling a consistent Docker development environment + +### Changed +- Version bump from 1.2.5 to 2.0.0.dev1 (updating mltu compatibility with TensorFlow 2.19) +- `.gitignore` updated to include `venv*` +- Tutorials now assert `mltu` must be version 1.2.5 or lower for backward compatibility +- Training configs updated (e.g., batch size from 16 to 128) +- Code transitioned from `tf.keras` to core `keras` modules +- Data providers updated with multiprocessing and other kwargs +- Various logging, import handling, and minor bug fixes throughout the codebase + +### TODO +- Remove `# pyright: ignore` comments that temporarily suppress type checking errors + ## [1.2.5] - 2024-05-04 ### Added - Added exception in `mltu.dataProvider.DataProvider` to raise ValueError when dataset is not iterable diff --git a/Tutorials/01_image_to_word/train.py b/Tutorials/01_image_to_word/train.py index 2fd214a..7b358dd 100644 --- a/Tutorials/01_image_to_word/train.py +++ b/Tutorials/01_image_to_word/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import os from tqdm import tqdm import tensorflow as tf diff --git a/Tutorials/02_captcha_to_text/train.py b/Tutorials/02_captcha_to_text/train.py index 4ceb03e..86967c2 100644 --- 
a/Tutorials/02_captcha_to_text/train.py +++ b/Tutorials/02_captcha_to_text/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import tensorflow as tf try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] except: pass diff --git a/Tutorials/03_handwriting_recognition/configs.py b/Tutorials/03_handwriting_recognition/configs.py index db01bf6..06b6680 100644 --- a/Tutorials/03_handwriting_recognition/configs.py +++ b/Tutorials/03_handwriting_recognition/configs.py @@ -11,7 +11,7 @@ def __init__(self): self.height = 32 self.width = 128 self.max_text_length = 0 - self.batch_size = 16 - self.learning_rate = 0.0005 + self.batch_size = 128 + self.learning_rate = 0.001 self.train_epochs = 1000 self.train_workers = 20 \ No newline at end of file diff --git a/Tutorials/03_handwriting_recognition/train.py b/Tutorials/03_handwriting_recognition/train.py index 2a00f29..b359526 100644 --- a/Tutorials/03_handwriting_recognition/train.py +++ b/Tutorials/03_handwriting_recognition/train.py @@ -4,6 +4,10 @@ from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + from mltu.preprocessors import ImageReader from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2 from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen @@ -88,6 +92,7 @@ def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): LabelIndexer(configs.vocab), LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), ], + use_cache=True ) # Split the dataset into training and validation sets diff --git 
a/Tutorials/04_sentence_recognition/train.py b/Tutorials/04_sentence_recognition/train.py index 0bce0f7..b8d7c29 100644 --- a/Tutorials/04_sentence_recognition/train.py +++ b/Tutorials/04_sentence_recognition/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import tensorflow as tf try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] except: pass diff --git a/Tutorials/05_sound_to_text/train.py b/Tutorials/05_sound_to_text/train.py index 91ad68e..fe941cd 100644 --- a/Tutorials/05_sound_to_text/train.py +++ b/Tutorials/05_sound_to_text/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import tensorflow as tf try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] except: pass diff --git a/Tutorials/06_pytorch_introduction/train.py b/Tutorials/06_pytorch_introduction/train.py index f5155f0..93eb0ff 100644 --- a/Tutorials/06_pytorch_introduction/train.py +++ b/Tutorials/06_pytorch_introduction/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import os import cv2 import numpy as np diff --git a/Tutorials/07_pytorch_wrapper/train.py b/Tutorials/07_pytorch_wrapper/train.py index 9f17c85..32250a9 100644 --- a/Tutorials/07_pytorch_wrapper/train.py +++ b/Tutorials/07_pytorch_wrapper/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import os import numpy as np import requests, gzip, os, hashlib diff --git 
a/Tutorials/08_handwriting_recognition_torch/train_torch.py b/Tutorials/08_handwriting_recognition_torch/train_torch.py index 8cf185e..b80e5cb 100644 --- a/Tutorials/08_handwriting_recognition_torch/train_torch.py +++ b/Tutorials/08_handwriting_recognition_torch/train_torch.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import os import tarfile from tqdm import tqdm diff --git a/Tutorials/09_translation_transformer/train.py b/Tutorials/09_translation_transformer/train.py index 0658a13..175e4ef 100644 --- a/Tutorials/09_translation_transformer/train.py +++ b/Tutorials/09_translation_transformer/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import numpy as np import tensorflow as tf diff --git a/Tutorials/10_wav2vec2_torch/train.py b/Tutorials/10_wav2vec2_torch/train.py index eb6c68c..0b8e636 100644 --- a/Tutorials/10_wav2vec2_torch/train.py +++ b/Tutorials/10_wav2vec2_torch/train.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import os import tarfile import pandas as pd diff --git a/Tutorials/10_wav2vec2_torch/train_tf.py b/Tutorials/10_wav2vec2_torch/train_tf.py index 93c8fb2..08bd1dc 100644 --- a/Tutorials/10_wav2vec2_torch/train_tf.py +++ b/Tutorials/10_wav2vec2_torch/train_tf.py @@ -1,3 +1,7 @@ +# For this to work you need to install mltu version 1.2.5 or lower +import mltu +assert mltu.__version__ == "1.2.5", "mltu version must be 1.2.5 or lower" + import tensorflow as tf try: [ diff --git a/Tutorials/12_handwriting_recognition_2/configs.py b/Tutorials/12_handwriting_recognition_2/configs.py new file mode 100644 index 0000000..612e442 --- /dev/null +++ 
def download_and_unzip(url: str, extract_to: str = "Datasets", chunk_size: int = 1024 * 1024) -> None:
    """Download a ZIP archive from ``url`` and extract it into ``extract_to``.

    The whole archive is buffered in memory before extraction. The response
    is read until EOF, so servers that do not announce a Content-Length are
    supported as well; in that case the progress bar simply has no total.

    Args:
        url (str): The URL of the ZIP file to download.
        extract_to (str): The directory to extract files to. Defaults to "Datasets".
        chunk_size (int): The maximum number of bytes requested per read.
            Defaults to 1024 * 1024.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
        ConnectionError: If the URL cannot be opened.
        OSError: If the downloaded data cannot be opened or extracted as a ZIP file.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    try:
        http_response = urlopen(url)
    except Exception as e:
        raise ConnectionError(f"Failed to open URL: {url}") from e

    chunks: List[bytes] = []
    # The response object is a context manager; closing it releases the connection
    # even when a read fails half-way through.
    with http_response:
        # Not every response type exposes `length`, and it is None when the
        # server sent no Content-Length; tqdm accepts total=None.
        total_bytes = getattr(http_response, "length", None)
        with tqdm(total=total_bytes, unit="B", unit_scale=True, desc="Downloading") as progress:
            while True:
                chunk = http_response.read(chunk_size)
                if not chunk:  # empty bytes signals EOF
                    break
                chunks.append(chunk)
                progress.update(len(chunk))

    try:
        with ZipFile(BytesIO(b"".join(chunks))) as archive:
            archive.extractall(path=extract_to)
    except Exception as e:
        raise OSError("Failed to extract ZIP file.") from e


def process_line(line: str, dataset_path: str) -> Optional[Tuple[str, str]]:
    """Parse one line of the dataset metadata file into an (image path, label) pair.

    The first whitespace-separated field is the sample id, the second is a
    status flag and the last one is the label. The image is looked up at
    ``<dataset_path>/words/<id[:3]>/<first two dash-parts of id>/<id>.png``.

    Args:
        line (str): A single line containing image metadata.
        dataset_path (str): The base path of the dataset.

    Returns:
        Optional[Tuple[str, str]]:
            ``(image_path, label)`` where ``image_path`` already includes
            ``dataset_path``; ``None`` for comment lines, lines with fewer
            than two fields, samples flagged "err" and samples whose image
            file does not exist.
    """
    if line.startswith("#"):
        return None

    # Split on any whitespace run: this drops the trailing newline (including a
    # stray "\r" from CRLF input) and never yields empty fields, so the label
    # cannot come out empty or carry line-ending characters.
    fields = line.split()
    if len(fields) < 2 or fields[1] == "err":
        return None

    sample_id = fields[0]
    label = fields[-1]

    top_folder = sample_id[:3]
    sub_folder = "-".join(sample_id.split("-")[:2])
    image_path = os.path.join(dataset_path, "words", top_folder, sub_folder, sample_id + ".png")
    if not os.path.isfile(image_path):
        return None

    return image_path, label


def load_dataset(dataset_path: str, limit: Optional[int] = 1000) -> Tuple[List[List[str]], Set[str], int]:
    """Load the dataset described by 'words.txt' inside ``dataset_path``.

    Every metadata line is passed through ``process_line`` on a thread pool
    (the per-line work is a filesystem existence check), and the valid
    entries are collected in file order.

    Args:
        dataset_path (str): The path to the dataset folder containing 'words.txt'.
        limit (Optional[int]): Only the first ``limit`` lines of 'words.txt'
            are processed. Defaults to 1000, the reduced demonstration size;
            pass ``None`` to process the whole file.

    Returns:
        Tuple[List[List[str]], Set[str], int]:
            A tuple containing:
            - dataset (List[List[str]]): A list of [image_path, label] entries.
            - vocab (Set[str]): A set of unique characters found in labels.
            - max_len (int): The longest label length (0 when nothing is valid).

    Raises:
        ValueError: If ``limit`` is negative.
        FileNotFoundError: If 'words.txt' does not exist in ``dataset_path``.
    """
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be None or a non-negative integer, got {limit}")

    dataset: List[List[str]] = []
    vocab: Set[str] = set()
    max_len: int = 0

    words_file = os.path.join(dataset_path, "words.txt")
    if not os.path.exists(words_file):
        raise FileNotFoundError(f"Could not find 'words.txt' at {words_file}")

    with open(words_file, "r", encoding="utf-8") as file_obj:
        # Slicing with None keeps every line.
        lines = file_obj.readlines()[:limit]

    with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        results = list(
            tqdm(
                executor.map(lambda ln: process_line(ln, dataset_path), lines),
                total=len(lines),
                desc="Processing lines",
            )
        )

    for entry in results:
        if entry is None:
            continue
        image_path, label = entry
        dataset.append([image_path, label])
        vocab.update(label)  # a str iterates character by character
        max_len = max(max_len, len(label))

    return dataset, vocab, max_len
def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2):
    """Build the residual-CNN + bidirectional-LSTM model used by this tutorial.

    The graph scales the input by 1/255, runs it through nine
    ``residual_block`` stages (three of them with stride 2), reshapes the
    resulting feature map into a sequence, applies a bidirectional LSTM
    followed by dropout, and ends in a softmax layer.

    Args:
        input_dim: Shape passed to ``layers.Input`` (without the batch axis).
        output_dim: Number of classes; the output layer has ``output_dim + 1``
            units (the extra unit is presumably the CTC blank -- confirm
            against the loss used by the caller).
        activation: Activation name forwarded to every ``residual_block``.
        dropout: Dropout rate forwarded to every ``residual_block`` and used
            after the bidirectional LSTM.

    Returns:
        A ``keras.models.Model`` mapping the "input" layer to the "output"
        layer. The model is not compiled here.
    """
    # (filters, skip_conv, strides) for each residual stage, applied in order.
    stage_specs = (
        (16, True, 1),
        (16, True, 2),
        (16, False, 1),
        (32, True, 2),
        (32, False, 1),
        (64, True, 2),
        (64, True, 1),
        (64, False, 1),
        (64, False, 1),
    )

    inputs = layers.Input(shape=input_dim, name="input")

    # normalize images here instead in preprocessing step
    features = layers.Lambda(lambda x: x / 255)(inputs)

    for filters, skip_conv, strides in stage_specs:
        features = residual_block(
            features,
            filters,
            activation=activation,
            skip_conv=skip_conv,
            strides=strides,
            dropout=dropout,
        )

    # Merge the two axes in front of the channel axis into one sequence axis.
    sequence_length = features.shape[-3] * features.shape[-2]
    channels = features.shape[-1]
    sequence = layers.Reshape((sequence_length, channels))(features)

    # tf2onnx is only supporting converting LSTM layer with unroll=True
    recurrent = layers.LSTM(128, return_sequences=True, unroll=True)
    encoded = layers.Bidirectional(recurrent)(sequence)
    encoded = layers.Dropout(dropout)(encoded)

    output = layers.Dense(output_dim + 1, activation="softmax", name="output")(encoded)

    return keras.models.Model(inputs=inputs, outputs=output)
mltu.tensorflow.dataProvider import DataProvider +from mltu.tensorflow.losses import CTCloss +from mltu.tensorflow.callbacks import Model2onnx, TrainLogger +from mltu.tensorflow.metrics import CERMetric, WERMetric + +from model import train_model +from configs import ModelConfigs + +import os +import tarfile +from data_utils import download_and_unzip, load_dataset + +dataset_path = os.path.join("Datasets", "IAM_Words") +if not os.path.exists(dataset_path): + download_and_unzip("https://git.io/J0fjL", extract_to="Datasets") + + file = tarfile.open(os.path.join(dataset_path, "words.tgz")) + file.extractall(os.path.join(dataset_path, "words")) + +dataset, vocab, max_len = load_dataset(dataset_path) + +# Create a ModelConfigs object to store model configurations +configs = ModelConfigs() +configs.vocab = "".join(sorted(vocab)) +configs.max_text_length = max_len +configs.save() + +# Create a data provider for the dataset +data_provider = DataProvider( + dataset=dataset, + skip_validation=True, + batch_size=configs.batch_size, + data_preprocessors=[ImageReader(CVImage)], + transformers=[ + ImageResizer(configs.width, configs.height, keep_aspect_ratio=False), + LabelIndexer(configs.vocab), + LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), + ], + use_cache=True, + workers=configs.train_workers, + max_queue_size=10, + # use_multiprocessing=True +) + +# Split the dataset into training and validation sets +train_data_provider, val_data_provider = data_provider.split(split = 0.9) + +# Augment training data with random brightness, rotation and erode/dilate +train_data_provider.augmentors = [ + RandomBrightness(), + RandomErodeDilate(), + RandomSharpen(), + RandomRotate(angle=10), + RandomElasticTransform(), + RandomGaussianBlur(), + RandomSaltAndPepper(), +] + +# Creating TensorFlow model architecture +model = train_model( + input_dim = (configs.height, configs.width, 3), + output_dim = len(configs.vocab), +) + +# Compile the model and 
print summary +model.compile( + optimizer=keras.optimizers.Nadam(learning_rate=configs.learning_rate), # pyright: ignore + loss=CTCloss(), + metrics=[ + CERMetric(vocabulary=configs.vocab), + WERMetric(vocabulary=configs.vocab) + ], + jit_compile=False, # pyright: ignore + run_eagerly=False, +) +model.summary(line_length=110) + +# Define callbacks +earlystopper = keras.callbacks.EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min") +checkpoint = keras.callbacks.ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") +trainLogger = TrainLogger(configs.model_path) +tb_callback = keras.callbacks.TensorBoard(f"{configs.model_path}/logs", update_freq="epoch") +reduceLROnPlat = keras.callbacks.ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="min") +model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"vocab": configs.vocab}, save_on_epoch_end=True, opset=18) + +# Save training and validation datasets as csv files +train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) +val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) + +# Train the model +model.fit( + train_data_provider, + validation_data=val_data_provider, + epochs=configs.train_epochs, + callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], +) \ No newline at end of file diff --git a/mltu/__init__.py b/mltu/__init__.py index 015215d..543835a 100644 --- a/mltu/__init__.py +++ b/mltu/__init__.py @@ -1,4 +1,4 @@ -__version__ = "1.2.5" +__version__ = "2.0.0.dev1" from .annotations.images import Image from .annotations.images import CVImage diff --git a/mltu/annotations/audio.py b/mltu/annotations/audio.py index ec9c2ac..8bb9d55 100644 --- a/mltu/annotations/audio.py +++ b/mltu/annotations/audio.py @@ -52,7 +52,7 @@ def numpy(self) -> np.ndarray: def __add__(self, other: np.ndarray) -> np.ndarray: self._audio = 
self._audio + other self.augmented = True - return self + return self # pyright: ignore def __len__(self) -> int: return len(self._audio) diff --git a/mltu/annotations/images.py b/mltu/annotations/images.py index 929db53..0f0741d 100644 --- a/mltu/annotations/images.py +++ b/mltu/annotations/images.py @@ -9,6 +9,9 @@ from PIL import Image as PilImage +import logging +logger = logging.getLogger(__name__) + class Image(ABC): def __init__(self) -> None: @@ -33,11 +36,11 @@ def HSV(self) -> np.ndarray: pass @abstractmethod - def update(self, image: np.ndarray): + def update(self, image: np.ndarray) -> "Image": pass @abstractmethod - def flip(self, axis: int = 0): + def flip(self, axis: int = 0) -> "Image": pass @abstractmethod @@ -67,13 +70,12 @@ def __init__( method: int = cv2.IMREAD_COLOR, path: str = "", color: str = "BGR" - ) -> None: + ) -> None: super().__init__() if isinstance(image, str): if not os.path.exists(image): raise FileNotFoundError(f"Image {image} not found.") - self._image = cv2.imread(image, method) self.path = image self.color = "BGR" @@ -130,7 +132,7 @@ def HSV(self) -> np.ndarray: else: raise ValueError(f"Unknown color format {self.color}") - def update(self, image: np.ndarray): + def update(self, image: np.ndarray) -> "CVImage": if isinstance(image, np.ndarray): self._image = image @@ -141,10 +143,9 @@ def update(self, image: np.ndarray): return self - else: - raise TypeError(f"image must be numpy.ndarray, not {type(image)}") + raise TypeError(f"image must be numpy.ndarray, not {type(image)}") - def flip(self, axis: int = 0): + def flip(self, axis: int = 0) -> "CVImage": """ Flip image along x or y axis Args: @@ -178,7 +179,8 @@ class PillowImage(Image): def __init__( self, - image: str) -> None: + image: typing.Union[str, np.ndarray], + ) -> None: super().__init__() if isinstance(image, str): @@ -189,6 +191,8 @@ def __init__( self._image = PilImage.open(image) self.init_successful = True + elif isinstance(image, np.ndarray): + raise 
NotImplementedError("PillowImage does not support numpy.ndarray as input") else: raise TypeError("Image must be a path to an image") @@ -202,7 +206,7 @@ def __init__( @property def is_animated(self) -> bool: - return hasattr(self._image, "is_animated") and self._image.is_animated + return isinstance(self._image, PilImage.Image) and getattr(self._image, "is_animated", False) @property def image(self) -> np.ndarray: @@ -247,7 +251,7 @@ def _init_attributes(self): self.height = self.image.shape[0] self.channels = 1 if len(self.image.shape) == 2 else self.image.shape[2] - def update(self, image: PilImage.Image): + def update(self, image: np.ndarray) -> "PillowImage": if isinstance(image, PilImage.Image): self._image = image elif isinstance(image, np.ndarray): diff --git a/mltu/dataProvider.py b/mltu/dataProvider.py index 06f708d..68be978 100644 --- a/mltu/dataProvider.py +++ b/mltu/dataProvider.py @@ -15,18 +15,19 @@ class DataProvider: def __init__( self, dataset: typing.Union[str, list, pd.DataFrame], - data_preprocessors: typing.List[typing.Callable] = None, + data_preprocessors: typing.Union[typing.List[typing.Callable], None] = None, batch_size: int = 4, shuffle: bool = True, initial_epoch: int = 1, - augmentors: typing.List[Augmentor] = None, - transformers: typing.List[Transformer] = None, - batch_postprocessors: typing.List[typing.Callable] = None, + augmentors: typing.Union[typing.List[Augmentor], None] = None, + transformers: typing.Union[typing.List[Transformer], None] = None, + batch_postprocessors: typing.Union[typing.List[typing.Callable], None] = None, skip_validation: bool = True, - limit: int = None, + limit: typing.Union[int, None]=None, use_cache: bool = False, log_level: int = logging.INFO, numpy: bool = True, + **kwargs ) -> None: """ Standardised object for providing data to a model while training. 
@@ -137,7 +138,7 @@ def on_epoch_end(self): # Remove any samples that were marked for removal for remove in self._on_epoch_end_remove: self.logger.warning(f"Removing {remove} from dataset.") - self._dataset.remove(remove) + self._dataset.remove(remove) # pyright: ignore self._on_epoch_end_remove = [] def validate_list_dataset(self, dataset: list) -> list: @@ -148,7 +149,7 @@ def validate_list_dataset(self, dataset: list) -> list: return validated_data - def validate(self, dataset: typing.Union[str, list, pd.DataFrame]) -> typing.Union[list, str]: + def validate(self, dataset: typing.Union[str, list, pd.DataFrame]) -> typing.Union[list, str]: # pyright: ignore """ Validate the dataset and return the dataset """ if isinstance(dataset, str): diff --git a/mltu/preprocessors.py b/mltu/preprocessors.py index cb65ca1..c4cebc1 100644 --- a/mltu/preprocessors.py +++ b/mltu/preprocessors.py @@ -2,11 +2,9 @@ import typing import importlib import numpy as np -import matplotlib.pyplot as plt -import matplotlib import logging -from . import Image +from . 
import Image, CVImage, PillowImage from mltu.annotations.audio import Audio """ Implemented Preprocessors: @@ -18,12 +16,16 @@ class ImageReader: """Read image from path and return image and label""" - def __init__(self, image_class: Image, log_level: int = logging.INFO, ) -> None: + def __init__(self, image_class: typing.Type[typing.Union[CVImage, PillowImage]] = CVImage, log_level: int = logging.INFO) -> None: + self._image_class = image_class self.logger = logging.getLogger(self.__class__.__name__) self.logger.setLevel(log_level) - self._image_class = image_class - def __call__(self, image_path: typing.Union[str, np.ndarray], label: typing.Any) -> typing.Tuple[Image, typing.Any]: + def __call__( + self, + image_path: typing.Union[str, np.ndarray, typing.Any], + label: typing.Any + ) -> typing.Tuple[typing.Union[Image, None], typing.Any]: """ Read image from path and return image and label Args: @@ -59,6 +61,7 @@ def import_librosa(object) -> None: try: object.librosa = importlib.import_module('librosa') print("librosa version:", object.librosa.__version__) + return object.librosa # pyright: ignore except: raise ImportError("librosa is required to augment Audio. 
Please install it with `pip install librosa`.") @@ -71,7 +74,7 @@ class AudioReader: """ def __init__( self, - sample_rate = None, + sample_rate: int=22050, log_level: int = logging.INFO, ) -> None: self.sample_rate = sample_rate @@ -108,7 +111,7 @@ def __call__(self, audio_path: str, label: typing.Any) -> typing.Tuple[np.ndarra audio = None self.logger.warning(f"Audio {audio_path} could not be read, returning None.") - return audio, label + return audio, label # pyright: ignore class WavReader: """Read wav file with librosa and return audio and label @@ -130,6 +133,7 @@ def __init__( self.frame_step = frame_step self.fft_length = fft_length + matplotlib = importlib.import_module('matplotlib') matplotlib.interactive(False) # import librosa using importlib import_librosa(self) @@ -150,12 +154,12 @@ def get_spectrogram(wav_path: str, frame_length: int, frame_step: int, fft_lengt import_librosa(WavReader) # Load the wav file and store the audio data in the variable 'audio' and the sample rate in 'orig_sr' - audio, orig_sr = WavReader.librosa.load(wav_path) + audio, orig_sr = WavReader.librosa.load(wav_path) # pyright: ignore # Compute the Short Time Fourier Transform (STFT) of the audio data and store it in the variable 'spectrogram' # The STFT is computed with a hop length of 'frame_step' samples, a window length of 'frame_length' samples, and 'fft_length' FFT components. 
# The resulting spectrogram is also transposed for convenience - spectrogram = WavReader.librosa.stft(audio, hop_length=frame_step, win_length=frame_length, n_fft=fft_length).T + spectrogram = WavReader.librosa.stft(audio, hop_length=frame_step, win_length=frame_length, n_fft=fft_length).T # pyright: ignore # Take the absolute value of the spectrogram to obtain the magnitude spectrum spectrogram = np.abs(spectrogram) @@ -170,17 +174,18 @@ def get_spectrogram(wav_path: str, frame_length: int, frame_step: int, fft_lengt return spectrogram @staticmethod - def plot_raw_audio(wav_path: str, title: str = None, sr: int = 16000) -> None: + def plot_raw_audio(wav_path: str, title: str="Audio Plot", sr: int=16000) -> None: """Plot the raw audio of a WAV file Args: wav_path (str): Path to the WAV file. sr (int, optional): Sample rate of the WAV file. Defaults to 16000. - title (str, optional): Title + title (str, optional): Title, defaults to "Audio Plot" """ + plt = importlib.import_module('matplotlib.pyplot') import_librosa(WavReader) # Load the wav file and store the audio data in the variable 'audio' and the sample rate in 'orig_sr' - audio, orig_sr = WavReader.librosa.load(wav_path, sr=sr) + audio, orig_sr = WavReader.librosa.load(wav_path, sr=sr) # pyright: ignore duration = len(audio) / orig_sr @@ -188,7 +193,7 @@ def plot_raw_audio(wav_path: str, title: str = None, sr: int = 16000) -> None: plt.figure(figsize=(15, 5)) plt.plot(time, audio) - plt.title(title) if title else plt.title("Audio Plot") + plt.title(title) plt.ylabel("signal wave") plt.xlabel("time (s)") plt.tight_layout() @@ -204,6 +209,8 @@ def plot_spectrogram(spectrogram: np.ndarray, title:str = "", transpose: bool = transpose (bool, optional): Transpose the spectrogram. Defaults to True. invert (bool, optional): Invert the spectrogram. Defaults to True. 
""" + plt = importlib.import_module('matplotlib.pyplot') + if transpose: spectrogram = spectrogram.T diff --git a/mltu/tensorflow/callbacks.py b/mltu/tensorflow/callbacks.py index ea9479e..86df348 100644 --- a/mltu/tensorflow/callbacks.py +++ b/mltu/tensorflow/callbacks.py @@ -1,20 +1,23 @@ import os +import typing import tensorflow as tf -from keras.callbacks import Callback +import keras +from pathlib import Path import logging -class Model2onnx(Callback): +class Model2onnx(keras.callbacks.Callback): """ Converts the model to onnx format after training is finished. """ def __init__( self, saved_model_path: str, - metadata: dict=None, + metadata: typing.Union[dict, None]=None, save_on_epoch_end: bool=False, + opset: typing.Union[int, None]=None ) -> None: """ Converts the model to onnx format after training is finished. Args: - saved_model_path (str): Path to the saved .h5 model. + saved_model_path (str): Path to the saved model. metadata (dict, optional): Dictionary containing metadata to be added to the onnx model. Defaults to None. save_on_epoch_end (bool, optional): Save the onnx model on every epoch end. Defaults to False. """ @@ -22,6 +25,7 @@ def __init__( self.saved_model_path = saved_model_path self.metadata = metadata self.save_on_epoch_end = save_on_epoch_end + self.opset = opset try: import tf2onnx @@ -34,51 +38,48 @@ def __init__( raise ImportError("onnx is not installed. 
Please install it using 'pip install onnx'") @staticmethod - def model2onnx(model: tf.keras.Model, onnx_model_path: str): - try: - import tf2onnx + def model2onnx(model: keras.Model, onnx_model_path: str, opset: typing.Union[int, None]=None): + import tf2onnx - # convert the model to onnx format - tf2onnx.convert.from_keras(model, output_path=onnx_model_path) + # Handle input signature where model has multiple inputs + input_signature = [tf.TensorSpec(shape=t.shape, dtype=t.dtype, name=t.name) for t in model.inputs] # pyright: ignore - except Exception as e: - print(e) + # convert the model to onnx format + model_proto, external_tensor_storage = tf2onnx.convert.from_keras(model, input_signature=input_signature, opset=opset, output_path=onnx_model_path) - @staticmethod - def include_metadata(onnx_model_path: str, metadata: dict=None): - try: - if metadata and isinstance(metadata, dict): + return model_proto, external_tensor_storage - import onnx - # Load the ONNX model - onnx_model = onnx.load(onnx_model_path) + @staticmethod + def include_metadata(onnx_model_path: str, metadata: typing.Union[dict, None]=None): + if metadata and isinstance(metadata, dict): - # Add the metadata dictionary to the model's metadata_props attribute - for key, value in metadata.items(): - meta = onnx_model.metadata_props.add() - meta.key = key - meta.value = str(value) + import onnx + # Load the ONNX model + onnx_model = onnx.load(onnx_model_path) - # Save the modified ONNX model - onnx.save(onnx_model, onnx_model_path) + # Add the metadata dictionary to the model's metadata_props attribute + for key, value in metadata.items(): + meta = onnx_model.metadata_props.add() + meta.key = key + meta.value = str(value) - except Exception as e: - print(e) + # Save the modified ONNX model + onnx.save(onnx_model, onnx_model_path) - def on_epoch_end(self, epoch: int, logs: dict=None): + def on_epoch_end(self, epoch: int, logs: typing.Union[dict, None]=None): """ Converts the model to onnx format on 
every epoch end. """ if self.save_on_epoch_end: self.on_train_end(logs=logs) - def on_train_end(self, logs=None): + def on_train_end(self, logs: typing.Union[dict, None]=None): """ Converts the model to onnx format after training is finished. """ - self.model.load_weights(self.saved_model_path) - onnx_model_path = self.saved_model_path.replace(".h5", ".onnx") - self.model2onnx(self.model, onnx_model_path) + self._model.load_weights(self.saved_model_path) # pyright: ignore + onnx_model_path = str(Path(self.saved_model_path).with_suffix('.onnx')) + self.model2onnx(self.model, onnx_model_path, self.opset) # pyright: ignore self.include_metadata(onnx_model_path, self.metadata) -class TrainLogger(Callback): +class TrainLogger(keras.callbacks.Callback): """Logs training metrics to a file. Args: @@ -108,13 +109,14 @@ def __init__(self, log_path: str, log_file: str="logs.log", logLevel=logging.INF self.logger.addHandler(self.file_handler) - def on_epoch_end(self, epoch: int, logs: dict=None): - epoch_message = f"Epoch {epoch}; " - logs_message = "; ".join([f"{key}: {value}" for key, value in logs.items()]) - self.logger.info(epoch_message + logs_message) + def on_epoch_end(self, epoch: int, logs: typing.Union[dict, None]=None): + if logs: + epoch_message = f"Epoch {epoch}; " + logs_message = "; ".join([f"{key}: {value}" for key, value in logs.items()]) + self.logger.info(epoch_message + logs_message) -class WarmupCosineDecay(Callback): +class WarmupCosineDecay(keras.callbacks.Callback): """ Cosine decay learning rate scheduler with warmup Args: @@ -141,28 +143,27 @@ def __init__( self.decay_epochs = decay_epochs self.initial_lr = initial_lr self.verbose = verbose + self.model: keras.Model - def on_epoch_begin(self, epoch: int, logs: dict=None): + def on_epoch_begin(self, epoch: int, logs: typing.Union[dict, None]=None): """ Adjust learning rate at the beginning of each epoch """ if epoch >= self.warmup_epochs + self.decay_epochs: - return logs + return if epoch < 
self.warmup_epochs: lr = self.initial_lr + (self.lr_after_warmup - self.initial_lr) * (epoch + 1) / self.warmup_epochs else: progress = (epoch - self.warmup_epochs) / self.decay_epochs - lr = self.final_lr + 0.5 * (self.lr_after_warmup - self.final_lr) * (1 + tf.cos(tf.constant(progress) * 3.14159)) + lr = self.final_lr + 0.5 * (self.lr_after_warmup - self.final_lr) * (1 + tf.cos(tf.constant(progress) * 3.14159)) # pyright: ignore - tf.keras.backend.set_value(self.model.optimizer.lr, lr) + tf.keras.backend.set_value(self.model.optimizer.lr, lr) # pyright: ignore if self.verbose: print(f"Epoch {epoch + 1} - Learning Rate: {lr}") - def on_epoch_end(self, epoch: int, logs: dict=None): + def on_epoch_end(self, epoch: int, logs: typing.Union[dict, None]=None): logs = logs or {} # Log the learning rate value - logs["lr"] = self.model.optimizer.lr - - return logs \ No newline at end of file + logs["lr"] = self.model.optimizer.lr \ No newline at end of file diff --git a/mltu/tensorflow/dataProvider.py b/mltu/tensorflow/dataProvider.py index d5f46ae..4e7f807 100644 --- a/mltu/tensorflow/dataProvider.py +++ b/mltu/tensorflow/dataProvider.py @@ -1,7 +1,10 @@ -import tensorflow as tf +import keras from ..dataProvider import DataProvider as dataProvider -class DataProvider(dataProvider, tf.keras.utils.Sequence): +class DataProvider(dataProvider, keras.utils.Sequence): def __init__(self, *args, **kwargs): + self.workers = kwargs.pop("workers", 10) + self.use_multiprocessing = kwargs.pop("use_multiprocessing", False) + self.max_queue_size = kwargs.pop("max_queue_size", 10) super().__init__(*args, **kwargs) diff --git a/mltu/tensorflow/model_utils.py b/mltu/tensorflow/model_utils.py index 1040f7b..c544c3c 100644 --- a/mltu/tensorflow/model_utils.py +++ b/mltu/tensorflow/model_utils.py @@ -1,10 +1,8 @@ +import keras import typing import tensorflow as tf -from tensorflow import keras -from keras import layers -from keras.models import Model -class CustomModel(Model): +class 
CustomModel(keras.models.Model): """ Custom TensorFlow model for debugging training process purposes """ def train_step(self, train_data): @@ -17,7 +15,7 @@ def train_step(self, train_data): gradients = tape.gradient(loss, self.trainable_weights) # Applying the gradients on the model using the specified optimizer - self.optimizer.apply_gradients(zip(gradients, self.trainable_weights)) + self.optimizer.apply_gradients(zip(gradients, self.trainable_weights)) # pyright: ignore # Update the metrics. # Metrics are configured in `compile()`. @@ -41,19 +39,19 @@ def test_step(self, test_data): return {m.name: m.result() for m in self.metrics} -def activation_layer(layer, activation: str="relu", alpha: float=0.1) -> tf.Tensor: +def activation_layer(layer, activation: str="relu", negative_slope: float=0.1) -> keras.layers.Layer: """ Activation layer wrapper for LeakyReLU and ReLU activation functions Args: layer: tf.Tensor activation: str, activation function name (default: 'relu') - alpha: float (LeakyReLU activation function parameter) + negative_slope: float (LeakyReLU activation function parameter) Returns: tf.Tensor """ if activation == "relu": - layer = layers.ReLU()(layer) + layer = keras.layers.ReLU()(layer) elif activation == "leaky_relu": - layer = layers.LeakyReLU(alpha=alpha)(layer) + layer = keras.layers.LeakyReLU(negative_slope=negative_slope)(layer) return layer @@ -67,29 +65,30 @@ def residual_block( padding: str = "same", kernel_initializer: str = "he_uniform", activation: str = "relu", - dropout: float = 0.2): + dropout: float = 0.2 + ) -> keras.layers.Layer: # Create skip connection tensor x_skip = x # Perform 1-st convolution - x = layers.Conv2D(filter_num, kernel_size, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x) - x = layers.BatchNormalization()(x) + x = keras.layers.Conv2D(filter_num, kernel_size, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x) + x = 
keras.layers.BatchNormalization()(x) x = activation_layer(x, activation=activation) # Perform 2-nd convoluti - x = layers.Conv2D(filter_num, kernel_size, padding = padding, kernel_initializer=kernel_initializer)(x) - x = layers.BatchNormalization()(x) + x = keras.layers.Conv2D(filter_num, kernel_size, padding = padding, kernel_initializer=kernel_initializer)(x) + x = keras.layers.BatchNormalization()(x) # Perform 3-rd convolution if skip_conv is True, matchin the number of filters and the shape of the skip connection tensor if skip_conv: - x_skip = layers.Conv2D(filter_num, 1, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x_skip) + x_skip = keras.layers.Conv2D(filter_num, 1, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x_skip) # Add x and skip connection and apply activation function - x = layers.Add()([x, x_skip]) + x = keras.layers.Add()([x, x_skip]) x = activation_layer(x, activation=activation) # Apply dropout if dropout: - x = layers.Dropout(dropout)(x) + x = keras.layers.Dropout(dropout)(x) return x \ No newline at end of file diff --git a/mltu/tensorflow/transformer/utils.py b/mltu/tensorflow/transformer/utils.py index 471f92e..13e64b0 100644 --- a/mltu/tensorflow/transformer/utils.py +++ b/mltu/tensorflow/transformer/utils.py @@ -1,7 +1,7 @@ +import keras import tensorflow as tf - -class MaskedLoss(tf.keras.losses.Loss): +class MaskedLoss(keras.losses.Loss): """ Masked loss function for Transformer. 
Args: @@ -12,7 +12,7 @@ def __init__(self, mask_value: int=0, reduction: str='none') -> None: super(MaskedLoss, self).__init__() self.mask_value = mask_value self.reduction = reduction - self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction) + self.loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction) def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor: """ Calculate masked loss. @@ -28,13 +28,13 @@ def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> loss = self.loss_object(y_true, y_pred) mask = tf.cast(mask, dtype=loss.dtype) - loss *= mask + loss *= mask # pyright: ignore loss = tf.reduce_sum(loss) / tf.reduce_sum(mask) return loss -class MaskedAccuracy(tf.keras.metrics.Metric): +class MaskedAccuracy(keras.metrics.Metric): """ Masked accuracy metric for Transformer. Args: @@ -48,12 +48,13 @@ def __init__(self, mask_value: int=0, name: str='masked_accuracy') -> None: self.count = self.add_weight(name='count', initializer='zeros') @tf.function - def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None): + def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None): # pyright: ignore """ Update state of the metric. Args: y_true (tf.Tensor): True labels. y_pred (tf.Tensor): Predicted labels. + sample_weight: (Optional) sample weights. """ pred = tf.argmax(y_pred, axis=2) label = tf.cast(y_true, pred.dtype) @@ -80,7 +81,7 @@ def result(self) -> tf.Tensor: return self.total / self.count -class CERMetric(tf.keras.metrics.Metric): +class CERMetric(keras.metrics.Metric): """A custom TensorFlow metric to compute the Character Error Rate (CER). 
Args: @@ -116,8 +117,8 @@ def get_cer(self, pred, y_true, padding=-1): end_token_index = tf.argmax(equal_int, axis=1) # mask out everything after end token - new_range = tf.range(tf.shape(pred)[1], dtype=tf.int64) - range_matrix = tf.tile(new_range[None, :], [tf.shape(pred)[0], 1]) + new_range = tf.range(tf.shape(pred)[1], dtype=tf.int64) # pyright: ignore + range_matrix = tf.tile(new_range[None, :], [tf.shape(pred)[0], 1]) # pyright: ignore mask = range_matrix <= tf.expand_dims(end_token_index, axis=1) masked_pred = tf.where(mask, pred, padding) @@ -151,7 +152,7 @@ def update_state(self, y_true, y_pred, sample_weight=None): self.cer_accumulator.assign_add(tf.reduce_sum(distance)) # Increment the batch_counter by the batch size - self.batch_counter.assign_add(len(y_true)) + self.batch_counter.assign_add(y_true.shape[0]) def result(self): """ Computes and returns the metric result. diff --git a/mltu/torch/dataProvider.py b/mltu/torch/dataProvider.py index 14fb4bd..668ba3a 100644 --- a/mltu/torch/dataProvider.py +++ b/mltu/torch/dataProvider.py @@ -13,7 +13,7 @@ class ThreadExecutor: - def __init__(self, target: typing.Callable, workers: int = os.cpu_count()) -> None: + def __init__(self, target: typing.Callable, workers: int=os.cpu_count() or 1) -> None: self.target = target self.workers = workers @@ -87,7 +87,7 @@ def __exit__(self): class ProcessExecutor: - def __init__(self, target: typing.Callable, workers: int = os.cpu_count()) -> None: + def __init__(self, target: typing.Callable, workers: int=os.cpu_count() or 1) -> None: self.target = target self.workers = workers self.busy = False @@ -110,7 +110,7 @@ def __call__(self, data) -> typing.Any: for index, data_batch in enumerate(data): for worker in self.mp_workers: if worker.busy == False and results[index] is None: - results[index] = worker.send(data_batch) + results[index] = worker.send(data_batch) # pyright: ignore break # receive data from workers @@ -130,24 +130,23 @@ def __call__(self, data) -> 
typing.Any: self.busy = False return results - class DataProvider(BaseDataProvider): """ DataProvider for PyTorch with multiprocessing and multithreading support. """ def __init__( self, dataset: typing.Union[str, list, pd.DataFrame], - data_preprocessors: typing.List[typing.Callable] = None, + data_preprocessors: typing.Optional[typing.List[typing.Callable]] = None, batch_size: int = 4, shuffle: bool = True, initial_epoch: int = 1, - augmentors: typing.List[Augmentor] = None, - transformers: typing.List[Transformer] = None, - batch_postprocessors: typing.List[typing.Callable] = None, + augmentors: typing.Optional[typing.List[Augmentor]] = None, + transformers: typing.Optional[typing.List[Transformer]] = None, + batch_postprocessors: typing.Optional[typing.List[typing.Callable]] = None, skip_validation: bool = True, - limit: int = None, + limit: typing.Union[int, None] = None, use_cache: bool = False, - workers: int = os.cpu_count(), + workers: int = os.cpu_count() or 1, use_multiprocessing: bool = False, max_queue_size: int = 5, **kwargs diff --git a/mltu/transformers.py b/mltu/transformers.py index e4cb4bb..d1ebfc1 100644 --- a/mltu/transformers.py +++ b/mltu/transformers.py @@ -54,7 +54,7 @@ def __init__( width: int, height: int, keep_aspect_ratio: bool=False, - padding_color: typing.Tuple[int]=(0, 0, 0) + padding_color: typing.Tuple[int, int, int]=(0, 0, 0) ) -> None: self._width = width self._height = height @@ -77,7 +77,7 @@ def unpad_maintaining_aspect_ratio(padded_image: np.ndarray, original_width: int return original_image @staticmethod - def resize_maintaining_aspect_ratio(image: np.ndarray, width_target: int, height_target: int, padding_color: typing.Tuple[int]=(0, 0, 0)) -> np.ndarray: + def resize_maintaining_aspect_ratio(image: np.ndarray, width_target: int, height_target: int, padding_color: typing.Tuple[int, int, int]=(0, 0, 0)) -> np.ndarray: """ Resize image maintaining aspect ratio and pad with padding_color. 
Args: @@ -132,7 +132,7 @@ class LabelIndexer(Transformer): """ def __init__( self, - vocab: typing.List[str] + vocab: typing.Union[str, typing.List[str]] ) -> None: self.vocab = vocab @@ -150,7 +150,7 @@ class LabelPadding(Transformer): def __init__( self, padding_value: int, - max_word_length: int = None, + max_word_length: typing.Optional[int]=None, use_on_batch: bool = False ) -> None: self.max_word_length = max_word_length @@ -172,6 +172,8 @@ def __call__(self, data: np.ndarray, label: np.ndarray): return data, padded_labels label = label[:self.max_word_length] + if self.max_word_length is None: + raise ValueError("max_word_length must be specified.") return data, np.pad(label, (0, self.max_word_length - len(label)), "constant", constant_values=self.padding_value) @@ -208,7 +210,7 @@ class SpectrogramPadding(Transformer): def __init__( self, padding_value: int, - max_spectrogram_length: int = None, + max_spectrogram_length: typing.Optional[int] = None, use_on_batch: bool = False ) -> None: self.max_spectrogram_length = max_spectrogram_length @@ -247,9 +249,9 @@ def __init__(self, max_audio_length: int, padding_value: int = 0, use_on_batch: def __call__(self, audio: Audio, label: typing.Any): # batched padding if self.use_on_batch: - max_len = max([len(a) for a in audio]) + max_len = max([len(a) for a in audio]) # pyright: ignore padded_audios = [] - for a in audio: + for a in audio: # pyright: ignore # limit audio if it exceed max_audio_length padded_audio = np.pad(a, (0, max_len - a.shape[0]), mode="constant", constant_values=self.padding_value) padded_audios.append(padded_audio) diff --git a/requirements.txt b/requirements.txt index feec954..bfab0f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,8 @@ -PyYAML>=6.0 -tqdm +PyYAML==6.0.2 +tqdm==4.67.1 qqdm==0.0.7 -pandas -numpy -opencv-python -Pillow>=9.4.0 -onnxruntime>=1.15.0 # onnxruntime-gpu for GPU support -matplotlib \ No newline at end of file +pandas==2.2.3 +numpy==2.1.3 
+opencv-python==4.11.0.86 +pillow==11.1.0 +onnxruntime==1.21.0 # onnxruntime-gpu for GPU support \ No newline at end of file