clarena.stl_datasets.kmnist

The submodule in stl_datasets for the KMNIST dataset.

  1r"""
  2The submodule in `stl_datasets` for KMNIST dataset.
  3"""
  4
  5__all__ = ["KMNIST"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision.datasets import KMNIST as KMNISTRaw
 13from torchvision.transforms import transforms
 14
 15from clarena.stl_datasets.base import STLDatasetFromRaw
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class KMNIST(STLDatasetFromRaw):
 22    r"""Kuzushiji-MNIST dataset. The [Kuzushiji-MNIST dataset](https://github.com/rois-codh/kmnist) is a collection of Japanese Kuzushiji character images. It consists of 60,000 training and 10,000 test images of Japanese Kuzushiji images (10 classes), each 28x28 grayscale image (similar to MNIST)."""
 23
 24    original_dataset_python_class: type[Dataset] = KMNISTRaw
 25    r"""The original dataset class."""
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        validation_percentage: float,
 31        batch_size: int = 1,
 32        num_workers: int = 0,
 33        custom_transforms: Callable | transforms.Compose | None = None,
 34        repeat_channels: int | None = None,
 35        to_tensor: bool = True,
 36        resize: tuple[int, int] | None = None,
 37    ) -> None:
 38        r"""
 39        **Args:**
 40        - **root** (`str`): the root directory where the original Kuzushiji-MNIST data 'KMNIST/' live.
 41        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 42        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 43        - **num_workers** (`int`): the number of workers for dataloaders.
 44        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 45        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 46        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 47        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 48        """
 49        super().__init__(
 50            root=root,
 51            batch_size=batch_size,
 52            num_workers=num_workers,
 53            custom_transforms=custom_transforms,
 54            repeat_channels=repeat_channels,
 55            to_tensor=to_tensor,
 56            resize=resize,
 57        )
 58
 59        self.validation_percentage: float = validation_percentage
 60        r"""The percentage to randomly split some training data into validation data."""
 61
 62    def prepare_data(self) -> None:
 63        r"""Download the original Kuzushiji-MNIST dataset if haven't."""
 64
 65        KMNISTRaw(root=self.root, train=True, download=True)
 66        KMNISTRaw(root=self.root, train=False, download=True)
 67
 68        pylogger.debug(
 69            "The original Kuzushiji-MNIST dataset has been downloaded to %s.",
 70            self.root,
 71        )
 72
 73    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 74        """Get the training and validation dataset.
 75
 76        **Returns:**
 77        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 78        """
 79        dataset_train_and_val = KMNISTRaw(
 80            root=self.root,
 81            train=True,
 82            transform=self.train_and_val_transforms(),
 83            target_transform=self.target_transform(),
 84            download=False,
 85        )
 86
 87        return random_split(
 88            dataset_train_and_val,
 89            lengths=[1 - self.validation_percentage, self.validation_percentage],
 90            generator=torch.Generator().manual_seed(
 91                42
 92            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
 93        )
 94
 95    def test_dataset(self) -> Dataset:
 96        r"""Get the test dataset.
 97
 98        **Returns:**
 99        - **test_dataset** (`Dataset`): the test dataset.
100        """
101        dataset_test = KMNISTRaw(
102            root=self.root,
103            train=False,
104            transform=self.test_transforms(),
105            target_transform=self.target_transform(),
106            download=False,
107        )
108
109        return dataset_test
class KMNIST(clarena.stl_datasets.base.STLDatasetFromRaw):
 22class KMNIST(STLDatasetFromRaw):
 23    r"""Kuzushiji-MNIST dataset. The [Kuzushiji-MNIST dataset](https://github.com/rois-codh/kmnist) is a collection of Japanese Kuzushiji character images. It consists of 60,000 training and 10,000 test images of Japanese Kuzushiji images (10 classes), each 28x28 grayscale image (similar to MNIST)."""
 24
 25    original_dataset_python_class: type[Dataset] = KMNISTRaw
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        validation_percentage: float,
 32        batch_size: int = 1,
 33        num_workers: int = 0,
 34        custom_transforms: Callable | transforms.Compose | None = None,
 35        repeat_channels: int | None = None,
 36        to_tensor: bool = True,
 37        resize: tuple[int, int] | None = None,
 38    ) -> None:
 39        r"""
 40        **Args:**
 41        - **root** (`str`): the root directory where the original Kuzushiji-MNIST data 'KMNIST/' live.
 42        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 43        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 44        - **num_workers** (`int`): the number of workers for dataloaders.
 45        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 46        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 47        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 48        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 49        """
 50        super().__init__(
 51            root=root,
 52            batch_size=batch_size,
 53            num_workers=num_workers,
 54            custom_transforms=custom_transforms,
 55            repeat_channels=repeat_channels,
 56            to_tensor=to_tensor,
 57            resize=resize,
 58        )
 59
 60        self.validation_percentage: float = validation_percentage
 61        r"""The percentage to randomly split some training data into validation data."""
 62
 63    def prepare_data(self) -> None:
 64        r"""Download the original Kuzushiji-MNIST dataset if haven't."""
 65
 66        KMNISTRaw(root=self.root, train=True, download=True)
 67        KMNISTRaw(root=self.root, train=False, download=True)
 68
 69        pylogger.debug(
 70            "The original Kuzushiji-MNIST dataset has been downloaded to %s.",
 71            self.root,
 72        )
 73
 74    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 75        """Get the training and validation dataset.
 76
 77        **Returns:**
 78        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 79        """
 80        dataset_train_and_val = KMNISTRaw(
 81            root=self.root,
 82            train=True,
 83            transform=self.train_and_val_transforms(),
 84            target_transform=self.target_transform(),
 85            download=False,
 86        )
 87
 88        return random_split(
 89            dataset_train_and_val,
 90            lengths=[1 - self.validation_percentage, self.validation_percentage],
 91            generator=torch.Generator().manual_seed(
 92                42
 93            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
 94        )
 95
 96    def test_dataset(self) -> Dataset:
 97        r"""Get the test dataset.
 98
 99        **Returns:**
100        - **test_dataset** (`Dataset`): the test dataset.
101        """
102        dataset_test = KMNISTRaw(
103            root=self.root,
104            train=False,
105            transform=self.test_transforms(),
106            target_transform=self.target_transform(),
107            download=False,
108        )
109
110        return dataset_test

Kuzushiji-MNIST dataset. The Kuzushiji-MNIST dataset is a collection of Japanese Kuzushiji character images. It consists of 60,000 training and 10,000 test images in 10 classes, each a 28x28 grayscale image (similar to MNIST).

KMNIST( root: str, validation_percentage: float, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
28    def __init__(
29        self,
30        root: str,
31        validation_percentage: float,
32        batch_size: int = 1,
33        num_workers: int = 0,
34        custom_transforms: Callable | transforms.Compose | None = None,
35        repeat_channels: int | None = None,
36        to_tensor: bool = True,
37        resize: tuple[int, int] | None = None,
38    ) -> None:
39        r"""
40        **Args:**
41        - **root** (`str`): the root directory where the original Kuzushiji-MNIST data 'KMNIST/' live.
42        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
43        - **batch_size** (`int`): The batch size in train, val, test dataloader.
44        - **num_workers** (`int`): the number of workers for dataloaders.
45        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
46        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
47        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
48        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
49        """
50        super().__init__(
51            root=root,
52            batch_size=batch_size,
53            num_workers=num_workers,
54            custom_transforms=custom_transforms,
55            repeat_channels=repeat_channels,
56            to_tensor=to_tensor,
57            resize=resize,
58        )
59
60        self.validation_percentage: float = validation_percentage
61        r"""The percentage to randomly split some training data into validation data."""

Args:

  • root (str): the root directory where the original Kuzushiji-MNIST data 'KMNIST/' lives.
  • validation_percentage (float): the percentage to randomly split some training data into validation data.
  • batch_size (int): The batch size in train, val, test dataloader.
  • num_workers (int): the number of workers for dataloaders.
  • custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included.
  • repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
  • to_tensor (bool): whether to include ToTensor() transform. Default is True.
  • resize (tuple[int, int] | None): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.mnist.KMNIST'>

The original dataset class.

validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None:
63    def prepare_data(self) -> None:
64        r"""Download the original Kuzushiji-MNIST dataset if haven't."""
65
66        KMNISTRaw(root=self.root, train=True, download=True)
67        KMNISTRaw(root=self.root, train=False, download=True)
68
69        pylogger.debug(
70            "The original Kuzushiji-MNIST dataset has been downloaded to %s.",
71            self.root,
72        )

Download the original Kuzushiji-MNIST dataset if it has not been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
74    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
75        """Get the training and validation dataset.
76
77        **Returns:**
78        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
79        """
80        dataset_train_and_val = KMNISTRaw(
81            root=self.root,
82            train=True,
83            transform=self.train_and_val_transforms(),
84            target_transform=self.target_transform(),
85            download=False,
86        )
87
88        return random_split(
89            dataset_train_and_val,
90            lengths=[1 - self.validation_percentage, self.validation_percentage],
91            generator=torch.Generator().manual_seed(
92                42
93            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
94        )

Get the training and validation dataset.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
 96    def test_dataset(self) -> Dataset:
 97        r"""Get the test dataset.
 98
 99        **Returns:**
100        - **test_dataset** (`Dataset`): the test dataset.
101        """
102        dataset_test = KMNISTRaw(
103            root=self.root,
104            train=False,
105            transform=self.test_transforms(),
106            target_transform=self.target_transform(),
107            download=False,
108        )
109
110        return dataset_test

Get the test dataset.

Returns:

  • test_dataset (Dataset): the test dataset.