clarena.stl_datasets.SEMEION

The submodule in stl_datasets for the SEMEION dataset.

  1r"""
  2The submodule in `stl_datasets` for SEMEION dataset.
  3"""
  4
  5__all__ = ["SEMEION"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision.datasets import SEMEION as SEMEIONRaw
 13from torchvision.transforms import transforms
 14
 15from clarena.stl_datasets.base import STLDatasetFromRaw
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class SEMEION(STLDatasetFromRaw):
 22    r"""SEMEION dataset. The [SEMEION dataset](https://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit) is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image."""
 23
 24    original_dataset_python_class: type[Dataset] = SEMEIONRaw
 25    r"""The original dataset class."""
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        test_percentage: float,
 31        validation_percentage: float,
 32        batch_size: int = 1,
 33        num_workers: int = 0,
 34        custom_transforms: Callable | transforms.Compose | None = None,
 35        repeat_channels: int | None = None,
 36        to_tensor: bool = True,
 37        resize: tuple[int, int] | None = None,
 38    ) -> None:
 39        r"""
 40        **Args:**
 41        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' live.
 42        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
 43        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 44        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 45        - **num_workers** (`int`): the number of workers for dataloaders.
 46        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 47        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 48        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 49        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 50        """
 51        super().__init__(
 52            root=root,
 53            batch_size=batch_size,
 54            num_workers=num_workers,
 55            custom_transforms=custom_transforms,
 56            repeat_channels=repeat_channels,
 57            to_tensor=to_tensor,
 58            resize=resize,
 59        )
 60
 61        self.test_percentage: float = test_percentage
 62        r"""The percentage to randomly split some data into test data."""
 63        self.validation_percentage: float = validation_percentage
 64        r"""The percentage to randomly split some training data into validation data."""
 65
 66    def prepare_data(self) -> None:
 67        r"""Download the original SEMEION dataset if haven't."""
 68
 69        SEMEIONRaw(root=self.root, download=True)
 70
 71        pylogger.debug(
 72            "The original SEMEION dataset has been downloaded to %s.", self.root
 73        )
 74
 75    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 76        """Get the training and validation dataset.
 77
 78        **Returns:**
 79        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 80        """
 81        dataset_all = SEMEIONRaw(
 82            root=self.root,
 83            transform=self.train_and_val_transforms(),
 84            target_transform=self.target_transform(),
 85            download=False,
 86        )
 87
 88        dataset_train_and_val, _ = random_split(
 89            dataset_all,
 90            lengths=[
 91                1 - self.test_percentage,
 92                self.test_percentage,
 93            ],
 94            generator=torch.Generator().manual_seed(
 95                42
 96            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
 97        )
 98
 99        return random_split(
100            dataset_train_and_val,
101            lengths=[1 - self.validation_percentage, self.validation_percentage],
102            generator=torch.Generator().manual_seed(
103                42
104            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
105        )
106
107    def test_dataset(self) -> Dataset:
108        r"""Get the test dataset.
109
110        **Returns:**
111        - **test_dataset** (`Dataset`): the test dataset.
112        """
113        dataset_all = SEMEIONRaw(
114            root=self.root,
115            transform=self.train_and_val_transforms(),
116            target_transform=self.target_transform(),
117            download=False,
118        )
119
120        _, dataset_test = random_split(
121            dataset_all,
122            lengths=[1 - self.test_percentage, self.test_percentage],
123            generator=torch.Generator().manual_seed(42),
124        )
125
126        return dataset_test
class SEMEION(clarena.stl_datasets.base.STLDatasetFromRaw):

SEMEION dataset. The SEMEION dataset is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each a 16x16 grayscale image.

SEMEION(root: str, test_percentage: float, validation_percentage: float, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)

Args:

  • root (str): the root directory where the original SEMEION data 'SEMEION/' lives.
  • test_percentage (float): the percentage of data to randomly split off as test data.
  • validation_percentage (float): the percentage of training data to randomly split off as validation data.
  • batch_size (int): the batch size for the train, validation and test dataloaders.
  • num_workers (int): the number of workers for the dataloaders.
  • custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to the TRAIN dataset only. Can be a single transform, composed transforms or no transform. ToTensor(), normalization and so on are not included here.
  • repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
  • to_tensor (bool): whether to include the ToTensor() transform. Default is True.
  • resize (tuple[int, int] | None): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
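
For illustration, a minimal usage sketch, assuming the class is constructed directly with the documented arguments and that its methods can be called in this order without further setup (the STLDatasetFromRaw base class may require additional Lightning-style setup not shown here); the root path "data" is a placeholder:

from clarena.stl_datasets.SEMEION import SEMEION

# placeholder root; 20% of the data goes to test, 10% of the remainder to validation
semeion = SEMEION(
    root="data",
    test_percentage=0.2,
    validation_percentage=0.1,
    batch_size=32,
    num_workers=4,
)

semeion.prepare_data()  # downloads the raw SEMEION data into `root` if missing
train_set, val_set = semeion.train_and_val_dataset()
test_set = semeion.test_dataset()
print(len(train_set), len(val_set), len(test_set))
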
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.semeion.SEMEION'>

The original dataset class.

test_percentage: float

The percentage of data to randomly split off as test data.

validation_percentage: float

The percentage of training data to randomly split off as validation data.

def prepare_data(self) -> None:

Download the original SEMEION dataset if it hasn't been downloaded yet.

def train_and_val_dataset(self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:

Get the training and validation datasets.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the training and validation datasets.
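
The split is deterministic: both stages call torch.utils.data.random_split with fractional lengths (supported in PyTorch 1.13 and newer) and a generator seeded with 42, so the same samples land in the same split across runs. A standalone sketch of this pattern, using a synthetic TensorDataset of the same size in place of SEMEION:

import torch
from torch.utils.data import TensorDataset, random_split

# synthetic stand-in for the 1,593 SEMEION images (1 channel, 16x16)
dataset_all = TensorDataset(
    torch.randn(1593, 1, 16, 16), torch.randint(0, 10, (1593,))
)

test_percentage = 0.2
validation_percentage = 0.1

# first stage: carve off the test fraction with a fixed seed
train_and_val, _ = random_split(
    dataset_all,
    lengths=[1 - test_percentage, test_percentage],
    generator=torch.Generator().manual_seed(42),
)

# second stage: carve the validation fraction out of the remainder
train, val = random_split(
    train_and_val,
    lengths=[1 - validation_percentage, validation_percentage],
    generator=torch.Generator().manual_seed(42),
)

print(len(train), len(val))  # sizes follow from the fractions; remainders are assigned round-robin
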
def test_dataset(self) -> torch.utils.data.dataset.Dataset:

Get the test dataset.

Returns:

  • test_dataset (Dataset): the test dataset.
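
Because test_dataset() re-runs the first-stage split with the same fixed seed 42, the resulting test subset is exactly the complement of the train/validation subset. A small sketch of why this holds, again on a synthetic dataset; random_split returns Subset objects whose indices attribute records the selected samples:

import torch
from torch.utils.data import TensorDataset, random_split

dataset_all = TensorDataset(torch.randn(1593, 1, 16, 16))
test_percentage = 0.2

# the split as performed in train_and_val_dataset()
train_and_val, _ = random_split(
    dataset_all,
    lengths=[1 - test_percentage, test_percentage],
    generator=torch.Generator().manual_seed(42),
)

# the same split re-run in test_dataset(); the identical seed yields identical indices
_, test = random_split(
    dataset_all,
    lengths=[1 - test_percentage, test_percentage],
    generator=torch.Generator().manual_seed(42),
)

# the two subsets partition the dataset: no overlap, and together they cover everything
assert set(train_and_val.indices).isdisjoint(test.indices)
assert len(train_and_val) + len(test) == len(dataset_all)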