clarena.cl_datasets.permuted_SEMEION

The submodule in cl_datasets for the Permuted SEMEION dataset.

View Source

  1r"""
  2The submodule in `cl_datasets` for Permuted SEMEION dataset.
  3"""
  4
  5__all__ = ["PermutedSEMEION"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision.datasets import SEMEION
 13from torchvision.transforms import transforms
 14
 15from clarena.cl_datasets import CLPermutedDataset
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class PermutedSEMEION(CLPermutedDataset):
 22    r"""Permuted SEMEION dataset. The [SEMEION dataset](https://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit) is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image."""
 23
 24    original_dataset_python_class: type[Dataset] = SEMEION
 25    r"""The original dataset class."""
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        num_tasks: int,
 31        test_percentage: float,
 32        validation_percentage: float,
 33        batch_size: int | dict[int, int] = 1,
 34        num_workers: int | dict[int, int] = 0,
 35        custom_transforms: (
 36            Callable
 37            | transforms.Compose
 38            | None
 39            | dict[int, Callable | transforms.Compose | None]
 40        ) = None,
 41        repeat_channels: int | None | dict[int, int | None] = None,
 42        to_tensor: bool | dict[int, bool] = True,
 43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 44        permutation_mode: str = "first_channel_only",
 45        permutation_seeds: dict[int, int] | None = None,
 46    ) -> None:
 47        r"""
 48        **Args:**
 49        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' live.
 50        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 51        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
 52        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 53        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 54        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 55        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 56        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 57        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 58        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 59        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 60        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 61        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 62        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 63        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 64        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 65        - **permutation_mode** (`str`): the mode of permutation; one of:
 66            1. 'all': permute all pixels.
 67            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 68            3. 'first_channel_only': permute only the first channel.
 69        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 70        """
 71
 72        super().__init__(
 73            root=root,
 74            num_tasks=num_tasks,
 75            batch_size=batch_size,
 76            num_workers=num_workers,
 77            custom_transforms=custom_transforms,
 78            repeat_channels=repeat_channels,
 79            to_tensor=to_tensor,
 80            resize=resize,
 81            permutation_mode=permutation_mode,
 82            permutation_seeds=permutation_seeds,
 83        )
 84
 85        self.test_percentage: float = test_percentage
 86        r"""The percentage to randomly split some data into test data."""
 87        self.validation_percentage: float = validation_percentage
 88        r"""The percentage to randomly split some training data into validation data."""
 89
 90    def prepare_data(self) -> None:
 91        r"""Download the original SEMEION dataset if haven't."""
 92
 93        if self.task_id != 1:
 94            return  # download all original datasets only at the beginning of first task
 95
 96        SEMEION(root=self.root_t, download=True)
 97
 98        pylogger.debug(
 99            "The original SEMEION dataset has been downloaded to %s.", self.root_t
100        )
101
102    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
103        """Get the training and validation dataset of task `self.task_id`.
104
105        **Returns:**
106        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
107        """
108        dataset_all = SEMEION(
109            root=self.root_t,
110            transform=self.train_and_val_transforms(),
111            target_transform=self.target_transform(),
112            download=False,
113        )
114
115        dataset_train_and_val, _ = random_split(
116            dataset_all,
117            lengths=[
118                1 - self.test_percentage,
119                self.test_percentage,
120            ],
121            generator=torch.Generator().manual_seed(
122                42
123            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
124        )
125
126        return random_split(
127            dataset_train_and_val,
128            lengths=[1 - self.validation_percentage, self.validation_percentage],
129            generator=torch.Generator().manual_seed(
130                42
131            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
132        )
133
134    def test_dataset(self) -> Dataset:
135        r"""Get the test dataset of task `self.task_id`.
136
137        **Returns:**
138        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
139        """
140        dataset_all = SEMEION(
141            root=self.root_t,
142            transform=self.train_and_val_transforms(),
143            target_transform=self.target_transform(),
144            download=False,
145        )
146
147        _, dataset_test = random_split(
148            dataset_all,
149            lengths=[1 - self.test_percentage, self.test_percentage],
150            generator=torch.Generator().manual_seed(42),
151        )
152
153        return dataset_test

class PermutedSEMEION(clarena.cl_datasets.base.CLPermutedDataset): View Source

 22class PermutedSEMEION(CLPermutedDataset):
 23    r"""Permuted SEMEION dataset. The [SEMEION dataset](https://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit) is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image."""
 24
 25    original_dataset_python_class: type[Dataset] = SEMEION
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        num_tasks: int,
 32        test_percentage: float,
 33        validation_percentage: float,
 34        batch_size: int | dict[int, int] = 1,
 35        num_workers: int | dict[int, int] = 0,
 36        custom_transforms: (
 37            Callable
 38            | transforms.Compose
 39            | None
 40            | dict[int, Callable | transforms.Compose | None]
 41        ) = None,
 42        repeat_channels: int | None | dict[int, int | None] = None,
 43        to_tensor: bool | dict[int, bool] = True,
 44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 45        permutation_mode: str = "first_channel_only",
 46        permutation_seeds: dict[int, int] | None = None,
 47    ) -> None:
 48        r"""
 49        **Args:**
 50        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' live.
 51        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 52        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
 53        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 54        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 55        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 56        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 57        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 58        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 59        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 60        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 61        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 62        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 63        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 64        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 65        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 66        - **permutation_mode** (`str`): the mode of permutation; one of:
 67            1. 'all': permute all pixels.
 68            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 69            3. 'first_channel_only': permute only the first channel.
 70        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 71        """
 72
 73        super().__init__(
 74            root=root,
 75            num_tasks=num_tasks,
 76            batch_size=batch_size,
 77            num_workers=num_workers,
 78            custom_transforms=custom_transforms,
 79            repeat_channels=repeat_channels,
 80            to_tensor=to_tensor,
 81            resize=resize,
 82            permutation_mode=permutation_mode,
 83            permutation_seeds=permutation_seeds,
 84        )
 85
 86        self.test_percentage: float = test_percentage
 87        r"""The percentage to randomly split some data into test data."""
 88        self.validation_percentage: float = validation_percentage
 89        r"""The percentage to randomly split some training data into validation data."""
 90
 91    def prepare_data(self) -> None:
 92        r"""Download the original SEMEION dataset if haven't."""
 93
 94        if self.task_id != 1:
 95            return  # download all original datasets only at the beginning of first task
 96
 97        SEMEION(root=self.root_t, download=True)
 98
 99        pylogger.debug(
100            "The original SEMEION dataset has been downloaded to %s.", self.root_t
101        )
102
103    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
104        """Get the training and validation dataset of task `self.task_id`.
105
106        **Returns:**
107        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
108        """
109        dataset_all = SEMEION(
110            root=self.root_t,
111            transform=self.train_and_val_transforms(),
112            target_transform=self.target_transform(),
113            download=False,
114        )
115
116        dataset_train_and_val, _ = random_split(
117            dataset_all,
118            lengths=[
119                1 - self.test_percentage,
120                self.test_percentage,
121            ],
122            generator=torch.Generator().manual_seed(
123                42
124            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
125        )
126
127        return random_split(
128            dataset_train_and_val,
129            lengths=[1 - self.validation_percentage, self.validation_percentage],
130            generator=torch.Generator().manual_seed(
131                42
132            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
133        )
134
135    def test_dataset(self) -> Dataset:
136        r"""Get the test dataset of task `self.task_id`.
137
138        **Returns:**
139        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
140        """
141        dataset_all = SEMEION(
142            root=self.root_t,
143            transform=self.train_and_val_transforms(),
144            target_transform=self.target_transform(),
145            download=False,
146        )
147
148        _, dataset_test = random_split(
149            dataset_all,
150            lengths=[1 - self.test_percentage, self.test_percentage],
151            generator=torch.Generator().manual_seed(42),
152        )
153
154        return dataset_test

Permuted SEMEION dataset. The SEMEION dataset is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each a 16x16 grayscale image.

PermutedSEMEION( root: str, num_tasks: int, test_percentage: float, validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None) View Source

28    def __init__(
29        self,
30        root: str,
31        num_tasks: int,
32        test_percentage: float,
33        validation_percentage: float,
34        batch_size: int | dict[int, int] = 1,
35        num_workers: int | dict[int, int] = 0,
36        custom_transforms: (
37            Callable
38            | transforms.Compose
39            | None
40            | dict[int, Callable | transforms.Compose | None]
41        ) = None,
42        repeat_channels: int | None | dict[int, int | None] = None,
43        to_tensor: bool | dict[int, bool] = True,
44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
45        permutation_mode: str = "first_channel_only",
46        permutation_seeds: dict[int, int] | None = None,
47    ) -> None:
48        r"""
49        **Args:**
50        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' live.
51        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
52        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
53        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
54        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
55        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
56        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
57        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
58        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
59        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
60        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
61        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
62        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
63        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
64        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
65        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
66        - **permutation_mode** (`str`): the mode of permutation; one of:
67            1. 'all': permute all pixels.
68            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
69            3. 'first_channel_only': permute only the first channel.
70        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
71        """
72
73        super().__init__(
74            root=root,
75            num_tasks=num_tasks,
76            batch_size=batch_size,
77            num_workers=num_workers,
78            custom_transforms=custom_transforms,
79            repeat_channels=repeat_channels,
80            to_tensor=to_tensor,
81            resize=resize,
82            permutation_mode=permutation_mode,
83            permutation_seeds=permutation_seeds,
84        )
85
86        self.test_percentage: float = test_percentage
87        r"""The percentage to randomly split some data into test data."""
88        self.validation_percentage: float = validation_percentage
89        r"""The percentage to randomly split some training data into validation data."""

Args:

root (str): the root directory where the original SEMEION data 'SEMEION/' lives.
num_tasks (int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to num_tasks.
test_percentage (float): the percentage to randomly split some data into test data.
validation_percentage (float): the percentage to randomly split some training data into validation data.
batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
permutation_mode (str): the mode of permutation; one of:
1. 'all': permute all pixels.
2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
3. 'first_channel_only': permute only the first channel.
permutation_seeds (dict[int, int] | None): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is None, which creates a dict of seeds from 0 to num_tasks-1.

original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.semeion.SEMEION'>

The original dataset class.

test_percentage: float

The percentage to randomly split some data into test data.

validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None: View Source

 91    def prepare_data(self) -> None:
 92        r"""Download the original SEMEION dataset if haven't."""
 93
 94        if self.task_id != 1:
 95            return  # download all original datasets only at the beginning of first task
 96
 97        SEMEION(root=self.root_t, download=True)
 98
 99        pylogger.debug(
100            "The original SEMEION dataset has been downloaded to %s.", self.root_t
101        )

Download the original SEMEION dataset if it has not been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]: View Source

103    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
104        """Get the training and validation dataset of task `self.task_id`.
105
106        **Returns:**
107        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
108        """
109        dataset_all = SEMEION(
110            root=self.root_t,
111            transform=self.train_and_val_transforms(),
112            target_transform=self.target_transform(),
113            download=False,
114        )
115
116        dataset_train_and_val, _ = random_split(
117            dataset_all,
118            lengths=[
119                1 - self.test_percentage,
120                self.test_percentage,
121            ],
122            generator=torch.Generator().manual_seed(
123                42
124            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
125        )
126
127        return random_split(
128            dataset_train_and_val,
129            lengths=[1 - self.validation_percentage, self.validation_percentage],
130            generator=torch.Generator().manual_seed(
131                42
132            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
133        )

Get the training and validation datasets of task self.task_id.

Returns:

train_and_val_dataset (tuple[Dataset, Dataset]): the training and validation datasets of task self.task_id.

def test_dataset(self) -> torch.utils.data.dataset.Dataset: View Source

135    def test_dataset(self) -> Dataset:
136        r"""Get the test dataset of task `self.task_id`.
137
138        **Returns:**
139        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
140        """
141        dataset_all = SEMEION(
142            root=self.root_t,
143            transform=self.train_and_val_transforms(),
144            target_transform=self.target_transform(),
145            download=False,
146        )
147
148        _, dataset_test = random_split(
149            dataset_all,
150            lengths=[1 - self.test_percentage, self.test_percentage],
151            generator=torch.Generator().manual_seed(42),
152        )
153
154        return dataset_test

Get the test dataset of task self.task_id.

Returns:

test_dataset (Dataset): the test dataset of task self.task_id.