clarena.cl_datasets.permuted_gtsrb

The submodule in cl_datasets for the Permuted GTSRB dataset.

  1r"""
  2The submodule in `cl_datasets` for Permuted GTSRB dataset.
  3"""
  4
  5__all__ = ["PermutedGTSRB"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision.datasets import GTSRB
 13from torchvision.transforms import transforms
 14
 15from clarena.cl_datasets import CLPermutedDataset
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class PermutedGTSRB(CLPermutedDataset):
 22    r"""Permuted GTSRB dataset. The [GTSRB dataset](https://benchmark.ini.rub.de/) is a collection of traffic sign images. It consists of 51,839 color images of 43 different traffic signs (classes)."""
 23
 24    original_dataset_python_class: type[Dataset] = GTSRB
 25    r"""The original dataset class."""
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        num_tasks: int,
 31        validation_percentage: float,
 32        batch_size: int | dict[int, int] = 1,
 33        num_workers: int | dict[int, int] = 0,
 34        custom_transforms: (
 35            Callable
 36            | transforms.Compose
 37            | None
 38            | dict[int, Callable | transforms.Compose | None]
 39        ) = None,
 40        repeat_channels: int | None | dict[int, int | None] = None,
 41        to_tensor: bool | dict[int, bool] = True,
 42        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 43        permutation_mode: str = "first_channel_only",
 44        permutation_seeds: dict[int, int] | None = None,
 45    ) -> None:
 46        r"""
 47        **Args:**
 48        - **root** (`str`): the root directory where the original GTSRB data 'GTSRB/' lives.
 49        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 50        - **validation_percentage** (`float`): the fraction of the training data (e.g. `0.1` for 10%) to randomly split off as validation data.
 51        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 52        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 53        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 54        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 55        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 56        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 57        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 58        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 59        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 60        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 61        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 62        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 63        - **permutation_mode** (`str`): the mode of permutation; one of:
 64            1. 'all': permute all pixels.
 65            2. 'by_channel': permute channel by channel separately. The same permutation order is applied to all channels.
 66            3. 'first_channel_only': permute only the first channel.
 67        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 68        """
 69        super().__init__(
 70            root=root,
 71            num_tasks=num_tasks,
 72            batch_size=batch_size,
 73            num_workers=num_workers,
 74            custom_transforms=custom_transforms,
 75            repeat_channels=repeat_channels,
 76            to_tensor=to_tensor,
 77            resize=resize,
 78            permutation_mode=permutation_mode,
 79            permutation_seeds=permutation_seeds,
 80        )
 81
 82        self.validation_percentage: float = validation_percentage
 83        r"""The fraction of the training data to randomly split off as validation data."""
 84
 85    def prepare_data(self) -> None:
 86        r"""Download the original GTSRB dataset (train and test splits) if it has not been downloaded yet."""
 87
 88        if self.task_id != 1:
 89            return  # download all original datasets only at the beginning of the first task
 90
 91        GTSRB(root=self.root_t, split="train", download=True)
 92        GTSRB(root=self.root_t, split="test", download=True)
 93
 94        pylogger.debug(
 95            "The original GTSRB dataset has been downloaded to %s.", self.root_t
 96        )
 97
 98    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 99        """Get the training and validation datasets of task `self.task_id`, split from the GTSRB "train" split according to `self.validation_percentage`.
100
101        **Returns:**
102        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
103        """
104        dataset_train_and_val = GTSRB(
105            root=self.root_t,
106            split="train",
107            transform=self.train_and_val_transforms(),
108            target_transform=self.target_transform(),
109            download=False,
110        )
111
112        return random_split(
113            dataset_train_and_val,
114            lengths=[1 - self.validation_percentage, self.validation_percentage],
115            generator=torch.Generator().manual_seed(
116                42
117            ),  # the seed is fixed so that the train/val split is identical across experiments; do not tie it to the global seed, which may vary across experiments
118        )
119
120    def test_dataset(self) -> Dataset:
121        r"""Get the test dataset of task `self.task_id`.
122
123        **Returns:**
124        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
125        """
126        dataset_test = GTSRB(
127            root=self.root_t,
128            split="test",
129            transform=self.test_transforms(),
130            target_transform=self.target_transform(),
131            download=False,  # downloading is handled in `prepare_data()`
132        )
133
134        return dataset_test
class PermutedGTSRB(clarena.cl_datasets.base.CLPermutedDataset):
 22class PermutedGTSRB(CLPermutedDataset):
 23    r"""Permuted GTSRB dataset. The [GTSRB dataset](https://benchmark.ini.rub.de/) is a collection of traffic sign images. It consists of 51,839 color images of 43 different traffic signs (classes)."""
 24
 25    original_dataset_python_class: type[Dataset] = GTSRB
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        num_tasks: int,
 32        validation_percentage: float,
 33        batch_size: int | dict[int, int] = 1,
 34        num_workers: int | dict[int, int] = 0,
 35        custom_transforms: (
 36            Callable
 37            | transforms.Compose
 38            | None
 39            | dict[int, Callable | transforms.Compose | None]
 40        ) = None,
 41        repeat_channels: int | None | dict[int, int | None] = None,
 42        to_tensor: bool | dict[int, bool] = True,
 43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 44        permutation_mode: str = "first_channel_only",
 45        permutation_seeds: dict[int, int] | None = None,
 46    ) -> None:
 47        r"""
 48        **Args:**
 49        - **root** (`str`): the root directory where the original GTSRB data 'GTSRB/' lives.
 50        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 51        - **validation_percentage** (`float`): the fraction of the training data (e.g. `0.1` for 10%) to randomly split off as validation data.
 52        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 53        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 54        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 55        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 56        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 57        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 58        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 59        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 60        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 61        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 62        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 63        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 64        - **permutation_mode** (`str`): the mode of permutation; one of:
 65            1. 'all': permute all pixels.
 66            2. 'by_channel': permute channel by channel separately. The same permutation order is applied to all channels.
 67            3. 'first_channel_only': permute only the first channel.
 68        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 69        """
 70        super().__init__(
 71            root=root,
 72            num_tasks=num_tasks,
 73            batch_size=batch_size,
 74            num_workers=num_workers,
 75            custom_transforms=custom_transforms,
 76            repeat_channels=repeat_channels,
 77            to_tensor=to_tensor,
 78            resize=resize,
 79            permutation_mode=permutation_mode,
 80            permutation_seeds=permutation_seeds,
 81        )
 82
 83        self.validation_percentage: float = validation_percentage
 84        r"""The fraction of the training data to randomly split off as validation data."""
 85
 86    def prepare_data(self) -> None:
 87        r"""Download the original GTSRB dataset (train and test splits) if it has not been downloaded yet."""
 88
 89        if self.task_id != 1:
 90            return  # download all original datasets only at the beginning of the first task
 91
 92        GTSRB(root=self.root_t, split="train", download=True)
 93        GTSRB(root=self.root_t, split="test", download=True)
 94
 95        pylogger.debug(
 96            "The original GTSRB dataset has been downloaded to %s.", self.root_t
 97        )
 98
 99    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
100        """Get the training and validation datasets of task `self.task_id`, split from the GTSRB "train" split according to `self.validation_percentage`.
101
102        **Returns:**
103        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
104        """
105        dataset_train_and_val = GTSRB(
106            root=self.root_t,
107            split="train",
108            transform=self.train_and_val_transforms(),
109            target_transform=self.target_transform(),
110            download=False,
111        )
112
113        return random_split(
114            dataset_train_and_val,
115            lengths=[1 - self.validation_percentage, self.validation_percentage],
116            generator=torch.Generator().manual_seed(
117                42
118            ),  # the seed is fixed so that the train/val split is identical across experiments; do not tie it to the global seed, which may vary across experiments
119        )
120
121    def test_dataset(self) -> Dataset:
122        r"""Get the test dataset of task `self.task_id`.
123
124        **Returns:**
125        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
126        """
127        dataset_test = GTSRB(
128            root=self.root_t,
129            split="test",
130            transform=self.test_transforms(),
131            target_transform=self.target_transform(),
132            download=False,  # downloading is handled in `prepare_data()`
133        )
134
135        return dataset_test

Permuted GTSRB dataset. The GTSRB dataset is a collection of traffic sign images. It consists of 51,839 color images of 43 different traffic signs (classes).

PermutedGTSRB( root: str, num_tasks: int, validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None)
28    def __init__(
29        self,
30        root: str,
31        num_tasks: int,
32        validation_percentage: float,
33        batch_size: int | dict[int, int] = 1,
34        num_workers: int | dict[int, int] = 0,
35        custom_transforms: (
36            Callable
37            | transforms.Compose
38            | None
39            | dict[int, Callable | transforms.Compose | None]
40        ) = None,
41        repeat_channels: int | None | dict[int, int | None] = None,
42        to_tensor: bool | dict[int, bool] = True,
43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
44        permutation_mode: str = "first_channel_only",
45        permutation_seeds: dict[int, int] | None = None,
46    ) -> None:
47        r"""
48        **Args:**
49        - **root** (`str`): the root directory where the original GTSRB data 'GTSRB/' lives.
50        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
51        - **validation_percentage** (`float`): the fraction of the training data (e.g. `0.1` for 10%) to randomly split off as validation data.
52        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
53        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
54        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
55        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
56        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
57        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
58        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
59        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
60        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
61        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
62        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
63        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
64        - **permutation_mode** (`str`): the mode of permutation; one of:
65            1. 'all': permute all pixels.
66            2. 'by_channel': permute channel by channel separately. The same permutation order is applied to all channels.
67            3. 'first_channel_only': permute only the first channel.
68        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
69        """
70        super().__init__(
71            root=root,
72            num_tasks=num_tasks,
73            batch_size=batch_size,
74            num_workers=num_workers,
75            custom_transforms=custom_transforms,
76            repeat_channels=repeat_channels,
77            to_tensor=to_tensor,
78            resize=resize,
79            permutation_mode=permutation_mode,
80            permutation_seeds=permutation_seeds,
81        )
82
83        self.validation_percentage: float = validation_percentage
84        r"""The fraction of the training data to randomly split off as validation data."""

Args:

  • root (str): the root directory where the original GTSRB data 'GTSRB/' lives.
  • num_tasks (int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to num_tasks.
  • validation_percentage (float): the fraction of the training data (e.g. 0.1 for 10%) to randomly split off as validation data.
  • batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
  • num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
  • custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
  • repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
  • to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
  • resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
  • permutation_mode (str): the mode of permutation; one of:
    1. 'all': permute all pixels.
    2. 'by_channel': permute channel by channel separately. The same permutation order is applied to all channels.
    3. 'first_channel_only': permute only the first channel.
  • permutation_seeds (dict[int, int] | None): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is None, which creates a dict of seeds from 0 to num_tasks-1.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.gtsrb.GTSRB'>

The original dataset class.

validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None:
86    def prepare_data(self) -> None:
87        r"""Download the original GTSRB dataset (train and test splits) if it has not been downloaded yet."""
88
89        if self.task_id != 1:
90            return  # download all original datasets only at the beginning of the first task
91
92        GTSRB(root=self.root_t, split="train", download=True)
93        GTSRB(root=self.root_t, split="test", download=True)
94
95        pylogger.debug(
96            "The original GTSRB dataset has been downloaded to %s.", self.root_t
97        )

Download the original GTSRB dataset if it has not been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
 99    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
100        """Get the training and validation datasets of task `self.task_id`, split from the GTSRB "train" split according to `self.validation_percentage`.
101
102        **Returns:**
103        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
104        """
105        dataset_train_and_val = GTSRB(
106            root=self.root_t,
107            split="train",
108            transform=self.train_and_val_transforms(),
109            target_transform=self.target_transform(),
110            download=False,
111        )
112
113        return random_split(
114            dataset_train_and_val,
115            lengths=[1 - self.validation_percentage, self.validation_percentage],
116            generator=torch.Generator().manual_seed(
117                42
118            ),  # the seed is fixed so that the train/val split is identical across experiments; do not tie it to the global seed, which may vary across experiments
119        )

Get the training and validation dataset of task self.task_id.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset of task self.task_id.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
121    def test_dataset(self) -> Dataset:
122        r"""Get the test dataset of task `self.task_id`, built from the GTSRB "test" split.
123
124        **Returns:**
125        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
126        """
127        dataset_test = GTSRB(
128            root=self.root_t,
129            split="test",
130            transform=self.test_transforms(),
131            target_transform=self.target_transform(),
132            download=False,  # no download is attempted when building the test dataset
133        )
134
135        return dataset_test

Get the test dataset of task self.task_id.

Returns:

  • test_dataset (Dataset): the test dataset of task self.task_id.