clarena.cl_datasets.permuted_sun397

The submodule in cl_datasets for the Permuted SUN397 dataset.

View Source

  1r"""
  2The submodule in `cl_datasets` for Permuted SUN397 dataset.
  3"""
  4
  5__all__ = ["PermutedSUN397"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision.datasets import SUN397
 13from torchvision.transforms import transforms
 14
 15from clarena.cl_datasets import CLPermutedDataset
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class PermutedSUN397(CLPermutedDataset):
 22    r"""Permuted SUN397 dataset. The [SUN397 dataset](https://vision.princeton.edu/projects/2010/SUN) is a collection of scene images. It consists of 108,754 images of 397 classes, each color image."""
 23
 24    original_dataset_python_class: type[Dataset] = SUN397
 25    r"""The original dataset class."""
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        num_tasks: int,
 31        test_percentage: float,
 32        validation_percentage: float,
 33        batch_size: int | dict[int, int] = 1,
 34        num_workers: int | dict[int, int] = 0,
 35        custom_transforms: (
 36            Callable
 37            | transforms.Compose
 38            | None
 39            | dict[int, Callable | transforms.Compose | None]
 40        ) = None,
 41        repeat_channels: int | None | dict[int, int | None] = None,
 42        to_tensor: bool | dict[int, bool] = True,
 43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 44        permutation_mode: str = "first_channel_only",
 45        permutation_seeds: dict[int, int] | None = None,
 46    ) -> None:
 47        r"""
 48        **Args:**
 49        - **root** (`str`): the root directory where the original SUN397 data 'SUN397/' live.
 50        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 51        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
 52        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 53        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 54        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 55        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 56        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 57        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 58        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 59        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 60        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 61        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 62        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 63        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 64        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 65        - **permutation_mode** (`str`): the mode of permutation; one of:
 66            1. 'all': permute all pixels.
 67            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 68            3. 'first_channel_only': permute only the first channel.
 69        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 70        """
 71        super().__init__(
 72            root=root,
 73            num_tasks=num_tasks,
 74            batch_size=batch_size,
 75            num_workers=num_workers,
 76            custom_transforms=custom_transforms,
 77            repeat_channels=repeat_channels,
 78            to_tensor=to_tensor,
 79            resize=resize,
 80            permutation_mode=permutation_mode,
 81            permutation_seeds=permutation_seeds,
 82        )
 83
 84        self.test_percentage: float = test_percentage
 85        r"""The percentage to randomly split some data into test data."""
 86        self.validation_percentage: float = validation_percentage
 87        r"""The percentage to randomly split some training data into validation data."""
 88
 89    def prepare_data(self) -> None:
 90        r"""Download the original SUN397 dataset if haven't."""
 91
 92        if self.task_id != 1:
 93            return  # download all original datasets only at the beginning of first task
 94
 95        SUN397(root=self.root_t, download=True)
 96
 97        pylogger.debug(
 98            "The original SUN397 dataset has been downloaded to %s.", self.root_t
 99        )
100
101    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
102        """Get the training and validation dataset of task `self.task_id`.
103
104        **Returns:**
105        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
106        """
107        dataset_all = SUN397(
108            root=self.root_t,
109            transform=self.train_and_val_transforms(),
110            target_transform=self.target_transform(),
111            download=False,
112        )
113
114        dataset_train_and_val, _ = random_split(
115            dataset_all,
116            lengths=[
117                1 - self.test_percentage,
118                self.test_percentage,
119            ],
120            generator=torch.Generator().manual_seed(
121                42
122            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
123        )
124
125        return random_split(
126            dataset_train_and_val,
127            lengths=[1 - self.validation_percentage, self.validation_percentage],
128            generator=torch.Generator().manual_seed(
129                42
130            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
131        )
132
133    def test_dataset(self) -> Dataset:
134        r"""Get the test dataset of task `self.task_id`.
135
136        **Returns:**
137        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
138        """
139        dataset_all = SUN397(
140            root=self.root_t,
141            transform=self.train_and_val_transforms(),
142            target_transform=self.target_transform(),
143            download=False,
144        )
145
146        _, dataset_test = random_split(
147            dataset_all,
148            lengths=[1 - self.test_percentage, self.test_percentage],
149            generator=torch.Generator().manual_seed(42),
150        )
151
152        return dataset_test

class PermutedSUN397(clarena.cl_datasets.base.CLPermutedDataset): View Source

 22class PermutedSUN397(CLPermutedDataset):
 23    r"""Permuted SUN397 dataset. The [SUN397 dataset](https://vision.princeton.edu/projects/2010/SUN) is a collection of scene images. It consists of 108,754 images of 397 classes, each color image."""
 24
 25    original_dataset_python_class: type[Dataset] = SUN397
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        num_tasks: int,
 32        test_percentage: float,
 33        validation_percentage: float,
 34        batch_size: int | dict[int, int] = 1,
 35        num_workers: int | dict[int, int] = 0,
 36        custom_transforms: (
 37            Callable
 38            | transforms.Compose
 39            | None
 40            | dict[int, Callable | transforms.Compose | None]
 41        ) = None,
 42        repeat_channels: int | None | dict[int, int | None] = None,
 43        to_tensor: bool | dict[int, bool] = True,
 44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 45        permutation_mode: str = "first_channel_only",
 46        permutation_seeds: dict[int, int] | None = None,
 47    ) -> None:
 48        r"""
 49        **Args:**
 50        - **root** (`str`): the root directory where the original SUN397 data 'SUN397/' live.
 51        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 52        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
 53        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 54        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 55        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 56        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 57        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 58        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 59        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 60        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 61        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 62        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 63        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 64        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 65        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 66        - **permutation_mode** (`str`): the mode of permutation; one of:
 67            1. 'all': permute all pixels.
 68            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 69            3. 'first_channel_only': permute only the first channel.
 70        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 71        """
 72        super().__init__(
 73            root=root,
 74            num_tasks=num_tasks,
 75            batch_size=batch_size,
 76            num_workers=num_workers,
 77            custom_transforms=custom_transforms,
 78            repeat_channels=repeat_channels,
 79            to_tensor=to_tensor,
 80            resize=resize,
 81            permutation_mode=permutation_mode,
 82            permutation_seeds=permutation_seeds,
 83        )
 84
 85        self.test_percentage: float = test_percentage
 86        r"""The percentage to randomly split some data into test data."""
 87        self.validation_percentage: float = validation_percentage
 88        r"""The percentage to randomly split some training data into validation data."""
 89
 90    def prepare_data(self) -> None:
 91        r"""Download the original SUN397 dataset if haven't."""
 92
 93        if self.task_id != 1:
 94            return  # download all original datasets only at the beginning of first task
 95
 96        SUN397(root=self.root_t, download=True)
 97
 98        pylogger.debug(
 99            "The original SUN397 dataset has been downloaded to %s.", self.root_t
100        )
101
102    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
103        """Get the training and validation dataset of task `self.task_id`.
104
105        **Returns:**
106        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
107        """
108        dataset_all = SUN397(
109            root=self.root_t,
110            transform=self.train_and_val_transforms(),
111            target_transform=self.target_transform(),
112            download=False,
113        )
114
115        dataset_train_and_val, _ = random_split(
116            dataset_all,
117            lengths=[
118                1 - self.test_percentage,
119                self.test_percentage,
120            ],
121            generator=torch.Generator().manual_seed(
122                42
123            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
124        )
125
126        return random_split(
127            dataset_train_and_val,
128            lengths=[1 - self.validation_percentage, self.validation_percentage],
129            generator=torch.Generator().manual_seed(
130                42
131            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
132        )
133
134    def test_dataset(self) -> Dataset:
135        r"""Get the test dataset of task `self.task_id`.
136
137        **Returns:**
138        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
139        """
140        dataset_all = SUN397(
141            root=self.root_t,
142            transform=self.train_and_val_transforms(),
143            target_transform=self.target_transform(),
144            download=False,
145        )
146
147        _, dataset_test = random_split(
148            dataset_all,
149            lengths=[1 - self.test_percentage, self.test_percentage],
150            generator=torch.Generator().manual_seed(42),
151        )
152
153        return dataset_test

Permuted SUN397 dataset. The SUN397 dataset is a collection of scene images. It consists of 108,754 color images in 397 classes.

PermutedSUN397( root: str, num_tasks: int, test_percentage: float, validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None) View Source

28    def __init__(
29        self,
30        root: str,
31        num_tasks: int,
32        test_percentage: float,
33        validation_percentage: float,
34        batch_size: int | dict[int, int] = 1,
35        num_workers: int | dict[int, int] = 0,
36        custom_transforms: (
37            Callable
38            | transforms.Compose
39            | None
40            | dict[int, Callable | transforms.Compose | None]
41        ) = None,
42        repeat_channels: int | None | dict[int, int | None] = None,
43        to_tensor: bool | dict[int, bool] = True,
44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
45        permutation_mode: str = "first_channel_only",
46        permutation_seeds: dict[int, int] | None = None,
47    ) -> None:
48        r"""
49        **Args:**
50        - **root** (`str`): the root directory where the original SUN397 data 'SUN397/' live.
51        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
52        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
53        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
54        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
55        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
56        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
57        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
58        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
59        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
60        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
61        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
62        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
63        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
64        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
65        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
66        - **permutation_mode** (`str`): the mode of permutation; one of:
67            1. 'all': permute all pixels.
68            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
69            3. 'first_channel_only': permute only the first channel.
70        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
71        """
72        super().__init__(
73            root=root,
74            num_tasks=num_tasks,
75            batch_size=batch_size,
76            num_workers=num_workers,
77            custom_transforms=custom_transforms,
78            repeat_channels=repeat_channels,
79            to_tensor=to_tensor,
80            resize=resize,
81            permutation_mode=permutation_mode,
82            permutation_seeds=permutation_seeds,
83        )
84
85        self.test_percentage: float = test_percentage
86        r"""The percentage to randomly split some data into test data."""
87        self.validation_percentage: float = validation_percentage
88        r"""The percentage to randomly split some training data into validation data."""

Args:

root (str): the root directory where the original SUN397 data 'SUN397/' lives.
num_tasks (int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to num_tasks.
test_percentage (float): the fraction (between 0 and 1) of the whole dataset that is randomly split off as test data.
validation_percentage (float): the fraction (between 0 and 1) of the remaining (non-test) data that is randomly split off as validation data.
batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
permutation_mode (str): the mode of permutation; one of:
1. 'all': permute all pixels.
2. 'by_channel': permute channel by channel separately. The same permutation order is applied to all channels.
3. 'first_channel_only': permute only the first channel.
permutation_seeds (dict[int, int] | None): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is None, which creates a dict of seeds from 0 to num_tasks-1.

original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.sun397.SUN397'>

The original dataset class.

test_percentage: float

The percentage to randomly split some data into test data.

validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None: View Source

 90    def prepare_data(self) -> None:
 91        r"""Download the original SUN397 dataset if haven't."""
 92
 93        if self.task_id != 1:
 94            return  # download all original datasets only at the beginning of first task
 95
 96        SUN397(root=self.root_t, download=True)
 97
 98        pylogger.debug(
 99            "The original SUN397 dataset has been downloaded to %s.", self.root_t
100        )

Download the original SUN397 dataset if it has not been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]: View Source

102    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
103        """Get the training and validation dataset of task `self.task_id`.
104
105        **Returns:**
106        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
107        """
108        dataset_all = SUN397(
109            root=self.root_t,
110            transform=self.train_and_val_transforms(),
111            target_transform=self.target_transform(),
112            download=False,
113        )
114
115        dataset_train_and_val, _ = random_split(
116            dataset_all,
117            lengths=[
118                1 - self.test_percentage,
119                self.test_percentage,
120            ],
121            generator=torch.Generator().manual_seed(
122                42
123            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
124        )
125
126        return random_split(
127            dataset_train_and_val,
128            lengths=[1 - self.validation_percentage, self.validation_percentage],
129            generator=torch.Generator().manual_seed(
130                42
131            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
132        )

Get the training and validation datasets of task self.task_id.

Returns:

train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset of task self.task_id.

def test_dataset(self) -> torch.utils.data.dataset.Dataset: View Source

134    def test_dataset(self) -> Dataset:
135        r"""Get the test dataset of task `self.task_id`.
136
137        **Returns:**
138        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
139        """
140        dataset_all = SUN397(
141            root=self.root_t,
142            transform=self.train_and_val_transforms(),
143            target_transform=self.target_transform(),
144            download=False,
145        )
146
147        _, dataset_test = random_split(
148            dataset_all,
149            lengths=[1 - self.test_percentage, self.test_percentage],
150            generator=torch.Generator().manual_seed(42),
151        )
152
153        return dataset_test

Get the test dataset of task self.task_id.

Returns:

test_dataset (Dataset): the test dataset of task self.task_id.