clarena.cl_datasets.permuted_imagenette

The submodule in cl_datasets for Permuted Imagenette dataset.

  1r"""
  2The submodule in `cl_datasets` for Permuted Imagenette dataset.
  3"""
  4
  5__all__ = ["PermutedImagenette"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision.datasets import Imagenette
 13from torchvision.transforms import transforms
 14
 15from clarena.cl_datasets import CLPermutedDataset
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class PermutedImagenette(CLPermutedDataset):
 22    r"""Permuted Imagenette dataset. The [Imagenette dataset](https://github.com/fastai/imagenette) is a subset of 10 easily classified classes from [Imagenet](https://www.image-net.org). It provides full sizes (as Imagenet), and resized 320x320 and 160x160. We support all of them in Permuted Imagenette."""
 23
 24    original_dataset_python_class: type[Dataset] = Imagenette
 25    r"""The original dataset class."""
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        size: str,
 31        num_tasks: int,
 32        validation_percentage: float,
 33        batch_size: int | dict[int, int] = 1,
 34        num_workers: int | dict[int, int] = 0,
 35        custom_transforms: (
 36            Callable
 37            | transforms.Compose
 38            | None
 39            | dict[int, Callable | transforms.Compose | None]
 40        ) = None,
 41        repeat_channels: int | None | dict[int, int | None] = None,
 42        to_tensor: bool | dict[int, bool] = True,
 43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 44        permutation_mode: str = "first_channel_only",
 45        permutation_seeds: dict[int, int] | None = None,
 46    ) -> None:
 47        r"""
 48        **Args:**
 49        - **root** (`str`): the root directory where the original Imagenette data 'Imagenette/' live.
 50        - **size** (`str`): image size type. Supports "full" (default), "320px", and "160px".
 51        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 52        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 53        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 54        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 55        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 56        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 57        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 58        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 59        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 60        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 61        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 62        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 63        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 64        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 65        - **permutation_mode** (`str`): the mode of permutation; one of:
 66            1. 'all': permute all pixels.
 67            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 68            3. 'first_channel_only': permute only the first channel.
 69        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 70        """
 71
 72        super().__init__(
 73            root=root,
 74            num_tasks=num_tasks,
 75            batch_size=batch_size,
 76            num_workers=num_workers,
 77            custom_transforms=custom_transforms,
 78            repeat_channels=repeat_channels,
 79            to_tensor=to_tensor,
 80            resize=resize,
 81            permutation_mode=permutation_mode,
 82            permutation_seeds=permutation_seeds,
 83        )
 84
 85        self.size: str = size
 86        r"""The size type of image."""
 87
 88        self.validation_percentage: float = validation_percentage
 89        r"""The percentage to randomly split some training data into validation data."""
 90
 91    def prepare_data(self) -> None:
 92        r"""Download the original Imagenette dataset if haven't."""
 93
 94        if self.task_id != 1:
 95            return  # download all original datasets only at the beginning of first task
 96
 97        Imagenette(root=self.root_t, split="train", size=self.size, download=True)
 98        Imagenette(root=self.root_t, split="val", size=self.size, download=True)
 99
100        pylogger.debug(
101            "The original Imagenette dataset has been downloaded to %s.",
102            self.root_t,
103        )
104
105    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
106        """Get the training and validation dataset of task `self.task_id`.
107
108        **Returns:**
109        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
110        """
111        dataset_train_and_val = Imagenette(
112            root=self.root_t,
113            split="train",
114            size=self.size,
115            transform=self.train_and_val_transforms(),
116            target_transform=self.target_transform(),
117            download=False,
118        )
119
120        return random_split(
121            dataset_train_and_val,
122            lengths=[1 - self.validation_percentage, self.validation_percentage],
123            generator=torch.Generator().manual_seed(
124                42
125            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
126        )
127
128    def test_dataset(self) -> Dataset:
129        r"""Get the test dataset of task `self.task_id`.
130
131        **Returns:**
132        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
133        """
134
135        dataset_test = Imagenette(
136            root=self.root_t,
137            split="test",
138            size=self.size,
139            transform=self.test_transforms(),
140            target_transform=self.target_transform(),
141            download=False,
142        )
143
144        return dataset_test
class PermutedImagenette(clarena.cl_datasets.base.CLPermutedDataset):
 22class PermutedImagenette(CLPermutedDataset):
 23    r"""Permuted Imagenette dataset. The [Imagenette dataset](https://github.com/fastai/imagenette) is a subset of 10 easily classified classes from [Imagenet](https://www.image-net.org). It provides full sizes (as Imagenet), and resized 320x320 and 160x160. We support all of them in Permuted Imagenette."""
 24
 25    original_dataset_python_class: type[Dataset] = Imagenette
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        size: str,
 32        num_tasks: int,
 33        validation_percentage: float,
 34        batch_size: int | dict[int, int] = 1,
 35        num_workers: int | dict[int, int] = 0,
 36        custom_transforms: (
 37            Callable
 38            | transforms.Compose
 39            | None
 40            | dict[int, Callable | transforms.Compose | None]
 41        ) = None,
 42        repeat_channels: int | None | dict[int, int | None] = None,
 43        to_tensor: bool | dict[int, bool] = True,
 44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 45        permutation_mode: str = "first_channel_only",
 46        permutation_seeds: dict[int, int] | None = None,
 47    ) -> None:
 48        r"""
 49        **Args:**
 50        - **root** (`str`): the root directory where the original Imagenette data 'Imagenette/' live.
 51        - **size** (`str`): image size type. Supports "full" (default), "320px", and "160px".
 52        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 53        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 54        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 55        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 56        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 57        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 58        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 59        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 60        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 61        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 62        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 63        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 64        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 65        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 66        - **permutation_mode** (`str`): the mode of permutation; one of:
 67            1. 'all': permute all pixels.
 68            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 69            3. 'first_channel_only': permute only the first channel.
 70        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 71        """
 72
 73        super().__init__(
 74            root=root,
 75            num_tasks=num_tasks,
 76            batch_size=batch_size,
 77            num_workers=num_workers,
 78            custom_transforms=custom_transforms,
 79            repeat_channels=repeat_channels,
 80            to_tensor=to_tensor,
 81            resize=resize,
 82            permutation_mode=permutation_mode,
 83            permutation_seeds=permutation_seeds,
 84        )
 85
 86        self.size: str = size
 87        r"""The size type of image."""
 88
 89        self.validation_percentage: float = validation_percentage
 90        r"""The percentage to randomly split some training data into validation data."""
 91
 92    def prepare_data(self) -> None:
 93        r"""Download the original Imagenette dataset if haven't."""
 94
 95        if self.task_id != 1:
 96            return  # download all original datasets only at the beginning of first task
 97
 98        Imagenette(root=self.root_t, split="train", size=self.size, download=True)
 99        Imagenette(root=self.root_t, split="val", size=self.size, download=True)
100
101        pylogger.debug(
102            "The original Imagenette dataset has been downloaded to %s.",
103            self.root_t,
104        )
105
106    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
107        """Get the training and validation dataset of task `self.task_id`.
108
109        **Returns:**
110        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
111        """
112        dataset_train_and_val = Imagenette(
113            root=self.root_t,
114            split="train",
115            size=self.size,
116            transform=self.train_and_val_transforms(),
117            target_transform=self.target_transform(),
118            download=False,
119        )
120
121        return random_split(
122            dataset_train_and_val,
123            lengths=[1 - self.validation_percentage, self.validation_percentage],
124            generator=torch.Generator().manual_seed(
125                42
126            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
127        )
128
129    def test_dataset(self) -> Dataset:
130        r"""Get the test dataset of task `self.task_id`.
131
132        **Returns:**
133        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
134        """
135
136        dataset_test = Imagenette(
137            root=self.root_t,
138            split="test",
139            size=self.size,
140            transform=self.test_transforms(),
141            target_transform=self.target_transform(),
142            download=False,
143        )
144
145        return dataset_test

Permuted Imagenette dataset. The Imagenette dataset is a subset of 10 easily classified classes from Imagenet. It provides full sizes (as Imagenet), and resized 320x320 and 160x160. We support all of them in Permuted Imagenette.

PermutedImagenette( root: str, size: str, num_tasks: int, validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None)
28    def __init__(
29        self,
30        root: str,
31        size: str,
32        num_tasks: int,
33        validation_percentage: float,
34        batch_size: int | dict[int, int] = 1,
35        num_workers: int | dict[int, int] = 0,
36        custom_transforms: (
37            Callable
38            | transforms.Compose
39            | None
40            | dict[int, Callable | transforms.Compose | None]
41        ) = None,
42        repeat_channels: int | None | dict[int, int | None] = None,
43        to_tensor: bool | dict[int, bool] = True,
44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
45        permutation_mode: str = "first_channel_only",
46        permutation_seeds: dict[int, int] | None = None,
47    ) -> None:
48        r"""
49        **Args:**
50        - **root** (`str`): the root directory where the original Imagenette data 'Imagenette/' live.
51        - **size** (`str`): image size type. Supports "full" (default), "320px", and "160px".
52        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
53        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
54        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
55        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
56        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
57        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
58        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
59        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
60        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
61        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
62        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
63        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
64        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
65        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
66        - **permutation_mode** (`str`): the mode of permutation; one of:
67            1. 'all': permute all pixels.
68            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
69            3. 'first_channel_only': permute only the first channel.
70        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
71        """
72
73        super().__init__(
74            root=root,
75            num_tasks=num_tasks,
76            batch_size=batch_size,
77            num_workers=num_workers,
78            custom_transforms=custom_transforms,
79            repeat_channels=repeat_channels,
80            to_tensor=to_tensor,
81            resize=resize,
82            permutation_mode=permutation_mode,
83            permutation_seeds=permutation_seeds,
84        )
85
86        self.size: str = size
87        r"""The size type of image."""
88
89        self.validation_percentage: float = validation_percentage
90        r"""The percentage to randomly split some training data into validation data."""

Args:

  • root (str): the root directory where the original Imagenette data 'Imagenette/' live.
  • size (str): image size type. Supports "full" (default), "320px", and "160px".
  • num_tasks (int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to num_tasks.
  • validation_percentage (float): the percentage to randomly split some training data into validation data.
  • batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
  • num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
  • custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
  • repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
  • to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
  • resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
  • permutation_mode (str): the mode of permutation; one of:
    1. 'all': permute all pixels.
    2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
    3. 'first_channel_only': permute only the first channel.
  • permutation_seeds (dict[int, int] | None): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is None, which creates a dict of seeds from 0 to num_tasks-1.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.imagenette.Imagenette'>

The original dataset class.

size: str

The size type of image.

validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None:
 92    def prepare_data(self) -> None:
 93        r"""Download the original Imagenette dataset if haven't."""
 94
 95        if self.task_id != 1:
 96            return  # download all original datasets only at the beginning of first task
 97
 98        Imagenette(root=self.root_t, split="train", size=self.size, download=True)
 99        Imagenette(root=self.root_t, split="val", size=self.size, download=True)
100
101        pylogger.debug(
102            "The original Imagenette dataset has been downloaded to %s.",
103            self.root_t,
104        )

Download the original Imagenette dataset if it hasn't been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
106    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
107        """Get the training and validation dataset of task `self.task_id`.
108
109        **Returns:**
110        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
111        """
112        dataset_train_and_val = Imagenette(
113            root=self.root_t,
114            split="train",
115            size=self.size,
116            transform=self.train_and_val_transforms(),
117            target_transform=self.target_transform(),
118            download=False,
119        )
120
121        return random_split(
122            dataset_train_and_val,
123            lengths=[1 - self.validation_percentage, self.validation_percentage],
124            generator=torch.Generator().manual_seed(
125                42
126            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
127        )

Get the training and validation dataset of task self.task_id.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset of task self.task_id.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
129    def test_dataset(self) -> Dataset:
130        r"""Get the test dataset of task `self.task_id`.
131
132        **Returns:**
133        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
134        """
135
136        dataset_test = Imagenette(
137            root=self.root_t,
138            split="val",  # Imagenette has no "test" split; "val" serves as the test set
139            size=self.size,
140            transform=self.test_transforms(),
141            target_transform=self.target_transform(),
142            download=False,
143        )
144
145        return dataset_test

Get the test dataset of task self.task_id.

Returns:

  • test_dataset (Dataset): the test dataset of task self.task_id.