clarena.cl_datasets.permuted_imagenette
The submodule in cl_datasets for Permuted Imagenette dataset.
1r""" 2The submodule in `cl_datasets` for Permuted Imagenette dataset. 3""" 4 5__all__ = ["PermutedImagenette"] 6 7import logging 8from typing import Callable 9 10import torch 11from torch.utils.data import Dataset, random_split 12from torchvision.datasets import Imagenette 13from torchvision.transforms import transforms 14 15from clarena.cl_datasets import CLPermutedDataset 16 17# always get logger for built-in logging in each module 18pylogger = logging.getLogger(__name__) 19 20 21class PermutedImagenette(CLPermutedDataset): 22 r"""Permuted Imagenette dataset. The [Imagenette dataset](https://github.com/fastai/imagenette) is a subset of 10 easily classified classes from [Imagenet](https://www.image-net.org). It provides full sizes (as Imagenet), and resized 320x320 and 160x160. We support all of them in Permuted Imagenette.""" 23 24 original_dataset_python_class: type[Dataset] = Imagenette 25 r"""The original dataset class.""" 26 27 def __init__( 28 self, 29 root: str, 30 size: str, 31 num_tasks: int, 32 validation_percentage: float, 33 batch_size: int | dict[int, int] = 1, 34 num_workers: int | dict[int, int] = 0, 35 custom_transforms: ( 36 Callable 37 | transforms.Compose 38 | None 39 | dict[int, Callable | transforms.Compose | None] 40 ) = None, 41 repeat_channels: int | None | dict[int, int | None] = None, 42 to_tensor: bool | dict[int, bool] = True, 43 resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, 44 permutation_mode: str = "first_channel_only", 45 permutation_seeds: dict[int, int] | None = None, 46 ) -> None: 47 r""" 48 **Args:** 49 - **root** (`str`): the root directory where the original Imagenette data 'Imagenette/' live. 50 - **size** (`str`): image size type. Supports "full" (default), "320px", and "160px". 51 - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`. 
52 - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data. 53 - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. 54 If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks. 55 - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. 56 If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks. 57 - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. 58 If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied. 59 - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. 60 If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied. 61 - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. 62 If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks. 63 - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. 
64 If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied. 65 - **permutation_mode** (`str`): the mode of permutation; one of: 66 1. 'all': permute all pixels. 67 2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order. 68 3. 'first_channel_only': permute only the first channel. 69 - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1. 70 """ 71 72 super().__init__( 73 root=root, 74 num_tasks=num_tasks, 75 batch_size=batch_size, 76 num_workers=num_workers, 77 custom_transforms=custom_transforms, 78 repeat_channels=repeat_channels, 79 to_tensor=to_tensor, 80 resize=resize, 81 permutation_mode=permutation_mode, 82 permutation_seeds=permutation_seeds, 83 ) 84 85 self.size: str = size 86 r"""The size type of image.""" 87 88 self.validation_percentage: float = validation_percentage 89 r"""The percentage to randomly split some training data into validation data.""" 90 91 def prepare_data(self) -> None: 92 r"""Download the original Imagenette dataset if haven't.""" 93 94 if self.task_id != 1: 95 return # download all original datasets only at the beginning of first task 96 97 Imagenette(root=self.root_t, split="train", size=self.size, download=True) 98 Imagenette(root=self.root_t, split="val", size=self.size, download=True) 99 100 pylogger.debug( 101 "The original Imagenette dataset has been downloaded to %s.", 102 self.root_t, 103 ) 104 105 def train_and_val_dataset(self) -> tuple[Dataset, Dataset]: 106 """Get the training and validation dataset of task `self.task_id`. 
107 108 **Returns:** 109 - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`. 110 """ 111 dataset_train_and_val = Imagenette( 112 root=self.root_t, 113 split="train", 114 size=self.size, 115 transform=self.train_and_val_transforms(), 116 target_transform=self.target_transform(), 117 download=False, 118 ) 119 120 return random_split( 121 dataset_train_and_val, 122 lengths=[1 - self.validation_percentage, self.validation_percentage], 123 generator=torch.Generator().manual_seed( 124 42 125 ), # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments 126 ) 127 128 def test_dataset(self) -> Dataset: 129 r"""Get the test dataset of task `self.task_id`. 130 131 **Returns:** 132 - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`. 133 """ 134 135 dataset_test = Imagenette( 136 root=self.root_t, 137 split="test", 138 size=self.size, 139 transform=self.test_transforms(), 140 target_transform=self.target_transform(), 141 download=False, 142 ) 143 144 return dataset_test
class PermutedImagenette(CLPermutedDataset):
    r"""Permuted Imagenette dataset. The [Imagenette dataset](https://github.com/fastai/imagenette) is a subset of 10 easily classified classes from [Imagenet](https://www.image-net.org). It provides full sizes (as Imagenet), and resized 320x320 and 160x160. We support all of them in Permuted Imagenette."""

    original_dataset_python_class: type[Dataset] = Imagenette
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        size: str,
        num_tasks: int,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original Imagenette data 'Imagenette/' live.
        - **size** (`str`): image size type. Supports "full", "320px", and "160px".
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
            If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
            If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
            If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
            If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
            If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
            If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.size: str = size
        r"""The size type of image."""

        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original Imagenette dataset if it hasn't been downloaded yet."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        Imagenette(root=self.root_t, split="train", size=self.size, download=True)
        Imagenette(root=self.root_t, split="val", size=self.size, download=True)

        pylogger.debug(
            "The original Imagenette dataset has been downloaded to %s.",
            self.root_t,
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train_and_val = Imagenette(
            root=self.root_t,
            split="train",
            size=self.size,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(
                42
            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't hand it to the global seed as it might vary across experiments
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """

        # BUGFIX: torchvision's Imagenette only provides the "train" and "val"
        # splits (prepare_data above downloads exactly those two); requesting
        # split="test" raises ValueError. Use the held-out "val" split as the
        # test set, which is the standard practice for Imagenette.
        dataset_test = Imagenette(
            root=self.root_t,
            split="val",
            size=self.size,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test
Permuted Imagenette dataset. The Imagenette dataset is a subset of 10 easily classified classes from Imagenet. It provides full sizes (as Imagenet), and resized 320x320 and 160x160. We support all of them in Permuted Imagenette.
def __init__(
    self,
    root: str,
    size: str,
    num_tasks: int,
    validation_percentage: float,
    batch_size: int | dict[int, int] = 1,
    num_workers: int | dict[int, int] = 0,
    custom_transforms: (
        Callable
        | transforms.Compose
        | None
        | dict[int, Callable | transforms.Compose | None]
    ) = None,
    repeat_channels: int | None | dict[int, int | None] = None,
    to_tensor: bool | dict[int, bool] = True,
    resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
    permutation_mode: str = "first_channel_only",
    permutation_seeds: dict[int, int] | None = None,
) -> None:
    r"""Initialise the Permuted Imagenette continual-learning dataset.

    **Args:**
    - **root** (`str`): root directory under which the original 'Imagenette/' data live.
    - **size** (`str`): image size variant; one of "full", "320px", "160px".
    - **num_tasks** (`int`): maximum number of tasks; valid task IDs run from 1 to `num_tasks`.
    - **validation_percentage** (`float`): fraction of the training data split off as validation data.
    - **batch_size** (`int` | `dict[int, int]`): batch size for the train, val, and test dataloaders. A dict maps task IDs to per-task values; a plain `int` applies to every task.
    - **num_workers** (`int` | `dict[int, int]`): number of dataloader workers. A dict maps task IDs to per-task values; a plain `int` applies to every task.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): extra transforms applied ONLY to the TRAIN dataset. `ToTensor()`, normalization, and the permutation are handled separately. A dict maps task IDs to per-task values; `None` disables custom transforms.
    - **repeat_channels** (`int` | `None` | dict of them): number of channel repeats per task. A dict maps task IDs to per-task values; `None` means no repeat.
    - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. A dict maps task IDs to per-task values.
    - **resize** (`tuple[int, int]` | `None` or dict of them): target size for resizing. A dict maps task IDs to per-task values; `None` means no resize.
    - **permutation_mode** (`str`): one of 'all' (permute all pixels), 'by_channel' (same permutation applied to each channel separately), or 'first_channel_only' (permute only the first channel).
    - **permutation_seeds** (`dict[int, int]` | `None`): permutation seed per task ID; `None` generates seeds 0 to `num_tasks`-1.
    """
    super().__init__(
        root=root,
        num_tasks=num_tasks,
        batch_size=batch_size,
        num_workers=num_workers,
        custom_transforms=custom_transforms,
        repeat_channels=repeat_channels,
        to_tensor=to_tensor,
        resize=resize,
        permutation_mode=permutation_mode,
        permutation_seeds=permutation_seeds,
    )

    self.validation_percentage: float = validation_percentage
    r"""The percentage to randomly split some training data into validation data."""

    self.size: str = size
    r"""The size type of image."""
Args:
- root (
str): the root directory where the original Imagenette data 'Imagenette/' live. - size (
str): image size type. Supports "full" (default), "320px", and "160px". - num_tasks (
int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`. - validation_percentage (
float): the percentage to randomly split some training data into validation data. - batch_size (
int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks. - num_workers (
int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks. - custom_transforms (
`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied. - repeat_channels (
`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied. - to_tensor (
`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks. - resize (
`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied. - permutation_mode (
str): the mode of permutation; one of: - 'all': permute all pixels.
- 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
- 'first_channel_only': permute only the first channel.
- permutation_seeds (
`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] =
<class 'torchvision.datasets.imagenette.Imagenette'>
The original dataset class.
validation_percentage: float
The percentage to randomly split some training data into validation data.
def prepare_data(self) -> None:
    r"""Download the original Imagenette dataset if it hasn't been downloaded yet."""

    # The original data are shared by all tasks, so only the first task
    # triggers the download.
    if self.task_id != 1:
        return

    for split in ("train", "val"):
        Imagenette(root=self.root_t, split=split, size=self.size, download=True)

    pylogger.debug(
        "The original Imagenette dataset has been downloaded to %s.",
        self.root_t,
    )
Download the original Imagenette dataset if it hasn't been downloaded yet.
def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
    r"""Get the training and validation dataset of task `self.task_id`.

    **Returns:**
    - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
    """
    full_train = Imagenette(
        root=self.root_t,
        split="train",
        size=self.size,
        transform=self.train_and_val_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    val_fraction = self.validation_percentage
    # The generator seed is fixed (not tied to the global seed, which may vary
    # across experiments) so the train/val split is identical in every run.
    split_generator = torch.Generator().manual_seed(42)

    return random_split(
        full_train,
        lengths=[1 - val_fraction, val_fraction],
        generator=split_generator,
    )
Get the training and validation dataset of task self.task_id.
Returns:
- train_and_val_dataset (
`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
def test_dataset(self) -> Dataset:
    r"""Get the test dataset of task `self.task_id`.

    **Returns:**
    - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
    """

    # BUGFIX: torchvision's Imagenette only provides the "train" and "val"
    # splits (prepare_data downloads exactly those two); requesting
    # split="test" raises ValueError. Use the held-out "val" split as the
    # test set, which is the standard practice for Imagenette.
    dataset_test = Imagenette(
        root=self.root_t,
        split="val",
        size=self.size,
        transform=self.test_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    return dataset_test
Get the test dataset of task self.task_id.
Returns:
- test_dataset (
`Dataset`): the test dataset of task `self.task_id`.