clarena.cl_datasets.permuted_linnaeus5
The submodule in cl_datasets for Permuted Linnaeus 5 dataset.
r"""
The submodule in `cl_datasets` for Permuted Linnaeus 5 dataset.
"""

__all__ = ["PermutedLinnaeus5"]

import logging
from typing import Callable

import torch
from torch.utils.data import Dataset, random_split
from torchvision.transforms import transforms

from clarena.cl_datasets import CLPermutedDataset
from clarena.stl_datasets.raw import (
    Linnaeus5,
    Linnaeus5_32,
    Linnaeus5_64,
    Linnaeus5_128,
    Linnaeus5_256,
)

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class PermutedLinnaeus5(CLPermutedDataset):
    r"""Permuted Linnaeus 5 dataset. The [Linnaeus 5 dataset](https://chaladze.com/l5/) is a collection of flower images. It consists of 8,000 images of 5 flower species (classes). It provides 256x256, 128x128, 64x64, and 32x32 color images. We support all of them in Permuted Linnaeus 5."""

    def __init__(
        self,
        root: str,
        resolution: str,
        num_tasks: int,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original Linnaeus 5 data 'Linnaeus5/' live.
        - **resolution** (`str`): Image resolution, one of ["256", "128", "64", "32"].
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
            If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
            If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
            If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
            If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
            If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
            If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.

        **Raises:**
        - **ValueError**: if `resolution` is not one of ["256", "128", "64", "32"].
        """
        # map each supported resolution string to its raw dataset class;
        # fail fast on an unsupported resolution instead of leaving
        # `original_dataset_python_class` unset (which would surface later
        # as an obscure AttributeError)
        resolution_to_class: dict[str, type[Dataset]] = {
            "32": Linnaeus5_32,
            "64": Linnaeus5_64,
            "128": Linnaeus5_128,
            "256": Linnaeus5_256,
        }
        try:
            self.original_dataset_python_class: type[Dataset] = resolution_to_class[
                resolution
            ]
            r"""The original dataset class."""
        except KeyError as err:
            raise ValueError(
                f"resolution must be one of ['256', '128', '64', '32'], got {resolution!r}."
            ) from err

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.resolution: str = resolution
        r"""Store the resolution of the original dataset."""

        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original Linnaeus 5 dataset if haven't."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        Linnaeus5(
            root=self.root_t, resolution=self.resolution, train=True, download=True
        )
        Linnaeus5(
            root=self.root_t, resolution=self.resolution, train=False, download=True
        )

        pylogger.debug(
            "The original Linnaeus 5 dataset has been downloaded to %s.",
            self.root_t,
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train_and_val = Linnaeus5(
            root=self.root_t,
            resolution=self.resolution,
            train=True,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(
                42
            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        dataset_test = Linnaeus5(
            root=self.root_t,
            resolution=self.resolution,
            train=False,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test
class
PermutedLinnaeus5(clarena.cl_datasets.base.CLPermutedDataset):
class PermutedLinnaeus5(CLPermutedDataset):
    r"""Permuted Linnaeus 5 dataset. The [Linnaeus 5 dataset](https://chaladze.com/l5/) is a collection of flower images. It consists of 8,000 images of 5 flower species (classes). It provides 256x256, 128x128, 64x64, and 32x32 color images. We support all of them in Permuted Linnaeus 5."""

    def __init__(
        self,
        root: str,
        resolution: str,
        num_tasks: int,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original Linnaeus 5 data 'Linnaeus5/' live.
        - **resolution** (`str`): Image resolution, one of ["256", "128", "64", "32"].
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
            If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
            If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
            If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
            If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
            If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
            If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.

        **Raises:**
        - **ValueError**: if `resolution` is not one of ["256", "128", "64", "32"].
        """
        # map each supported resolution string to its raw dataset class;
        # fail fast on an unsupported resolution instead of leaving
        # `original_dataset_python_class` unset (which would surface later
        # as an obscure AttributeError)
        resolution_to_class: dict[str, type[Dataset]] = {
            "32": Linnaeus5_32,
            "64": Linnaeus5_64,
            "128": Linnaeus5_128,
            "256": Linnaeus5_256,
        }
        try:
            self.original_dataset_python_class: type[Dataset] = resolution_to_class[
                resolution
            ]
            r"""The original dataset class."""
        except KeyError as err:
            raise ValueError(
                f"resolution must be one of ['256', '128', '64', '32'], got {resolution!r}."
            ) from err

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.resolution: str = resolution
        r"""Store the resolution of the original dataset."""

        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original Linnaeus 5 dataset if haven't."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        Linnaeus5(
            root=self.root_t, resolution=self.resolution, train=True, download=True
        )
        Linnaeus5(
            root=self.root_t, resolution=self.resolution, train=False, download=True
        )

        pylogger.debug(
            "The original Linnaeus 5 dataset has been downloaded to %s.",
            self.root_t,
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train_and_val = Linnaeus5(
            root=self.root_t,
            resolution=self.resolution,
            train=True,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(
                42
            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        dataset_test = Linnaeus5(
            root=self.root_t,
            resolution=self.resolution,
            train=False,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test
Permuted Linnaeus 5 dataset. The Linnaeus 5 dataset is a collection of flower images. It consists of 8,000 images of 5 flower species (classes). It provides 256x256, 128x128, 64x64, and 32x32 color images. We support all of them in Permuted Linnaeus 5.
PermutedLinnaeus5( root: str, resolution: str, num_tasks: int, validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None)
31 def __init__( 32 self, 33 root: str, 34 resolution: str, 35 num_tasks: int, 36 validation_percentage: float, 37 batch_size: int | dict[int, int] = 1, 38 num_workers: int | dict[int, int] = 0, 39 custom_transforms: ( 40 Callable 41 | transforms.Compose 42 | None 43 | dict[int, Callable | transforms.Compose | None] 44 ) = None, 45 repeat_channels: int | None | dict[int, int | None] = None, 46 to_tensor: bool | dict[int, bool] = True, 47 resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, 48 permutation_mode: str = "first_channel_only", 49 permutation_seeds: dict[int, int] | None = None, 50 ) -> None: 51 r""" 52 **Args:** 53 - **root** (`str`): the root directory where the original Linnaeus 5 data 'Linnaeus5/' live. 54 - **resolution** (`str`): Image resolution, one of ["256", "128", "64", "32"]. 55 - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`. 56 - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data. 57 - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. 58 If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks. 59 - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. 60 If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks. 61 - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. 62 If it is a dict, the keys are task IDs and the values are the custom transforms for each task. 
If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied. 63 - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. 64 If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied. 65 - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. 66 If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks. 67 - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. 68 If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied. 69 - **permutation_mode** (`str`): the mode of permutation; one of: 70 1. 'all': permute all pixels. 71 2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order. 72 3. 'first_channel_only': permute only the first channel. 73 - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1. 
74 """ 75 76 if resolution == "32": 77 self.original_dataset_python_class: type[Dataset] = Linnaeus5_32 78 elif resolution == "64": 79 self.original_dataset_python_class: type[Dataset] = Linnaeus5_64 80 elif resolution == "128": 81 self.original_dataset_python_class: type[Dataset] = Linnaeus5_128 82 elif resolution == "256": 83 self.original_dataset_python_class: type[Dataset] = Linnaeus5_256 84 r"""The original dataset class.""" 85 86 super().__init__( 87 root=root, 88 num_tasks=num_tasks, 89 batch_size=batch_size, 90 num_workers=num_workers, 91 custom_transforms=custom_transforms, 92 repeat_channels=repeat_channels, 93 to_tensor=to_tensor, 94 resize=resize, 95 permutation_mode=permutation_mode, 96 permutation_seeds=permutation_seeds, 97 ) 98 99 self.resolution: str = resolution 100 r"""Store the resolution of the original dataset.""" 101 102 self.validation_percentage: float = validation_percentage 103 r"""The percentage to randomly split some training data into validation data."""
Args:
- **root** (`str`): the root directory where the original Linnaeus 5 data 'Linnaeus5/' live.
- **resolution** (`str`): image resolution, one of ["256", "128", "64", "32"].
- **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
- **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
- **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
- **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
- **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
- **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
- **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
- **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
- **permutation_mode** (`str`): the mode of permutation; one of:
    1. 'all': permute all pixels.
    2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
    3. 'first_channel_only': permute only the first channel.
- **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
validation_percentage: float
The percentage to randomly split some training data into validation data.
def
prepare_data(self) -> None:
105 def prepare_data(self) -> None: 106 r"""Download the original Linnaeus 5 dataset if haven't.""" 107 108 if self.task_id != 1: 109 return # download all original datasets only at the beginning of first task 110 111 Linnaeus5( 112 root=self.root_t, resolution=self.resolution, train=True, download=True 113 ) 114 Linnaeus5( 115 root=self.root_t, resolution=self.resolution, train=False, download=True 116 ) 117 118 pylogger.debug( 119 "The original Linnaeus 5 dataset has been downloaded to %s.", 120 self.root_t, 121 )
Download the original Linnaeus 5 dataset if haven't.
def
train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
123 def train_and_val_dataset(self) -> tuple[Dataset, Dataset]: 124 """Get the training and validation dataset of task `self.task_id`. 125 126 **Returns:** 127 - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`. 128 """ 129 dataset_train_and_val = Linnaeus5( 130 root=self.root_t, 131 resolution=self.resolution, 132 train=True, 133 transform=self.train_and_val_transforms(), 134 target_transform=self.target_transform(), 135 download=False, 136 ) 137 138 return random_split( 139 dataset_train_and_val, 140 lengths=[1 - self.validation_percentage, self.validation_percentage], 141 generator=torch.Generator().manual_seed( 142 42 143 ), # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments 144 )
Get the training and validation dataset of task self.task_id.
Returns:
- train_and_val_dataset (
`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
def
test_dataset(self) -> torch.utils.data.dataset.Dataset:
146 def test_dataset(self) -> Dataset: 147 r"""Get the test dataset of task `self.task_id`. 148 149 **Returns:** 150 - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`. 151 """ 152 dataset_test = Linnaeus5( 153 root=self.root_t, 154 resolution=self.resolution, 155 train=False, 156 transform=self.test_transforms(), 157 target_transform=self.target_transform(), 158 download=False, 159 ) 160 161 return dataset_test
Get the test dataset of task self.task_id.
Returns:
- test_dataset (
`Dataset`): the test dataset of task `self.task_id`.