clarena.cl_datasets.permuted_SEMEION
The submodule in cl_datasets for Permuted SEMEION dataset.
r"""
The submodule in `cl_datasets` for Permuted SEMEION dataset.
"""

__all__ = ["PermutedSEMEION"]

import logging
from typing import Callable

import torch
from torch.utils.data import Dataset, random_split
from torchvision.datasets import SEMEION
from torchvision.transforms import transforms

from clarena.cl_datasets import CLPermutedDataset

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class PermutedSEMEION(CLPermutedDataset):
    r"""Permuted SEMEION dataset. The [SEMEION dataset](https://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit) is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image.

    This class builds one `SEMEION` instance per call and carves the train, validation and test subsets out of it with fixed-seed random splits, controlled by `test_percentage` and `validation_percentage`.
    """

    original_dataset_python_class: type[Dataset] = SEMEION
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        test_percentage: float,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' lives.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **test_percentage** (`float`): the fraction of the whole dataset randomly split off as test data. It is passed to `random_split` as a fractional length.
        - **validation_percentage** (`float`): the fraction of the remaining (non-test) data randomly split off as validation data. It is passed to `random_split` as a fractional length.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.test_percentage: float = test_percentage
        r"""The fraction of the whole dataset randomly split off as test data."""
        self.validation_percentage: float = validation_percentage
        r"""The fraction of the non-test data randomly split off as validation data."""

    def prepare_data(self) -> None:
        r"""Download the original SEMEION dataset if it has not been downloaded yet."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        SEMEION(root=self.root_t, download=True)

        pylogger.debug(
            "The original SEMEION dataset has been downloaded to %s.", self.root_t
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_all = SEMEION(
            root=self.root_t,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # The seed must stay fixed so the datasets are the same across
        # experiments. Do not tie it to the global seed, which may vary.
        dataset_train_and_val, _ = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        # Fixed seed again, for the same reproducibility reason.
        dataset_train, dataset_val = random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        # `random_split` returns a list; hand back a real tuple as annotated.
        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        The test subset is the second part of the same fixed-seed split used in `train_and_val_dataset()`, so it does not overlap with the train and validation data.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        # Use the test-time transforms: the custom transforms are documented as
        # applying ONLY to the train dataset, so they must not reach test data.
        # NOTE(review): `test_transforms()` is expected to be inherited from the
        # CL dataset base class -- confirm against the installed CLArena version.
        dataset_all = SEMEION(
            root=self.root_t,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # Same lengths and seed as the split in `train_and_val_dataset()`, so
        # the selected indices are the complement of the train/val subset.
        _, dataset_test = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        return dataset_test
class
PermutedSEMEION(clarena.cl_datasets.base.CLPermutedDataset):
class PermutedSEMEION(CLPermutedDataset):
    r"""Permuted SEMEION dataset. The [SEMEION dataset](https://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit) is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image.

    This class builds one `SEMEION` instance per call and carves the train, validation and test subsets out of it with fixed-seed random splits, controlled by `test_percentage` and `validation_percentage`.
    """

    original_dataset_python_class: type[Dataset] = SEMEION
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        test_percentage: float,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' lives.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **test_percentage** (`float`): the fraction of the whole dataset randomly split off as test data. It is passed to `random_split` as a fractional length.
        - **validation_percentage** (`float`): the fraction of the remaining (non-test) data randomly split off as validation data. It is passed to `random_split` as a fractional length.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.test_percentage: float = test_percentage
        r"""The fraction of the whole dataset randomly split off as test data."""
        self.validation_percentage: float = validation_percentage
        r"""The fraction of the non-test data randomly split off as validation data."""

    def prepare_data(self) -> None:
        r"""Download the original SEMEION dataset if it has not been downloaded yet."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        SEMEION(root=self.root_t, download=True)

        pylogger.debug(
            "The original SEMEION dataset has been downloaded to %s.", self.root_t
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_all = SEMEION(
            root=self.root_t,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # The seed must stay fixed so the datasets are the same across
        # experiments. Do not tie it to the global seed, which may vary.
        dataset_train_and_val, _ = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        # Fixed seed again, for the same reproducibility reason.
        dataset_train, dataset_val = random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        # `random_split` returns a list; hand back a real tuple as annotated.
        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        The test subset is the second part of the same fixed-seed split used in `train_and_val_dataset()`, so it does not overlap with the train and validation data.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        # Use the test-time transforms: the custom transforms are documented as
        # applying ONLY to the train dataset, so they must not reach test data.
        # NOTE(review): `test_transforms()` is expected to be inherited from the
        # CL dataset base class -- confirm against the installed CLArena version.
        dataset_all = SEMEION(
            root=self.root_t,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # Same lengths and seed as the split in `train_and_val_dataset()`, so
        # the selected indices are the complement of the train/val subset.
        _, dataset_test = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        return dataset_test
Permuted SEMEION dataset. The SEMEION dataset is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image.
PermutedSEMEION( root: str, num_tasks: int, test_percentage: float, validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None)
def __init__(
    self,
    root: str,
    num_tasks: int,
    test_percentage: float,
    validation_percentage: float,
    batch_size: int | dict[int, int] = 1,
    num_workers: int | dict[int, int] = 0,
    custom_transforms: (
        Callable
        | transforms.Compose
        | None
        | dict[int, Callable | transforms.Compose | None]
    ) = None,
    repeat_channels: int | None | dict[int, int | None] = None,
    to_tensor: bool | dict[int, bool] = True,
    resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
    permutation_mode: str = "first_channel_only",
    permutation_seeds: dict[int, int] | None = None,
) -> None:
    r"""Set up the Permuted SEMEION dataset and remember the split fractions.

    **Args:**
    - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' lives.
    - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
    - **test_percentage** (`float`): the percentage to randomly split some data into test data.
    - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
    - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
    If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
    - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
    If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
    If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
    - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
    If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
    - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
    If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
    - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
    If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
    - **permutation_mode** (`str`): the mode of permutation; one of:
        1. 'all': permute all pixels.
        2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
        3. 'first_channel_only': permute only the first channel.
    - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
    """

    # Everything except the two split fractions is forwarded to the base class.
    base_kwargs = {
        "root": root,
        "num_tasks": num_tasks,
        "batch_size": batch_size,
        "num_workers": num_workers,
        "custom_transforms": custom_transforms,
        "repeat_channels": repeat_channels,
        "to_tensor": to_tensor,
        "resize": resize,
        "permutation_mode": permutation_mode,
        "permutation_seeds": permutation_seeds,
    }
    super().__init__(**base_kwargs)

    self.test_percentage: float = test_percentage
    r"""The percentage to randomly split some data into test data."""
    self.validation_percentage: float = validation_percentage
    r"""The percentage to randomly split some training data into validation data."""
Args:
- root (
str): the root directory where the original SEMEION data 'SEMEION/' lives. - num_tasks (
int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to num_tasks. - test_percentage (
float): the percentage to randomly split some data into test data. - validation_percentage (
float): the percentage to randomly split some training data into validation data. - batch_size (
int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks. - num_workers (
int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks. - custom_transforms (
transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied. - repeat_channels (
int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied. - to_tensor (
bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks. - resize (
tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied. - permutation_mode (
str): the mode of permutation; one of: - 'all': permute all pixels.
- 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
- 'first_channel_only': permute only the first channel.
- permutation_seeds (
dict[int, int] | None): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is None, which creates a dict of seeds from 0 to num_tasks-1.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] =
<class 'torchvision.datasets.semeion.SEMEION'>
The original dataset class.
validation_percentage: float
The percentage to randomly split some training data into validation data.
def
prepare_data(self) -> None:
def prepare_data(self) -> None:
    r"""Download the original SEMEION dataset on the first task only.

    Does nothing when `self.task_id` is not 1; otherwise instantiates `SEMEION` with `download=True` under `self.root_t` and logs the location at debug level.
    """
    first_task = self.task_id == 1
    if first_task:
        # the original data is fetched once, at the beginning of the first task
        SEMEION(root=self.root_t, download=True)
        pylogger.debug(
            "The original SEMEION dataset has been downloaded to %s.", self.root_t
        )
Download the original SEMEION dataset if it has not been downloaded yet.
def
train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
    r"""Get the training and validation dataset of task `self.task_id`.

    The full SEMEION dataset is first split into a train-and-validation part and a test part according to `self.test_percentage`. The first part is then split again according to `self.validation_percentage`.

    **Returns:**
    - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
    """
    dataset_all = SEMEION(
        root=self.root_t,
        transform=self.train_and_val_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    # The seed must stay fixed so the datasets are the same across
    # experiments. Do not tie it to the global seed, which may vary.
    dataset_train_and_val, _ = random_split(
        dataset_all,
        lengths=[1 - self.test_percentage, self.test_percentage],
        generator=torch.Generator().manual_seed(42),
    )

    # Fixed seed again, for the same reproducibility reason.
    dataset_train, dataset_val = random_split(
        dataset_train_and_val,
        lengths=[1 - self.validation_percentage, self.validation_percentage],
        generator=torch.Generator().manual_seed(42),
    )

    # `random_split` returns a list; hand back a real tuple as annotated.
    return dataset_train, dataset_val
Get the training and validation dataset of task self.task_id.
Returns:
- train_and_val_dataset (
tuple[Dataset, Dataset]): the train and validation dataset of task self.task_id.
def
test_dataset(self) -> torch.utils.data.dataset.Dataset:
def test_dataset(self) -> Dataset:
    r"""Get the test dataset of task `self.task_id`.

    The test subset is the second part of the same fixed-seed split used in `train_and_val_dataset()`, so it does not overlap with the train and validation data.

    **Returns:**
    - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
    """
    # Use the test-time transforms: the custom transforms are documented as
    # applying ONLY to the train dataset, so they must not reach test data.
    # NOTE(review): `test_transforms()` is expected to be inherited from the
    # CL dataset base class -- confirm against the installed CLArena version.
    dataset_all = SEMEION(
        root=self.root_t,
        transform=self.test_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    # Same lengths and seed as the split in `train_and_val_dataset()`, so
    # the selected indices are the complement of the train/val subset.
    _, dataset_test = random_split(
        dataset_all,
        lengths=[1 - self.test_percentage, self.test_percentage],
        generator=torch.Generator().manual_seed(42),
    )

    return dataset_test
Get the test dataset of task self.task_id.
Returns:
- test_dataset (
Dataset): the test dataset of task self.task_id.