clarena.cl_datasets.permuted_emnist
The submodule in cl_datasets for Permuted EMNIST dataset.
1r""" 2The submodule in `cl_datasets` for Permuted EMNIST dataset. 3""" 4 5__all__ = ["PermutedEMNIST"] 6 7import logging 8from typing import Callable 9 10import torch 11from torch.utils.data import Dataset, random_split 12from torchvision.datasets import EMNIST 13from torchvision.transforms import transforms 14 15from clarena.cl_datasets import CLPermutedDataset 16from clarena.stl_datasets.raw import ( 17 EMNISTBalanced, 18 EMNISTByClass, 19 EMNISTByMerge, 20 EMNISTDigits, 21 EMNISTLetters, 22) 23 24# always get logger for built-in logging in each module 25pylogger = logging.getLogger(__name__) 26 27 28class PermutedEMNIST(CLPermutedDataset): 29 r"""Permuted EMNIST dataset. The [EMNIST dataset](https://www.nist.gov/itl/products-and-services/emnist-dataset/) is a collection of handwritten letters and digits (including A-Z, a-z, 0-9). It consists of 814,255 images in 62 classes, each 28x28 grayscale image. 30 31 EMNIST has 6 different splits: `byclass`, `bymerge`, `balanced`, `letters`, `digits` and `mnist`, each containing a different subset of the original collection. We support all of them in Permuted EMNIST. 32 """ 33 34 def __init__( 35 self, 36 root: str, 37 split: str, 38 num_tasks: int, 39 validation_percentage: float, 40 batch_size: int | dict[int, int] = 1, 41 num_workers: int | dict[int, int] = 0, 42 custom_transforms: ( 43 Callable 44 | transforms.Compose 45 | None 46 | dict[int, Callable | transforms.Compose | None] 47 ) = None, 48 repeat_channels: int | None | dict[int, int | None] = None, 49 to_tensor: bool | dict[int, bool] = True, 50 resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, 51 permutation_mode: str = "first_channel_only", 52 permutation_seeds: dict[int, int] | None = None, 53 ) -> None: 54 r""" 55 **Args:** 56 - **root** (`str`): the root directory where the original EMNIST data 'EMNIST/' live. 
57 - **split** (`str`): the original EMNIST dataset has 6 different splits: `byclass`, `bymerge`, `balanced`, `letters`, `digits` and `mnist`. This argument specifies which one to use. 58 - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`. 59 - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data. 60 - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. 61 If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks. 62 - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. 63 If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks. 64 - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. 65 If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied. 66 - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. 67 If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied. 68 - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. 
69 If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks. 70 - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. 71 If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied. 72 - **permutation_mode** (`str`): the mode of permutation; one of: 73 1. 'all': permute all pixels. 74 2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order. 75 3. 'first_channel_only': permute only the first channel. 76 - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1. 
77 """ 78 79 if split == "byclass": 80 self.original_dataset_python_class: type[Dataset] = EMNISTByClass 81 elif split == "bymerge": 82 self.original_dataset_python_class: type[Dataset] = EMNISTByMerge 83 elif split == "balanced": 84 self.original_dataset_python_class: type[Dataset] = EMNISTBalanced 85 elif split == "letters": 86 self.original_dataset_python_class: type[Dataset] = EMNISTLetters 87 elif split == "digits": 88 self.original_dataset_python_class: type[Dataset] = EMNISTDigits 89 r"""The original dataset class.""" 90 91 super().__init__( 92 root=root, 93 num_tasks=num_tasks, 94 batch_size=batch_size, 95 num_workers=num_workers, 96 custom_transforms=custom_transforms, 97 repeat_channels=repeat_channels, 98 to_tensor=to_tensor, 99 resize=resize, 100 permutation_mode=permutation_mode, 101 permutation_seeds=permutation_seeds, 102 ) 103 104 self.split: str = split 105 r"""The split of the original EMNIST dataset. It can be `byclass`, `bymerge`, `balanced`, `letters`, `digits` or `mnist`.""" 106 107 self.validation_percentage: float = validation_percentage 108 r"""The percentage to randomly split some training data into validation data.""" 109 110 def prepare_data(self) -> None: 111 r"""Download the original EMNIST dataset if haven't.""" 112 113 if self.task_id != 1: 114 return # download all original datasets only at the beginning of first task 115 116 EMNIST(root=self.root_t, split=self.split, train=True, download=True) 117 EMNIST(root=self.root_t, split=self.split, train=False, download=True) 118 119 pylogger.debug( 120 "The original EMNIST dataset has been downloaded to %s.", self.root_t 121 ) 122 123 def train_and_val_dataset(self) -> tuple[Dataset, Dataset]: 124 """Get the training and validation dataset of task `self.task_id`. 125 126 **Returns:** 127 - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`. 
128 """ 129 dataset_train_and_val = EMNIST( 130 root=self.root_t, 131 split=self.split, 132 train=True, 133 transform=self.train_and_val_transforms(), 134 target_transform=self.target_transform(), 135 download=False, 136 ) 137 138 return random_split( 139 dataset_train_and_val, 140 lengths=[1 - self.validation_percentage, self.validation_percentage], 141 generator=torch.Generator().manual_seed( 142 42 143 ), # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments 144 ) 145 146 def test_dataset(self) -> Dataset: 147 r"""Get the test dataset of task `self.task_id`. 148 149 **Returns:** 150 - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`. 151 """ 152 dataset_test = EMNIST( 153 root=self.root_t, 154 split=self.split, 155 train=False, 156 transform=self.test_transforms(), 157 target_transform=self.target_transform(), 158 download=False, 159 ) 160 161 return dataset_test
class PermutedEMNIST(CLPermutedDataset):
    r"""Permuted EMNIST dataset. The [EMNIST dataset](https://www.nist.gov/itl/products-and-services/emnist-dataset/) is a collection of handwritten letters and digits (including A-Z, a-z, 0-9). It consists of 814,255 images in 62 classes, each 28x28 grayscale image.

    EMNIST has 6 different splits: `byclass`, `bymerge`, `balanced`, `letters`, `digits` and `mnist`, each containing a different subset of the original collection. We support all of them in Permuted EMNIST.
    """

    def __init__(
        self,
        root: str,
        split: str,
        num_tasks: int,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original EMNIST data 'EMNIST/' live.
        - **split** (`str`): the original EMNIST dataset has 6 different splits: `byclass`, `bymerge`, `balanced`, `letters`, `digits` and `mnist`. This argument specifies which one to use.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. A dict maps task IDs to per-task batch sizes; an `int` applies to all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. A dict maps task IDs to per-task worker counts; an `int` applies to all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. `ToTensor()`, normalization, permute, and so on are not included. A dict maps task IDs to per-task transforms; a single value applies to all tasks; `None` applies none.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None` (no repeat). A dict maps task IDs to per-task values; an `int` applies to all tasks.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. A dict maps task IDs to per-task values; a single boolean applies to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None` (no resize). A dict maps task IDs to per-task sizes; a single tuple applies to all tasks.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """
        # Dispatch table from split name to the raw dataset class backing it.
        raw_class_by_split: dict[str, type[Dataset]] = {
            "byclass": EMNISTByClass,
            "bymerge": EMNISTByMerge,
            "balanced": EMNISTBalanced,
            "letters": EMNISTLetters,
            "digits": EMNISTDigits,
        }
        if split in raw_class_by_split:
            self.original_dataset_python_class: type[Dataset] = raw_class_by_split[
                split
            ]
            r"""The original dataset class."""

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.split: str = split
        r"""The split of the original EMNIST dataset. It can be `byclass`, `bymerge`, `balanced`, `letters`, `digits` or `mnist`."""

        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original EMNIST dataset if it has not been downloaded yet."""
        # Only the first task triggers the download; later tasks reuse the files.
        if self.task_id != 1:
            return

        for is_train in (True, False):
            EMNIST(root=self.root_t, split=self.split, train=is_train, download=True)

        pylogger.debug(
            "The original EMNIST dataset has been downloaded to %s.", self.root_t
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        full_train_set = EMNIST(
            root=self.root_t,
            split=self.split,
            train=True,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # A fixed seed keeps the train/val split identical across experiments;
        # the global seed must not be used here since it may vary between runs.
        split_generator = torch.Generator().manual_seed(42)
        train_fraction = 1 - self.validation_percentage
        return random_split(
            full_train_set,
            lengths=[train_fraction, self.validation_percentage],
            generator=split_generator,
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        return EMNIST(
            root=self.root_t,
            split=self.split,
            train=False,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )
Permuted EMNIST dataset. The EMNIST dataset is a collection of handwritten letters and digits (including A-Z, a-z, 0-9). It consists of 814,255 images in 62 classes, each 28x28 grayscale image.
EMNIST has 6 different splits: byclass, bymerge, balanced, letters, digits and mnist, each containing a different subset of the original collection. We support all of them in Permuted EMNIST.
35 def __init__( 36 self, 37 root: str, 38 split: str, 39 num_tasks: int, 40 validation_percentage: float, 41 batch_size: int | dict[int, int] = 1, 42 num_workers: int | dict[int, int] = 0, 43 custom_transforms: ( 44 Callable 45 | transforms.Compose 46 | None 47 | dict[int, Callable | transforms.Compose | None] 48 ) = None, 49 repeat_channels: int | None | dict[int, int | None] = None, 50 to_tensor: bool | dict[int, bool] = True, 51 resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, 52 permutation_mode: str = "first_channel_only", 53 permutation_seeds: dict[int, int] | None = None, 54 ) -> None: 55 r""" 56 **Args:** 57 - **root** (`str`): the root directory where the original EMNIST data 'EMNIST/' live. 58 - **split** (`str`): the original EMNIST dataset has 6 different splits: `byclass`, `bymerge`, `balanced`, `letters`, `digits` and `mnist`. This argument specifies which one to use. 59 - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`. 60 - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data. 61 - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. 62 If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks. 63 - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. 64 If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks. 65 - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. 
66 If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied. 67 - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. 68 If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied. 69 - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. 70 If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks. 71 - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. 72 If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied. 73 - **permutation_mode** (`str`): the mode of permutation; one of: 74 1. 'all': permute all pixels. 75 2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order. 76 3. 'first_channel_only': permute only the first channel. 77 - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1. 
78 """ 79 80 if split == "byclass": 81 self.original_dataset_python_class: type[Dataset] = EMNISTByClass 82 elif split == "bymerge": 83 self.original_dataset_python_class: type[Dataset] = EMNISTByMerge 84 elif split == "balanced": 85 self.original_dataset_python_class: type[Dataset] = EMNISTBalanced 86 elif split == "letters": 87 self.original_dataset_python_class: type[Dataset] = EMNISTLetters 88 elif split == "digits": 89 self.original_dataset_python_class: type[Dataset] = EMNISTDigits 90 r"""The original dataset class.""" 91 92 super().__init__( 93 root=root, 94 num_tasks=num_tasks, 95 batch_size=batch_size, 96 num_workers=num_workers, 97 custom_transforms=custom_transforms, 98 repeat_channels=repeat_channels, 99 to_tensor=to_tensor, 100 resize=resize, 101 permutation_mode=permutation_mode, 102 permutation_seeds=permutation_seeds, 103 ) 104 105 self.split: str = split 106 r"""The split of the original EMNIST dataset. It can be `byclass`, `bymerge`, `balanced`, `letters`, `digits` or `mnist`.""" 107 108 self.validation_percentage: float = validation_percentage 109 r"""The percentage to randomly split some training data into validation data."""
Args:
- **root** (`str`): the root directory where the original EMNIST data 'EMNIST/' live.
- **split** (`str`): the original EMNIST dataset has 6 different splits: `byclass`, `bymerge`, `balanced`, `letters`, `digits` and `mnist`. This argument specifies which one to use.
- **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
- **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
- **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
- **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
- **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
- **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
- **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
- **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
- **permutation_mode** (`str`): the mode of permutation; one of:
  1. 'all': permute all pixels.
  2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
  3. 'first_channel_only': permute only the first channel.
- **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
The split of the original EMNIST dataset. It can be byclass, bymerge, balanced, letters, digits or mnist.
The percentage to randomly split some training data into validation data.
111 def prepare_data(self) -> None: 112 r"""Download the original EMNIST dataset if haven't.""" 113 114 if self.task_id != 1: 115 return # download all original datasets only at the beginning of first task 116 117 EMNIST(root=self.root_t, split=self.split, train=True, download=True) 118 EMNIST(root=self.root_t, split=self.split, train=False, download=True) 119 120 pylogger.debug( 121 "The original EMNIST dataset has been downloaded to %s.", self.root_t 122 )
Download the original EMNIST dataset if it has not been downloaded yet.
124 def train_and_val_dataset(self) -> tuple[Dataset, Dataset]: 125 """Get the training and validation dataset of task `self.task_id`. 126 127 **Returns:** 128 - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`. 129 """ 130 dataset_train_and_val = EMNIST( 131 root=self.root_t, 132 split=self.split, 133 train=True, 134 transform=self.train_and_val_transforms(), 135 target_transform=self.target_transform(), 136 download=False, 137 ) 138 139 return random_split( 140 dataset_train_and_val, 141 lengths=[1 - self.validation_percentage, self.validation_percentage], 142 generator=torch.Generator().manual_seed( 143 42 144 ), # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments 145 )
Get the training and validation dataset of task self.task_id.
Returns:
- **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
147 def test_dataset(self) -> Dataset: 148 r"""Get the test dataset of task `self.task_id`. 149 150 **Returns:** 151 - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`. 152 """ 153 dataset_test = EMNIST( 154 root=self.root_t, 155 split=self.split, 156 train=False, 157 transform=self.test_transforms(), 158 target_transform=self.target_transform(), 159 download=False, 160 ) 161 162 return dataset_test
Get the test dataset of task self.task_id.
Returns:
- **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.