clarena.cl_datasets.permuted_kannadamnist
The submodule in `cl_datasets` for the Permuted Kannada-MNIST dataset.
r"""
The submodule in `cl_datasets` for the Permuted Kannada-MNIST dataset.
"""

__all__ = ["PermutedKannadaMNIST"]

import logging
from typing import Callable

import torch
from torch.utils.data import Dataset, random_split
from torchvision.transforms import transforms

from clarena.cl_datasets import CLPermutedDataset
from clarena.stl_datasets.raw import KannadaMNIST

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class PermutedKannadaMNIST(CLPermutedDataset):
    r"""Permuted Kannada-MNIST dataset. The [Kannada-MNIST dataset](https://github.com/vinayprabhu/Kannada_MNIST) is a collection of handwritten Kannada digits (0-9). It consists of 60,000 training and 10,000 test images (10 classes), each a 28x28 grayscale image (similar to MNIST)."""

    original_dataset_python_class: type[Dataset] = KannadaMNIST
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""Construct the Permuted Kannada-MNIST CL dataset.

        **Args:**
        - **root** (`str`): the root directory where the original Kannada-MNIST data 'KannadaMNIST/' lives.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **validation_percentage** (`float`): the fraction of the training data that is randomly split off as validation data. Must be within [0, 1].
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.

        **Raises:**
        - **ValueError**: if `validation_percentage` is not within [0, 1].
        """
        # Fail fast: the value is later used as a fractional split length, where an
        # out-of-range value would only surface as an unrelated `random_split` error.
        if not 0.0 <= validation_percentage <= 1.0:
            raise ValueError(
                f"validation_percentage must be within [0, 1], got {validation_percentage!r}."
            )

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.validation_percentage: float = validation_percentage
        r"""The fraction of the training data that is randomly split off as validation data."""

    def prepare_data(self) -> None:
        r"""Do nothing: the original Kannada-MNIST dataset is published on Kaggle and has to be downloaded manually, so this function does not download it automatically."""
        return None

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train_and_val = KannadaMNIST(
            root=self.root_t,
            train=True,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # The split seed is fixed on purpose so the train/val split is identical across
        # experiments; it is not tied to the global seed, which may vary between runs.
        split_generator = torch.Generator().manual_seed(42)
        dataset_train, dataset_val = random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=split_generator,
        )

        # `random_split` hands back a list; return a tuple as the signature promises
        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        return KannadaMNIST(
            root=self.root_t,
            train=False,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )
class PermutedKannadaMNIST(CLPermutedDataset):
    r"""Permuted Kannada-MNIST dataset. The [Kannada-MNIST dataset](https://github.com/vinayprabhu/Kannada_MNIST) is a collection of handwritten Kannada digits (0-9). It consists of 60,000 training and 10,000 test images (10 classes), each a 28x28 grayscale image (similar to MNIST)."""

    original_dataset_python_class: type[Dataset] = KannadaMNIST
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""Construct the Permuted Kannada-MNIST CL dataset.

        **Args:**
        - **root** (`str`): the root directory where the original Kannada-MNIST data 'KannadaMNIST/' lives.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **validation_percentage** (`float`): the fraction of the training data that is randomly split off as validation data. Must be within [0, 1].
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.

        **Raises:**
        - **ValueError**: if `validation_percentage` is not within [0, 1].
        """
        # Fail fast: the value is later used as a fractional split length, where an
        # out-of-range value would only surface as an unrelated `random_split` error.
        if not 0.0 <= validation_percentage <= 1.0:
            raise ValueError(
                f"validation_percentage must be within [0, 1], got {validation_percentage!r}."
            )

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

        self.validation_percentage: float = validation_percentage
        r"""The fraction of the training data that is randomly split off as validation data."""

    def prepare_data(self) -> None:
        r"""Do nothing: the original Kannada-MNIST dataset is published on Kaggle and has to be downloaded manually, so this function does not download it automatically."""
        return None

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train_and_val = KannadaMNIST(
            root=self.root_t,
            train=True,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # The split seed is fixed on purpose so the train/val split is identical across
        # experiments; it is not tied to the global seed, which may vary between runs.
        split_generator = torch.Generator().manual_seed(42)
        dataset_train, dataset_val = random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=split_generator,
        )

        # `random_split` hands back a list; return a tuple as the signature promises
        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        return KannadaMNIST(
            root=self.root_t,
            train=False,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )
Permuted Kannada-MNIST dataset. The [Kannada-MNIST dataset](https://github.com/vinayprabhu/Kannada_MNIST) is a collection of handwritten Kannada digits (0-9). It consists of 60,000 training and 10,000 test images of handwritten Kannada digits (10 classes), each a 28x28 grayscale image (similar to MNIST).
def __init__(
    self,
    root: str,
    num_tasks: int,
    validation_percentage: float,
    batch_size: int | dict[int, int] = 1,
    num_workers: int | dict[int, int] = 0,
    custom_transforms: (
        Callable
        | transforms.Compose
        | None
        | dict[int, Callable | transforms.Compose | None]
    ) = None,
    repeat_channels: int | None | dict[int, int | None] = None,
    to_tensor: bool | dict[int, bool] = True,
    resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
    permutation_mode: str = "first_channel_only",
    permutation_seeds: dict[int, int] | None = None,
) -> None:
    r"""Construct the Permuted Kannada-MNIST CL dataset.

    **Args:**
    - **root** (`str`): the root directory where the original Kannada-MNIST data 'KannadaMNIST/' lives.
    - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
    - **validation_percentage** (`float`): the fraction of the training data that is randomly split off as validation data. Must be within [0, 1].
    - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
    If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
    - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
    If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
    If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
    - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
    If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
    - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
    If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
    - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
    If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
    - **permutation_mode** (`str`): the mode of permutation; one of:
        1. 'all': permute all pixels.
        2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
        3. 'first_channel_only': permute only the first channel.
    - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.

    **Raises:**
    - **ValueError**: if `validation_percentage` is not within [0, 1].
    """
    # Fail fast: the value is later used as a fractional split length, where an
    # out-of-range value would only surface as an unrelated `random_split` error.
    if not 0.0 <= validation_percentage <= 1.0:
        raise ValueError(
            f"validation_percentage must be within [0, 1], got {validation_percentage!r}."
        )

    super().__init__(
        root=root,
        num_tasks=num_tasks,
        batch_size=batch_size,
        num_workers=num_workers,
        custom_transforms=custom_transforms,
        repeat_channels=repeat_channels,
        to_tensor=to_tensor,
        resize=resize,
        permutation_mode=permutation_mode,
        permutation_seeds=permutation_seeds,
    )

    self.validation_percentage: float = validation_percentage
    r"""The fraction of the training data that is randomly split off as validation data."""
**Args:**

- **root** (`str`): the root directory where the original Kannada-MNIST data 'KannadaMNIST/' lives.
- **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
- **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
- **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
- **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
- **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
- **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
- **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
- **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
- **permutation_mode** (`str`): the mode of permutation; one of:
    1. 'all': permute all pixels.
    2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
    3. 'first_channel_only': permute only the first channel.
- **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
The original dataset class.
The percentage to randomly split some training data into validation data.
def prepare_data(self) -> None:
    r"""Do nothing: the original Kannada-MNIST dataset is published on Kaggle and has to be downloaded manually, so this function does not download it automatically."""
    return None
Download the original Kannada-MNIST dataset if it has not been downloaded yet. Because the original dataset is published on Kaggle, it must be downloaded manually; this function does not download it automatically.
def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
    r"""Get the training and validation dataset of task `self.task_id`.

    **Returns:**
    - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
    """
    dataset_train_and_val = KannadaMNIST(
        root=self.root_t,
        train=True,
        transform=self.train_and_val_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    # The split seed is fixed on purpose so the train/val split is identical across
    # experiments; it is not tied to the global seed, which may vary between runs.
    split_generator = torch.Generator().manual_seed(42)
    dataset_train, dataset_val = random_split(
        dataset_train_and_val,
        lengths=[1 - self.validation_percentage, self.validation_percentage],
        generator=split_generator,
    )

    # `random_split` hands back a list; return a tuple as the signature promises
    return dataset_train, dataset_val
Get the training and validation dataset of task `self.task_id`.

**Returns:**

- **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
def test_dataset(self) -> Dataset:
    r"""Build the held-out (test) split of Kannada-MNIST for task `self.task_id`.

    **Returns:**
    - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
    """
    # The data is expected to be on disk already, hence `download=False`
    return KannadaMNIST(
        root=self.root_t,
        train=False,
        transform=self.test_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )
Get the test dataset of task `self.task_id`.

**Returns:**

- **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.